diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 904eebaebe0..f8efce2c51f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,7 +30,7 @@ jobs: strategy: fail-fast: false matrix: - dir: [./, ./integration/, ./crypto/, ./insecure/] + dir: [./, ./integration/, ./insecure/] name: Lint runs-on: ubuntu-latest steps: @@ -41,10 +41,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Install C formatter - run: sudo apt-get install -y clang-format - - name: Run C formatter and sanitizer for ./crypto - run: make -C crypto c-format && make -C crypto c-sanitize - name: Run go generate run: go generate working-directory: ${{ matrix.dir }} @@ -134,11 +130,6 @@ jobs: fail-fast: false matrix: include: - - name: crypto - setup: noop - retries: 1 - race: 1 - runner: ubuntu-latest - name: insecure setup: install-tools retries: 5 diff --git a/Makefile b/Makefile index 9a68b66a9ac..0ff01f0fcc6 100644 --- a/Makefile +++ b/Makefile @@ -169,7 +169,6 @@ generate-mocks: install-mock-generators mockery --name '.*' --dir=engine/execution/ --case=underscore --output="engine/execution/mock" --outpkg="mock" mockery --name 'Backend' --dir=engine/collection/rpc --case=underscore --output="engine/collection/rpc/mock" --outpkg="mock" mockery --name 'ProviderEngine' --dir=engine/execution/provider --case=underscore --output="engine/execution/provider/mock" --outpkg="mock" - (cd ./crypto && mockery --name 'PublicKey' --case=underscore --output="../module/mock" --outpkg="mock") mockery --name '.*' --dir=state/cluster --case=underscore --output="state/cluster/mock" --outpkg="mock" mockery --name '.*' --dir=module --case=underscore --output="./module/mock" --outpkg="mock" mockery --name '.*' --dir=module/mempool --case=underscore --output="./module/mempool/mock" --outpkg="mempool" diff --git a/crypto/Dockerfile b/crypto/Dockerfile deleted file mode 100644 index 9c3fbff6363..00000000000 --- a/crypto/Dockerfile +++ /dev/null @@ -1,8 +0,0 @@ -# gcr.io/dl-flow/golang-cmake - -FROM golang:1.20-buster -RUN apt-get update -RUN apt-get -y install zip -RUN go install github.com/axw/gocov/gocov@latest -RUN go install github.com/matm/gocov-html@latest -WORKDIR /go/src/flow diff --git a/crypto/Makefile b/crypto/Makefile deleted file mode 100644 index 14016e40619..00000000000 --- a/crypto/Makefile +++ /dev/null @@ -1,107 +0,0 @@ -# Name of the cover profile -COVER_PROFILE := cover.out - -IMAGE_TAG := v0.0.7 - -# OS -UNAME := $(shell uname -s) - -# allows CI to specify whether to have race detection on / off -ifeq ($(RACE_DETECTOR),1) - RACE_FLAG := -race -else - RACE_FLAG := -endif - -# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. -ifeq ($(UNAME),Linux) -# detect ADX support on the CURRENT linux machine. - ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) -else -# on non-linux machines, set the flag to 1 by default - ADX_SUPPORT := 1 -endif - -# the crypto package uses BLST source files underneath which may use ADX instructions. 
ifeq ($(ADX_SUPPORT), 1) -# if ADX instructions are supported, default is to use a fast ADX BLST implementation - CRYPTO_FLAG := "" -else -# if ADX instructions aren't supported, this CGO flag uses a slower non-ADX BLST implementation - CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" -endif -CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) - -# format C code -.PHONY: c-format -c-format: - clang-format -style=llvm -dump-config > .clang-format - clang-format -i *.c - clang-format -i *.h - rm -f .clang-format - git diff --exit-code - -# address sanitization and other checks -.SILENT: c-asan -c-asan: -# - address sanitization and other checks (only on linux) - if [ $(UNAME) = "Linux" ]; then \ - $(CGO_FLAG) CC="clang -O0 -g -fsanitize=address -fno-omit-frame-pointer -fsanitize=leak -fsanitize=undefined -fno-sanitize-recover=all -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-sanitize=null -fno-sanitize=alignment" \ - LD="-fsanitize=address -fsanitize=leak" go test; \ - if [ $$? -ne 0 ]; then exit 1; fi; \ - else \ - echo "sanitization is only supported on Linux"; \ - fi; \ - -# memory sanitization -.SILENT: c-msan -c-msan: -# - memory sanitization (only on linux and using clang) - (could use go test -msan) -# currently, this leads to many false positives, most likely because of assembly code not handled properly -# by msan. If you would like to run this command, you can use `NO_MSAN` to disable msan in some C functions. -# For instance "void NO_MSAN f() {...}" disables msan in function f. `NO_MSAN` is already defined in -# bls12381_utils.h - if [ $(UNAME) = "Linux" ]; then \ - $(CGO_FLAG) CC="clang -DMSAN -O0 -g -fsanitize=memory -fno-omit-frame-pointer -fsanitize-memory-track-origins" \ - LD="-fsanitize=memory" go test; \ - if [ $$? -ne 0 ]; then exit 1; fi; \ - else \ - echo "sanitization is only supported on Linux"; \ - fi; \ - -# sanitize C code -.SILENT: c-sanitize -c-sanitize: c-asan -# - address sanitization and other checks (only on linux) -# - memory sanitization (target c-msan) is disabled because of multiple false positives - -# Go tidy -.PHONY: go-tidy -go-tidy: - go mod tidy -v - git diff --exit-code - -# Go lint -.PHONY: go-lint -go-lint: -lint: go-tidy - # revive -config revive.toml - golangci-lint run -v ./... - -# test all packages -.PHONY: test -test: -# root package - $(CGO_FLAG) go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) -# sub packages - $(CGO_FLAG) go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./hash - $(CGO_FLAG) go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./random - -.PHONY: docker-build -docker-build: - docker build -t gcr.io/dl-flow/golang-cmake:latest -t gcr.io/dl-flow/golang-cmake:$(IMAGE_TAG) . - -.PHONY: docker-push -docker-push: - docker push gcr.io/dl-flow/golang-cmake:latest - docker push "gcr.io/dl-flow/golang-cmake:$(IMAGE_TAG)" diff --git a/crypto/README.md b/crypto/README.md index c15d0a36462..3c9f7839d4a 100644 --- a/crypto/README.md +++ b/crypto/README.md @@ -1,102 +1,7 @@ # Flow Cryptography -This Go package provides the cryptography tools needed by the Flow blockchain. -Most of the primitives and protocols can be used in other projects and are not specific to Flow.
- -Flow is an ongoing project, which means new features will still be added and modifications will still be made to improve the security and performance of the cryptography package. - -Notes: - - The package has been audited for security in January 2021 on [this version](https://github.com/onflow/flow-go/tree/2707acdabb851138e298b2d186e73f47df8a14dd). The package had a major refactor to switch all the BLS12-381 curve implementation to use [BLST](https://github.com/supranational/blst/tree/master/src) starting from [this version](TODO: link the commit/tag). - - The package does not provide security against side-channel or fault attacks. - -## Package import - -To use the Flow cryptography package, you can: - -- get the package -``` -go get github.com/onflow/flow-go/crypto -``` -- or simply import the package into your Go project - ``` -import "github.com/onflow/flow-go/crypto" -``` - -## Algorithms - -### Hashing and Message Authentication Code: - -`crypto/hash` provides the hashing and MAC algorithms required for Flow. All algorithms implement the generic interface `Hasher`. All digests are of the generic type `Hash`. - - * SHA-3: 256 and 384 output sizes - * Legacy Keccak: 256 output size - * SHA-2: 256 and 384 output sizes - * KMAC: 128 variant - -### Signature schemes - -All signature schemes use the generic interfaces of `PrivateKey` and `PublicKey`. All signatures are of the generic type `Signature`. - - * ECDSA - * public keys are compressed or uncompressed. - * the ephemeral key is derived from the private key, the hash and the system entropy (based on https://golang.org/pkg/crypto/ecdsa/). - * supports NIST P-256 (secp256r1) and secp256k1 curves. - - * BLS - * supports the [BLS12-381](https://electriccoin.co/blog/new-snark-curve/) curve. - * implements the minimal-signature-size variant: - signatures in G1 and public keys in G2. - * the default set-up uses [compressed](https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) G1/G2 points, - but the uncompressed format is also supported. - * hashing to curve uses the [Simplified SWU map-to-curve](https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-hash-to-curve-14#section-6.6.3). - * expanding the message in hash-to-curve uses a cSHAKE-based KMAC128 with a domain separation tag. - KMAC128 serves as an expand_message_xof function. - * this results in the full ciphersuite BLS_SIG_BLS12381G1_XOF:KMAC128_SSWU_RO_POP_ for signatures - and BLS_POP_BLS12381G1_XOF:KMAC128_SSWU_RO_POP_ for proofs of possession. - * signature verification includes the signature membership check in G1. - * the public key membership check in G2 is provided outside of the signature verification. - * aggregation of signatures, public keys and private keys. - * verification of an aggregated signature of a single message under multiple public keys. - * verification of an aggregated signature of multiple messages under multiple public keys. - * batch verification of multiple signatures of a single message under multiple - public keys, using a binary tree of aggregations. - * SPoCK scheme based on BLS: verifies that two signatures have been generated from the same message, which remains unknown to the verifier. - -### PRNG - - * ChaCha20-based CSPRNG - -## Protocols - -### Threshold Signature - - * BLS-based threshold signature - * [non-interactive](https://www.iacr.org/archive/pkc2003/25670031/25670031.pdf) threshold signature reconstruction. - * supports only the BLS12-381 curve, with the same features as above.
* (t+1) signatures are required to reconstruct the threshold signature. - * key generation (single dealer) to provide the set of keys. - * provides a stateless API and a stateful API. - - -### Discrete-Log based distributed key generation - -All supported Distributed Key Generation protocols are [discrete log based](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.50.2737&rep=rep1&type=pdf) and are implemented for the same BLS setup on the BLS12-381 curve. The protocols generate key sets for the BLS-based threshold signature. - - * Feldman VSS - * simple verifiable secret sharing with a single dealer. - * the library does not implement the communication channels between participants. The caller should implement the methods `PrivateSend` (1-to-1 messaging) and `Broadcast` (1-to-n messaging). - * 1-to-1 messaging must be a private channel; the caller must make sure the channel preserves confidentiality and authenticates the sender. - * 1-to-n broadcasting is a reliable broadcast, where honest senders are able to reach all honest receivers, and where all honest receivers end up with the same received messages. The channel should also authenticate the broadcaster. - * It is recommended that both communication channels are unique per protocol instance. This could be achieved by prepending a unique protocol instance ID to the messages sent/broadcast. - * Feldman VSS Qual. - * an extension of the simple Feldman VSS. - * implements a complaint mechanism to qualify/disqualify the dealer. - * Joint Feldman (Pedersen) - * distributed key generation. - * based on multiple parallel instances of Feldman VSS Qual with multiple dealers. - * same assumptions about the communication channels as in Feldman VSS. - - +Note: This module is deprecated. Its latest supported version is `v0.25.0`, and the package has since migrated to `github.com/onflow/crypto`. Please use the new module `github.com/onflow/crypto` instead; version `v0.25.0` of this module is equivalent to version `v0.25.0` of the new module. +Files in this module have been removed starting from `v0.25.1` to accommodate the `github.com/onflow/flow-go` repository. diff --git a/crypto/bls.go b/crypto/bls.go deleted file mode 100644 index 27ddd881bfd..00000000000 --- a/crypto/bls.go +++ /dev/null @@ -1,528 +0,0 @@ -package crypto - -// BLS signature scheme implementation using the BLS12-381 curve -// ([zcash]https://electriccoin.co/blog/new-snark-curve/). -// Pairing, elliptic curve and modular arithmetic use [BLST](https://github.com/supranational/blst/tree/master/src) -// tools underneath. -// This implementation does not include security against side-channel or fault attacks. - -// Existing features: -// - the implementation variant is minimal-signature-size: -// shorter signatures in G1, longer public keys in G2 -// - serialization of points on G1 and G2 is compressed ([zcash] -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -// - hashing to curve uses the Simplified SWU map-to-curve -// (https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-hash-to-curve-14#section-6.6.3) -// - expanding the message in hash-to-curve uses a cSHAKE-based KMAC128 with a domain separation tag. -// KMAC128 serves as an expand_message_xof function. -// - this results in the full ciphersuite BLS_SIG_BLS12381G1_XOF:KMAC128_SSWU_RO_POP_ for signatures -// and BLS_POP_BLS12381G1_XOF:KMAC128_SSWU_RO_POP_ for proofs of possession. -// - signature verification checks the membership of the signature in G1.
-// - the public key membership check in G2 is implemented separately from the signature verification. -// - multi-signature tools are defined in bls_multisig.go -// - SPoCK scheme based on BLS: verifies two signatures are generated from the same message, -// even though the message is unknown to the verifier. - -// #include "bls_include.h" -import "C" - -import ( - "bytes" - "crypto/sha256" - "fmt" - - "golang.org/x/crypto/hkdf" - - "github.com/onflow/flow-go/crypto/hash" -) - -const ( - // SignatureLenBLSBLS12381 is the serialization size of a `G_1` element. - SignatureLenBLSBLS12381 = g1BytesLen - // PubKeyLenBLSBLS12381 is the serialization size of a `G_2` element. - PubKeyLenBLSBLS12381 = g2BytesLen - // PrKeyLenBLSBLS12381 is the serialization size of a `F_r` element, - // where `r` is the order of `G_1` and `G_2`. - PrKeyLenBLSBLS12381 = frBytesLen - - // Hash to curve params - // hash to curve suite ID of the form : CurveID_ || HashID_ || MapID_ || encodingVariant_ - h2cSuiteID = "BLS12381G1_XOF:KMAC128_SSWU_RO_" - // scheme implemented as a countermeasure for rogue attacks of the form : SchemeTag_ - schemeTag = "POP_" - // Cipher suite used for BLS signatures of the form : BLS_SIG_ || h2cSuiteID || SchemeTag_ - blsSigCipherSuite = "BLS_SIG_" + h2cSuiteID + schemeTag - // Cipher suite used for BLS PoP of the form : BLS_POP_ || h2cSuiteID || SchemeTag_ - // The PoP cipher suite is guaranteed to be different than all signature ciphersuites - blsPOPCipherSuite = "BLS_POP_" + h2cSuiteID + schemeTag - // expandMsgOutput is the output length of the expand_message step as required by the - // hash_to_curve algorithm (and the map to G1 step). - expandMsgOutput = int(C.MAP_TO_G1_INPUT_LEN) -) - -// blsBLS12381Algo, embeds SignAlgo -type blsBLS12381Algo struct { - // the signing algo and parameters - algo SigningAlgorithm -} - -// BLS context on the BLS12-381 curve -var blsInstance *blsBLS12381Algo - -// NewExpandMsgXOFKMAC128 returns a new expand_message_xof instance for -// the hash-to-curve function, hashing data to G1 on BLS12-381. -// This instance must only be used to generate signatures (and not PoP), -// because the internal ciphersuite is customized for signatures. It -// is guaranteed to be different than the expand_message_xof instance used -// to generate proofs of possession. -// -// KMAC128 is used as the underlying extendable-output function (XOF) -// as required by https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-hash-to-curve-14#section-5.4.4. -// -// `domainTag` is a domain separation tag that defines the protocol and its subdomain. Such a tag should be of the -// format: <protocol>-V<xx>-CS<yy>-with- where <protocol> is the name of the protocol, <xx> the protocol -// version number and <yy> the index of the ciphersuite in the protocol. -// The function suffixes the given `domainTag` with the BLS ciphersuite supported by the library. -// -// The returned instance is a `Hasher` and can be used to generate BLS signatures -// with the `Sign` method. -func NewExpandMsgXOFKMAC128(domainTag string) hash.Hasher { - // application tag is guaranteed to be different than the tag used - // to generate proofs of possession - // postfix the domain tag with the BLS ciphersuite - key := domainTag + blsSigCipherSuite - return internalExpandMsgXOFKMAC128(key) -} - -// returns an expand_message_xof instance for -// the hash-to-curve function, hashing data to G1 on BLS12-381. -// The key is used as a customizer rather than a MAC key.
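For context on the hasher API deleted above, here is a minimal, hedged usage sketch; the domain tag value is illustrative, while `NewExpandMsgXOFKMAC128` and the `Hasher` interface are the package's pre-removal exports:

```
package main

import (
	"fmt"

	"github.com/onflow/flow-go/crypto" // pre-removal path; now github.com/onflow/crypto
)

func main() {
	// build the signing hasher with a protocol-specific domain separation tag,
	// following the <protocol>-V<xx>-CS<yy>-with- format described above
	kmac := crypto.NewExpandMsgXOFKMAC128("EXAMPLE-V00-CS00-with-")
	// the hasher expands messages to the 128 bytes required by hash-to-curve
	fmt.Println(kmac.Size()) // 128
}
```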
-func internalExpandMsgXOFKMAC128(key string) hash.Hasher { - // blsKMACFunction is the customizer used for KMAC in BLS - const blsKMACFunction = "H2C" - // the error is ignored as the parameter lengths are chosen to be in the correct range for kmac - // (tested by TestBLSBLS12381Hasher) - kmac, _ := hash.NewKMAC_128([]byte(key), []byte(blsKMACFunction), expandMsgOutput) - return kmac -} - -// checkBLSHasher asserts that the given `hasher` is not nil and -// has an output size of `expandMsgOutput`. Otherwise an error is returned: -// - nilHasherError if the hasher is nil -// - invalidHasherSizeError if the hasher's output size is not `expandMsgOutput` (128 bytes) -func checkBLSHasher(hasher hash.Hasher) error { - if hasher == nil { - return nilHasherError - } - if hasher.Size() != expandMsgOutput { - return invalidHasherSizeErrorf("hasher's size needs to be %d, got %d", expandMsgOutput, hasher.Size()) - } - return nil -} - -// Sign signs an array of bytes using the private key. -// -// The signature is compressed [zcash] -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format- -// The private key is read only. -// If the hasher used is KMAC128, the hasher is read only. -// It is recommended to use Sign with the hasher from NewExpandMsgXOFKMAC128. If not, the hasher used -// must expand the message to 1024 bits. It is also recommended to use a hasher -// with a domain separation tag. -// -// The function returns: -// - (nil, nilHasherError) if a hasher is nil -// - (nil, invalidHasherSizeError) if a hasher's output size is not 128 bytes -// - (signature, nil) otherwise -func (sk *prKeyBLSBLS12381) Sign(data []byte, kmac hash.Hasher) (Signature, error) { - // sanity check of input hasher - err := checkBLSHasher(kmac) - if err != nil { - return nil, err - } - - // hash the input to 128 bytes - h := kmac.ComputeHash(data) - - s := make([]byte, SignatureLenBLSBLS12381) - C.bls_sign((*C.uchar)(&s[0]), - (*C.Fr)(&sk.scalar), - (*C.uchar)(&h[0]), - (C.int)(len(h))) - return s, nil -} - -// Verify verifies a signature of a byte array using the public key and the input hasher. -// -// If the input signature slice has an invalid length or fails to deserialize into a curve -// subgroup point, the function returns false without an error. -// -// The function assumes the public key is in the valid G2 subgroup because -// all the package functions generating a BLS `PublicKey` include a G2-membership check. -// The public keys are not guaranteed to be non-identity, and therefore the function -// includes an identity comparison. Verifications against an identity public key -// are invalid to avoid equivocation issues. -// The signature membership check in G1 is included in the verification. -// -// If the hasher used is ExpandMsgXOFKMAC128, the hasher is read only.
-// -// The function returns: -// - (false, nilHasherError) if a hasher is nil -// - (false, invalidHasherSizeError) if a hasher's output size is not 128 bytes -// - (false, error) if an unexpected error occurs -// - (validity, nil) otherwise -func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) (bool, error) { - // sanity check of input hasher - err := checkBLSHasher(kmac) - if err != nil { - return false, err - } - - if len(s) != SignatureLenBLSBLS12381 { - return false, nil - } - - // hash the input to 128 bytes - h := kmac.ComputeHash(data) - - // check for identity public key - if pk.isIdentity { - return false, nil - } - - verif := C.bls_verify((*C.E2)(&pk.point), - (*C.uchar)(&s[0]), - (*C.uchar)(&h[0]), - (C.int)(len(h))) - - switch verif { - case invalid: - return false, nil - case valid: - return true, nil - default: - return false, fmt.Errorf("signature verification failed: code %d", verif) - } -} - -// IsBLSSignatureIdentity checks whether the input signature is -// the identity signature (point at infinity in G1). -// -// An identity signature is always an invalid signature even when -// verified against the identity public key. -// This identity check is useful when an aggregated signature is -// suspected to be equal to identity, which avoids failing the aggregated -// signature verification. -func IsBLSSignatureIdentity(s Signature) bool { - return bytes.Equal(s, g1Serialization) -} - -// generatePrivateKey deterministically generates a private key for BLS on the BLS12-381 curve. -// The minimum size of the input seed is 32 bytes. -// -// It is recommended to use a secure crypto RNG to generate the seed. -// Otherwise, the seed must have enough entropy. -// -// The generated private key (resp. its corresponding public key) is guaranteed -// to not be equal to the identity element of Z_r (resp. G2). -func (a *blsBLS12381Algo) generatePrivateKey(ikm []byte) (PrivateKey, error) { - if len(ikm) < KeyGenSeedMinLen || len(ikm) > KeyGenSeedMaxLen { - return nil, invalidInputsErrorf( - "seed length should be at least %d bytes and at most %d bytes", - KeyGenSeedMinLen, KeyGenSeedMaxLen) - } - - // HKDF parameters - - // use SHA2-256 as the building block H in HKDF - hashFunction := sha256.New - // salt = H(UTF-8("BLS-SIG-KEYGEN-SALT-")) as per draft-irtf-cfrg-bls-signature-05 section 2.3. - saltString := "BLS-SIG-KEYGEN-SALT-" - hasher := hashFunction() - hasher.Write([]byte(saltString)) - salt := make([]byte, hasher.Size()) - hasher.Sum(salt[:0]) - - // L is the OKM length - // L = ceil((3 * ceil(log2(r))) / 16), which makes L (security_bits/8) bytes larger than the size of r - okmLength := (3 * frBytesLen) / 2 - - // HKDF secret = IKM || I2OSP(0, 1) - secret := make([]byte, len(ikm)+1) - copy(secret, ikm) - defer overwrite(secret) // overwrite secret - // HKDF info = key_info || I2OSP(L, 2) - keyInfo := "" // use empty key diversifier.
TODO: update header to accept input identifier - info := append([]byte(keyInfo), byte(okmLength>>8), byte(okmLength)) - - sk := newPrKeyBLSBLS12381(nil) - for { - // instantiate HKDF and extract L bytes - reader := hkdf.New(hashFunction, secret, salt, info) - okm := make([]byte, okmLength) - n, err := reader.Read(okm) - if err != nil || n != okmLength { - return nil, fmt.Errorf("key generation failed because of the HKDF reader, %d bytes were read: %w", - n, err) - } - defer overwrite(okm) // overwrite okm - - // map the bytes to a private key using modular reduction - // SK = OS2IP(OKM) mod r - isZero := mapToFr(&sk.scalar, okm) - if !isZero { - return sk, nil - } - - // update salt = H(salt) - hasher.Reset() - hasher.Write(salt) - salt = hasher.Sum(salt[:0]) - } -} - -const invalidBLSSignatureHeader = byte(0xE0) - -// BLSInvalidSignature returns an invalid signature that fails when verified -// with any message and public key, which can be used for testing. -// -// The signature bytes represent an invalid serialization of a point which -// makes the verification fail early. The verification would return (false, nil). -func BLSInvalidSignature() Signature { - signature := make([]byte, SignatureLenBLSBLS12381) - signature[0] = invalidBLSSignatureHeader // invalid header as per the Zcash serialization - return signature -} - -// decodePrivateKey decodes a slice of bytes into a private key. -// Decoding assumes a big-endian byte format. -// It checks the scalar is non-zero and is less than the group order. -func (a *blsBLS12381Algo) decodePrivateKey(privateKeyBytes []byte) (PrivateKey, error) { - sk := newPrKeyBLSBLS12381(nil) - - err := readScalarFrStar(&sk.scalar, privateKeyBytes) - if err != nil { - return nil, fmt.Errorf("failed to read the private key: %w", err) - } - return sk, nil -} - -// decodePublicKey decodes a slice of bytes into a public key. -// This function includes a membership check in G2. -// -// Note the function does not reject the infinity point (identity element of G2). -// However, the comparison to identity is cached in the `PublicKey` structure for -// a faster check during signature verifications. Any verification against an identity -// public key outputs `false`. -func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, error) { - if len(publicKeyBytes) != PubKeyLenBLSBLS12381 { - return nil, invalidInputsErrorf("input length must be %d, got %d", - PubKeyLenBLSBLS12381, len(publicKeyBytes)) - } - var pk pubKeyBLSBLS12381 - err := readPointE2(&pk.point, publicKeyBytes) - if err != nil { - return nil, fmt.Errorf("decode public key failed: %w", err) - } - - // membership check in G2 - if !bool(C.E2_in_G2((*C.E2)(&pk.point))) { - return nil, invalidInputsErrorf("input key is infinity or does not encode a BLS12-381 point in the valid group") - } - - // check whether the point is infinity and cache the result - pk.isIdentity = (&pk.point).isInfinity() - - return &pk, nil -} - -// decodePublicKeyCompressed decodes a slice of bytes into a public key.
-// since we use the compressed representation by default, this checks the default and delegates to decodePublicKey -func (a *blsBLS12381Algo) decodePublicKeyCompressed(publicKeyBytes []byte) (PublicKey, error) { - if !isG2Compressed() { - panic("library is not configured to use compressed public key serialization") - } - return a.decodePublicKey(publicKeyBytes) -} - -// prKeyBLSBLS12381 is the private key of BLS using BLS12_381, it implements PrivateKey -type prKeyBLSBLS12381 struct { - // public key - pk *pubKeyBLSBLS12381 - // private key data - scalar scalar -} - -var _ PrivateKey = (*prKeyBLSBLS12381)(nil) - -// newPrKeyBLSBLS12381 creates a new BLS private key with the given scalar. -// If no scalar is provided, the function allocates an -// empty scalar. -func newPrKeyBLSBLS12381(x *scalar) *prKeyBLSBLS12381 { - if x != nil { - return &prKeyBLSBLS12381{ - // the embedded public key is only computed when needed - scalar: *x, - } - } - return &prKeyBLSBLS12381{} -} - -// Algorithm returns the Signing Algorithm -func (sk *prKeyBLSBLS12381) Algorithm() SigningAlgorithm { - return BLSBLS12381 -} - -// Size returns the private key length in bytes -func (sk *prKeyBLSBLS12381) Size() int { - return PrKeyLenBLSBLS12381 -} - -// computePublicKey generates the public key corresponding to -// the input private key. The function makes sure the public key -// is valid in G2. -func (sk *prKeyBLSBLS12381) computePublicKey() { - var newPk pubKeyBLSBLS12381 - // compute public key pk = g2^sk - generatorScalarMultG2(&newPk.point, &sk.scalar) - - // cache the identity comparison - newPk.isIdentity = (&sk.scalar).isZero() - - sk.pk = &newPk -} - -// PublicKey returns the public key corresponding to the private key -func (sk *prKeyBLSBLS12381) PublicKey() PublicKey { - if sk.pk != nil { - return sk.pk - } - sk.computePublicKey() - return sk.pk -} - -// Encode returns a byte encoding of the private key. -// The encoding is a raw big-endian encoding padded to the size of the group order -func (a *prKeyBLSBLS12381) Encode() []byte { - dest := make([]byte, frBytesLen) - writeScalar(dest, &a.scalar) - return dest -} - -// Equals checks if two private keys are equal. -func (sk *prKeyBLSBLS12381) Equals(other PrivateKey) bool { - otherBLS, ok := other.(*prKeyBLSBLS12381) - if !ok { - return false - } - return (&sk.scalar).equals(&otherBLS.scalar) -} - -// String returns the hex string representation of the key. -func (sk *prKeyBLSBLS12381) String() string { - return sk.scalar.String() -} - -// pubKeyBLSBLS12381 is the public key of BLS using BLS12_381, -// it implements PublicKey. -type pubKeyBLSBLS12381 struct { - // The package guarantees an instance is only created with a point - // on the correct G2 subgroup. No membership check is needed when the - // instance is used in any BLS function. - // However, an instance can be created with an infinity point. Although - // infinity is a valid G2 point, some BLS functions fail (return false) - // when used with an infinity point. The package caches the infinity - // comparison in pubKeyBLSBLS12381 for a faster check. The package makes - // sure the comparison is performed after an instance is created. - // - // public key G2 point - point pointE2 - // G2 identity check cache - isIdentity bool -} - -var _ PublicKey = (*pubKeyBLSBLS12381)(nil) - -// newPubKeyBLSBLS12381 creates a new BLS public key with the given point. -// If no point is provided, the function allocates an -// empty point.
-func newPubKeyBLSBLS12381(p *pointE2) *pubKeyBLSBLS12381 { - if p != nil { - key := &pubKeyBLSBLS12381{ - point: *p, - } - // cache the identity comparison for a faster check - // during signature verifications - key.isIdentity = p.isInfinity() - return key - } - return &pubKeyBLSBLS12381{} -} - -// Algorithm returns the Signing Algorithm -func (pk *pubKeyBLSBLS12381) Algorithm() SigningAlgorithm { - return BLSBLS12381 -} - -// Size returns the public key length in bytes -func (pk *pubKeyBLSBLS12381) Size() int { - return PubKeyLenBLSBLS12381 -} - -// EncodeCompressed returns a byte encoding of the public key. -// The encoding is a compressed encoding of the point -// [zcash] https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format- -func (a *pubKeyBLSBLS12381) EncodeCompressed() []byte { - if !isG2Compressed() { - panic("library is not configured to use compressed public key serialization") - } - return a.Encode() -} - -// Encode returns a byte encoding of the public key (a G2 point). -// The current encoding is a compressed serialization of G2 following [zcash] https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format- -// -// The function should evolve in the future to support uncompressed serialization too. -func (a *pubKeyBLSBLS12381) Encode() []byte { - dest := make([]byte, g2BytesLen) - writePointE2(dest, &a.point) - return dest -} - -// Equals checks if two public keys are equal -func (pk *pubKeyBLSBLS12381) Equals(other PublicKey) bool { - otherBLS, ok := other.(*pubKeyBLSBLS12381) - if !ok { - return false - } - return pk.point.equals(&otherBLS.point) -} - -// String returns the hex string representation of the key. -func (pk *pubKeyBLSBLS12381) String() string { - return pk.point.String() -} - -// This is only a TEST function. -// signWithXMDSHA256 signs a message using XMD_SHA256 as a hash to field. -// -// The function is in this file because cgo can't be used in go test files. -// TODO: implement a hasher for XMD SHA256 and use the `Sign` function.
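As a reference for how the deleted key types above fit together, here is a hedged end-to-end sketch using the package's pre-removal public API (`GeneratePrivateKey`, `KeyGenSeedMinLen` and `BLSBLS12381` were public identifiers; the message and tag values are illustrative):

```
package main

import (
	"crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto" // pre-removal path; now github.com/onflow/crypto
)

func main() {
	// seed the deterministic key generation with fresh entropy
	seed := make([]byte, crypto.KeyGenSeedMinLen)
	if _, err := rand.Read(seed); err != nil {
		panic(err)
	}
	sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed)
	if err != nil {
		panic(err)
	}
	// sign and verify with the KMAC128-based hash-to-curve hasher
	kmac := crypto.NewExpandMsgXOFKMAC128("EXAMPLE-V00-CS00-with-")
	msg := []byte("message to sign")
	sig, err := sk.Sign(msg, kmac)
	if err != nil {
		panic(err)
	}
	valid, err := sk.PublicKey().Verify(sig, msg, kmac)
	fmt.Println(valid, err) // true <nil>
}
```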
-func (sk *prKeyBLSBLS12381) signWithXMDSHA256(data []byte) Signature { - - dst := []byte("BLS_SIG_BLS12381G1_XMD:SHA-256_SSWU_RO_NUL_") - hash := make([]byte, expandMsgOutput) - // XMD using SHA256 - C.xmd_sha256((*C.uchar)(&hash[0]), - (C.int)(expandMsgOutput), - (*C.uchar)(&data[0]), (C.int)(len(data)), - (*C.uchar)(&dst[0]), (C.int)(len(dst))) - - // sign the hash - s := make([]byte, SignatureLenBLSBLS12381) - C.bls_sign((*C.uchar)(&s[0]), - (*C.Fr)(&sk.scalar), - (*C.uchar)(&hash[0]), - (C.int)(len(hash))) - return s -} diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c deleted file mode 100644 index fc29046e47f..00000000000 --- a/crypto/bls12381_utils.c +++ /dev/null @@ -1,1174 +0,0 @@ -// this file contains utility functions for the curve BLS 12-381 -// these tools are shared by the BLS signature scheme, the BLS based threshold -// signature, BLS-SPoCK and the BLS distributed key generation protocols - -#include "bls12381_utils.h" -#include "assert.h" -#include "bls_include.h" - -// compile all blst C src along with this file -#include "blst_src.c" - -// make sure flow crypto types are consistent with BLST types -void types_sanity(void) { - assert(sizeof(Fr) == sizeof(vec256)); - assert(sizeof(Fp) == sizeof(vec384)); - assert(sizeof(Fp2) == sizeof(vec384x)); - assert(sizeof(E1) == sizeof(POINTonE1)); - assert(sizeof(E2) == sizeof(POINTonE2)); - assert(sizeof(Fp12) == sizeof(vec384fp12)); -} - -// ------------------- Fr utilities - -// Montgomery constant R related to the curve order r -// R = (1<<256) mod r -const Fr BLS12_381_rR = {{ - TO_LIMB_T(0x1824b159acc5056f), - TO_LIMB_T(0x998c4fefecbc4ff5), - TO_LIMB_T(0x5884b7fa00034802), - TO_LIMB_T(0x00000001fffffffe), -}}; - -// returns true if a is zero and false otherwise -bool Fr_is_zero(const Fr *a) { return vec_is_zero(a, sizeof(Fr)); } - -// returns true if a == b and false otherwise -bool Fr_is_equal(const Fr *a, const Fr *b) { - return vec_is_equal(a, b, sizeof(Fr)); -} - -// sets `a` to limb `l` -void Fr_set_limb(Fr *a, const limb_t l) { - vec_zero((byte *)a + sizeof(limb_t), sizeof(Fr) - sizeof(limb_t)); - *((limb_t *)a) = l; -} - -void Fr_copy(Fr *res, const Fr *a) { - if ((uptr_t)a == (uptr_t)res) { - return; - } - vec_copy((byte *)res, (byte *)a, sizeof(Fr)); -} - -// sets `a` to 0 -void Fr_set_zero(Fr *a) { vec_zero((byte *)a, sizeof(Fr)); } - -void Fr_add(Fr *res, const Fr *a, const Fr *b) { - add_mod_256((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_r); -} - -void Fr_sub(Fr *res, const Fr *a, const Fr *b) { - sub_mod_256((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_r); -} - -void Fr_neg(Fr *res, const Fr *a) { - cneg_mod_256((limb_t *)res, (limb_t *)a, 1, BLS12_381_r); -} - -// res = a*b*R^(-1) -void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b) { - mul_mont_sparse_256((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_r, r0); -} - -// res = a^2 * R^(-1) -void Fr_squ_montg(Fr *res, const Fr *a) { - sqr_mont_sparse_256((limb_t *)res, (limb_t *)a, BLS12_381_r, r0); -} - -// res = a*R -void Fr_to_montg(Fr *res, const Fr *a) { - mul_mont_sparse_256((limb_t *)res, (limb_t *)a, BLS12_381_rRR, BLS12_381_r, - r0); -} - -// res = a*R^(-1) -void Fr_from_montg(Fr *res, const Fr *a) { - from_mont_256((limb_t *)res, (limb_t *)a, BLS12_381_r, r0); -} - -// res = a^(-1)*R -void Fr_inv_montg_eucl(Fr *res, const Fr *a) { - // copied and modified from BLST code - // Copyright Supranational LLC - static const vec256 rx2 = { - /* left-aligned value of the modulus */ - TO_LIMB_T(0xfffffffe00000002), - 
TO_LIMB_T(0xa77b4805fffcb7fd), - TO_LIMB_T(0x6673b0101343b00a), - TO_LIMB_T(0xe7db4ea6533afa90), - }; - vec512 temp; - ct_inverse_mod_256(temp, (limb_t *)a, BLS12_381_r, rx2); - redc_mont_256((limb_t *)res, temp, BLS12_381_r, r0); -} - -// computes the sum of the array elements and writes the sum in jointx -void Fr_sum_vector(Fr *jointx, const Fr x[], const int x_len) { - Fr_set_zero(jointx); - for (int i = 0; i < x_len; i++) { - Fr_add(jointx, jointx, &x[i]); - } -} - -// internal type of BLST `pow256` uses bytes little endian. -// input is bytes big endian as used by Flow crypto lib external scalars. -static void pow256_from_be_bytes(pow256 ret, const byte a[Fr_BYTES]) { - byte *b = (byte *)a + Fr_BYTES - 1; - if ((uptr_t)ret == (uptr_t)a) { // swap in place - for (int i = 0; i < Fr_BYTES / 2; i++) { - byte tmp = *ret; - *(ret++) = *b; - *(b--) = tmp; - } - } else { - for (int i = 0; i < Fr_BYTES; i++) { - *(ret++) = *(b--); - } - } -} - -// internal type of BLST `pow256` uses bytes little endian. -static void pow256_from_Fr(pow256 ret, const Fr *in) { - le_bytes_from_limbs(ret, (limb_t *)in, Fr_BYTES); -} - -// reads a scalar in `a` and checks it is a valid Fr element (a < r). -// input is bytes-big-endian. -// returns: -// - BAD_ENCODING if the length is invalid -// - BAD_VALUE if the scalar isn't in Fr -// - VALID if the scalar is valid -ERROR Fr_read_bytes(Fr *a, const byte *in, int in_len) { - if (in_len != Fr_BYTES) { - return BAD_ENCODING; - } - // compare to r using BLST internal function - pow256 tmp; - pow256_from_be_bytes(tmp, in); - // (check_mod_256 compares pow256 against a vec256!) - if (!check_mod_256(tmp, BLS12_381_r)) { - return BAD_VALUE; - } - vec_zero(tmp, sizeof(tmp)); - limbs_from_be_bytes((limb_t *)a, in, Fr_BYTES); - return VALID; -} - -// reads a scalar in `a` and checks it is a valid Fr_star element (0 < a < r). -// input bytes are big endian. -// returns: -// - BAD_ENCODING if the length is invalid -// - BAD_VALUE if the scalar isn't in Fr_star -// - VALID if the scalar is valid -ERROR Fr_star_read_bytes(Fr *a, const byte *in, int in_len) { - int ret = Fr_read_bytes(a, in, in_len); - if (ret != VALID) { - return ret; - } - // check if a=0 - if (Fr_is_zero(a)) { - return BAD_VALUE; - } - return VALID; -} - -// write Fr element `a` in big endian bytes. -void Fr_write_bytes(byte *out, const Fr *a) { - // be_bytes_from_limbs works for both limb endianness types - be_bytes_from_limbs(out, (limb_t *)a, Fr_BYTES); -} - -// maps big-endian bytes of any size into an Fr element using modular reduction. -// Input is byte-big-endian, output is Fr (internally vec256). -// -// Note: could use redc_mont_256(vec256 ret, const vec512 a, const vec256 p, -// limb_t n0) to reduce 512 bits at a time. -static void Fr_from_be_bytes(Fr *out, const byte *in, const int in_len) { - // input can be written in base 2^|R|, with R the Montgomery constant - // N = l_1 + L_2*2^|R| .. + L_n*2^(|R|*(n-1)) - // Therefore N mod p can be expressed using R as: - // N mod p = l_1 + L_2*R .. 
+ L_n*R^(n-1) - Fr digit, radix; - Fr_set_zero(out); - Fr_copy(&radix, (Fr *)BLS12_381_rRR); // R^2 - - int n = in_len; - byte *p = (byte *)in + in_len; - while (n > Fr_BYTES) { - // limbs_from_be_bytes works for both limb endiannesses - limbs_from_be_bytes((limb_t *)&digit, p -= Fr_BYTES, Fr_BYTES); // l_i - Fr_mul_montg(&digit, &digit, - &radix); // l_i * R^i (i is the loop number starting at 1) - Fr_add(out, out, &digit); - Fr_mul_montg(&radix, &radix, (Fr *)BLS12_381_rRR); // R^(i+1) - n -= Fr_BYTES; - } - Fr_set_zero(&digit); - limbs_from_be_bytes((limb_t *)&digit, p - n, n); - Fr_mul_montg(&digit, &digit, &radix); - Fr_add(out, out, &digit); - // at this point : out = l_1*R + L_2*R^2 .. + L_n*R^n, - // reduce the extra R - Fr_from_montg(out, out); - // clean up possible sensitive data - Fr_set_zero(&digit); -} - -// Reads a scalar from an array and maps it to Fr using modular reduction. -// Input is byte-big-endian as used by the external APIs. -// It returns true if scalar is zero and false otherwise. -bool map_bytes_to_Fr(Fr *a, const byte *in, int in_len) { - Fr_from_be_bytes(a, in, in_len); - return Fr_is_zero(a); -} - -// ------------------- Fp utilities - -// Montgomery constants related to the prime p -const Fp BLS12_381_pR = {ONE_MONT_P}; /* R mod p = (1<<384)%p */ - -// sets `a` to 0 -static void Fp_set_zero(Fp *a) { vec_zero((byte *)a, sizeof(Fp)); } - -// sets `a` to limb `l` -static void Fp_set_limb(Fp *a, const limb_t l) { - vec_zero((byte *)a + sizeof(limb_t), sizeof(Fp) - sizeof(limb_t)); - *((limb_t *)a) = l; -} - -void Fp_copy(Fp *res, const Fp *a) { - if ((uptr_t)a == (uptr_t)res) { - return; - } - vec_copy((byte *)res, (byte *)a, sizeof(Fp)); -} - -static void Fp_add(Fp *res, const Fp *a, const Fp *b) { - add_mod_384((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_P); -} - -static void Fp_sub(Fp *res, const Fp *a, const Fp *b) { - sub_mod_384((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_P); -} - -static void Fp_neg(Fp *res, const Fp *a) { - cneg_mod_384((limb_t *)res, (limb_t *)a, 1, BLS12_381_P); -} - -// checks if `a` is a quadratic residue in Fp. If yes, it computes -// the square root in `res`. -// -// The boolean output is valid whether `a` is in Montgomery form or not, -// since montgomery constant `R` is a quadratic residue. -// However, the square root is valid only if `a` is in montgomery form. -static bool Fp_sqrt_montg(Fp *res, const Fp *a) { - return sqrt_fp((limb_t *)res, (limb_t *)a); -} - -static bool Fp_check(const Fp *a) { - // use same method as in BLST internal function - // which seems the most efficient. The method uses the assembly-based - // modular addition instead of limbs comparison - Fp temp; - Fp_add(&temp, a, &ZERO_384); - return vec_is_equal(&temp, a, Fp_BYTES); - // no need to clear `tmp` as no current use-case involves sensitive data being - // passed as `a` -} - -// res = a*b*R^(-1) -void Fp_mul_montg(Fp *res, const Fp *a, const Fp *b) { - mul_mont_384((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_P, p0); -} - -// res = a^2 * R^(-1) -void Fp_squ_montg(Fp *res, const Fp *a) { - sqr_mont_384((limb_t *)res, (limb_t *)a, BLS12_381_P, p0); -} - -// res = a*R -void Fp_to_montg(Fp *res, const Fp *a) { - mul_mont_384((limb_t *)res, (limb_t *)a, BLS12_381_RR, BLS12_381_P, p0); -} - -// res = a*R^(-1) -void Fp_from_montg(Fp *res, const Fp *a) { - from_mont_384((limb_t *)res, (limb_t *)a, BLS12_381_P, p0); -} - -// reads a scalar in `out` and checks it is a valid Fp element (out < p). -// input is bytes-big-endian. 
-// returns: -// - BAD_ENCODING if the length is invalid -// - BAD_VALUE if the scalar isn't in Fp -// - VALID if the scalar is valid -ERROR Fp_read_bytes(Fp *out, const byte *in, int in_len) { - if (in_len != Fp_BYTES) { - return BAD_ENCODING; - } - limbs_from_be_bytes((limb_t *)out, in, Fp_BYTES); - // compare read scalar to p - if (!Fp_check(out)) { - return BAD_VALUE; - } - return VALID; -} - -// write Fp element to `out`, -// assuming `out` has `Fp_BYTES` allocated bytes. -void Fp_write_bytes(byte *out, const Fp *a) { - be_bytes_from_limbs(out, (limb_t *)a, Fp_BYTES); -} - -// returns the sign of y: -// 1 if y > (p - 1)/2 and 0 otherwise. -// y is in montgomery form! -static byte Fp_get_sign(const Fp *y) { - // - BLST's sgn0_pty_mont_384 requires input to be in Montg form. - // - The needed sign bit is on position 1 - return (sgn0_pty_mont_384((const limb_t *)y, BLS12_381_P, p0) >> 1) & 1; -} - -// ------------------- Fp^2 utilities - -// sets `a` to limb `l` -static void Fp2_set_limb(Fp2 *a, const limb_t l) { - Fp_set_limb(&real(a), l); - Fp_set_zero(&imag(a)); -} - -static void Fp2_add(Fp2 *res, const Fp2 *a, const Fp2 *b) { - add_mod_384x((vec384 *)res, (vec384 *)a, (vec384 *)b, BLS12_381_P); -} - -static void Fp2_sub(Fp2 *res, const Fp2 *a, const Fp2 *b) { - sub_mod_384x((vec384 *)res, (vec384 *)a, (vec384 *)b, BLS12_381_P); -} - -static void Fp2_neg(Fp2 *res, const Fp2 *a) { - cneg_mod_384(real(res), real(a), 1, BLS12_381_P); - cneg_mod_384(imag(res), imag(a), 1, BLS12_381_P); -} - -// res = a*b in montgomery form -static void Fp2_mul_montg(Fp2 *res, const Fp2 *a, const Fp2 *b) { - mul_mont_384x((vec384 *)res, (vec384 *)a, (vec384 *)b, BLS12_381_P, p0); -} - -// res = a^2 in montgomery form -static void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { - sqr_mont_384x((vec384 *)res, (vec384 *)a, BLS12_381_P, p0); -} - -// checks if `a` is a quadratic residue in Fp^2. If yes, it computes -// the square root in `res`. -// -// The boolean output is valid whether `a` is in Montgomery form or not, -// since montgomery constant `R` is itself a quadratic residue. -// However, the square root is correct only if `a` is in montgomery form -// (the square root would be in montgomery form too). -static bool Fp2_sqrt_montg(Fp2 *res, const Fp2 *a) { - return sqrt_fp2((vec384 *)res, (vec384 *)a); -} - -// returns the sign of y: -// sign(y_0) if y_1 = 0, else sign(y_1). -// y coordinates must be in montgomery form! -static byte Fp2_get_sign(Fp2 *y) { - // - BLST's sgn0_pty_mont_384x requires input to be in montgomery form. - // - the sign bit is on position 1 - return (sgn0_pty_mont_384x((vec384 *)y, BLS12_381_P, p0) >> 1) & 1; -} - -// reads an Fp^2 element in `a`. -// input is a serialization of real(a) concatenated with the serialization of imag(a); -// real(a) and imag(a) are both Fp elements. -// returns: -// - BAD_ENCODING if the length is invalid -// - BAD_VALUE if a coordinate isn't in Fp -// - VALID if the scalar is valid -static ERROR Fp2_read_bytes(Fp2 *a, const byte *in, int in_len) { - if (in_len != Fp2_BYTES) { - return BAD_ENCODING; - } - ERROR ret = Fp_read_bytes(&real(a), in, Fp_BYTES); - if (ret != VALID) { - return ret; - } - ret = Fp_read_bytes(&imag(a), in + Fp_BYTES, Fp_BYTES); - if (ret != VALID) { - return ret; - } - return VALID; -} - -// write Fp2 element to `out`, -// assuming `out` has `Fp2_BYTES` allocated bytes.
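The Fp^2 wire layout read and written above (real part followed by imaginary part, each a big-endian Fp encoding of 48 bytes on BLS12-381) can be illustrated with a small Go sketch; `fpBytes` and `splitFp2` are hypothetical helper names, not part of the package:

```
package main

import (
	"errors"
	"fmt"
)

// fpBytes is the serialized size of one Fp element on BLS12-381.
const fpBytes = 48

// splitFp2 splits a serialized Fp2 element into its real and imaginary
// Fp encodings, mirroring the layout that Fp2_read_bytes parses above.
func splitFp2(buf []byte) (re, im []byte, err error) {
	if len(buf) != 2*fpBytes {
		return nil, nil, errors.New("invalid Fp2 encoding length")
	}
	return buf[:fpBytes], buf[fpBytes:], nil
}

func main() {
	buf := make([]byte, 2*fpBytes)
	re, im, _ := splitFp2(buf)
	fmt.Println(len(re), len(im)) // 48 48
}
```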
-void Fp2_write_bytes(byte *out, const Fp2 *a) { - Fp_write_bytes(out, &real(a)); - Fp_write_bytes(out + Fp_BYTES, &imag(a)); -} - -// ------------------- E1 utilities - -void E1_copy(E1 *res, const E1 *p) { - if ((uptr_t)p == (uptr_t)res) { - return; - } - vec_copy(res, p, sizeof(E1)); -} - -// checks p1 == p2 -bool E1_is_equal(const E1 *p1, const E1 *p2) { - // `POINTonE1_is_equal` includes the infinity case - return POINTonE1_is_equal((const POINTonE1 *)p1, (const POINTonE1 *)p2); -} - -// compare `p` to infinity -bool E1_is_infty(const E1 *p) { - // BLST infinity points are defined by Z=0 - return vec_is_zero(p->z, sizeof(p->z)); -} - -// set `p` to infinity -void E1_set_infty(E1 *p) { - // BLST infinity points are defined by Z=0 - vec_zero(p->z, sizeof(p->z)); -} - -// converts an E1 point from Jacobian into affine coordinates (z=1) -void E1_to_affine(E1 *res, const E1 *p) { - // optimize in case coordinates are already affine - if (vec_is_equal(p->z, BLS12_381_pR, Fp_BYTES)) { - E1_copy(res, p); - return; - } - // convert from Jacobian - POINTonE1_from_Jacobian((POINTonE1 *)res, (const POINTonE1 *)p); -} - -// checks affine point `p` is in E1 -bool E1_affine_on_curve(const E1 *p) { - // BLST's `POINTonE1_affine_on_curve` does not include the infinity case! - return POINTonE1_affine_on_curve((POINTonE1_affine *)p) | E1_is_infty(p); -} - -// checks if input E1 point is on the subgroup G1. -// It assumes input `p` is on E1. -bool E1_in_G1(const E1 *p) { - // currently uses Scott method - return POINTonE1_in_G1((const POINTonE1 *)p); -} - -// E1_read_bytes imports an E1(Fp) point from a buffer in a compressed or -// uncompressed form. The resulting point is guaranteed to be on curve E1 (no G1 -// check is included). Expected serialization follows: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -// -// returns: -// - BAD_ENCODING if the length is invalid or serialization header bits are -// invalid -// - BAD_VALUE if Fp coordinates couldn't deserialize -// - POINT_NOT_ON_CURVE if deserialized point isn't on E1 -// - VALID if deserialization is valid - -// Note: could use POINTonE1_Deserialize_BE and POINTonE1_Uncompress_Z, -// but needs to update the logic around the G1 subgroup check -ERROR E1_read_bytes(E1 *a, const byte *in, const int in_len) { - // check the length - if (in_len != G1_SER_BYTES) { - return BAD_ENCODING; - } - - // check the compression bit - int compressed = in[0] >> 7; - if ((compressed == 1) != (G1_SERIALIZATION == COMPRESSED)) { - return BAD_ENCODING; - } - - // check if the point is infinity - int is_infinity = in[0] & 0x40; - if (is_infinity) { - // the remaining bits need to be cleared - if (in[0] & 0x3F) { - return BAD_ENCODING; - } - for (int i = 1; i < G1_SER_BYTES - 1; i++) { - if (in[i]) { - return BAD_ENCODING; - } - } - E1_set_infty(a); - return VALID; - } - - // read the sign bit and check for consistency - int y_sign = (in[0] >> 5) & 1; - if (y_sign && (!compressed)) { - return BAD_ENCODING; - } - - // use a temporary buffer to mask the header bits and read a.x - byte temp[Fp_BYTES]; - memcpy(temp, in, Fp_BYTES); - temp[0] &= 0x1F; // clear the header bits - ERROR ret = Fp_read_bytes(&a->x, temp, sizeof(temp)); - if (ret != VALID) { - return ret; - } - Fp_to_montg(&a->x, &a->x); - - // set a.z to 1 - Fp_copy(&a->z, &BLS12_381_pR); - - if (G1_SERIALIZATION == UNCOMPRESSED) { - ret = Fp_read_bytes(&a->y, in + Fp_BYTES, sizeof(a->y)); - if (ret != VALID) { - return ret; - } -
Fp_to_montg(&a->y, &a->y); - // check read point is on curve - if (!E1_affine_on_curve(a)) { - return POINT_NOT_ON_CURVE; - } - return VALID; - } - - // compute the possible square root - Fp_squ_montg(&a->y, &a->x); - Fp_mul_montg(&a->y, &a->y, &a->x); // x^3 - Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in montg form - // check whether x^3+b is a quadratic residue - if (!Fp_sqrt_montg(&a->y, &a->y)) { - return POINT_NOT_ON_CURVE; - } - - // resulting (x,y) is guaranteed to be on curve (y is already in montg form) - if (Fp_get_sign(&a->y) != y_sign) { - Fp_neg(&a->y, &a->y); // flip y sign if needed - } - return VALID; -} - -// E1_write_bytes exports a point in E1(Fp) to a buffer in a compressed or -// uncompressed form. It assumes the buffer is of length G1_SER_BYTES. The -// serialization follows: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -void E1_write_bytes(byte *out, const E1 *a) { - if (E1_is_infty(a)) { - memset(out, 0, G1_SER_BYTES); - // set the infinity bit - out[0] = (G1_SERIALIZATION << 7) | (1 << 6); - return; - } - E1 tmp; - E1_to_affine(&tmp, a); - - Fp_from_montg(&tmp.x, &tmp.x); - Fp_write_bytes(out, &tmp.x); - - if (G1_SERIALIZATION == COMPRESSED) { - out[0] |= (Fp_get_sign(&tmp.y) << 5); - } else { - Fp_from_montg(&tmp.y, &tmp.y); - Fp_write_bytes(out + Fp_BYTES, &tmp.y); - } - // compression bit - out[0] |= (G1_SERIALIZATION << 7); -} - -// generic point addition that must handle doubling and points at infinity -void E1_add(E1 *res, const E1 *a, const E1 *b) { - POINTonE1_dadd((POINTonE1 *)res, (POINTonE1 *)a, (POINTonE1 *)b, NULL); -} - -// Point negation: res = -a -void E1_neg(E1 *res, const E1 *a) { - E1_copy(res, a); - POINTonE1_cneg((POINTonE1 *)res, 1); -} - -// Exponentiation of a generic point `p` in E1, res = expo.p -void E1_mult(E1 *res, const E1 *p, const Fr *expo) { - pow256 tmp; - pow256_from_Fr(tmp, expo); - POINTonE1_mult_glv((POINTonE1 *)res, (POINTonE1 *)p, tmp); - vec_zero(&tmp, sizeof(tmp)); -} - -// computes the sum of the E1 array elements `y[i]` and writes it in `sum`. -void E1_sum_vector(E1 *sum, const E1 *y, const int len) { - E1_set_infty(sum); - for (int i = 0; i < len; i++) { - E1_add(sum, sum, &y[i]); - } -} - -// Computes the sum of input E1 elements flattened in a single byte -// array `in_bytes` of `in_len` bytes, and writes the sum (an E1 element) as -// bytes in `out`. -// The function does not check membership of E1 inputs in the G1 -// subgroup. The function uses byte pointers to minimize Cgo calls from the Go -// layer.
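On the Go side, this flattened-bytes entry point backed the package's signature aggregation. A hedged sketch of the pre-removal public API (`AggregateBLSSignatures` was exported by the package; the wrapper function is hypothetical):

```
package example

import "github.com/onflow/flow-go/crypto" // pre-removal path; now github.com/onflow/crypto

// aggregate sums BLS signatures; under the hood this reduces to the G1
// point summation implemented by E1_sum_vector_byte above.
func aggregate(sigs []crypto.Signature) (crypto.Signature, error) {
	return crypto.AggregateBLSSignatures(sigs)
}
```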
-int E1_sum_vector_byte(byte *out, const byte *in_bytes, const int in_len) { - int error = UNDEFINED; - // sanity check that `in_len` is a multiple of `G1_SER_BYTES` - if (in_len % G1_SER_BYTES) { - error = INVALID; - goto mem_error; - } - int n = in_len / G1_SER_BYTES; // number of signatures - - E1 *vec = (E1 *)malloc(n * sizeof(E1)); - if (!vec) { - goto mem_error; - } - - // import the points from the array - for (int i = 0; i < n; i++) { - // deserialize each point from the input array - if (E1_read_bytes(&vec[i], &in_bytes[G1_SER_BYTES * i], G1_SER_BYTES) != - VALID) { - error = INVALID; - goto out; - } - } - // sum the points - E1 acc; - E1_sum_vector(&acc, vec, n); - // export the result - E1_write_bytes(out, &acc); - error = VALID; -out: - free(vec); -mem_error: - return error; -} - -// Exponentiation of generator g1 of G1, res = expo.g1 -void G1_mult_gen(E1 *res, const Fr *expo) { - pow256 tmp; - pow256_from_Fr(tmp, expo); - POINTonE1_mult_glv((POINTonE1 *)res, &BLS12_381_G1, tmp); - vec_zero(&tmp, sizeof(tmp)); -} - -// Reads scalar bytes and maps them to Fp using modular reduction. -// The output is in Montgomery form. -// `in_len` must be less than or equal to 96 bytes and must be a multiple of 8. -// This function is only used by `map_to_G1` where input is 64 bytes. -// input `in_len` is not checked to satisfy the conditions above. -static void map_96_bytes_to_Fp(Fp *a, const byte *in, int in_len) { - vec768 tmp; - vec_zero(&tmp, sizeof(tmp)); - limbs_from_be_bytes((limb_t *)tmp, in, in_len); - redc_mont_384((limb_t *)a, tmp, BLS12_381_P, p0); // aR^(-2) - Fp_mul_montg(a, a, (Fp *)BLS12_381_RRRR); // aR -} - -// maps bytes input `hash` to G1. -// `hash_len` must be `MAP_TO_G1_INPUT_LEN` (128 bytes). -// It uses construction 2 from section 5 in https://eprint.iacr.org/2019/403.pdf -int map_to_G1(E1 *h, const byte *hash, const int hash_len) { - // sanity check of length - if (hash_len != MAP_TO_G1_INPUT_LEN) { - return INVALID; - } - // map to field elements - Fp u[2]; - const int half = MAP_TO_G1_INPUT_LEN / 2; - map_96_bytes_to_Fp(&u[0], hash, half); - map_96_bytes_to_Fp(&u[1], hash + half, half); - // map field elements to G1 - // inputs must be in Montgomery form - map_to_g1((POINTonE1 *)h, (limb_t *)&u[0], (limb_t *)&u[1]); - return VALID; -} - -// maps the bytes to a point in G1. -// `len` should be at least Fr_BYTES. -// this is a testing function only, it should not be used in any protocol! -void unsafe_map_bytes_to_G1(E1 *p, const byte *bytes, int len) { - assert(len >= Fr_BYTES); - // map to Fr - Fr log; - map_bytes_to_Fr(&log, bytes, len); - // multiplies G1 generator by a random scalar - G1_mult_gen(p, &log); -} - -// maps bytes to a point in E1\G1. -// `in_len` must be at least 96 bytes. -// this is a testing function only, it should not be used in any protocol! -void unsafe_map_bytes_to_G1complement(E1 *p, const byte *in, int in_len) { - assert(in_len >= 96); - Fp u; - map_96_bytes_to_Fp(&u, in, 96); - // map to E1's isogenous and then to E1 - map_to_isogenous_E1((POINTonE1 *)p, u); - isogeny_map_to_E1((POINTonE1 *)p, (POINTonE1 *)p); - // clear G1 order - E1_mult(p, p, (Fr *)&BLS12_381_r); -} - -// ------------------- E2 utilities - -const E2 *BLS12_381_g2 = (const E2 *)&BLS12_381_G2; -const E2 *BLS12_381_minus_g2 = (const E2 *)&BLS12_381_NEG_G2; - -// E2_read_bytes imports an E2(Fp^2) point from a buffer in a compressed or -// uncompressed form. The resulting point is guaranteed to be on curve E2 (no G2 -// check is included). -// The E2 point is in affine coordinates.
This avoids further conversions -// when the point is used in multiple pairing computations. -// -// returns: -// - BAD_ENCODING if the length is invalid or serialization header bits are -// invalid -// - BAD_VALUE if Fp^2 coordinates couldn't deserialize -// - POINT_NOT_ON_CURVE if deserialized point isn't on E2 -// - VALID if deserialization is valid -// -// Note: could use POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, -// but would need to update the logic around the G2 subgroup check. -ERROR E2_read_bytes(E2 *a, const byte *in, const int in_len) { - // check the length - if (in_len != G2_SER_BYTES) { - return BAD_ENCODING; - } - - // check the compression bit - int compressed = in[0] >> 7; - if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { - return BAD_ENCODING; - } - - // check if the point is infinity - int is_infinity = in[0] & 0x40; - if (is_infinity) { - // the remaining bits need to be cleared - if (in[0] & 0x3F) { - return BAD_ENCODING; - } - for (int i = 1; i < G2_SER_BYTES - 1; i++) { - if (in[i]) { - return BAD_ENCODING; - } - } - E2_set_infty(a); - return VALID; - } - - // read the sign bit and check for consistency - int y_sign = (in[0] >> 5) & 1; - if (y_sign && (!compressed)) { - return BAD_ENCODING; - } - - // use a temporary buffer to mask the header bits and read a.x - byte temp[Fp2_BYTES]; - memcpy(temp, in, Fp2_BYTES); - temp[0] &= 0x1F; // clear the header bits - ERROR ret = Fp2_read_bytes(&a->x, temp, sizeof(temp)); - if (ret != VALID) { - return ret; - } - Fp2 *a_x = &(a->x); - Fp_to_montg(&real(a_x), &real(a_x)); - Fp_to_montg(&imag(a_x), &imag(a_x)); - - // set a.z to 1 - Fp2 *a_z = &(a->z); - Fp_copy(&real(a_z), &BLS12_381_pR); - Fp_set_zero(&imag(a_z)); - - Fp2 *a_y = &(a->y); - if (G2_SERIALIZATION == UNCOMPRESSED) { - ret = Fp2_read_bytes(a_y, in + Fp2_BYTES, sizeof(a->y)); - if (ret != VALID) { - return ret; - } - Fp_to_montg(&real(a_y), &real(a_y)); - Fp_to_montg(&imag(a_y), &imag(a_y)); - // check read point is on curve - if (!E2_affine_on_curve(a)) { - return POINT_NOT_ON_CURVE; - } - return VALID; - } - - // compute the possible square root - Fp2_squ_montg(a_y, a_x); - Fp2_mul_montg(a_y, a_y, a_x); // x^3 - Fp2_add(a_y, a_y, &B_E2); // B_E2 is already in Montg form - if (!Fp2_sqrt_montg(a_y, a_y)) // check whether x^3+b is a quadratic residue - return POINT_NOT_ON_CURVE; - - // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) - if (Fp2_get_sign(a_y) != y_sign) { - Fp2_neg(a_y, a_y); // flip y sign if needed - } - return VALID; -} - -// E2_write_bytes exports a point in E2(Fp^2) to a buffer in a compressed or -// uncompressed form.
It assumes buffer is of length G2_SER_BYTES The -// serialization follows: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -void E2_write_bytes(byte *out, const E2 *a) { - if (E2_is_infty(a)) { - // set the infinity bit - out[0] = (G2_SERIALIZATION << 7) | (1 << 6); - memset(out + 1, 0, G2_SER_BYTES - 1); - return; - } - E2 tmp; - E2_to_affine(&tmp, a); - - Fp2 *t_x = &(tmp.x); - Fp_from_montg(&real(t_x), &real(t_x)); - Fp_from_montg(&imag(t_x), &imag(t_x)); - Fp2_write_bytes(out, t_x); - - Fp2 *t_y = &(tmp.y); - if (G2_SERIALIZATION == COMPRESSED) { - out[0] |= (Fp2_get_sign(t_y) << 5); - } else { - Fp_from_montg(&real(t_y), &real(t_y)); - Fp_from_montg(&imag(t_y), &imag(t_y)); - Fp2_write_bytes(out + Fp2_BYTES, t_y); - } - - out[0] |= (G2_SERIALIZATION << 7); -} - -// set p to infinity -void E2_set_infty(E2 *p) { - // BLST infinity points are defined by Z=0 - vec_zero(p->z, sizeof(p->z)); -} - -// check if `p` is infinity -bool E2_is_infty(const E2 *p) { - // BLST infinity points are defined by Z=0 - return vec_is_zero(p->z, sizeof(p->z)); -} - -// checks affine point `p` is in E2 -bool E2_affine_on_curve(const E2 *p) { - // BLST's `POINTonE2_affine_on_curve` does not include the infinity case! - return POINTonE2_affine_on_curve((POINTonE2_affine *)p) | E2_is_infty(p); -} - -// checks p1 == p2 -bool E2_is_equal(const E2 *p1, const E2 *p2) { - // `POINTonE2_is_equal` includes the infinity case - return POINTonE2_is_equal((const POINTonE2 *)p1, (const POINTonE2 *)p2); -} - -// res = p -void E2_copy(E2 *res, const E2 *p) { - if ((uptr_t)p == (uptr_t)res) { - return; - } - vec_copy(res, p, sizeof(E2)); -} - -// converts an E2 point from Jacobian into affine coordinates (z=1) -void E2_to_affine(E2 *res, const E2 *p) { - // optimize in case coordinates are already affine - if (vec_is_equal(p->z, BLS12_381_Rx.p2, sizeof(p->z))) { - E2_copy(res, p); - return; - } - // convert from Jacobian - POINTonE2_from_Jacobian((POINTonE2 *)res, (const POINTonE2 *)p); -} - -// generic point addition that must handle doubling and points at infinity -void E2_add(E2 *res, const E2 *a, const E2 *b) { - POINTonE2_dadd((POINTonE2 *)res, (POINTonE2 *)a, (POINTonE2 *)b, NULL); -} - -// generic point double that must handle point at infinity -static void E2_double(E2 *res, const E2 *a) { - POINTonE2_double((POINTonE2 *)res, (POINTonE2 *)a); -} - -// Point negation: res = -a -void E2_neg(E2 *res, const E2 *a) { - E2_copy(res, a); - POINTonE2_cneg((POINTonE2 *)res, 1); -} - -// Exponentiation of a generic point `a` in E2, res = expo.a -void E2_mult(E2 *res, const E2 *p, const Fr *expo) { - pow256 tmp; - pow256_from_Fr(tmp, expo); - POINTonE2_mult_gls((POINTonE2 *)res, (POINTonE2 *)p, tmp); - vec_zero(&tmp, sizeof(tmp)); -} - -// Exponentiation of a generic point `a` in E2 by a byte exponent, -// using a classic double-and-add algorithm (non constant-time) -void E2_mult_small_expo(E2 *res, const E2 *p, const byte expo) { - // return early if expo is zero - if (expo == 0) { - E2_set_infty(res); - return; - } - // expo is non zero - - byte mask = 1 << 7; - // process the most significant zero bits - while ((expo & mask) == 0) { - mask >>= 1; - } - - // process the first `1` bit - E2 tmp; - E2_copy(&tmp, p); - mask >>= 1; - // scan the remaining bits - for (; mask != 0; mask >>= 1) { - E2_double(&tmp, &tmp); - if (expo & mask) { - E2_add(&tmp, &tmp, p); - } - } - E2_copy(res, &tmp); -} - -// Exponentiation of generator g2 of G2, res = expo.g2 -void 
G2_mult_gen(E2 *res, const Fr *expo) { - pow256 tmp; - pow256_from_Fr(tmp, expo); - POINTonE2_mult_gls((POINTonE2 *)res, (POINTonE2 *)BLS12_381_g2, tmp); - vec_zero(&tmp, sizeof(tmp)); -} - -// Exponentiation of generator g2 of G2, res = expo.g2. -// -// Result is converted to affine. This is useful for results being used multiple -// times in pairings. Conversion to affine saves later pre-pairing conversions. -void G2_mult_gen_to_affine(E2 *res, const Fr *expo) { - G2_mult_gen(res, expo); - E2_to_affine(res, res); -} - -// checks if input E2 point is on the subgroup G2. -// It assumes input `p` is on E2. -bool E2_in_G2(const E2 *p) { - // currently uses Scott method - return POINTonE2_in_G2((const POINTonE2 *)p); -} - -// computes the sum of the E2 array elements `y[i]` and writes it in `sum` -void E2_sum_vector(E2 *sum, const E2 *y, const int y_len) { - E2_set_infty(sum); - for (int i = 0; i < y_len; i++) { - E2_add(sum, sum, &y[i]); - } -} - -// computes the sum of the E2 array elements `y[i]`, converts it -// to affine coordinates, and writes it in `sum`. -// -// Result is converted to affine. This is useful for results being used multiple -// times in pairings. Conversion to affine saves later pre-pairing conversions. -void E2_sum_vector_to_affine(E2 *sum, const E2 *y, const int y_len) { - E2_sum_vector(sum, y, y_len); - E2_to_affine(sum, sum); -} - -// Subtracts all G2 array elements `y` from an element `x` and writes the -// result in res. -void E2_subtract_vector(E2 *res, const E2 *x, const E2 *y, const int y_len) { - E2_sum_vector(res, y, y_len); - E2_neg(res, res); - E2_add(res, x, res); -} - -// maps the bytes to a point in G2. -// `in_len` should be at least Fr_BYTES. -// this is a testing tool only, it should not be used in any protocol! -void unsafe_map_bytes_to_G2(E2 *p, const byte *in, int in_len) { - assert(in_len >= Fr_BYTES); - // map to Fr - Fr log; - map_bytes_to_Fr(&log, in, in_len); - // multiplies G2 generator by a random scalar - G2_mult_gen(p, &log); -} - -// maps `in` to a point in E2\G2 and stores it in p. -// `len` should be at least 192. -// this is a testing tool only, it should not be used in any protocol! -void unsafe_map_bytes_to_G2complement(E2 *p, const byte *in, int in_len) { - assert(in_len >= 192); - Fp2 u; - map_96_bytes_to_Fp(&real(&u), in, 96); - map_96_bytes_to_Fp(&imag(&u), in + 96, 96); - // map to E2's isogenous and then to E2 - map_to_isogenous_E2((POINTonE2 *)p, u); - isogeny_map_to_E2((POINTonE2 *)p, (POINTonE2 *)p); - // clear G2 order - E2_mult(p, p, (Fr *)&BLS12_381_r); -} - -// ------------------- Pairing utilities - -bool Fp12_is_one(Fp12 *a) { - return vec_is_equal(a, BLS12_381_Rx.p12, sizeof(Fp12)); -} - -void Fp12_set_one(Fp12 *a) { vec_copy(a, BLS12_381_Rx.p12, sizeof(Fp12)); } - -// computes e(p[0], q[0]) * ... * e(q[len-1], q[len-1]) -// by optimizing a common final exponentiation for all pairings. -// result is stored in `res`. -// It assumes `p` and `q` are correctly initialized and all -// p[i] and q[i] are respectively on G1 and G2 (it does not -// check their memberships). -void Fp12_multi_pairing(Fp12 *res, const E1 *p, const E2 *q, const int len) { - // easier access pointer - vec384fp6 *res_vec = (vec384fp6 *)res; - // N_MAX is defined within BLST. It should represent a good tradeoff of the - // max number of miller loops to be batched in one call to `miller_loop_n`. - // miller_loop_n expects an array of `POINTonEx_affine`. 
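The loop that follows implements this batching. As an isolated sketch of the control flow only, here is a hedged Go version; all types and the `millerLoopN`/`fp12Mul`/`finalExp` helpers are stand-ins, not BLST APIs:

```go
package sketch

// Stand-in types: only the batching control flow matters here.
type pair struct{ p, q int } // an affine (E1, E2) couple
type fp12 int

const nMax = 8 // plays the role of BLST's N_MAX

func millerLoopN(pairs []pair) fp12 { return fp12(len(pairs)) } // stand-in
func fp12Mul(a, b fp12) fp12        { return a + b }            // stand-in
func finalExp(a fp12) fp12          { return a }                // stand-in

// multiPairing mirrors Fp12_multi_pairing's strategy: run one batched
// Miller loop per chunk of at most nMax pairs, multiply the partial
// results in Fp12, and share a single final exponentiation at the end.
func multiPairing(pairs []pair) fp12 {
	res, initialized := fp12(0), false
	for start := 0; start < len(pairs); start += nMax {
		end := start + nMax
		if end > len(pairs) {
			end = len(pairs)
		}
		part := millerLoopN(pairs[start:end])
		if !initialized {
			res, initialized = part, true
		} else {
			res = fp12Mul(res, part)
		}
	}
	// if no Miller loop ran, res stays the identity, as in the C code
	return finalExp(res)
}
```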
- POINTonE1_affine p_aff[N_MAX]; - POINTonE2_affine q_aff[N_MAX]; - int n = 0; // the number of couples (p,q) held in p_aff and q_aff - int init_flag = 0; - - for (int i = 0; i < len; i++) { - if (E1_is_infty(p + i) || E2_is_infty(q + i)) { - continue; - } - // `miller_loop_n` expects affine coordinates in a `POINTonEx_affine` array. - // `POINTonEx_affine` has a different size than `POINTonEx` and `Ex` ! - E1 tmp1; - E1_to_affine(&tmp1, p + i); - vec_copy(p_aff + n, &tmp1, sizeof(POINTonE1_affine)); - E2 tmp2; - E2_to_affine(&tmp2, q + i); - vec_copy(q_aff + n, &tmp2, sizeof(POINTonE2_affine)); - n++; - // if p_aff and q_aff are filled, batch `N_MAX` miller loops - if (n == N_MAX) { - if (!init_flag) { - miller_loop_n(res_vec, q_aff, p_aff, N_MAX); - init_flag = 1; - } else { - vec384fp12 tmp; - miller_loop_n(tmp, q_aff, p_aff, N_MAX); - mul_fp12(res_vec, res_vec, tmp); - } - n = 0; - } - } - // if p_aff and q_aff aren't empty, - // the remaining couples are also batched in `n` miller loops - if (n > 0) { - if (!init_flag) { - miller_loop_n(res_vec, q_aff, p_aff, n); - init_flag = 1; - } else { - vec384fp12 tmp; - miller_loop_n(tmp, q_aff, p_aff, n); - mul_fp12(res_vec, res_vec, tmp); - } - } - - // check if no miller loop was computed - if (!init_flag) { - Fp12_set_one(res); - } - final_exp(res_vec, res_vec); -} - -// ------------------- Other utilities - -// This is a testing function and is not used in exported functions -// It uses an expand message XMD based on SHA2-256. -void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, - int len_dst) { - expand_message_xmd(hash, len_hash, NULL, 0, msg, len_msg, dst, len_dst); -} - -// DEBUG printing functions -#ifdef DEBUG -void bytes_print_(char *s, byte *data, int len) { - if (strlen(s)) - printf("[%s]:\n", s); - for (int i = 0; i < len; i++) - printf("%02X,", data[i]); - printf("\n"); -} - -void Fr_print_(char *s, Fr *a) { - if (strlen(s)) - printf("[%s]:\n", s); - limb_t *p = (limb_t *)(a) + Fr_LIMBS; - for (int i = 0; i < Fr_LIMBS; i++) - printf("%016llX", *(--p)); - printf("\n"); -} - -void Fp_print_(char *s, const Fp *a) { - if (strlen(s)) - printf("[%s]:\n", s); - Fp tmp; - Fp_from_montg(&tmp, a); - limb_t *p = (limb_t *)(&tmp) + Fp_LIMBS; - for (int i = 0; i < Fp_LIMBS; i++) - printf("%016llX ", *(--p)); - printf("\n"); -} - -void Fp2_print_(char *s, const Fp2 *a) { - if (strlen(s)) - printf("[%s]:\n", s); - Fp_print_("", &real(a)); - Fp_print_("", &imag(a)); -} - -void Fp12_print_(char *s, const Fp12 *a) { - if (strlen(s)) - printf("[%s]:\n", s); - for (int i = 0; i < 2; i++) { - vec384fp6 *a_ = (vec384fp6 *)a + i; - for (int j = 0; j < 3; j++) { - vec384fp2 *a__ = (vec384fp2 *)a_ + j; - Fp2_print_("", a__); - } - } -} - -void E1_print_(char *s, const E1 *p, const int jacob) { - E1 a; - E1_copy(&a, p); - if (!jacob) - E1_to_affine(&a, &a); - if (strlen(s)) - printf("[%s]:\n", s); - Fp_print_(".x", &(a.x)); - Fp_print_(".y", &(a.y)); - if (jacob) - Fp_print_(".z", &(a.z)); -} - -void E2_print_(char *s, const E2 *p, const int jacob) { - E2 a; - E2_copy(&a, p); - if (!jacob) - E2_to_affine(&a, &a); - if (strlen(s)) - printf("[%s]:\n", s); - Fp2_print_("", &(a.x)); - Fp2_print_("", &(a.y)); - if (jacob) - Fp2_print_("", &(a.z)); -} - -#endif diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go deleted file mode 100644 index 65a54bb9dd4..00000000000 --- a/crypto/bls12381_utils.go +++ /dev/null @@ -1,351 +0,0 @@ -package crypto - -// this file contains utility functions for the curve BLS 12-381 -// these 
tools are shared by the BLS signature scheme, the BLS based threshold signature -// and the BLS distributed key generation protocols - -// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -Wall -fno-builtin-memcpy -fno-builtin-memset -Wno-unused-function -Wno-unused-macros -Wno-unused-variable -// #cgo amd64 CFLAGS: -D__ADX__ -mno-avx -// #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ -// #include "bls12381_utils.h" -// -// #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) -// # include -// # include -// # include -// static void handler(int signum) -// { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with CGO_CFLAGS=\"-O -D__BLST_PORTABLE__\"\n"; -// ssize_t n = write(2, &text, strlen(text)); -// _exit(128+SIGILL); -// (void)n; -// } -// __attribute__((constructor)) static void flow_crypto_cgo_init() -// { Fp temp = { 0 }; -// struct sigaction act = {{ handler }}, oact; -// sigaction(SIGILL, &act, &oact); -// Fp_squ_montg(&temp, &temp); -// sigaction(SIGILL, &oact, NULL); -// } -// #endif -// -import "C" -import ( - "errors" - "fmt" - - "github.com/onflow/flow-go/crypto/random" -) - -// Go wrappers around BLST C types -type pointE1 C.E1 -type pointE2 C.E2 -type scalar C.Fr - -// Note that scalars and field elements F_r are represented in Go by the same type -// called `scalar`, which is internally represented by C type `Fr`. Scalars used by the -// Go layer are all reduced modulo the curve order `r`. - -const ( - // BLS12-381 related lengths imported from the C layer - frBytesLen = int(C.Fr_BYTES) - fpBytesLen = int(C.Fp_BYTES) - g1BytesLen = int(C.G1_SER_BYTES) - g2BytesLen = int(C.G2_SER_BYTES) - - // error constants imported from the C layer - valid = C.VALID - invalid = C.INVALID - badEncoding = C.BAD_ENCODING - badValue = C.BAD_VALUE - pointNotOnCurve = C.POINT_NOT_ON_CURVE -) - -// header of the point at infinity serializations -var g1SerHeader byte // g1 (G1 identity) -var g2SerHeader byte // g2 (G2 identity) - -// `g1` serialization -var g1Serialization []byte - -var g2PublicKey pubKeyBLSBLS12381 - -// initialization of BLS12-381 curve -func initBLS12381() { - C.types_sanity() - - if isG1Compressed() { - g1SerHeader = 0xC0 - } else { - g1SerHeader = 0x40 - } - g1Serialization = append([]byte{g1SerHeader}, make([]byte, g1BytesLen-1)...) - if isG2Compressed() { - g2SerHeader = 0xC0 - } else { - g2SerHeader = 0x40 - } - // set a global point to infinity - C.E2_set_infty((*C.E2)(&g2PublicKey.point)) - g2PublicKey.isIdentity = true -} - -// String returns a hex-encoded representation of the scalar. -func (a *scalar) String() string { - encoding := make([]byte, frBytesLen) - writeScalar(encoding, a) - return fmt.Sprintf("%#x", encoding) -} - -// String returns a hex-encoded representation of the E2 point. -func (p *pointE2) String() string { - encoding := make([]byte, g2BytesLen) - writePointE2(encoding, p) - return fmt.Sprintf("%#x", encoding) -} - -// Scalar multiplication of a generic point `p` in E1 -func (p *pointE1) scalarMultE1(res *pointE1, expo *scalar) { - C.E1_mult((*C.E1)(res), (*C.E1)(p), (*C.Fr)(expo)) -} - -// Scalar multiplication of generator g1 in G1 -func generatorScalarMultG1(res *pointE1, expo *scalar) { - C.G1_mult_gen((*C.E1)(res), (*C.Fr)(expo)) -} - -// Scalar multiplication of generator g2 in G2 -// -// This often results in a public key that is used in -// multiple pairing computation. 
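As an aside on the serialization headers set in `initBLS12381` above, the identity encoding can be reconstructed without cgo. A hedged Go sketch (the constant assumes the compressed build, as in the code above; names are ours):

```go
package sketch

import "bytes"

const g1BytesLen = 48 // compressed G1 serialization length

// identityG1 rebuilds the `g1Serialization` value from initBLS12381:
// header byte 0xC0 (compressed | infinity) followed by zero bytes.
func identityG1() []byte {
	out := make([]byte, g1BytesLen)
	out[0] = 0xC0
	return out
}

// isIdentityG1 checks a serialized point against the identity encoding.
func isIdentityG1(s []byte) bool {
	return bytes.Equal(s, identityG1())
}
```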
Therefore, convert the -// resulting point to affine coordinate to save pre-pairing -// conversions. -func generatorScalarMultG2(res *pointE2, expo *scalar) { - C.G2_mult_gen_to_affine((*C.E2)(res), (*C.Fr)(expo)) -} - -// comparison in Fr where r is the group order of G1/G2 -// (both scalars should be reduced mod r) -func (x *scalar) equals(other *scalar) bool { - return bool(C.Fr_is_equal((*C.Fr)(x), (*C.Fr)(other))) -} - -// comparison in E1 -func (p *pointE1) equals(other *pointE1) bool { - return bool(C.E1_is_equal((*C.E1)(p), (*C.E1)(other))) -} - -// comparison in E2 -func (p *pointE2) equals(other *pointE2) bool { - return bool(C.E2_is_equal((*C.E2)(p), (*C.E2)(other))) -} - -// Comparison to zero in Fr. -// Scalar must be already reduced modulo r -func (x *scalar) isZero() bool { - return bool(C.Fr_is_zero((*C.Fr)(x))) -} - -// Comparison to point at infinity in G2. -func (p *pointE2) isInfinity() bool { - return bool(C.E2_is_infty((*C.E2)(p))) -} - -// generates a random element in F_r using input random source, -// and saves the random in `x`. -// returns `true` if generated element is zero. -func randFr(x *scalar, rand random.Rand) bool { - // use extra 128 bits to reduce the modular reduction bias - bytes := make([]byte, frBytesLen+securityBits/8) - rand.Read(bytes) - // modular reduction - return mapToFr(x, bytes) -} - -// generates a random element in F_r* using input random source, -// and saves the random in `x`. -func randFrStar(x *scalar, rand random.Rand) { - isZero := true - // extremely unlikely this loop runs more than once, - // but force the output to be non-zero instead of propagating an error. - for isZero { - isZero = randFr(x, rand) - } -} - -// mapToFr reads a scalar from a slice of bytes and maps it to Fr using modular reduction. -// The resulting element `k` therefore satisfies 0 <= k < r. -// It returns true if scalar is zero and false otherwise. -func mapToFr(x *scalar, src []byte) bool { - isZero := C.map_bytes_to_Fr((*C.Fr)(x), - (*C.uchar)(&src[0]), - (C.int)(len(src))) - return bool(isZero) -} - -// writeScalar writes a scalar in a slice of bytes -func writeScalar(dest []byte, x *scalar) { - C.Fr_write_bytes((*C.uchar)(&dest[0]), (*C.Fr)(x)) -} - -// writePointE2 writes a G2 point in a slice of bytes -// The slice should be of size g2BytesLen and the serialization -// follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func writePointE2(dest []byte, a *pointE2) { - C.E2_write_bytes((*C.uchar)(&dest[0]), (*C.E2)(a)) -} - -// writePointE1 writes a G1 point in a slice of bytes -// The slice should be of size g1BytesLen and the serialization -// follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func writePointE1(dest []byte, a *pointE1) { - C.E1_write_bytes((*C.uchar)(&dest[0]), (*C.E1)(a)) -} - -// read an Fr* element from a byte slice -// and stores it into a `scalar` type element. 
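To make the bias-reduction comment in `randFr` above concrete, here is an equivalent big-integer sketch using math/big in place of the C layer; the hex constant is the BLS12-381 group order r, and the 16 extra bytes correspond to `securityBits/8`:

```go
package sketch

import (
	"crypto/rand"
	"math/big"
)

// curveOrder is the BLS12-381 group order r.
var curveOrder, _ = new(big.Int).SetString(
	"73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)

// randModOrder illustrates the oversampling trick used by randFr:
// draw 128 bits beyond the 32-byte order size, then reduce mod r,
// which makes the modular-reduction bias negligible.
func randModOrder() (*big.Int, error) {
	buf := make([]byte, 32+16) // frBytesLen + securityBits/8
	if _, err := rand.Read(buf); err != nil {
		return nil, err
	}
	return new(big.Int).Mod(new(big.Int).SetBytes(buf), curveOrder), nil
}
```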
-func readScalarFrStar(a *scalar, src []byte) error { - read := C.Fr_star_read_bytes( - (*C.Fr)(a), - (*C.uchar)(&src[0]), - (C.int)(len(src))) - - switch read { - case valid: - return nil - case badEncoding: - return invalidInputsErrorf("input length must be %d, got %d", - frBytesLen, len(src)) - case badValue: - return invalidInputsErrorf("scalar is not in the correct range") - default: - return invalidInputsErrorf("reading the scalar failed") - } -} - -// readPointE2 reads a E2 point from a slice of bytes -// The slice is expected to be of size g2BytesLen and the deserialization -// follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves. -// No G2 membership check is performed. -func readPointE2(a *pointE2, src []byte) error { - read := C.E2_read_bytes((*C.E2)(a), - (*C.uchar)(&src[0]), - (C.int)(len(src))) - - switch read { - case valid: - return nil - case badEncoding, badValue: - return invalidInputsErrorf("input could not deserialize to an E2 point") - case pointNotOnCurve: - return invalidInputsErrorf("input is not a point on curve E2") - default: - return errors.New("reading E2 point failed") - } -} - -// readPointE1 reads a E1 point from a slice of bytes -// The slice should be of size g1BytesLen and the deserialization -// follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves. -// No G1 membership check is performed. -func readPointE1(a *pointE1, src []byte) error { - read := C.E1_read_bytes((*C.E1)(a), - (*C.uchar)(&src[0]), - (C.int)(len(src))) - - switch read { - case valid: - return nil - case badEncoding, badValue: - return invalidInputsErrorf("input could not deserialize to a E1 point") - case pointNotOnCurve: - return invalidInputsErrorf("input is not a point on curve E1") - default: - return errors.New("reading E1 point failed") - } -} - -// checkMembershipG1 wraps a call to a subgroup check in G1 since cgo can't be used -// in go test files. -func checkMembershipG1(pt *pointE1) bool { - return bool(C.E1_in_G1((*C.E1)(pt))) -} - -// checkMembershipG2 wraps a call to a subgroup check in G2 since cgo can't be used -// in go test files. -func checkMembershipG2(pt *pointE2) bool { - return bool(C.E2_in_G2((*C.E2)(pt))) -} - -// This is only a TEST/DEBUG/BENCH function. -// It returns the hash-to-G1 point from a slice of 128 bytes -func mapToG1(data []byte) *pointE1 { - l := len(data) - var h pointE1 - if C.map_to_G1((*C.E1)(&h), (*C.uchar)(&data[0]), (C.int)(l)) != valid { - return nil - } - return &h -} - -// mapToG1 is a test function, it wraps a call to C since cgo can't be used in go test files. -// It maps input bytes to a point in G2 and stores it in input point. -// THIS IS NOT the kind of mapping function that is used in BLS signature. -func unsafeMapToG1(pt *pointE1, seed []byte) { - C.unsafe_map_bytes_to_G1((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) -} - -// unsafeMapToG1Complement is a test function, it wraps a call to C since cgo can't be used in go test files. -// It generates a random point in E2\G2 and stores it in input point. -func unsafeMapToG1Complement(pt *pointE1, seed []byte) { - C.unsafe_map_bytes_to_G1complement((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) -} - -// unsafeMapToG2 is a test function, it wraps a call to C since cgo can't be used in go test files. -// It maps input bytes to a point in G2 and stores it in input point. -// THIS IS NOT the kind of mapping function that is used in BLS signature. 
-func unsafeMapToG2(pt *pointE2, seed []byte) { - C.unsafe_map_bytes_to_G2((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) -} - -// unsafeMapToG2Complement is a test function, it wraps a call to C since cgo can't be used in go test files. -// It generates a random point in E2\G2 and stores it in input point. -func unsafeMapToG2Complement(pt *pointE2, seed []byte) { - C.unsafe_map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) -} - -// This is only a TEST function. -// It hashes `data` to a G1 point using the tag `dst` and returns the G1 point serialization. -// The function uses xmd with SHA256 in the hash-to-field. -func hashToG1Bytes(data, dst []byte) []byte { - hash := make([]byte, expandMsgOutput) - - inputLength := len(data) - if len(data) == 0 { - data = make([]byte, 1) - } - - // XMD using SHA256 - C.xmd_sha256((*C.uchar)(&hash[0]), - (C.int)(expandMsgOutput), - (*C.uchar)(&data[0]), (C.int)(inputLength), - (*C.uchar)(&dst[0]), (C.int)(len(dst))) - - // map the hash to G1 - var point pointE1 - if C.map_to_G1((*C.E1)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) != valid { - return nil - } - - // serialize the point - pointBytes := make([]byte, g1BytesLen) - writePointE1(pointBytes, &point) - return pointBytes -} - -func isG1Compressed() bool { - return g1BytesLen == fpBytesLen -} - -func isG2Compressed() bool { - return g2BytesLen == 2*fpBytesLen -} diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h deleted file mode 100644 index 923208ef3f3..00000000000 --- a/crypto/bls12381_utils.h +++ /dev/null @@ -1,165 +0,0 @@ -// this file contains utility functions for the curve BLS 12-381 -// these tools are shared by the BLS signature scheme, the BLS based threshold -// signature, BLS-SPoCK and the BLS distributed key generation protocols - -#ifndef _BLS12_381_UTILS_H -#define _BLS12_381_UTILS_H - -#include "blst_include.h" -#include -#include - -typedef uint8_t byte; -typedef _Bool bool; // assuming cgo is using a modern enough compiler - -// minimum targeted security level -#define SEC_BITS 128 - -typedef enum { - VALID = 0, - INVALID, - BAD_ENCODING, - BAD_VALUE, - POINT_NOT_ON_CURVE, - POINT_NOT_IN_GROUP, - UNDEFINED, -} ERROR; - -#define BITS_TO_BYTES(x) ((x + 7) >> 3) -#define BITS_TO_LIMBS(x) ((x + 63) >> 6) -#define BYTES_TO_LIMBS(x) ((x + 7) >> 3) -#define LIMBS_TO_BYTES(x) ((x) << 3) -#define MIN(a, b) ((a) > (b) ? (b) : (a)) - -// Fields and Group serialization lengths -#define Fp_BITS 381 -#define Fp2_BYTES (2 * Fp_BYTES) -#define Fp_LIMBS BITS_TO_LIMBS(Fp_BITS) -#define Fp_BYTES LIMBS_TO_BYTES(Fp_LIMBS) // BLST implements Fp as a limb array -#define Fr_BITS 255 -#define Fr_LIMBS BITS_TO_LIMBS(Fr_BITS) -#define Fr_BYTES LIMBS_TO_BYTES(Fr_LIMBS) // BLST implements Fr as a limb array - -#define G1_BYTES (2 * Fp_BYTES) -#define G2_BYTES (2 * Fp2_BYTES) - -// Compressed and uncompressed points -#define UNCOMPRESSED 0 -#define COMPRESSED (UNCOMPRESSED ^ 1) -#define G1_SERIALIZATION (COMPRESSED) -#define G2_SERIALIZATION (COMPRESSED) -#define G1_SER_BYTES \ - (G1_SERIALIZATION == UNCOMPRESSED ? G1_BYTES : (G1_BYTES / 2)) -#define G2_SER_BYTES \ - (G2_SERIALIZATION == UNCOMPRESSED ? 
G2_BYTES : (G2_BYTES / 2)) - -// init-related functions -void types_sanity(void); - -// Fr utilities -extern const Fr BLS12_381_rR; -bool Fr_is_zero(const Fr *a); -bool Fr_is_equal(const Fr *a, const Fr *b); -void Fr_set_limb(Fr *, const limb_t); -void Fr_copy(Fr *, const Fr *); -void Fr_set_zero(Fr *); -void Fr_add(Fr *res, const Fr *a, const Fr *b); -void Fr_sub(Fr *res, const Fr *a, const Fr *b); -void Fr_neg(Fr *res, const Fr *a); -void Fr_sum_vector(Fr *, const Fr x[], const int); -void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b); -void Fr_squ_montg(Fr *res, const Fr *a); -void Fr_to_montg(Fr *res, const Fr *a); -void Fr_from_montg(Fr *res, const Fr *a); -void Fr_inv_montg_eucl(Fr *res, const Fr *a); -ERROR Fr_read_bytes(Fr *a, const byte *bin, int len); -ERROR Fr_star_read_bytes(Fr *a, const byte *bin, int len); -void Fr_write_bytes(byte *bin, const Fr *a); -bool map_bytes_to_Fr(Fr *, const byte *, int); - -// Fp utilities -void Fp_mul_montg(Fp *, const Fp *, const Fp *); -void Fp_squ_montg(Fp *, const Fp *); - -// E1 and G1 utilities -void E1_copy(E1 *, const E1 *); -bool E1_is_equal(const E1 *, const E1 *); -void E1_set_infty(E1 *); -bool E1_is_infty(const E1 *); -void E1_to_affine(E1 *, const E1 *); -bool E1_affine_on_curve(const E1 *); -bool E1_in_G1(const E1 *); -void E1_mult(E1 *, const E1 *, const Fr *); -void E1_add(E1 *, const E1 *, const E1 *); -void E1_neg(E1 *, const E1 *); -void E1_sum_vector(E1 *, const E1 *, const int); -int E1_sum_vector_byte(byte *, const byte *, const int); -void G1_mult_gen(E1 *, const Fr *); -ERROR E1_read_bytes(E1 *, const byte *, const int); -void E1_write_bytes(byte *, const E1 *); -void unsafe_map_bytes_to_G1(E1 *, const byte *, int); -void unsafe_map_bytes_to_G1complement(E1 *, const byte *, int); - -#define MAP_TO_G1_INPUT_LEN (2 * (Fp_BYTES + SEC_BITS / 8)) -int map_to_G1(E1 *, const byte *, const int); - -// E2 and G2 utilities -void E2_set_infty(E2 *p); -bool E2_is_infty(const E2 *); -bool E2_affine_on_curve(const E2 *); -bool E2_is_equal(const E2 *, const E2 *); -void E2_copy(E2 *, const E2 *); -void E2_to_affine(E2 *, const E2 *); -ERROR E2_read_bytes(E2 *, const byte *, const int); -void E2_write_bytes(byte *, const E2 *); -void G2_mult_gen(E2 *, const Fr *); -void G2_mult_gen_to_affine(E2 *, const Fr *); -void E2_mult(E2 *, const E2 *, const Fr *); -void E2_mult_small_expo(E2 *, const E2 *, const byte); -void E2_add(E2 *res, const E2 *a, const E2 *b); -void E2_neg(E2 *, const E2 *); -void E2_sum_vector(E2 *, const E2 *, const int); -void E2_sum_vector_to_affine(E2 *, const E2 *, const int); -void E2_subtract_vector(E2 *res, const E2 *x, const E2 *y, const int len); -bool E2_in_G2(const E2 *); -void unsafe_map_bytes_to_G2(E2 *, const byte *, int); -void unsafe_map_bytes_to_G2complement(E2 *, const byte *, int); - -// pairing and Fp12 -bool Fp12_is_one(Fp12 *); -void Fp12_set_one(Fp12 *); -void Fp12_multi_pairing(Fp12 *, const E1 *, const E2 *, const int); - -// utility testing function -void xmd_sha256(byte *, int, byte *, int, byte *, int); - -// Debugging related functions -// DEBUG can be enabled directly from the Go command: CC="clang -DDEBUG" go test -#ifdef DEBUG -#include -void bytes_print_(char *, byte *, int); -void Fr_print_(char *, Fr *); -void Fp_print_(char *, const Fp *); -void Fp2_print_(char *, const Fp2 *); -void Fp12_print_(char *, const Fp12 *); -void E1_print_(char *, const E1 *, const int); -void E2_print_(char *, const E2 *, const int); - -#endif /* DEBUG */ - -// memory sanitization disabler -#define NO_MSAN 
-#ifdef MSAN
-/* add NO_MSAN to a function definition to disable MSAN in that function ( void
- * NO_MSAN f(..) {} ) */
-#if defined(__has_feature)
-#if __has_feature(memory_sanitizer)
-// disable memory sanitization in this function because of a
-// use-of-uninitialized-value false positive.
-#undef NO_MSAN
-#define NO_MSAN __attribute__((no_sanitize("memory")))
-#endif /* __has_feature(memory_sanitizer) */
-#endif /* __has_feature*/
-#endif /*MSAN*/
-
-#endif /* BLS12_381_UTILS */
\ No newline at end of file
diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go
deleted file mode 100644
index a528e240363..00000000000
--- a/crypto/bls12381_utils_test.go
+++ /dev/null
@@ -1,273 +0,0 @@
-package crypto
-
-import (
-	"crypto/rand"
-	"encoding/hex"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-)
-
-// Sanity check of G1 and G2 scalar multiplication
-func TestScalarMultBLS12381(t *testing.T) {
-	expoBytes, err := hex.DecodeString("444465cb6cc2dba9474e6beeb6a9013fbf1260d073429fb14a31e63e89129390")
-	require.NoError(t, err)
-
-	var expo scalar
-	isZero := mapToFr(&expo, expoBytes)
-	require.False(t, isZero)
-
-	// G1 generator multiplication
-	// Note that generator and random point multiplications
-	// are implemented with the same algorithm
-	t.Run("G1", func(t *testing.T) {
-		if 
!isG1Compressed() { - t.Skip() - } - // test vectors from https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-hash-to-curve-14#appendix-J.9.1 - dst := []byte("QUUX-V01-CS02-with-BLS12381G1_XMD:SHA-256_SSWU_RO_") - - msgs := [][]byte{ - []byte{}, - []byte("abc"), - []byte("abcdef0123456789"), - []byte("q128_qqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqqq"), - []byte("a512_aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"), - } - - expectedPointString := []string{ - "052926add2207b76ca4fa57a8734416c8dc95e24501772c814278700eed6d1e4e8cf62d9c09db0fac349612b759e79a1", - "03567bc5ef9c690c2ab2ecdf6a96ef1c139cc0b2f284dca0a9a7943388a49a3aee664ba5379a7655d3c68900be2f6903", - "11e0b079dea29a68f0383ee94fed1b940995272407e3bb916bbf268c263ddd57a6a27200a784cbc248e84f357ce82d98", - "15f68eaa693b95ccb85215dc65fa81038d69629f70aeee0d0f677cf22285e7bf58d7cb86eefe8f2e9bc3f8cb84fac488", - "082aabae8b7dedb0e78aeb619ad3bfd9277a2f77ba7fad20ef6aabdc6c31d19ba5a6d12283553294c1825c4b3ca2dcfe", - } - - for i, msg := range msgs { - pointBytes := hashToG1Bytes(msg, dst) - require.NotNil(t, pointBytes) - - expectedPointBytes, err := hex.DecodeString(expectedPointString[i]) - require.NoError(t, err) - // skip comparing the first 3 bits that depend on the serialization scheme - pointBytes[0] = (expectedPointBytes[0] & 0xE0) | (pointBytes[0] & 0x1F) - assert.Equal(t, expectedPointBytes, pointBytes, "map to G1 should match the IETF draft test vector") - } -} - -// Hashing to G1 bench -func BenchmarkMapToG1(b *testing.B) { - input := make([]byte, expandMsgOutput) - for i := 0; i < len(input); i++ { - input[i] = byte(i) - } - b.ResetTimer() - var p *pointE1 - for i := 0; i < b.N; i++ { - p = mapToG1(input) - } - require.NotNil(b, p) -} - -// test subgroup membership check in G1 and G2 -func TestSubgroupCheck(t *testing.T) { - prg := getPRG(t) - seed := make([]byte, 192) - _, err := prg.Read(seed) - require.NoError(t, err) - - t.Run("G1", func(t *testing.T) { - var p pointE1 - unsafeMapToG1(&p, seed) // point in G1 - assert.True(t, checkMembershipG1(&p)) - - unsafeMapToG1Complement(&p, seed) // point in E2\G2 - assert.False(t, checkMembershipG1(&p)) - }) - - t.Run("G2", func(t *testing.T) { - var p pointE2 - unsafeMapToG2(&p, seed) // point in G2 - assert.True(t, checkMembershipG2(&p)) - - unsafeMapToG2Complement(&p, seed) // point in E2\G2 - assert.False(t, checkMembershipG2(&p)) - }) -} - -// subgroup membership check bench -func BenchmarkSubgroupCheck(b *testing.B) { - seed := make([]byte, g2BytesLen) - _, err := rand.Read(seed) - require.NoError(b, err) - - b.Run("G1", func(b *testing.B) { - var p pointE1 - unsafeMapToG1(&p, seed) // point in G1 - b.ResetTimer() - for i := 0; i < b.N; i++ { - _ = checkMembershipG1(&p) // G1 - } - }) - - b.Run("G2", func(b *testing.B) { - var p pointE2 - unsafeMapToG2(&p, seed) // point in G2 - b.ResetTimer() - for i := 0; i < b.N; i++ { - _ = checkMembershipG2(&p) // G2 - } - }) -} - -// specific test of G1 points Encode and decode (BLS signature since the 
library is set for min_sig). -// G2 points read and write are implicitly tested by public keys Encode/Decode. -func TestReadWriteG1(t *testing.T) { - prg := getPRG(t) - seed := make([]byte, frBytesLen) - bytes := make([]byte, g1BytesLen) - // generate a random G1 point, encode it, decode it, - // and compare it the original point - t.Run("random points", func(t *testing.T) { - iterations := 50 - for i := 0; i < iterations; i++ { - var p, q pointE1 - _, err := prg.Read(seed) - unsafeMapToG1(&p, seed) - require.NoError(t, err) - writePointE1(bytes, &p) - err = readPointE1(&q, bytes) - require.NoError(t, err) - assert.True(t, p.equals(&q)) - } - }) - - t.Run("infinity", func(t *testing.T) { - var p, q pointE1 - seed := make([]byte, frBytesLen) - unsafeMapToG1(&p, seed) // this results in the infinity point given how `unsafeMapToG1` works with an empty scalar - writePointE1(bytes, &p) - require.True(t, IsBLSSignatureIdentity(bytes)) // sanity check - err := readPointE1(&q, bytes) - require.NoError(t, err) - assert.True(t, p.equals(&q)) - }) -} - -// test some edge cases of MapToFr to validate modular reduction and endianness: -// - inputs `0` and curve order `r` -// - inputs `1` and `r+1` -func TestMapToFr(t *testing.T) { - var x scalar - offset := 10 - bytes := make([]byte, frBytesLen+offset) - expectedEncoding := make([]byte, frBytesLen) - // zero bytes - isZero := mapToFr(&x, bytes) - assert.True(t, isZero) - assert.True(t, x.isZero()) - assert.Equal(t, expectedEncoding, newPrKeyBLSBLS12381(&x).Encode()) - // curve order bytes - copy(bytes[offset:], BLS12381Order) - isZero = mapToFr(&x, bytes) - assert.True(t, isZero) - assert.True(t, x.isZero()) - assert.Equal(t, expectedEncoding, newPrKeyBLSBLS12381(&x).Encode()) - // curve order + 1 - g1, err := hex.DecodeString("824aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb813e02b6052719f607dacd3a088274f65596bd0d09920b61ab5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e") - require.NoError(t, err) - bytes[len(bytes)-1] += 1 - isZero = mapToFr(&x, bytes) - assert.False(t, isZero) - assert.False(t, x.isZero()) - expectedEncoding[frBytesLen-1] = 1 - sk := newPrKeyBLSBLS12381(&x) - assert.Equal(t, expectedEncoding, sk.Encode()) - // check scalar is equal to "1" in the lower layer (scalar multiplication) - assert.Equal(t, sk.PublicKey().Encode(), g1, "scalar should be 1, check endianness in the C layer") - // 1 - copy(bytes[offset:], expectedEncoding) - isZero = mapToFr(&x, bytes) - assert.False(t, isZero) - assert.False(t, x.isZero()) - expectedEncoding[frBytesLen-1] = 1 - sk = newPrKeyBLSBLS12381(&x) - assert.Equal(t, expectedEncoding, sk.Encode()) - // check scalar is equal to "1" in the lower layer (scalar multiplication) - assert.Equal(t, sk.PublicKey().Encode(), g1, "scalar should be 1, check endianness in the C layer") -} diff --git a/crypto/bls_core.c b/crypto/bls_core.c deleted file mode 100644 index 65f510f5987..00000000000 --- a/crypto/bls_core.c +++ /dev/null @@ -1,506 +0,0 @@ -#include "bls_include.h" - -// this file is about the core functions required by the BLS signature scheme - -// Compute a BLS signature from a G1 point (not checked) and writes it in `out`. -// `out` must be allocated properly with `G1_SER_BYTES` bytes. -static void bls_sign_E1(byte *out, const Fr *sk, const E1 *h) { - // s = h^sk - E1 s; - E1_mult(&s, h, sk); - E1_write_bytes(out, &s); -} - -// Computes a BLS signature from a hash and writes it in `out`. 
-// `hash` represents the hashed message with length `hash_len` equal to -// `MAP_TO_G1_INPUT_LEN`. -// `out` must be allocated properly with `G1_SER_BYTES` bytes. -int bls_sign(byte *out, const Fr *sk, const byte *hash, const int hash_len) { - // hash to G1 - E1 h; - if (map_to_G1(&h, hash, hash_len) != VALID) { - return INVALID; - } - // s = h^sk - bls_sign_E1(out, sk, &h); - return VALID; -} - -extern const E2 *BLS12_381_minus_g2; - -// Verifies a BLS signature (G1 point) against a public key (G2 point) -// and a message hash `h` (G1 point). -// Hash, signature and public key are assumed to be in G1, G1 and G2 -// respectively. -// This function only checks the pairing equality. -static int bls_verify_E1(const E2 *pk, const E1 *s, const E1 *h) { - E1 elemsG1[2]; - E2 elemsG2[2]; - - // elemsG1[0] = s, elemsG1[1] = h - E1_copy(&elemsG1[0], s); - E1_copy(&elemsG1[1], h); - - // elemsG2[0] = -g2, elemsG2[1] = pk - E2_copy(&elemsG2[0], BLS12_381_minus_g2); - E2_copy(&elemsG2[1], pk); - - // double pairing - Fp12 e; - Fp12_multi_pairing(&e, elemsG1, elemsG2, 2); - if (Fp12_is_one(&e)) { - return VALID; - } - return INVALID; -} - -// Verifies the validity of an aggregated BLS signature under distinct messages. -// -// Each message is mapped to a set of public keys, so that the verification -// equation is optimized to compute one pairing per message. -// - sig is the signature. -// - nb_hashes is the number of the messages (hashes) in the map -// - hashes is pointer to all flattened hashes in order where the hash at index -// i has a byte length len_hashes[i], -// is mapped to pks_per_hash[i] public keys. -// - the keys are flattened in pks in the same hashes order. -// -// membership check of the signature in G1 is verified in this function -// membership check of pks in G2 is not verified in this function -// the membership check is separated to allow optimizing multiple verifications -// using the same pks -int bls_verifyPerDistinctMessage(const byte *sig, const int nb_hashes, - const byte *hashes, const uint32_t *len_hashes, - const uint32_t *pks_per_hash, const E2 *pks) { - - int ret = UNDEFINED; // return value - - E1 *elemsG1 = (E1 *)malloc((nb_hashes + 1) * sizeof(E1)); - if (!elemsG1) - goto outG1; - E2 *elemsG2 = (E2 *)malloc((nb_hashes + 1) * sizeof(E2)); - if (!elemsG2) - goto outG2; - - // elemsG1[0] = sig - if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != VALID) { - ret = INVALID; - goto out; - } - - // check signature is in G1 - if (!E1_in_G1(&elemsG1[0])) { - ret = INVALID; - goto out; - } - - // elemsG2[0] = -g2 - E2_copy(&elemsG2[0], BLS12_381_minus_g2); - - // map all hashes to G1 - int offset = 0; - for (int i = 1; i < nb_hashes + 1; i++) { - // elemsG1[i] = h - // hash to G1 - map_to_G1(&elemsG1[i], &hashes[offset], len_hashes[i - 1]); - offset += len_hashes[i - 1]; - } - - // aggregate public keys mapping to the same hash - offset = 0; - for (int i = 1; i < nb_hashes + 1; i++) { - // elemsG2[i] = agg_pk[i] - E2_sum_vector(&elemsG2[i], &pks[offset], pks_per_hash[i - 1]); - offset += pks_per_hash[i - 1]; - } - - // multi pairing - Fp12 e; - Fp12_multi_pairing(&e, elemsG1, elemsG2, nb_hashes + 1); - if (Fp12_is_one(&e)) { - ret = VALID; - } else { - ret = INVALID; - } - -out: - free(elemsG2); -outG2: - free(elemsG1); -outG1: - return ret; -} - -// Verifies the validity of an aggregated BLS signature under distinct public -// keys. -// -// Each key is mapped to a set of messages, so that the verification equation is -// optimized to compute one pairing per public key. 
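For reference, the relations behind `bls_sign` and `bls_verify_E1` above, in the min-signature setting used here (signatures in G1, public keys in G2, pk = sk·g2):

```latex
\sigma = \mathrm{sk}\cdot H(m) \in \mathbb{G}_1,
\qquad
e(\sigma,\,-g_2)\cdot e(H(m),\,\mathrm{pk}) = 1_{\mathbb{F}_{p^{12}}}
\iff
e(\sigma,\,g_2) = e(H(m),\,\mathrm{pk}).
```

The `Fp12_is_one` call in the C code checks exactly the left-hand product-of-pairings identity.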
-// - nb_pks is the number of the public keys in the map. -// - pks is pointer to all pks in order where the key at index i -// is mapped to hashes_per_pk[i] hashes. -// - the messages (hashes) are flattened in hashes in the same public key order, -// each with a length in len_hashes. -// -// membership check of the signature in G1 is verified in this function -// membership check of pks in G2 is not verified in this function -// the membership check is separated to allow optimizing multiple verifications -// using the same pks -int bls_verifyPerDistinctKey(const byte *sig, const int nb_pks, const E2 *pks, - const uint32_t *hashes_per_pk, const byte *hashes, - const uint32_t *len_hashes) { - - int ret = UNDEFINED; // return value - - E1 *elemsG1 = (E1 *)malloc((nb_pks + 1) * sizeof(E1)); - if (!elemsG1) - goto outG1; - E2 *elemsG2 = (E2 *)malloc((nb_pks + 1) * sizeof(E2)); - if (!elemsG2) - goto outG2; - - // elemsG1[0] = s - if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != VALID) { - ret = INVALID; - goto out; - } - - // check s in G1 - if (!E1_in_G1(&elemsG1[0])) { - ret = INVALID; - goto out; - } - - // elemsG2[0] = -g2 - E2_copy(&elemsG2[0], BLS12_381_minus_g2); - - // set the public keys - for (int i = 1; i < nb_pks + 1; i++) { - E2_copy(&elemsG2[i], &pks[i - 1]); - } - - // map all hashes to G1 and aggregate the ones with the same public key - - // tmp_hashes is a temporary array of all hashes under a same key mapped to a - // G1 point. tmp_hashes size is set to the maximum possible size to minimize - // malloc calls. - int tmp_hashes_size = hashes_per_pk[0]; - for (int i = 1; i < nb_pks; i++) { - if (hashes_per_pk[i] > tmp_hashes_size) { - tmp_hashes_size = hashes_per_pk[i]; - } - } - E1 *tmp_hashes = (E1 *)malloc(tmp_hashes_size * sizeof(E1)); - if (!tmp_hashes) { - ret = UNDEFINED; - goto out; - } - - // sum hashes under the same key - int data_offset = 0; - int index_offset = 0; - for (int i = 1; i < nb_pks + 1; i++) { - for (int j = 0; j < hashes_per_pk[i - 1]; j++) { - // map the hash to G1 - map_to_G1(&tmp_hashes[j], &hashes[data_offset], len_hashes[index_offset]); - data_offset += len_hashes[index_offset]; - index_offset++; - } - // aggregate all the points of the array - E1_sum_vector(&elemsG1[i], tmp_hashes, hashes_per_pk[i - 1]); - } - free(tmp_hashes); - - // multi pairing - Fp12 e; - Fp12_multi_pairing(&e, elemsG1, elemsG2, nb_pks + 1); - - if (Fp12_is_one(&e)) { - ret = VALID; - } else { - ret = INVALID; - } - -out: - free(elemsG2); -outG2: - free(elemsG1); -outG1: - return ret; -} - -// Verifies a BLS signature in a byte buffer. -// membership check of the signature in G1 is verified. -// membership check of pk in G2 is not verified in this function. -// the membership check in G2 is separated to optimize multiple verifications -// using the same key. `hash` represents the hashed message with length -// `hash_len` equal to `MAP_TO_G1_INPUT_LEN`. -int bls_verify(const E2 *pk, const byte *sig, const byte *hash, - const int hash_len) { - E1 s, h; - // deserialize the signature into a curve point - if (E1_read_bytes(&s, sig, G1_SER_BYTES) != VALID) { - return INVALID; - } - - // check s is in G1 - if (!E1_in_G1(&s)) { - return INVALID; - } - - if (map_to_G1(&h, hash, hash_len) != VALID) { - return INVALID; - } - - return bls_verify_E1(pk, &s, &h); -} - -// binary tree structure to be used by bls_batch verify. -// Each node contains a signature and a public key, the signature (resp. 
the -// public key) being the aggregated signature of the two children's signature -// (resp. public keys). The leaves contain the initial signatures and public -// keys. -typedef struct st_node { - E1 *sig; - E2 *pk; - struct st_node *left; - struct st_node *right; -} node; - -static node *new_node(const E2 *pk, const E1 *sig) { - node *t = (node *)malloc(sizeof(node)); - if (t) { - t->pk = (E2 *)pk; - t->sig = (E1 *)sig; - t->right = t->left = NULL; - } - return t; -} - -static void free_tree(node *root) { - if (!root) - return; - - // only free pks and sigs of non-leafs, data of leafs are allocated - // as an entire array in `bls_batch_verify`. - if (root->left) { // no need to check the right child for the leaf check - // because - // the recursive build starts with the left side first - // pointer free - free(root->sig); - free(root->pk); - // free the children nodes - free_tree(root->left); - free_tree(root->right); - } - free(root); -} - -// builds a binary tree of aggregation of signatures and public keys -// recursively. -static node *build_tree(const int len, const E2 *pks, const E1 *sigs) { - // check if a leaf is reached - if (len == 1) { - return new_node(&pks[0], &sigs[0]); // use the first element of the arrays - } - - // a leaf is not reached yet, - int right_len = len / 2; - int left_len = len - right_len; - - // create a new node with new points - E2 *new_pk = (E2 *)malloc(sizeof(E2)); - if (!new_pk) { - goto error; - } - E1 *new_sig = (E1 *)malloc(sizeof(E1)); - if (!new_sig) { - goto error_sig; - } - - node *t = new_node(new_pk, new_sig); - if (!t) - goto error_node; - - // build the tree in a top-down way - t->left = build_tree(left_len, &pks[0], &sigs[0]); - if (!t->left) { - free_tree(t); - goto error; - } - - t->right = build_tree(right_len, &pks[left_len], &sigs[left_len]); - if (!t->right) { - free_tree(t); - goto error; - } - // sum the children - E1_add(t->sig, t->left->sig, t->right->sig); - E2_add(t->pk, t->left->pk, t->right->pk); - return t; - -error_node: - free(new_sig); -error_sig: - free(new_pk); -error: - return NULL; -} - -// verify the binary tree and fill the results using recursive batch -// verifications. -static void bls_batch_verify_tree(const node *root, const int len, - byte *results, const E1 *h) { - // verify the aggregated signature against the aggregated public key. - int res = bls_verify_E1(root->pk, root->sig, h); - - // if the result is valid, all the subtree signatures are valid. - if (res == VALID) { - for (int i = 0; i < len; i++) { - if (results[i] == UNDEFINED) - results[i] = VALID; // do not overwrite invalid results - } - return; - } - - // check if root is a leaf - if (root->left == NULL) { // no need to check the right side - *results = INVALID; - return; - } - - // otherwise, at least one of the subtree signatures is invalid. - // use the binary tree structure to find the invalid signatures. - int right_len = len / 2; - int left_len = len - right_len; - bls_batch_verify_tree(root->left, left_len, &results[0], h); - bls_batch_verify_tree(root->right, right_len, &results[left_len], h); -} - -// Batch verifies the validity of a multiple BLS signatures of the -// same message under multiple public keys. Each signature at index `i` is -// verified against the public key at index `i`. `seed` is used as the entropy -// source for randoms required by the computation. The function assumes the -// source size is at least (16*sigs_len) of random bytes of entropy at least 128 -// bits. 
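The recursion above can be summarized independently of the curve types. A simplified Go sketch follows; it uses two-state results instead of the C code's three-state VALID/INVALID/UNDEFINED, and `checkAggregate` stands in for one pairing-based verification of the aggregated signature/key pair over `items[lo:hi]`:

```go
package sketch

// verifyBatch sketches the divide-and-conquer strategy of
// bls_batch_verify_tree: check the aggregate of a range first, and only
// recurse into the two halves when the aggregate check fails.
func verifyBatch(lo, hi int, checkAggregate func(lo, hi int) bool, results []bool) {
	if checkAggregate(lo, hi) {
		// a valid aggregate means every signature in the range is valid
		for i := lo; i < hi; i++ {
			results[i] = true
		}
		return
	}
	if hi-lo == 1 { // a failing leaf is an invalid signature
		results[lo] = false
		return
	}
	// split: the left half gets the extra element, as in the C code
	mid := lo + (hi-lo+1)/2
	verifyBatch(lo, mid, checkAggregate, results)
	verifyBatch(mid, hi, checkAggregate, results)
}
```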
-// -// - membership checks of all signatures is verified upfront. -// - use random coefficients for signatures and public keys at the same index to -// prevent -// indices mixup. -// - optimize the verification by verifying an aggregated signature against an -// aggregated -// public key, and use a top-down recursive verification to find invalid -// signatures. -void bls_batch_verify(const int sigs_len, byte *results, const E2 *pks_input, - const byte *sigs_bytes, const byte *data, - const int data_len, const byte *seed) { - - // initialize results to undefined - memset(results, UNDEFINED, sigs_len); - - // build the arrays of G1 and G2 elements to verify - E2 *pks = (E2 *)malloc(sigs_len * sizeof(E2)); - if (!pks) { - return; - } - E1 *sigs = (E1 *)malloc(sigs_len * sizeof(E1)); - if (!sigs) { - goto out_sigs; - } - - E1 h; - if (map_to_G1(&h, data, data_len) != VALID) { - goto out; - } - - for (int i = 0; i < sigs_len; i++) { - // convert the signature points: - // - invalid points are stored as infinity points with an invalid result, so - // that the tree aggregations remain valid. - // - valid points are multiplied by a random scalar (same for public keys at - // same index) to make sure a signature at index (i) is verified against the - // public key at the same index. - int read_ret = - E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES * i], G1_SER_BYTES); - if (read_ret != VALID || !E1_in_G1(&sigs[i])) { - // set signature and key to infinity (no effect on the aggregation tree) - // and set result to invalid (result won't be overwritten) - E2_set_infty(&pks[i]); - E1_set_infty(&sigs[i]); - results[i] = INVALID; - } else { - // choose a random non-zero coefficient of at least 128 bits - Fr r, one; - // r = random, i-th seed is used for i-th signature - Fr_set_zero(&r); - const int seed_len = SEC_BITS / 8; - limbs_from_be_bytes((limb_t *)&r, seed + (seed_len * i), - seed_len); // faster shortcut than Fr_map_bytes - // r = random + 1 - Fr_set_limb(&one, 1); - Fr_add(&r, &r, &one); - // multiply public key and signature by the same random exponent r - E2_mult(&pks[i], &pks_input[i], &r); - E1_mult(&sigs[i], &sigs[i], &r); - } - } - // build a binary tree of aggregations - node *root = build_tree(sigs_len, &pks[0], &sigs[0]); - if (!root) { - goto out; - } - - // verify the binary tree and fill the results using batch verification - bls_batch_verify_tree(root, sigs_len, &results[0], &h); - // free the allocated tree - free_tree(root); -out: - free(sigs); -out_sigs: - free(pks); -} - -// Verifies the validity of 2 SPoCK proofs and 2 public keys. -// Membership check in G1 of both proofs is verified in this function. -// Membership check in G2 of both keys is not verified in this function. -// the membership check in G2 is separated to allow optimizing multiple -// verifications using the same public keys. 
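The function below accepts iff the two proofs commit to the same message. With elemsG1 = [s1, s2] and elemsG2 = [-pk2, pk1] as set in the code, the checked identity is:

```latex
e(s_1,\,-\mathrm{pk}_2)\cdot e(s_2,\,\mathrm{pk}_1) = 1_{\mathbb{F}_{p^{12}}}
\iff
e(s_1,\,\mathrm{pk}_2) = e(s_2,\,\mathrm{pk}_1),
```

which holds when s1 = sk1·H(m) and s2 = sk2·H(m) for the same message m, since both sides then equal e(H(m), g2) raised to sk1·sk2.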
-int bls_spock_verify(const E2 *pk1, const byte *sig1, const E2 *pk2,
-                     const byte *sig2) {
-  E1 elemsG1[2];
-  E2 elemsG2[2];
-
-  // elemsG1[0] = s1
-  if (E1_read_bytes(&elemsG1[0], sig1, G1_SER_BYTES) != VALID) {
-    return INVALID;
-  };
-  // check s1 is in G1
-  if (!E1_in_G1(&elemsG1[0])) {
-    return INVALID;
-  }
-
-  // elemsG1[1] = s2
-  if (E1_read_bytes(&elemsG1[1], sig2, G1_SER_BYTES) != VALID) {
-    return INVALID;
-  };
-  // check s2 is in G1
-  if (!E1_in_G1(&elemsG1[1])) {
-    return INVALID;
-  }
-
-  // elemsG2[1] = pk1
-  E2_copy(&elemsG2[1], pk1);
-
-  // elemsG2[0] = -pk2
-  E2_neg(&elemsG2[0], pk2);
-
-  // double pairing
-  Fp12 e;
-  Fp12_multi_pairing(&e, elemsG1, elemsG2, 2);
-
-  if (Fp12_is_one(&e)) {
-    return VALID;
-  }
-  return INVALID;
-}
diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go
deleted file mode 100644
index 3b3939eaf6c..00000000000
--- a/crypto/bls_crossBLST_test.go
+++ /dev/null
@@ -1,218 +0,0 @@
-package crypto
-
-// This file contains tests against the library BLST (https://github.com/supranational/blst).
-// The purpose of these tests is to detect differences with a different implementation of BLS on the BLS12-381
-// curve since the BLS IETF draft (https://datatracker.ietf.org/doc/draft-irtf-cfrg-bls-signature/) doesn't
-// provide extensive test vectors.
-//
-// This file also serves as a way to test the Flow crypto module against random input data
-// generated by the "rapid" package. If the comparison against BLST is removed in the future,
-// it is mandatory to add fuzzing-like tests using random inputs.
-//
-// A detected difference with the BLST library doesn't necessarily mean a bug or a non-standard implementation since
-// both libraries might have made different choices. It is nevertheless a good flag for possible bugs or deviations
-// from the standard as both libraries are being developed.
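The differential tests below all follow one pattern: generate bytes that are either random or valid for one library, feed them to both implementations, and require that the accept/reject decisions and the re-encodings agree. A stripped-down sketch of that pattern, with hypothetical `decodeA`/`decodeB` stand-ins:

```go
package sketch

import "bytes"

// crossCheck compares two implementations on the same input: both must
// agree on whether the input decodes and, when it does, on the canonical
// re-encoding. This is the shape of every testXxxCrossBLST property below.
func crossCheck(input []byte,
	decodeA func([]byte) ([]byte, error),
	decodeB func([]byte) ([]byte, error)) bool {
	outA, errA := decodeA(input)
	outB, errB := decodeB(input)
	if (errA == nil) != (errB == nil) {
		return false // accept/reject decisions differ
	}
	if errA == nil && !bytes.Equal(outA, outB) {
		return false // both accepted but canonical encodings differ
	}
	return true
}
```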
- -import ( - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "pgregory.net/rapid" - - "github.com/onflow/flow-go/crypto/internal/blst" -) - -// validPrivateKeyBytesFlow generates bytes of a valid private key in Flow library -func validPrivateKeyBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) - sk, err := GeneratePrivateKey(BLSBLS12381, seed) - // TODO: require.NoError(t, err) seems to mess with rapid - if err != nil { - assert.FailNow(t, "failed key generation") - } - return sk.Encode() -} - -// validPublicKeyBytesFlow generates bytes of a valid public key in Flow library -func validPublicKeyBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) - sk, err := GeneratePrivateKey(BLSBLS12381, seed) - require.NoError(t, err) - return sk.PublicKey().Encode() -} - -// validSignatureBytesFlow generates bytes of a valid signature in Flow library -func validSignatureBytesFlow(t *rapid.T) []byte { - seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte) - sk, err := GeneratePrivateKey(BLSBLS12381, seed) - require.NoError(t, err) - hasher := NewExpandMsgXOFKMAC128("random_tag") - message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Draw(t, "msg").([]byte) - signature, err := sk.Sign(message, hasher) - require.NoError(t, err) - return signature -} - -// validPrivateKeyBytesBLST generates bytes of a valid private key in BLST library -func validPrivateKeyBytesBLST(t *rapid.T) []byte { - randomSlice := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen) - ikm := randomSlice.Draw(t, "ikm").([]byte) - return blst.KeyGen(ikm).Serialize() -} - -// validPublicKeyBytesBLST generates bytes of a valid public key in BLST library -func validPublicKeyBytesBLST(t *rapid.T) []byte { - ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm").([]byte) - blstS := blst.KeyGen(ikm) - blstG2 := new(blst.P2Affine).From(blstS) - return blstG2.Compress() -} - -// validSignatureBytesBLST generates bytes of a valid signature in BLST library -func validSignatureBytesBLST(t *rapid.T) []byte { - ikm := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "ikm").([]byte) - blstS := blst.KeyGen(ikm[:]) - blstG1 := new(blst.P1Affine).From(blstS) - return blstG1.Compress() -} - -// testEncodeDecodePrivateKeyCrossBLST tests encoding and decoding of private keys are consistent with BLST. -// This test assumes private key serialization is identical to the one in BLST. 
-func testEncodeDecodePrivateKeyCrossBLST(t *rapid.T) {
-	randomSlice := rapid.SliceOfN(rapid.Byte(), PrKeyLenBLSBLS12381, PrKeyLenBLSBLS12381)
-	validSliceFlow := rapid.Custom(validPrivateKeyBytesFlow)
-	validSliceBLST := rapid.Custom(validPrivateKeyBytesBLST)
-	// skBytes are bytes of either a valid or a random private key
-	skBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte)
-
-	// check decoding results are consistent
-	skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes)
-	var skBLST blst.Scalar
-	res := skBLST.Deserialize(skBytes)
-
-	flowPass := err == nil
-	blstPass := res != nil
-	require.Equal(t, flowPass, blstPass, "deserialization of the private key %x differs", skBytes)
-
-	// check private keys are equal
-	if blstPass && flowPass {
-		skFlowOutBytes := skFlow.Encode()
-		skBLSTOutBytes := skBLST.Serialize()
-
-		assert.Equal(t, skFlowOutBytes, skBLSTOutBytes)
-	}
-}
-
-// testEncodeDecodePublicKeyCrossBLST tests that encoding and decoding of public keys are consistent with BLST.
-// This test assumes public key serialization is identical to the one in BLST.
-func testEncodeDecodePublicKeyCrossBLST(t *rapid.T) {
-	randomSlice := rapid.SliceOfN(rapid.Byte(), PubKeyLenBLSBLS12381, PubKeyLenBLSBLS12381)
-	validSliceFlow := rapid.Custom(validPublicKeyBytesFlow)
-	validSliceBLST := rapid.Custom(validPublicKeyBytesBLST)
-	// pkBytes are bytes of either a valid or a random public key
-	pkBytes := rapid.OneOf(randomSlice, validSliceFlow, validSliceBLST).Example().([]byte)
-
-	// check decoding results are consistent
-	pkFlow, err := DecodePublicKey(BLSBLS12381, pkBytes)
-	var pkBLST blst.P2Affine
-	res := pkBLST.Deserialize(pkBytes)
-	pkValidBLST := pkBLST.KeyValidate()
-
-	flowPass := err == nil
-	blstPass := res != nil && pkValidBLST
-	require.Equal(t, flowPass, blstPass, "deserialization of pubkey %x differs", pkBytes)
-
-	// check public keys are equal
-	if flowPass && blstPass {
-		pkFlowOutBytes := pkFlow.Encode()
-		pkBLSTOutBytes := pkBLST.Compress()
-		assert.Equal(t, pkFlowOutBytes, pkBLSTOutBytes)
-	}
-}
-
-// testEncodeDecodeG1CrossBLST tests that encoding and decoding of G1 points are consistent with BLST.
-// This test assumes signature serialization is identical to BLST.
-func testEncodeDecodeG1CrossBLST(t *rapid.T) {
-	randomSlice := rapid.SliceOfN(rapid.Byte(), g1BytesLen, g1BytesLen)
-	validSignatureFlow := rapid.Custom(validSignatureBytesFlow)
-	validSignatureBLST := rapid.Custom(validSignatureBytesBLST)
-	// sigBytes are bytes of either a valid serialization of an E1/G1 point, or random bytes
-	sigBytes := rapid.OneOf(randomSlice, validSignatureFlow, validSignatureBLST).Example().([]byte)
-
-	// check decoding results are consistent
-	var pointFlow pointE1
-	err := readPointE1(&pointFlow, sigBytes)
-	flowPass := (err == nil) && (checkMembershipG1(&pointFlow))
-
-	var pointBLST blst.P1Affine
-	// res is non-nil iff point is in G1
-	res := pointBLST.Uncompress(sigBytes)
-	blstPass := (res != nil) && pointBLST.SigValidate(false)
-
-	require.Equal(t, flowPass, blstPass, "deserialization of G1 %x differs", sigBytes)
-
-	// check both serializations of G1 points are equal
-	if flowPass && blstPass {
-		sigFlowOutBytes := make([]byte, g1BytesLen)
-		writePointE1(sigFlowOutBytes, &pointFlow)
-		sigBLSTOutBytes := pointBLST.Compress()
-		assert.Equal(t, sigFlowOutBytes, sigBLSTOutBytes)
-	}
-}
-
-// testSignHashCrossBLST tests that signing a hashed message is consistent with BLST.
-
-// testSignHashCrossBLST tests that signing a hashed message is consistent with BLST.
-//
-// The test assumes the hash-to-field and map-to-curve used are identical in the two signatures:
-//   - hash-to-field : use XMD_SHA256 in both signatures
-//   - map to curve : Flow and BLST use an SWU mapping consistent with the test vector in
-//     https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-hash-to-curve-14#appendix-J.9.1
-//     (Flow map to curve is tested against the IETF draft in TestMapToG1, BLST map to curve is not
-//     tested in this repo)
-//
-// The test also assumes Flow signature serialization is identical to the one in BLST.
-func testSignHashCrossBLST(t *rapid.T) {
-    // decode two private keys from the same bytes
-    skBytes := rapid.Custom(validPrivateKeyBytesFlow).Example().([]byte)
-    skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes)
-    require.NoError(t, err)
-
-    var skBLST blst.Scalar
-    res := skBLST.Deserialize(skBytes)
-    require.NotNil(t, res)
-
-    // generate two signatures using both libraries
-    blsCipher := []byte("BLS_SIG_BLS12381G1_XMD:SHA-256_SSWU_RO_NUL_")
-    message := rapid.SliceOfN(rapid.Byte(), 1, 1000).Example().([]byte)
-
-    var sigBLST blst.P1Affine
-    sigBLST.Sign(&skBLST, message, blsCipher)
-    sigBytesBLST := sigBLST.Compress()
-
-    skFlowBLS, ok := skFlow.(*prKeyBLSBLS12381)
-    require.True(t, ok)
-    sigFlow := skFlowBLS.signWithXMDSHA256(message)
-    sigBytesFlow := sigFlow.Bytes()
-
-    // check both signatures are equal
-    assert.Equal(t, sigBytesBLST, sigBytesFlow)
-}
-
-func testKeyGenCrossBLST(t *rapid.T) {
-    seed := rapid.SliceOfN(rapid.Byte(), KeyGenSeedMinLen, KeyGenSeedMaxLen).Draw(t, "seed").([]byte)
-
-    skFlow, err := GeneratePrivateKey(BLSBLS12381, seed)
-    if err != nil {
-        assert.FailNow(t, "failed key generation")
-    }
-    skBLST := blst.KeyGen(seed)
-    assert.Equal(t, skFlow.Encode(), skBLST.Serialize())
-}
-
-func TestCrossBLST(t *testing.T) {
-    rapid.Check(t, testKeyGenCrossBLST)
-    rapid.Check(t, testEncodeDecodePrivateKeyCrossBLST)
-    rapid.Check(t, testEncodeDecodePublicKeyCrossBLST)
-    rapid.Check(t, testEncodeDecodeG1CrossBLST)
-    rapid.Check(t, testSignHashCrossBLST)
-}
diff --git a/crypto/bls_include.h b/crypto/bls_include.h
deleted file mode 100644
index af380735237..00000000000
--- a/crypto/bls_include.h
+++ /dev/null
@@ -1,22 +0,0 @@
-// this file declares the core functions required by the BLS signature scheme
-
-#ifndef _BLS_INCLUDE_H
-#define _BLS_INCLUDE_H
-
-#include "bls12381_utils.h"
-
-// BLS signature core (functions in bls_core.c)
-int bls_sign(byte *, const Fr *, const byte *, const int);
-int bls_verify(const E2 *, const byte *, const byte *, const int);
-int bls_verifyPerDistinctMessage(const byte *, const int, const byte *,
-                                 const uint32_t *, const uint32_t *,
-                                 const E2 *);
-int bls_verifyPerDistinctKey(const byte *, const int, const E2 *,
-                             const uint32_t *, const byte *, const uint32_t *);
-void bls_batch_verify(const int, byte *, const E2 *, const byte *, const byte *,
-                      const int, const byte *);
-
-// BLS based SPoCK
-int bls_spock_verify(const E2 *, const byte *, const E2 *, const byte *);
-
-#endif
diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go
deleted file mode 100644
index ea534f790f1..00000000000
--- a/crypto/bls_multisig.go
+++ /dev/null
@@ -1,576 +0,0 @@
-package crypto
-
-import (
-    "crypto/rand"
-    "errors"
-    "fmt"
-
-    "github.com/onflow/flow-go/crypto/hash"
-)
-
-// BLS multi-signature using the BLS12-381 curve
-// ([zcash]https://github.com/zkcrypto/pairing/blob/master/src/bls12_381/README.md#bls12-381).
-// Pairing, elliptic curve and modular arithmetic use
-// [BLST](https://github.com/supranational/blst/tree/master/src) tools underneath.
-// This implementation does not include any security against side-channel or fault attacks.
-
-// Existing features:
-//   - the same BLS set-up as in bls.go
-//   - use of the proof of possession (PoP) scheme to protect against rogue public-key attacks
-//   - aggregation of private keys, public keys and signatures
-//   - subtraction of multiple public keys from an (aggregated) public key
-//   - multi-signature verification of an aggregated signature of a single message
-//     under multiple public keys
-//   - multi-signature verification of an aggregated signature of multiple messages under
-//     multiple public keys
-//   - batch verification of multiple signatures of a single message under multiple
-//     public keys, using a binary tree of aggregations
-
-// #include "bls12381_utils.h"
-// #include "bls_include.h"
-import "C"
-
-// the PoP hasher, used to generate and verify PoPs
-// The key is based on blsPOPCipherSuite which guarantees
-// that hash_to_field of PoP is orthogonal to all hash_to_field functions
-// used for signatures.
-var popKMAC = internalExpandMsgXOFKMAC128(blsPOPCipherSuite)
-
-// BLSGeneratePOP returns a proof of possession (PoP) for the receiver private key.
-//
-// The KMAC hasher used in the function is guaranteed to be orthogonal to all hashers used
-// for signatures or SPoCK proofs in this package. This means a specific domain tag is used
-// to generate PoP and is not used by any other application.
-//
-// The function returns:
-//   - (nil, notBLSKeyError) if the input key is not of type BLS BLS12-381
-//   - (pop, nil) otherwise
-func BLSGeneratePOP(sk PrivateKey) (Signature, error) {
-    _, ok := sk.(*prKeyBLSBLS12381)
-    if !ok {
-        return nil, notBLSKeyError
-    }
-    // sign the public key
-    return sk.Sign(sk.PublicKey().Encode(), popKMAC)
-}
-
-// BLSVerifyPOP verifies a proof of possession (PoP) for the receiver public key.
-//
-// The function internally uses the same KMAC hasher used to generate the PoP.
-// The hasher is guaranteed to be orthogonal to any hasher used to generate signatures
-// or SPoCK proofs in this package.
-// Note that verifying a PoP against an identity public key fails.
-//
-// The function returns:
-//   - (false, notBLSKeyError) if the input key is not of type BLS BLS12-381
-//   - (validity, nil) otherwise
-func BLSVerifyPOP(pk PublicKey, s Signature) (bool, error) {
-    _, ok := pk.(*pubKeyBLSBLS12381)
-    if !ok {
-        return false, notBLSKeyError
-    }
-    // verify the signature against the public key
-    return pk.Verify(s, pk.Encode(), popKMAC)
-}
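A usage sketch of the PoP pair above. This is a minimal example assuming the pre-removal import path github.com/onflow/flow-go/crypto (the path this PR removes); in a real protocol the PoP is published alongside the public key and checked before the key enters any aggregation.

package main

import (
    "crypto/rand"
    "fmt"

    "github.com/onflow/flow-go/crypto"
)

func main() {
    // generate a BLS key pair from a random seed
    seed := make([]byte, crypto.KeyGenSeedMinLen)
    if _, err := rand.Read(seed); err != nil {
        panic(err)
    }
    sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed)
    if err != nil {
        panic(err)
    }

    // the key owner generates the PoP once and publishes it with the public key
    pop, err := crypto.BLSGeneratePOP(sk)
    if err != nil {
        panic(err)
    }

    // a verifier checks the PoP before aggregating this public key
    valid, err := crypto.BLSVerifyPOP(sk.PublicKey(), pop)
    if err != nil {
        panic(err)
    }
    fmt.Println("PoP valid:", valid) // expected: true
}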
-
-// AggregateBLSSignatures aggregates multiple BLS signatures into one.
-//
-// Signatures could be generated from the same or distinct messages; they
-// could also be the aggregation of other signatures.
-// The order of the signatures in the slice does not matter since the aggregation
-// is commutative. The slice should not be empty.
-// No G1 membership check is performed on the input signatures.
-//
-// The function returns:
-//   - (nil, blsAggregateEmptyListError) if no signatures are provided (input slice is empty)
-//   - (nil, invalidSignatureError) if a deserialization of at least one signature fails (input is an invalid
-//     serialization of a compressed E1 element following [zcash]
-//     https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-).
-//     G1 membership is not checked.
-//   - (nil, error) if an unexpected error occurs
-//   - (aggregated_signature, nil) otherwise
-func AggregateBLSSignatures(sigs []Signature) (Signature, error) {
-    // check for empty list
-    if len(sigs) == 0 {
-        return nil, blsAggregateEmptyListError
-    }
-
-    // flatten the shares (required by the C layer)
-    flatSigs := make([]byte, 0, SignatureLenBLSBLS12381*len(sigs))
-    for i, sig := range sigs {
-        if len(sig) != SignatureLenBLSBLS12381 {
-            return nil, fmt.Errorf("signature at index %d has an invalid length: %w", i, invalidSignatureError)
-        }
-        flatSigs = append(flatSigs, sig...)
-    }
-    aggregatedSig := make([]byte, SignatureLenBLSBLS12381)
-
-    // add the points in the C layer
-    result := C.E1_sum_vector_byte(
-        (*C.uchar)(&aggregatedSig[0]),
-        (*C.uchar)(&flatSigs[0]),
-        (C.int)(len(flatSigs)))
-
-    switch result {
-    case valid:
-        return aggregatedSig, nil
-    case invalid:
-        return nil, invalidSignatureError
-    default:
-        return nil, fmt.Errorf("aggregating signatures failed")
-    }
-}
-
-// AggregateBLSPrivateKeys aggregates multiple BLS private keys into one.
-//
-// The order of the keys in the slice does not matter since the aggregation
-// is commutative. The slice should not be empty.
-// No check is performed on the input private keys.
-// Input or output private keys could be equal to the identity element (zero). Note that any
-// signature generated by the identity key is invalid (to avoid equivocation issues).
-//
-// The function returns:
-//   - (nil, notBLSKeyError) if at least one key is not of type BLS BLS12-381
-//   - (nil, blsAggregateEmptyListError) if no keys are provided (input slice is empty)
-//   - (aggregated_key, nil) otherwise
-func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) {
-    // check for empty list
-    if len(keys) == 0 {
-        return nil, blsAggregateEmptyListError
-    }
-
-    scalars := make([]scalar, 0, len(keys))
-    for i, sk := range keys {
-        skBls, ok := sk.(*prKeyBLSBLS12381)
-        if !ok {
-            return nil, fmt.Errorf("key at index %d is invalid: %w", i, notBLSKeyError)
-        }
-        scalars = append(scalars, skBls.scalar)
-    }
-
-    var sum scalar
-    C.Fr_sum_vector((*C.Fr)(&sum), (*C.Fr)(&scalars[0]),
-        (C.int)(len(scalars)))
-    return newPrKeyBLSBLS12381(&sum), nil
-}
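The two aggregations above are consistent with each other: aggregating per-key signatures of one message yields the same bytes as signing that message with the aggregated private key. Below is a hedged sketch of that property, under the same import-path assumption as the previous example; the package name, function name, and tag are illustrative only.

package blsdemo

import (
    "bytes"
    "fmt"

    "github.com/onflow/flow-go/crypto"
)

// aggregationConsistency checks that aggregating per-key signatures of one
// message equals signing that message with the aggregated private key.
func aggregationConsistency(sks []crypto.PrivateKey, message []byte) error {
    hasher := crypto.NewExpandMsgXOFKMAC128("demo tag")

    sigs := make([]crypto.Signature, 0, len(sks))
    for _, sk := range sks {
        sig, err := sk.Sign(message, hasher)
        if err != nil {
            return err
        }
        sigs = append(sigs, sig)
    }

    aggSig, err := crypto.AggregateBLSSignatures(sigs)
    if err != nil {
        return err
    }
    aggSk, err := crypto.AggregateBLSPrivateKeys(sks)
    if err != nil {
        return err
    }
    expected, err := aggSk.Sign(message, hasher)
    if err != nil {
        return err
    }
    if !bytes.Equal(aggSig, expected) {
        return fmt.Errorf("aggregated signature does not match")
    }
    return nil
}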
-
-// AggregateBLSPublicKeys aggregates multiple BLS public keys into one.
-//
-// The order of the keys in the slice does not matter since the aggregation
-// is commutative. The slice should not be empty.
-// No check is performed on the input public keys. The input keys are guaranteed by
-// the package constructors to be on the G2 subgroup.
-// Input or output keys can be equal to the identity key. Note that any
-// signature verified against the identity key is invalid (to avoid equivocation issues).
-//
-// The function returns:
-//   - (nil, notBLSKeyError) if at least one key is not of type BLS BLS12-381
-//   - (nil, blsAggregateEmptyListError) if no keys are provided (input slice is empty)
-//   - (aggregated_key, nil) otherwise
-func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) {
-
-    // check for empty list
-    if len(keys) == 0 {
-        return nil, blsAggregateEmptyListError
-    }
-
-    points := make([]pointE2, 0, len(keys))
-    for i, pk := range keys {
-        pkBLS, ok := pk.(*pubKeyBLSBLS12381)
-        if !ok {
-            return nil, fmt.Errorf("key at index %d is invalid: %w", i, notBLSKeyError)
-        }
-        points = append(points, pkBLS.point)
-    }
-
-    var sum pointE2
-    C.E2_sum_vector_to_affine((*C.E2)(&sum), (*C.E2)(&points[0]),
-        (C.int)(len(points)))
-
-    sumKey := newPubKeyBLSBLS12381(&sum)
-    return sumKey, nil
-}
-
-// IdentityBLSPublicKey returns an identity public key which corresponds to the point
-// at infinity in G2 (the identity element of the group).
-func IdentityBLSPublicKey() PublicKey {
-    return &g2PublicKey
-}
-
-// RemoveBLSPublicKeys removes multiple BLS public keys from a given (aggregated) public key.
-//
-// The common use case assumes the aggregated public key was initially formed using
-// the keys to be removed (directly or using other aggregated forms). However, the function
-// can still be called in different use cases.
-// The order of the keys to be removed in the slice does not matter since the removal
-// is commutative. The slice of keys to be removed can be empty.
-// No check is performed on the input public keys. The input keys are guaranteed by the
-// package constructors to be on the G2 subgroup.
-// Input or output keys can be equal to the identity key.
-//
-// The function returns:
-//   - (nil, notBLSKeyError) if at least one input key is not of type BLS BLS12-381
-//   - (remaining_key, nil) otherwise
-func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, error) {
-
-    aggPKBLS, ok := aggKey.(*pubKeyBLSBLS12381)
-    if !ok {
-        return nil, notBLSKeyError
-    }
-
-    pointsToSubtract := make([]pointE2, 0, len(keysToRemove))
-    for i, pk := range keysToRemove {
-        pkBLS, ok := pk.(*pubKeyBLSBLS12381)
-        if !ok {
-            return nil, fmt.Errorf("key at index %d is invalid: %w", i, notBLSKeyError)
-        }
-        pointsToSubtract = append(pointsToSubtract, pkBLS.point)
-    }
-
-    // check for an empty list to avoid a cgo edge case
-    if len(keysToRemove) == 0 {
-        return aggKey, nil
-    }
-
-    var resultPoint pointE2
-    C.E2_subtract_vector((*C.E2)(&resultPoint), (*C.E2)(&aggPKBLS.point),
-        (*C.E2)(&pointsToSubtract[0]), (C.int)(len(pointsToSubtract)))
-
-    resultKey := newPubKeyBLSBLS12381(&resultPoint)
-    return resultKey, nil
-}
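A short sketch of the removal property described above: removing b from Aggregate(a, b) recovers a. Same assumptions as the earlier sketches; the function name is illustrative.

package blsdemo

import "github.com/onflow/flow-go/crypto"

// removeRecoversKey checks that removing pkB from Aggregate(pkA, pkB) recovers pkA.
func removeRecoversKey(pkA, pkB crypto.PublicKey) (bool, error) {
    aggPk, err := crypto.AggregateBLSPublicKeys([]crypto.PublicKey{pkA, pkB})
    if err != nil {
        return false, err
    }
    remaining, err := crypto.RemoveBLSPublicKeys(aggPk, []crypto.PublicKey{pkB})
    if err != nil {
        return false, err
    }
    return remaining.Equals(pkA), nil
}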
-
-// VerifyBLSSignatureOneMessage is a multi-signature verification that verifies a
-// BLS signature of a single message against multiple BLS public keys.
-//
-// The input signature could be generated by aggregating multiple signatures of the
-// message under multiple private keys. The public keys corresponding to the signing
-// private keys are passed as input to this function.
-// The caller must make sure the input public keys' proofs of possession have been
-// verified prior to calling this function (or that each input key is a sum of public keys
-// whose proofs of possession have been verified).
-//
-// The input hasher is the same hasher used to generate all initial signatures.
-// The order of the public keys in the slice does not matter.
-// A membership check is performed on the input signature but not on the input
-// public keys (membership is guaranteed by using the package functions).
-// If the input public keys add up to the identity public key, the signature is invalid
-// to avoid signature equivocation issues.
-//
-// This is a special case of VerifyBLSSignatureManyMessages, using a single
-// message and hasher.
-//
-// The function returns:
-//   - (false, nilHasherError) if hasher is nil
-//   - (false, invalidHasherSizeError) if hasher's output size is not 128 bytes
-//   - (false, notBLSKeyError) if at least one key is not of type pubKeyBLSBLS12381
-//   - (false, blsAggregateEmptyListError) if the input key slice is empty
-//   - (false, error) if an unexpected error occurs
-//   - (validity, nil) otherwise
-func VerifyBLSSignatureOneMessage(
-    pks []PublicKey, s Signature, message []byte, kmac hash.Hasher,
-) (bool, error) {
-    // the public key list must be non-empty; this is checked internally by AggregateBLSPublicKeys
-    aggPk, err := AggregateBLSPublicKeys(pks)
-    if err != nil {
-        return false, fmt.Errorf("verify signature one message failed: %w", err)
-    }
-    return aggPk.Verify(s, message, kmac)
-}
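Since VerifyBLSSignatureOneMessage is documented above as a special case of VerifyBLSSignatureManyMessages, the two entry points should agree for any input. A hedged sketch of that equivalence, with illustrative names and the pre-removal import paths:

package blsdemo

import (
    "github.com/onflow/flow-go/crypto"
    "github.com/onflow/flow-go/crypto/hash"
)

// oneMessageAsManyMessages verifies the same aggregated signature through both
// entry points; per the special case above, the two results should agree.
func oneMessageAsManyMessages(pks []crypto.PublicKey, sig crypto.Signature, msg []byte) (bool, bool, error) {
    kmac := crypto.NewExpandMsgXOFKMAC128("demo tag")

    one, err := crypto.VerifyBLSSignatureOneMessage(pks, sig, msg, kmac)
    if err != nil {
        return false, false, err
    }

    // repeat the single message and hasher once per key
    msgs := make([][]byte, len(pks))
    kmacs := make([]hash.Hasher, len(pks))
    for i := range pks {
        msgs[i] = msg
        kmacs[i] = kmac
    }
    many, err := crypto.VerifyBLSSignatureManyMessages(pks, sig, msgs, kmacs)
    return one, many, err
}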
-
-// VerifyBLSSignatureManyMessages is a multi-signature verification that verifies a
-// BLS signature under multiple messages and public keys.
-//
-// The input signature could be generated by aggregating multiple signatures of distinct
-// messages under distinct private keys. The verification is performed against the message
-// at index (i) and the public key at the same index (i) of the input messages and public keys.
-// The hasher at index (i) is used to hash the message at index (i).
-//
-// Since the package only supports the Proof of Possession scheme, the function does not enforce
-// input messages to be distinct. Therefore, the caller must make sure the input public keys'
-// proofs of possession have been verified prior to calling this function (or that each input
-// key is a sum of public keys whose proofs of possession have been verified).
-//
-// The verification is optimized to compute one pairing per distinct message, or one pairing
-// per distinct key, whichever requires fewer pairing calls. If all messages are the same, the
-// function behaves like VerifyBLSSignatureOneMessage. If there is one input message and
-// one input public key, the function behaves like pk.Verify.
-// A membership check is performed on the input signature.
-// In order to avoid equivocation issues, any identity public key results in the overall
-// signature being invalid.
-//
-// The function returns:
-//   - (false, nilHasherError) if a hasher is nil
-//   - (false, invalidHasherSizeError) if a hasher's output size is not 128 bytes
-//   - (false, notBLSKeyError) if at least one key is not a BLS BLS12-381 key
-//   - (false, invalidInputsError) if the number of keys does not match the number of messages and hashers
-//   - (false, blsAggregateEmptyListError) if the input key slice `pks` is empty
-//   - (false, error) if an unexpected error occurs
-//   - (validity, nil) otherwise
-func VerifyBLSSignatureManyMessages(
-    pks []PublicKey, s Signature, messages [][]byte, kmac []hash.Hasher,
-) (bool, error) {
-
-    // check signature length
-    if len(s) != SignatureLenBLSBLS12381 {
-        return false, nil
-    }
-    // check the list lengths
-    if len(pks) == 0 {
-        return false, fmt.Errorf("invalid list of public keys: %w", blsAggregateEmptyListError)
-    }
-    if len(pks) != len(messages) || len(kmac) != len(messages) {
-        return false, invalidInputsErrorf(
-            "input lists must be equal, messages are %d, keys are %d, hashers are %d",
-            len(messages),
-            len(pks),
-            len(kmac))
-    }
-
-    // compute the hashes
-    hashes := make([][]byte, 0, len(messages))
-    for i, k := range kmac {
-        if err := checkBLSHasher(k); err != nil {
-            return false, fmt.Errorf("hasher at index %d is invalid: %w", i, err)
-        }
-        hashes = append(hashes, k.ComputeHash(messages[i]))
-    }
-
-    // two maps to count which type (keys or messages) has fewer distinct elements.
-    // mapPerHash maps hashes to keys while mapPerPk maps keys to hashes.
-    // The comparison of the maps' lengths minimizes the number of pairings to
-    // compute by aggregating either public keys or the message hashes in
-    // the verification equation.
-    mapPerHash := make(map[string][]pointE2)
-    mapPerPk := make(map[pointE2][][]byte)
-    // Note: mapPerPk is using a cgo structure as map keys which may lead to 2 equal public keys
-    // being considered distinct. This does not make the verification equation wrong but leads to
-    // computing extra pairings. This case is considered unlikely to happen since a caller is likely
-    // to use the same struct for a same public key.
-    // One way to fix this is to use the public key encoding as the map keys and store the "pointE2"
-    // structure with the map value, which adds more complexity and processing time.
-
-    // fill the 2 maps
-    for i, pk := range pks {
-        pkBLS, ok := pk.(*pubKeyBLSBLS12381)
-        if !ok {
-            return false, fmt.Errorf(
-                "public key at index %d is invalid: %w",
-                i, notBLSKeyError)
-        }
-        // identity key check
-        if pkBLS.isIdentity {
-            return false, nil
-        }
-
-        mapPerHash[string(hashes[i])] = append(mapPerHash[string(hashes[i])], pkBLS.point)
-        mapPerPk[pkBLS.point] = append(mapPerPk[pkBLS.point], hashes[i])
-    }
-
-    var verif (C.int)
-    // compare the 2 maps for the shortest length
-    if len(mapPerHash) < len(mapPerPk) {
-        // aggregate keys per distinct hashes
-        // using the linearity of the pairing on the G2 variables.
-        flatDistinctHashes := make([]byte, 0)
-        lenHashes := make([]uint32, 0)
-        pkPerHash := make([]uint32, 0, len(mapPerHash))
-        allPks := make([]pointE2, 0)
-        for hash, pksVal := range mapPerHash {
-            flatDistinctHashes = append(flatDistinctHashes, []byte(hash)...)
-            lenHashes = append(lenHashes, uint32(len([]byte(hash))))
-            pkPerHash = append(pkPerHash, uint32(len(pksVal)))
-            allPks = append(allPks, pksVal...)
-        }
-        verif = C.bls_verifyPerDistinctMessage(
-            (*C.uchar)(&s[0]),
-            (C.int)(len(mapPerHash)),
-            (*C.uchar)(&flatDistinctHashes[0]),
-            (*C.uint32_t)(&lenHashes[0]),
-            (*C.uint32_t)(&pkPerHash[0]),
-            (*C.E2)(&allPks[0]),
-        )
-
-    } else {
-        // aggregate hashes per distinct key
-        // using the linearity of the pairing on the G1 variables.
-        distinctPks := make([]pointE2, 0, len(mapPerPk))
-        hashPerPk := make([]uint32, 0, len(mapPerPk))
-        flatHashes := make([]byte, 0)
-        lenHashes := make([]uint32, 0)
-        for pk, hashesVal := range mapPerPk {
-            distinctPks = append(distinctPks, pk)
-            hashPerPk = append(hashPerPk, uint32(len(hashesVal)))
-            for _, h := range hashesVal {
-                flatHashes = append(flatHashes, h...)
-                lenHashes = append(lenHashes, uint32(len(h)))
-            }
-        }
-
-        verif = C.bls_verifyPerDistinctKey(
-            (*C.uchar)(&s[0]),
-            (C.int)(len(mapPerPk)),
-            (*C.E2)(&distinctPks[0]),
-            (*C.uint32_t)(&hashPerPk[0]),
-            (*C.uchar)(&flatHashes[0]),
-            (*C.uint32_t)(&lenHashes[0]))
-    }
-
-    switch verif {
-    case invalid:
-        return false, nil
-    case valid:
-        return true, nil
-    default:
-        return false, fmt.Errorf("signature verification failed")
-    }
-}
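A usage sketch of the many-messages flow: each signer signs its own message, the signatures are aggregated, and a single call verifies the aggregate against the per-index keys, messages, and hashers. Assumptions are the same as in the earlier sketches; the caller must supply one message per key.

package blsdemo

import (
    "github.com/onflow/flow-go/crypto"
    "github.com/onflow/flow-go/crypto/hash"
)

// signAndVerifyMany has each signer sign its own message, aggregates the n
// signatures, and verifies them in a single call (msgs[i] pairs with sks[i]).
func signAndVerifyMany(sks []crypto.PrivateKey, msgs [][]byte) (bool, error) {
    kmac := crypto.NewExpandMsgXOFKMAC128("demo tag")

    sigs := make([]crypto.Signature, len(sks))
    pks := make([]crypto.PublicKey, len(sks))
    kmacs := make([]hash.Hasher, len(sks))
    for i, sk := range sks {
        sig, err := sk.Sign(msgs[i], kmac)
        if err != nil {
            return false, err
        }
        sigs[i] = sig
        pks[i] = sk.PublicKey()
        kmacs[i] = kmac
    }

    aggSig, err := crypto.AggregateBLSSignatures(sigs)
    if err != nil {
        return false, err
    }
    return crypto.VerifyBLSSignatureManyMessages(pks, aggSig, msgs, kmacs)
}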
-
-// BatchVerifyBLSSignaturesOneMessage is a batch verification of multiple
-// BLS signatures of a single message against multiple BLS public keys that
-// is faster than verifying the signatures one by one.
-//
-// Each signature at index (i) of the input signature slice is verified against
-// the public key at the same index (i) in the input key slice.
-// The input hasher is the same one used to generate all signatures.
-// The returned boolean slice has the same length as the signatures slice,
-// where the boolean at index (i) is true if signature (i) verifies against
-// public key (i), and false otherwise.
-// If an error occurs during the execution of the function,
-// all returned boolean values are `false`.
-//
-// The caller must make sure the input public keys' proofs of possession have been
-// verified prior to calling this function (or that each input key is a sum of public
-// keys whose proofs of possession have been verified).
-//
-// Membership checks are performed on the input signatures but not
-// on the input public keys (which are guaranteed by the package to be on the correct
-// G2 subgroup).
-// In order to avoid equivocation issues, any identity public key results in the corresponding
-// signature being invalid.
-//
-// The function returns:
-//   - ([]false, nilHasherError) if a hasher is nil
-//   - ([]false, invalidHasherSizeError) if a hasher's output size is not 128 bytes
-//   - ([]false, notBLSKeyError) if at least one key is not of type BLS BLS12-381
-//   - ([]false, invalidInputsError) if the number of keys does not match the number of signatures
-//   - ([]false, blsAggregateEmptyListError) if the input key slice is empty
-//   - ([]false, error) if an unexpected error occurs
-//   - ([]validity, nil) otherwise
-func BatchVerifyBLSSignaturesOneMessage(
-    pks []PublicKey, sigs []Signature, message []byte, kmac hash.Hasher,
-) ([]bool, error) {
-    // boolean array returned when errors occur
-    falseSlice := make([]bool, len(sigs))
-
-    // empty list check
-    if len(pks) == 0 {
-        return falseSlice, fmt.Errorf("invalid list of public keys: %w", blsAggregateEmptyListError)
-    }
-
-    if len(pks) != len(sigs) {
-        return falseSlice, invalidInputsErrorf(
-            "keys length %d and signatures length %d are mismatching",
-            len(pks),
-            len(sigs))
-    }
-
-    if err := checkBLSHasher(kmac); err != nil {
-        return falseSlice, err
-    }
-
-    // flatten the shares (required by the C layer)
-    flatSigs := make([]byte, 0, SignatureLenBLSBLS12381*len(sigs))
-    pkPoints := make([]pointE2, 0, len(pks))
-
-    getIdentityPoint := func() pointE2 {
-        pk, _ := IdentityBLSPublicKey().(*pubKeyBLSBLS12381) // second value is guaranteed to be true
-        return pk.point
-    }
-
-    returnBool := make([]bool, len(sigs))
-    for i, pk := range pks {
-        pkBLS, ok := pk.(*pubKeyBLSBLS12381)
-        if !ok {
-            return falseSlice, fmt.Errorf("key at index %d is invalid: %w", i, notBLSKeyError)
-        }
-
-        if len(sigs[i]) != SignatureLenBLSBLS12381 || pkBLS.isIdentity {
-            // case of an invalid signature: set the signature and public key at index `i`
-            // to identities so that there is no effect on the aggregation tree computation.
-            // However, the boolean return for index `i` is set to `false` and won't be overwritten.
-            returnBool[i] = false
-            pkPoints = append(pkPoints, getIdentityPoint())
-            flatSigs = append(flatSigs, g1Serialization...)
-        } else {
-            returnBool[i] = true // default to true
-            pkPoints = append(pkPoints, pkBLS.point)
-            flatSigs = append(flatSigs, sigs[i]...)
-        }
-    }
-
-    // hash the input to 128 bytes
-    h := kmac.ComputeHash(message)
-    verifInt := make([]byte, len(sigs))
-    // internal non-deterministic entropy source required by bls_batch_verify;
-    // a specific seed length is required by bls_batch_verify.
-    seed := make([]byte, (securityBits/8)*len(verifInt))
-    _, err := rand.Read(seed)
-    if err != nil {
-        return falseSlice, fmt.Errorf("generating randoms failed: %w", err)
-    }
-
-    C.bls_batch_verify(
-        (C.int)(len(verifInt)),
-        (*C.uchar)(&verifInt[0]),
-        (*C.E2)(&pkPoints[0]),
-        (*C.uchar)(&flatSigs[0]),
-        (*C.uchar)(&h[0]),
-        (C.int)(len(h)),
-        (*C.uchar)(&seed[0]),
-    )
-
-    for i, v := range verifInt {
-        if (C.int)(v) != valid && (C.int)(v) != invalid {
-            return falseSlice, fmt.Errorf("batch verification failed")
-        }
-        if returnBool[i] { // only overwrite if not previously set to false
-            returnBool[i] = ((C.int)(v) == valid)
-        }
-    }
-    return returnBool, nil
-}
-
-// blsAggregateEmptyListError is returned when a list of BLS objects (e.g. signatures or keys)
-// is empty or nil and thereby represents an invalid input.
-var blsAggregateEmptyListError = errors.New("list cannot be empty")
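A final usage sketch: running the batched verification and distinguishing the empty-list error with the predicate defined below. Assumptions are as in the earlier sketches; the wrapping message is illustrative.

package blsdemo

import (
    "fmt"

    "github.com/onflow/flow-go/crypto"
)

// batchVerify runs the batched verification and demonstrates the error
// predicate for the empty-list edge case.
func batchVerify(pks []crypto.PublicKey, sigs []crypto.Signature, msg []byte) ([]bool, error) {
    kmac := crypto.NewExpandMsgXOFKMAC128("demo tag")
    oks, err := crypto.BatchVerifyBLSSignaturesOneMessage(pks, sigs, msg, kmac)
    if crypto.IsBLSAggregateEmptyListError(err) {
        // the caller passed empty slices; surface it as a usage error
        return nil, fmt.Errorf("nothing to batch-verify: %w", err)
    }
    return oks, err
}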
-
-// IsBLSAggregateEmptyListError checks if err is a `blsAggregateEmptyListError`.
-// blsAggregateEmptyListError is returned when a BLS aggregation function is called with
-// an empty list, which is not allowed in some aggregation cases to avoid signature equivocation
-// issues.
-func IsBLSAggregateEmptyListError(err error) bool {
-    return errors.Is(err, blsAggregateEmptyListError)
-}
-
-// notBLSKeyError is returned when a private or public key
-// used is not a BLS on BLS12-381 key.
-var notBLSKeyError = errors.New("input key has to be a BLS on BLS12-381 key")
-
-// IsNotBLSKeyError checks if err is a `notBLSKeyError`.
-// notBLSKeyError is returned when a private or public key
-// used is not a BLS on BLS12-381 key.
-func IsNotBLSKeyError(err error) bool {
-    return errors.Is(err, notBLSKeyError)
-}
-
-// invalidSignatureError is returned when a signature input does not deserialize to a
-// valid element on E1 of the BLS12-381 curve (without checking that the element is in the subgroup G1).
-var invalidSignatureError = errors.New("input signature does not deserialize to an E1 element")
-
-// IsInvalidSignatureError checks if err is an `invalidSignatureError`.
-// invalidSignatureError is returned when a signature input does not deserialize to a
-// valid element on E1 of the BLS12-381 curve (without checking that the element is in the subgroup G1).
-func IsInvalidSignatureError(err error) bool {
-    return errors.Is(err, invalidSignatureError)
-}
diff --git a/crypto/bls_test.go b/crypto/bls_test.go
deleted file mode 100644
index 4fa02958496..00000000000
--- a/crypto/bls_test.go
+++ /dev/null
@@ -1,1229 +0,0 @@
-package crypto
-
-import (
-    crand "crypto/rand"
-    "encoding/hex"
-    "fmt"
-    mrand "math/rand"
-    "testing"
-
-    "github.com/stretchr/testify/assert"
-    "github.com/stretchr/testify/require"
-
-    "github.com/onflow/flow-go/crypto/hash"
-)
-
-// TestBLSMainMethods is a sanity check of the main signature scheme methods (keyGen, sign, verify)
-func TestBLSMainMethods(t *testing.T) {
-    // test the key generation seed lengths
-    testKeyGenSeed(t, BLSBLS12381, KeyGenSeedMinLen, KeyGenSeedMaxLen)
-    // test the consistency with different inputs
-    hasher := NewExpandMsgXOFKMAC128("test tag")
-    testGenSignVerify(t, BLSBLS12381, hasher)
-
-    // specific signature test for BLS:
-    // Test a signature with a point encoded with a coordinate x not reduced mod p.
-    // The same signature point with the x coordinate reduced passes verification.
-    // This test checks that:
-    //  - signature decoding handles input x-coordinates larger than p (doesn't result in an exception)
-    //  - signature decoding only accepts reduced x-coordinates to avoid signature malleability
-
-    t.Run("invalid x coordinate larger than p", func(t *testing.T) {
-        if !isG1Compressed() || !isG2Compressed() {
-            t.Skip()
-        }
-        msg, err := hex.DecodeString("7f26ba692dc2da7ff828ef4675ff1cd6ab855fca0637b6dab295f1df8e51bc8bb1b8f0c6610aabd486cf1f098f2ddbc6691d94e10f928816f890a3d366ce46249836a595c7ea1828af52e899ba2ab627ab667113bb563918c5d5a787c414399487b4e3a7")
-        require.NoError(t, err)
-        validSig, err := hex.DecodeString("80b0cac2a0f4f8881913edf2b29065675dfed6f6f4e17e9b5d860a845d4e7d476b277d06a493b81482e63d8131f9f2fa")
-        require.NoError(t, err)
-        invalidSig, err := hex.DecodeString("9AB1DCACDA74DF22642F95A8F5DC123EC276227BE866915AC4B6DD2553FF736B89D37D0555E7B8143CE53D8131F99DA5")
-        require.NoError(t, err)
-        pkBytes, err := hex.DecodeString("a7ac85ac8ffd9d2611f73721a93ec92115f29d769dfa425fec2e2c26ab3e4e8089a961ab430639104262723e829b75e9190a05d8fc8d22a7ac78a18473cc3df146b5c4c9c8e46d5f208039384fe2fc018321f14c01641c3afff7558a2eb06463")
-        require.NoError(t, err)
-        pk, err := DecodePublicKey(BLSBLS12381, pkBytes)
-        require.NoError(t, err)
-        // sanity check of the valid signature (P_x < p)
-        valid, err := pk.Verify(validSig, msg, hasher)
-        require.NoError(t, err)
-        require.True(t, valid)
-        // invalid signature (P'_x = P_x + p)
-        valid, err = pk.Verify(invalidSig, msg, hasher)
-        require.NoError(t, err)
-        assert.False(t, valid)
-    })
-
-    t.Run("private key equal to 1 and -1", func(t *testing.T) {
-        sk1Bytes := make([]byte, PrKeyLenBLSBLS12381)
-        sk1Bytes[PrKeyLenBLSBLS12381-1] = 1
-        sk1, err := DecodePrivateKey(BLSBLS12381, sk1Bytes)
-        require.NoError(t, err)
-
-        skMinus1Bytes := make([]byte, PrKeyLenBLSBLS12381)
-        copy(skMinus1Bytes, BLS12381Order)
-        skMinus1Bytes[PrKeyLenBLSBLS12381-1] -= 1
-        skMinus1, err := DecodePrivateKey(BLSBLS12381, skMinus1Bytes)
-        require.NoError(t, err)
-
-        for _, sk := range []PrivateKey{sk1, skMinus1} {
-            input := make([]byte, 100)
-            _, err = crand.Read(input)
-            require.NoError(t, err)
-            s, err := sk.Sign(input, hasher)
-            require.NoError(t, err)
-            pk := sk.PublicKey()
-
-            // test a valid signature
-            result, err := pk.Verify(s, input, hasher)
-            assert.NoError(t, err)
-            assert.True(t, result)
-        }
-    })
-}
-
-// Signing bench
-func BenchmarkBLSBLS12381Sign(b *testing.B) {
-    halg := NewExpandMsgXOFKMAC128("bench tag")
-    benchSign(b, BLSBLS12381, halg)
-}
-
-// Verifying bench
-func BenchmarkBLSBLS12381Verify(b *testing.B) {
-    halg := NewExpandMsgXOFKMAC128("bench tag")
-    benchVerify(b, BLSBLS12381, halg)
-}
-
-// utility function to generate a random BLS private key
-func randomSK(t *testing.T, rand *mrand.Rand) PrivateKey {
-    seed := make([]byte, KeyGenSeedMinLen)
-    n, err := rand.Read(seed)
-    require.Equal(t, n, KeyGenSeedMinLen)
-    require.NoError(t, err)
-    sk, err := GeneratePrivateKey(BLSBLS12381, seed)
-    require.NoError(t, err)
-    return sk
-}
-
-// utility function to generate a non-BLS private key
-func invalidSK(t *testing.T) PrivateKey {
-    seed := make([]byte, KeyGenSeedMinLen)
-    n, err := crand.Read(seed)
-    require.Equal(t, n, KeyGenSeedMinLen)
-    require.NoError(t, err)
-    sk, err := GeneratePrivateKey(ECDSAP256, seed)
-    require.NoError(t, err)
-    return sk
-}
-
-// Utility function that flips a point sign bit to negate the point.
-// This is a shortcut that works only for the zcash BLS12-381 compressed serialization.
-// Applicable to both signatures and
public keys -func negatePoint(pointbytes []byte) { - pointbytes[0] ^= 0x20 -} - -// BLS tests -func TestBLSBLS12381Hasher(t *testing.T) { - rand := getPRG(t) - // generate a key pair - sk := randomSK(t, rand) - sig := make([]byte, SignatureLenBLSBLS12381) - msg := []byte("message") - - // empty hasher - t.Run("Empty hasher", func(t *testing.T) { - _, err := sk.Sign(msg, nil) - assert.Error(t, err) - assert.True(t, IsNilHasherError(err)) - _, err = sk.PublicKey().Verify(sig, msg, nil) - assert.Error(t, err) - assert.True(t, IsNilHasherError(err)) - }) - - // short size hasher - t.Run("short size hasher", func(t *testing.T) { - s, err := sk.Sign(msg, hash.NewSHA2_256()) - assert.Error(t, err) - assert.True(t, IsInvalidHasherSizeError(err)) - assert.Nil(t, s) - - valid, err := sk.PublicKey().Verify(sig, msg, hash.NewSHA2_256()) - assert.Error(t, err) - assert.True(t, IsInvalidHasherSizeError(err)) - assert.False(t, valid) - }) - - t.Run("NewExpandMsgXOFKMAC128 sanity check", func(t *testing.T) { - // test the parameter lengths of NewExpandMsgXOFKMAC128 are in the correct range - // h would be nil if the kmac inputs are invalid - h := internalExpandMsgXOFKMAC128(blsSigCipherSuite) - assert.NotNil(t, h) - }) - - t.Run("constants sanity check", func(t *testing.T) { - // test that the ciphersuites exceed 16 bytes as per draft-irtf-cfrg-hash-to-curve - // The tags used by internalExpandMsgXOFKMAC128 are at least len(ciphersuite) long - assert.GreaterOrEqual(t, len(blsSigCipherSuite), 16) - assert.GreaterOrEqual(t, len(blsPOPCipherSuite), 16) - }) - - t.Run("orthogonal PoP and signature hashing", func(t *testing.T) { - data := []byte("random_data") - // empty tag hasher - sigKmac := NewExpandMsgXOFKMAC128("") - h1 := sigKmac.ComputeHash(data) - - // PoP hasher - h2 := popKMAC.ComputeHash(data) - assert.NotEqual(t, h1, h2) - }) - -} - -// TestBLSEncodeDecode tests encoding and decoding of BLS keys -func TestBLSEncodeDecode(t *testing.T) { - // generic tests - testEncodeDecode(t, BLSBLS12381) - - // specific tests for BLS - - // zero private key - t.Run("zero private key", func(t *testing.T) { - skBytes := make([]byte, PrKeyLenBLSBLS12381) - sk, err := DecodePrivateKey(BLSBLS12381, skBytes) - require.Error(t, err, "decoding identity private key should fail") - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, sk) - }) - - // identity public key - t.Run("infinity public key", func(t *testing.T) { - // decode an identity public key - pkBytes := make([]byte, PubKeyLenBLSBLS12381) - pkBytes[0] = g2SerHeader - pk, err := DecodePublicKey(BLSBLS12381, pkBytes) - require.NoError(t, err, "decoding identity public key should succeed") - assert.True(t, pk.Equals(IdentityBLSPublicKey())) - // encode an identity public key - assert.Equal(t, pk.Encode(), pkBytes) - }) - - // invalid point - t.Run("invalid public key", func(t *testing.T) { - pkBytes := make([]byte, PubKeyLenBLSBLS12381) - pkBytes[0] = invalidBLSSignatureHeader - pk, err := DecodePublicKey(BLSBLS12381, pkBytes) - require.Error(t, err, "the key decoding should fail - key value is invalid") - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, pk) - }) - - // Test a public key serialization with a point encoded with a coordinate x with - // x[0] or x[1] not reduced mod p. - // The same public key point with x[0] and x[1] reduced passes decoding. 
-    // This test checks that:
-    //  - public key decoding handles input x-coordinates with x[0] and x[1] larger than p (doesn't result in an exception)
-    //  - public key decoding only accepts reduced x[0] and x[1] to ensure key serialization uniqueness.
-    // Although uniqueness of the public key representation isn't a security property, some implementations
-    // may implicitly rely on the property.
-
-    t.Run("public key with non-reduced coordinates", func(t *testing.T) {
-        if !isG2Compressed() {
-            t.Skip()
-        }
-        // valid pk with x[0] < p and x[1] < p
-        validPk, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b8038360809700d36d761cb266af6babe9a069dc7364d3502e84536bd893d5f09ec2dd4f07cae1f8a178ffacc450f9b9a2")
-        require.NoError(t, err)
-        _, err = DecodePublicKey(BLSBLS12381, validPk)
-        assert.NoError(t, err)
-        // invalidPk1 with x[0]+p and the same x[1]
-        invalidPk1, err := hex.DecodeString("9B8E840277BE772540D913E47A94F94C00003BBE60C4CEEB0C0ABCC9E876034089000EC7AF5AB6D81AF62EC9363D5E63038360809700d36d761cb266af6babe9a069dc7364d3502e84536bd893d5f09ec2dd4f07cae1f8a178ffacc450f9b9a2")
-        require.NoError(t, err)
-        _, err = DecodePublicKey(BLSBLS12381, invalidPk1)
-        assert.Error(t, err)
-        // invalidPk2 with the same x[0] and x[1]+p
-        invalidPk2, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b81D84726AD080BA07C1385A1CF2B758C104E127F8585862EDEB843E798A86E6C2E1894F067C35F8A132FEACC450F9644D")
-        require.NoError(t, err)
-        _, err = DecodePublicKey(BLSBLS12381, invalidPk2)
-        assert.Error(t, err)
-    })
-}
-
-// TestBLSEquals tests equal for BLS keys
-func TestBLSEquals(t *testing.T) {
-    testEquals(t, BLSBLS12381, ECDSAP256)
-}
-
-// TestBLSUtils tests some utility functions
-func TestBLSUtils(t *testing.T) {
-    rand := getPRG(t)
-    // generate a key pair
-    sk := randomSK(t, rand)
-    // test Algorithm()
-    testKeysAlgorithm(t, sk, BLSBLS12381)
-    // test Size()
-    testKeySize(t, sk, PrKeyLenBLSBLS12381, PubKeyLenBLSBLS12381)
-}
-
-// BLS Proof of Possession test
-func TestBLSPOP(t *testing.T) {
-    rand := getPRG(t)
-    seed := make([]byte, KeyGenSeedMinLen)
-    input := make([]byte, 100)
-
-    t.Run("PoP tests", func(t *testing.T) {
-        loops := 10
-        for j := 0; j < loops; j++ {
-            n, err := rand.Read(seed)
-            require.Equal(t, n, KeyGenSeedMinLen)
-            require.NoError(t, err)
-            sk, err := GeneratePrivateKey(BLSBLS12381, seed)
-            require.NoError(t, err)
-            _, err = rand.Read(input)
-            require.NoError(t, err)
-            s, err := BLSGeneratePOP(sk)
-            require.NoError(t, err)
-            pk := sk.PublicKey()
-
-            // test a valid PoP
-            result, err := BLSVerifyPOP(pk, s)
-            require.NoError(t, err)
-            assert.True(t, result)
-
-            // test with a valid but different key
-            seed[0] ^= 1
-            wrongSk, err := GeneratePrivateKey(BLSBLS12381, seed)
-            require.NoError(t, err)
-            result, err = BLSVerifyPOP(wrongSk.PublicKey(), s)
-            require.NoError(t, err)
-            assert.False(t, result)
-        }
-    })
-
-    t.Run("invalid inputs", func(t *testing.T) {
-        // ecdsa key
-        sk := invalidSK(t)
-        s, err := BLSGeneratePOP(sk)
-        assert.True(t, IsNotBLSKeyError(err))
-        assert.Nil(t, s)
-
-        s = make([]byte, SignatureLenBLSBLS12381)
-        result, err := BLSVerifyPOP(sk.PublicKey(), s)
-        assert.True(t, IsNotBLSKeyError(err))
-        assert.False(t, result)
-    })
-}
-
-// BLS multi-signature
-// signature aggregation with the same message sanity check
-//
-// Aggregate n signatures of the same message under different keys, and compare
-// the result against the signature of the message under an aggregated private key.
-// Verify the aggregated signature using the multi-signature verification with
-// one message.
-func TestBLSAggregateSignatures(t *testing.T) {
-    rand := getPRG(t)
-    // random message
-    input := make([]byte, 100)
-    _, err := rand.Read(input)
-    require.NoError(t, err)
-    // hasher
-    kmac := NewExpandMsgXOFKMAC128("test tag")
-    // number of signatures to aggregate
-    sigsNum := rand.Intn(100) + 1
-    sigs := make([]Signature, 0, sigsNum)
-    sks := make([]PrivateKey, 0, sigsNum)
-    pks := make([]PublicKey, 0, sigsNum)
-    var aggSig, expectedSig Signature
-
-    // create the signatures
-    for i := 0; i < sigsNum; i++ {
-        sk := randomSK(t, rand)
-        s, err := sk.Sign(input, kmac)
-        require.NoError(t, err)
-        sigs = append(sigs, s)
-        sks = append(sks, sk)
-        pks = append(pks, sk.PublicKey())
-    }
-
-    // all signatures are valid
-    t.Run("all valid signatures", func(t *testing.T) {
-        // aggregate private keys
-        aggSk, err := AggregateBLSPrivateKeys(sks)
-        require.NoError(t, err)
-        // assign the outer expectedSig and aggSig (no `:=`) so that the
-        // following subtests compare against the values computed here
-        expectedSig, err = aggSk.Sign(input, kmac)
-        require.NoError(t, err)
-        // aggregate signatures
-        aggSig, err = AggregateBLSSignatures(sigs)
-        require.NoError(t, err)
-        // First check: the signatures are equal
-        assert.Equal(t, aggSig, expectedSig)
-        // Second check: verify the aggregated signature
-        valid, err := VerifyBLSSignatureOneMessage(pks, aggSig, input, kmac)
-        require.NoError(t, err)
-        assert.True(t, valid)
-    })
-
-    // check if one signature is not correct
-    t.Run("one invalid signature", func(t *testing.T) {
-        input[0] ^= 1
-        randomIndex := rand.Intn(sigsNum)
-        sigs[randomIndex], err = sks[randomIndex].Sign(input, kmac) // sign a different message
-        require.NoError(t, err)
-        input[0] ^= 1
-        aggSig, err = AggregateBLSSignatures(sigs)
-        require.NoError(t, err)
-        // First check: the signatures are not equal
-        assert.NotEqual(t, aggSig, expectedSig)
-        // Second check: multi-verification should fail
-        valid, err := VerifyBLSSignatureOneMessage(pks, aggSig, input, kmac)
-        require.NoError(t, err)
-        assert.False(t, valid)
-        sigs[randomIndex], err = sks[randomIndex].Sign(input, kmac) // rebuild the correct signature
-        require.NoError(t, err)
-    })
-
-    // check if one of the public keys is not correct
-    t.Run("one invalid public key", func(t *testing.T) {
-        randomIndex := rand.Intn(sigsNum)
-        newSk := randomSK(t, rand)
-        sks[randomIndex] = newSk
-        pks[randomIndex] = newSk.PublicKey()
-        aggSk, err := AggregateBLSPrivateKeys(sks)
-        require.NoError(t, err)
-        expectedSig, err = aggSk.Sign(input, kmac)
-        require.NoError(t, err)
-        assert.NotEqual(t, aggSig, expectedSig)
-        valid, err := VerifyBLSSignatureOneMessage(pks, aggSig, input, kmac)
-        require.NoError(t, err)
-        assert.False(t, valid)
-    })
-
-    t.Run("invalid inputs", func(t *testing.T) {
-        // test aggregating an empty signature list
-        aggSig, err = AggregateBLSSignatures(sigs[:0])
-        assert.Error(t, err)
-        assert.True(t, IsBLSAggregateEmptyListError(err))
-        assert.Nil(t, aggSig)
-
-        // test verification with an empty key list
-        result, err := VerifyBLSSignatureOneMessage(pks[:0], aggSig, input, kmac)
-        assert.Error(t, err)
-        assert.True(t, IsBLSAggregateEmptyListError(err))
-        assert.False(t, result)
-
-        // test with a signature of a wrong length
-        shortSig := sigs[0][:SignatureLenBLSBLS12381-1]
-        aggSig, err = AggregateBLSSignatures([]Signature{shortSig})
-        assert.Error(t, err)
-        assert.True(t, IsInvalidSignatureError(err))
-        assert.Nil(t, aggSig)
-
-        // test with an invalid signature of a correct length
-        invalidSig := BLSInvalidSignature()
-        aggSig, err = AggregateBLSSignatures([]Signature{invalidSig})
-        assert.Error(t, err)
-        assert.True(t, IsInvalidSignatureError(err))
-        assert.Nil(t, aggSig)
-
-        // test the empty key list
-        aggSk, err := AggregateBLSPrivateKeys(sks[:0])
-        assert.Error(t, err)
-        assert.True(t, IsBLSAggregateEmptyListError(err))
-        assert.Nil(t, aggSk)
-
-        // test with an invalid key type
-        sk := invalidSK(t)
-        aggSk, err = AggregateBLSPrivateKeys([]PrivateKey{sk})
-        assert.Error(t, err)
-        assert.True(t, IsNotBLSKeyError(err))
-        assert.Nil(t, aggSk)
-    })
-}
-
-// BLS multi-signature
-// public keys aggregation sanity check
-//
-// Aggregate n public keys and their respective private keys, and check that the
-// public key of the aggregated private key equals the aggregated public key.
-func TestBLSAggregatePublicKeys(t *testing.T) {
-    rand := getPRG(t)
-    // number of keys to aggregate
-    pkNum := rand.Intn(100) + 1
-    pks := make([]PublicKey, 0, pkNum)
-    sks := make([]PrivateKey, 0, pkNum)
-
-    // create the keys
-    for i := 0; i < pkNum; i++ {
-        sk := randomSK(t, rand)
-        sks = append(sks, sk)
-        pks = append(pks, sk.PublicKey())
-    }
-
-    // consistent private and public key aggregation
-    t.Run("correctness check", func(t *testing.T) {
-        // aggregate private keys
-        aggSk, err := AggregateBLSPrivateKeys(sks)
-        require.NoError(t, err)
-        expectedPk := aggSk.PublicKey()
-        // aggregate public keys
-        aggPk, err := AggregateBLSPublicKeys(pks)
-        assert.NoError(t, err)
-        assert.True(t, expectedPk.Equals(aggPk),
-            "incorrect public key %s, should be %s, public keys are %s",
-            aggPk, expectedPk, pks)
-    })
-
-    // aggregate an empty list
-    t.Run("empty list", func(t *testing.T) {
-        // private keys
-        aggSk, err := AggregateBLSPrivateKeys(sks[:0])
-        assert.Error(t, err)
-        assert.True(t, IsBLSAggregateEmptyListError(err))
-        assert.Nil(t, aggSk)
-        // public keys
-        aggPk, err := AggregateBLSPublicKeys(pks[:0])
-        assert.Error(t, err)
-        assert.True(t, IsBLSAggregateEmptyListError(err))
-        assert.Nil(t, aggPk)
-    })
-
-    // aggregate a list that includes the identity key,
-    // to check that the identity key is indeed the identity element with regard to aggregation.
-    t.Run("aggregate a list that includes the identity key", func(t *testing.T) {
-        // aggregate the identity key with a non-identity key
-        keys := []PublicKey{pks[0], IdentityBLSPublicKey()}
-        aggPkWithIdentity, err := AggregateBLSPublicKeys(keys)
-        assert.NoError(t, err)
-        assert.True(t, aggPkWithIdentity.Equals(pks[0]))
-    })
-
-    t.Run("invalid inputs", func(t *testing.T) {
-        // empty list
-        aggPK, err := AggregateBLSPublicKeys(pks[:0])
-        assert.Error(t, err)
-        assert.True(t, IsBLSAggregateEmptyListError(err))
-        assert.Nil(t, aggPK)
-
-        // test with an invalid key type
-        pk := invalidSK(t).PublicKey()
-        aggPK, err = AggregateBLSPublicKeys([]PublicKey{pk})
-        assert.Error(t, err)
-        assert.True(t, IsNotBLSKeyError(err))
-        assert.Nil(t, aggPK)
-    })
-
-    // check that the public key corresponding to the zero private key is indeed the identity.
-    // The package doesn't allow generating a zero private key directly. One way to obtain a zero
-    // private key is via aggregating opposite private keys.
-    t.Run("Identity public key from identity private key", func(t *testing.T) {
-        // sk1 is the group order of BLS12-381 minus one
-        groupOrderMinus1 := []byte{0x73, 0xED, 0xA7, 0x53, 0x29, 0x9D, 0x7D, 0x48, 0x33, 0x39,
-            0xD8, 0x08, 0x09, 0xA1, 0xD8, 0x05, 0x53, 0xBD, 0xA4, 0x02, 0xFF, 0xFE,
-            0x5B, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00}
-        sk1, err := DecodePrivateKey(BLSBLS12381, groupOrderMinus1)
-        require.NoError(t, err)
-        // sk2 is 1
-        one := make([]byte, PrKeyLenBLSBLS12381)
-        one[PrKeyLenBLSBLS12381-1] = 1
-        sk2, err := DecodePrivateKey(BLSBLS12381, one)
-        require.NoError(t, err)
-        // public key of the aggregated private keys
-        aggSK, err := AggregateBLSPrivateKeys([]PrivateKey{sk1, sk2})
-        require.NoError(t, err)
-        assert.True(t, aggSK.PublicKey().Equals(IdentityBLSPublicKey()))
-        // aggregated public keys
-        aggPK, err := AggregateBLSPublicKeys([]PublicKey{sk1.PublicKey(), sk2.PublicKey()})
-        require.NoError(t, err)
-        assert.True(t, aggPK.Equals(IdentityBLSPublicKey()))
-        // check the internal identity flag
-        blsKey, ok := aggPK.(*pubKeyBLSBLS12381)
-        require.True(t, ok)
-        assert.True(t, blsKey.isIdentity)
-        // check the encoding header
-        pkBytes := aggPK.Encode()
-        assert.Equal(t, g2SerHeader, pkBytes[0])
-    })
-
-    t.Run("Identity public key from opposite points", func(t *testing.T) {
-        if !isG2Compressed() {
-            t.Skip()
-        }
-        pkBytes := pks[0].Encode()
-        negateCompressedPoint(pkBytes)
-        minusPk, err := DecodePublicKey(BLSBLS12381, pkBytes)
-        require.NoError(t, err)
-        // aggregated public keys
-        aggPK, err := AggregateBLSPublicKeys([]PublicKey{pks[0], minusPk})
-        require.NoError(t, err)
-        assert.True(t, aggPK.Equals(IdentityBLSPublicKey()))
-        // check the internal identity flag
-        blsKey, ok := aggPK.(*pubKeyBLSBLS12381)
-        require.True(t, ok)
-        assert.True(t, blsKey.isIdentity)
-        // check the encoding header
-        pkBytes = aggPK.Encode()
-        assert.Equal(t, g2SerHeader, pkBytes[0])
-    })
-}
-
-// BLS multi-signature
-// public keys removal sanity check
-func TestBLSRemovePubKeys(t *testing.T) {
-    rand := getPRG(t)
-    // number of keys to aggregate
-    pkNum := rand.Intn(100) + 1
-    pks := make([]PublicKey, 0, pkNum)
-
-    // generate public keys
-    for i := 0; i < pkNum; i++ {
-        sk := randomSK(t, rand)
-        pks = append(pks, sk.PublicKey())
-    }
-    // aggregate public keys
-    aggPk, err := AggregateBLSPublicKeys(pks)
-    require.NoError(t, err)
-
-    // random number of keys to remove (at least one key is left)
-    pkToRemoveNum := rand.Intn(pkNum)
-    expectedPartialPk, err := AggregateBLSPublicKeys(pks[pkToRemoveNum:])
-    require.NoError(t, err)
-
-    // check correctness
-    t.Run("equality check", func(t *testing.T) {
-        partialPk, err := RemoveBLSPublicKeys(aggPk, pks[:pkToRemoveNum])
-        require.NoError(t, err)
-
-        BLSkey, ok := expectedPartialPk.(*pubKeyBLSBLS12381)
-        require.True(t, ok)
-
-        assert.True(t, BLSkey.Equals(partialPk))
-    })
-
-    // remove an extra key and check inequality
-    t.Run("inequality check", func(t *testing.T) {
-        extraPk := randomSK(t, rand).PublicKey()
-        partialPk, err := RemoveBLSPublicKeys(aggPk, []PublicKey{extraPk})
-        assert.NoError(t, err)
-
-        BLSkey, ok := expectedPartialPk.(*pubKeyBLSBLS12381)
-        require.True(t, ok)
-        assert.False(t, BLSkey.Equals(partialPk))
-    })
-
-    // specific test to remove all keys
-    t.Run("remove all keys", func(t *testing.T) {
-        identityPk, err := RemoveBLSPublicKeys(aggPk, pks)
-        require.NoError(t, err)
-        // the identity public key is expected
-        randomPk := randomSK(t, rand).PublicKey()
-        randomPkPlusIdentityPk, err := AggregateBLSPublicKeys([]PublicKey{randomPk, identityPk})
-        require.NoError(t, err)
-
-        BLSRandomPk, ok := randomPk.(*pubKeyBLSBLS12381)
-        require.True(t, ok)
-
-        assert.True(t, BLSRandomPk.Equals(randomPkPlusIdentityPk))
-    })
-
-    // specific test with an empty slice of keys to remove
-    t.Run("remove empty list", func(t *testing.T) {
-        partialPk, err := RemoveBLSPublicKeys(aggPk, []PublicKey{})
-        require.NoError(t, err)
-
-        aggBLSkey, ok := aggPk.(*pubKeyBLSBLS12381)
-        require.True(t, ok)
-
-        assert.True(t, aggBLSkey.Equals(partialPk))
-    })
-
-    t.Run("invalid inputs", func(t *testing.T) {
-        pk := invalidSK(t).PublicKey()
-        partialPk, err := RemoveBLSPublicKeys(pk, pks)
-        assert.Error(t, err)
-        assert.True(t, IsNotBLSKeyError(err))
-        assert.Nil(t, partialPk)
-
-        partialPk, err = RemoveBLSPublicKeys(aggPk, []PublicKey{pk})
-        assert.Error(t, err)
-        assert.True(t, IsNotBLSKeyError(err))
-        assert.Nil(t, partialPk)
-    })
-}
-
-// BLS multi-signature
-// batch verification
-//
-// Verify n signatures of the same message under different keys using the fast
-// batch verification technique and compare the result to verifying each signature
-// separately.
-func TestBLSBatchVerify(t *testing.T) {
-    rand := getPRG(t)
-    // random message
-    input := make([]byte, 100)
-    _, err := rand.Read(input)
-    require.NoError(t, err)
-    // hasher
-    kmac := NewExpandMsgXOFKMAC128("test tag")
-    // number of signatures to aggregate
-    sigsNum := rand.Intn(100) + 2
-    sigs := make([]Signature, 0, sigsNum)
-    pks := make([]PublicKey, 0, sigsNum)
-    expectedValid := make([]bool, 0, sigsNum)
-
-    // create the signatures
-    for i := 0; i < sigsNum; i++ {
-        sk := randomSK(t, rand)
-        s, err := sk.Sign(input, kmac)
-        require.NoError(t, err)
-        sigs = append(sigs, s)
-        pks = append(pks, sk.PublicKey())
-        expectedValid = append(expectedValid, true)
-    }
-
-    // all signatures are valid
-    t.Run("all signatures are valid", func(t *testing.T) {
-        valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac)
-        require.NoError(t, err)
-        assert.Equal(t, valid, expectedValid)
-    })
-
-    // valid signatures but the indices aren't correct: sig[i] is correct under pks[j]
-    // and sig[j] is correct under pks[i].
-    // Implementations simply aggregating all signatures and keys would fail this test.
-    t.Run("valid signatures with incorrect indices", func(t *testing.T) {
-        i := rand.Intn(sigsNum-1) + 1
-        j := rand.Intn(i)
-        // swap the correct keys
-        pks[i], pks[j] = pks[j], pks[i]
-
-        valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac)
-        require.NoError(t, err)
-        expectedValid[i], expectedValid[j] = false, false
-        assert.Equal(t, valid, expectedValid)
-
-        // restore the keys
-        pks[i], pks[j] = pks[j], pks[i]
-        expectedValid[i], expectedValid[j] = true, true
-    })
-
-    // valid signatures but the indices aren't correct: sig[i] is correct under pks[j]
-    // and sig[j] is correct under pks[i].
-    // Implementations simply aggregating all signatures and keys would fail this test.
- t.Run("valid signatures with incorrect indices", func(t *testing.T) { - i := mrand.Intn(sigsNum-1) + 1 - j := mrand.Intn(i) - // swap correct keys - pks[i], pks[j] = pks[j], pks[i] - - valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) - require.NoError(t, err) - expectedValid[i], expectedValid[j] = false, false - assert.Equal(t, valid, expectedValid) - - // restore keys - pks[i], pks[j] = pks[j], pks[i] - expectedValid[i], expectedValid[j] = true, true - }) - - // one valid signature - t.Run("one valid signature", func(t *testing.T) { - valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:1], sigs[:1], input, kmac) - require.NoError(t, err) - assert.Equal(t, expectedValid[:1], valid) - }) - - // pick a random number of invalid signatures - invalidSigsNum := rand.Intn(sigsNum-1) + 1 - // generate a random permutation of indices to pick the - // invalid signatures. - indices := make([]int, 0, sigsNum) - for i := 0; i < sigsNum; i++ { - indices = append(indices, i) - } - rand.Shuffle(sigsNum, func(i, j int) { - indices[i], indices[j] = indices[j], indices[i] - }) - - // some signatures are invalid - t.Run("some signatures are invalid", func(t *testing.T) { - for i := 0; i < invalidSigsNum; i++ { // alter invalidSigsNum random signatures - alterSignature(sigs[indices[i]]) - expectedValid[indices[i]] = false - } - - valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) - require.NoError(t, err) - assert.Equal(t, expectedValid, valid) - }) - - // all signatures are invalid - t.Run("all signatures are invalid", func(t *testing.T) { - for i := invalidSigsNum; i < sigsNum; i++ { // alter the remaining random signatures - alterSignature(sigs[indices[i]]) - expectedValid[indices[i]] = false - if i%5 == 0 { - sigs[indices[i]] = sigs[indices[i]][:3] // test the short signatures - } - } - - valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) - require.NoError(t, err) - assert.Equal(t, valid, expectedValid) - }) - - // test the empty list case - t.Run("empty list", func(t *testing.T) { - valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:0], sigs[:0], input, kmac) - require.Error(t, err) - assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.Equal(t, valid, expectedValid[:0]) - }) - - // test incorrect inputs - t.Run("inconsistent inputs", func(t *testing.T) { - for i := 0; i < sigsNum; i++ { - expectedValid[i] = false - } - valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:len(pks)-1], sigs, input, kmac) - require.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.Equal(t, valid, expectedValid) - }) - - // test wrong hasher - t.Run("invalid hasher", func(t *testing.T) { - for i := 0; i < sigsNum; i++ { - expectedValid[i] = false - } - valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, nil) - require.Error(t, err) - assert.True(t, IsNilHasherError(err)) - - assert.Equal(t, valid, expectedValid) - }) - - // test wrong key - t.Run("wrong key", func(t *testing.T) { - for i := 0; i < sigsNum; i++ { - expectedValid[i] = false - } - pks[0] = invalidSK(t).PublicKey() - valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) - require.Error(t, err) - assert.True(t, IsNotBLSKeyError(err)) - - assert.Equal(t, valid, expectedValid) - }) -} - -// Utility function that flips a point sign bit to negate the point -// this is shortcut which works only for zcash BLS12-381 compressed serialization. -// Applicable to both signatures and public keys. 
-func negateCompressedPoint(pointbytes []byte) {
-    pointbytes[0] ^= 0x20
-}
-
-// alterSignature alters a signature; applying it twice restores the original signature
-func alterSignature(s Signature) {
-    // this causes the signature to remain in G1 and be invalid,
-    // OR to be a non-point in G1 (either on the curve or not),
-    // which tests multiple error cases.
-    s[10] ^= 1
-}
-
-// Batch verify bench in the happy path (all signatures are valid)
-// and the unhappy path (only one signature is invalid)
-func BenchmarkBatchVerify(b *testing.B) {
-    // random message
-    input := make([]byte, 100)
-    _, err := crand.Read(input)
-    require.NoError(b, err)
-    // hasher
-    kmac := NewExpandMsgXOFKMAC128("bench tag")
-    sigsNum := 100
-    sigs := make([]Signature, 0, sigsNum)
-    pks := make([]PublicKey, 0, sigsNum)
-    seed := make([]byte, KeyGenSeedMinLen)
-
-    // create the signatures
-    for i := 0; i < sigsNum; i++ {
-        _, err := crand.Read(seed)
-        require.NoError(b, err)
-        sk, err := GeneratePrivateKey(BLSBLS12381, seed)
-        require.NoError(b, err)
-        s, err := sk.Sign(input, kmac)
-        require.NoError(b, err)
-        sigs = append(sigs, s)
-        pks = append(pks, sk.PublicKey())
-    }
-
-    // Batch verify bench when all signatures are valid:
-    // 2 pairings for the batch verification compared to (2*n) pairings for verifying one by one.
-    b.Run("happy path", func(b *testing.B) {
-        b.ResetTimer()
-        for i := 0; i < b.N; i++ {
-            // all signatures are valid
-            _, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac)
-            require.NoError(b, err)
-        }
-        b.StopTimer()
-    })
-
-    // Batch verify bench when some signatures are invalid:
-    //  - if only one signature is invalid (a valid point in G1):
-    //    less than (2*2*log(n)) pairings compared to (2*n) pairings for the simple verification.
-    //  - if all signatures are invalid (valid points in G1):
-    //    (2*2*(n-1)) pairings compared to (2*n) pairings for the simple verification.
-    b.Run("unhappy path", func(b *testing.B) {
-        // only one invalid signature
-        alterSignature(sigs[sigsNum/2])
-        b.ResetTimer()
-        for i := 0; i < b.N; i++ {
-            // one signature is invalid
-            _, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac)
-            require.NoError(b, err)
-        }
-        b.StopTimer()
-    })
-}
-
-// BLS multi-signature
-// signature aggregation sanity check
-//
-// Aggregate n signatures of distinct messages under different keys,
-// and verify the aggregated signature using the multi-signature verification with
-// many messages.
-func TestBLSAggregateSignaturesManyMessages(t *testing.T) { - rand := getPRG(t) - // number of signatures to aggregate - sigsNum := rand.Intn(40) + 1 - sigs := make([]Signature, 0, sigsNum) - - // number of keys (less than the number of signatures) - keysNum := rand.Intn(sigsNum) + 1 - sks := make([]PrivateKey, 0, keysNum) - // generate the keys - for i := 0; i < keysNum; i++ { - sk := randomSK(t, rand) - sks = append(sks, sk) - } - - // number of messages (could be larger or smaller than the number of keys) - msgsNum := rand.Intn(sigsNum) + 1 - messages := make([][20]byte, msgsNum) - for i := 0; i < msgsNum; i++ { - _, err := rand.Read(messages[i][:]) - require.NoError(t, err) - } - - inputMsgs := make([][]byte, 0, sigsNum) - inputPks := make([]PublicKey, 0, sigsNum) - inputKmacs := make([]hash.Hasher, 0, sigsNum) - - // create the signatures - for i := 0; i < sigsNum; i++ { - kmac := NewExpandMsgXOFKMAC128("test tag") - // pick a key randomly from the list - skRand := rand.Intn(keysNum) - sk := sks[skRand] - // pick a message randomly from the list - msgRand := rand.Intn(msgsNum) - msg := messages[msgRand][:] - // generate a signature - s, err := sk.Sign(msg, kmac) - require.NoError(t, err) - // update signatures and api inputs - sigs = append(sigs, s) - inputPks = append(inputPks, sk.PublicKey()) - inputMsgs = append(inputMsgs, msg) - inputKmacs = append(inputKmacs, kmac) - } - var aggSig Signature - - t.Run("correctness check", func(t *testing.T) { - // aggregate signatures - var err error - aggSig, err = AggregateBLSSignatures(sigs) - require.NoError(t, err) - // Verify the aggregated signature - valid, err := VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs, inputKmacs) - require.NoError(t, err) - assert.True(t, valid) - }) - - // check if one of the signatures is not correct - t.Run("one signature is invalid", func(t *testing.T) { - randomIndex := rand.Intn(sigsNum) // pick a random signature - messages[0][0] ^= 1 // make sure the signature is different - var err error - sigs[randomIndex], err = sks[0].Sign(messages[0][:], inputKmacs[0]) - require.NoError(t, err) - messages[0][0] ^= 1 - aggSig, err = AggregateBLSSignatures(sigs) - require.NoError(t, err) - valid, err := VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs, inputKmacs) - require.NoError(t, err) - assert.False(t, valid) - }) - - // test the empty keys case - t.Run("empty list", func(t *testing.T) { - valid, err := VerifyBLSSignatureManyMessages(inputPks[:0], aggSig, inputMsgs, inputKmacs) - assert.Error(t, err) - assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.False(t, valid) - }) - - // test inconsistent input arrays - t.Run("inconsistent inputs", func(t *testing.T) { - // inconsistent lengths - valid, err := VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs[:sigsNum-1], inputKmacs) - assert.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.False(t, valid) - - // empty key list - valid, err = VerifyBLSSignatureManyMessages(inputPks[:0], aggSig, inputMsgs, inputKmacs) - assert.Error(t, err) - assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.False(t, valid) - - // nil hasher - tmp := inputKmacs[0] - inputKmacs[0] = nil - valid, err = VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs, inputKmacs) - assert.Error(t, err) - assert.True(t, IsNilHasherError(err)) - assert.False(t, valid) - inputKmacs[0] = tmp - - // wrong key - tmpPK := inputPks[0] - inputPks[0] = invalidSK(t).PublicKey() - valid, err = VerifyBLSSignatureManyMessages(inputPks, aggSig, 
inputMsgs, inputKmacs) - assert.Error(t, err) - assert.True(t, IsNotBLSKeyError(err)) - assert.False(t, valid) - inputPks[0] = tmpPK - }) - - t.Run("variable number of distinct keys and messages", func(t *testing.T) { - // use a specific PRG for easier reproduction - prg := getPRG(t) - // number of signatures to aggregate - N := 100 - sigs := make([]Signature, 0, N) - msgs := make([][]byte, 0, N) - pks := make([]PublicKey, 0, N) - kmacs := make([]hash.Hasher, 0, N) - kmac := NewExpandMsgXOFKMAC128("test tag") - for i := 0; i < N; i++ { - // distinct message - msg := make([]byte, 20) - msgs = append(msgs, msg) - _, err := prg.Read(msg) - require.NoError(t, err) - // distinct key - sk := randomSK(t, prg) - pks = append(pks, sk.PublicKey()) - // generate a signature - s, err := sk.Sign(msg, kmac) - require.NoError(t, err) - sigs = append(sigs, s) - kmacs = append(kmacs, kmac) - } - - // go through all numbers of couples (msg, key) - for i := 1; i < N; i++ { - // aggregate signatures - var err error - aggSig, err = AggregateBLSSignatures(sigs[:i]) - require.NoError(t, err) - // Verify the aggregated signature - valid, err := VerifyBLSSignatureManyMessages(pks[:i], aggSig, msgs[:i], kmacs[:i]) - require.NoError(t, err, "verification errored with %d couples (msg,key)", i) - assert.True(t, valid, "verification failed with %d couples (msg,key)", i) - } - }) -} - -// TestBLSErrorTypes verifies working of error-type-detecting functions -// such as `IsInvalidInputsError`. -func TestBLSErrorTypes(t *testing.T) { - t.Run("aggregateEmptyListError sanity", func(t *testing.T) { - err := blsAggregateEmptyListError - invInpError := invalidInputsErrorf("") - otherError := fmt.Errorf("some error") - assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.False(t, IsInvalidInputsError(err)) - assert.False(t, IsBLSAggregateEmptyListError(invInpError)) - assert.False(t, IsBLSAggregateEmptyListError(otherError)) - assert.False(t, IsBLSAggregateEmptyListError(nil)) - }) - - t.Run("notBLSKeyError sanity", func(t *testing.T) { - err := notBLSKeyError - invInpError := invalidInputsErrorf("") - otherError := fmt.Errorf("some error") - assert.True(t, IsNotBLSKeyError(err)) - assert.False(t, IsInvalidInputsError(err)) - assert.False(t, IsNotBLSKeyError(invInpError)) - assert.False(t, IsNotBLSKeyError(otherError)) - assert.False(t, IsNotBLSKeyError(nil)) - }) -} - -// VerifyBLSSignatureManyMessages bench -// Bench the slowest case where all messages and public keys are distinct. -// (2*n) pairings without aggregation Vs (n+1) pairings with aggregation. -// The function is faster whenever there are redundant messages or public keys. 
-func BenchmarkVerifySignatureManyMessages(b *testing.B) { - // inputs - sigsNum := 100 - inputKmacs := make([]hash.Hasher, 0, sigsNum) - sigs := make([]Signature, 0, sigsNum) - pks := make([]PublicKey, 0, sigsNum) - inputMsgs := make([][]byte, 0, sigsNum) - kmac := NewExpandMsgXOFKMAC128("bench tag") - seed := make([]byte, KeyGenSeedMinLen) - - // create the signatures - for i := 0; i < sigsNum; i++ { - input := make([]byte, 100) - _, err := crand.Read(input) - require.NoError(b, err) - - _, err = crand.Read(seed) - require.NoError(b, err) - sk, err := GeneratePrivateKey(BLSBLS12381, seed) - require.NoError(b, err) - s, err := sk.Sign(input, kmac) - require.NoError(b, err) - sigs = append(sigs, s) - pks = append(pks, sk.PublicKey()) - inputKmacs = append(inputKmacs, kmac) - inputMsgs = append(inputMsgs, input) - } - aggSig, err := AggregateBLSSignatures(sigs) - require.NoError(b, err) - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := VerifyBLSSignatureManyMessages(pks, aggSig, inputMsgs, inputKmacs) - require.NoError(b, err) - } - b.StopTimer() -} - -// Bench of all aggregation functions -func BenchmarkAggregate(b *testing.B) { - seed := make([]byte, KeyGenSeedMinLen) - // random message - input := make([]byte, 100) - _, _ = crand.Read(input) - // hasher - kmac := NewExpandMsgXOFKMAC128("bench tag") - sigsNum := 1000 - sigs := make([]Signature, 0, sigsNum) - sks := make([]PrivateKey, 0, sigsNum) - pks := make([]PublicKey, 0, sigsNum) - - // create the signatures - for i := 0; i < sigsNum; i++ { - _, err := crand.Read(seed) - require.NoError(b, err) - sk, err := GeneratePrivateKey(BLSBLS12381, seed) - require.NoError(b, err) - s, err := sk.Sign(input, kmac) - if err != nil { - b.Fatal() - } - sigs = append(sigs, s) - sks = append(sks, sk) - pks = append(pks, sk.PublicKey()) - } - - // private keys - b.Run("PrivateKeys", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := AggregateBLSPrivateKeys(sks) - require.NoError(b, err) - } - b.StopTimer() - }) - - // public keys - b.Run("PublicKeys", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := AggregateBLSPublicKeys(pks) - require.NoError(b, err) - } - b.StopTimer() - }) - - // signatures - b.Run("Signatures", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := AggregateBLSSignatures(sigs) - require.NoError(b, err) - } - b.StopTimer() - }) -} - -func TestBLSIdentity(t *testing.T) { - rand := getPRG(t) - - var identitySig []byte - msg := []byte("random_message") - hasher := NewExpandMsgXOFKMAC128("") - - t.Run("identity signature comparison", func(t *testing.T) { - if !isG1Compressed() { - t.Skip() - } - // verify that constructed identity signatures are recognized as such by IsBLSSignatureIdentity. - // construct identity signature by summing (aggregating) a random signature and its inverse. 
-
-		// sanity check to start
-		assert.True(t, IsBLSSignatureIdentity(g1Serialization))
-
-		// sum up a random signature and its inverse to get identity
-		sk := randomSK(t, rand)
-		sig, err := sk.Sign(msg, hasher)
-		require.NoError(t, err)
-		oppositeSig := make([]byte, SignatureLenBLSBLS12381)
-		copy(oppositeSig, sig)
-		negateCompressedPoint(oppositeSig)
-		aggSig, err := AggregateBLSSignatures([]Signature{sig, oppositeSig})
-		require.NoError(t, err)
-		assert.True(t, IsBLSSignatureIdentity(aggSig))
-	})
-
-	t.Run("verification with identity key", func(t *testing.T) {
-		// all verification methods should return (false, nil) when verified against
-		// an identity public key.
-		idPk := IdentityBLSPublicKey()
-		valid, err := idPk.Verify(identitySig, msg, hasher)
-		assert.NoError(t, err)
-		assert.False(t, valid)
-
-		valid, err = VerifyBLSSignatureOneMessage([]PublicKey{idPk}, identitySig, msg, hasher)
-		assert.NoError(t, err)
-		assert.False(t, valid)
-
-		valid, err = VerifyBLSSignatureManyMessages([]PublicKey{idPk}, identitySig, [][]byte{msg}, []hash.Hasher{hasher})
-		assert.NoError(t, err)
-		assert.False(t, valid)
-
-		validSlice, err := BatchVerifyBLSSignaturesOneMessage([]PublicKey{idPk}, []Signature{identitySig}, msg, hasher)
-		assert.NoError(t, err)
-		assert.False(t, validSlice[0])
-
-		valid, err = BLSVerifyPOP(idPk, identitySig)
-		assert.NoError(t, err)
-		assert.False(t, valid)
-	})
-}
diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go
deleted file mode 100644
index 412f06f962a..00000000000
--- a/crypto/bls_thresholdsign.go
+++ /dev/null
@@ -1,598 +0,0 @@
-package crypto
-
-// #include "bls_thresholdsign_include.h"
-import "C"
-
-import (
-	"fmt"
-	"sync"
-
-	"github.com/onflow/flow-go/crypto/hash"
-)
-
-// BLS-based threshold signature on the BLS12-381 curve.
-// The BLS settings are the same as in the signature
-// scheme defined in the package.
-
-// A threshold signature scheme allows any subset of (t+1)
-// valid signature shares to reconstruct the threshold signature.
-// Up to (t) shares do not reveal any information about the threshold
-// signature.
-// Although the API allows using arbitrary values of (t),
-// the threshold signature scheme is secure in the presence of up to (t)
-// malicious participants when (t < n/2).
-// In order to optimize equally for unforgeability and robustness,
-// the input threshold value (t) should be set to t = floor((n-1)/2).
-
-// The package offers two APIs for BLS threshold signatures:
-// - a stateful API where a structure holds all information of the
-//   threshold signature protocol. It is recommended for safety and
-//   to reduce protocol inconsistencies.
-// - a stateless API with signature reconstruction only. Verifying and
-//   storing the signature shares has to be managed outside of the library.
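As an orientation for the two APIs described above, the following hypothetical end-to-end sketch (not part of the deleted sources) uses the stateless path: a trusted dealer generates the keys, (t+1) participants sign, and the caller reconstructs and verifies the group signature. It assumes the package's former import path:

```go
package main

import (
	crand "crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto" // former path of the deleted package (assumption)
)

func main() {
	n := 7
	t := (n - 1) / 2 // recommended threshold, per the package documentation
	msg := []byte("message to threshold-sign")

	seed := make([]byte, crypto.KeyGenSeedMinLen)
	if _, err := crand.Read(seed); err != nil {
		panic(err)
	}
	// trusted-dealer key generation: n private key shares, n public key
	// shares, and the group public key
	skShares, _, pkGroup, err := crypto.BLSThresholdKeyGen(n, t, seed)
	if err != nil {
		panic(err)
	}

	kmac := crypto.NewExpandMsgXOFKMAC128("example tag")

	// any (t+1) distinct participants produce signature shares
	shares := make([]crypto.Signature, 0, t+1)
	signers := make([]int, 0, t+1)
	for i := 0; i < t+1; i++ {
		share, err := skShares[i].Sign(msg, kmac)
		if err != nil {
			panic(err)
		}
		shares = append(shares, share)
		signers = append(signers, i)
	}

	// stateless reconstruction: verifying the shares is the caller's job here
	groupSig, err := crypto.BLSReconstructThresholdSignature(n, t, shares, signers)
	if err != nil {
		panic(err)
	}
	valid, err := pkGroup.Verify(groupSig, msg, kmac)
	fmt.Println(valid, err) // expected: true <nil>
}
```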
- -// blsThresholdSignatureParticipant implements ThresholdSignatureParticipant -// based on the BLS signature scheme -type blsThresholdSignatureParticipant struct { - // embed the follower - *blsThresholdSignatureInspector - // the index of the current participant - myIndex int - // the current participant private key (a threshold KG output) - myPrivateKey PrivateKey -} - -var _ ThresholdSignatureParticipant = (*blsThresholdSignatureParticipant)(nil) - -// blsThresholdSignatureInspector implements ThresholdSignatureInspector -// based on the BLS signature scheme -type blsThresholdSignatureInspector struct { - // size of the group - size int - // the threshold t of the scheme where (t+1) shares are - // required to reconstruct a signature - threshold int - // the group public key (a threshold KG output) - groupPublicKey PublicKey - // the group public key shares (a threshold KG output) - publicKeyShares []PublicKey - // the hasher to be used for all signatures - hasher hash.Hasher - // the message to be signed. Signature shares and the threshold signature - // are verified against this message - message []byte - // the valid signature shares received from other participants - shares map[index]Signature - // the threshold signature. It is equal to nil if less than (t+1) shares are - // received - thresholdSignature Signature - // lock for atomic operations - lock sync.RWMutex -} - -var _ ThresholdSignatureInspector = (*blsThresholdSignatureInspector)(nil) - -// NewBLSThresholdSignatureParticipant creates a new instance of Threshold signature Participant using BLS. -// A participant is able to participate in a threshold signing protocol as well as following the -// protocol. -// -// A new instance is needed for each set of public keys and message. -// If the key set or message change, a new structure needs to be instantiated. -// Participants are defined by their public key share, and are indexed from 0 to n-1. The current -// participant is indexed by `myIndex` and holds the input private key -// where n is the length of the public key shares slice. -// -// The function returns: -// - (nil, invalidInputsError) if: -// - n is not in [`ThresholdSignMinSize`, `ThresholdSignMaxSize`] -// - threshold value is not in interval [1, n-1] -// - input private key and public key at my index do not match -// - (nil, notBLSKeyError) if the private or at least one public key is not of type BLS BLS12-381. 
-// - (pointer, nil) otherwise -func NewBLSThresholdSignatureParticipant( - groupPublicKey PublicKey, - sharePublicKeys []PublicKey, - threshold int, - myIndex int, - myPrivateKey PrivateKey, - message []byte, - dsTag string, -) (*blsThresholdSignatureParticipant, error) { - - size := len(sharePublicKeys) - if myIndex >= size || myIndex < 0 { - return nil, invalidInputsErrorf( - "the current index must be between 0 and %d, got %d", - size-1, myIndex) - } - - // check private key is BLS key - if _, ok := myPrivateKey.(*prKeyBLSBLS12381); !ok { - return nil, fmt.Errorf("private key of participant %d is not valid: %w", myIndex, notBLSKeyError) - } - - // create the follower - follower, err := NewBLSThresholdSignatureInspector(groupPublicKey, sharePublicKeys, threshold, message, dsTag) - if err != nil { - return nil, fmt.Errorf("create a threshold signature follower failed: %w", err) - } - - // check the private key, index and corresponding public key are consistent - currentPublicKey := sharePublicKeys[myIndex] - if !myPrivateKey.PublicKey().Equals(currentPublicKey) { - return nil, invalidInputsErrorf("private key is not matching public key at index %d", myIndex) - } - - return &blsThresholdSignatureParticipant{ - blsThresholdSignatureInspector: follower, - myIndex: myIndex, // current participant index - myPrivateKey: myPrivateKey, // myPrivateKey is the current participant's own private key share - }, nil -} - -// NewBLSThresholdSignatureInspector creates a new instance of Threshold signature follower using BLS. -// It only allows following the threshold signing protocol . -// -// A new instance is needed for each set of public keys and message. -// If the key set or message change, a new structure needs to be instantiated. -// Participants are defined by their public key share, and are indexed from 0 to n-1 -// where n is the length of the public key shares slice. 
-//
-// The function returns:
-// - (nil, invalidInputsError) if:
-//   - n is not in [`ThresholdSignMinSize`, `ThresholdSignMaxSize`]
-//   - threshold value is not in interval [1, n-1]
-// - (nil, notBLSKeyError) if at least one public key is not of type pubKeyBLSBLS12381
-// - (pointer, nil) otherwise
-func NewBLSThresholdSignatureInspector(
-	groupPublicKey PublicKey,
-	sharePublicKeys []PublicKey,
-	threshold int,
-	message []byte,
-	dsTag string,
-) (*blsThresholdSignatureInspector, error) {
-
-	size := len(sharePublicKeys)
-	if size < ThresholdSignMinSize || size > ThresholdSignMaxSize {
-		return nil, invalidInputsErrorf(
-			"size should be between %d and %d, got %d",
-			ThresholdSignMinSize, ThresholdSignMaxSize, size)
-	}
-	if threshold >= size || threshold < MinimumThreshold {
-		return nil, invalidInputsErrorf(
-			"the threshold must be between %d and %d, got %d",
-			MinimumThreshold, size-1, threshold)
-	}
-
-	// check keys are BLS keys
-	for i, pk := range sharePublicKeys {
-		if _, ok := pk.(*pubKeyBLSBLS12381); !ok {
-			return nil, fmt.Errorf("key at index %d is invalid: %w", i, notBLSKeyError)
-		}
-	}
-	if _, ok := groupPublicKey.(*pubKeyBLSBLS12381); !ok {
-		return nil, fmt.Errorf("group key is invalid: %w", notBLSKeyError)
-	}
-
-	return &blsThresholdSignatureInspector{
-		size:               size,
-		threshold:          threshold,
-		message:            message,
-		hasher:             NewExpandMsgXOFKMAC128(dsTag),
-		shares:             make(map[index]Signature),
-		thresholdSignature: nil,
-		groupPublicKey:     groupPublicKey,  // groupPublicKey is the group public key corresponding to the group secret key
-		publicKeyShares:    sharePublicKeys, // sharePublicKeys are the public key shares corresponding to the private key shares
-	}, nil
-}
-
-// SignShare generates a signature share using the current private key share.
-//
-// The function does not add the share to the internal pool of shares and does
-// not update the internal state.
-// This function is thread safe and non-blocking.
-//
-// The function returns
-// - (nil, error) if an unexpected error occurs
-// - (signature, nil) otherwise
-func (s *blsThresholdSignatureParticipant) SignShare() (Signature, error) {
-	share, err := s.myPrivateKey.Sign(s.message, s.hasher)
-	if err != nil {
-		return nil, fmt.Errorf("share signing failed: %w", err)
-	}
-	return share, nil
-}
-
-// validIndex returns an invalidInputsError if the given index is invalid, and nil otherwise.
-// This function is thread safe.
-func (s *blsThresholdSignatureInspector) validIndex(orig int) error {
-	if orig >= s.size || orig < 0 {
-		return invalidInputsErrorf(
-			"origin input is invalid, should be positive less than %d, got %d",
-			s.size, orig)
-	}
-	return nil
-}
-
-// VerifyShare verifies the input signature against the stored message and stored
-// key at the input index.
-//
-// This function does not update the internal state and is thread-safe.
-// Returns:
-// - (true, nil) if the signature is valid
-// - (false, nil) if `orig` is a valid index but the signature share does not verify against
-//   the public key share and message.
-// - (false, invalidInputsError) if `orig` is an invalid index value
-// - (false, error) for all other unexpected errors
-func (s *blsThresholdSignatureInspector) VerifyShare(orig int, share Signature) (bool, error) {
-	// validate index
-	if err := s.validIndex(orig); err != nil {
-		return false, err
-	}
-	return s.publicKeyShares[orig].Verify(share, s.message, s.hasher)
-}
-
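To make the stateful flow concrete, here is a brief hypothetical sketch (not from the deleted sources) of a participant producing a share that a follower checks with `VerifyShare`; the key material is assumed to come from `crypto.BLSThresholdKeyGen` as in the earlier sketch:

```go
package main

import (
	"fmt"

	"github.com/onflow/flow-go/crypto" // former path of the deleted package (assumption)
)

// verifyOneShare is a hypothetical helper: pkGroup, pkShares, skShares and t
// are assumed to be outputs of crypto.BLSThresholdKeyGen.
func verifyOneShare(
	pkGroup crypto.PublicKey,
	pkShares []crypto.PublicKey,
	skShares []crypto.PrivateKey,
	t int,
	msg []byte,
) {
	const me = 0
	participant, err := crypto.NewBLSThresholdSignatureParticipant(
		pkGroup, pkShares, t, me, skShares[me], msg, "example tag")
	if err != nil {
		panic(err)
	}
	share, err := participant.SignShare()
	if err != nil {
		panic(err)
	}

	// a follower holding only public data can check the share
	follower, err := crypto.NewBLSThresholdSignatureInspector(
		pkGroup, pkShares, t, msg, "example tag")
	if err != nil {
		panic(err)
	}
	ok, err := follower.VerifyShare(me, share)
	fmt.Println(ok, err) // expected: true <nil>
}
```

-// VerifyThresholdSignature verifies the input signature against the stored
-// message and stored group public key.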
-// -// This function does not update the internal state and is thread-safe. -// Returns: -// - (true, nil) if the signature is valid -// - (false, nil) if signature is invalid -// - (false, error) for all other unexpected errors -func (s *blsThresholdSignatureInspector) VerifyThresholdSignature(thresholdSignature Signature) (bool, error) { - return s.groupPublicKey.Verify(thresholdSignature, s.message, s.hasher) -} - -// EnoughShares indicates whether enough shares have been accumulated in order to reconstruct -// a group signature. -// -// This function is thread safe. -// Returns: -// - true if and only if at least (threshold+1) shares were added -func (s *blsThresholdSignatureInspector) EnoughShares() bool { - s.lock.RLock() - defer s.lock.RUnlock() - - return s.enoughShares() -} - -// non thread safe version of EnoughShares -func (s *blsThresholdSignatureInspector) enoughShares() bool { - // len(s.signers) is always <= s.threshold + 1 - return len(s.shares) == (s.threshold + 1) -} - -// HasShare checks whether the internal map contains the share of the given index. -// This function is thread safe and locks the internal state. -// The function returns: -// - (false, invalidInputsError) if the index is invalid -// - (false, nil) if index is valid and share is not in the map -// - (true, nil) if index is valid and share is in the map -func (s *blsThresholdSignatureInspector) HasShare(orig int) (bool, error) { - // validate index - if err := s.validIndex(orig); err != nil { - return false, err - } - - s.lock.RLock() - defer s.lock.RUnlock() - - return s.hasShare(index(orig)), nil -} - -// non thread safe version of HasShare, and assumes input is valid -func (s *blsThresholdSignatureInspector) hasShare(orig index) bool { - _, ok := s.shares[orig] - return ok -} - -// TrustedAdd adds a signature share to the internal pool of shares -// without verifying the signature against the message and the participant's -// public key. This function is thread safe and locks the internal state. -// -// The share is only added if the signer index is valid and has not been -// added yet. Moreover, the share is added only if not enough shares were collected. -// The function returns: -// - (true, nil) if enough signature shares were already collected and no error occurred -// - (false, nil) if not enough shares were collected and no error occurred -// - (false, invalidInputsError) if index is invalid -// - (false, duplicatedSignerError) if a signature for the index was previously added -func (s *blsThresholdSignatureInspector) TrustedAdd(orig int, share Signature) (bool, error) { - // validate index - if err := s.validIndex(orig); err != nil { - return false, err - } - - s.lock.Lock() - defer s.lock.Unlock() - - if s.hasShare(index(orig)) { - return false, duplicatedSignerErrorf("share for %d was already added", orig) - } - - if s.enoughShares() { - return true, nil - } - s.shares[index(orig)] = share - return s.enoughShares(), nil -} - -// VerifyAndAdd verifies a signature share (same as `VerifyShare`), -// and may or may not add the share to the local pool of shares. -// This function is thread safe and locks the internal state. -// -// The share is only added if the signature is valid, the signer index is valid and has not been -// added yet. Moreover, the share is added only if not enough shares were collected. -// Boolean returns: -// - First boolean output is true if the share is valid and no error is returned, and false otherwise. 
-// - Second boolean output is true if enough shares were collected and no error is returned, and false otherwise. -// -// Error returns: -// - invalidInputsError if input index is invalid. A signature that doesn't verify against the signer's -// public key is not considered an invalid input. -// - duplicatedSignerError if signer was already added. -// - other errors if an unexpected exception occurred. -func (s *blsThresholdSignatureInspector) VerifyAndAdd(orig int, share Signature) (bool, bool, error) { - // validate index - if err := s.validIndex(orig); err != nil { - return false, false, err - } - - s.lock.Lock() - defer s.lock.Unlock() - - // check share is new - if s.hasShare(index(orig)) { - return false, false, duplicatedSignerErrorf("share for %d was already added", orig) - } - - // verify the share - verif, err := s.publicKeyShares[index(orig)].Verify(share, s.message, s.hasher) - if err != nil { - return false, false, fmt.Errorf("verification of share failed: %w", err) - } - - enough := s.enoughShares() - if verif && !enough { - s.shares[index(orig)] = share - } - return verif, s.enoughShares(), nil -} - -// ThresholdSignature returns the threshold signature if the threshold was reached. -// The threshold signature is reconstructed only once is cached for subsequent calls. -// -// The function is thread-safe. -// Returns: -// - (signature, nil) if no error occurred -// - (nil, notEnoughSharesError) if not enough shares were collected -// - (nil, invalidSignatureError) if at least one collected share does not serialize to a valid BLS signature. -// - (nil, invalidInputsError) if the constructed signature failed to verify against the group public key and stored -// message. This post-verification is required for safety, as `TrustedAdd` allows adding invalid signatures. -// - (nil, error) for any other unexpected error. -func (s *blsThresholdSignatureInspector) ThresholdSignature() (Signature, error) { - s.lock.Lock() - defer s.lock.Unlock() - - // check cached thresholdSignature - if s.thresholdSignature != nil { - return s.thresholdSignature, nil - } - - // reconstruct the threshold signature - thresholdSignature, err := s.reconstructThresholdSignature() - if err != nil { - return nil, err - } - s.thresholdSignature = thresholdSignature - return thresholdSignature, nil -} - -// reconstructThresholdSignature reconstructs the threshold signature from at least (t+1) shares. -// Returns: -// - (signature, nil) if no error occurred -// - (nil, notEnoughSharesError) if not enough shares were collected -// - (nil, invalidSignatureError) if at least one collected share does not serialize to a valid BLS signature. -// - (nil, invalidInputsError) if the constructed signature failed to verify against the group public key and stored message. -// - (nil, error) for any other unexpected error. -func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signature, error) { - - if !s.enoughShares() { - return nil, notEnoughSharesErrorf("number of signature shares %d is not enough, %d are required", - len(s.shares), s.threshold+1) - } - thresholdSignature := make([]byte, SignatureLenBLSBLS12381) - - // prepare the C layer inputs - shares := make([]byte, 0, len(s.shares)*SignatureLenBLSBLS12381) - signers := make([]index, 0, len(s.shares)) - for index, share := range s.shares { - shares = append(shares, share...) 
- signers = append(signers, index+1) - } - - // Lagrange Interpolate at point 0 - result := C.E1_lagrange_interpolate_at_zero_write( - (*C.uchar)(&thresholdSignature[0]), - (*C.uchar)(&shares[0]), - (*C.uint8_t)(&signers[0]), (C.int)(s.threshold)) - - if result != valid { - return nil, invalidSignatureError - } - - // Verify the computed signature - verif, err := s.VerifyThresholdSignature(thresholdSignature) - if err != nil { - return nil, fmt.Errorf("internal error while verifying the threshold signature: %w", err) - } - if !verif { - return nil, invalidInputsErrorf( - "constructed threshold signature does not verify against the group public key, check shares and public key") - } - - return thresholdSignature, nil -} - -// BLSReconstructThresholdSignature is a stateless BLS api that takes a list of -// BLS signatures and their signers' indices and returns the threshold signature. -// -// size is the number of participants, it must be in the range [ThresholdSignMinSize..ThresholdSignMaxSize]. -// threshold is the threshold value, it must be in the range [MinimumThreshold..size-1]. -// The function does not accept any input public key. Therefore, it does not check the validity of the -// shares against individual public keys, and does not check the validity of the resulting signature -// against the group public key. -// BLSReconstructThresholdSignature returns: -// - (nil, invalidInputsError) if : -// -- numbers of shares does not match the number of signers -// -- the inputs are not in the correct range. -// - (nil, notEnoughSharesError) if the threshold is not reached. -// - (nil, duplicatedSignerError) if input signers are not distinct. -// - (nil, invalidSignatureError) if at least one of the first (threshold+1) signatures. -// does not serialize to a valid E1 point. -// - (threshold_sig, nil) otherwise. -// -// If the number of shares reaches the required threshold, only the first threshold+1 shares -// are considered to reconstruct the signature. -func BLSReconstructThresholdSignature(size int, threshold int, - shares []Signature, signers []int) (Signature, error) { - - if size < ThresholdSignMinSize || size > ThresholdSignMaxSize { - return nil, invalidInputsErrorf( - "size should be between %d and %d", - ThresholdSignMinSize, - ThresholdSignMaxSize) - } - if threshold >= size || threshold < MinimumThreshold { - return nil, invalidInputsErrorf( - "the threshold must be between %d and %d, got %d", - MinimumThreshold, size-1, - threshold) - } - - if len(shares) != len(signers) { - return nil, invalidInputsErrorf( - "the number of signature shares is not matching the number of signers") - } - - if len(shares) < threshold+1 { - return nil, notEnoughSharesErrorf( - "the number of signatures %d is less than the minimum %d", len(shares), threshold+1) - } - - // map to check signers are distinct - m := make(map[index]bool) - - // flatten the shares (required by the C layer) - flatShares := make([]byte, 0, SignatureLenBLSBLS12381*(threshold+1)) - indexSigners := make([]index, 0, threshold+1) - for i, share := range shares { - flatShares = append(flatShares, share...) 
-		// check the index is valid
-		if signers[i] >= size || signers[i] < 0 {
-			return nil, invalidInputsErrorf(
-				"signer index #%d is invalid", i)
-		}
-		// check the index is new
-		if _, isSeen := m[index(signers[i])]; isSeen {
-			return nil, duplicatedSignerErrorf(
-				"%d is a duplicate signer", index(signers[i]))
-		}
-		m[index(signers[i])] = true
-		indexSigners = append(indexSigners, index(signers[i])+1)
-	}
-
-	thresholdSignature := make([]byte, SignatureLenBLSBLS12381)
-	// Lagrange Interpolate at point 0
-	if C.E1_lagrange_interpolate_at_zero_write(
-		(*C.uchar)(&thresholdSignature[0]),
-		(*C.uchar)(&flatShares[0]),
-		(*C.uint8_t)(&indexSigners[0]), (C.int)(threshold),
-	) != valid {
-		return nil, invalidSignatureError
-	}
-	return thresholdSignature, nil
-}
-
-// EnoughShares is a stateless function that takes the value of the threshold
-// and a number of shares, and returns true if the number of shares is enough
-// to reconstruct a threshold signature.
-//
-// The function returns:
-// - (false, invalidInputsError) if the input threshold is less than 1
-// - (false, nil) if the threshold is valid but the shares are not enough.
-// - (true, nil) if the threshold is valid and the shares are enough.
-func EnoughShares(threshold int, sharesNumber int) (bool, error) {
-	if threshold < MinimumThreshold {
-		return false, invalidInputsErrorf(
-			"the threshold can't be smaller than %d, got %d",
-			MinimumThreshold, threshold)
-	}
-	return sharesNumber > threshold, nil
-}
-
-// BLSThresholdKeyGen is a key generation function for a BLS-based
-// threshold signature scheme with a trusted dealer.
-//
-// The function returns:
-// - (nil, nil, nil, invalidInputsError) if:
-//   - seed is too short
-//   - n is not in [`ThresholdSignMinSize`, `ThresholdSignMaxSize`]
-//   - threshold value is not in interval [1, n-1]
-// - ([]privKeyShares, []pubKeyShares, groupPubKey, nil) otherwise
-func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey,
-	[]PublicKey, PublicKey, error) {
-
-	if size < ThresholdSignMinSize || size > ThresholdSignMaxSize {
-		return nil, nil, nil, invalidInputsErrorf(
-			"size should be between %d and %d, got %d",
-			ThresholdSignMinSize,
-			ThresholdSignMaxSize,
-			size)
-	}
-	if threshold >= size || threshold < MinimumThreshold {
-		return nil, nil, nil, invalidInputsErrorf(
-			"the threshold must be between %d and %d, got %d",
-			MinimumThreshold,
-			size-1,
-			threshold)
-	}
-
-	// the scalars x and G2 points y
-	x := make([]scalar, size)
-	y := make([]pointE2, size)
-	var X0 pointE2
-
-	// Generate a polynomial P in Fr[X] of degree t
-	a, err := generateFrPolynomial(seed, threshold)
-	if err != nil {
-		return nil, nil, nil, fmt.Errorf("failed to generate random polynomial: %w", err)
-	}
-
-	// compute the shares
-	for i := index(1); int(i) <= size; i++ {
-		C.Fr_polynomial_image(
-			(*C.Fr)(&x[i-1]),
-			(*C.E2)(&y[i-1]),
-			(*C.Fr)(&a[0]), (C.int)(len(a)-1),
-			(C.uint8_t)(i),
-		)
-	}
-	// group public key
-	generatorScalarMultG2(&X0, &a[0])
-	// export the keys
-	skShares := make([]PrivateKey, size)
-	pkShares := make([]PublicKey, size)
-	var pkGroup PublicKey
-	for i := 0; i < size; i++ {
-		skShares[i] = newPrKeyBLSBLS12381(&x[i])
-		pkShares[i] = newPubKeyBLSBLS12381(&y[i])
-	}
-	pkGroup = newPubKeyBLSBLS12381(&X0)
-
-	// public key shares and the group public key
-	// are sampled uniformly at random. The probability of
-	// generating an identity key is therefore negligible.
-	return skShares, pkShares, pkGroup, nil
-}
diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c
deleted file mode 100644
index 7c1d809d228..00000000000
--- a/crypto/bls_thresholdsign_core.c
+++ /dev/null
@@ -1,118 +0,0 @@
-#include "bls_thresholdsign_include.h"
-
-// the highest index of a threshold participant
-#define MAX_IND 255
-#define MAX_IND_BITS 8 // equal to ceiling(log_2(MAX_IND))
-
-// Computes the Lagrange coefficient L_i(0) in Fr with regards to the range
-// [indices(0)..indices(t)] and stores it in `res`, where t is the degree of the
-// polynomial P.
-// `degree` is equal to the polynomial degree `t`.
-static void Fr_lagrange_coeff_at_zero(Fr *res, const int i,
-                                      const byte indices[], const int degree) {
-
-  // coefficient is computed as N * D^(-1)
-  Fr numerator;   // eventually would represent N*R^k
-  Fr denominator; // eventually would represent D*R^k
-
-  // Initialize N and D to Montgomery constant R
-  Fr_copy(&numerator, &BLS12_381_rR);
-  Fr_copy(&denominator, &BLS12_381_rR);
-
-  // sign of D: 0 for positive and 1 for negative
-  int sign = 0;
-
-  // the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < 2^64 (approximately
-  // 64/MAX_IND_BITS). This means we can multiply up to (k) indices in a limb (64
-  // bits) without overflowing.
-  const int loops = 64 / MAX_IND_BITS;
-  int k, j = 0;
-  Fr tmp;
-  while (j < degree + 1) {
-    limb_t limb_numerator = 1;
-    limb_t limb_denominator = 1;
-    // batch up to `loops` elements in one limb
-    for (k = j; j < MIN(degree + 1, k + loops); j++) {
-      if (j == i)
-        continue;
-      if (indices[j] < indices[i]) {
-        sign ^= 1;
-        limb_denominator *= indices[i] - indices[j];
-      } else {
-        limb_denominator *= indices[j] - indices[i];
-      }
-      limb_numerator *= indices[j];
-    }
-    // numerator and denominator are both computed in Montgomery form.
-    // update numerator
-    Fr_set_limb(&tmp, limb_numerator);          // L_N
-    Fr_to_montg(&tmp, &tmp);                    // L_N*R
-    Fr_mul_montg(&numerator, &numerator, &tmp); // N*R
-    // update denominator
-    Fr_set_limb(&tmp, limb_denominator);            // L_D
-    Fr_to_montg(&tmp, &tmp);                        // L_D*R
-    Fr_mul_montg(&denominator, &denominator, &tmp); // D*R
-  }
-  if (sign) {
-    Fr_neg(&denominator, &denominator);
-  }
-
-  // at this point, denominator = D*R and numerator = N*R
-  // inversion inv(x) = x^(-1)R
-  Fr_inv_montg_eucl(&denominator, &denominator); // (DR)^(-1)*R = D^(-1)
-  Fr_mul_montg(res, &numerator, &denominator);   // N*D^(-1)
-}
-
-// Computes the Lagrange interpolation at zero P(0) = LI(0) with regards to the
-// indices [indices(0)..indices(t)] and their G1 images [shares(0)..shares(t)],
-// and stores the resulting G1 point in `out`.
-// `degree` is equal to the polynomial degree `t`.
-static void E1_lagrange_interpolate_at_zero(E1 *out, const E1 shares[],
-                                            const byte indices[],
-                                            const int degree) {
-  // Purpose is to compute Q(0) where Q(x) = A_0 + A_1*x + ... + A_t*x^t in G1
-  // where A_i = g1 ^ a_i
-
-  // Q(0) = share_i0 ^ L_i0(0) + share_i1 ^ L_i1(0) + .. + share_it ^ L_it(0)
-  // where L is the Lagrange coefficient
-
-  E1_set_infty(out);
-  Fr fr_lagr_coef;
-  E1 mult;
-  for (int i = 0; i < degree + 1; i++) {
-    Fr_lagrange_coeff_at_zero(&fr_lagr_coef, i, indices, degree);
-    E1_mult(&mult, &shares[i], &fr_lagr_coef);
-    E1_add(out, out, &mult);
-  }
-}
-
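For reference, the computation implemented by the two functions above is standard Lagrange interpolation at zero over Fr, written here with x_k denoting indices[k]:

```latex
% coefficient computed by Fr_lagrange_coeff_at_zero:
L_i(0) = \prod_{\substack{0 \le j \le t \\ j \ne i}} \frac{x_j}{x_j - x_i} \pmod{r}
% interpolation computed by E1_lagrange_interpolate_at_zero, in additive G1 notation:
P(0) = \sum_{i=0}^{t} L_i(0) \cdot \mathrm{share}_i
```

-// Computes the Lagrange interpolation at zero LI(0) with regards to the
-// indices [indices(0)..indices(t)], reading the (t+1) concatenated E1
-// serializations from `shares` and writing the serialized result in `dest`.
-// `degree` is equal to the polynomial degree `t`.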
-int E1_lagrange_interpolate_at_zero_write(byte *dest, const byte *shares, - const byte indices[], - const int degree) { - int read_ret; - E1 *E1_shares = malloc(sizeof(E1) * (degree + 1)); - for (int i = 0; i < degree + 1; i++) { - read_ret = - E1_read_bytes(&E1_shares[i], &shares[G1_SER_BYTES * i], G1_SER_BYTES); - if (read_ret != VALID) { - goto out; - } - } - - // G1 interpolation at 0 - // computes Q(x) = A_0 + A_1*x + ... + A_t*x^t in G1, - // where A_i = g1 ^ a_i - E1 res; - E1_lagrange_interpolate_at_zero(&res, E1_shares, indices, degree); - // export the result - E1_write_bytes(dest, &res); - read_ret = VALID; -out: - // free the temp memory - free(E1_shares); - return read_ret; -} diff --git a/crypto/bls_thresholdsign_include.h b/crypto/bls_thresholdsign_include.h deleted file mode 100644 index d41779dab25..00000000000 --- a/crypto/bls_thresholdsign_include.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef _THRESHOLD_INCLUDE_H -#define _THRESHOLD_INCLUDE_H - -#include "bls_include.h" - -int E1_lagrange_interpolate_at_zero_write(byte *, const byte *, const byte[], - const int); -extern void Fr_polynomial_image(Fr *out, E2 *y, const Fr *a, const int a_size, - const byte x); - -#endif diff --git a/crypto/bls_thresholdsign_test.go b/crypto/bls_thresholdsign_test.go deleted file mode 100644 index 9f3f83cb387..00000000000 --- a/crypto/bls_thresholdsign_test.go +++ /dev/null @@ -1,649 +0,0 @@ -package crypto - -import ( - crand "crypto/rand" - "fmt" - "sync" - "testing" - "time" - - log "github.com/sirupsen/logrus" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestBLSThresholdSignature(t *testing.T) { - // stateless API - t.Run("centralized_stateless_keygen", testCentralizedStatelessAPI) - // stateful API - t.Run("centralized_stateful_keygen", testCentralizedStatefulAPI) - t.Run("distributed_stateful_feldmanVSS_keygen", testDistributedStatefulAPI_FeldmanVSS) - t.Run("distributed_stateful_jointFeldman_keygen", testDistributedStatefulAPI_JointFeldman) // Flow Random beacon case -} - -const thresholdSignatureTag = "random tag" - -var thresholdSignatureMessage = []byte("random message") - -// centralized test of the stateful threshold signature using the threshold key generation. 
-func testCentralizedStatefulAPI(t *testing.T) {
-	rand := getPRG(t)
-	seed := make([]byte, KeyGenSeedMinLen)
-	_, err := rand.Read(seed)
-	n := 10
-	for threshold := MinimumThreshold; threshold < n; threshold++ {
-		// generate threshold keys
-		require.NoError(t, err)
-		skShares, pkShares, pkGroup, err := BLSThresholdKeyGen(n, threshold, seed)
-		require.NoError(t, err)
-		// generate signature shares
-		signers := make([]int, 0, n)
-		// hasher
-		kmac := NewExpandMsgXOFKMAC128(thresholdSignatureTag)
-		// fill the signers list and shuffle it
-		for i := 0; i < n; i++ {
-			signers = append(signers, i)
-		}
-		rand.Shuffle(n, func(i, j int) {
-			signers[i], signers[j] = signers[j], signers[i]
-		})
-
-		t.Run("happy path", func(t *testing.T) {
-			// create the stateful threshold signer
-			ts, err := NewBLSThresholdSignatureInspector(pkGroup, pkShares, threshold, thresholdSignatureMessage, thresholdSignatureTag)
-			require.NoError(t, err)
-
-			// check EnoughShares
-			enough := ts.EnoughShares()
-			assert.False(t, enough)
-			var wg sync.WaitGroup
-			// create (t) signatures of the first randomly chosen signers
-			// (1 signature short of the threshold)
-			for j := 0; j < threshold; j++ {
-				wg.Add(1)
-				// test thread safety
-				go func(j int) {
-					defer wg.Done()
-					i := signers[j]
-					share, err := skShares[i].Sign(thresholdSignatureMessage, kmac)
-					require.NoError(t, err)
-					// VerifyShare
-					verif, err := ts.VerifyShare(i, share)
-					assert.NoError(t, err)
-					assert.True(t, verif, "signature should be valid")
-					// check HasShare is false
-					ok, err := ts.HasShare(i)
-					assert.NoError(t, err)
-					assert.False(t, ok)
-					// TrustedAdd
-					enough, err := ts.TrustedAdd(i, share)
-					assert.NoError(t, err)
-					assert.False(t, enough)
-					// check HasShare is true
-					ok, err = ts.HasShare(i)
-					assert.NoError(t, err)
-					assert.True(t, ok)
-					// check EnoughShares
-					assert.False(t, ts.EnoughShares(), "threshold shouldn't be reached")
-					// check ThresholdSignature
-					sig, err := ts.ThresholdSignature()
-					assert.Error(t, err)
-					assert.True(t, IsNotEnoughSharesError(err))
-					assert.Nil(t, sig)
-				}(j)
-			}
-			wg.Wait()
-			// add the last required signature to get (t+1) shares
-			i := signers[threshold]
-			share, err := skShares[i].Sign(thresholdSignatureMessage, kmac)
-			require.NoError(t, err)
-			verif, enough, err := ts.VerifyAndAdd(i, share)
-			assert.NoError(t, err)
-			assert.True(t, verif)
-			assert.True(t, enough)
-			// check EnoughShares
-			assert.True(t, ts.EnoughShares())
-
-			// add a share when the threshold is reached
-			if threshold+1 < n {
-				i := signers[threshold+1]
-				share, err := skShares[i].Sign(thresholdSignatureMessage, kmac)
-				require.NoError(t, err)
-				// Trusted Add
-				enough, err := ts.TrustedAdd(i, share)
-				assert.NoError(t, err)
-				assert.True(t, enough)
-				// VerifyAndAdd
-				verif, enough, err := ts.VerifyAndAdd(i, share)
-				assert.NoError(t, err)
-				assert.True(t, verif)
-				assert.True(t, enough)
-			}
-			// reconstruct the threshold signature
-			thresholdsignature, err := ts.ThresholdSignature()
-			require.NoError(t, err)
-			// VerifyThresholdSignature
-			verif, err = ts.VerifyThresholdSignature(thresholdsignature)
-			require.NoError(t, err)
-			assert.True(t, verif)
-		})
-
-		t.Run("duplicate signer", func(t *testing.T) {
-			// create the stateful threshold signer
-			ts, err := NewBLSThresholdSignatureInspector(pkGroup, pkShares, threshold, thresholdSignatureMessage, thresholdSignatureTag)
-			require.NoError(t, err)
-
-			// Create a share and add it
-			i := rand.Intn(n)
-			share, err := skShares[i].Sign(thresholdSignatureMessage, kmac)
require.NoError(t, err) - enough, err := ts.TrustedAdd(i, share) - assert.NoError(t, err) - assert.False(t, enough) - - // Add an existing share - - // VerifyAndAdd - verif, enough, err := ts.VerifyAndAdd(i, share) - assert.Error(t, err) - assert.True(t, IsDuplicatedSignerError(err)) - assert.False(t, verif) - assert.False(t, enough) - // TrustedAdd - enough, err = ts.TrustedAdd(i, share) - assert.Error(t, err) - assert.True(t, IsDuplicatedSignerError(err)) - assert.False(t, enough) - }) - - t.Run("Invalid index", func(t *testing.T) { - // create the stateful threshold signer - ts, err := NewBLSThresholdSignatureInspector(pkGroup, pkShares, threshold, thresholdSignatureMessage, thresholdSignatureTag) - require.NoError(t, err) - - share, err := skShares[0].Sign(thresholdSignatureMessage, kmac) - require.NoError(t, err) - // invalid index - invalidIndex := len(pkShares) + 1 - // VerifyShare - verif, err := ts.VerifyShare(invalidIndex, share) - assert.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.False(t, verif) - // TrustedAdd - enough, err := ts.TrustedAdd(invalidIndex, share) - assert.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.False(t, enough) - // VerifyAndAdd - verif, enough, err = ts.VerifyAndAdd(invalidIndex, share) - assert.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.False(t, verif) - assert.False(t, enough) - // HasShare - verif, err = ts.HasShare(invalidIndex) - assert.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.False(t, verif) - }) - - t.Run("invalid signature", func(t *testing.T) { - index := signers[0] - ts, err := NewBLSThresholdSignatureInspector(pkGroup, pkShares, threshold, thresholdSignatureMessage, thresholdSignatureTag) - require.NoError(t, err) - share, err := skShares[index].Sign(thresholdSignatureMessage, kmac) - require.NoError(t, err) - - // alter signature - invalid serialization - tmp := share[0] - share[0] = invalidBLSSignatureHeader - // VerifyShare - verif, err := ts.VerifyShare(index, share) - assert.NoError(t, err) - assert.False(t, verif) - // VerifyAndAdd - verif, enough, err := ts.VerifyAndAdd(index, share) - assert.NoError(t, err) - assert.False(t, verif) - assert.False(t, enough) - // check share was not added - verif, err = ts.HasShare(index) - assert.NoError(t, err) - assert.False(t, verif) - // restore share - share[0] = tmp - - // valid curve point but invalid signature - otherIndex := (index + 1) % n // otherIndex is different than index - // VerifyShare - verif, err = ts.VerifyShare(otherIndex, share) - assert.NoError(t, err) - assert.False(t, verif) - // VerifyAndAdd - verif, enough, err = ts.VerifyAndAdd(otherIndex, share) - assert.NoError(t, err) - assert.False(t, verif) - assert.False(t, enough) - // check share was not added - verif, err = ts.HasShare(otherIndex) - assert.NoError(t, err) - assert.False(t, verif) - - // trust add one invalid signature and check ThresholdSignature - tmp = share[0] - share[0] = invalidBLSSignatureHeader // alter the share - enough, err = ts.TrustedAdd(index, share) // invalid share - assert.NoError(t, err) - assert.False(t, enough) - for i := 1; i < threshold+1; i++ { // valid shares - index := signers[i] - valid, err := skShares[index].Sign(thresholdSignatureMessage, kmac) - require.NoError(t, err) - enough, err = ts.TrustedAdd(index, valid) - assert.NoError(t, err) - if i < threshold { - assert.False(t, enough) - } else { - assert.True(t, enough) - } - } - sig, err := ts.ThresholdSignature() - assert.Error(t, err) - 
assert.True(t, IsInvalidSignatureError(err)) - assert.Nil(t, sig) - share[0] = tmp // restore the share - }) - - t.Run("constructor errors", func(t *testing.T) { - // invalid keys size - index := rand.Intn(n) - pkSharesInvalid := make([]PublicKey, ThresholdSignMaxSize+1) - tsFollower, err := NewBLSThresholdSignatureInspector(pkGroup, pkSharesInvalid, threshold, thresholdSignatureMessage, thresholdSignatureTag) - assert.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, tsFollower) - // non BLS key share - seed := make([]byte, KeyGenSeedMinLen) - _, err = rand.Read(seed) - require.NoError(t, err) - skEcdsa, err := GeneratePrivateKey(ECDSAP256, seed) - require.NoError(t, err) - tmp := pkShares[0] - pkShares[0] = skEcdsa.PublicKey() - tsFollower, err = NewBLSThresholdSignatureInspector(pkGroup, pkShares, threshold, thresholdSignatureMessage, thresholdSignatureTag) - assert.Error(t, err) - assert.True(t, IsNotBLSKeyError(err)) - assert.Nil(t, tsFollower) - pkShares[0] = tmp // restore valid keys - // non BLS group key - tsFollower, err = NewBLSThresholdSignatureInspector(skEcdsa.PublicKey(), pkShares, threshold, thresholdSignatureMessage, thresholdSignatureTag) - assert.Error(t, err) - assert.True(t, IsNotBLSKeyError(err)) - assert.Nil(t, tsFollower) - // non BLS private key - tsParticipant, err := NewBLSThresholdSignatureParticipant(pkGroup, pkShares, threshold, index, skEcdsa, thresholdSignatureMessage, thresholdSignatureTag) - assert.Error(t, err) - assert.True(t, IsNotBLSKeyError(err)) - assert.Nil(t, tsParticipant) - // invalid current index - tsParticipant, err = NewBLSThresholdSignatureParticipant(pkGroup, pkShares, threshold, len(pkShares)+1, skShares[index], thresholdSignatureMessage, thresholdSignatureTag) - assert.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, tsParticipant) - // invalid threshold - tsFollower, err = NewBLSThresholdSignatureInspector(pkGroup, pkShares, len(pkShares)+1, thresholdSignatureMessage, thresholdSignatureTag) - assert.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, tsFollower) - // inconsistent private and public key - indexSwap := (index + 1) % n // indexSwap is different than index - pkShares[index], pkShares[indexSwap] = pkShares[indexSwap], pkShares[index] - tsParticipant, err = NewBLSThresholdSignatureParticipant(pkGroup, pkShares, len(pkShares)+1, index, skShares[index], thresholdSignatureMessage, thresholdSignatureTag) - assert.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, tsParticipant) - pkShares[index], pkShares[indexSwap] = pkShares[indexSwap], pkShares[index] // restore keys - }) - } -} - -// Distributed Threshold Signature stateful api test -// keys are generated using simple Feldman VSS -func testDistributedStatefulAPI_FeldmanVSS(t *testing.T) { - log.SetLevel(log.ErrorLevel) - log.Info("DKG starts") - gt = t - rand := getPRG(t) - // number of participants to test - n := 5 - lead := rand.Intn(n) // random - var sync sync.WaitGroup - chans := make([]chan *message, n) - processors := make([]testDKGProcessor, 0, n) - - // create n processors for all participants - for current := 0; current < n; current++ { - processors = append(processors, testDKGProcessor{ - current: current, - chans: chans, - protocol: dkgType, - }) - // create DKG in all participants - var err error - processors[current].dkg, err = NewFeldmanVSS(n, optimalThreshold(n), - current, &processors[current], lead) - require.NoError(t, err) - } - - // create the participant 
(buffered) communication channels - for i := 0; i < n; i++ { - chans[i] = make(chan *message, 2*n) - } - // start DKG in all participants - seed := make([]byte, KeyGenSeedMinLen) - read, err := rand.Read(seed) - require.Equal(t, read, KeyGenSeedMinLen) - require.NoError(t, err) - sync.Add(n) - for current := 0; current < n; current++ { - err := processors[current].dkg.Start(seed) - require.NoError(t, err) - go tsDkgRunChan(&processors[current], &sync, t, 2) - } - - // synchronize the main thread to end DKG - sync.Wait() - for i := 1; i < n; i++ { - assert.True(t, processors[i].pk.Equals(processors[0].pk), "2 group public keys are mismatching") - } - - // Start TS - log.Info("TS starts") - sync.Add(n) - for i := 0; i < n; i++ { - go tsRunChan(&processors[i], &sync, t) - } - // synchronize the main thread to end TS - sync.Wait() -} - -// Distributed Threshold Signature stateful api test -// keys are generated using Joint-Feldman -func testDistributedStatefulAPI_JointFeldman(t *testing.T) { - log.SetLevel(log.ErrorLevel) - log.Info("DKG starts") - gt = t - rand := getPRG(t) - // number of participants to test - n := 5 - for threshold := MinimumThreshold; threshold < n; threshold++ { - var sync sync.WaitGroup - chans := make([]chan *message, n) - processors := make([]testDKGProcessor, 0, n) - - // create n processors for all participants - for current := 0; current < n; current++ { - processors = append(processors, testDKGProcessor{ - current: current, - chans: chans, - protocol: dkgType, - }) - // create DKG in all participants - var err error - processors[current].dkg, err = NewJointFeldman(n, - optimalThreshold(n), current, &processors[current]) - require.NoError(t, err) - } - - // create the participant (buffered) communication channels - for i := 0; i < n; i++ { - chans[i] = make(chan *message, 2*n) - } - // start DKG in all participants but the - seed := make([]byte, KeyGenSeedMinLen) - read, err := rand.Read(seed) - require.Equal(t, read, KeyGenSeedMinLen) - require.NoError(t, err) - sync.Add(n) - for current := 0; current < n; current++ { - err := processors[current].dkg.Start(seed) - require.NoError(t, err) - go tsDkgRunChan(&processors[current], &sync, t, 0) - } - - // sync the 2 timeouts at all participants and start the next phase - for phase := 1; phase <= 2; phase++ { - sync.Wait() - sync.Add(n) - for current := 0; current < n; current++ { - go tsDkgRunChan(&processors[current], &sync, t, phase) - } - } - - // synchronize the main thread to end DKG - sync.Wait() - for i := 1; i < n; i++ { - assert.True(t, processors[i].pk.Equals(processors[0].pk), - "2 group public keys are mismatching") - } - - // Start TS - log.Info("TS starts") - sync.Add(n) - for current := 0; current < n; current++ { - go tsRunChan(&processors[current], &sync, t) - } - // synchronize the main thread to end TS - sync.Wait() - } -} - -// This is a testing function -// It simulates processing incoming messages by a participant during DKG -// It assumes proc.dkg is already running -func tsDkgRunChan(proc *testDKGProcessor, - sync *sync.WaitGroup, t *testing.T, phase int) { - for { - select { - case newMsg := <-proc.chans[proc.current]: - log.Debugf("%d Receiving DKG from %d:", proc.current, newMsg.orig) - if newMsg.channel == private { - err := proc.dkg.HandlePrivateMsg(newMsg.orig, newMsg.data) - require.Nil(t, err) - } else { - err := proc.dkg.HandleBroadcastMsg(newMsg.orig, newMsg.data) - require.Nil(t, err) - } - - // if timeout, finalize DKG and create the threshold signer - case <-time.After(200 * 
time.Millisecond):
-			switch phase {
-			case 0:
-				log.Infof("%d shares phase ended \n", proc.current)
-				err := proc.dkg.NextTimeout()
-				require.NoError(t, err)
-			case 1:
-				log.Infof("%d complaints phase ended \n", proc.current)
-				err := proc.dkg.NextTimeout()
-				require.NoError(t, err)
-			case 2:
-				log.Infof("%d dkg ended \n", proc.current)
-				sk, groupPK, nodesPK, err := proc.dkg.End()
-				require.NotNil(t, sk)
-				require.NotNil(t, groupPK)
-				require.NotNil(t, nodesPK)
-				require.Nil(t, err, "End dkg failed: %v\n", err)
-				proc.pk = groupPK
-				n := proc.dkg.Size()
-				proc.ts, err = NewBLSThresholdSignatureParticipant(groupPK, nodesPK, optimalThreshold(n), proc.current, sk, thresholdSignatureMessage, thresholdSignatureTag)
-				require.NoError(t, err)
-				// needed to test the stateless API
-				proc.keys = &statelessKeys{sk, groupPK, nodesPK}
-			}
-			sync.Done()
-			return
-		}
-	}
-}
-
-// This is a testing function using the stateful API.
-// It simulates processing incoming messages by a participant during TS
-func tsRunChan(proc *testDKGProcessor, sync *sync.WaitGroup, t *testing.T) {
-	// Sign a share and broadcast it
-	sigShare, err := proc.ts.SignShare()
-	proc.protocol = tsType
-	if err != nil { // not using require.Nil for now
-		panic(fmt.Sprintf("%d couldn't sign", proc.current))
-	}
-	proc.Broadcast(sigShare)
-	for {
-		select {
-		case newMsg := <-proc.chans[proc.current]:
-			log.Debugf("%d Receiving TS from %d:", proc.current, newMsg.orig)
-			verif, enough, err := proc.ts.VerifyAndAdd(
-				newMsg.orig, newMsg.data)
-			require.NoError(t, err)
-			assert.True(t, verif,
-				"the signature share sent from %d to %d is not correct", newMsg.orig,
-				proc.current)
-			log.Info(enough)
-			if enough {
-				assert.Equal(t, enough, proc.ts.EnoughShares())
-				thresholdSignature, err := proc.ts.ThresholdSignature()
-				require.NoError(t, err)
-				verif, err = proc.ts.VerifyThresholdSignature(thresholdSignature)
-				require.NoError(t, err)
-				assert.True(t, verif, "the threshold signature is not correct")
-				if verif {
-					log.Infof("%d reconstructed a valid signature: %d\n", proc.current,
-						thresholdSignature)
-				}
-			}
-
-		// if timeout, finalize TS
-		case <-time.After(time.Second):
-			sync.Done()
-			return
-		}
-	}
-}
-
-// This structure holds the keys and is needed for the stateless test
-type statelessKeys struct {
-	// the current participant private key (a DKG output)
-	myPrivateKey PrivateKey
-	// the group public key (a DKG output)
-	groupPublicKey PublicKey
-	// the group public key shares (a DKG output)
-	publicKeyShares []PublicKey
-}
-
-// Centralized test of the threshold signature protocol using the threshold key generation.
-func testCentralizedStatelessAPI(t *testing.T) { - - seed := make([]byte, KeyGenSeedMinLen) - n := 10 - for threshold := MinimumThreshold; threshold < n; threshold++ { - // generate threshold keys - rand := getPRG(t) - _, err := rand.Read(seed) - require.NoError(t, err) - skShares, pkShares, pkGroup, err := BLSThresholdKeyGen(n, threshold, seed) - require.NoError(t, err) - // signature hasher - kmac := NewExpandMsgXOFKMAC128(thresholdSignatureTag) - // generate signature shares - signShares := make([]Signature, 0, n) - signers := make([]int, 0, n) - // fill the signers list and shuffle it - for i := 0; i < n; i++ { - signers = append(signers, i) - } - rand.Shuffle(n, func(i, j int) { - signers[i], signers[j] = signers[j], signers[i] - }) - // create (t+1) signatures of the first randomly chosen signers - for j := 0; j < threshold+1; j++ { - i := signers[j] - share, err := skShares[i].Sign(thresholdSignatureMessage, kmac) - require.NoError(t, err) - verif, err := pkShares[i].Verify(share, thresholdSignatureMessage, kmac) - require.NoError(t, err) - assert.True(t, verif, "signature share is not valid") - if verif { - signShares = append(signShares, share) - } - } - // reconstruct and test the threshold signature - thresholdSignature, err := BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) - require.NoError(t, err) - verif, err := pkGroup.Verify(thresholdSignature, thresholdSignatureMessage, kmac) - require.NoError(t, err) - assert.True(t, verif, "signature share is not valid") - - // check failure with a random redundant signer - if threshold > 1 { - randomDuplicate := rand.Intn(int(threshold)) + 1 // 1 <= duplicate <= threshold - tmp := signers[randomDuplicate] - signers[randomDuplicate] = signers[0] - thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) - assert.Error(t, err) - assert.True(t, IsDuplicatedSignerError(err)) - assert.Nil(t, thresholdSignature) - signers[randomDuplicate] = tmp - } - - // check with not enough signatures - thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares[:threshold], signers[:threshold]) - assert.Error(t, err) - assert.True(t, IsNotEnoughSharesError(err)) - assert.Nil(t, thresholdSignature) - - // check with an invalid signature (invalid serialization) - signShares[0] = BLSInvalidSignature() - thresholdSignature, err = BLSReconstructThresholdSignature(n, threshold, signShares, signers[:threshold+1]) - assert.Error(t, err) - assert.True(t, IsInvalidSignatureError(err)) - assert.Nil(t, thresholdSignature) - } -} - -func BenchmarkSimpleKeyGen(b *testing.B) { - n := 60 - seed := make([]byte, KeyGenSeedMinLen) - _, err := crand.Read(seed) - require.NoError(b, err) - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, _, _, _ = BLSThresholdKeyGen(n, optimalThreshold(n), seed) - } -} - -func BenchmarkSignatureReconstruction(b *testing.B) { - n := 60 - seed := make([]byte, KeyGenSeedMinLen) - _, _ = crand.Read(seed) - threshold := 40 - // generate threshold keys - skShares, _, _, err := BLSThresholdKeyGen(n, threshold, seed) - require.NoError(b, err) - // signature hasher - kmac := NewExpandMsgXOFKMAC128(thresholdSignatureTag) - // generate signature shares - signShares := make([]Signature, 0, threshold+1) - signers := make([]int, 0, threshold+1) - // create (t+1) signatures of the first randomly chosen signers - for i := 0; i < threshold+1; i++ { - signers = append(signers, i) - share, err := skShares[i].Sign(thresholdSignatureMessage, kmac) - 
require.NoError(b, err) - signShares = append(signShares, share) - } - // reconstruct - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := BLSReconstructThresholdSignature(n, threshold, signShares, signers) - require.NoError(b, err) - } -} diff --git a/crypto/blst_assembly.S b/crypto/blst_assembly.S deleted file mode 100644 index fb99b3d985e..00000000000 --- a/crypto/blst_assembly.S +++ /dev/null @@ -1 +0,0 @@ -# include "assembly.S" diff --git a/crypto/blst_include.h b/crypto/blst_include.h deleted file mode 100644 index d5eb5079cfd..00000000000 --- a/crypto/blst_include.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef __BLST_INCLUDE_H__ -#define __BLST_INCLUDE_H__ - -// BLST src headers -#include "consts.h" -#include "fields.h" -#include "point.h" - -// types used by the Flow crypto library that are imported from BLST. -// these type definitions are used as an abstraction from BLST internal types. - -// field elements F_r -// where `r` is the order of G1/G2. -// F_r elements are represented as big numbers reduced modulo `r`. Big numbers -// are represented as a little endian vector of limbs. -// `Fr` is equivalent to type `vec256` (used internally by BLST for F_r -// elements). `Fr` is defined as a struct so that it can be exportable through -// cgo to the Go layer. -#define R_BITS 255 // equal to Fr_bits in bls12381_utils.h -typedef struct { - limb_t limbs[(R_BITS + 63) / 64]; -} Fr; - -// field elements F_p -// F_p elements are represented as big numbers reduced modulo `p`. Big numbers -// are represented as a little endian vector of limbs. -// `Fp` is equivalent to type `vec384` (used internally by BLST for F_p -// elements). `Fp` does not need to be exported to cgo. -typedef vec384 Fp; - -// curve E_1 (over F_p) -// E_1 points are represented in Jacobian coordinates (x,y,z), -// where x, y, z are elements of F_p (type `Fp`). -// `E1` is equivalent to type `POINTonE1` (used internally by BLST for Jacobian -// E1 elements). `E1` is defined as a struct to be exportable through cgo to the -// Go layer. `E1` is also used to represent all subgroup G_1 elements. -typedef struct { - Fp x, y, z; -} E1; - -// field elements F_p^2 -// F_p^2 elements are represented as a vector of two F_p elements. -// `Fp2` is equivalent to type `vec384x` (used internally by BLST for F_p^2 -// elements). `Fp2` does not need to be exported to cgo. -typedef vec384x Fp2; -// helpers to get "real" and "imaginary" Fp elements from Fp2 pointers -#define real(p) ((*(p))[0]) -#define imag(p) ((*(p))[1]) - -// curve E_2 (over F_p^2) -// E_2 points are represented in Jacobian coordinates (x,y,z), -// where x, y, z are elements of F_p^2 (type `Fp2`). -// `E2` is equivalent to type `POINTonE2` (used internally by BLST for Jacobian -// E2 elements). `E2` is defined as a struct to be exportable through cgo to the -// Go layer. `E2` is also used to represent all subgroup G_2 elements. -typedef struct { - Fp2 x, y, z; -} E2; - -// Fp12 is the codomain of the pairing function `e`, specifically the subgroup -// G_T of Fp12. -// Fp12 represents G_T elements and is equivalent to `vec384fp12` (used -// internally by BLST). -typedef vec384fp12 Fp12; -#endif diff --git a/crypto/blst_src/README.md b/crypto/blst_src/README.md deleted file mode 100644 index c2e89a1de71..00000000000 --- a/crypto/blst_src/README.md +++ /dev/null @@ -1,31 +0,0 @@ -All files in this folder are source files copied from the BLST repo https://github.com/supranational/blst, -specifically from the tagged version `v0.3.11`.
- - Copyright Supranational LLC - Licensed under the Apache License, Version 2.0, see LICENSE for details. - SPDX-License-Identifier: Apache-2.0 - -While BLST exports multiple functions and tools, the implementation in Flow crypto requires access to low-level internal functions. Some of these functions are not exported by BLST, while others need to be called without paying the cgo overhead, and therefore without going through BLST's Go bindings. - -The folder contains: -- BLST LICENSE file -- all `/src/*.c` and `/src/*.h` files (C source files) except `server.c`. -- `server.c` is replaced by `./blst_src.c` (which lists only the files needed by Flow crypto). -- all `/build` (generated assembly files). -- this `README` file. - -To upgrade the BLST version: -- [ ] audit all BLST updates, with focus on `/src`: https://github.com/supranational/blst/compare/v0.3.11... -- [ ] delete all files in this folder `./blst_src/` except `blst_src.c` and `README.md`. -- [ ] delete all files in `./internal/blst/`. -- [ ] open the BLST repository at the new version. -- [ ] copy all `.c` and `.h` files from `/src/` into `./blst_src/`. -- [ ] delete the newly copied `./blst_src/server.c`. -- [ ] copy the folder `/build/` into this folder `./blst_src`. -- [ ] copy `/bindings/blst.h`, `/bindings/blst_aux.h`, and `/bindings/go/blst.go` into `./internal/blst/.`. -- [ ] check that C flags in `./bls12381_utils.go` still include the C flags in `/bindings/go/blst.go`. -- [ ] update `./blst_src/blst_src.c` if needed. -- [ ] solve all breaking changes that may occur. -- [ ] update the commit version in this `./blst_src/README`. - -Note that Flow crypto uses non-exported internal functions from BLST. Checking for interface-breaking changes in BLST should be done along with auditing changes between the old and new versions. This includes checking logical changes and assumptions beyond interfaces, and assessing their security and performance impact on protocols implemented in Flow crypto. diff --git a/crypto/blst_src/aggregate.c b/crypto/blst_src/aggregate.c deleted file mode 100644 index ca78876acad..00000000000 --- a/crypto/blst_src/aggregate.c +++ /dev/null @@ -1,673 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -/* - * Usage pattern on single-processor system is - * - * blst_pairing_init(ctx, hash_or_encode, DST); - * blst_pairing_aggregate_pk_in_g1(ctx, PK[0], aggregated_signature, msg[0]); - * blst_pairing_aggregate_pk_in_g1(ctx, PK[1], NULL, msg[1]); - * ... - * blst_pairing_commit(ctx); - * blst_pairing_finalverify(ctx, NULL); - * - *********************************************************************** - * Usage pattern on multi-processor system is - * - * blst_pairing_init(pk[0], hash_or_encode, DST); - * blst_pairing_init(pk[1], hash_or_encode, DST); - * ... - * start threads each processing an N/nthreads slice of PKs and messages: - * blst_pairing_aggregate_pk_in_g1(pk[i], PK[i*n+0], NULL, msg[i*n+0]); - * blst_pairing_aggregate_pk_in_g1(pk[i], PK[i*n+1], NULL, msg[i*n+1]); - * ... - * blst_pairing_commit(pkx); - * ... - * meanwhile in main thread - * blst_fp12 gtsig; - * blst_aggregated_in_g2(&gtsig, aggregated_signature); - * join threads and merge their contexts: - * blst_pairing_merge(pk[0], pk[1]); - * blst_pairing_merge(pk[0], pk[2]); - * ...
- * blst_pairing_finalverify(pk[0], gtsig); - */ - -#ifndef N_MAX -# define N_MAX 8 -#endif - -typedef union { POINTonE1 e1; POINTonE2 e2; } AggregatedSignature; -typedef struct { - unsigned int ctrl; - unsigned int nelems; - const void *DST; - size_t DST_len; - vec384fp12 GT; - AggregatedSignature AggrSign; - POINTonE2_affine Q[N_MAX]; - POINTonE1_affine P[N_MAX]; -} PAIRING; - -enum { AGGR_UNDEFINED = 0, - AGGR_MIN_SIG = 1, - AGGR_MIN_PK = 2, - AGGR_SIGN_SET = 0x10, - AGGR_GT_SET = 0x20, - AGGR_HASH_OR_ENCODE = 0x40 }; -#define MIN_SIG_OR_PK (AGGR_MIN_SIG | AGGR_MIN_PK) - -static const size_t sizeof_pairing = (sizeof(PAIRING) + 7) & ~(size_t)7; - -size_t blst_pairing_sizeof(void) -{ return sizeof_pairing; } - -void blst_pairing_init(PAIRING *ctx, int hash_or_encode, - const void *DST, size_t DST_len) -{ - ctx->ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); - ctx->nelems = 0; - ctx->DST = (uptr_t)DST==(uptr_t)((byte *)ctx+sizeof_pairing) ? (void *)42 - : DST; - ctx->DST_len = DST_len; -} - -static const void *pairing_get_dst(const PAIRING *ctx) -{ return (uptr_t)ctx->DST==(uptr_t)42 ? (const byte *)ctx+sizeof_pairing - : ctx->DST; -} - -const void *blst_pairing_get_dst(const PAIRING *ctx) -{ return pairing_get_dst(ctx); } - -#define FROM_AFFINE(out,in) do { \ - vec_copy((out)->X, in->X, 2*sizeof(in->X)), \ - vec_select((out)->Z, in->X, BLS12_381_Rx.p, sizeof(in->X), \ - vec_is_zero(in->X, 2*sizeof(in->X))); } while(0) - -/* - * Optional |nbits|-wide |scalar| is used to facilitate multiple aggregated - * signature verification as discussed at - * https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407. - * Usage pattern is not finalized yet, because (sig != NULL) is better and - * will be handled separately... - */ -static BLST_ERROR PAIRING_Aggregate_PK_in_G2(PAIRING *ctx, - const POINTonE2_affine *PK, - size_t pk_groupcheck, - const POINTonE1_affine *sig, - size_t sig_groupcheck, - const byte *scalar, size_t nbits, - const void *msg, size_t msg_len, - const void *aug, size_t aug_len) -{ - if (ctx->ctrl & AGGR_MIN_PK) - return BLST_AGGR_TYPE_MISMATCH; - - ctx->ctrl |= AGGR_MIN_SIG; - - /* - * Since we don't know if the signature is individual or aggregated, - * the only sensible thing to do is to skip over infinite one and - * count on the corresponding infinite public key to be rejected, - * in case the signature is non-aggregated that is. - */ - if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) { - POINTonE1 *S = &ctx->AggrSign.e1; - POINTonE1 P[1]; - - FROM_AFFINE(P, sig); - - if (sig_groupcheck && !POINTonE1_in_G1(P)) - return BLST_POINT_NOT_IN_GROUP; - - if (ctx->ctrl & AGGR_SIGN_SET) { - if (nbits != 0 && scalar != NULL) { - POINTonE1_mult_w5(P, P, scalar, nbits); - POINTonE1_dadd(S, S, P, NULL); - } else { - POINTonE1_dadd_affine(S, S, sig); - } - } else { - ctx->ctrl |= AGGR_SIGN_SET; - if (nbits != 0 && scalar != NULL) - POINTonE1_mult_w5(S, P, scalar, nbits); - else - vec_copy(S, P, sizeof(P)); - } - } - - if (PK != NULL) { - unsigned int n; - POINTonE1 H[1]; - const void *DST = pairing_get_dst(ctx); - - /* - * Reject infinite public keys. 
- */ - if (vec_is_zero(PK, sizeof(*PK))) - return BLST_PK_IS_INFINITY; - - if (pk_groupcheck) { - POINTonE2 P[1]; - - FROM_AFFINE(P, PK); - if (!POINTonE2_in_G2(P)) - return BLST_POINT_NOT_IN_GROUP; - } - - if (ctx->ctrl & AGGR_HASH_OR_ENCODE) - Hash_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); - else - Encode_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); - - if (nbits != 0 && scalar != NULL) - POINTonE1_mult_w5(H, H, scalar, nbits); - - POINTonE1_from_Jacobian(H, H); - - n = ctx->nelems; - vec_copy(ctx->Q + n, PK, sizeof(POINTonE2_affine)); - vec_copy(ctx->P + n, H, sizeof(POINTonE1_affine)); - if (++n == N_MAX) { - if (ctx->ctrl & AGGR_GT_SET) { - vec384fp12 GT; - miller_loop_n(GT, ctx->Q, ctx->P, n); - mul_fp12(ctx->GT, ctx->GT, GT); - } else { - miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); - ctx->ctrl |= AGGR_GT_SET; - } - n = 0; - } - ctx->nelems = n; - } - - return BLST_SUCCESS; -} - -BLST_ERROR blst_pairing_aggregate_pk_in_g2(PAIRING *ctx, - const POINTonE2_affine *PK, - const POINTonE1_affine *signature, - const void *msg, size_t msg_len, - const void *aug, size_t aug_len) -{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, signature, 1, NULL, 0, - msg, msg_len, aug, aug_len); -} - -BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(PAIRING *ctx, - const POINTonE2_affine *PK, - const POINTonE1_affine *sig, - const byte *scalar, - size_t nbits, - const void *msg, - size_t msg_len, - const void *aug, - size_t aug_len) -{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, sig, 1, scalar, nbits, - msg, msg_len, aug, aug_len); -} - -BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(PAIRING *ctx, - const POINTonE2_affine *PK, - size_t pk_grpchk, - const POINTonE1_affine *signature, - size_t sig_grpchk, - const void *msg, size_t msg_len, - const void *aug, size_t aug_len) -{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, signature, sig_grpchk, - NULL, 0, msg, msg_len, aug, aug_len); -} - -BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(PAIRING *ctx, - const POINTonE2_affine *PK, - size_t pk_grpchk, - const POINTonE1_affine *sig, - size_t sig_grpchk, - const byte *scalar, - size_t nbits, - const void *msg, - size_t msg_len, - const void *aug, - size_t aug_len) -{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, sig, sig_grpchk, - scalar, nbits, - msg, msg_len, aug, aug_len); -} - -static BLST_ERROR PAIRING_Aggregate_PK_in_G1(PAIRING *ctx, - const POINTonE1_affine *PK, - size_t pk_groupcheck, - const POINTonE2_affine *sig, - size_t sig_groupcheck, - const byte *scalar, size_t nbits, - const void *msg, size_t msg_len, - const void *aug, size_t aug_len) -{ - if (ctx->ctrl & AGGR_MIN_SIG) - return BLST_AGGR_TYPE_MISMATCH; - - ctx->ctrl |= AGGR_MIN_PK; - - /* - * Since we don't know if the signature is individual or aggregated, - * the only sensible thing to do is to skip over infinite one and - * count on the corresponding infinite public key to be rejected, - * in case the signature is non-aggregated that is. 
- */ - if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) { - POINTonE2 *S = &ctx->AggrSign.e2; - POINTonE2 P[1]; - - FROM_AFFINE(P, sig); - - if (sig_groupcheck && !POINTonE2_in_G2(P)) - return BLST_POINT_NOT_IN_GROUP; - - if (ctx->ctrl & AGGR_SIGN_SET) { - if (nbits != 0 && scalar != NULL) { - - POINTonE2_mult_w5(P, P, scalar, nbits); - POINTonE2_dadd(S, S, P, NULL); - } else { - POINTonE2_dadd_affine(S, S, sig); - } - } else { - ctx->ctrl |= AGGR_SIGN_SET; - if (nbits != 0 && scalar != NULL) - POINTonE2_mult_w5(S, P, scalar, nbits); - else - vec_copy(S, P, sizeof(P)); - } - } - - if (PK != NULL) { - unsigned int n; - POINTonE2 H[1]; - POINTonE1 pk[1]; - const void *DST = pairing_get_dst(ctx); - - /* - * Reject infinite public keys. - */ - if (vec_is_zero(PK, sizeof(*PK))) - return BLST_PK_IS_INFINITY; - - if (pk_groupcheck) { - POINTonE1 P[1]; - - FROM_AFFINE(P, PK); - if (!POINTonE1_in_G1(P)) - return BLST_POINT_NOT_IN_GROUP; - } - - if (ctx->ctrl & AGGR_HASH_OR_ENCODE) - Hash_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); - else - Encode_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); - - POINTonE2_from_Jacobian(H, H); - - if (nbits != 0 && scalar != NULL) { - FROM_AFFINE(pk, PK); - POINTonE1_mult_w5(pk, pk, scalar, nbits); - POINTonE1_from_Jacobian(pk, pk); - PK = (const POINTonE1_affine *)pk; - } - - n = ctx->nelems; - vec_copy(ctx->Q + n, H, sizeof(POINTonE2_affine)); - vec_copy(ctx->P + n, PK, sizeof(POINTonE1_affine)); - if (++n == N_MAX) { - if (ctx->ctrl & AGGR_GT_SET) { - vec384fp12 GT; - miller_loop_n(GT, ctx->Q, ctx->P, n); - mul_fp12(ctx->GT, ctx->GT, GT); - } else { - miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); - ctx->ctrl |= AGGR_GT_SET; - } - n = 0; - } - ctx->nelems = n; - } - - return BLST_SUCCESS; -} - -BLST_ERROR blst_pairing_aggregate_pk_in_g1(PAIRING *ctx, - const POINTonE1_affine *PK, - const POINTonE2_affine *signature, - const void *msg, size_t msg_len, - const void *aug, size_t aug_len) -{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, signature, 1, NULL, 0, - msg, msg_len, aug, aug_len); -} - -BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(PAIRING *ctx, - const POINTonE1_affine *PK, - const POINTonE2_affine *sig, - const byte *scalar, - size_t nbits, - const void *msg, - size_t msg_len, - const void *aug, - size_t aug_len) -{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, sig, 1, scalar, nbits, - msg, msg_len, aug, aug_len); -} - -BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(PAIRING *ctx, - const POINTonE1_affine *PK, - size_t pk_grpchk, - const POINTonE2_affine *signature, - size_t sig_grpchk, - const void *msg, size_t msg_len, - const void *aug, size_t aug_len) -{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, signature, sig_grpchk, - NULL, 0, msg, msg_len, aug, aug_len); -} - -BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(PAIRING *ctx, - const POINTonE1_affine *PK, - size_t pk_grpchk, - const POINTonE2_affine *sig, - size_t sig_grpchk, - const byte *scalar, - size_t nbits, - const void *msg, - size_t msg_len, - const void *aug, - size_t aug_len) -{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, sig, sig_grpchk, - scalar, nbits, - msg, msg_len, aug, aug_len); -} - -static void PAIRING_Commit(PAIRING *ctx) -{ - unsigned int n; - - if ((n = ctx->nelems) != 0) { - if (ctx->ctrl & AGGR_GT_SET) { - vec384fp12 GT; - miller_loop_n(GT, ctx->Q, ctx->P, n); - mul_fp12(ctx->GT, ctx->GT, GT); - } else { - miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); - ctx->ctrl |= AGGR_GT_SET; - } - ctx->nelems = 0; - } -} - -void 
blst_pairing_commit(PAIRING *ctx) -{ PAIRING_Commit(ctx); } - -BLST_ERROR blst_pairing_merge(PAIRING *ctx, const PAIRING *ctx1) -{ - if ((ctx->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED - && (ctx1->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED - && (ctx->ctrl & ctx1->ctrl & MIN_SIG_OR_PK) == 0) - return BLST_AGGR_TYPE_MISMATCH; - - /* context producers are expected to have called blst_pairing_commit */ - if (ctx->nelems || ctx1->nelems) - return BLST_AGGR_TYPE_MISMATCH; - - ctx->ctrl |= ctx1->ctrl & MIN_SIG_OR_PK; - - switch (ctx->ctrl & MIN_SIG_OR_PK) { - case AGGR_MIN_SIG: - if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) { - POINTonE1_dadd(&ctx->AggrSign.e1, &ctx->AggrSign.e1, - &ctx1->AggrSign.e1, NULL); - } else if (ctx1->ctrl & AGGR_SIGN_SET) { - ctx->ctrl |= AGGR_SIGN_SET; - vec_copy(&ctx->AggrSign.e1, &ctx1->AggrSign.e1, - sizeof(ctx->AggrSign.e1)); - } - break; - case AGGR_MIN_PK: - if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) { - POINTonE2_dadd(&ctx->AggrSign.e2, &ctx->AggrSign.e2, - &ctx1->AggrSign.e2, NULL); - } else if (ctx1->ctrl & AGGR_SIGN_SET) { - ctx->ctrl |= AGGR_SIGN_SET; - vec_copy(&ctx->AggrSign.e2, &ctx1->AggrSign.e2, - sizeof(ctx->AggrSign.e2)); - } - break; - case AGGR_UNDEFINED: - break; - default: - return BLST_AGGR_TYPE_MISMATCH; - } - - if (ctx->ctrl & ctx1->ctrl & AGGR_GT_SET) { - mul_fp12(ctx->GT, ctx->GT, ctx1->GT); - } else if (ctx1->ctrl & AGGR_GT_SET) { - ctx->ctrl |= AGGR_GT_SET; - vec_copy(ctx->GT, ctx1->GT, sizeof(ctx->GT)); - } - - return BLST_SUCCESS; -} - -static bool_t PAIRING_FinalVerify(const PAIRING *ctx, const vec384fp12 GTsig) -{ - vec384fp12 GT; - - if (!(ctx->ctrl & AGGR_GT_SET)) - return 0; - - if (GTsig != NULL) { - vec_copy(GT, GTsig, sizeof(GT)); - } else if (ctx->ctrl & AGGR_SIGN_SET) { - AggregatedSignature AggrSign; - - switch (ctx->ctrl & MIN_SIG_OR_PK) { - case AGGR_MIN_SIG: - POINTonE1_from_Jacobian(&AggrSign.e1, &ctx->AggrSign.e1); - miller_loop_n(GT, (const POINTonE2_affine *)&BLS12_381_G2, - (const POINTonE1_affine *)&AggrSign.e1, 1); - break; - case AGGR_MIN_PK: - POINTonE2_from_Jacobian(&AggrSign.e2, &ctx->AggrSign.e2); - miller_loop_n(GT, (const POINTonE2_affine *)&AggrSign.e2, - (const POINTonE1_affine *)&BLS12_381_G1, 1); - break; - default: - return 0; - } - } else { - /* - * The aggregated signature was infinite, relation between the - * hashes and the public keys has to be VERY special... 
- */ - vec_copy(GT, BLS12_381_Rx.p12, sizeof(GT)); - } - - conjugate_fp12(GT); - mul_fp12(GT, GT, ctx->GT); - final_exp(GT, GT); - - /* return GT==1 */ - return vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & - vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0])); -} - -int blst_pairing_finalverify(const PAIRING *ctx, const vec384fp12 GTsig) -{ return (int)PAIRING_FinalVerify(ctx, GTsig); } - -int blst_fp12_finalverify(const vec384fp12 GT1, const vec384fp12 GT2) -{ - vec384fp12 GT; - - vec_copy(GT, GT1, sizeof(GT)); - conjugate_fp12(GT); - mul_fp12(GT, GT, GT2); - final_exp(GT, GT); - - /* return GT==1 */ - return (int)(vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & - vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0]))); -} - -void blst_pairing_raw_aggregate(PAIRING *ctx, const POINTonE2_affine *q, - const POINTonE1_affine *p) -{ - unsigned int n; - - if (vec_is_zero(q, sizeof(*q)) & vec_is_zero(p, sizeof(*p))) - return; - - n = ctx->nelems; - vec_copy(ctx->Q + n, q, sizeof(*q)); - vec_copy(ctx->P + n, p, sizeof(*p)); - if (++n == N_MAX) { - if (ctx->ctrl & AGGR_GT_SET) { - vec384fp12 GT; - miller_loop_n(GT, ctx->Q, ctx->P, n); - mul_fp12(ctx->GT, ctx->GT, GT); - } else { - miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); - ctx->ctrl |= AGGR_GT_SET; - } - n = 0; - } - ctx->nelems = n; -} - -vec384fp12 *blst_pairing_as_fp12(PAIRING *ctx) -{ - PAIRING_Commit(ctx); - return (vec384fp12 *)ctx->GT; -} - -/* - * PAIRING context-free entry points. - * - * To perform FastAggregateVerify, aggregate all public keys and - * signatures with corresponding blst_aggregate_in_g{12}, convert - * result to affine and call suitable blst_core_verify_pk_in_g{12} - * or blst_aggregated_in_g{12}... - */ -BLST_ERROR blst_aggregate_in_g1(POINTonE1 *out, const POINTonE1 *in, - const unsigned char *zwire) -{ - POINTonE1 P[1]; - BLST_ERROR ret; - - ret = POINTonE1_Deserialize_Z((POINTonE1_affine *)P, zwire); - - if (ret != BLST_SUCCESS) - return ret; - - if (vec_is_zero(P, sizeof(POINTonE1_affine))) { - if (in == NULL) - vec_zero(out, sizeof(*out)); - return BLST_SUCCESS; - } - - vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z)); - - if (!POINTonE1_in_G1(P)) - return BLST_POINT_NOT_IN_GROUP; - - if (in == NULL) - vec_copy(out, P, sizeof(P)); - else - POINTonE1_dadd_affine(out, in, (POINTonE1_affine *)P); - - return BLST_SUCCESS; -} - -BLST_ERROR blst_aggregate_in_g2(POINTonE2 *out, const POINTonE2 *in, - const unsigned char *zwire) -{ - POINTonE2 P[1]; - BLST_ERROR ret; - - ret = POINTonE2_Deserialize_Z((POINTonE2_affine *)P, zwire); - - if (ret != BLST_SUCCESS) - return ret; - - if (vec_is_zero(P, sizeof(POINTonE2_affine))) { - if (in == NULL) - vec_zero(out, sizeof(*out)); - return BLST_SUCCESS; - } - - vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z)); - - if (!POINTonE2_in_G2(P)) - return BLST_POINT_NOT_IN_GROUP; - - if (in == NULL) { - vec_copy(out, P, sizeof(P)); - } else { - POINTonE2_dadd_affine(out, in, (POINTonE2_affine *)P); - } - return BLST_SUCCESS; -} - -void blst_aggregated_in_g1(vec384fp12 ret, const POINTonE1_affine *sig) -{ miller_loop_n(ret, (const POINTonE2_affine *)&BLS12_381_G2, sig, 1); } - -void blst_aggregated_in_g2(vec384fp12 ret, const POINTonE2_affine *sig) -{ miller_loop_n(ret, sig, (const POINTonE1_affine *)&BLS12_381_G1, 1); } - -BLST_ERROR blst_core_verify_pk_in_g1(const POINTonE1_affine *pk, - const POINTonE2_affine *signature, - int hash_or_encode, - const void *msg, size_t msg_len, - const void *DST, size_t DST_len, - const void *aug, size_t aug_len) -{ - PAIRING ctx; - 
BLST_ERROR ret; - - ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); - ctx.nelems = 0; - ctx.DST = DST; - ctx.DST_len = DST_len; - - ret = PAIRING_Aggregate_PK_in_G1(&ctx, pk, 1, signature, 1, NULL, 0, - msg, msg_len, aug, aug_len); - if (ret != BLST_SUCCESS) - return ret; - - PAIRING_Commit(&ctx); - - return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; -} - -BLST_ERROR blst_core_verify_pk_in_g2(const POINTonE2_affine *pk, - const POINTonE1_affine *signature, - int hash_or_encode, - const void *msg, size_t msg_len, - const void *DST, size_t DST_len, - const void *aug, size_t aug_len) -{ - PAIRING ctx; - BLST_ERROR ret; - - ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); - ctx.nelems = 0; - ctx.DST = DST; - ctx.DST_len = DST_len; - - ret = PAIRING_Aggregate_PK_in_G2(&ctx, pk, 1, signature, 1, NULL, 0, - msg, msg_len, aug, aug_len); - if (ret != BLST_SUCCESS) - return ret; - - PAIRING_Commit(&ctx); - - return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; -} diff --git a/crypto/blst_src/blst_src.c b/crypto/blst_src/blst_src.c deleted file mode 100644 index 9e064657e72..00000000000 --- a/crypto/blst_src/blst_src.c +++ /dev/null @@ -1,24 +0,0 @@ -// This file contains all BLST lib C files needed for -// Flow crypto. -// -// The list may need to be updated if a new version of BLST is used. - -#include "keygen.c" -#include "hash_to_field.c" -#include "e1.c" -#include "map_to_g1.c" -#include "e2.c" -#include "map_to_g2.c" -#include "fp12_tower.c" -#include "pairing.c" -#include "exp.c" -#include "sqrt.c" -#include "recip.c" -#include "aggregate.c" -#include "bulk_addition.c" -#include "multi_scalar.c" -#include "consts.c" -#include "vect.c" -#include "exports.c" - - diff --git a/crypto/blst_src/build/assembly.S b/crypto/blst_src/build/assembly.S deleted file mode 100644 index c0c5db30850..00000000000 --- a/crypto/blst_src/build/assembly.S +++ /dev/null @@ -1,116 +0,0 @@ -#if defined(__x86_64) || defined(__x86_64__) -# if defined(__ELF__) -# if defined(__BLST_PORTABLE__) -# include "elf/sha256-portable-x86_64.s" -# define blst_sha256_block_data_order blst_sha256_block_ssse3 -# endif -# include "elf/sha256-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "elf/ctx_inverse_mod_384-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "elf/ctq_inverse_mod_384-x86_64.s" -# endif -# include "elf/add_mod_384-x86_64.s" -# include "elf/add_mod_384x384-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "elf/mulx_mont_384-x86_64.s" -# include "elf/mulx_mont_256-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "elf/mulq_mont_384-x86_64.s" -# include "elf/mulq_mont_256-x86_64.s" -# endif -# include "elf/add_mod_256-x86_64.s" -# include "elf/ct_inverse_mod_256-x86_64.s" -# include "elf/div3w-x86_64.s" -# include "elf/ct_is_square_mod_384-x86_64.s" -# elif defined(_WIN64) || defined(__CYGWIN__) -# include "coff/sha256-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "coff/ctx_inverse_mod_384-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "coff/ctq_inverse_mod_384-x86_64.s" -# endif -# include "coff/add_mod_384-x86_64.s" -# include "coff/add_mod_384x384-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "coff/mulx_mont_384-x86_64.s" -# include "coff/mulx_mont_256-x86_64.s" -# endif -# if !defined(__ADX__) || 
defined(__BLST_PORTABLE__) -# include "coff/mulq_mont_384-x86_64.s" -# include "coff/mulq_mont_256-x86_64.s" -# endif -# include "coff/add_mod_256-x86_64.s" -# include "coff/ct_inverse_mod_256-x86_64.s" -# include "coff/div3w-x86_64.s" -# include "coff/ct_is_square_mod_384-x86_64.s" -# elif defined(__APPLE__) -# include "mach-o/sha256-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "mach-o/ctx_inverse_mod_384-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "mach-o/ctq_inverse_mod_384-x86_64.s" -# endif -# include "mach-o/add_mod_384-x86_64.s" -# include "mach-o/add_mod_384x384-x86_64.s" -# if defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "mach-o/mulx_mont_384-x86_64.s" -# include "mach-o/mulx_mont_256-x86_64.s" -# endif -# if !defined(__ADX__) || defined(__BLST_PORTABLE__) -# include "mach-o/mulq_mont_384-x86_64.s" -# include "mach-o/mulq_mont_256-x86_64.s" -# endif -# include "mach-o/add_mod_256-x86_64.s" -# include "mach-o/ct_inverse_mod_256-x86_64.s" -# include "mach-o/div3w-x86_64.s" -# include "mach-o/ct_is_square_mod_384-x86_64.s" -# endif -#elif defined(__aarch64__) -# if defined(__ELF__) -# include "elf/sha256-armv8.S" -# include "elf/ct_inverse_mod_384-armv8.S" -# include "elf/add_mod_384-armv8.S" -# define __add_mod_384 __add_mont_384 -# define __sub_mod_384 __sub_mont_384 -# include "elf/mul_mont_384-armv8.S" -# include "elf/mul_mont_256-armv8.S" -# include "elf/add_mod_256-armv8.S" -# include "elf/ct_inverse_mod_256-armv8.S" -# include "elf/div3w-armv8.S" -# include "elf/ct_is_square_mod_384-armv8.S" -# elif defined(_WIN64) -# include "coff/sha256-armv8.S" -# include "coff/ct_inverse_mod_384-armv8.S" -# include "coff/add_mod_384-armv8.S" -# define __add_mod_384 __add_mont_384 -# define __sub_mod_384 __sub_mont_384 -# include "coff/mul_mont_384-armv8.S" -# include "coff/mul_mont_256-armv8.S" -# include "coff/add_mod_256-armv8.S" -# include "coff/ct_inverse_mod_256-armv8.S" -# include "coff/div3w-armv8.S" -# include "coff/ct_is_square_mod_384-armv8.S" -# elif defined(__APPLE__) -# include "mach-o/sha256-armv8.S" -# include "mach-o/ct_inverse_mod_384-armv8.S" -# include "mach-o/add_mod_384-armv8.S" -# define __add_mod_384 __add_mont_384 -# define __sub_mod_384 __sub_mont_384 -# include "mach-o/mul_mont_384-armv8.S" -# include "mach-o/mul_mont_256-armv8.S" -# include "mach-o/add_mod_256-armv8.S" -# include "mach-o/ct_inverse_mod_256-armv8.S" -# include "mach-o/div3w-armv8.S" -# include "mach-o/ct_is_square_mod_384-armv8.S" -# endif -#elif defined(__BLST_NO_ASM__) || \ - (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__==4) -/* inaccurate way to detect a 32-bit processor, but it's close enough */ -#else -# error "unsupported platform" -#endif diff --git a/crypto/blst_src/build/bindings_trim.pl b/crypto/blst_src/build/bindings_trim.pl deleted file mode 100755 index 0880352d79e..00000000000 --- a/crypto/blst_src/build/bindings_trim.pl +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env perl - -# read whole file -while(<>) { push @file, $_; } - -# traverse and remove auto-generated PartialEq for chosen types -for (my $i = 0; $i <= $#file; $i++) { - if (@file[$i] =~ m/pub\s+(?:struct|enum)\s+(\w+)/) { - push @structs, $1; - } - - if (@file[$i] =~ m/struct\s+blst_p[12]/) { - @file[$i-1] =~ s/,\s*PartialEq//; - } elsif (@file[$i] =~ m/struct\s+blst_fp12/) { - @file[$i-1] =~ s/,\s*(?:Default|PartialEq)//g; - } elsif (@file[$i] =~ m/struct\s+(blst_pairing|blst_uniq)/) { - @file[$i-1] =~ 
s/,\s*(?:Copy|Clone|Eq|PartialEq)//g; - } elsif (@file[$i] =~ m/struct\s+blst_scalar/) { - @file[$i-1] =~ s/,\s*Copy//; - @file[$i-1] =~ s/\)/, Zeroize\)/; - splice @file, $i, 0, "#[zeroize(drop)]\n"; $i++; - } else { - @file[$i] =~ s/::std::/::core::/g; - } -} - -print @file; - -print << '___'; -#[test] -fn bindgen_test_normal_types() { - // from "Rust for Rustaceans" by Jon Gjengset - fn is_normal() {} -___ -for (@structs) { - print " is_normal::<$_>();\n"; -} -print "}\n"; - -close STDOUT; diff --git a/crypto/blst_src/build/coff/add_mod_256-armv8.S b/crypto/blst_src/build/coff/add_mod_256-armv8.S deleted file mode 100644 index 27b64ef4ca4..00000000000 --- a/crypto/blst_src/build/coff/add_mod_256-armv8.S +++ /dev/null @@ -1,397 +0,0 @@ -.text - -.globl add_mod_256 - -.def add_mod_256; -.type 32; -.endef -.p2align 5 -add_mod_256: - ldp x8,x9,[x1] - ldp x12,x13,[x2] - - ldp x10,x11,[x1,#16] - adds x8,x8,x12 - ldp x14,x15,[x2,#16] - adcs x9,x9,x13 - ldp x4,x5,[x3] - adcs x10,x10,x14 - ldp x6,x7,[x3,#16] - adcs x11,x11,x15 - adc x3,xzr,xzr - - subs x16,x8,x4 - sbcs x17,x9,x5 - sbcs x1,x10,x6 - sbcs x2,x11,x7 - sbcs xzr,x3,xzr - - csel x8,x8,x16,lo - csel x9,x9,x17,lo - csel x10,x10,x1,lo - stp x8,x9,[x0] - csel x11,x11,x2,lo - stp x10,x11,[x0,#16] - - ret - - -.globl mul_by_3_mod_256 - -.def mul_by_3_mod_256; -.type 32; -.endef -.p2align 5 -mul_by_3_mod_256: - ldp x12,x13,[x1] - ldp x14,x15,[x1,#16] - - adds x8,x12,x12 - ldp x4,x5,[x2] - adcs x9,x13,x13 - ldp x6,x7,[x2,#16] - adcs x10,x14,x14 - adcs x11,x15,x15 - adc x3,xzr,xzr - - subs x16,x8,x4 - sbcs x17,x9,x5 - sbcs x1,x10,x6 - sbcs x2,x11,x7 - sbcs xzr,x3,xzr - - csel x8,x8,x16,lo - csel x9,x9,x17,lo - csel x10,x10,x1,lo - csel x11,x11,x2,lo - - adds x8,x8,x12 - adcs x9,x9,x13 - adcs x10,x10,x14 - adcs x11,x11,x15 - adc x3,xzr,xzr - - subs x16,x8,x4 - sbcs x17,x9,x5 - sbcs x1,x10,x6 - sbcs x2,x11,x7 - sbcs xzr,x3,xzr - - csel x8,x8,x16,lo - csel x9,x9,x17,lo - csel x10,x10,x1,lo - stp x8,x9,[x0] - csel x11,x11,x2,lo - stp x10,x11,[x0,#16] - - ret - - -.globl lshift_mod_256 - -.def lshift_mod_256; -.type 32; -.endef -.p2align 5 -lshift_mod_256: - ldp x8,x9,[x1] - ldp x10,x11,[x1,#16] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - -.Loop_lshift_mod_256: - adds x8,x8,x8 - sub x2,x2,#1 - adcs x9,x9,x9 - adcs x10,x10,x10 - adcs x11,x11,x11 - adc x3,xzr,xzr - - subs x12,x8,x4 - sbcs x13,x9,x5 - sbcs x14,x10,x6 - sbcs x15,x11,x7 - sbcs xzr,x3,xzr - - csel x8,x8,x12,lo - csel x9,x9,x13,lo - csel x10,x10,x14,lo - csel x11,x11,x15,lo - - cbnz x2,.Loop_lshift_mod_256 - - stp x8,x9,[x0] - stp x10,x11,[x0,#16] - - ret - - -.globl rshift_mod_256 - -.def rshift_mod_256; -.type 32; -.endef -.p2align 5 -rshift_mod_256: - ldp x8,x9,[x1] - ldp x10,x11,[x1,#16] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - -.Loop_rshift: - adds x12,x8,x4 - sub x2,x2,#1 - adcs x13,x9,x5 - adcs x14,x10,x6 - adcs x15,x11,x7 - adc x3,xzr,xzr - tst x8,#1 - - csel x12,x12,x8,ne - csel x13,x13,x9,ne - csel x14,x14,x10,ne - csel x15,x15,x11,ne - csel x3,x3,xzr,ne - - extr x8,x13,x12,#1 - extr x9,x14,x13,#1 - extr x10,x15,x14,#1 - extr x11,x3,x15,#1 - - cbnz x2,.Loop_rshift - - stp x8,x9,[x0] - stp x10,x11,[x0,#16] - - ret - - -.globl cneg_mod_256 - -.def cneg_mod_256; -.type 32; -.endef -.p2align 5 -cneg_mod_256: - ldp x8,x9,[x1] - ldp x4,x5,[x3] - - ldp x10,x11,[x1,#16] - subs x12,x4,x8 - ldp x6,x7,[x3,#16] - orr x4,x8,x9 - sbcs x13,x5,x9 - orr x5,x10,x11 - sbcs x14,x6,x10 - orr x3,x4,x5 - sbc x15,x7,x11 - - cmp x3,#0 - csetm x3,ne - ands x2,x2,x3 - - csel x8,x8,x12,eq - csel x9,x9,x13,eq 
- csel x10,x10,x14,eq - stp x8,x9,[x0] - csel x11,x11,x15,eq - stp x10,x11,[x0,#16] - - ret - - -.globl sub_mod_256 - -.def sub_mod_256; -.type 32; -.endef -.p2align 5 -sub_mod_256: - ldp x8,x9,[x1] - ldp x12,x13,[x2] - - ldp x10,x11,[x1,#16] - subs x8,x8,x12 - ldp x14,x15,[x2,#16] - sbcs x9,x9,x13 - ldp x4,x5,[x3] - sbcs x10,x10,x14 - ldp x6,x7,[x3,#16] - sbcs x11,x11,x15 - sbc x3,xzr,xzr - - and x4,x4,x3 - and x5,x5,x3 - adds x8,x8,x4 - and x6,x6,x3 - adcs x9,x9,x5 - and x7,x7,x3 - adcs x10,x10,x6 - stp x8,x9,[x0] - adc x11,x11,x7 - stp x10,x11,[x0,#16] - - ret - - -.globl check_mod_256 - -.def check_mod_256; -.type 32; -.endef -.p2align 5 -check_mod_256: - ldp x8,x9,[x0] - ldp x10,x11,[x0,#16] - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x9,x9 - rev x10,x10 - rev x11,x11 -#endif - - subs xzr,x8,x4 - sbcs xzr,x9,x5 - orr x8,x8,x9 - sbcs xzr,x10,x6 - orr x8,x8,x10 - sbcs xzr,x11,x7 - orr x8,x8,x11 - sbc x1,xzr,xzr - - cmp x8,#0 - mov x0,#1 - csel x0,x0,xzr,ne - and x0,x0,x1 - - ret - - -.globl add_n_check_mod_256 - -.def add_n_check_mod_256; -.type 32; -.endef -.p2align 5 -add_n_check_mod_256: - ldp x8,x9,[x1] - ldp x12,x13,[x2] - ldp x10,x11,[x1,#16] - ldp x14,x15,[x2,#16] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 - rev x10,x10 - rev x14,x14 - rev x11,x11 - rev x15,x15 -#endif - - adds x8,x8,x12 - ldp x4,x5,[x3] - adcs x9,x9,x13 - ldp x6,x7,[x3,#16] - adcs x10,x10,x14 - adcs x11,x11,x15 - adc x3,xzr,xzr - - subs x16,x8,x4 - sbcs x17,x9,x5 - sbcs x1,x10,x6 - sbcs x2,x11,x7 - sbcs xzr,x3,xzr - - csel x8,x8,x16,lo - csel x9,x9,x17,lo - csel x10,x10,x1,lo - csel x11,x11,x2,lo - - orr x16, x8, x9 - orr x17, x10, x11 - orr x16, x16, x17 - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x9,x9 - rev x10,x10 - rev x11,x11 -#endif - - stp x8,x9,[x0] - stp x10,x11,[x0,#16] - - mov x17, #1 - cmp x16, #0 - csel x0, x17, xzr, ne - - ret - - -.globl sub_n_check_mod_256 - -.def sub_n_check_mod_256; -.type 32; -.endef -.p2align 5 -sub_n_check_mod_256: - ldp x8,x9,[x1] - ldp x12,x13,[x2] - ldp x10,x11,[x1,#16] - ldp x14,x15,[x2,#16] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 - rev x10,x10 - rev x14,x14 - rev x11,x11 - rev x15,x15 -#endif - - subs x8,x8,x12 - sbcs x9,x9,x13 - ldp x4,x5,[x3] - sbcs x10,x10,x14 - ldp x6,x7,[x3,#16] - sbcs x11,x11,x15 - sbc x3,xzr,xzr - - and x4,x4,x3 - and x5,x5,x3 - adds x8,x8,x4 - and x6,x6,x3 - adcs x9,x9,x5 - and x7,x7,x3 - adcs x10,x10,x6 - adc x11,x11,x7 - - orr x16, x8, x9 - orr x17, x10, x11 - orr x16, x16, x17 - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x9,x9 - rev x10,x10 - rev x11,x11 -#endif - - stp x8,x9,[x0] - stp x10,x11,[x0,#16] - - mov x17, #1 - cmp x16, #0 - csel x0, x17, xzr, ne - - ret - diff --git a/crypto/blst_src/build/coff/add_mod_256-x86_64.s b/crypto/blst_src/build/coff/add_mod_256-x86_64.s deleted file mode 100644 index c2c83502a18..00000000000 --- a/crypto/blst_src/build/coff/add_mod_256-x86_64.s +++ /dev/null @@ -1,924 +0,0 @@ -.text - -.globl add_mod_256 - -.def add_mod_256; .scl 2; .type 32; .endef -.p2align 5 -add_mod_256: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_add_mod_256: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - subq $8,%rsp - -.LSEH_body_add_mod_256: - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - -.Loaded_a_add_mod_256: - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - movq %r8,%rax - adcq 16(%rdx),%r10 
- movq %r9,%rsi - adcq 24(%rdx),%r11 - sbbq %rdx,%rdx - - movq %r10,%rbx - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - sbbq 16(%rcx),%r10 - movq %r11,%rbp - sbbq 24(%rcx),%r11 - sbbq $0,%rdx - - cmovcq %rax,%r8 - cmovcq %rsi,%r9 - movq %r8,0(%rdi) - cmovcq %rbx,%r10 - movq %r9,8(%rdi) - cmovcq %rbp,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - movq 8(%rsp),%rbx - - movq 16(%rsp),%rbp - - leaq 24(%rsp),%rsp - -.LSEH_epilogue_add_mod_256: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_add_mod_256: - - -.globl mul_by_3_mod_256 - -.def mul_by_3_mod_256; .scl 2; .type 32; .endef -.p2align 5 -mul_by_3_mod_256: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_mul_by_3_mod_256: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - pushq %rbx - - pushq %r12 - -.LSEH_body_mul_by_3_mod_256: - - - movq %rdx,%rcx - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq %rsi,%rdx - movq 24(%rsi),%r11 - - call __lshift_mod_256 - movq 0(%rsp),%r12 - - jmp .Loaded_a_add_mod_256 - - movq 8(%rsp),%rbx - - movq 16(%rsp),%rbp - - leaq 24(%rsp),%rsp - -.LSEH_epilogue_mul_by_3_mod_256: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_mul_by_3_mod_256: - -.def __lshift_mod_256; .scl 3; .type 32; .endef -.p2align 5 -__lshift_mod_256: - .byte 0xf3,0x0f,0x1e,0xfa - - addq %r8,%r8 - adcq %r9,%r9 - movq %r8,%rax - adcq %r10,%r10 - movq %r9,%rsi - adcq %r11,%r11 - sbbq %r12,%r12 - - movq %r10,%rbx - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - sbbq 16(%rcx),%r10 - movq %r11,%rbp - sbbq 24(%rcx),%r11 - sbbq $0,%r12 - - cmovcq %rax,%r8 - cmovcq %rsi,%r9 - cmovcq %rbx,%r10 - cmovcq %rbp,%r11 - - .byte 0xf3,0xc3 - - - -.globl lshift_mod_256 - -.def lshift_mod_256; .scl 2; .type 32; .endef -.p2align 5 -lshift_mod_256: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_lshift_mod_256: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - pushq %r12 - -.LSEH_body_lshift_mod_256: - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - -.Loop_lshift_mod_256: - call __lshift_mod_256 - decl %edx - jnz .Loop_lshift_mod_256 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - movq 0(%rsp),%r12 - - movq 8(%rsp),%rbx - - movq 16(%rsp),%rbp - - leaq 24(%rsp),%rsp - -.LSEH_epilogue_lshift_mod_256: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_lshift_mod_256: - - -.globl rshift_mod_256 - -.def rshift_mod_256; .scl 2; .type 32; .endef -.p2align 5 -rshift_mod_256: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_rshift_mod_256: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - subq $8,%rsp - -.LSEH_body_rshift_mod_256: - - - movq 0(%rsi),%rbp - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - -.Loop_rshift_mod_256: - movq %rbp,%r8 - andq $1,%rbp - movq 0(%rcx),%rax - negq %rbp - movq 8(%rcx),%rsi - movq 16(%rcx),%rbx - - andq %rbp,%rax - andq %rbp,%rsi - andq %rbp,%rbx - andq 24(%rcx),%rbp - - addq %rax,%r8 - adcq %rsi,%r9 - adcq %rbx,%r10 - adcq %rbp,%r11 - sbbq %rax,%rax - - shrq $1,%r8 - movq %r9,%rbp - shrq $1,%r9 - movq %r10,%rbx - shrq $1,%r10 - movq %r11,%rsi - shrq $1,%r11 - - shlq $63,%rbp - shlq $63,%rbx - orq %r8,%rbp - shlq $63,%rsi - orq %rbx,%r9 - shlq $63,%rax - orq %rsi,%r10 - orq %rax,%r11 - - decl %edx - jnz 
.Loop_rshift_mod_256 - - movq %rbp,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - movq 8(%rsp),%rbx - - movq 16(%rsp),%rbp - - leaq 24(%rsp),%rsp - -.LSEH_epilogue_rshift_mod_256: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_rshift_mod_256: - - -.globl cneg_mod_256 - -.def cneg_mod_256; .scl 2; .type 32; .endef -.p2align 5 -cneg_mod_256: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_cneg_mod_256: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - pushq %r12 - -.LSEH_body_cneg_mod_256: - - - movq 0(%rsi),%r12 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq %r12,%r8 - movq 24(%rsi),%r11 - orq %r9,%r12 - orq %r10,%r12 - orq %r11,%r12 - movq $-1,%rbp - - movq 0(%rcx),%rax - cmovnzq %rbp,%r12 - movq 8(%rcx),%rsi - movq 16(%rcx),%rbx - andq %r12,%rax - movq 24(%rcx),%rbp - andq %r12,%rsi - andq %r12,%rbx - andq %r12,%rbp - - subq %r8,%rax - sbbq %r9,%rsi - sbbq %r10,%rbx - sbbq %r11,%rbp - - orq %rdx,%rdx - - cmovzq %r8,%rax - cmovzq %r9,%rsi - movq %rax,0(%rdi) - cmovzq %r10,%rbx - movq %rsi,8(%rdi) - cmovzq %r11,%rbp - movq %rbx,16(%rdi) - movq %rbp,24(%rdi) - - movq 0(%rsp),%r12 - - movq 8(%rsp),%rbx - - movq 16(%rsp),%rbp - - leaq 24(%rsp),%rsp - -.LSEH_epilogue_cneg_mod_256: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_cneg_mod_256: - - -.globl sub_mod_256 - -.def sub_mod_256; .scl 2; .type 32; .endef -.p2align 5 -sub_mod_256: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sub_mod_256: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - subq $8,%rsp - -.LSEH_body_sub_mod_256: - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - - subq 0(%rdx),%r8 - movq 0(%rcx),%rax - sbbq 8(%rdx),%r9 - movq 8(%rcx),%rsi - sbbq 16(%rdx),%r10 - movq 16(%rcx),%rbx - sbbq 24(%rdx),%r11 - movq 24(%rcx),%rbp - sbbq %rdx,%rdx - - andq %rdx,%rax - andq %rdx,%rsi - andq %rdx,%rbx - andq %rdx,%rbp - - addq %rax,%r8 - adcq %rsi,%r9 - movq %r8,0(%rdi) - adcq %rbx,%r10 - movq %r9,8(%rdi) - adcq %rbp,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - movq 8(%rsp),%rbx - - movq 16(%rsp),%rbp - - leaq 24(%rsp),%rsp - -.LSEH_epilogue_sub_mod_256: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_sub_mod_256: - - -.globl check_mod_256 - -.def check_mod_256; .scl 2; .type 32; .endef -.p2align 5 -check_mod_256: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_check_mod_256: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq 0(%rdi),%rax - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - - movq %rax,%r8 - orq %r9,%rax - orq %r10,%rax - orq %r11,%rax - - subq 0(%rsi),%r8 - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - sbbq 24(%rsi),%r11 - sbbq %rsi,%rsi - - movq $1,%rdx - cmpq $0,%rax - cmovneq %rdx,%rax - andq %rsi,%rax -.LSEH_epilogue_check_mod_256: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_check_mod_256: - - -.globl add_n_check_mod_256 - -.def add_n_check_mod_256; .scl 2; .type 32; .endef -.p2align 5 -add_n_check_mod_256: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_add_n_check_mod_256: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - subq $8,%rsp - -.LSEH_body_add_n_check_mod_256: - - - movq 0(%rsi),%r8 - 
movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - movq %r8,%rax - adcq 16(%rdx),%r10 - movq %r9,%rsi - adcq 24(%rdx),%r11 - sbbq %rdx,%rdx - - movq %r10,%rbx - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - sbbq 16(%rcx),%r10 - movq %r11,%rbp - sbbq 24(%rcx),%r11 - sbbq $0,%rdx - - cmovcq %rax,%r8 - cmovcq %rsi,%r9 - movq %r8,0(%rdi) - cmovcq %rbx,%r10 - movq %r9,8(%rdi) - cmovcq %rbp,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - orq %r9,%r8 - orq %r11,%r10 - orq %r10,%r8 - movq $1,%rax - cmovzq %r8,%rax - - movq 8(%rsp),%rbx - - movq 16(%rsp),%rbp - - leaq 24(%rsp),%rsp - -.LSEH_epilogue_add_n_check_mod_256: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_add_n_check_mod_256: - - -.globl sub_n_check_mod_256 - -.def sub_n_check_mod_256; .scl 2; .type 32; .endef -.p2align 5 -sub_n_check_mod_256: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sub_n_check_mod_256: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - subq $8,%rsp - -.LSEH_body_sub_n_check_mod_256: - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - - subq 0(%rdx),%r8 - movq 0(%rcx),%rax - sbbq 8(%rdx),%r9 - movq 8(%rcx),%rsi - sbbq 16(%rdx),%r10 - movq 16(%rcx),%rbx - sbbq 24(%rdx),%r11 - movq 24(%rcx),%rbp - sbbq %rdx,%rdx - - andq %rdx,%rax - andq %rdx,%rsi - andq %rdx,%rbx - andq %rdx,%rbp - - addq %rax,%r8 - adcq %rsi,%r9 - movq %r8,0(%rdi) - adcq %rbx,%r10 - movq %r9,8(%rdi) - adcq %rbp,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - orq %r9,%r8 - orq %r11,%r10 - orq %r10,%r8 - movq $1,%rax - cmovzq %r8,%rax - - movq 8(%rsp),%rbx - - movq 16(%rsp),%rbp - - leaq 24(%rsp),%rsp - -.LSEH_epilogue_sub_n_check_mod_256: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_sub_n_check_mod_256: -.section .pdata -.p2align 2 -.rva .LSEH_begin_add_mod_256 -.rva .LSEH_body_add_mod_256 -.rva .LSEH_info_add_mod_256_prologue - -.rva .LSEH_body_add_mod_256 -.rva .LSEH_epilogue_add_mod_256 -.rva .LSEH_info_add_mod_256_body - -.rva .LSEH_epilogue_add_mod_256 -.rva .LSEH_end_add_mod_256 -.rva .LSEH_info_add_mod_256_epilogue - -.rva .LSEH_begin_mul_by_3_mod_256 -.rva .LSEH_body_mul_by_3_mod_256 -.rva .LSEH_info_mul_by_3_mod_256_prologue - -.rva .LSEH_body_mul_by_3_mod_256 -.rva .LSEH_epilogue_mul_by_3_mod_256 -.rva .LSEH_info_mul_by_3_mod_256_body - -.rva .LSEH_epilogue_mul_by_3_mod_256 -.rva .LSEH_end_mul_by_3_mod_256 -.rva .LSEH_info_mul_by_3_mod_256_epilogue - -.rva .LSEH_begin_lshift_mod_256 -.rva .LSEH_body_lshift_mod_256 -.rva .LSEH_info_lshift_mod_256_prologue - -.rva .LSEH_body_lshift_mod_256 -.rva .LSEH_epilogue_lshift_mod_256 -.rva .LSEH_info_lshift_mod_256_body - -.rva .LSEH_epilogue_lshift_mod_256 -.rva .LSEH_end_lshift_mod_256 -.rva .LSEH_info_lshift_mod_256_epilogue - -.rva .LSEH_begin_rshift_mod_256 -.rva .LSEH_body_rshift_mod_256 -.rva .LSEH_info_rshift_mod_256_prologue - -.rva .LSEH_body_rshift_mod_256 -.rva .LSEH_epilogue_rshift_mod_256 -.rva .LSEH_info_rshift_mod_256_body - -.rva .LSEH_epilogue_rshift_mod_256 -.rva .LSEH_end_rshift_mod_256 -.rva .LSEH_info_rshift_mod_256_epilogue - -.rva .LSEH_begin_cneg_mod_256 -.rva .LSEH_body_cneg_mod_256 -.rva .LSEH_info_cneg_mod_256_prologue - -.rva .LSEH_body_cneg_mod_256 -.rva .LSEH_epilogue_cneg_mod_256 -.rva .LSEH_info_cneg_mod_256_body - -.rva .LSEH_epilogue_cneg_mod_256 -.rva .LSEH_end_cneg_mod_256 -.rva .LSEH_info_cneg_mod_256_epilogue - -.rva 
.LSEH_begin_sub_mod_256 -.rva .LSEH_body_sub_mod_256 -.rva .LSEH_info_sub_mod_256_prologue - -.rva .LSEH_body_sub_mod_256 -.rva .LSEH_epilogue_sub_mod_256 -.rva .LSEH_info_sub_mod_256_body - -.rva .LSEH_epilogue_sub_mod_256 -.rva .LSEH_end_sub_mod_256 -.rva .LSEH_info_sub_mod_256_epilogue - -.rva .LSEH_epilogue_check_mod_256 -.rva .LSEH_end_check_mod_256 -.rva .LSEH_info_check_mod_256_epilogue - -.rva .LSEH_begin_add_n_check_mod_256 -.rva .LSEH_body_add_n_check_mod_256 -.rva .LSEH_info_add_n_check_mod_256_prologue - -.rva .LSEH_body_add_n_check_mod_256 -.rva .LSEH_epilogue_add_n_check_mod_256 -.rva .LSEH_info_add_n_check_mod_256_body - -.rva .LSEH_epilogue_add_n_check_mod_256 -.rva .LSEH_end_add_n_check_mod_256 -.rva .LSEH_info_add_n_check_mod_256_epilogue - -.rva .LSEH_begin_sub_n_check_mod_256 -.rva .LSEH_body_sub_n_check_mod_256 -.rva .LSEH_info_sub_n_check_mod_256_prologue - -.rva .LSEH_body_sub_n_check_mod_256 -.rva .LSEH_epilogue_sub_n_check_mod_256 -.rva .LSEH_info_sub_n_check_mod_256_body - -.rva .LSEH_epilogue_sub_n_check_mod_256 -.rva .LSEH_end_sub_n_check_mod_256 -.rva .LSEH_info_sub_n_check_mod_256_epilogue - -.section .xdata -.p2align 3 -.LSEH_info_add_mod_256_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_add_mod_256_body: -.byte 1,0,9,0 -.byte 0x00,0x34,0x01,0x00 -.byte 0x00,0x54,0x02,0x00 -.byte 0x00,0x74,0x04,0x00 -.byte 0x00,0x64,0x05,0x00 -.byte 0x00,0x22 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_add_mod_256_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_mul_by_3_mod_256_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_mul_by_3_mod_256_body: -.byte 1,0,11,0 -.byte 0x00,0xc4,0x00,0x00 -.byte 0x00,0x34,0x01,0x00 -.byte 0x00,0x54,0x02,0x00 -.byte 0x00,0x74,0x04,0x00 -.byte 0x00,0x64,0x05,0x00 -.byte 0x00,0x22 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.LSEH_info_mul_by_3_mod_256_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_lshift_mod_256_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_lshift_mod_256_body: -.byte 1,0,11,0 -.byte 0x00,0xc4,0x00,0x00 -.byte 0x00,0x34,0x01,0x00 -.byte 0x00,0x54,0x02,0x00 -.byte 0x00,0x74,0x04,0x00 -.byte 0x00,0x64,0x05,0x00 -.byte 0x00,0x22 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.LSEH_info_lshift_mod_256_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_rshift_mod_256_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_rshift_mod_256_body: -.byte 1,0,9,0 -.byte 0x00,0x34,0x01,0x00 -.byte 0x00,0x54,0x02,0x00 -.byte 0x00,0x74,0x04,0x00 -.byte 0x00,0x64,0x05,0x00 -.byte 0x00,0x22 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_rshift_mod_256_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_cneg_mod_256_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_cneg_mod_256_body: -.byte 1,0,11,0 -.byte 0x00,0xc4,0x00,0x00 -.byte 0x00,0x34,0x01,0x00 -.byte 0x00,0x54,0x02,0x00 -.byte 0x00,0x74,0x04,0x00 -.byte 0x00,0x64,0x05,0x00 -.byte 0x00,0x22 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 
-.LSEH_info_cneg_mod_256_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_sub_mod_256_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_sub_mod_256_body: -.byte 1,0,9,0 -.byte 0x00,0x34,0x01,0x00 -.byte 0x00,0x54,0x02,0x00 -.byte 0x00,0x74,0x04,0x00 -.byte 0x00,0x64,0x05,0x00 -.byte 0x00,0x22 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_sub_mod_256_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_check_mod_256_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_add_n_check_mod_256_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_add_n_check_mod_256_body: -.byte 1,0,9,0 -.byte 0x00,0x34,0x01,0x00 -.byte 0x00,0x54,0x02,0x00 -.byte 0x00,0x74,0x04,0x00 -.byte 0x00,0x64,0x05,0x00 -.byte 0x00,0x22 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_add_n_check_mod_256_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_sub_n_check_mod_256_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_sub_n_check_mod_256_body: -.byte 1,0,9,0 -.byte 0x00,0x34,0x01,0x00 -.byte 0x00,0x54,0x02,0x00 -.byte 0x00,0x74,0x04,0x00 -.byte 0x00,0x64,0x05,0x00 -.byte 0x00,0x22 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_sub_n_check_mod_256_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - diff --git a/crypto/blst_src/build/coff/add_mod_384-armv8.S b/crypto/blst_src/build/coff/add_mod_384-armv8.S deleted file mode 100644 index 2eff0677f54..00000000000 --- a/crypto/blst_src/build/coff/add_mod_384-armv8.S +++ /dev/null @@ -1,1056 +0,0 @@ -.text - -.globl add_mod_384 - -.def add_mod_384; -.type 32; -.endef -.p2align 5 -add_mod_384: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - - bl __add_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.def __add_mod_384; -.type 32; -.endef -.p2align 5 -__add_mod_384: - ldp x10,x11,[x1] - ldp x16,x17,[x2] - ldp x12,x13,[x1,#16] - ldp x19,x20,[x2,#16] - ldp x14,x15,[x1,#32] - ldp x21,x22,[x2,#32] - -__add_mod_384_ab_are_loaded: - adds x10,x10,x16 - adcs x11,x11,x17 - adcs x12,x12,x19 - adcs x13,x13,x20 - adcs x14,x14,x21 - adcs x15,x15,x22 - adc x3,xzr,xzr - - subs x16,x10,x4 - sbcs x17,x11,x5 - sbcs x19,x12,x6 - sbcs x20,x13,x7 - sbcs x21,x14,x8 - sbcs x22,x15,x9 - sbcs xzr,x3,xzr - - csel x10,x10,x16,lo - csel x11,x11,x17,lo - csel x12,x12,x19,lo - csel x13,x13,x20,lo - csel x14,x14,x21,lo - csel x15,x15,x22,lo - - ret - - -.globl add_mod_384x - -.def add_mod_384x; -.type 32; -.endef -.p2align 5 -add_mod_384x: -.long 3573752639 - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - - bl __add_mod_384 - - stp x10,x11,[x0] - add x1,x1,#48 - stp x12,x13,[x0,#16] - add x2,x2,#48 - stp x14,x15,[x0,#32] - - bl __add_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl rshift_mod_384 - -.def rshift_mod_384; -.type 32; -.endef -.p2align 5 -rshift_mod_384: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - -.Loop_rshift_mod_384: - sub x2,x2,#1 - bl __rshift_mod_384 - cbnz x2,.Loop_rshift_mod_384 - - ldr x30,[sp,#8] - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.def __rshift_mod_384; -.type 32; -.endef -.p2align 5 -__rshift_mod_384: - sbfx x22,x10,#0,#1 - and x16,x22,x4 - and x17,x22,x5 - adds x10,x10,x16 - and x19,x22,x6 - adcs x11,x11,x17 - and x20,x22,x7 - adcs x12,x12,x19 - and x21,x22,x8 - adcs x13,x13,x20 - and x22,x22,x9 - adcs x14,x14,x21 - extr x10,x11,x10,#1 // a[0:5] >>= 1 - adcs x15,x15,x22 - extr x11,x12,x11,#1 - adc x22,xzr,xzr - extr x12,x13,x12,#1 - extr x13,x14,x13,#1 - extr x14,x15,x14,#1 - extr x15,x22,x15,#1 - ret - - -.globl div_by_2_mod_384 - -.def div_by_2_mod_384; -.type 32; -.endef -.p2align 5 -div_by_2_mod_384: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __rshift_mod_384 - - ldr x30,[sp,#8] - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl lshift_mod_384 - -.def lshift_mod_384; -.type 32; -.endef -.p2align 5 -lshift_mod_384: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - -.Loop_lshift_mod_384: - sub x2,x2,#1 - bl __lshift_mod_384 - cbnz x2,.Loop_lshift_mod_384 - - ldr x30,[sp,#8] - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.def __lshift_mod_384; -.type 32; -.endef -.p2align 5 -__lshift_mod_384: - adds x10,x10,x10 - adcs x11,x11,x11 - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adc x3,xzr,xzr - - subs x16,x10,x4 - sbcs x17,x11,x5 - sbcs x19,x12,x6 - sbcs x20,x13,x7 - sbcs x21,x14,x8 - sbcs x22,x15,x9 - sbcs xzr,x3,xzr - - csel x10,x10,x16,lo - csel x11,x11,x17,lo - csel x12,x12,x19,lo - csel x13,x13,x20,lo - csel x14,x14,x21,lo - csel x15,x15,x22,lo - - ret - - -.globl mul_by_3_mod_384 - -.def mul_by_3_mod_384; -.type 32; -.endef -.p2align 5 -mul_by_3_mod_384: -.long 3573752639 - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __lshift_mod_384 - - ldp x16,x17,[x1] - ldp x19,x20,[x1,#16] - ldp x21,x22,[x1,#32] - - bl __add_mod_384_ab_are_loaded - ldr x30,[sp,#8] - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl mul_by_8_mod_384 - -.def mul_by_8_mod_384; -.type 32; -.endef -.p2align 5 -mul_by_8_mod_384: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl mul_by_3_mod_384x - -.def mul_by_3_mod_384x; -.type 32; -.endef -.p2align 5 -mul_by_3_mod_384x: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __lshift_mod_384 - - ldp x16,x17,[x1] - ldp x19,x20,[x1,#16] - ldp x21,x22,[x1,#32] - - bl __add_mod_384_ab_are_loaded - - stp x10,x11,[x0] - ldp x10,x11,[x1,#48] - stp x12,x13,[x0,#16] - ldp x12,x13,[x1,#64] - stp x14,x15,[x0,#32] - ldp x14,x15,[x1,#80] - - bl __lshift_mod_384 - - ldp x16,x17,[x1,#48] - ldp x19,x20,[x1,#64] - ldp x21,x22,[x1,#80] - - bl __add_mod_384_ab_are_loaded - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl mul_by_8_mod_384x - -.def mul_by_8_mod_384x; -.type 32; -.endef -.p2align 5 -mul_by_8_mod_384x: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - - stp x10,x11,[x0] - ldp x10,x11,[x1,#48] - stp x12,x13,[x0,#16] - ldp x12,x13,[x1,#64] - stp x14,x15,[x0,#32] - ldp x14,x15,[x1,#80] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl cneg_mod_384 - -.def cneg_mod_384; -.type 32; -.endef -.p2align 5 -cneg_mod_384: -.long 3573752639 - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x4,x5,[x3] - ldp x12,x13,[x1,#16] - ldp x6,x7,[x3,#16] - - subs x16,x4,x10 - ldp x14,x15,[x1,#32] - ldp x8,x9,[x3,#32] - orr x3,x10,x11 - sbcs x17,x5,x11 - orr x3,x3,x12 - sbcs x19,x6,x12 - orr x3,x3,x13 - sbcs x20,x7,x13 - orr x3,x3,x14 - sbcs x21,x8,x14 - orr x3,x3,x15 - sbc x22,x9,x15 - - cmp x3,#0 - csetm x3,ne - ands x2,x2,x3 - - csel x10,x10,x16,eq - csel x11,x11,x17,eq - csel x12,x12,x19,eq - csel x13,x13,x20,eq - stp x10,x11,[x0] - csel x14,x14,x21,eq - stp x12,x13,[x0,#16] - csel x15,x15,x22,eq - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl sub_mod_384 - -.def sub_mod_384; -.type 32; -.endef -.p2align 5 -sub_mod_384: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - - bl __sub_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.def __sub_mod_384; -.type 32; -.endef -.p2align 5 -__sub_mod_384: - ldp x10,x11,[x1] - ldp x16,x17,[x2] - ldp x12,x13,[x1,#16] - ldp x19,x20,[x2,#16] - ldp x14,x15,[x1,#32] - ldp x21,x22,[x2,#32] - - subs x10,x10,x16 - sbcs x11,x11,x17 - sbcs x12,x12,x19 - sbcs x13,x13,x20 - sbcs x14,x14,x21 - sbcs x15,x15,x22 - sbc x3,xzr,xzr - - and x16,x4,x3 - and x17,x5,x3 - adds x10,x10,x16 - and x19,x6,x3 - adcs x11,x11,x17 - and x20,x7,x3 - adcs x12,x12,x19 - and x21,x8,x3 - adcs x13,x13,x20 - and x22,x9,x3 - adcs x14,x14,x21 - adc x15,x15,x22 - - ret - - -.globl sub_mod_384x - -.def sub_mod_384x; -.type 32; -.endef -.p2align 5 -sub_mod_384x: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - - bl __sub_mod_384 - - stp x10,x11,[x0] - add x1,x1,#48 - stp x12,x13,[x0,#16] - add x2,x2,#48 - stp x14,x15,[x0,#32] - - bl __sub_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl mul_by_1_plus_i_mod_384x - -.def mul_by_1_plus_i_mod_384x; -.type 32; -.endef -.p2align 5 -mul_by_1_plus_i_mod_384x: -.long 3573752639 - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - add x2,x1,#48 - - bl __sub_mod_384 // a->re - a->im - - ldp x16,x17,[x1] - ldp x19,x20,[x1,#16] - ldp x21,x22,[x1,#32] - stp x10,x11,[x0] - ldp x10,x11,[x1,#48] - stp x12,x13,[x0,#16] - ldp x12,x13,[x1,#64] - stp x14,x15,[x0,#32] - ldp x14,x15,[x1,#80] - - bl __add_mod_384_ab_are_loaded // a->re + a->im - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl sgn0_pty_mod_384 - -.def sgn0_pty_mod_384; -.type 32; -.endef -.p2align 5 -sgn0_pty_mod_384: - ldp x10,x11,[x0] - ldp x12,x13,[x0,#16] - ldp x14,x15,[x0,#32] - - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - ldp x8,x9,[x1,#32] - - and x0,x10,#1 - adds x10,x10,x10 - adcs x11,x11,x11 - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adc x3,xzr,xzr - - subs x10,x10,x4 - sbcs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbc x3,x3,xzr - - mvn x3,x3 - and x3,x3,#2 - orr x0,x0,x3 - - ret - - -.globl sgn0_pty_mod_384x - -.def sgn0_pty_mod_384x; -.type 32; -.endef -.p2align 5 -sgn0_pty_mod_384x: - ldp x10,x11,[x0] - ldp x12,x13,[x0,#16] - ldp x14,x15,[x0,#32] - - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - ldp x8,x9,[x1,#32] - - and x2,x10,#1 - orr x3,x10,x11 - adds x10,x10,x10 - orr x3,x3,x12 - adcs x11,x11,x11 - orr x3,x3,x13 - adcs x12,x12,x12 - orr x3,x3,x14 - adcs x13,x13,x13 - orr x3,x3,x15 - adcs x14,x14,x14 - adcs x15,x15,x15 - adc x16,xzr,xzr - - subs x10,x10,x4 - sbcs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbc x16,x16,xzr - - ldp x10,x11,[x0,#48] - ldp x12,x13,[x0,#64] - ldp x14,x15,[x0,#80] - - mvn x16,x16 - and x16,x16,#2 - orr x2,x2,x16 - - and x0,x10,#1 - orr x1,x10,x11 - adds x10,x10,x10 - orr x1,x1,x12 - adcs x11,x11,x11 - orr x1,x1,x13 - adcs x12,x12,x12 - orr x1,x1,x14 - adcs x13,x13,x13 - orr x1,x1,x15 - adcs x14,x14,x14 - adcs x15,x15,x15 - adc x16,xzr,xzr - - subs x10,x10,x4 - sbcs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbc x16,x16,xzr - - mvn x16,x16 - and x16,x16,#2 - orr x0,x0,x16 - - cmp x3,#0 - csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) - - cmp x1,#0 - csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) - - and x3,x3,#1 - and x1,x1,#2 - orr x0,x1,x3 // pack sign and parity - - ret - -.globl vec_select_32 - -.def vec_select_32; -.type 32; -.endef -.p2align 5 -vec_select_32: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d}, [x1],#32 - cmeq v6.2d, v6.2d, #0 - ld1 {v2.2d, v3.2d}, [x2],#32 - bit v0.16b, v2.16b, v6.16b - bit v1.16b, v3.16b, v6.16b - st1 {v0.2d, v1.2d}, [x0] - ret - -.globl vec_select_48 - -.def vec_select_48; -.type 32; -.endef -.p2align 5 -vec_select_48: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - bit v1.16b, v4.16b, v6.16b - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0] - ret - -.globl vec_select_96 - -.def vec_select_96; -.type 32; -.endef -.p2align 5 -vec_select_96: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - bit v17.16b, v20.16b, v6.16b - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0] - ret - -.globl vec_select_192 - -.def vec_select_192; -.type 32; -.endef -.p2align 5 -vec_select_192: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - bit v17.16b, v20.16b, v6.16b - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - bit v17.16b, v20.16b, v6.16b - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0] - ret - -.globl vec_select_144 - -.def vec_select_144; -.type 32; -.endef -.p2align 5 -vec_select_144: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - bit v17.16b, v20.16b, v6.16b - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 - bit v0.16b, v3.16b, v6.16b - bit v1.16b, v4.16b, v6.16b - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0] - ret - -.globl vec_select_288 - -.def vec_select_288; -.type 32; -.endef -.p2align 5 -vec_select_288: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 -
bit v17.16b, v20.16b, v6.16b - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - bit v17.16b, v20.16b, v6.16b - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - bit v17.16b, v20.16b, v6.16b - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0] - ret - -.globl vec_prefetch - -.def vec_prefetch; -.type 32; -.endef -.p2align 5 -vec_prefetch: - add x1, x1, x0 - sub x1, x1, #1 - mov x2, #64 - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - csel x2, xzr, x2, hi - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - csel x2, xzr, x2, hi - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - csel x2, xzr, x2, hi - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - csel x2, xzr, x2, hi - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - csel x2, xzr, x2, hi - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - prfm pldl1keep, [x0] - ret - -.globl vec_is_zero_16x - -.def vec_is_zero_16x; -.type 32; -.endef -.p2align 5 -vec_is_zero_16x: - ld1 {v0.2d}, [x0], #16 - lsr x1, x1, #4 - sub x1, x1, #1 - cbz x1, .Loop_is_zero_done - -.Loop_is_zero: - ld1 {v1.2d}, [x0], #16 - orr v0.16b, v0.16b, v1.16b - sub x1, x1, #1 - cbnz x1, .Loop_is_zero - -.Loop_is_zero_done: - dup v1.2d, v0.d[1] - orr v0.16b, v0.16b, v1.16b - mov x1, v0.d[0] - mov x0, #1 - cmp x1, #0 - csel x0, x0, xzr, eq - ret - -.globl vec_is_equal_16x - -.def vec_is_equal_16x; -.type 32; -.endef -.p2align 5 -vec_is_equal_16x: - ld1 {v0.2d}, [x0], #16 - ld1 {v1.2d}, [x1], #16 - lsr x2, x2, #4 - eor v0.16b, v0.16b, v1.16b - -.Loop_is_equal: - sub x2, x2, #1 - cbz x2, .Loop_is_equal_done - ld1 {v1.2d}, [x0], #16 - ld1 {v2.2d}, [x1], #16 - eor v1.16b, v1.16b, v2.16b - orr v0.16b, v0.16b, v1.16b - b .Loop_is_equal - nop - -.Loop_is_equal_done: - dup v1.2d, v0.d[1] - orr v0.16b, v0.16b, v1.16b - mov x1, v0.d[0] - mov x0, #1 - cmp x1, #0 - csel x0, x0, xzr, eq - ret - diff --git a/crypto/blst_src/build/coff/add_mod_384-x86_64.s b/crypto/blst_src/build/coff/add_mod_384-x86_64.s deleted file mode 100644 index 3ef562a3bf2..00000000000 --- a/crypto/blst_src/build/coff/add_mod_384-x86_64.s +++ /dev/null @@ -1,2510 +0,0 @@ -.text - -.globl add_mod_384 - -.def add_mod_384; .scl 2; .type 32; .endef -.p2align 5 -add_mod_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_add_mod_384: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $8,%rsp - -.LSEH_body_add_mod_384: - - - call __add_mod_384 - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_add_mod_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - 
.byte 0xf3,0xc3 - -.LSEH_end_add_mod_384: - -.def __add_mod_384; .scl 3; .type 32; .endef -.p2align 5 -__add_mod_384: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - -__add_mod_384_a_is_loaded: - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - adcq 16(%rdx),%r10 - movq %r8,%r14 - adcq 24(%rdx),%r11 - movq %r9,%r15 - adcq 32(%rdx),%r12 - movq %r10,%rax - adcq 40(%rdx),%r13 - movq %r11,%rbx - sbbq %rdx,%rdx - - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - movq %r12,%rbp - sbbq 16(%rcx),%r10 - sbbq 24(%rcx),%r11 - sbbq 32(%rcx),%r12 - movq %r13,%rsi - sbbq 40(%rcx),%r13 - sbbq $0,%rdx - - cmovcq %r14,%r8 - cmovcq %r15,%r9 - cmovcq %rax,%r10 - movq %r8,0(%rdi) - cmovcq %rbx,%r11 - movq %r9,8(%rdi) - cmovcq %rbp,%r12 - movq %r10,16(%rdi) - cmovcq %rsi,%r13 - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 - - -.globl add_mod_384x - -.def add_mod_384x; .scl 2; .type 32; .endef -.p2align 5 -add_mod_384x: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_add_mod_384x: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $24,%rsp - -.LSEH_body_add_mod_384x: - - - movq %rsi,0(%rsp) - movq %rdx,8(%rsp) - leaq 48(%rsi),%rsi - leaq 48(%rdx),%rdx - leaq 48(%rdi),%rdi - call __add_mod_384 - - movq 0(%rsp),%rsi - movq 8(%rsp),%rdx - leaq -48(%rdi),%rdi - call __add_mod_384 - - movq 24+0(%rsp),%r15 - - movq 24+8(%rsp),%r14 - - movq 24+16(%rsp),%r13 - - movq 24+24(%rsp),%r12 - - movq 24+32(%rsp),%rbx - - movq 24+40(%rsp),%rbp - - leaq 24+48(%rsp),%rsp - -.LSEH_epilogue_add_mod_384x: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_add_mod_384x: - - -.globl rshift_mod_384 - -.def rshift_mod_384; .scl 2; .type 32; .endef -.p2align 5 -rshift_mod_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_rshift_mod_384: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - pushq %rdi - -.LSEH_body_rshift_mod_384: - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - -.Loop_rshift_mod_384: - call __rshift_mod_384 - decl %edx - jnz .Loop_rshift_mod_384 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_rshift_mod_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_rshift_mod_384: - -.def __rshift_mod_384; .scl 3; .type 32; .endef -.p2align 5 -__rshift_mod_384: - .byte 0xf3,0x0f,0x1e,0xfa - - movq $1,%rsi - movq 0(%rcx),%r14 - andq %r8,%rsi - movq 8(%rcx),%r15 - negq %rsi - movq 16(%rcx),%rax - andq %rsi,%r14 - movq 24(%rcx),%rbx - andq %rsi,%r15 - movq 32(%rcx),%rbp - andq %rsi,%rax - andq %rsi,%rbx - andq %rsi,%rbp - andq 40(%rcx),%rsi - - addq %r8,%r14 - adcq %r9,%r15 - adcq %r10,%rax - adcq %r11,%rbx - adcq %r12,%rbp - adcq %r13,%rsi - sbbq %r13,%r13 - - shrq $1,%r14 - movq %r15,%r8 - shrq $1,%r15 - movq %rax,%r9 - shrq $1,%rax - movq %rbx,%r10 - shrq $1,%rbx - movq %rbp,%r11 - shrq $1,%rbp - movq %rsi,%r12 - shrq 
$1,%rsi - shlq $63,%r8 - shlq $63,%r9 - orq %r14,%r8 - shlq $63,%r10 - orq %r15,%r9 - shlq $63,%r11 - orq %rax,%r10 - shlq $63,%r12 - orq %rbx,%r11 - shlq $63,%r13 - orq %rbp,%r12 - orq %rsi,%r13 - - .byte 0xf3,0xc3 - - -.globl div_by_2_mod_384 - -.def div_by_2_mod_384; .scl 2; .type 32; .endef -.p2align 5 -div_by_2_mod_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_div_by_2_mod_384: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - pushq %rdi - -.LSEH_body_div_by_2_mod_384: - - - movq 0(%rsi),%r8 - movq %rdx,%rcx - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - call __rshift_mod_384 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_div_by_2_mod_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_div_by_2_mod_384: - - -.globl lshift_mod_384 - -.def lshift_mod_384; .scl 2; .type 32; .endef -.p2align 5 -lshift_mod_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_lshift_mod_384: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - pushq %rdi - -.LSEH_body_lshift_mod_384: - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - -.Loop_lshift_mod_384: - addq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - movq %r8,%r14 - adcq %r11,%r11 - movq %r9,%r15 - adcq %r12,%r12 - movq %r10,%rax - adcq %r13,%r13 - movq %r11,%rbx - sbbq %rdi,%rdi - - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - movq %r12,%rbp - sbbq 16(%rcx),%r10 - sbbq 24(%rcx),%r11 - sbbq 32(%rcx),%r12 - movq %r13,%rsi - sbbq 40(%rcx),%r13 - sbbq $0,%rdi - - movq (%rsp),%rdi - cmovcq %r14,%r8 - cmovcq %r15,%r9 - cmovcq %rax,%r10 - cmovcq %rbx,%r11 - cmovcq %rbp,%r12 - cmovcq %rsi,%r13 - - decl %edx - jnz .Loop_lshift_mod_384 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_lshift_mod_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_lshift_mod_384: - -.def __lshift_mod_384; .scl 3; .type 32; .endef -.p2align 5 -__lshift_mod_384: - .byte 0xf3,0x0f,0x1e,0xfa - - addq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - movq %r8,%r14 - adcq %r11,%r11 - movq %r9,%r15 - adcq %r12,%r12 - movq %r10,%rax - adcq %r13,%r13 - movq %r11,%rbx - sbbq %rdx,%rdx - - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - movq %r12,%rbp - sbbq 16(%rcx),%r10 - sbbq 24(%rcx),%r11 - sbbq 32(%rcx),%r12 - movq %r13,%rsi - sbbq 40(%rcx),%r13 - sbbq $0,%rdx - - cmovcq %r14,%r8 - cmovcq %r15,%r9 - cmovcq %rax,%r10 - cmovcq %rbx,%r11 - cmovcq %rbp,%r12 - cmovcq %rsi,%r13 - - .byte 0xf3,0xc3 - - - -.globl mul_by_3_mod_384 - -.def mul_by_3_mod_384; .scl 2; .type 32; .endef -.p2align 5 -mul_by_3_mod_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_mul_by_3_mod_384: - - 
- pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - pushq %rsi - -.LSEH_body_mul_by_3_mod_384: - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq %rdx,%rcx - - call __lshift_mod_384 - - movq (%rsp),%rdx - call __add_mod_384_a_is_loaded - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_mul_by_3_mod_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_mul_by_3_mod_384: - -.globl mul_by_8_mod_384 - -.def mul_by_8_mod_384; .scl 2; .type 32; .endef -.p2align 5 -mul_by_8_mod_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_mul_by_8_mod_384: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $8,%rsp - -.LSEH_body_mul_by_8_mod_384: - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq %rdx,%rcx - - call __lshift_mod_384 - call __lshift_mod_384 - call __lshift_mod_384 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_mul_by_8_mod_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_mul_by_8_mod_384: - - -.globl mul_by_3_mod_384x - -.def mul_by_3_mod_384x; .scl 2; .type 32; .endef -.p2align 5 -mul_by_3_mod_384x: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_mul_by_3_mod_384x: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - pushq %rsi - -.LSEH_body_mul_by_3_mod_384x: - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq %rdx,%rcx - - call __lshift_mod_384 - - movq (%rsp),%rdx - call __add_mod_384_a_is_loaded - - movq (%rsp),%rsi - leaq 48(%rdi),%rdi - - movq 48(%rsi),%r8 - movq 56(%rsi),%r9 - movq 64(%rsi),%r10 - movq 72(%rsi),%r11 - movq 80(%rsi),%r12 - movq 88(%rsi),%r13 - - call __lshift_mod_384 - - movq $48,%rdx - addq (%rsp),%rdx - call __add_mod_384_a_is_loaded - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_mul_by_3_mod_384x: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_mul_by_3_mod_384x: - -.globl mul_by_8_mod_384x - -.def mul_by_8_mod_384x; .scl 2; .type 32; .endef -.p2align 5 -mul_by_8_mod_384x: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_mul_by_8_mod_384x: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - pushq %rsi - -.LSEH_body_mul_by_8_mod_384x: - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq %rdx,%rcx - - call __lshift_mod_384 - call __lshift_mod_384 - call 
__lshift_mod_384 - - movq (%rsp),%rsi - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - movq 48+0(%rsi),%r8 - movq 48+8(%rsi),%r9 - movq 48+16(%rsi),%r10 - movq 48+24(%rsi),%r11 - movq 48+32(%rsi),%r12 - movq 48+40(%rsi),%r13 - - call __lshift_mod_384 - call __lshift_mod_384 - call __lshift_mod_384 - - movq %r8,48+0(%rdi) - movq %r9,48+8(%rdi) - movq %r10,48+16(%rdi) - movq %r11,48+24(%rdi) - movq %r12,48+32(%rdi) - movq %r13,48+40(%rdi) - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_mul_by_8_mod_384x: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_mul_by_8_mod_384x: - - -.globl cneg_mod_384 - -.def cneg_mod_384; .scl 2; .type 32; .endef -.p2align 5 -cneg_mod_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_cneg_mod_384: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - pushq %rdx - -.LSEH_body_cneg_mod_384: - - - movq 0(%rsi),%rdx - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq %rdx,%r8 - movq 24(%rsi),%r11 - orq %r9,%rdx - movq 32(%rsi),%r12 - orq %r10,%rdx - movq 40(%rsi),%r13 - orq %r11,%rdx - movq $-1,%rsi - orq %r12,%rdx - orq %r13,%rdx - - movq 0(%rcx),%r14 - cmovnzq %rsi,%rdx - movq 8(%rcx),%r15 - movq 16(%rcx),%rax - andq %rdx,%r14 - movq 24(%rcx),%rbx - andq %rdx,%r15 - movq 32(%rcx),%rbp - andq %rdx,%rax - movq 40(%rcx),%rsi - andq %rdx,%rbx - movq 0(%rsp),%rcx - andq %rdx,%rbp - andq %rdx,%rsi - - subq %r8,%r14 - sbbq %r9,%r15 - sbbq %r10,%rax - sbbq %r11,%rbx - sbbq %r12,%rbp - sbbq %r13,%rsi - - orq %rcx,%rcx - - cmovzq %r8,%r14 - cmovzq %r9,%r15 - cmovzq %r10,%rax - movq %r14,0(%rdi) - cmovzq %r11,%rbx - movq %r15,8(%rdi) - cmovzq %r12,%rbp - movq %rax,16(%rdi) - cmovzq %r13,%rsi - movq %rbx,24(%rdi) - movq %rbp,32(%rdi) - movq %rsi,40(%rdi) - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_cneg_mod_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_cneg_mod_384: - - -.globl sub_mod_384 - -.def sub_mod_384; .scl 2; .type 32; .endef -.p2align 5 -sub_mod_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sub_mod_384: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $8,%rsp - -.LSEH_body_sub_mod_384: - - - call __sub_mod_384 - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_sub_mod_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_sub_mod_384: - -.def __sub_mod_384; .scl 3; .type 32; .endef -.p2align 5 -__sub_mod_384: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - subq 0(%rdx),%r8 - movq 0(%rcx),%r14 - sbbq 8(%rdx),%r9 - movq 8(%rcx),%r15 - sbbq 16(%rdx),%r10 - movq 16(%rcx),%rax - sbbq 24(%rdx),%r11 - movq 24(%rcx),%rbx - sbbq 32(%rdx),%r12 - movq 32(%rcx),%rbp - sbbq 40(%rdx),%r13 - movq 40(%rcx),%rsi - sbbq 
%rdx,%rdx - - andq %rdx,%r14 - andq %rdx,%r15 - andq %rdx,%rax - andq %rdx,%rbx - andq %rdx,%rbp - andq %rdx,%rsi - - addq %r14,%r8 - adcq %r15,%r9 - movq %r8,0(%rdi) - adcq %rax,%r10 - movq %r9,8(%rdi) - adcq %rbx,%r11 - movq %r10,16(%rdi) - adcq %rbp,%r12 - movq %r11,24(%rdi) - adcq %rsi,%r13 - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 - - -.globl sub_mod_384x - -.def sub_mod_384x; .scl 2; .type 32; .endef -.p2align 5 -sub_mod_384x: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sub_mod_384x: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $24,%rsp - -.LSEH_body_sub_mod_384x: - - - movq %rsi,0(%rsp) - movq %rdx,8(%rsp) - leaq 48(%rsi),%rsi - leaq 48(%rdx),%rdx - leaq 48(%rdi),%rdi - call __sub_mod_384 - - movq 0(%rsp),%rsi - movq 8(%rsp),%rdx - leaq -48(%rdi),%rdi - call __sub_mod_384 - - movq 24+0(%rsp),%r15 - - movq 24+8(%rsp),%r14 - - movq 24+16(%rsp),%r13 - - movq 24+24(%rsp),%r12 - - movq 24+32(%rsp),%rbx - - movq 24+40(%rsp),%rbp - - leaq 24+48(%rsp),%rsp - -.LSEH_epilogue_sub_mod_384x: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_sub_mod_384x: -.globl mul_by_1_plus_i_mod_384x - -.def mul_by_1_plus_i_mod_384x; .scl 2; .type 32; .endef -.p2align 5 -mul_by_1_plus_i_mod_384x: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_mul_by_1_plus_i_mod_384x: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $56,%rsp - -.LSEH_body_mul_by_1_plus_i_mod_384x: - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %r8,%r14 - addq 48(%rsi),%r8 - movq %r9,%r15 - adcq 56(%rsi),%r9 - movq %r10,%rax - adcq 64(%rsi),%r10 - movq %r11,%rbx - adcq 72(%rsi),%r11 - movq %r12,%rcx - adcq 80(%rsi),%r12 - movq %r13,%rbp - adcq 88(%rsi),%r13 - movq %rdi,48(%rsp) - sbbq %rdi,%rdi - - subq 48(%rsi),%r14 - sbbq 56(%rsi),%r15 - sbbq 64(%rsi),%rax - sbbq 72(%rsi),%rbx - sbbq 80(%rsi),%rcx - sbbq 88(%rsi),%rbp - sbbq %rsi,%rsi - - movq %r8,0(%rsp) - movq 0(%rdx),%r8 - movq %r9,8(%rsp) - movq 8(%rdx),%r9 - movq %r10,16(%rsp) - movq 16(%rdx),%r10 - movq %r11,24(%rsp) - movq 24(%rdx),%r11 - movq %r12,32(%rsp) - andq %rsi,%r8 - movq 32(%rdx),%r12 - movq %r13,40(%rsp) - andq %rsi,%r9 - movq 40(%rdx),%r13 - andq %rsi,%r10 - andq %rsi,%r11 - andq %rsi,%r12 - andq %rsi,%r13 - movq 48(%rsp),%rsi - - addq %r8,%r14 - movq 0(%rsp),%r8 - adcq %r9,%r15 - movq 8(%rsp),%r9 - adcq %r10,%rax - movq 16(%rsp),%r10 - adcq %r11,%rbx - movq 24(%rsp),%r11 - adcq %r12,%rcx - movq 32(%rsp),%r12 - adcq %r13,%rbp - movq 40(%rsp),%r13 - - movq %r14,0(%rsi) - movq %r8,%r14 - movq %r15,8(%rsi) - movq %rax,16(%rsi) - movq %r9,%r15 - movq %rbx,24(%rsi) - movq %rcx,32(%rsi) - movq %r10,%rax - movq %rbp,40(%rsi) - - subq 0(%rdx),%r8 - movq %r11,%rbx - sbbq 8(%rdx),%r9 - sbbq 16(%rdx),%r10 - movq %r12,%rcx - sbbq 24(%rdx),%r11 - sbbq 32(%rdx),%r12 - movq %r13,%rbp - sbbq 40(%rdx),%r13 - sbbq $0,%rdi - - cmovcq %r14,%r8 - cmovcq %r15,%r9 - cmovcq %rax,%r10 - movq %r8,48(%rsi) - cmovcq %rbx,%r11 - movq %r9,56(%rsi) - cmovcq %rcx,%r12 - movq %r10,64(%rsi) - cmovcq %rbp,%r13 - movq %r11,72(%rsi) - movq %r12,80(%rsi) - movq %r13,88(%rsi) - - movq 56+0(%rsp),%r15 - - movq 56+8(%rsp),%r14 - - movq 56+16(%rsp),%r13 - - movq 
56+24(%rsp),%r12 - - movq 56+32(%rsp),%rbx - - movq 56+40(%rsp),%rbp - - leaq 56+48(%rsp),%rsp - -.LSEH_epilogue_mul_by_1_plus_i_mod_384x: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_mul_by_1_plus_i_mod_384x: -.globl sgn0_pty_mod_384 - -.def sgn0_pty_mod_384; .scl 2; .type 32; .endef -.p2align 5 -sgn0_pty_mod_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sgn0_pty_mod_384: - - - movq %rcx,%rdi - movq %rdx,%rsi -.LSEH_body_sgn0_pty_mod_384: - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%rcx - movq 40(%rdi),%rdx - - xorq %rax,%rax - movq %r8,%rdi - addq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq %rcx,%rcx - adcq %rdx,%rdx - adcq $0,%rax - - subq 0(%rsi),%r8 - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - sbbq 24(%rsi),%r11 - sbbq 32(%rsi),%rcx - sbbq 40(%rsi),%rdx - sbbq $0,%rax - - notq %rax - andq $1,%rdi - andq $2,%rax - orq %rdi,%rax - -.LSEH_epilogue_sgn0_pty_mod_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_sgn0_pty_mod_384: - -.globl sgn0_pty_mod_384x - -.def sgn0_pty_mod_384x; .scl 2; .type 32; .endef -.p2align 5 -sgn0_pty_mod_384x: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sgn0_pty_mod_384x: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - pushq %rbx - - subq $8,%rsp - -.LSEH_body_sgn0_pty_mod_384x: - - - movq 48(%rdi),%r8 - movq 56(%rdi),%r9 - movq 64(%rdi),%r10 - movq 72(%rdi),%r11 - movq 80(%rdi),%rcx - movq 88(%rdi),%rdx - - movq %r8,%rbx - orq %r9,%r8 - orq %r10,%r8 - orq %r11,%r8 - orq %rcx,%r8 - orq %rdx,%r8 - - leaq 0(%rdi),%rax - xorq %rdi,%rdi - movq %rbx,%rbp - addq %rbx,%rbx - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq %rcx,%rcx - adcq %rdx,%rdx - adcq $0,%rdi - - subq 0(%rsi),%rbx - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - sbbq 24(%rsi),%r11 - sbbq 32(%rsi),%rcx - sbbq 40(%rsi),%rdx - sbbq $0,%rdi - - movq %r8,0(%rsp) - notq %rdi - andq $1,%rbp - andq $2,%rdi - orq %rbp,%rdi - - movq 0(%rax),%r8 - movq 8(%rax),%r9 - movq 16(%rax),%r10 - movq 24(%rax),%r11 - movq 32(%rax),%rcx - movq 40(%rax),%rdx - - movq %r8,%rbx - orq %r9,%r8 - orq %r10,%r8 - orq %r11,%r8 - orq %rcx,%r8 - orq %rdx,%r8 - - xorq %rax,%rax - movq %rbx,%rbp - addq %rbx,%rbx - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq %rcx,%rcx - adcq %rdx,%rdx - adcq $0,%rax - - subq 0(%rsi),%rbx - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - sbbq 24(%rsi),%r11 - sbbq 32(%rsi),%rcx - sbbq 40(%rsi),%rdx - sbbq $0,%rax - - movq 0(%rsp),%rbx - - notq %rax - - testq %r8,%r8 - cmovzq %rdi,%rbp - - testq %rbx,%rbx - cmovnzq %rdi,%rax - - andq $1,%rbp - andq $2,%rax - orq %rbp,%rax - - movq 8(%rsp),%rbx - - movq 16(%rsp),%rbp - - leaq 24(%rsp),%rsp - -.LSEH_epilogue_sgn0_pty_mod_384x: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_sgn0_pty_mod_384x: -.globl vec_select_32 - -.def vec_select_32; .scl 2; .type 32; .endef -.p2align 5 -vec_select_32: - .byte 0xf3,0x0f,0x1e,0xfa - - movd %r9d,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rdx),%xmm0 - leaq 16(%rdx),%rdx - pcmpeqd %xmm4,%xmm5 - movdqu (%r8),%xmm1 - leaq 16(%r8),%r8 - pcmpeqd %xmm5,%xmm4 - leaq 16(%rcx),%rcx - pand %xmm4,%xmm0 - movdqu 0+16-16(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-16(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-16(%rcx) - pand %xmm4,%xmm2 - pand %xmm5,%xmm3 - por %xmm3,%xmm2 - movdqu %xmm2,16-16(%rcx) - .byte 0xf3,0xc3 - -.globl vec_select_48 - -.def 
vec_select_48; .scl 2; .type 32; .endef -.p2align 5 -vec_select_48: - .byte 0xf3,0x0f,0x1e,0xfa - - movd %r9d,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rdx),%xmm0 - leaq 24(%rdx),%rdx - pcmpeqd %xmm4,%xmm5 - movdqu (%r8),%xmm1 - leaq 24(%r8),%r8 - pcmpeqd %xmm5,%xmm4 - leaq 24(%rcx),%rcx - pand %xmm4,%xmm0 - movdqu 0+16-24(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-24(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-24(%rcx) - pand %xmm4,%xmm2 - movdqu 16+16-24(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 16+16-24(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,16-24(%rcx) - pand %xmm4,%xmm0 - pand %xmm5,%xmm1 - por %xmm1,%xmm0 - movdqu %xmm0,32-24(%rcx) - .byte 0xf3,0xc3 - -.globl vec_select_96 - -.def vec_select_96; .scl 2; .type 32; .endef -.p2align 5 -vec_select_96: - .byte 0xf3,0x0f,0x1e,0xfa - - movd %r9d,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rdx),%xmm0 - leaq 48(%rdx),%rdx - pcmpeqd %xmm4,%xmm5 - movdqu (%r8),%xmm1 - leaq 48(%r8),%r8 - pcmpeqd %xmm5,%xmm4 - leaq 48(%rcx),%rcx - pand %xmm4,%xmm0 - movdqu 0+16-48(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-48(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-48(%rcx) - pand %xmm4,%xmm2 - movdqu 16+16-48(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 16+16-48(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,16-48(%rcx) - pand %xmm4,%xmm0 - movdqu 32+16-48(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 32+16-48(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,32-48(%rcx) - pand %xmm4,%xmm2 - movdqu 48+16-48(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 48+16-48(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,48-48(%rcx) - pand %xmm4,%xmm0 - movdqu 64+16-48(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 64+16-48(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,64-48(%rcx) - pand %xmm4,%xmm2 - pand %xmm5,%xmm3 - por %xmm3,%xmm2 - movdqu %xmm2,80-48(%rcx) - .byte 0xf3,0xc3 - -.globl vec_select_192 - -.def vec_select_192; .scl 2; .type 32; .endef -.p2align 5 -vec_select_192: - .byte 0xf3,0x0f,0x1e,0xfa - - movd %r9d,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rdx),%xmm0 - leaq 96(%rdx),%rdx - pcmpeqd %xmm4,%xmm5 - movdqu (%r8),%xmm1 - leaq 96(%r8),%r8 - pcmpeqd %xmm5,%xmm4 - leaq 96(%rcx),%rcx - pand %xmm4,%xmm0 - movdqu 0+16-96(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-96(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-96(%rcx) - pand %xmm4,%xmm2 - movdqu 16+16-96(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 16+16-96(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,16-96(%rcx) - pand %xmm4,%xmm0 - movdqu 32+16-96(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 32+16-96(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,32-96(%rcx) - pand %xmm4,%xmm2 - movdqu 48+16-96(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 48+16-96(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,48-96(%rcx) - pand %xmm4,%xmm0 - movdqu 64+16-96(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 64+16-96(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,64-96(%rcx) - pand %xmm4,%xmm2 - movdqu 80+16-96(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 80+16-96(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,80-96(%rcx) - pand %xmm4,%xmm0 - movdqu 96+16-96(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 96+16-96(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,96-96(%rcx) - pand %xmm4,%xmm2 - movdqu 112+16-96(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 112+16-96(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,112-96(%rcx) - pand %xmm4,%xmm0 - movdqu 128+16-96(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 128+16-96(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,128-96(%rcx) - pand %xmm4,%xmm2 - movdqu 144+16-96(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 
144+16-96(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,144-96(%rcx) - pand %xmm4,%xmm0 - movdqu 160+16-96(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 160+16-96(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,160-96(%rcx) - pand %xmm4,%xmm2 - pand %xmm5,%xmm3 - por %xmm3,%xmm2 - movdqu %xmm2,176-96(%rcx) - .byte 0xf3,0xc3 - -.globl vec_select_144 - -.def vec_select_144; .scl 2; .type 32; .endef -.p2align 5 -vec_select_144: - .byte 0xf3,0x0f,0x1e,0xfa - - movd %r9d,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rdx),%xmm0 - leaq 72(%rdx),%rdx - pcmpeqd %xmm4,%xmm5 - movdqu (%r8),%xmm1 - leaq 72(%r8),%r8 - pcmpeqd %xmm5,%xmm4 - leaq 72(%rcx),%rcx - pand %xmm4,%xmm0 - movdqu 0+16-72(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-72(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-72(%rcx) - pand %xmm4,%xmm2 - movdqu 16+16-72(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 16+16-72(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,16-72(%rcx) - pand %xmm4,%xmm0 - movdqu 32+16-72(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 32+16-72(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,32-72(%rcx) - pand %xmm4,%xmm2 - movdqu 48+16-72(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 48+16-72(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,48-72(%rcx) - pand %xmm4,%xmm0 - movdqu 64+16-72(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 64+16-72(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,64-72(%rcx) - pand %xmm4,%xmm2 - movdqu 80+16-72(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 80+16-72(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,80-72(%rcx) - pand %xmm4,%xmm0 - movdqu 96+16-72(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 96+16-72(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,96-72(%rcx) - pand %xmm4,%xmm2 - movdqu 112+16-72(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 112+16-72(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,112-72(%rcx) - pand %xmm4,%xmm0 - pand %xmm5,%xmm1 - por %xmm1,%xmm0 - movdqu %xmm0,128-72(%rcx) - .byte 0xf3,0xc3 - -.globl vec_select_288 - -.def vec_select_288; .scl 2; .type 32; .endef -.p2align 5 -vec_select_288: - .byte 0xf3,0x0f,0x1e,0xfa - - movd %r9d,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rdx),%xmm0 - leaq 144(%rdx),%rdx - pcmpeqd %xmm4,%xmm5 - movdqu (%r8),%xmm1 - leaq 144(%r8),%r8 - pcmpeqd %xmm5,%xmm4 - leaq 144(%rcx),%rcx - pand %xmm4,%xmm0 - movdqu 0+16-144(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-144(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-144(%rcx) - pand %xmm4,%xmm2 - movdqu 16+16-144(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 16+16-144(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,16-144(%rcx) - pand %xmm4,%xmm0 - movdqu 32+16-144(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 32+16-144(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,32-144(%rcx) - pand %xmm4,%xmm2 - movdqu 48+16-144(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 48+16-144(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,48-144(%rcx) - pand %xmm4,%xmm0 - movdqu 64+16-144(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 64+16-144(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,64-144(%rcx) - pand %xmm4,%xmm2 - movdqu 80+16-144(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 80+16-144(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,80-144(%rcx) - pand %xmm4,%xmm0 - movdqu 96+16-144(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 96+16-144(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,96-144(%rcx) - pand %xmm4,%xmm2 - movdqu 112+16-144(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 112+16-144(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,112-144(%rcx) - pand %xmm4,%xmm0 - movdqu 128+16-144(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 128+16-144(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,128-144(%rcx) - pand 
%xmm4,%xmm2 - movdqu 144+16-144(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 144+16-144(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,144-144(%rcx) - pand %xmm4,%xmm0 - movdqu 160+16-144(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 160+16-144(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,160-144(%rcx) - pand %xmm4,%xmm2 - movdqu 176+16-144(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 176+16-144(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,176-144(%rcx) - pand %xmm4,%xmm0 - movdqu 192+16-144(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 192+16-144(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,192-144(%rcx) - pand %xmm4,%xmm2 - movdqu 208+16-144(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 208+16-144(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,208-144(%rcx) - pand %xmm4,%xmm0 - movdqu 224+16-144(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 224+16-144(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,224-144(%rcx) - pand %xmm4,%xmm2 - movdqu 240+16-144(%rdx),%xmm0 - pand %xmm5,%xmm3 - movdqu 240+16-144(%r8),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,240-144(%rcx) - pand %xmm4,%xmm0 - movdqu 256+16-144(%rdx),%xmm2 - pand %xmm5,%xmm1 - movdqu 256+16-144(%r8),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,256-144(%rcx) - pand %xmm4,%xmm2 - pand %xmm5,%xmm3 - por %xmm3,%xmm2 - movdqu %xmm2,272-144(%rcx) - .byte 0xf3,0xc3 - -.globl vec_prefetch - -.def vec_prefetch; .scl 2; .type 32; .endef -.p2align 5 -vec_prefetch: - .byte 0xf3,0x0f,0x1e,0xfa - - leaq -1(%rcx,%rdx,1),%rdx - movq $64,%rax - xorq %r8,%r8 - prefetchnta (%rcx) - leaq (%rcx,%rax,1),%rcx - cmpq %rdx,%rcx - cmovaq %rdx,%rcx - cmovaq %r8,%rax - prefetchnta (%rcx) - leaq (%rcx,%rax,1),%rcx - cmpq %rdx,%rcx - cmovaq %rdx,%rcx - cmovaq %r8,%rax - prefetchnta (%rcx) - leaq (%rcx,%rax,1),%rcx - cmpq %rdx,%rcx - cmovaq %rdx,%rcx - cmovaq %r8,%rax - prefetchnta (%rcx) - leaq (%rcx,%rax,1),%rcx - cmpq %rdx,%rcx - cmovaq %rdx,%rcx - cmovaq %r8,%rax - prefetchnta (%rcx) - leaq (%rcx,%rax,1),%rcx - cmpq %rdx,%rcx - cmovaq %rdx,%rcx - cmovaq %r8,%rax - prefetchnta (%rcx) - leaq (%rcx,%rax,1),%rcx - cmpq %rdx,%rcx - cmovaq %rdx,%rcx - prefetchnta (%rcx) - .byte 0xf3,0xc3 - -.globl vec_is_zero_16x - -.def vec_is_zero_16x; .scl 2; .type 32; .endef -.p2align 5 -vec_is_zero_16x: - .byte 0xf3,0x0f,0x1e,0xfa - - shrl $4,%edx - movdqu (%rcx),%xmm0 - leaq 16(%rcx),%rcx - -.Loop_is_zero: - decl %edx - jz .Loop_is_zero_done - movdqu (%rcx),%xmm1 - leaq 16(%rcx),%rcx - por %xmm1,%xmm0 - jmp .Loop_is_zero - -.Loop_is_zero_done: - pshufd $0x4e,%xmm0,%xmm1 - por %xmm1,%xmm0 -.byte 102,72,15,126,192 - incl %edx - testq %rax,%rax - cmovnzl %edx,%eax - xorl $1,%eax - .byte 0xf3,0xc3 - -.globl vec_is_equal_16x - -.def vec_is_equal_16x; .scl 2; .type 32; .endef -.p2align 5 -vec_is_equal_16x: - .byte 0xf3,0x0f,0x1e,0xfa - - shrl $4,%r8d - movdqu (%rcx),%xmm0 - movdqu (%rdx),%xmm1 - subq %rcx,%rdx - leaq 16(%rcx),%rcx - pxor %xmm1,%xmm0 - -.Loop_is_equal: - decl %r8d - jz .Loop_is_equal_done - movdqu (%rcx),%xmm1 - movdqu (%rcx,%rdx,1),%xmm2 - leaq 16(%rcx),%rcx - pxor %xmm2,%xmm1 - por %xmm1,%xmm0 - jmp .Loop_is_equal - -.Loop_is_equal_done: - pshufd $0x4e,%xmm0,%xmm1 - por %xmm1,%xmm0 -.byte 102,72,15,126,192 - incl %r8d - testq %rax,%rax - cmovnzl %r8d,%eax - xorl $1,%eax - .byte 0xf3,0xc3 - -.section .pdata -.p2align 2 -.rva .LSEH_begin_add_mod_384 -.rva .LSEH_body_add_mod_384 -.rva .LSEH_info_add_mod_384_prologue - -.rva .LSEH_body_add_mod_384 -.rva .LSEH_epilogue_add_mod_384 -.rva .LSEH_info_add_mod_384_body - -.rva .LSEH_epilogue_add_mod_384 -.rva .LSEH_end_add_mod_384 -.rva .LSEH_info_add_mod_384_epilogue - -.rva 
.LSEH_begin_add_mod_384x -.rva .LSEH_body_add_mod_384x -.rva .LSEH_info_add_mod_384x_prologue - -.rva .LSEH_body_add_mod_384x -.rva .LSEH_epilogue_add_mod_384x -.rva .LSEH_info_add_mod_384x_body - -.rva .LSEH_epilogue_add_mod_384x -.rva .LSEH_end_add_mod_384x -.rva .LSEH_info_add_mod_384x_epilogue - -.rva .LSEH_begin_rshift_mod_384 -.rva .LSEH_body_rshift_mod_384 -.rva .LSEH_info_rshift_mod_384_prologue - -.rva .LSEH_body_rshift_mod_384 -.rva .LSEH_epilogue_rshift_mod_384 -.rva .LSEH_info_rshift_mod_384_body - -.rva .LSEH_epilogue_rshift_mod_384 -.rva .LSEH_end_rshift_mod_384 -.rva .LSEH_info_rshift_mod_384_epilogue - -.rva .LSEH_begin_div_by_2_mod_384 -.rva .LSEH_body_div_by_2_mod_384 -.rva .LSEH_info_div_by_2_mod_384_prologue - -.rva .LSEH_body_div_by_2_mod_384 -.rva .LSEH_epilogue_div_by_2_mod_384 -.rva .LSEH_info_div_by_2_mod_384_body - -.rva .LSEH_epilogue_div_by_2_mod_384 -.rva .LSEH_end_div_by_2_mod_384 -.rva .LSEH_info_div_by_2_mod_384_epilogue - -.rva .LSEH_begin_lshift_mod_384 -.rva .LSEH_body_lshift_mod_384 -.rva .LSEH_info_lshift_mod_384_prologue - -.rva .LSEH_body_lshift_mod_384 -.rva .LSEH_epilogue_lshift_mod_384 -.rva .LSEH_info_lshift_mod_384_body - -.rva .LSEH_epilogue_lshift_mod_384 -.rva .LSEH_end_lshift_mod_384 -.rva .LSEH_info_lshift_mod_384_epilogue - -.rva .LSEH_begin_mul_by_3_mod_384 -.rva .LSEH_body_mul_by_3_mod_384 -.rva .LSEH_info_mul_by_3_mod_384_prologue - -.rva .LSEH_body_mul_by_3_mod_384 -.rva .LSEH_epilogue_mul_by_3_mod_384 -.rva .LSEH_info_mul_by_3_mod_384_body - -.rva .LSEH_epilogue_mul_by_3_mod_384 -.rva .LSEH_end_mul_by_3_mod_384 -.rva .LSEH_info_mul_by_3_mod_384_epilogue - -.rva .LSEH_begin_mul_by_8_mod_384 -.rva .LSEH_body_mul_by_8_mod_384 -.rva .LSEH_info_mul_by_8_mod_384_prologue - -.rva .LSEH_body_mul_by_8_mod_384 -.rva .LSEH_epilogue_mul_by_8_mod_384 -.rva .LSEH_info_mul_by_8_mod_384_body - -.rva .LSEH_epilogue_mul_by_8_mod_384 -.rva .LSEH_end_mul_by_8_mod_384 -.rva .LSEH_info_mul_by_8_mod_384_epilogue - -.rva .LSEH_begin_mul_by_3_mod_384x -.rva .LSEH_body_mul_by_3_mod_384x -.rva .LSEH_info_mul_by_3_mod_384x_prologue - -.rva .LSEH_body_mul_by_3_mod_384x -.rva .LSEH_epilogue_mul_by_3_mod_384x -.rva .LSEH_info_mul_by_3_mod_384x_body - -.rva .LSEH_epilogue_mul_by_3_mod_384x -.rva .LSEH_end_mul_by_3_mod_384x -.rva .LSEH_info_mul_by_3_mod_384x_epilogue - -.rva .LSEH_begin_mul_by_8_mod_384x -.rva .LSEH_body_mul_by_8_mod_384x -.rva .LSEH_info_mul_by_8_mod_384x_prologue - -.rva .LSEH_body_mul_by_8_mod_384x -.rva .LSEH_epilogue_mul_by_8_mod_384x -.rva .LSEH_info_mul_by_8_mod_384x_body - -.rva .LSEH_epilogue_mul_by_8_mod_384x -.rva .LSEH_end_mul_by_8_mod_384x -.rva .LSEH_info_mul_by_8_mod_384x_epilogue - -.rva .LSEH_begin_cneg_mod_384 -.rva .LSEH_body_cneg_mod_384 -.rva .LSEH_info_cneg_mod_384_prologue - -.rva .LSEH_body_cneg_mod_384 -.rva .LSEH_epilogue_cneg_mod_384 -.rva .LSEH_info_cneg_mod_384_body - -.rva .LSEH_epilogue_cneg_mod_384 -.rva .LSEH_end_cneg_mod_384 -.rva .LSEH_info_cneg_mod_384_epilogue - -.rva .LSEH_begin_sub_mod_384 -.rva .LSEH_body_sub_mod_384 -.rva .LSEH_info_sub_mod_384_prologue - -.rva .LSEH_body_sub_mod_384 -.rva .LSEH_epilogue_sub_mod_384 -.rva .LSEH_info_sub_mod_384_body - -.rva .LSEH_epilogue_sub_mod_384 -.rva .LSEH_end_sub_mod_384 -.rva .LSEH_info_sub_mod_384_epilogue - -.rva .LSEH_begin_sub_mod_384x -.rva .LSEH_body_sub_mod_384x -.rva .LSEH_info_sub_mod_384x_prologue - -.rva .LSEH_body_sub_mod_384x -.rva .LSEH_epilogue_sub_mod_384x -.rva .LSEH_info_sub_mod_384x_body - -.rva .LSEH_epilogue_sub_mod_384x -.rva .LSEH_end_sub_mod_384x 
-.rva .LSEH_info_sub_mod_384x_epilogue - -.rva .LSEH_begin_mul_by_1_plus_i_mod_384x -.rva .LSEH_body_mul_by_1_plus_i_mod_384x -.rva .LSEH_info_mul_by_1_plus_i_mod_384x_prologue - -.rva .LSEH_body_mul_by_1_plus_i_mod_384x -.rva .LSEH_epilogue_mul_by_1_plus_i_mod_384x -.rva .LSEH_info_mul_by_1_plus_i_mod_384x_body - -.rva .LSEH_epilogue_mul_by_1_plus_i_mod_384x -.rva .LSEH_end_mul_by_1_plus_i_mod_384x -.rva .LSEH_info_mul_by_1_plus_i_mod_384x_epilogue - -.rva .LSEH_begin_sgn0_pty_mod_384 -.rva .LSEH_body_sgn0_pty_mod_384 -.rva .LSEH_info_sgn0_pty_mod_384_prologue - -.rva .LSEH_body_sgn0_pty_mod_384 -.rva .LSEH_epilogue_sgn0_pty_mod_384 -.rva .LSEH_info_sgn0_pty_mod_384_body - -.rva .LSEH_epilogue_sgn0_pty_mod_384 -.rva .LSEH_end_sgn0_pty_mod_384 -.rva .LSEH_info_sgn0_pty_mod_384_epilogue - -.rva .LSEH_begin_sgn0_pty_mod_384x -.rva .LSEH_body_sgn0_pty_mod_384x -.rva .LSEH_info_sgn0_pty_mod_384x_prologue - -.rva .LSEH_body_sgn0_pty_mod_384x -.rva .LSEH_epilogue_sgn0_pty_mod_384x -.rva .LSEH_info_sgn0_pty_mod_384x_body - -.rva .LSEH_epilogue_sgn0_pty_mod_384x -.rva .LSEH_end_sgn0_pty_mod_384x -.rva .LSEH_info_sgn0_pty_mod_384x_epilogue - -.section .xdata -.p2align 3 -.LSEH_info_add_mod_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_add_mod_384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_add_mod_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_add_mod_384x_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_add_mod_384x_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x03,0x00 -.byte 0x00,0xe4,0x04,0x00 -.byte 0x00,0xd4,0x05,0x00 -.byte 0x00,0xc4,0x06,0x00 -.byte 0x00,0x34,0x07,0x00 -.byte 0x00,0x54,0x08,0x00 -.byte 0x00,0x74,0x0a,0x00 -.byte 0x00,0x64,0x0b,0x00 -.byte 0x00,0x82 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_add_mod_384x_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_rshift_mod_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_rshift_mod_384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_rshift_mod_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_div_by_2_mod_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_div_by_2_mod_384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_div_by_2_mod_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 
0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_lshift_mod_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_lshift_mod_384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_lshift_mod_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_mul_by_3_mod_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_mul_by_3_mod_384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_mul_by_3_mod_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_mul_by_8_mod_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_mul_by_8_mod_384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_mul_by_8_mod_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_mul_by_3_mod_384x_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_mul_by_3_mod_384x_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_mul_by_3_mod_384x_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_mul_by_8_mod_384x_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_mul_by_8_mod_384x_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_mul_by_8_mod_384x_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_cneg_mod_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_cneg_mod_384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 
0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_cneg_mod_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_sub_mod_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_sub_mod_384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_sub_mod_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_sub_mod_384x_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_sub_mod_384x_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x03,0x00 -.byte 0x00,0xe4,0x04,0x00 -.byte 0x00,0xd4,0x05,0x00 -.byte 0x00,0xc4,0x06,0x00 -.byte 0x00,0x34,0x07,0x00 -.byte 0x00,0x54,0x08,0x00 -.byte 0x00,0x74,0x0a,0x00 -.byte 0x00,0x64,0x0b,0x00 -.byte 0x00,0x82 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_sub_mod_384x_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_mul_by_1_plus_i_mod_384x_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_mul_by_1_plus_i_mod_384x_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x07,0x00 -.byte 0x00,0xe4,0x08,0x00 -.byte 0x00,0xd4,0x09,0x00 -.byte 0x00,0xc4,0x0a,0x00 -.byte 0x00,0x34,0x0b,0x00 -.byte 0x00,0x54,0x0c,0x00 -.byte 0x00,0x74,0x0e,0x00 -.byte 0x00,0x64,0x0f,0x00 -.byte 0x00,0xc2 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_mul_by_1_plus_i_mod_384x_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_sgn0_pty_mod_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_sgn0_pty_mod_384_body: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_sgn0_pty_mod_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_sgn0_pty_mod_384x_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_sgn0_pty_mod_384x_body: -.byte 1,0,9,0 -.byte 0x00,0x34,0x01,0x00 -.byte 0x00,0x54,0x02,0x00 -.byte 0x00,0x74,0x04,0x00 -.byte 0x00,0x64,0x05,0x00 -.byte 0x00,0x22 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_sgn0_pty_mod_384x_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - diff --git a/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s b/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s deleted file mode 100644 index 53662b4a56a..00000000000 --- a/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s +++ /dev/null @@ -1,330 +0,0 @@ -.text - -.def __add_mod_384x384; .scl 3; .type 32; .endef -.p2align 5 -__add_mod_384x384: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq 48(%rsi),%r14 - - addq 0(%rdx),%r8 - movq 56(%rsi),%r15 - adcq 8(%rdx),%r9 - movq 64(%rsi),%rax - adcq 
16(%rdx),%r10 - movq 72(%rsi),%rbx - adcq 24(%rdx),%r11 - movq 80(%rsi),%rbp - adcq 32(%rdx),%r12 - movq 88(%rsi),%rsi - adcq 40(%rdx),%r13 - movq %r8,0(%rdi) - adcq 48(%rdx),%r14 - movq %r9,8(%rdi) - adcq 56(%rdx),%r15 - movq %r10,16(%rdi) - adcq 64(%rdx),%rax - movq %r12,32(%rdi) - movq %r14,%r8 - adcq 72(%rdx),%rbx - movq %r11,24(%rdi) - movq %r15,%r9 - adcq 80(%rdx),%rbp - movq %r13,40(%rdi) - movq %rax,%r10 - adcq 88(%rdx),%rsi - movq %rbx,%r11 - sbbq %rdx,%rdx - - subq 0(%rcx),%r14 - sbbq 8(%rcx),%r15 - movq %rbp,%r12 - sbbq 16(%rcx),%rax - sbbq 24(%rcx),%rbx - sbbq 32(%rcx),%rbp - movq %rsi,%r13 - sbbq 40(%rcx),%rsi - sbbq $0,%rdx - - cmovcq %r8,%r14 - cmovcq %r9,%r15 - cmovcq %r10,%rax - movq %r14,48(%rdi) - cmovcq %r11,%rbx - movq %r15,56(%rdi) - cmovcq %r12,%rbp - movq %rax,64(%rdi) - cmovcq %r13,%rsi - movq %rbx,72(%rdi) - movq %rbp,80(%rdi) - movq %rsi,88(%rdi) - - .byte 0xf3,0xc3 - - -.def __sub_mod_384x384; .scl 3; .type 32; .endef -.p2align 5 -__sub_mod_384x384: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq 48(%rsi),%r14 - - subq 0(%rdx),%r8 - movq 56(%rsi),%r15 - sbbq 8(%rdx),%r9 - movq 64(%rsi),%rax - sbbq 16(%rdx),%r10 - movq 72(%rsi),%rbx - sbbq 24(%rdx),%r11 - movq 80(%rsi),%rbp - sbbq 32(%rdx),%r12 - movq 88(%rsi),%rsi - sbbq 40(%rdx),%r13 - movq %r8,0(%rdi) - sbbq 48(%rdx),%r14 - movq 0(%rcx),%r8 - movq %r9,8(%rdi) - sbbq 56(%rdx),%r15 - movq 8(%rcx),%r9 - movq %r10,16(%rdi) - sbbq 64(%rdx),%rax - movq 16(%rcx),%r10 - movq %r11,24(%rdi) - sbbq 72(%rdx),%rbx - movq 24(%rcx),%r11 - movq %r12,32(%rdi) - sbbq 80(%rdx),%rbp - movq 32(%rcx),%r12 - movq %r13,40(%rdi) - sbbq 88(%rdx),%rsi - movq 40(%rcx),%r13 - sbbq %rdx,%rdx - - andq %rdx,%r8 - andq %rdx,%r9 - andq %rdx,%r10 - andq %rdx,%r11 - andq %rdx,%r12 - andq %rdx,%r13 - - addq %r8,%r14 - adcq %r9,%r15 - movq %r14,48(%rdi) - adcq %r10,%rax - movq %r15,56(%rdi) - adcq %r11,%rbx - movq %rax,64(%rdi) - adcq %r12,%rbp - movq %rbx,72(%rdi) - adcq %r13,%rsi - movq %rbp,80(%rdi) - movq %rsi,88(%rdi) - - .byte 0xf3,0xc3 - - -.globl add_mod_384x384 - -.def add_mod_384x384; .scl 2; .type 32; .endef -.p2align 5 -add_mod_384x384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_add_mod_384x384: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $8,%rsp - -.LSEH_body_add_mod_384x384: - - - call __add_mod_384x384 - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_add_mod_384x384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_add_mod_384x384: - -.globl sub_mod_384x384 - -.def sub_mod_384x384; .scl 2; .type 32; .endef -.p2align 5 -sub_mod_384x384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sub_mod_384x384: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $8,%rsp - -.LSEH_body_sub_mod_384x384: - - - call __sub_mod_384x384 - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_sub_mod_384x384: - mov 8(%rsp),%rdi - mov 
16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_sub_mod_384x384: -.section .pdata -.p2align 2 -.rva .LSEH_begin_add_mod_384x384 -.rva .LSEH_body_add_mod_384x384 -.rva .LSEH_info_add_mod_384x384_prologue - -.rva .LSEH_body_add_mod_384x384 -.rva .LSEH_epilogue_add_mod_384x384 -.rva .LSEH_info_add_mod_384x384_body - -.rva .LSEH_epilogue_add_mod_384x384 -.rva .LSEH_end_add_mod_384x384 -.rva .LSEH_info_add_mod_384x384_epilogue - -.rva .LSEH_begin_sub_mod_384x384 -.rva .LSEH_body_sub_mod_384x384 -.rva .LSEH_info_sub_mod_384x384_prologue - -.rva .LSEH_body_sub_mod_384x384 -.rva .LSEH_epilogue_sub_mod_384x384 -.rva .LSEH_info_sub_mod_384x384_body - -.rva .LSEH_epilogue_sub_mod_384x384 -.rva .LSEH_end_sub_mod_384x384 -.rva .LSEH_info_sub_mod_384x384_epilogue - -.section .xdata -.p2align 3 -.LSEH_info_add_mod_384x384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_add_mod_384x384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_add_mod_384x384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_sub_mod_384x384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_sub_mod_384x384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_sub_mod_384x384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S deleted file mode 100644 index d2fd83182b4..00000000000 --- a/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S +++ /dev/null @@ -1,799 +0,0 @@ -.text - -.globl ct_inverse_mod_256 - -.def ct_inverse_mod_256; -.type 32; -.endef -.p2align 5 -ct_inverse_mod_256: -.long 3573752639 - stp x29, x30, [sp,#-80]! - add x29, sp, #0 - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - sub sp, sp, #1040 - - ldp x4, x5, [x1,#8*0] - ldp x6, x7, [x1,#8*2] - - add x1, sp, #16+511 // find closest 512-byte-aligned spot - and x1, x1, #-512 // in the frame... 
- str x0, [sp] - - ldp x8, x9, [x2,#8*0] - ldp x10, x11, [x2,#8*2] - - stp x4, x5, [x1,#8*0] // copy input to |a| - stp x6, x7, [x1,#8*2] - stp x8, x9, [x1,#8*4] // copy modulus to |b| - stp x10, x11, [x1,#8*6] - - ////////////////////////////////////////// first iteration - bl .Lab_approximation_31_256_loaded - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - str x12,[x0,#8*8] // initialize |u| with |f0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to dst |b| - bl __smul_256_n_shift_by_31 - str x12, [x0,#8*9] // initialize |v| with |f1| - - ////////////////////////////////////////// second iteration - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - ldr x8, [x1,#8*8] // |u| - ldr x9, [x1,#8*13] // |v| - madd x4, x16, x8, xzr // |u|*|f0| - madd x4, x17, x9, x4 // |v|*|g0| - str x4, [x0,#8*4] - asr x5, x4, #63 // sign extension - stp x5, x5, [x0,#8*5] - stp x5, x5, [x0,#8*7] - - madd x4, x12, x8, xzr // |u|*|f1| - madd x4, x13, x9, x4 // |v|*|g1| - str x4, [x0,#8*9] - asr x5, x4, #63 // sign extension - stp x5, x5, [x0,#8*10] - stp x5, x5, [x0,#8*12] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst 
|a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc 
x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - ////////////////////////////////////////// two[!] 
last iterations - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #47 // 31 + 512 % 31 - //bl __ab_approximation_62_256 // |a| and |b| are exact, - ldr x7, [x1,#8*0] // just load - ldr x11, [x1,#8*4] - bl __inner_loop_62_256 - - mov x16, x14 - mov x17, x15 - ldr x0, [sp] // original out_ptr - bl __smul_256x63 - bl __smul_512x63_tail - ldr x30, [x29,#8] - - smulh x20, x7, x17 // figure out top-most limb - ldp x8, x9, [x3,#8*0] - adc x23, x23, x25 - ldp x10, x11, [x3,#8*2] - - add x20, x20, x23 // x20 is 1, 0 or -1 - asr x19, x20, #63 // sign as mask - - and x23, x8, x19 // add mod<<256 conditionally - and x24, x9, x19 - adds x4, x4, x23 - and x25, x10, x19 - adcs x5, x5, x24 - and x26, x11, x19 - adcs x6, x6, x25 - adcs x7, x22, x26 - adc x20, x20, xzr // x20 is 1, 0 or -1 - - neg x19, x20 - orr x20, x20, x19 // excess bit or sign as mask - asr x19, x19, #63 // excess bit as mask - - and x8, x8, x20 // mask |mod| - and x9, x9, x20 - and x10, x10, x20 - and x11, x11, x20 - - eor x8, x8, x19 // conditionally negate |mod| - eor x9, x9, x19 - adds x8, x8, x19, lsr#63 - eor x10, x10, x19 - adcs x9, x9, xzr - eor x11, x11, x19 - adcs x10, x10, xzr - adc x11, x11, xzr - - adds x4, x4, x8 // final adjustment for |mod|<<256 - adcs x5, x5, x9 - adcs x6, x6, x10 - stp x4, x5, [x0,#8*4] - adc x7, x7, x11 - stp x6, x7, [x0,#8*6] - - add sp, sp, #1040 - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldr x29, [sp],#80 -.long 3573752767 - ret - - -//////////////////////////////////////////////////////////////////////// -.def __smul_256x63; -.type 32; -.endef -.p2align 5 -__smul_256x63: - ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) - asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) - ldp x6, x7, [x1,#8*2+64] - eor x16, x16, x14 // conditionally negate |f_| (or |g_|) - ldr x22, [x1,#8*4+64] - - eor x4, x4, x14 // conditionally negate |u| (or |v|) - sub x16, x16, x14 - eor x5, x5, x14 - adds x4, x4, x14, lsr#63 - eor x6, x6, x14 - adcs x5, x5, xzr - eor x7, x7, x14 - adcs x6, x6, xzr - eor x22, x22, x14 - umulh x19, x4, x16 - adcs x7, x7, xzr - umulh x20, x5, x16 - adcs x22, x22, xzr - umulh x21, x6, x16 - mul x4, x4, x16 - cmp x16, #0 - mul x5, x5, x16 - csel x22, x22, xzr, ne - mul x6, x6, x16 - adds x5, x5, x19 - mul x24, x7, x16 - adcs x6, x6, x20 - adcs x24, x24, x21 - adc x26, xzr, xzr - ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) - asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) - ldp x10, x11, [x1,#8*2+104] - eor x17, x17, x14 // conditionally negate |f_| (or |g_|) - ldr x23, [x1,#8*4+104] - - eor x8, x8, x14 // conditionally negate |u| (or |v|) - sub x17, x17, x14 - eor x9, x9, x14 - adds x8, x8, x14, lsr#63 - eor x10, x10, x14 - adcs x9, x9, xzr - eor x11, x11, x14 - adcs x10, x10, xzr - eor x23, x23, x14 - umulh x19, x8, x17 - adcs x11, x11, xzr - umulh x20, x9, x17 - adcs x23, x23, xzr - umulh x21, x10, x17 - adc x15, xzr, xzr // used in __smul_512x63_tail - mul x8, x8, x17 - cmp x17, #0 - mul x9, x9, x17 - csel x23, x23, xzr, ne - mul x10, x10, x17 - adds x9, x9, x19 - mul x25, x11, x17 - adcs x10, x10, x20 - adcs x25, x25, x21 - adc x26, x26, xzr - - adds x4, x4, x8 - adcs x5, x5, x9 - adcs x6, x6, x10 - stp x4, x5, [x0,#8*0] - adcs x24, x24, x25 - stp x6, x24, [x0,#8*2] - - ret - - -.def __smul_512x63_tail; -.type 32; -.endef -.p2align 5 -__smul_512x63_tail: - umulh x24, x7, x16 - ldp x5, x6, [x1,#8*18] // load rest of |v| - adc x26, x26, xzr - ldr x7, [x1,#8*20] - and x22, x22, x16 - - umulh x11, x11, x17 // resume 
|v|*|g1| chain - - sub x24, x24, x22 // tie up |u|*|f1| chain - asr x25, x24, #63 - - eor x5, x5, x14 // conditionally negate rest of |v| - eor x6, x6, x14 - adds x5, x5, x15 - eor x7, x7, x14 - adcs x6, x6, xzr - umulh x19, x23, x17 - adc x7, x7, xzr - umulh x20, x5, x17 - add x11, x11, x26 - umulh x21, x6, x17 - - mul x4, x23, x17 - mul x5, x5, x17 - adds x4, x4, x11 - mul x6, x6, x17 - adcs x5, x5, x19 - mul x22, x7, x17 - adcs x6, x6, x20 - adcs x22, x22, x21 - adc x23, xzr, xzr // used in the final step - - adds x4, x4, x24 - adcs x5, x5, x25 - adcs x6, x6, x25 - stp x4, x5, [x0,#8*4] - adcs x22, x22, x25 // carry is used in the final step - stp x6, x22, [x0,#8*6] - - ret - - -.def __smul_256_n_shift_by_31; -.type 32; -.endef -.p2align 5 -__smul_256_n_shift_by_31: - ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) - asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) - ldp x6, x7, [x1,#8*2+0] - eor x25, x12, x24 // conditionally negate |f0| (or |g0|) - - eor x4, x4, x24 // conditionally negate |a| (or |b|) - sub x25, x25, x24 - eor x5, x5, x24 - adds x4, x4, x24, lsr#63 - eor x6, x6, x24 - adcs x5, x5, xzr - eor x7, x7, x24 - umulh x19, x4, x25 - adcs x6, x6, xzr - umulh x20, x5, x25 - adc x7, x7, xzr - umulh x21, x6, x25 - and x24, x24, x25 - umulh x22, x7, x25 - neg x24, x24 - - mul x4, x4, x25 - mul x5, x5, x25 - mul x6, x6, x25 - adds x5, x5, x19 - mul x7, x7, x25 - adcs x6, x6, x20 - adcs x7, x7, x21 - adc x22, x22, x24 - ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) - asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) - ldp x10, x11, [x1,#8*2+32] - eor x25, x13, x24 // conditionally negate |f0| (or |g0|) - - eor x8, x8, x24 // conditionally negate |a| (or |b|) - sub x25, x25, x24 - eor x9, x9, x24 - adds x8, x8, x24, lsr#63 - eor x10, x10, x24 - adcs x9, x9, xzr - eor x11, x11, x24 - umulh x19, x8, x25 - adcs x10, x10, xzr - umulh x20, x9, x25 - adc x11, x11, xzr - umulh x21, x10, x25 - and x24, x24, x25 - umulh x23, x11, x25 - neg x24, x24 - - mul x8, x8, x25 - mul x9, x9, x25 - mul x10, x10, x25 - adds x9, x9, x19 - mul x11, x11, x25 - adcs x10, x10, x20 - adcs x11, x11, x21 - adc x23, x23, x24 - adds x4, x4, x8 - adcs x5, x5, x9 - adcs x6, x6, x10 - adcs x7, x7, x11 - adc x8, x22, x23 - - extr x4, x5, x4, #31 - extr x5, x6, x5, #31 - extr x6, x7, x6, #31 - asr x23, x8, #63 // result's sign as mask - extr x7, x8, x7, #31 - - eor x4, x4, x23 // ensure the result is positive - eor x5, x5, x23 - adds x4, x4, x23, lsr#63 - eor x6, x6, x23 - adcs x5, x5, xzr - eor x7, x7, x23 - adcs x6, x6, xzr - stp x4, x5, [x0,#8*0] - adc x7, x7, xzr - stp x6, x7, [x0,#8*2] - - eor x12, x12, x23 // adjust |f/g| accordingly - eor x13, x13, x23 - sub x12, x12, x23 - sub x13, x13, x23 - - ret - -.def __ab_approximation_31_256; -.type 32; -.endef -.p2align 4 -__ab_approximation_31_256: - ldp x6, x7, [x1,#8*2] - ldp x10, x11, [x1,#8*6] - ldp x4, x5, [x1,#8*0] - ldp x8, x9, [x1,#8*4] - -.Lab_approximation_31_256_loaded: - orr x19, x7, x11 // check top-most limbs, ... - cmp x19, #0 - csel x7, x7, x6, ne - csel x11, x11, x10, ne - csel x6, x6, x5, ne - orr x19, x7, x11 // and ones before top-most, ... - csel x10, x10, x9, ne - - cmp x19, #0 - csel x7, x7, x6, ne - csel x11, x11, x10, ne - csel x6, x6, x4, ne - orr x19, x7, x11 // and one more, ... 
- csel x10, x10, x8, ne - - clz x19, x19 - cmp x19, #64 - csel x19, x19, xzr, ne - csel x7, x7, x6, ne - csel x11, x11, x10, ne - neg x20, x19 - - lslv x7, x7, x19 // align high limbs to the left - lslv x11, x11, x19 - lsrv x6, x6, x20 - lsrv x10, x10, x20 - and x6, x6, x20, asr#6 - and x10, x10, x20, asr#6 - orr x7, x7, x6 - orr x11, x11, x10 - - bfxil x7, x4, #0, #31 - bfxil x11, x8, #0, #31 - - b __inner_loop_31_256 - ret - - -.def __inner_loop_31_256; -.type 32; -.endef -.p2align 4 -__inner_loop_31_256: - mov x2, #31 - mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 - mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 - mov x23,#0x7FFFFFFF7FFFFFFF - -.Loop_31_256: - sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting - sub x2, x2, #1 - and x19, x11, x22 - sub x20, x11, x7 // |b_|-|a_| - subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) - mov x19, x15 - csel x11, x11, x7, hs // |b_| = |a_| - csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel x15, x15, x13, hs // exchange |fg0| and |fg1| - csel x13, x13, x19, hs - lsr x7, x7, #1 - and x19, x15, x22 - and x20, x23, x22 - sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - add x15, x15, x15 // |f1|<<=1 - add x13, x13, x20 - sub x15, x15, x23 - cbnz x2, .Loop_31_256 - - mov x23, #0x7FFFFFFF - ubfx x12, x13, #0, #32 - ubfx x13, x13, #32, #32 - ubfx x14, x15, #0, #32 - ubfx x15, x15, #32, #32 - sub x12, x12, x23 // remove bias - sub x13, x13, x23 - sub x14, x14, x23 - sub x15, x15, x23 - - ret - - -.def __inner_loop_62_256; -.type 32; -.endef -.p2align 4 -__inner_loop_62_256: - mov x12, #1 // |f0|=1 - mov x13, #0 // |g0|=0 - mov x14, #0 // |f1|=0 - mov x15, #1 // |g1|=1 - -.Loop_62_256: - sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting - sub x2, x2, #1 - and x19, x11, x22 - sub x20, x11, x7 // |b_|-|a_| - subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) - mov x19, x12 - csel x11, x11, x7, hs // |b_| = |a_| - csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - mov x20, x13 - csel x12, x12, x14, hs // exchange |f0| and |f1| - csel x14, x14, x19, hs - csel x13, x13, x15, hs // exchange |g0| and |g1| - csel x15, x15, x20, hs - lsr x7, x7, #1 - and x19, x14, x22 - and x20, x15, x22 - add x14, x14, x14 // |f1|<<=1 - add x15, x15, x15 // |g1|<<=1 - sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
- cbnz x2, .Loop_62_256 - - ret - diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s deleted file mode 100644 index d1aa7597bc0..00000000000 --- a/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s +++ /dev/null @@ -1,1213 +0,0 @@ -.text - -.globl ct_inverse_mod_256 - -.def ct_inverse_mod_256; .scl 2; .type 32; .endef -.p2align 5 -ct_inverse_mod_256: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_ct_inverse_mod_256: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $1072,%rsp - -.LSEH_body_ct_inverse_mod_256: - - - leaq 48+511(%rsp),%rax - andq $-512,%rax - movq %rdi,32(%rsp) - movq %rcx,40(%rsp) - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - - movq 0(%rdx),%r12 - movq 8(%rdx),%r13 - movq 16(%rdx),%r14 - movq 24(%rdx),%r15 - - movq %r8,0(%rax) - movq %r9,8(%rax) - movq %r10,16(%rax) - movq %r11,24(%rax) - - movq %r12,32(%rax) - movq %r13,40(%rax) - movq %r14,48(%rax) - movq %r15,56(%rax) - movq %rax,%rsi - - - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - - - movq %rdx,64(%rdi) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - - - movq %rdx,72(%rdi) - - - xorq $256,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - - - - movq 64(%rsi),%r8 - movq 104(%rsi),%r12 - movq %r8,%r9 - imulq 0(%rsp),%r8 - movq %r12,%r13 - imulq 8(%rsp),%r12 - addq %r12,%r8 - movq %r8,32(%rdi) - sarq $63,%r8 - movq %r8,40(%rdi) - movq %r8,48(%rdi) - movq %r8,56(%rdi) - movq %r8,64(%rdi) - leaq 64(%rsi),%rsi - - imulq %rdx,%r9 - imulq %rcx,%r13 - addq %r13,%r9 - movq %r9,72(%rdi) - sarq $63,%r9 - movq %r9,80(%rdi) - movq %r9,88(%rdi) - movq %r9,96(%rdi) - movq %r9,104(%rdi) - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call 
__smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - sarq $63,%rbp - movq %rbp,40(%rdi) - movq %rbp,48(%rdi) - movq %rbp,56(%rdi) - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - 
call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - - xorq $256+64,%rsi - movl $47,%edx - - movq 0(%rsi),%r8 - - movq 32(%rsi),%r10 - - call __inner_loop_62_256 - - - - - - - - leaq 64(%rsi),%rsi - - - - - - movq %r12,%rdx - movq %r13,%rcx - movq 32(%rsp),%rdi - call __smulq_512x63 - adcq %rbp,%rdx - - movq 40(%rsp),%rsi - movq %rdx,%rax - sarq $63,%rdx - - movq %rdx,%r8 - movq %rdx,%r9 - andq 0(%rsi),%r8 - movq %rdx,%r10 - andq 8(%rsi),%r9 - andq 16(%rsi),%r10 - andq 24(%rsi),%rdx - - addq %r8,%r12 - adcq %r9,%r13 - adcq %r10,%r14 - adcq %rdx,%r15 - adcq $0,%rax - - movq %rax,%rdx - negq %rax - orq %rax,%rdx - sarq $63,%rax - - movq %rdx,%r8 - movq %rdx,%r9 - andq 0(%rsi),%r8 - movq %rdx,%r10 - andq 8(%rsi),%r9 - andq 16(%rsi),%r10 - andq 24(%rsi),%rdx - - xorq %rax,%r8 - xorq %rcx,%rcx - xorq %rax,%r9 - subq %rax,%rcx - xorq %rax,%r10 - xorq %rax,%rdx - addq %rcx,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%rdx - - addq %r8,%r12 - adcq %r9,%r13 - adcq %r10,%r14 - adcq %rdx,%r15 - - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - leaq 1072(%rsp),%r8 - movq 
0(%r8),%r15 - - movq 8(%r8),%r14 - - movq 16(%r8),%r13 - - movq 24(%r8),%r12 - - movq 32(%r8),%rbx - - movq 40(%r8),%rbp - - leaq 48(%r8),%rsp - -.LSEH_epilogue_ct_inverse_mod_256: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_ct_inverse_mod_256: -.def __smulq_512x63; .scl 3; .type 32; .endef -.p2align 5 -__smulq_512x63: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%rbp - - movq %rdx,%rbx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbx - addq %rax,%rbx - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%rbp - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%rbp - - mulq %rbx - movq %rax,0(%rdi) - movq %r9,%rax - movq %rdx,%r9 - mulq %rbx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %r9,8(%rdi) - movq %rdx,%r10 - mulq %rbx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %r10,16(%rdi) - movq %rdx,%r11 - andq %rbx,%rbp - negq %rbp - mulq %rbx - addq %rax,%r11 - adcq %rdx,%rbp - movq %r11,24(%rdi) - - movq 40(%rsi),%r8 - movq 48(%rsi),%r9 - movq 56(%rsi),%r10 - movq 64(%rsi),%r11 - movq 72(%rsi),%r12 - movq 80(%rsi),%r13 - movq 88(%rsi),%r14 - movq 96(%rsi),%r15 - - movq %rcx,%rdx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rcx - addq %rax,%rcx - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - xorq %rdx,%r14 - xorq %rdx,%r15 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - - mulq %rcx - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rcx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rcx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rcx - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rcx - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - mulq %rcx - addq %rax,%r13 - movq %r14,%rax - adcq $0,%rdx - movq %rdx,%r14 - mulq %rcx - addq %rax,%r14 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r15 - imulq %rcx - addq %rax,%r15 - adcq $0,%rdx - - movq %rbp,%rbx - sarq $63,%rbp - - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq %rbx,%r12 - adcq %rbp,%r13 - adcq %rbp,%r14 - adcq %rbp,%r15 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - .byte 0xf3,0xc3 - - -.def __smulq_256x63; .scl 3; .type 32; .endef -.p2align 5 -__smulq_256x63: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0+0(%rsi),%r8 - movq 0+8(%rsi),%r9 - movq 0+16(%rsi),%r10 - movq 0+24(%rsi),%r11 - movq 0+32(%rsi),%rbp - - movq %rdx,%rbx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbx - addq %rax,%rbx - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%rbp - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%rbp - - mulq %rbx - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - andq %rbx,%rbp - negq %rbp - mulq %rbx - addq %rax,%r11 - adcq %rdx,%rbp - movq %rcx,%rdx - movq 40+0(%rsi),%r12 - movq 40+8(%rsi),%r13 - movq 40+16(%rsi),%r14 - movq 40+24(%rsi),%r15 - movq 40+32(%rsi),%rcx - - movq %rdx,%rbx - sarq $63,%rdx - xorq %rax,%rax - subq 
%rdx,%rax - - xorq %rdx,%rbx - addq %rax,%rbx - - xorq %rdx,%r12 - xorq %rdx,%r13 - xorq %rdx,%r14 - xorq %rdx,%r15 - xorq %rdx,%rcx - addq %r12,%rax - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - adcq $0,%rcx - - mulq %rbx - movq %rax,%r12 - movq %r13,%rax - movq %rdx,%r13 - mulq %rbx - addq %rax,%r13 - movq %r14,%rax - adcq $0,%rdx - movq %rdx,%r14 - mulq %rbx - addq %rax,%r14 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r15 - andq %rbx,%rcx - negq %rcx - mulq %rbx - addq %rax,%r15 - adcq %rdx,%rcx - addq %r12,%r8 - adcq %r13,%r9 - adcq %r14,%r10 - adcq %r15,%r11 - adcq %rcx,%rbp - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %rbp,32(%rdi) - - .byte 0xf3,0xc3 - -.def __smulq_256_n_shift_by_31; .scl 3; .type 32; .endef -.p2align 5 -__smulq_256_n_shift_by_31: - .byte 0xf3,0x0f,0x1e,0xfa - - movq %rdx,0(%rdi) - movq %rcx,8(%rdi) - movq %rdx,%rbp - movq 0+0(%rsi),%r8 - movq 0+8(%rsi),%r9 - movq 0+16(%rsi),%r10 - movq 0+24(%rsi),%r11 - - movq %rbp,%rbx - sarq $63,%rbp - xorq %rax,%rax - subq %rbp,%rax - - xorq %rbp,%rbx - addq %rax,%rbx - - xorq %rbp,%r8 - xorq %rbp,%r9 - xorq %rbp,%r10 - xorq %rbp,%r11 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - - mulq %rbx - movq %rax,%r8 - movq %r9,%rax - andq %rbx,%rbp - negq %rbp - movq %rdx,%r9 - mulq %rbx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbx - addq %rax,%r11 - adcq %rdx,%rbp - movq 32+0(%rsi),%r12 - movq 32+8(%rsi),%r13 - movq 32+16(%rsi),%r14 - movq 32+24(%rsi),%r15 - - movq %rcx,%rbx - sarq $63,%rcx - xorq %rax,%rax - subq %rcx,%rax - - xorq %rcx,%rbx - addq %rax,%rbx - - xorq %rcx,%r12 - xorq %rcx,%r13 - xorq %rcx,%r14 - xorq %rcx,%r15 - addq %r12,%rax - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - - mulq %rbx - movq %rax,%r12 - movq %r13,%rax - andq %rbx,%rcx - negq %rcx - movq %rdx,%r13 - mulq %rbx - addq %rax,%r13 - movq %r14,%rax - adcq $0,%rdx - movq %rdx,%r14 - mulq %rbx - addq %rax,%r14 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r15 - mulq %rbx - addq %rax,%r15 - adcq %rdx,%rcx - addq %r12,%r8 - adcq %r13,%r9 - adcq %r14,%r10 - adcq %r15,%r11 - adcq %rcx,%rbp - - movq 0(%rdi),%rdx - movq 8(%rdi),%rcx - - shrdq $31,%r9,%r8 - shrdq $31,%r10,%r9 - shrdq $31,%r11,%r10 - shrdq $31,%rbp,%r11 - - sarq $63,%rbp - xorq %rax,%rax - subq %rbp,%rax - - xorq %rbp,%r8 - xorq %rbp,%r9 - xorq %rbp,%r10 - xorq %rbp,%r11 - addq %rax,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - xorq %rbp,%rdx - xorq %rbp,%rcx - addq %rax,%rdx - addq %rax,%rcx - - .byte 0xf3,0xc3 - -.def __ab_approximation_31_256; .scl 3; .type 32; .endef -.p2align 5 -__ab_approximation_31_256: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 24(%rsi),%r9 - movq 56(%rsi),%r11 - movq 16(%rsi),%rbx - movq 48(%rsi),%rbp - movq 8(%rsi),%r8 - movq 40(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - movq 0(%rsi),%r8 - cmovzq %r10,%rbp - movq 32(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - cmovzq %r10,%rbp - - movq %r9,%rax - orq %r11,%rax - bsrq %rax,%rcx - leaq 1(%rcx),%rcx - cmovzq %r8,%r9 - cmovzq %r10,%r11 - cmovzq %rax,%rcx - negq %rcx - - - shldq %cl,%rbx,%r9 - shldq %cl,%rbp,%r11 - - movl $0x7FFFFFFF,%eax - andq %rax,%r8 - andq %rax,%r10 - notq %rax - andq %rax,%r9 - andq %rax,%r11 - orq %r9,%r8 - orq %r11,%r10 - - jmp 
__inner_loop_31_256 - - .byte 0xf3,0xc3 - -.def __inner_loop_31_256; .scl 3; .type 32; .endef -.p2align 5 -__inner_loop_31_256: - .byte 0xf3,0x0f,0x1e,0xfa - - movq $0x7FFFFFFF80000000,%rcx - movq $0x800000007FFFFFFF,%r13 - movq $0x7FFFFFFF7FFFFFFF,%r15 - -.Loop_31_256: - cmpq %r10,%r8 - movq %r8,%rax - movq %r10,%rbx - movq %rcx,%rbp - movq %r13,%r14 - cmovbq %r10,%r8 - cmovbq %rax,%r10 - cmovbq %r13,%rcx - cmovbq %rbp,%r13 - - subq %r10,%r8 - subq %r13,%rcx - addq %r15,%rcx - - testq $1,%rax - cmovzq %rax,%r8 - cmovzq %rbx,%r10 - cmovzq %rbp,%rcx - cmovzq %r14,%r13 - - shrq $1,%r8 - addq %r13,%r13 - subq %r15,%r13 - subl $1,%edx - jnz .Loop_31_256 - - shrq $32,%r15 - movl %ecx,%edx - movl %r13d,%r12d - shrq $32,%rcx - shrq $32,%r13 - subq %r15,%rdx - subq %r15,%rcx - subq %r15,%r12 - subq %r15,%r13 - - .byte 0xf3,0xc3 - - -.def __inner_loop_62_256; .scl 3; .type 32; .endef -.p2align 5 -__inner_loop_62_256: - .byte 0xf3,0x0f,0x1e,0xfa - - movl %edx,%r15d - movq $1,%rdx - xorq %rcx,%rcx - xorq %r12,%r12 - movq %rdx,%r13 - movq %rdx,%r14 - -.Loop_62_256: - xorq %rax,%rax - testq %r14,%r8 - movq %r10,%rbx - cmovnzq %r10,%rax - subq %r8,%rbx - movq %r8,%rbp - subq %rax,%r8 - cmovcq %rbx,%r8 - cmovcq %rbp,%r10 - movq %rdx,%rax - cmovcq %r12,%rdx - cmovcq %rax,%r12 - movq %rcx,%rbx - cmovcq %r13,%rcx - cmovcq %rbx,%r13 - xorq %rax,%rax - xorq %rbx,%rbx - shrq $1,%r8 - testq %r14,%rbp - cmovnzq %r12,%rax - cmovnzq %r13,%rbx - addq %r12,%r12 - addq %r13,%r13 - subq %rax,%rdx - subq %rbx,%rcx - subl $1,%r15d - jnz .Loop_62_256 - - .byte 0xf3,0xc3 - -.section .pdata -.p2align 2 -.rva .LSEH_begin_ct_inverse_mod_256 -.rva .LSEH_body_ct_inverse_mod_256 -.rva .LSEH_info_ct_inverse_mod_256_prologue - -.rva .LSEH_body_ct_inverse_mod_256 -.rva .LSEH_epilogue_ct_inverse_mod_256 -.rva .LSEH_info_ct_inverse_mod_256_body - -.rva .LSEH_epilogue_ct_inverse_mod_256 -.rva .LSEH_end_ct_inverse_mod_256 -.rva .LSEH_info_ct_inverse_mod_256_epilogue - -.section .xdata -.p2align 3 -.LSEH_info_ct_inverse_mod_256_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_ct_inverse_mod_256_body: -.byte 1,0,18,0 -.byte 0x00,0xf4,0x86,0x00 -.byte 0x00,0xe4,0x87,0x00 -.byte 0x00,0xd4,0x88,0x00 -.byte 0x00,0xc4,0x89,0x00 -.byte 0x00,0x34,0x8a,0x00 -.byte 0x00,0x54,0x8b,0x00 -.byte 0x00,0x74,0x8d,0x00 -.byte 0x00,0x64,0x8e,0x00 -.byte 0x00,0x01,0x8c,0x00 -.byte 0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_ct_inverse_mod_256_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S deleted file mode 100644 index 86fdc405828..00000000000 --- a/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S +++ /dev/null @@ -1,730 +0,0 @@ -.text - -.globl ct_inverse_mod_383 - -.def ct_inverse_mod_383; -.type 32; -.endef -.p2align 5 -ct_inverse_mod_383: -.long 3573752639 - stp x29, x30, [sp,#-128]! - add x29, sp, #0 - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - stp x27, x28, [sp,#80] - sub sp, sp, #1040 - - ldp x22, x4, [x1,#8*0] - ldp x5, x6, [x1,#8*2] - ldp x7, x8, [x1,#8*4] - - add x1, sp, #16+511 // find closest 512-byte-aligned spot - and x1, x1, #-512 // in the frame... 
- stp x0, x3, [sp] - - ldp x9, x10, [x2,#8*0] - ldp x11, x12, [x2,#8*2] - ldp x13, x14, [x2,#8*4] - - stp x22, x4, [x1,#8*0] // copy input to |a| - stp x5, x6, [x1,#8*2] - stp x7, x8, [x1,#8*4] - stp x9, x10, [x1,#8*6] // copy modulus to |b| - stp x11, x12, [x1,#8*8] - stp x13, x14, [x1,#8*10] - - ////////////////////////////////////////// first iteration - mov x2, #62 - bl .Lab_approximation_62_loaded - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - str x15,[x0,#8*12] // initialize |u| with |f0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to dst |b| - bl __smul_383_n_shift_by_62 - str x15, [x0,#8*12] // initialize |v| with |f1| - - ////////////////////////////////////////// second iteration - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - ldr x7, [x1,#8*12] // |u| - ldr x8, [x1,#8*18] // |v| - mul x3, x20, x7 // |u|*|f0| - smulh x4, x20, x7 - mul x5, x21, x8 // |v|*|g0| - smulh x6, x21, x8 - adds x3, x3, x5 - adc x4, x4, x6 - stp x3, x4, [x0,#8*6] - asr x5, x4, #63 // sign extension - stp x5, x5, [x0,#8*8] - stp x5, x5, [x0,#8*10] - - mul x3, x15, x7 // |u|*|f1| - smulh x4, x15, x7 - mul x5, x16, x8 // |v|*|g1| - smulh x6, x16, x8 - adds x3, x3, x5 - adc x4, x4, x6 - stp x3, x4, [x0,#8*12] - asr x5, x4, #63 // sign extension - stp x5, x5, [x0,#8*14] - stp x5, x5, [x0,#8*16] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // 
corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - asr x27, x27, #63 // sign extension - stp x27, x27, [x0,#8*6] - stp x27, x27, [x0,#8*8] - stp x27, x27, [x0,#8*10] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - ////////////////////////////////////////// iteration before last - eor x1, x1, #256 // flip-flop src 
|a|b|u|v| - mov x2, #62 - //bl __ab_approximation_62 // |a| and |b| are exact, - ldp x3, x8, [x1,#8*0] // just load - ldp x9, x14, [x1,#8*6] - bl __inner_loop_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - str x3, [x0,#8*0] - str x9, [x0,#8*6] - - mov x20, x15 // exact |f0| - mov x21, x16 // exact |g0| - mov x15, x17 - mov x16, x19 - add x0, x0, #8*12 // pointer to dst |u| - bl __smul_383x63 - - mov x20, x15 // exact |f1| - mov x21, x16 // exact |g1| - add x0, x0, #8*6 // pointer to dst |v| - bl __smul_383x63 - bl __smul_767x63_tail - - ////////////////////////////////////////// last iteration - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #22 // 766 % 62 - //bl __ab_approximation_62 // |a| and |b| are exact, - ldr x3, [x1,#8*0] // just load - eor x8, x8, x8 - ldr x9, [x1,#8*6] - eor x14, x14, x14 - bl __inner_loop_62 - - mov x20, x17 - mov x21, x19 - ldp x0, x15, [sp] // original out_ptr and n_ptr - bl __smul_383x63 - bl __smul_767x63_tail - ldr x30, [x29,#8] - - asr x22, x8, #63 // sign as mask - ldp x9, x10, [x15,#8*0] - ldp x11, x12, [x15,#8*2] - ldp x13, x14, [x15,#8*4] - - and x9, x9, x22 // add mod<<384 conditionally - and x10, x10, x22 - adds x3, x3, x9 - and x11, x11, x22 - adcs x4, x4, x10 - and x12, x12, x22 - adcs x5, x5, x11 - and x13, x13, x22 - adcs x6, x6, x12 - and x14, x14, x22 - stp x3, x4, [x0,#8*6] - adcs x7, x7, x13 - stp x5, x6, [x0,#8*8] - adc x8, x8, x14 - stp x7, x8, [x0,#8*10] - - add sp, sp, #1040 - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldp x27, x28, [x29,#80] - ldr x29, [sp],#128 -.long 3573752767 - ret - - -//////////////////////////////////////////////////////////////////////// -// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
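
The routine whose body ends above, ct_inverse_mod_383, is (per its own comments) a batched binary extended GCD: __ab_approximation_62 condenses |a| and |b| into top-and-bottom-limb approximations, __inner_loop_62 runs 62 iterations on those to accumulate the transition factors |f0|, |g0|, |f1|, |g1|, and the __smul_383* helpers defined below apply the factors to the full-width |a|, |b|, |u|, |v|. The total is 766 iterations (matching the usual ~2x bit-length bound for binary GCD on 383-bit operands), processed in 62-step batches plus a final 22-step batch, since 766 % 62 = 22 as the comment above notes. As a rough illustration of the bookkeeping being batched here — not the deleted implementation, which is constant-time and limb-oriented — a minimal variable-time sketch in Go might look like the following; binaryInverse and the example modulus are illustrative names chosen for this sketch:

package main

import (
	"fmt"
	"math/big"
)

// binaryInverse returns x^-1 mod m for odd m with gcd(x, m) = 1.
// It mirrors the |a|/|b|/|u|/|v| bookkeeping named in the assembly
// comments, one bit at a time. Unlike the deleted code, it halves |u|
// eagerly instead of deferring the work into |f0|,|g0|,|f1|,|g1|
// transition factors applied 62 steps at a time.
func binaryInverse(x, m *big.Int) *big.Int {
	a := new(big.Int).Set(x) // |a|, invariant: a == u*x (mod m)
	b := new(big.Int).Set(m) // |b|, invariant: b == v*x (mod m)
	u := big.NewInt(1)       // |u|
	v := big.NewInt(0)       // |v|
	for a.Sign() != 0 {
		if a.Bit(0) == 0 { // |a_| even: halve it
			a.Rsh(a, 1)
			if u.Bit(0) == 1 {
				u.Add(u, m) // u+m is even and congruent to u (mod m)
			}
			u.Rsh(u, 1)
		} else {
			if a.Cmp(b) < 0 { // "borrow means |a_|<|b_|": exchange the pairs
				a, b = b, a
				u, v = v, u
			}
			a.Sub(a, b) // |a_|-|b_| (even, so the next pass halves it)
			u.Sub(u, v) // the |f0|-=|f1| / |g0|-=|g1| analogue
		}
	}
	// Here b == gcd(x, m) == 1 and v*x == 1 (mod m).
	return v.Mod(v, m)
}

func main() {
	m := big.NewInt(1000003) // example odd prime modulus
	x := big.NewInt(123456)
	inv := binaryInverse(x, m)
	fmt.Println(new(big.Int).Mod(new(big.Int).Mul(x, inv), m)) // prints 1
}

A constant-time version cannot branch on |a|'s bits or on the comparison, which is why the deleted code replaces the if/else above with masked csel arithmetic inside __inner_loop_62 and applies each 62-step batch with the multiply-and-shift helpers that follow.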
-.def __smul_383x63; -.type 32; -.endef -.p2align 5 -__smul_383x63: - ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) - asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) - ldp x5, x6, [x1,#8*2+96] - eor x20, x20, x17 // conditionally negate |f_| (or |g_|) - ldp x7, x8, [x1,#8*4+96] - - eor x3, x3, x17 // conditionally negate |u| (or |v|) - sub x20, x20, x17 - eor x4, x4, x17 - adds x3, x3, x17, lsr#63 - eor x5, x5, x17 - adcs x4, x4, xzr - eor x6, x6, x17 - adcs x5, x5, xzr - eor x7, x7, x17 - adcs x6, x6, xzr - umulh x22, x3, x20 - eor x8, x8, x17 - umulh x23, x4, x20 - adcs x7, x7, xzr - umulh x24, x5, x20 - adcs x8, x8, xzr - umulh x25, x6, x20 - umulh x26, x7, x20 - mul x3, x3, x20 - mul x4, x4, x20 - mul x5, x5, x20 - adds x4, x4, x22 - mul x6, x6, x20 - adcs x5, x5, x23 - mul x7, x7, x20 - adcs x6, x6, x24 - mul x27,x8, x20 - adcs x7, x7, x25 - adcs x27,x27,x26 - adc x2, xzr, xzr - ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) - asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) - ldp x11, x12, [x1,#8*2+144] - eor x21, x21, x17 // conditionally negate |f_| (or |g_|) - ldp x13, x14, [x1,#8*4+144] - - eor x9, x9, x17 // conditionally negate |u| (or |v|) - sub x21, x21, x17 - eor x10, x10, x17 - adds x9, x9, x17, lsr#63 - eor x11, x11, x17 - adcs x10, x10, xzr - eor x12, x12, x17 - adcs x11, x11, xzr - eor x13, x13, x17 - adcs x12, x12, xzr - umulh x22, x9, x21 - eor x14, x14, x17 - umulh x23, x10, x21 - adcs x13, x13, xzr - umulh x24, x11, x21 - adcs x14, x14, xzr - umulh x25, x12, x21 - adc x19, xzr, xzr // used in __smul_767x63_tail - umulh x26, x13, x21 - mul x9, x9, x21 - mul x10, x10, x21 - mul x11, x11, x21 - adds x10, x10, x22 - mul x12, x12, x21 - adcs x11, x11, x23 - mul x13, x13, x21 - adcs x12, x12, x24 - mul x28,x14, x21 - adcs x13, x13, x25 - adcs x28,x28,x26 - adc x2, x2, xzr - - adds x3, x3, x9 - adcs x4, x4, x10 - adcs x5, x5, x11 - adcs x6, x6, x12 - stp x3, x4, [x0,#8*0] - adcs x7, x7, x13 - stp x5, x6, [x0,#8*2] - adcs x27, x27, x28 - stp x7, x27, [x0,#8*4] - adc x28, x2, xzr // used in __smul_767x63_tail - - ret - - -.def __smul_767x63_tail; -.type 32; -.endef -.p2align 5 -__smul_767x63_tail: - smulh x27, x8, x20 - ldp x3, x4, [x1,#8*24] // load rest of |v| - umulh x14,x14, x21 - ldp x5, x6, [x1,#8*26] - ldp x7, x8, [x1,#8*28] - - eor x3, x3, x17 // conditionally negate rest of |v| - eor x4, x4, x17 - eor x5, x5, x17 - adds x3, x3, x19 - eor x6, x6, x17 - adcs x4, x4, xzr - eor x7, x7, x17 - adcs x5, x5, xzr - eor x8, x8, x17 - adcs x6, x6, xzr - umulh x22, x3, x21 - adcs x7, x7, xzr - umulh x23, x4, x21 - adc x8, x8, xzr - - umulh x24, x5, x21 - add x14, x14, x28 - umulh x25, x6, x21 - asr x28, x27, #63 - umulh x26, x7, x21 - mul x3, x3, x21 - mul x4, x4, x21 - mul x5, x5, x21 - adds x3, x3, x14 - mul x6, x6, x21 - adcs x4, x4, x22 - mul x7, x7, x21 - adcs x5, x5, x23 - mul x8, x8, x21 - adcs x6, x6, x24 - adcs x7, x7, x25 - adc x8, x8, x26 - - adds x3, x3, x27 - adcs x4, x4, x28 - adcs x5, x5, x28 - adcs x6, x6, x28 - stp x3, x4, [x0,#8*6] - adcs x7, x7, x28 - stp x5, x6, [x0,#8*8] - adc x8, x8, x28 - stp x7, x8, [x0,#8*10] - - ret - - -.def __smul_383_n_shift_by_62; -.type 32; -.endef -.p2align 5 -__smul_383_n_shift_by_62: - ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) - asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) - ldp x5, x6, [x1,#8*2+0] - eor x2, x15, x28 // conditionally negate |f0| (or |g0|) - ldp x7, x8, [x1,#8*4+0] - - eor x3, x3, x28 // conditionally negate |a| (or |b|) - sub x2, x2, x28 - eor x4, x4, x28 - adds x3, x3, x28, lsr#63 - eor x5, 
x5, x28 - adcs x4, x4, xzr - eor x6, x6, x28 - adcs x5, x5, xzr - eor x7, x7, x28 - umulh x22, x3, x2 - adcs x6, x6, xzr - umulh x23, x4, x2 - eor x8, x8, x28 - umulh x24, x5, x2 - adcs x7, x7, xzr - umulh x25, x6, x2 - adc x8, x8, xzr - - umulh x26, x7, x2 - smulh x27, x8, x2 - mul x3, x3, x2 - mul x4, x4, x2 - mul x5, x5, x2 - adds x4, x4, x22 - mul x6, x6, x2 - adcs x5, x5, x23 - mul x7, x7, x2 - adcs x6, x6, x24 - mul x8, x8, x2 - adcs x7, x7, x25 - adcs x8, x8 ,x26 - adc x27, x27, xzr - ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) - asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) - ldp x11, x12, [x1,#8*2+48] - eor x2, x16, x28 // conditionally negate |f0| (or |g0|) - ldp x13, x14, [x1,#8*4+48] - - eor x9, x9, x28 // conditionally negate |a| (or |b|) - sub x2, x2, x28 - eor x10, x10, x28 - adds x9, x9, x28, lsr#63 - eor x11, x11, x28 - adcs x10, x10, xzr - eor x12, x12, x28 - adcs x11, x11, xzr - eor x13, x13, x28 - umulh x22, x9, x2 - adcs x12, x12, xzr - umulh x23, x10, x2 - eor x14, x14, x28 - umulh x24, x11, x2 - adcs x13, x13, xzr - umulh x25, x12, x2 - adc x14, x14, xzr - - umulh x26, x13, x2 - smulh x28, x14, x2 - mul x9, x9, x2 - mul x10, x10, x2 - mul x11, x11, x2 - adds x10, x10, x22 - mul x12, x12, x2 - adcs x11, x11, x23 - mul x13, x13, x2 - adcs x12, x12, x24 - mul x14, x14, x2 - adcs x13, x13, x25 - adcs x14, x14 ,x26 - adc x28, x28, xzr - adds x3, x3, x9 - adcs x4, x4, x10 - adcs x5, x5, x11 - adcs x6, x6, x12 - adcs x7, x7, x13 - adcs x8, x8, x14 - adc x9, x27, x28 - - extr x3, x4, x3, #62 - extr x4, x5, x4, #62 - extr x5, x6, x5, #62 - asr x28, x9, #63 - extr x6, x7, x6, #62 - extr x7, x8, x7, #62 - extr x8, x9, x8, #62 - - eor x3, x3, x28 - eor x4, x4, x28 - adds x3, x3, x28, lsr#63 - eor x5, x5, x28 - adcs x4, x4, xzr - eor x6, x6, x28 - adcs x5, x5, xzr - eor x7, x7, x28 - adcs x6, x6, xzr - eor x8, x8, x28 - stp x3, x4, [x0,#8*0] - adcs x7, x7, xzr - stp x5, x6, [x0,#8*2] - adc x8, x8, xzr - stp x7, x8, [x0,#8*4] - - eor x15, x15, x28 - eor x16, x16, x28 - sub x15, x15, x28 - sub x16, x16, x28 - - ret - -.def __ab_approximation_62; -.type 32; -.endef -.p2align 4 -__ab_approximation_62: - ldp x7, x8, [x1,#8*4] - ldp x13, x14, [x1,#8*10] - ldp x5, x6, [x1,#8*2] - ldp x11, x12, [x1,#8*8] - -.Lab_approximation_62_loaded: - orr x22, x8, x14 // check top-most limbs, ... - cmp x22, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x6, ne - orr x22, x8, x14 // ... ones before top-most, ... - csel x13, x13, x12, ne - - ldp x3, x4, [x1,#8*0] - ldp x9, x10, [x1,#8*6] - - cmp x22, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x5, ne - orr x22, x8, x14 // ... and ones before that ... 
- csel x13, x13, x11, ne - - cmp x22, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x4, ne - orr x22, x8, x14 - csel x13, x13, x10, ne - - clz x22, x22 - cmp x22, #64 - csel x22, x22, xzr, ne - csel x8, x8, x7, ne - csel x14, x14, x13, ne - neg x23, x22 - - lslv x8, x8, x22 // align high limbs to the left - lslv x14, x14, x22 - lsrv x7, x7, x23 - lsrv x13, x13, x23 - and x7, x7, x23, asr#6 - and x13, x13, x23, asr#6 - orr x8, x8, x7 - orr x14, x14, x13 - - b __inner_loop_62 - ret - -.def __inner_loop_62; -.type 32; -.endef -.p2align 4 -__inner_loop_62: - mov x15, #1 // |f0|=1 - mov x16, #0 // |g0|=0 - mov x17, #0 // |f1|=0 - mov x19, #1 // |g1|=1 - -.Loop_62: - sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting - sub x2, x2, #1 - subs x24, x9, x3 // |b_|-|a_| - and x22, x9, x28 - sbc x25, x14, x8 - and x23, x14, x28 - subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) - mov x22, x15 - sbcs x27, x8, x23 - mov x23, x16 - csel x9, x9, x3, hs // |b_| = |a_| - csel x14, x14, x8, hs - csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel x8, x27, x25, hs - csel x15, x15, x17, hs // exchange |f0| and |f1| - csel x17, x17, x22, hs - csel x16, x16, x19, hs // exchange |g0| and |g1| - csel x19, x19, x23, hs - extr x3, x8, x3, #1 - lsr x8, x8, #1 - and x22, x17, x28 - and x23, x19, x28 - add x17, x17, x17 // |f1|<<=1 - add x19, x19, x19 // |g1|<<=1 - sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) - cbnz x2, .Loop_62 - - ret - diff --git a/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S deleted file mode 100644 index efe90a82144..00000000000 --- a/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S +++ /dev/null @@ -1,335 +0,0 @@ -.text - -.globl ct_is_square_mod_384 - -.def ct_is_square_mod_384; -.type 32; -.endef -.p2align 5 -ct_is_square_mod_384: -.long 3573752639 - stp x29, x30, [sp,#-128]! - add x29, sp, #0 - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - stp x27, x28, [sp,#80] - sub sp, sp, #512 - - ldp x3, x4, [x0,#8*0] // load input - ldp x5, x6, [x0,#8*2] - ldp x7, x8, [x0,#8*4] - - add x0, sp, #255 // find closest 256-byte-aligned spot - and x0, x0, #-256 // in the frame... 
- - ldp x9, x10, [x1,#8*0] // load modulus - ldp x11, x12, [x1,#8*2] - ldp x13, x14, [x1,#8*4] - - stp x3, x4, [x0,#8*6] // copy input to |a| - stp x5, x6, [x0,#8*8] - stp x7, x8, [x0,#8*10] - stp x9, x10, [x0,#8*0] // copy modulus to |b| - stp x11, x12, [x0,#8*2] - stp x13, x14, [x0,#8*4] - - eor x2, x2, x2 // init the .Legendre symbol - mov x15, #24 // 24 is 768/30-1 - b .Loop_is_square - -.p2align 4 -.Loop_is_square: - bl __ab_approximation_30 - sub x15, x15, #1 - - eor x1, x0, #128 // pointer to dst |b| - bl __smul_384_n_shift_by_30 - - mov x19, x16 // |f0| - mov x20, x17 // |g0| - add x1, x1, #8*6 // pointer to dst |a| - bl __smul_384_n_shift_by_30 - - ldp x9, x10, [x1,#-8*6] - eor x0, x0, #128 // flip-flop src |a|b| - and x27, x27, x9 // if |a| was negative, - add x2, x2, x27, lsr#1 // adjust |L| - - cbnz x15, .Loop_is_square - - ////////////////////////////////////////// last iteration - //bl __ab_approximation_30 // |a| and |b| are exact, - //ldr x8, [x0,#8*6] // and loaded - //ldr x14, [x0,#8*0] - mov x15, #48 // 48 is 768%30 + 30 - bl __inner_loop_48 - ldr x30, [x29,#8] - - and x0, x2, #1 - eor x0, x0, #1 - - add sp, sp, #512 - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldp x27, x28, [x29,#80] - ldr x29, [sp],#128 -.long 3573752767 - ret - - -.def __smul_384_n_shift_by_30; -.type 32; -.endef -.p2align 5 -__smul_384_n_shift_by_30: - ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) - asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) - ldp x5, x6, [x0,#8*2+0] - eor x20, x20, x27 // conditionally negate |g1| (or |f1|) - ldp x7, x8, [x0,#8*4+0] - - eor x3, x3, x27 // conditionally negate |b| (or |a|) - sub x20, x20, x27 - eor x4, x4, x27 - adds x3, x3, x27, lsr#63 - eor x5, x5, x27 - adcs x4, x4, xzr - eor x6, x6, x27 - adcs x5, x5, xzr - eor x7, x7, x27 - umulh x21, x3, x20 - adcs x6, x6, xzr - umulh x22, x4, x20 - eor x8, x8, x27 - umulh x23, x5, x20 - adcs x7, x7, xzr - umulh x24, x6, x20 - adc x8, x8, xzr - - umulh x25, x7, x20 - and x28, x20, x27 - umulh x26, x8, x20 - neg x28, x28 - mul x3, x3, x20 - mul x4, x4, x20 - mul x5, x5, x20 - adds x4, x4, x21 - mul x6, x6, x20 - adcs x5, x5, x22 - mul x7, x7, x20 - adcs x6, x6, x23 - mul x8, x8, x20 - adcs x7, x7, x24 - adcs x8, x8 ,x25 - adc x26, x26, x28 - ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) - asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) - ldp x11, x12, [x0,#8*2+48] - eor x19, x19, x27 // conditionally negate |g1| (or |f1|) - ldp x13, x14, [x0,#8*4+48] - - eor x9, x9, x27 // conditionally negate |b| (or |a|) - sub x19, x19, x27 - eor x10, x10, x27 - adds x9, x9, x27, lsr#63 - eor x11, x11, x27 - adcs x10, x10, xzr - eor x12, x12, x27 - adcs x11, x11, xzr - eor x13, x13, x27 - umulh x21, x9, x19 - adcs x12, x12, xzr - umulh x22, x10, x19 - eor x14, x14, x27 - umulh x23, x11, x19 - adcs x13, x13, xzr - umulh x24, x12, x19 - adc x14, x14, xzr - - umulh x25, x13, x19 - and x28, x19, x27 - umulh x27, x14, x19 - neg x28, x28 - mul x9, x9, x19 - mul x10, x10, x19 - mul x11, x11, x19 - adds x10, x10, x21 - mul x12, x12, x19 - adcs x11, x11, x22 - mul x13, x13, x19 - adcs x12, x12, x23 - mul x14, x14, x19 - adcs x13, x13, x24 - adcs x14, x14 ,x25 - adc x27, x27, x28 - adds x3, x3, x9 - adcs x4, x4, x10 - adcs x5, x5, x11 - adcs x6, x6, x12 - adcs x7, x7, x13 - adcs x8, x8, x14 - adc x9, x26, x27 - - extr x3, x4, x3, #30 - extr x4, x5, x4, #30 - extr x5, x6, x5, #30 - asr x27, x9, #63 - extr x6, x7, x6, #30 - extr x7, x8, x7, #30 - extr x8, x9, x8, #30 - - eor x3, 
x3, x27 - eor x4, x4, x27 - adds x3, x3, x27, lsr#63 - eor x5, x5, x27 - adcs x4, x4, xzr - eor x6, x6, x27 - adcs x5, x5, xzr - eor x7, x7, x27 - adcs x6, x6, xzr - eor x8, x8, x27 - stp x3, x4, [x1,#8*0] - adcs x7, x7, xzr - stp x5, x6, [x1,#8*2] - adc x8, x8, xzr - stp x7, x8, [x1,#8*4] - - ret - -.def __ab_approximation_30; -.type 32; -.endef -.p2align 4 -__ab_approximation_30: - ldp x13, x14, [x0,#8*4] // |a| is still in registers - ldp x11, x12, [x0,#8*2] - - orr x21, x8, x14 // check top-most limbs, ... - cmp x21, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x6, ne - orr x21, x8, x14 // ... ones before top-most, ... - csel x13, x13, x12, ne - - cmp x21, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x5, ne - orr x21, x8, x14 // ... and ones before that ... - csel x13, x13, x11, ne - - cmp x21, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x4, ne - orr x21, x8, x14 // and one more, ... - csel x13, x13, x10, ne - - cmp x21, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x3, ne - orr x21, x8, x14 - csel x13, x13, x9, ne - - clz x21, x21 - cmp x21, #64 - csel x21, x21, xzr, ne - csel x8, x8, x7, ne - csel x14, x14, x13, ne - neg x22, x21 - - lslv x8, x8, x21 // align high limbs to the left - lslv x14, x14, x21 - lsrv x7, x7, x22 - lsrv x13, x13, x22 - and x7, x7, x22, asr#6 - and x13, x13, x22, asr#6 - orr x8, x8, x7 - orr x14, x14, x13 - - bfxil x8, x3, #0, #32 - bfxil x14, x9, #0, #32 - - b __inner_loop_30 - ret - - -.def __inner_loop_30; -.type 32; -.endef -.p2align 4 -__inner_loop_30: - mov x28, #30 - mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 - mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 - mov x27,#0x7FFFFFFF7FFFFFFF - -.Loop_30: - sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting - and x25, x8, x14 - sub x28, x28, #1 - and x21, x14, x24 - - sub x22, x14, x8 // |b_|-|a_| - subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) - add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 - mov x21, x20 - csel x14, x14, x8, hs // |b_| = |a_| - csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel x20, x20, x17, hs // exchange |fg0| and |fg1| - csel x17, x17, x21, hs - csel x2, x2, x25, hs - lsr x8, x8, #1 - and x21, x20, x24 - and x22, x27, x24 - add x23, x14, #2 - sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - add x20, x20, x20 // |f1|<<=1 - add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 - add x17, x17, x22 - sub x20, x20, x27 - - cbnz x28, .Loop_30 - - mov x27, #0x7FFFFFFF - ubfx x16, x17, #0, #32 - ubfx x17, x17, #32, #32 - ubfx x19, x20, #0, #32 - ubfx x20, x20, #32, #32 - sub x16, x16, x27 // remove the bias - sub x17, x17, x27 - sub x19, x19, x27 - sub x20, x20, x27 - - ret - -.def __inner_loop_48; -.type 32; -.endef -.p2align 4 -__inner_loop_48: -.Loop_48: - sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting - and x25, x3, x9 - sub x15, x15, #1 - and x21, x9, x24 - sub x22, x9, x3 // |b_|-|a_| - subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) - add x25, x2, x25, lsr#1 - csel x9, x9, x3, hs // |b_| = |a_| - csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel x2, x2, x25, hs - add x23, x9, #2 - lsr x3, x3, #1 - add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 - - cbnz x15, .Loop_48 - - ret - diff --git a/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s deleted file mode 100644 index 9ac32f50852..00000000000 --- 
a/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s +++ /dev/null @@ -1,509 +0,0 @@ -.text - -.globl ct_is_square_mod_384 - -.def ct_is_square_mod_384; .scl 2; .type 32; .endef -.p2align 5 -ct_is_square_mod_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_ct_is_square_mod_384: - - - pushq %rbp - - movq %rcx,%rdi - movq %rdx,%rsi - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $536,%rsp - -.LSEH_body_ct_is_square_mod_384: - - - leaq 24+255(%rsp),%rax - andq $-256,%rax - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%r12 - movq 40(%rdi),%r13 - - movq 0(%rsi),%r14 - movq 8(%rsi),%r15 - movq 16(%rsi),%rbx - movq 24(%rsi),%rcx - movq 32(%rsi),%rdx - movq 40(%rsi),%rdi - movq %rax,%rsi - - movq %r8,0(%rax) - movq %r9,8(%rax) - movq %r10,16(%rax) - movq %r11,24(%rax) - movq %r12,32(%rax) - movq %r13,40(%rax) - - movq %r14,48(%rax) - movq %r15,56(%rax) - movq %rbx,64(%rax) - movq %rcx,72(%rax) - movq %rdx,80(%rax) - movq %rdi,88(%rax) - - xorq %rbp,%rbp - movl $24,%ecx - jmp .Loop_is_square - -.p2align 5 -.Loop_is_square: - movl %ecx,16(%rsp) - - call __ab_approximation_30 - movq %rax,0(%rsp) - movq %rbx,8(%rsp) - - movq $128+48,%rdi - xorq %rsi,%rdi - call __smulq_384_n_shift_by_30 - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq -48(%rdi),%rdi - call __smulq_384_n_shift_by_30 - - movl 16(%rsp),%ecx - xorq $128,%rsi - - andq 48(%rdi),%r14 - shrq $1,%r14 - addq %r14,%rbp - - subl $1,%ecx - jnz .Loop_is_square - - - - - movq 48(%rsi),%r9 - call __inner_loop_48 - - movq $1,%rax - andq %rbp,%rax - xorq $1,%rax - - leaq 536(%rsp),%r8 - movq 0(%r8),%r15 - - movq 8(%r8),%r14 - - movq 16(%r8),%r13 - - movq 24(%r8),%r12 - - movq 32(%r8),%rbx - - movq 40(%r8),%rbp - - leaq 48(%r8),%rsp - -.LSEH_epilogue_ct_is_square_mod_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_ct_is_square_mod_384: - -.def __smulq_384_n_shift_by_30; .scl 3; .type 32; .endef -.p2align 5 -__smulq_384_n_shift_by_30: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbx - addq %rax,%rbx - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - movq %rdx,%r14 - andq %rbx,%r14 - mulq %rbx - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbx - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbx - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - negq %r14 - mulq %rbx - addq %rax,%r13 - adcq %rdx,%r14 - leaq 48(%rsi),%rsi - movq %rcx,%rdx - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbx - addq %rax,%rbx - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq 
$0,%r12 - adcq $0,%r13 - - movq %rdx,%r15 - andq %rbx,%r15 - mulq %rbx - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbx - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbx - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - negq %r15 - mulq %rbx - addq %rax,%r13 - adcq %rdx,%r15 - leaq -48(%rsi),%rsi - - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq %r15,%r14 - - shrdq $30,%r9,%r8 - shrdq $30,%r10,%r9 - shrdq $30,%r11,%r10 - shrdq $30,%r12,%r11 - shrdq $30,%r13,%r12 - shrdq $30,%r14,%r13 - - sarq $63,%r14 - xorq %rbx,%rbx - subq %r14,%rbx - - xorq %r14,%r8 - xorq %r14,%r9 - xorq %r14,%r10 - xorq %r14,%r11 - xorq %r14,%r12 - xorq %r14,%r13 - addq %rbx,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 - -.def __ab_approximation_30; .scl 3; .type 32; .endef -.p2align 5 -__ab_approximation_30: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 88(%rsi),%rbx - movq 80(%rsi),%r15 - movq 72(%rsi),%r14 - - movq %r13,%rax - orq %rbx,%rax - cmovzq %r12,%r13 - cmovzq %r15,%rbx - cmovzq %r11,%r12 - movq 64(%rsi),%r11 - cmovzq %r14,%r15 - - movq %r13,%rax - orq %rbx,%rax - cmovzq %r12,%r13 - cmovzq %r15,%rbx - cmovzq %r10,%r12 - movq 56(%rsi),%r10 - cmovzq %r11,%r15 - - movq %r13,%rax - orq %rbx,%rax - cmovzq %r12,%r13 - cmovzq %r15,%rbx - cmovzq %r9,%r12 - movq 48(%rsi),%r9 - cmovzq %r10,%r15 - - movq %r13,%rax - orq %rbx,%rax - cmovzq %r12,%r13 - cmovzq %r15,%rbx - cmovzq %r8,%r12 - cmovzq %r9,%r15 - - movq %r13,%rax - orq %rbx,%rax - bsrq %rax,%rcx - leaq 1(%rcx),%rcx - cmovzq %r8,%r13 - cmovzq %r9,%rbx - cmovzq %rax,%rcx - negq %rcx - - - shldq %cl,%r12,%r13 - shldq %cl,%r15,%rbx - - movq $0xFFFFFFFF00000000,%rax - movl %r8d,%r8d - movl %r9d,%r9d - andq %rax,%r13 - andq %rax,%rbx - orq %r13,%r8 - orq %rbx,%r9 - - jmp __inner_loop_30 - - .byte 0xf3,0xc3 - -.def __inner_loop_30; .scl 3; .type 32; .endef -.p2align 5 -__inner_loop_30: - .byte 0xf3,0x0f,0x1e,0xfa - - movq $0x7FFFFFFF80000000,%rbx - movq $0x800000007FFFFFFF,%rcx - leaq -1(%rbx),%r15 - movl $30,%edi - -.Loop_30: - movq %r8,%rax - andq %r9,%rax - shrq $1,%rax - - cmpq %r9,%r8 - movq %r8,%r10 - movq %r9,%r11 - leaq (%rax,%rbp,1),%rax - movq %rbx,%r12 - movq %rcx,%r13 - movq %rbp,%r14 - cmovbq %r9,%r8 - cmovbq %r10,%r9 - cmovbq %rcx,%rbx - cmovbq %r12,%rcx - cmovbq %rax,%rbp - - subq %r9,%r8 - subq %rcx,%rbx - addq %r15,%rbx - - testq $1,%r10 - cmovzq %r10,%r8 - cmovzq %r11,%r9 - cmovzq %r12,%rbx - cmovzq %r13,%rcx - cmovzq %r14,%rbp - - leaq 2(%r9),%rax - shrq $1,%r8 - shrq $2,%rax - addq %rcx,%rcx - leaq (%rax,%rbp,1),%rbp - subq %r15,%rcx - - subl $1,%edi - jnz .Loop_30 - - shrq $32,%r15 - movl %ebx,%eax - shrq $32,%rbx - movl %ecx,%edx - shrq $32,%rcx - subq %r15,%rax - subq %r15,%rbx - subq %r15,%rdx - subq %r15,%rcx - - .byte 0xf3,0xc3 - - -.def __inner_loop_48; .scl 3; .type 32; .endef -.p2align 5 -__inner_loop_48: - .byte 0xf3,0x0f,0x1e,0xfa - - movl $48,%edi - -.Loop_48: - movq %r8,%rax - andq %r9,%rax - shrq $1,%rax - - cmpq %r9,%r8 - movq %r8,%r10 - movq %r9,%r11 - leaq (%rax,%rbp,1),%rax - movq %rbp,%r12 - cmovbq %r9,%r8 - cmovbq %r10,%r9 - cmovbq %rax,%rbp - - subq %r9,%r8 - - testq $1,%r10 
- cmovzq %r10,%r8 - cmovzq %r11,%r9 - cmovzq %r12,%rbp - - leaq 2(%r9),%rax - shrq $1,%r8 - shrq $2,%rax - addq %rax,%rbp - - subl $1,%edi - jnz .Loop_48 - - .byte 0xf3,0xc3 - -.section .pdata -.p2align 2 -.rva .LSEH_begin_ct_is_square_mod_384 -.rva .LSEH_body_ct_is_square_mod_384 -.rva .LSEH_info_ct_is_square_mod_384_prologue - -.rva .LSEH_body_ct_is_square_mod_384 -.rva .LSEH_epilogue_ct_is_square_mod_384 -.rva .LSEH_info_ct_is_square_mod_384_body - -.rva .LSEH_epilogue_ct_is_square_mod_384 -.rva .LSEH_end_ct_is_square_mod_384 -.rva .LSEH_info_ct_is_square_mod_384_epilogue - -.section .xdata -.p2align 3 -.LSEH_info_ct_is_square_mod_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_ct_is_square_mod_384_body: -.byte 1,0,18,0 -.byte 0x00,0xf4,0x43,0x00 -.byte 0x00,0xe4,0x44,0x00 -.byte 0x00,0xd4,0x45,0x00 -.byte 0x00,0xc4,0x46,0x00 -.byte 0x00,0x34,0x47,0x00 -.byte 0x00,0x54,0x48,0x00 -.byte 0x00,0x74,0x4a,0x00 -.byte 0x00,0x64,0x4b,0x00 -.byte 0x00,0x01,0x49,0x00 -.byte 0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_ct_is_square_mod_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - diff --git a/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s deleted file mode 100644 index d027a6dc5c0..00000000000 --- a/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s +++ /dev/null @@ -1,1230 +0,0 @@ -.comm __blst_platform_cap,4 -.text - -.globl ct_inverse_mod_383 - -.def ct_inverse_mod_383; .scl 2; .type 32; .endef -.p2align 5 -ct_inverse_mod_383: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_ct_inverse_mod_383: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx -#ifdef __BLST_PORTABLE__ - testl $1,__blst_platform_cap(%rip) - jnz ct_inverse_mod_383$1 -#endif - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $1112,%rsp - -.LSEH_body_ct_inverse_mod_383: - - - leaq 88+511(%rsp),%rax - andq $-512,%rax - movq %rdi,32(%rsp) - movq %rcx,40(%rsp) - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq 0(%rdx),%r14 - movq 8(%rdx),%r15 - movq 16(%rdx),%rbx - movq 24(%rdx),%rbp - movq 32(%rdx),%rsi - movq 40(%rdx),%rdi - - movq %r8,0(%rax) - movq %r9,8(%rax) - movq %r10,16(%rax) - movq %r11,24(%rax) - movq %r12,32(%rax) - movq %r13,40(%rax) - - movq %r14,48(%rax) - movq %r15,56(%rax) - movq %rbx,64(%rax) - movq %rbp,72(%rax) - movq %rsi,80(%rax) - movq %rax,%rsi - movq %rdi,88(%rax) - - - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - - - movq %rdx,96(%rdi) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - - - movq %rdx,96(%rdi) - - - xorq $256,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - - - - movq 96(%rsi),%rax - movq 144(%rsi),%r11 - movq %rdx,%rbx - movq %rax,%r10 - imulq 56(%rsp) - movq %rax,%r8 - movq %r11,%rax - movq %rdx,%r9 - imulq 64(%rsp) - addq %rax,%r8 - adcq %rdx,%r9 - movq 
%r8,48(%rdi) - movq %r9,56(%rdi) - sarq $63,%r9 - movq %r9,64(%rdi) - movq %r9,72(%rdi) - movq %r9,80(%rdi) - movq %r9,88(%rdi) - leaq 96(%rsi),%rsi - - movq %r10,%rax - imulq %rbx - movq %rax,%r8 - movq %r11,%rax - movq %rdx,%r9 - imulq %rcx - addq %rax,%r8 - adcq %rdx,%r9 - movq %r8,96(%rdi) - movq %r9,104(%rdi) - sarq $63,%r9 - movq %r9,112(%rdi) - movq %r9,120(%rdi) - movq %r9,128(%rdi) - movq %r9,136(%rdi) - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383x63 - sarq $63,%r13 - movq %r13,48(%rdi) - movq %r13,56(%rdi) - movq %r13,64(%rdi) - movq %r13,72(%rdi) - movq %r13,80(%rdi) - movq %r13,88(%rdi) - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq 
%rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - - xorq $256+96,%rsi - movl $62,%edi - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 48(%rsi),%r10 - movq 56(%rsi),%r11 - call __inner_loop_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - movq %r8,0(%rdi) - movq %r10,48(%rdi) - - - - leaq 96(%rsi),%rsi - leaq 96(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - - - xorq $256+96,%rsi - movl $22,%edi - - movq 0(%rsi),%r8 - xorq %r9,%r9 - movq 48(%rsi),%r10 - xorq %r11,%r11 - call __inner_loop_62 - - - - - - - - leaq 96(%rsi),%rsi - - - - - - movq %r12,%rdx - movq %r13,%rcx - movq 32(%rsp),%rdi - call __smulq_767x63 - - movq 40(%rsp),%rsi - movq %rax,%rdx - sarq $63,%rax - - movq %rax,%r8 - movq %rax,%r9 - movq %rax,%r10 - andq 0(%rsi),%r8 - andq 8(%rsi),%r9 - movq %rax,%r11 - andq 16(%rsi),%r10 - andq 24(%rsi),%r11 - movq %rax,%r12 - andq 32(%rsi),%r12 - andq 40(%rsi),%rax - - addq %r8,%r14 - adcq %r9,%r15 - adcq %r10,%rbx - adcq %r11,%rbp - adcq %r12,%rcx - adcq %rax,%rdx - - movq %r14,48(%rdi) - movq %r15,56(%rdi) - movq %rbx,64(%rdi) - movq %rbp,72(%rdi) - movq %rcx,80(%rdi) - movq %rdx,88(%rdi) - - leaq 1112(%rsp),%r8 - movq 0(%r8),%r15 - - movq 8(%r8),%r14 - - movq 16(%r8),%r13 - - movq 24(%r8),%r12 - - movq 32(%r8),%rbx - - movq 40(%r8),%rbp - - leaq 48(%r8),%rsp - -.LSEH_epilogue_ct_inverse_mod_383: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_ct_inverse_mod_383: -.def __smulq_767x63; .scl 3; .type 32; .endef -.p2align 
5 -__smulq_767x63: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - leaq 48(%rsi),%rsi - - xorq %rdx,%rbp - addq %rax,%rbp - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulq %rbp - movq %rax,0(%rdi) - movq %r9,%rax - movq %rdx,%r9 - mulq %rbp - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - movq %r9,8(%rdi) - mulq %rbp - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - movq %r10,16(%rdi) - mulq %rbp - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - movq %r11,24(%rdi) - mulq %rbp - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - movq %r12,32(%rdi) - imulq %rbp - addq %rax,%r13 - adcq $0,%rdx - - movq %r13,40(%rdi) - movq %rdx,48(%rdi) - sarq $63,%rdx - movq %rdx,56(%rdi) - movq %rcx,%rdx - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq 48(%rsi),%r14 - movq 56(%rsi),%r15 - movq 64(%rsi),%rbx - movq 72(%rsi),%rbp - movq 80(%rsi),%rcx - movq 88(%rsi),%rdi - - movq %rdx,%rsi - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rsi - addq %rax,%rsi - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - xorq %rdx,%r14 - xorq %rdx,%r15 - xorq %rdx,%rbx - xorq %rdx,%rbp - xorq %rdx,%rcx - xorq %rdx,%rdi - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - adcq $0,%rbx - adcq $0,%rbp - adcq $0,%rcx - adcq $0,%rdi - - mulq %rsi - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rsi - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rsi - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rsi - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rsi - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - mulq %rsi - addq %rax,%r13 - movq %r14,%rax - adcq $0,%rdx - movq %rdx,%r14 - mulq %rsi - addq %rax,%r14 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r15 - mulq %rsi - addq %rax,%r15 - movq %rbx,%rax - adcq $0,%rdx - movq %rdx,%rbx - mulq %rsi - addq %rax,%rbx - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rbp - mulq %rsi - addq %rax,%rbp - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%rcx - mulq %rsi - addq %rax,%rcx - movq %rdi,%rax - adcq $0,%rdx - movq %rdx,%rdi - movq 8(%rsp),%rdx - imulq %rsi,%rax - movq 16(%rsp),%rsi - addq %rdi,%rax - - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - adcq 16(%rdx),%r10 - adcq 24(%rdx),%r11 - adcq 32(%rdx),%r12 - adcq 40(%rdx),%r13 - adcq 48(%rdx),%r14 - movq 56(%rdx),%rdi - adcq %rdi,%r15 - adcq %rdi,%rbx - adcq %rdi,%rbp - adcq %rdi,%rcx - adcq %rdi,%rax - - movq %rdx,%rdi - - movq %r8,0(%rdx) - movq %r9,8(%rdx) - movq %r10,16(%rdx) - movq %r11,24(%rdx) - movq %r12,32(%rdx) - movq %r13,40(%rdx) - movq %r14,48(%rdx) - movq %r15,56(%rdx) - movq %rbx,64(%rdx) - movq %rbp,72(%rdx) - movq %rcx,80(%rdx) - movq %rax,88(%rdx) - - .byte 0xf3,0xc3 - -.def __smulq_383x63; .scl 3; .type 32; .endef -.p2align 5 -__smulq_383x63: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 
40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbp - addq %rax,%rbp - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulq %rbp - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbp - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbp - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbp - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbp - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - imulq %rbp,%rax - addq %rax,%r13 - - leaq 48(%rsi),%rsi - movq %rcx,%rdx - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbp - addq %rax,%rbp - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulq %rbp - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbp - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbp - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbp - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbp - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - imulq %rbp,%rax - addq %rax,%r13 - - leaq -48(%rsi),%rsi - - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 - -.def __smulq_383_n_shift_by_62; .scl 3; .type 32; .endef -.p2align 5 -__smulq_383_n_shift_by_62: - .byte 0xf3,0x0f,0x1e,0xfa - - movq %rdx,%rbx - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbp - addq %rax,%rbp - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulq %rbp - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbp - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbp - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbp - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbp - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - imulq %rbp - addq %rax,%r13 - adcq $0,%rdx - - leaq 48(%rsi),%rsi - movq %rdx,%r14 - movq %rcx,%rdx - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbp - addq %rax,%rbp - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq 
$0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulq %rbp - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbp - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbp - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbp - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbp - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - imulq %rbp - addq %rax,%r13 - adcq $0,%rdx - - leaq -48(%rsi),%rsi - - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq %rdx,%r14 - movq %rbx,%rdx - - shrdq $62,%r9,%r8 - shrdq $62,%r10,%r9 - shrdq $62,%r11,%r10 - shrdq $62,%r12,%r11 - shrdq $62,%r13,%r12 - shrdq $62,%r14,%r13 - - sarq $63,%r14 - xorq %rbp,%rbp - subq %r14,%rbp - - xorq %r14,%r8 - xorq %r14,%r9 - xorq %r14,%r10 - xorq %r14,%r11 - xorq %r14,%r12 - xorq %r14,%r13 - addq %rbp,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - xorq %r14,%rdx - xorq %r14,%rcx - addq %rbp,%rdx - addq %rbp,%rcx - - .byte 0xf3,0xc3 - -.def __ab_approximation_62; .scl 3; .type 32; .endef -.p2align 5 -__ab_approximation_62: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 40(%rsi),%r9 - movq 88(%rsi),%r11 - movq 32(%rsi),%rbx - movq 80(%rsi),%rbp - movq 24(%rsi),%r8 - movq 72(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - cmovzq %r10,%rbp - movq 16(%rsi),%r8 - movq 64(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - cmovzq %r10,%rbp - movq 8(%rsi),%r8 - movq 56(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - cmovzq %r10,%rbp - movq 0(%rsi),%r8 - movq 48(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - bsrq %rax,%rcx - leaq 1(%rcx),%rcx - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %rax,%rcx - negq %rcx - - - shldq %cl,%rbx,%r9 - shldq %cl,%rbp,%r11 - - jmp __inner_loop_62 - - .byte 0xf3,0xc3 - -.def __inner_loop_62; .scl 3; .type 32; .endef -.p2align 3 -.long 0 -__inner_loop_62: - .byte 0xf3,0x0f,0x1e,0xfa - - movq $1,%rdx - xorq %rcx,%rcx - xorq %r12,%r12 - movq $1,%r13 - movq %rsi,8(%rsp) - -.Loop_62: - xorq %rax,%rax - xorq %rbx,%rbx - testq $1,%r8 - movq %r10,%rbp - movq %r11,%r14 - cmovnzq %r10,%rax - cmovnzq %r11,%rbx - subq %r8,%rbp - sbbq %r9,%r14 - movq %r8,%r15 - movq %r9,%rsi - subq %rax,%r8 - sbbq %rbx,%r9 - cmovcq %rbp,%r8 - cmovcq %r14,%r9 - cmovcq %r15,%r10 - cmovcq %rsi,%r11 - movq %rdx,%rax - cmovcq %r12,%rdx - cmovcq %rax,%r12 - movq %rcx,%rbx - cmovcq %r13,%rcx - cmovcq %rbx,%r13 - xorq %rax,%rax - xorq %rbx,%rbx - shrdq $1,%r9,%r8 - shrq $1,%r9 - testq $1,%r15 - cmovnzq %r12,%rax - cmovnzq %r13,%rbx - addq %r12,%r12 - addq %r13,%r13 - subq %rax,%rdx - subq %rbx,%rcx - subl $1,%edi - jnz .Loop_62 - - movq 8(%rsp),%rsi - .byte 0xf3,0xc3 - -.section .pdata -.p2align 2 -.rva .LSEH_begin_ct_inverse_mod_383 -.rva .LSEH_body_ct_inverse_mod_383 -.rva .LSEH_info_ct_inverse_mod_383_prologue - -.rva .LSEH_body_ct_inverse_mod_383 -.rva .LSEH_epilogue_ct_inverse_mod_383 -.rva .LSEH_info_ct_inverse_mod_383_body - -.rva .LSEH_epilogue_ct_inverse_mod_383 -.rva .LSEH_end_ct_inverse_mod_383 -.rva .LSEH_info_ct_inverse_mod_383_epilogue - -.section .xdata -.p2align 3 -.LSEH_info_ct_inverse_mod_383_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 
0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_ct_inverse_mod_383_body: -.byte 1,0,18,0 -.byte 0x00,0xf4,0x8b,0x00 -.byte 0x00,0xe4,0x8c,0x00 -.byte 0x00,0xd4,0x8d,0x00 -.byte 0x00,0xc4,0x8e,0x00 -.byte 0x00,0x34,0x8f,0x00 -.byte 0x00,0x54,0x90,0x00 -.byte 0x00,0x74,0x92,0x00 -.byte 0x00,0x64,0x93,0x00 -.byte 0x00,0x01,0x91,0x00 -.byte 0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_ct_inverse_mod_383_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - diff --git a/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s deleted file mode 100644 index 4f7dd6d1552..00000000000 --- a/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s +++ /dev/null @@ -1,1601 +0,0 @@ -.text - -.globl ctx_inverse_mod_383 - -.def ctx_inverse_mod_383; .scl 2; .type 32; .endef -.p2align 5 -ctx_inverse_mod_383: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_ctx_inverse_mod_383: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx -ct_inverse_mod_383$1: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $1112,%rsp - -.LSEH_body_ctx_inverse_mod_383: - - - leaq 88+511(%rsp),%rax - andq $-512,%rax - movq %rdi,32(%rsp) - movq %rcx,40(%rsp) - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq 0(%rdx),%r14 - movq 8(%rdx),%r15 - movq 16(%rdx),%rbx - movq 24(%rdx),%rbp - movq 32(%rdx),%rsi - movq 40(%rdx),%rdi - - movq %r8,0(%rax) - movq %r9,8(%rax) - movq %r10,16(%rax) - movq %r11,24(%rax) - movq %r12,32(%rax) - movq %r13,40(%rax) - - movq %r14,48(%rax) - movq %r15,56(%rax) - movq %rbx,64(%rax) - movq %rbp,72(%rax) - movq %rsi,80(%rax) - movq %rax,%rsi - movq %rdi,88(%rax) - - - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - - - movq %rdx,96(%rdi) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - - - movq %rdx,96(%rdi) - - - xorq $256,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - - - - movq 96(%rsi),%rax - movq 144(%rsi),%r11 - movq %rdx,%rbx - movq %rax,%r10 - imulq 56(%rsp) - movq %rax,%r8 - movq %r11,%rax - movq %rdx,%r9 - imulq 64(%rsp) - addq %rax,%r8 - adcq %rdx,%r9 - movq %r8,48(%rdi) - movq %r9,56(%rdi) - sarq $63,%r9 - movq %r9,64(%rdi) - movq %r9,72(%rdi) - movq %r9,80(%rdi) - movq %r9,88(%rdi) - leaq 96(%rsi),%rsi - - movq %r10,%rax - imulq %rbx - movq %rax,%r8 - movq %r11,%rax - movq %rdx,%r9 - imulq %rcx - addq %rax,%r8 - adcq %rdx,%r9 - movq %r8,96(%rdi) - movq %r9,104(%rdi) - sarq $63,%r9 - movq %r9,112(%rdi) - movq %r9,120(%rdi) - movq %r9,128(%rdi) - movq %r9,136(%rdi) - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 
96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - 
-	movq	%r12,72(%rsp)
-	movq	%r13,80(%rsp)
-
-	movq	$256,%rdi
-	xorq	%rsi,%rdi
-	call	__smulx_383_n_shift_by_31
-	movq	%rdx,56(%rsp)
-	movq	%rcx,64(%rsp)
-
-	movq	72(%rsp),%rdx
-	movq	80(%rsp),%rcx
-	leaq	48(%rdi),%rdi
-	call	__smulx_383_n_shift_by_31
-	movq	%rdx,72(%rsp)
-	movq	%rcx,80(%rsp)
-
-	movq	56(%rsp),%rdx
-	movq	64(%rsp),%rcx
-	leaq	96(%rsi),%rsi
-	leaq	48(%rdi),%rdi
-	call	__smulx_383x63
-
-	movq	72(%rsp),%rdx
-	movq	80(%rsp),%rcx
-	leaq	48(%rdi),%rdi
-	call	__smulx_383x63
-	xorq	$256+96,%rsi
-	movl	$31,%edi
-	call	__ab_approximation_31
[… the round above repeats, fully unrolled, for the remaining deleted lines of the ctx_inverse_mod_383 COFF/x86_64 assembly: later rounds sign-extend and store the top half, switch the second accumulation from __smulx_383x63 to __smulx_767x63 and the shift step to __smulx_191_n_shift_by_31, and the function finishes with a __tail_loop_53 pass, a masked final accumulation into 48..88(%rdi), and the SEH epilogue (.LSEH_epilogue_ctx_inverse_mod_383 / .LSEH_end_ctx_inverse_mod_383); these are followed by the helper subroutines __smulx_767x63, __smulx_383x63, __smulx_383_n_shift_by_31, __smulx_191_n_shift_by_31, __ab_approximation_31, __inner_loop_31 and __tail_loop_53, and the .pdata/.xdata unwind tables …]
diff --git a/crypto/blst_src/build/coff/div3w-armv8.S b/crypto/blst_src/build/coff/div3w-armv8.S
deleted file mode 100644
index 2e5d7045d6a..00000000000
--- a/crypto/blst_src/build/coff/div3w-armv8.S
+++ /dev/null
@@ -1,94 +0,0 @@
[… 94 deleted lines: generated COFF/ARMv8 assembly defining div_3_limbs (a 64-round constant-time restoring division of a two-limb remainder by a two-limb divisor, with a speculative quotient bit cancelled on borrow), quot_rem_128 (multiply the divisor by the quotient estimate, subtract the product from the dividend, and adjust the quotient by at most one if that borrows) and quot_rem_64 (the single-limb case) …]
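For context, and not part of the deleted sources: the loop structure the div_3_limbs comments describe ("R - D", "Q <<= 1", speculative bit, "D >>= 1") is classic constant-time restoring division. A minimal Go sketch of that shape follows, assuming the usual normalized-divisor convention; div2by2 and its parameter names are hypothetical, and the branch stands in for the csel the assembly uses.

package main

import (
	"fmt"
	"math/bits"
)

// div2by2 mirrors the 64-step loop of the deleted div_3_limbs: it produces a
// 64-bit quotient estimate for a two-limb remainder R divided by a two-limb
// divisor D (both as little-endian (lo, hi) pairs). Each round speculatively
// subtracts D, keeps the difference only when there was no borrow, shifts in
// a quotient bit, and halves D.
func div2by2(rLo, rHi, dLo, dHi uint64) uint64 {
	var q uint64
	for i := 0; i < 64; i++ {
		lo, b := bits.Sub64(rLo, dLo, 0)
		hi, b2 := bits.Sub64(rHi, dHi, b)
		q = q<<1 | (1 - b2) // "Q <<= 1" plus the speculative bit, cancelled on borrow
		if b2 == 0 {        // no borrow: accept R - D (the assembly uses csel here)
			rLo, rHi = lo, hi
		}
		dLo = dLo>>1 | dHi<<63 // "D >>= 1"
		dHi >>= 1
	}
	return q
}

func main() {
	// With a normalized divisor the estimate is exact:
	// R = 5*2^64, D = 8*2^64, so q = floor(2^63*R/D) = 5*2^60.
	fmt.Printf("%#x\n", div2by2(0, 5, 0, 8)) // 0x5000000000000000
}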
diff --git a/crypto/blst_src/build/coff/div3w-x86_64.s b/crypto/blst_src/build/coff/div3w-x86_64.s
deleted file mode 100644
index 033d1eb3055..00000000000
--- a/crypto/blst_src/build/coff/div3w-x86_64.s
+++ /dev/null
@@ -1,248 +0,0 @@
[… 248 deleted lines: the x86_64 COFF build of the same three routines — div_3_limbs (the 64-iteration conditional subtract-and-shift loop, branch-free via cmovc), quot_rem_128 and quot_rem_64 — wrapped in Windows SEH prologue/epilogue labels, with the matching .pdata/.xdata unwind tables …]
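Also not part of the deleted sources: quot_rem_128 implements the follow-up adjustment step — multiply the divisor by the quotient estimate, subtract the product from the dividend, and, if that borrows, decrement the quotient and add the divisor back, all branch-free via a borrow mask. A hedged Go sketch under the assumption (which the assembly also relies on) that the estimate is off by at most one; quotRem128 and its parameter names are hypothetical.

package main

import (
	"fmt"
	"math/bits"
)

// quotRem128 subtracts d*q from a three-limb dividend and, when the quotient
// estimate q was one too large (the subtraction borrows), adds the divisor
// back and decrements q — branch-free, via an all-ones borrow mask.
func quotRem128(dvd [3]uint64, d [2]uint64, q uint64) (rem [2]uint64, qAdj uint64) {
	// p = d * q, a 192-bit product of a 128-bit divisor and 64-bit quotient
	p1a, p0 := bits.Mul64(d[0], q)
	p2, p1b := bits.Mul64(d[1], q)
	p1, c := bits.Add64(p1a, p1b, 0)
	p2 += c

	// r = dividend - d*q; the final borrow says whether q was too big
	r0, b1 := bits.Sub64(dvd[0], p0, 0)
	r1, b2 := bits.Sub64(dvd[1], p1, b1)
	_, borrow := bits.Sub64(dvd[2], p2, b2)

	mask := -borrow // all ones on borrow, zero otherwise
	qAdj = q + mask // q - 1 on borrow, unchanged otherwise
	var c2 uint64
	rem[0], c2 = bits.Add64(r0, d[0]&mask, 0) // conditionally add the divisor back
	rem[1], _ = bits.Add64(r1, d[1]&mask, c2)
	return rem, qAdj
}

func main() {
	// 2^128 / (2^64 + 3): the true quotient is 2^64 - 3 with remainder 9.
	// Feed an estimate that is one too large and watch it get corrected.
	d := [2]uint64{3, 1}      // 2^64 + 3, little-endian limbs
	dvd := [3]uint64{0, 0, 1} // 2^128
	rem, q := quotRem128(dvd, d, 1<<64-2)
	fmt.Println(rem, q) // [9 0] 18446744073709551613
}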
diff --git a/crypto/blst_src/build/coff/mul_mont_256-armv8.S b/crypto/blst_src/build/coff/mul_mont_256-armv8.S
deleted file mode 100644
index 8cadbb89344..00000000000
--- a/crypto/blst_src/build/coff/mul_mont_256-armv8.S
+++ /dev/null
@@ -1,474 +0,0 @@
[… 474 deleted lines: generated COFF/ARMv8 assembly for 256-bit Montgomery arithmetic — mul_mont_sparse_256 (fully unrolled word-by-word Montgomery multiplication), sqr_mont_sparse_256 (schoolbook squaring with doubled cross terms, annotated with an ASCII diagram of the partial products and "can't overflow" carry notes, followed by a reduction), from_mont_256, redc_mont_256, and the shared __mul_by_1_mont_256 helper, which runs four unrolled reduction steps of the form m = n0·t[0] mod 2^64; t = (t + m·N)/2^64 …]
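Not part of the deleted sources: the recurring pattern in __mul_by_1_mont_256 (and the *_mont routines generally) is word-by-word Montgomery reduction — each step picks m = n0·t[0] mod 2^64 so that adding m·N zeroes the low limb, making the divide-by-2^64 an exact limb shift. A compact Go sketch of the standard 256-bit REDC with a final conditional subtraction; montRedc256 and negInv64 are hypothetical names, and the assembly unrolls these loops and selects the result with csel rather than branching.

package main

import (
	"fmt"
	"math/bits"
)

// montRedc256 computes t * 2^-256 mod n for an 8-limb t (little-endian),
// given n0 = -n[0]^-1 mod 2^64.
func montRedc256(t [8]uint64, n [4]uint64, n0 uint64) [4]uint64 {
	var top uint64 // carry spilling past t[i+4]
	for i := 0; i < 4; i++ {
		m := n0 * t[i] // chosen so t + m*n has a zero low limb
		var carry uint64
		for j := 0; j < 4; j++ {
			hi, lo := bits.Mul64(m, n[j])
			var c1, c2 uint64
			lo, c1 = bits.Add64(lo, carry, 0)
			t[i+j], c2 = bits.Add64(t[i+j], lo, 0)
			carry = hi + c1 + c2 // cannot overflow 64 bits
		}
		t[i+4], top = bits.Add64(t[i+4], carry, top)
	}
	// conditionally subtract n once
	var r [4]uint64
	var b uint64
	for j := 0; j < 4; j++ {
		r[j], b = bits.Sub64(t[4+j], n[j], b)
	}
	if _, b = bits.Sub64(top, 0, b); b == 1 {
		copy(r[:], t[4:]) // result was already < n
	}
	return r
}

// negInv64 returns -x^-1 mod 2^64 for odd x, by Newton iteration.
func negInv64(x uint64) uint64 {
	inv := x // x is its own inverse mod 8, giving 3 correct bits
	for i := 0; i < 5; i++ {
		inv *= 2 - x*inv // doubles the number of correct bits
	}
	return -inv
}

func main() {
	n := [4]uint64{13, 0, 0, 0} // toy odd modulus in place of a field prime
	var t [8]uint64
	t[0] = 1
	r := montRedc256(t, n, negInv64(n[0]))
	fmt.Println(r[0]) // 9: 2^256 ≡ 3 (mod 13) and 3·9 ≡ 1 (mod 13)
}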
diff --git a/crypto/blst_src/build/coff/mul_mont_384-armv8.S b/crypto/blst_src/build/coff/mul_mont_384-armv8.S
deleted file mode 100644
index 074f38c495c..00000000000
--- a/crypto/blst_src/build/coff/mul_mont_384-armv8.S
+++ /dev/null
@@ -1,2424 +0,0 @@
[… 2424 deleted lines: generated COFF/ARMv8 assembly for 384-bit and Fp2 Montgomery arithmetic — add_mod_384x384 / sub_mod_384x384 and the __add_mod_384 / __sub_mod_384 helpers; mul_mont_384x and sqr_mont_384x (Fp2 products built from three base-field multiplications over a->re ± a->im, per the inline comments); mul_mont_384 and the unrolled word-by-word __mul_mont_384 core; sqr_mont_384; sqr_n_mul_mont_383 (repeated squaring with upper-half accumulation, then one multiplication); __sqr_384 / sqr_384; redc_mont_384, from_mont_384, __mul_by_1_mont_384 and __redc_tail_mont_384; mul_384 / __mul_384; mul_382x and sqr_382x …]
-.globl sqr_mont_382x
-
-.def sqr_mont_382x;
-.type 32;
-.endef
-.p2align 5
-sqr_mont_382x:
-.long 3573752639
-	stp	x29,x30,[sp,#-128]!
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x3,x0,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#112 // space for two 384-bit vectors + word - mov x4,x3 // adjust for missing b_ptr - - ldp x11,x12,[x1] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - ldp x17,x20,[x1,#48] - ldp x21,x22,[x1,#64] - ldp x23,x24,[x1,#80] - - adds x5,x11,x17 // t0 = a->re + a->im - adcs x6,x12,x20 - adcs x7,x13,x21 - adcs x8,x14,x22 - adcs x9,x15,x23 - adc x10,x16,x24 - - subs x19,x11,x17 // t1 = a->re - a->im - sbcs x20,x12,x20 - sbcs x21,x13,x21 - sbcs x22,x14,x22 - sbcs x23,x15,x23 - sbcs x24,x16,x24 - sbc x25,xzr,xzr // borrow flag as mask - - stp x5,x6,[sp] - stp x7,x8,[sp,#16] - stp x9,x10,[sp,#32] - stp x19,x20,[sp,#48] - stp x21,x22,[sp,#64] - stp x23,x24,[sp,#80] - str x25,[sp,#96] - - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - ldp x9,x10,[x2,#32] - - add x2,x1,#48 - bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) - - adds x19,x11,x11 // add with itself - adcs x20,x12,x12 - adcs x21,x13,x13 - adcs x22,x14,x14 - adcs x23,x15,x15 - adc x24,x16,x16 - - stp x19,x20,[x2,#48] - stp x21,x22,[x2,#64] - stp x23,x24,[x2,#80] - - ldp x11,x12,[sp] - ldr x17,[sp,#48] - ldp x13,x14,[sp,#16] - ldp x15,x16,[sp,#32] - - add x2,sp,#48 - bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) - ldr x30,[x29,#8] - - ldr x25,[sp,#96] // account for sign from a->re - a->im - ldp x19,x20,[sp] - ldp x21,x22,[sp,#16] - ldp x23,x24,[sp,#32] - - and x19,x19,x25 - and x20,x20,x25 - and x21,x21,x25 - and x22,x22,x25 - and x23,x23,x25 - and x24,x24,x25 - - subs x11,x11,x19 - sbcs x12,x12,x20 - sbcs x13,x13,x21 - sbcs x14,x14,x22 - sbcs x15,x15,x23 - sbcs x16,x16,x24 - sbc x25,xzr,xzr - - and x19,x5,x25 - and x20,x6,x25 - and x21,x7,x25 - and x22,x8,x25 - and x23,x9,x25 - and x24,x10,x25 - - adds x11,x11,x19 - adcs x12,x12,x20 - adcs x13,x13,x21 - adcs x14,x14,x22 - adcs x15,x15,x23 - adc x16,x16,x24 - - stp x11,x12,[x2] - stp x13,x14,[x2,#16] - stp x15,x16,[x2,#32] - - add sp,sp,#112 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - - -.def __mul_mont_383_nonred; -.type 32; -.endef -.p2align 5 -__mul_mont_383_nonred: - mul x19,x11,x17 - mul x20,x12,x17 - mul x21,x13,x17 - mul x22,x14,x17 - mul x23,x15,x17 - mul x24,x16,x17 - mul x4,x4,x19 - - umulh x26,x11,x17 - umulh x27,x12,x17 - umulh x28,x13,x17 - umulh x0,x14,x17 - umulh x1,x15,x17 - umulh x3,x16,x17 - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,xzr, x3 - mul x3,x10,x4 - ldr x17,[x2,8*1] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc 
x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - ldr x17,[x2,8*2] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - ldr x17,[x2,8*3] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - ldr x17,[x2,8*4] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - ldr x17,[x2,8*5] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs 
x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - ldp x4,x2,[x29,#96] // pull r_ptr - - adds x11,x20,x26 - adcs x12,x21,x27 - adcs x13,x22,x28 - adcs x14,x23,x0 - adcs x15,x24,x1 - adcs x16,x25,x3 - - ret - - -.globl sgn0_pty_mont_384 - -.def sgn0_pty_mont_384; -.type 32; -.endef -.p2align 5 -sgn0_pty_mont_384: -.long 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - mov x4,x2 - ldp x5,x6,[x1] - ldp x7,x8,[x1,#16] - ldp x9,x10,[x1,#32] - mov x1,x0 - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - and x0,x11,#1 - adds x11,x11,x11 - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adcs x16,x16,x16 - adc x17,xzr,xzr - - subs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbcs x16,x16,x10 - sbc x17,x17,xzr - - mvn x17,x17 - and x17,x17,#2 - orr x0,x0,x17 - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - - -.globl sgn0_pty_mont_384x - -.def sgn0_pty_mont_384x; -.type 32; -.endef -.p2align 5 -sgn0_pty_mont_384x: -.long 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - mov x4,x2 - ldp x5,x6,[x1] - ldp x7,x8,[x1,#16] - ldp x9,x10,[x1,#32] - mov x1,x0 - - bl __mul_by_1_mont_384 - add x1,x1,#48 - - and x2,x11,#1 - orr x3,x11,x12 - adds x11,x11,x11 - orr x3,x3,x13 - adcs x12,x12,x12 - orr x3,x3,x14 - adcs x13,x13,x13 - orr x3,x3,x15 - adcs x14,x14,x14 - orr x3,x3,x16 - adcs x15,x15,x15 - adcs x16,x16,x16 - adc x17,xzr,xzr - - subs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbcs x16,x16,x10 - sbc x17,x17,xzr - - mvn x17,x17 - and x17,x17,#2 - orr x2,x2,x17 - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - and x0,x11,#1 - orr x1,x11,x12 - adds x11,x11,x11 - orr x1,x1,x13 - adcs x12,x12,x12 - orr x1,x1,x14 - adcs x13,x13,x13 - orr x1,x1,x15 - adcs x14,x14,x14 - orr x1,x1,x16 - adcs x15,x15,x15 - adcs x16,x16,x16 - adc x17,xzr,xzr - - subs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbcs x16,x16,x10 - sbc x17,x17,xzr - - mvn x17,x17 - and x17,x17,#2 - orr x0,x0,x17 - - cmp x3,#0 - csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) - - cmp x1,#0 - csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) - - and x3,x3,#1 - and x1,x1,#2 - orr x0,x1,x3 // pack sign and parity - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - diff --git a/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s b/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s deleted file mode 100644 index 2dd30bc5b5d..00000000000 --- a/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s +++ /dev/null @@ -1,897 +0,0 @@ -.comm __blst_platform_cap,4 -.text - -.globl mul_mont_sparse_256 - -.def mul_mont_sparse_256; .scl 2; .type 32; .endef -.p2align 5 -mul_mont_sparse_256: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_mul_mont_sparse_256: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - movq 40(%rsp),%r8 -#ifdef __BLST_PORTABLE__ - testl $1,__blst_platform_cap(%rip) - jnz mul_mont_sparse_256$1 -#endif - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - pushq %rdi - -.LSEH_body_mul_mont_sparse_256: - - - movq 0(%rdx),%rax - movq 0(%rsi),%r13 - movq 8(%rsi),%r14 - movq 16(%rsi),%r12 - movq 24(%rsi),%rbp - movq %rdx,%rbx - - movq %rax,%r15 - mulq %r13 - movq %rax,%r9 - movq %r15,%rax - movq %rdx,%r10 - call __mulq_mont_sparse_256 - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_mul_mont_sparse_256: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_mul_mont_sparse_256: - -.globl sqr_mont_sparse_256 - -.def sqr_mont_sparse_256; .scl 2; .type 32; .endef -.p2align 5 -sqr_mont_sparse_256: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sqr_mont_sparse_256: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx -#ifdef __BLST_PORTABLE__ - testl $1,__blst_platform_cap(%rip) - jnz sqr_mont_sparse_256$1 -#endif - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - pushq %rdi - -.LSEH_body_sqr_mont_sparse_256: - - - movq 0(%rsi),%rax - movq %rcx,%r8 - movq 8(%rsi),%r14 - movq %rdx,%rcx - movq 16(%rsi),%r12 - leaq (%rsi),%rbx - movq 24(%rsi),%rbp - - movq %rax,%r15 - mulq %rax - movq %rax,%r9 - movq %r15,%rax - movq %rdx,%r10 - call __mulq_mont_sparse_256 - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_sqr_mont_sparse_256: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_sqr_mont_sparse_256: -.def __mulq_mont_sparse_256; .scl 3; .type 32; .endef -.p2align 5 -__mulq_mont_sparse_256: - .byte 0xf3,0x0f,0x1e,0xfa - - mulq %r14 - addq %rax,%r10 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq %r12 - addq %rax,%r11 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r12 - - mulq %rbp - addq %rax,%r12 - movq 8(%rbx),%rax - adcq $0,%rdx - xorq %r14,%r14 - movq %rdx,%r13 - - movq %r9,%rdi - imulq %r8,%r9 - - - movq %rax,%r15 - mulq 0(%rsi) - addq %rax,%r10 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 8(%rsi) - addq %rax,%r11 - movq %r15,%rax - adcq $0,%rdx - addq %rbp,%r11 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rsi) - addq %rax,%r12 - movq %r15,%rax - adcq $0,%rdx - addq %rbp,%r12 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rsi) - addq %rax,%r13 - movq %r9,%rax - adcq $0,%rdx - 
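For readers skimming the deletion: the core routine in the 256-bit file above is word-by-word Montgomery multiplication. Each 64-bit word of one operand is multiplied in, immediately followed by one reduction step driven by n0 = -p^-1 mod 2^64 (the imulq against the stashed inverse). Below is a minimal Go sketch of that loop, using math/big for readability; montMul and the test harness are illustrative only, not the removed package's API, and the real code works on raw limbs with carry chains rather than big.Int.

```go
package main

import (
	"fmt"
	"math/big"
)

// montMul returns a*b*R^-1 mod p, where R = 2^(64*limbs). One 64-bit word
// of b is folded in per iteration, each followed by a single reduction
// step -- the same interleaving the deleted assembly uses.
func montMul(a, b, p *big.Int, limbs uint) *big.Int {
	base := new(big.Int).Lsh(big.NewInt(1), 64) // 2^64
	mask := new(big.Int).Sub(base, big.NewInt(1))
	n0 := new(big.Int).ModInverse(p, base) // p^-1 mod 2^64 (p is odd)
	n0.Sub(base, n0)                       // n0 = -p^-1 mod 2^64
	t := new(big.Int)
	for i := uint(0); i < limbs; i++ {
		bi := new(big.Int).Rsh(b, 64*i) // b_i, the i-th 64-bit word of b
		bi.And(bi, mask)
		t.Add(t, bi.Mul(a, bi)) // t += a * b_i
		// m = (t mod 2^64) * n0 mod 2^64 makes the low word of t + m*p
		// vanish, so the shift below is an exact division by 2^64.
		m := new(big.Int).And(t, mask)
		m.Mul(m, n0).And(m, mask)
		t.Add(t, m.Mul(m, p))
		t.Rsh(t, 64)
	}
	if t.Cmp(p) >= 0 { // the conditional subtraction in the epilogue
		t.Sub(t, p)
	}
	return t
}

func main() {
	// Sanity check: montMul(aR, bR) == a*b*R (mod p), using the BLS12-381
	// scalar-field modulus as the 256-bit example.
	p, _ := new(big.Int).SetString(
		"73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)
	R := new(big.Int).Lsh(big.NewInt(1), 256)
	a, b := big.NewInt(12345), big.NewInt(67890)
	aR := new(big.Int).Mod(new(big.Int).Mul(a, R), p)
	bR := new(big.Int).Mod(new(big.Int).Mul(b, R), p)
	want := new(big.Int).Mod(new(big.Int).Mul(new(big.Int).Mul(a, b), R), p)
	fmt.Println(montMul(aR, bR, p, 4).Cmp(want) == 0) // true
}
```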
diff --git a/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s b/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s
deleted file mode 100644
index ee646f5b137..00000000000
--- a/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s
+++ /dev/null
@@ -1,4303 +0,0 @@
[ 4303 deleted lines: the matching 384-bit x86-64 file, which opens with the branchless modular helpers __subq_mod_384x384, __addq_mod_384 and __subq_mod_384 ]
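Those helpers avoid secret-dependent branches: __addq_mod_384 adds the two inputs, speculatively subtracts the modulus, and then uses cmovc (or an AND mask in __subq_mod_384) to select the in-range result. A hedged Go equivalent follows, assuming a little-endian uint64 limb layout; addMod and the toy modulus are made up for illustration.

```go
package main

import (
	"fmt"
	"math/bits"
)

const limbs = 6 // 384 bits = 6 x 64-bit words, little-endian

// addMod returns (a + b) mod p for a, b < p, selecting between the raw sum
// and the reduced sum with a mask instead of a data-dependent branch --
// the role the cmovc instructions play in the deleted assembly.
func addMod(a, b, p [limbs]uint64) [limbs]uint64 {
	var sum, red, out [limbs]uint64
	var carry, borrow uint64
	for i := 0; i < limbs; i++ { // sum = a + b (may carry out of 384 bits)
		sum[i], carry = bits.Add64(a[i], b[i], carry)
	}
	for i := 0; i < limbs; i++ { // red = sum - p (may borrow)
		red[i], borrow = bits.Sub64(sum[i], p[i], borrow)
	}
	// Keep the raw sum only when it is already < p: the subtraction
	// borrowed and the addition did not carry out.
	mask := -(borrow &^ carry) // all-ones keeps sum, all-zeros keeps red
	for i := 0; i < limbs; i++ {
		out[i] = (sum[i] & mask) | (red[i] &^ mask)
	}
	return out
}

func main() {
	ones := ^uint64(0)
	p := [limbs]uint64{^uint64(316), ones, ones, ones, ones, ones} // 2^384 - 317
	a := [limbs]uint64{^uint64(320), ones, ones, ones, ones, ones} // p - 4
	b := [limbs]uint64{3}
	fmt.Println(addMod(a, b, p)) // prints the limbs of p - 1
}
```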
[ ... remainder of mulq_mont_384-x86_64.s elided: mul_mont_384x, sqr_mont_384x, mul_382x, sqr_382x, mul_384/__mulq_384, sqr_384/__sqrq_384, sqr_mont_384, redc_mont_384, from_mont_384, __mulq_by_1_mont_384, __redq_tail_mont_384, sgn0_pty_mont_384, sgn0_pty_mont_384x and mul_mont_384/__mulq_mont_384, together with their SEH unwind metadata; the extracted section breaks off inside __mulq_mont_384 ]
$0,%rdx - addq %rax,%r12 - movq %r9,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 32(%rcx) - addq %rax,%r13 - movq %r9,%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 40(%rcx) - addq %rax,%r14 - movq 16(%rbx),%rax - adcq $0,%rdx - addq %rbp,%r14 - adcq %rdx,%r15 - adcq $0,%r8 - - movq %rax,%rdi - mulq 0(%rsi) - addq %rax,%r10 - movq %rdi,%rax - adcq $0,%rdx - movq %rdx,%r9 - - mulq 8(%rsi) - addq %rax,%r11 - movq %rdi,%rax - adcq $0,%rdx - addq %r9,%r11 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 16(%rsi) - addq %rax,%r12 - movq %rdi,%rax - adcq $0,%rdx - addq %r9,%r12 - adcq $0,%rdx - movq %rdx,%r9 - - movq %r10,%rbp - imulq 8(%rsp),%r10 - - mulq 24(%rsi) - addq %rax,%r13 - movq %rdi,%rax - adcq $0,%rdx - addq %r9,%r13 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 32(%rsi) - addq %rax,%r14 - movq %rdi,%rax - adcq $0,%rdx - addq %r9,%r14 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 40(%rsi) - addq %r9,%r15 - adcq $0,%rdx - xorq %r9,%r9 - addq %rax,%r15 - movq %r10,%rax - adcq %rdx,%r8 - adcq $0,%r9 - - mulq 0(%rcx) - addq %rax,%rbp - movq %r10,%rax - adcq %rdx,%rbp - - mulq 8(%rcx) - addq %rax,%r11 - movq %r10,%rax - adcq $0,%rdx - addq %rbp,%r11 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r12 - movq %r10,%rax - adcq $0,%rdx - addq %rbp,%r12 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - addq %rbp,%r13 - adcq $0,%rdx - addq %rax,%r13 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 32(%rcx) - addq %rax,%r14 - movq %r10,%rax - adcq $0,%rdx - addq %rbp,%r14 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 40(%rcx) - addq %rax,%r15 - movq 24(%rbx),%rax - adcq $0,%rdx - addq %rbp,%r15 - adcq %rdx,%r8 - adcq $0,%r9 - - movq %rax,%rdi - mulq 0(%rsi) - addq %rax,%r11 - movq %rdi,%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq 8(%rsi) - addq %rax,%r12 - movq %rdi,%rax - adcq $0,%rdx - addq %r10,%r12 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 16(%rsi) - addq %rax,%r13 - movq %rdi,%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %rdx,%r10 - - movq %r11,%rbp - imulq 8(%rsp),%r11 - - mulq 24(%rsi) - addq %rax,%r14 - movq %rdi,%rax - adcq $0,%rdx - addq %r10,%r14 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 32(%rsi) - addq %rax,%r15 - movq %rdi,%rax - adcq $0,%rdx - addq %r10,%r15 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 40(%rsi) - addq %r10,%r8 - adcq $0,%rdx - xorq %r10,%r10 - addq %rax,%r8 - movq %r11,%rax - adcq %rdx,%r9 - adcq $0,%r10 - - mulq 0(%rcx) - addq %rax,%rbp - movq %r11,%rax - adcq %rdx,%rbp - - mulq 8(%rcx) - addq %rax,%r12 - movq %r11,%rax - adcq $0,%rdx - addq %rbp,%r12 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r13 - movq %r11,%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - addq %rbp,%r14 - adcq $0,%rdx - addq %rax,%r14 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 32(%rcx) - addq %rax,%r15 - movq %r11,%rax - adcq $0,%rdx - addq %rbp,%r15 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 40(%rcx) - addq %rax,%r8 - movq 32(%rbx),%rax - adcq $0,%rdx - addq %rbp,%r8 - adcq %rdx,%r9 - adcq $0,%r10 - - movq %rax,%rdi - mulq 0(%rsi) - addq %rax,%r12 - movq %rdi,%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq 8(%rsi) - addq %rax,%r13 - movq %rdi,%rax - adcq $0,%rdx - addq %r11,%r13 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 16(%rsi) - addq %rax,%r14 - movq %rdi,%rax - adcq $0,%rdx - addq %r11,%r14 - adcq $0,%rdx - movq %rdx,%r11 - - movq %r12,%rbp - imulq 8(%rsp),%r12 - - mulq 24(%rsi) - addq %rax,%r15 - movq %rdi,%rax - adcq $0,%rdx - addq %r11,%r15 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 
32(%rsi) - addq %rax,%r8 - movq %rdi,%rax - adcq $0,%rdx - addq %r11,%r8 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 40(%rsi) - addq %r11,%r9 - adcq $0,%rdx - xorq %r11,%r11 - addq %rax,%r9 - movq %r12,%rax - adcq %rdx,%r10 - adcq $0,%r11 - - mulq 0(%rcx) - addq %rax,%rbp - movq %r12,%rax - adcq %rdx,%rbp - - mulq 8(%rcx) - addq %rax,%r13 - movq %r12,%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r14 - movq %r12,%rax - adcq $0,%rdx - addq %rbp,%r14 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - addq %rbp,%r15 - adcq $0,%rdx - addq %rax,%r15 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 32(%rcx) - addq %rax,%r8 - movq %r12,%rax - adcq $0,%rdx - addq %rbp,%r8 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 40(%rcx) - addq %rax,%r9 - movq 40(%rbx),%rax - adcq $0,%rdx - addq %rbp,%r9 - adcq %rdx,%r10 - adcq $0,%r11 - - movq %rax,%rdi - mulq 0(%rsi) - addq %rax,%r13 - movq %rdi,%rax - adcq $0,%rdx - movq %rdx,%r12 - - mulq 8(%rsi) - addq %rax,%r14 - movq %rdi,%rax - adcq $0,%rdx - addq %r12,%r14 - adcq $0,%rdx - movq %rdx,%r12 - - mulq 16(%rsi) - addq %rax,%r15 - movq %rdi,%rax - adcq $0,%rdx - addq %r12,%r15 - adcq $0,%rdx - movq %rdx,%r12 - - movq %r13,%rbp - imulq 8(%rsp),%r13 - - mulq 24(%rsi) - addq %rax,%r8 - movq %rdi,%rax - adcq $0,%rdx - addq %r12,%r8 - adcq $0,%rdx - movq %rdx,%r12 - - mulq 32(%rsi) - addq %rax,%r9 - movq %rdi,%rax - adcq $0,%rdx - addq %r12,%r9 - adcq $0,%rdx - movq %rdx,%r12 - - mulq 40(%rsi) - addq %r12,%r10 - adcq $0,%rdx - xorq %r12,%r12 - addq %rax,%r10 - movq %r13,%rax - adcq %rdx,%r11 - adcq $0,%r12 - - mulq 0(%rcx) - addq %rax,%rbp - movq %r13,%rax - adcq %rdx,%rbp - - mulq 8(%rcx) - addq %rax,%r14 - movq %r13,%rax - adcq $0,%rdx - addq %rbp,%r14 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r15 - movq %r13,%rax - adcq $0,%rdx - addq %rbp,%r15 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - addq %rbp,%r8 - adcq $0,%rdx - addq %rax,%r8 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 32(%rcx) - addq %rax,%r9 - movq %r13,%rax - adcq $0,%rdx - addq %rbp,%r9 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 40(%rcx) - addq %rax,%r10 - movq %r14,%rax - adcq $0,%rdx - addq %rbp,%r10 - adcq %rdx,%r11 - adcq $0,%r12 - - - - - movq 16(%rsp),%rdi - subq 0(%rcx),%r14 - movq %r15,%rdx - sbbq 8(%rcx),%r15 - movq %r8,%rbx - sbbq 16(%rcx),%r8 - movq %r9,%rsi - sbbq 24(%rcx),%r9 - movq %r10,%rbp - sbbq 32(%rcx),%r10 - movq %r11,%r13 - sbbq 40(%rcx),%r11 - sbbq $0,%r12 - - cmovcq %rax,%r14 - cmovcq %rdx,%r15 - cmovcq %rbx,%r8 - movq %r14,0(%rdi) - cmovcq %rsi,%r9 - movq %r15,8(%rdi) - cmovcq %rbp,%r10 - movq %r8,16(%rdi) - cmovcq %r13,%r11 - movq %r9,24(%rdi) - movq %r10,32(%rdi) - movq %r11,40(%rdi) - - .byte 0xf3,0xc3 - -.globl sqr_n_mul_mont_384 - -.def sqr_n_mul_mont_384; .scl 2; .type 32; .endef -.p2align 5 -sqr_n_mul_mont_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sqr_n_mul_mont_384: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - movq 40(%rsp),%r8 - movq 48(%rsp),%r9 -#ifdef __BLST_PORTABLE__ - testl $1,__blst_platform_cap(%rip) - jnz sqr_n_mul_mont_384$1 -#endif - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $136,%rsp - -.LSEH_body_sqr_n_mul_mont_384: - - - movq %r8,0(%rsp) - movq %rdi,8(%rsp) - movq %rcx,16(%rsp) - leaq 32(%rsp),%rdi - movq %r9,24(%rsp) - movq (%r9),%xmm2 - -.Loop_sqr_384: - movd %edx,%xmm1 - - call __sqrq_384 - - leaq 0(%rdi),%rsi - movq 
0(%rsp),%rcx - movq 16(%rsp),%rbx - call __mulq_by_1_mont_384 - call __redq_tail_mont_384 - - movd %xmm1,%edx - leaq 0(%rdi),%rsi - decl %edx - jnz .Loop_sqr_384 - -.byte 102,72,15,126,208 - movq %rbx,%rcx - movq 24(%rsp),%rbx - - - - - - - movq %r8,%r12 - movq %r9,%r13 - - call __mulq_mont_384 - - leaq 136(%rsp),%r8 - movq 136(%rsp),%r15 - - movq 8(%r8),%r14 - - movq 16(%r8),%r13 - - movq 24(%r8),%r12 - - movq 32(%r8),%rbx - - movq 40(%r8),%rbp - - leaq 48(%r8),%rsp - -.LSEH_epilogue_sqr_n_mul_mont_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_sqr_n_mul_mont_384: - -.globl sqr_n_mul_mont_383 - -.def sqr_n_mul_mont_383; .scl 2; .type 32; .endef -.p2align 5 -sqr_n_mul_mont_383: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sqr_n_mul_mont_383: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - movq 40(%rsp),%r8 - movq 48(%rsp),%r9 -#ifdef __BLST_PORTABLE__ - testl $1,__blst_platform_cap(%rip) - jnz sqr_n_mul_mont_383$1 -#endif - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $136,%rsp - -.LSEH_body_sqr_n_mul_mont_383: - - - movq %r8,0(%rsp) - movq %rdi,8(%rsp) - movq %rcx,16(%rsp) - leaq 32(%rsp),%rdi - movq %r9,24(%rsp) - movq (%r9),%xmm2 - -.Loop_sqr_383: - movd %edx,%xmm1 - - call __sqrq_384 - - leaq 0(%rdi),%rsi - movq 0(%rsp),%rcx - movq 16(%rsp),%rbx - call __mulq_by_1_mont_384 - - movd %xmm1,%edx - addq 48(%rsi),%r14 - adcq 56(%rsi),%r15 - adcq 64(%rsi),%r8 - adcq 72(%rsi),%r9 - adcq 80(%rsi),%r10 - adcq 88(%rsi),%r11 - leaq 0(%rdi),%rsi - - movq %r14,0(%rdi) - movq %r15,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq %r10,32(%rdi) - movq %r11,40(%rdi) - - decl %edx - jnz .Loop_sqr_383 - -.byte 102,72,15,126,208 - movq %rbx,%rcx - movq 24(%rsp),%rbx - - - - - - - movq %r8,%r12 - movq %r9,%r13 - - call __mulq_mont_384 - - leaq 136(%rsp),%r8 - movq 136(%rsp),%r15 - - movq 8(%r8),%r14 - - movq 16(%r8),%r13 - - movq 24(%r8),%r12 - - movq 32(%r8),%rbx - - movq 40(%r8),%rbp - - leaq 48(%r8),%rsp - -.LSEH_epilogue_sqr_n_mul_mont_383: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_sqr_n_mul_mont_383: -.def __mulq_mont_383_nonred; .scl 3; .type 32; .endef -.p2align 5 -__mulq_mont_383_nonred: - .byte 0xf3,0x0f,0x1e,0xfa - - movq %rax,%rbp - mulq %r14 - movq %rax,%r8 - movq %rbp,%rax - movq %rdx,%r9 - - mulq %r15 - addq %rax,%r9 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %r12 - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r11 - - movq %r8,%r15 - imulq 8(%rsp),%r8 - - mulq %r13 - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r12 - - mulq 32(%rsi) - addq %rax,%r12 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r13 - - mulq 40(%rsi) - addq %rax,%r13 - movq %r8,%rax - adcq $0,%rdx - movq %rdx,%r14 - - mulq 0(%rcx) - addq %rax,%r15 - movq %r8,%rax - adcq %rdx,%r15 - - mulq 8(%rcx) - addq %rax,%r9 - movq %r8,%rax - adcq $0,%rdx - addq %r15,%r9 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 16(%rcx) - addq %rax,%r10 - movq %r8,%rax - adcq $0,%rdx - addq %r15,%r10 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 24(%rcx) - addq %r15,%r11 - adcq $0,%rdx - addq %rax,%r11 - movq %r8,%rax - adcq $0,%rdx - movq %rdx,%r15 - - mulq 32(%rcx) - addq %rax,%r12 - movq %r8,%rax - adcq $0,%rdx - addq %r15,%r12 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 40(%rcx) - addq %rax,%r13 - movq 8(%rbx),%rax - adcq $0,%rdx - addq %r15,%r13 - adcq %rdx,%r14 - - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%r9 - movq 
%rbp,%rax - adcq $0,%rdx - movq %rdx,%r15 - - mulq 8(%rsi) - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - addq %r15,%r10 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 16(%rsi) - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - addq %r15,%r11 - adcq $0,%rdx - movq %rdx,%r15 - - movq %r9,%r8 - imulq 8(%rsp),%r9 - - mulq 24(%rsi) - addq %rax,%r12 - movq %rbp,%rax - adcq $0,%rdx - addq %r15,%r12 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 32(%rsi) - addq %rax,%r13 - movq %rbp,%rax - adcq $0,%rdx - addq %r15,%r13 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 40(%rsi) - addq %r15,%r14 - adcq $0,%rdx - addq %rax,%r14 - movq %r9,%rax - adcq $0,%rdx - movq %rdx,%r15 - - mulq 0(%rcx) - addq %rax,%r8 - movq %r9,%rax - adcq %rdx,%r8 - - mulq 8(%rcx) - addq %rax,%r10 - movq %r9,%rax - adcq $0,%rdx - addq %r8,%r10 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 16(%rcx) - addq %rax,%r11 - movq %r9,%rax - adcq $0,%rdx - addq %r8,%r11 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 24(%rcx) - addq %r8,%r12 - adcq $0,%rdx - addq %rax,%r12 - movq %r9,%rax - adcq $0,%rdx - movq %rdx,%r8 - - mulq 32(%rcx) - addq %rax,%r13 - movq %r9,%rax - adcq $0,%rdx - addq %r8,%r13 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 40(%rcx) - addq %rax,%r14 - movq 16(%rbx),%rax - adcq $0,%rdx - addq %r8,%r14 - adcq %rdx,%r15 - - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r8 - - mulq 8(%rsi) - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - addq %r8,%r11 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 16(%rsi) - addq %rax,%r12 - movq %rbp,%rax - adcq $0,%rdx - addq %r8,%r12 - adcq $0,%rdx - movq %rdx,%r8 - - movq %r10,%r9 - imulq 8(%rsp),%r10 - - mulq 24(%rsi) - addq %rax,%r13 - movq %rbp,%rax - adcq $0,%rdx - addq %r8,%r13 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 32(%rsi) - addq %rax,%r14 - movq %rbp,%rax - adcq $0,%rdx - addq %r8,%r14 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 40(%rsi) - addq %r8,%r15 - adcq $0,%rdx - addq %rax,%r15 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r8 - - mulq 0(%rcx) - addq %rax,%r9 - movq %r10,%rax - adcq %rdx,%r9 - - mulq 8(%rcx) - addq %rax,%r11 - movq %r10,%rax - adcq $0,%rdx - addq %r9,%r11 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 16(%rcx) - addq %rax,%r12 - movq %r10,%rax - adcq $0,%rdx - addq %r9,%r12 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 24(%rcx) - addq %r9,%r13 - adcq $0,%rdx - addq %rax,%r13 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r9 - - mulq 32(%rcx) - addq %rax,%r14 - movq %r10,%rax - adcq $0,%rdx - addq %r9,%r14 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 40(%rcx) - addq %rax,%r15 - movq 24(%rbx),%rax - adcq $0,%rdx - addq %r9,%r15 - adcq %rdx,%r8 - - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r9 - - mulq 8(%rsi) - addq %rax,%r12 - movq %rbp,%rax - adcq $0,%rdx - addq %r9,%r12 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 16(%rsi) - addq %rax,%r13 - movq %rbp,%rax - adcq $0,%rdx - addq %r9,%r13 - adcq $0,%rdx - movq %rdx,%r9 - - movq %r11,%r10 - imulq 8(%rsp),%r11 - - mulq 24(%rsi) - addq %rax,%r14 - movq %rbp,%rax - adcq $0,%rdx - addq %r9,%r14 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 32(%rsi) - addq %rax,%r15 - movq %rbp,%rax - adcq $0,%rdx - addq %r9,%r15 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 40(%rsi) - addq %r9,%r8 - adcq $0,%rdx - addq %rax,%r8 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r9 - - mulq 0(%rcx) - addq %rax,%r10 - movq %r11,%rax - adcq %rdx,%r10 - - mulq 8(%rcx) - addq %rax,%r12 - movq %r11,%rax - adcq $0,%rdx - addq %r10,%r12 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 16(%rcx) - addq %rax,%r13 - movq %r11,%rax - 
adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 24(%rcx) - addq %r10,%r14 - adcq $0,%rdx - addq %rax,%r14 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq 32(%rcx) - addq %rax,%r15 - movq %r11,%rax - adcq $0,%rdx - addq %r10,%r15 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 40(%rcx) - addq %rax,%r8 - movq 32(%rbx),%rax - adcq $0,%rdx - addq %r10,%r8 - adcq %rdx,%r9 - - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%r12 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq 8(%rsi) - addq %rax,%r13 - movq %rbp,%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 16(%rsi) - addq %rax,%r14 - movq %rbp,%rax - adcq $0,%rdx - addq %r10,%r14 - adcq $0,%rdx - movq %rdx,%r10 - - movq %r12,%r11 - imulq 8(%rsp),%r12 - - mulq 24(%rsi) - addq %rax,%r15 - movq %rbp,%rax - adcq $0,%rdx - addq %r10,%r15 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 32(%rsi) - addq %rax,%r8 - movq %rbp,%rax - adcq $0,%rdx - addq %r10,%r8 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 40(%rsi) - addq %r10,%r9 - adcq $0,%rdx - addq %rax,%r9 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq 0(%rcx) - addq %rax,%r11 - movq %r12,%rax - adcq %rdx,%r11 - - mulq 8(%rcx) - addq %rax,%r13 - movq %r12,%rax - adcq $0,%rdx - addq %r11,%r13 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 16(%rcx) - addq %rax,%r14 - movq %r12,%rax - adcq $0,%rdx - addq %r11,%r14 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 24(%rcx) - addq %r11,%r15 - adcq $0,%rdx - addq %rax,%r15 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq 32(%rcx) - addq %rax,%r8 - movq %r12,%rax - adcq $0,%rdx - addq %r11,%r8 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 40(%rcx) - addq %rax,%r9 - movq 40(%rbx),%rax - adcq $0,%rdx - addq %r11,%r9 - adcq %rdx,%r10 - - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%r13 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq 8(%rsi) - addq %rax,%r14 - movq %rbp,%rax - adcq $0,%rdx - addq %r11,%r14 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 16(%rsi) - addq %rax,%r15 - movq %rbp,%rax - adcq $0,%rdx - addq %r11,%r15 - adcq $0,%rdx - movq %rdx,%r11 - - movq %r13,%r12 - imulq 8(%rsp),%r13 - - mulq 24(%rsi) - addq %rax,%r8 - movq %rbp,%rax - adcq $0,%rdx - addq %r11,%r8 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 32(%rsi) - addq %rax,%r9 - movq %rbp,%rax - adcq $0,%rdx - addq %r11,%r9 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 40(%rsi) - addq %r11,%r10 - adcq $0,%rdx - addq %rax,%r10 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq 0(%rcx) - addq %rax,%r12 - movq %r13,%rax - adcq %rdx,%r12 - - mulq 8(%rcx) - addq %rax,%r14 - movq %r13,%rax - adcq $0,%rdx - addq %r12,%r14 - adcq $0,%rdx - movq %rdx,%r12 - - mulq 16(%rcx) - addq %rax,%r15 - movq %r13,%rax - adcq $0,%rdx - addq %r12,%r15 - adcq $0,%rdx - movq %rdx,%r12 - - mulq 24(%rcx) - addq %r12,%r8 - adcq $0,%rdx - addq %rax,%r8 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r12 - - mulq 32(%rcx) - addq %rax,%r9 - movq %r13,%rax - adcq $0,%rdx - addq %r12,%r9 - adcq $0,%rdx - movq %rdx,%r12 - - mulq 40(%rcx) - addq %rax,%r10 - movq %r14,%rax - adcq $0,%rdx - addq %r12,%r10 - adcq %rdx,%r11 - .byte 0xf3,0xc3 - -.globl sqr_mont_382x - -.def sqr_mont_382x; .scl 2; .type 32; .endef -.p2align 5 -sqr_mont_382x: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sqr_mont_382x: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx -#ifdef __BLST_PORTABLE__ - testl $1,__blst_platform_cap(%rip) - jnz sqr_mont_382x$1 -#endif - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 
-
- pushq %r15
-
- subq $136,%rsp
-
-.LSEH_body_sqr_mont_382x:
-
-
- movq %rcx,0(%rsp)
- movq %rdx,%rcx
- movq %rsi,16(%rsp)
- movq %rdi,24(%rsp)
-
-
- movq 0(%rsi),%r8
- movq 8(%rsi),%r9
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
- movq 32(%rsi),%r12
- movq 40(%rsi),%r13
-
- movq %r8,%r14
- addq 48(%rsi),%r8
- movq %r9,%r15
- adcq 56(%rsi),%r9
- movq %r10,%rax
- adcq 64(%rsi),%r10
- movq %r11,%rdx
- adcq 72(%rsi),%r11
- movq %r12,%rbx
- adcq 80(%rsi),%r12
- movq %r13,%rbp
- adcq 88(%rsi),%r13
-
- subq 48(%rsi),%r14
- sbbq 56(%rsi),%r15
- sbbq 64(%rsi),%rax
- sbbq 72(%rsi),%rdx
- sbbq 80(%rsi),%rbx
- sbbq 88(%rsi),%rbp
- sbbq %rdi,%rdi
-
- movq %r8,32+0(%rsp)
- movq %r9,32+8(%rsp)
- movq %r10,32+16(%rsp)
- movq %r11,32+24(%rsp)
- movq %r12,32+32(%rsp)
- movq %r13,32+40(%rsp)
-
- movq %r14,32+48(%rsp)
- movq %r15,32+56(%rsp)
- movq %rax,32+64(%rsp)
- movq %rdx,32+72(%rsp)
- movq %rbx,32+80(%rsp)
- movq %rbp,32+88(%rsp)
- movq %rdi,32+96(%rsp)
-
-
-
- leaq 48(%rsi),%rbx
-
- movq 48(%rsi),%rax
- movq 0(%rsi),%r14
- movq 8(%rsi),%r15
- movq 16(%rsi),%r12
- movq 24(%rsi),%r13
-
- movq 24(%rsp),%rdi
- call __mulq_mont_383_nonred
- addq %r14,%r14
- adcq %r15,%r15
- adcq %r8,%r8
- adcq %r9,%r9
- adcq %r10,%r10
- adcq %r11,%r11
-
- movq %r14,48(%rdi)
- movq %r15,56(%rdi)
- movq %r8,64(%rdi)
- movq %r9,72(%rdi)
- movq %r10,80(%rdi)
- movq %r11,88(%rdi)
-
- leaq 32(%rsp),%rsi
- leaq 32+48(%rsp),%rbx
-
- movq 32+48(%rsp),%rax
- movq 32+0(%rsp),%r14
- movq 32+8(%rsp),%r15
- movq 32+16(%rsp),%r12
- movq 32+24(%rsp),%r13
-
- call __mulq_mont_383_nonred
- movq 32+96(%rsp),%rsi
- movq 32+0(%rsp),%r12
- movq 32+8(%rsp),%r13
- andq %rsi,%r12
- movq 32+16(%rsp),%rax
- andq %rsi,%r13
- movq 32+24(%rsp),%rbx
- andq %rsi,%rax
- movq 32+32(%rsp),%rbp
- andq %rsi,%rbx
- andq %rsi,%rbp
- andq 32+40(%rsp),%rsi
-
- subq %r12,%r14
- movq 0(%rcx),%r12
- sbbq %r13,%r15
- movq 8(%rcx),%r13
- sbbq %rax,%r8
- movq 16(%rcx),%rax
- sbbq %rbx,%r9
- movq 24(%rcx),%rbx
- sbbq %rbp,%r10
- movq 32(%rcx),%rbp
- sbbq %rsi,%r11
- sbbq %rsi,%rsi
-
- andq %rsi,%r12
- andq %rsi,%r13
- andq %rsi,%rax
- andq %rsi,%rbx
- andq %rsi,%rbp
- andq 40(%rcx),%rsi
-
- addq %r12,%r14
- adcq %r13,%r15
- adcq %rax,%r8
- adcq %rbx,%r9
- adcq %rbp,%r10
- adcq %rsi,%r11
-
- movq %r14,0(%rdi)
- movq %r15,8(%rdi)
- movq %r8,16(%rdi)
- movq %r9,24(%rdi)
- movq %r10,32(%rdi)
- movq %r11,40(%rdi)
- leaq 136(%rsp),%r8
- movq 0(%r8),%r15
-
- movq 8(%r8),%r14
-
- movq 16(%r8),%r13
-
- movq 24(%r8),%r12
-
- movq 32(%r8),%rbx
-
- movq 40(%r8),%rbp
-
- leaq 48(%r8),%rsp
-
-.LSEH_epilogue_sqr_mont_382x:
- mov 8(%rsp),%rdi
- mov 16(%rsp),%rsi
-
- .byte 0xf3,0xc3
-
-.LSEH_end_sqr_mont_382x:
-.section .pdata
-.p2align 2
-.rva .LSEH_begin_mul_mont_384x
-.rva .LSEH_body_mul_mont_384x
-.rva .LSEH_info_mul_mont_384x_prologue
-
-.rva .LSEH_body_mul_mont_384x
-.rva .LSEH_epilogue_mul_mont_384x
-.rva .LSEH_info_mul_mont_384x_body
-
-.rva .LSEH_epilogue_mul_mont_384x
-.rva .LSEH_end_mul_mont_384x
-.rva .LSEH_info_mul_mont_384x_epilogue
-
-.rva .LSEH_begin_sqr_mont_384x
-.rva .LSEH_body_sqr_mont_384x
-.rva .LSEH_info_sqr_mont_384x_prologue
-
-.rva .LSEH_body_sqr_mont_384x
-.rva .LSEH_epilogue_sqr_mont_384x
-.rva .LSEH_info_sqr_mont_384x_body
-
-.rva .LSEH_epilogue_sqr_mont_384x
-.rva .LSEH_end_sqr_mont_384x
-.rva .LSEH_info_sqr_mont_384x_epilogue
-
-.rva .LSEH_begin_mul_382x
-.rva .LSEH_body_mul_382x
-.rva .LSEH_info_mul_382x_prologue
-
-.rva .LSEH_body_mul_382x
-.rva .LSEH_epilogue_mul_382x
-.rva .LSEH_info_mul_382x_body
-
-.rva .LSEH_epilogue_mul_382x
-.rva .LSEH_end_mul_382x
-.rva .LSEH_info_mul_382x_epilogue
-
-.rva .LSEH_begin_sqr_382x
-.rva .LSEH_body_sqr_382x
-.rva .LSEH_info_sqr_382x_prologue
-
-.rva .LSEH_body_sqr_382x
-.rva .LSEH_epilogue_sqr_382x
-.rva .LSEH_info_sqr_382x_body
-
-.rva .LSEH_epilogue_sqr_382x
-.rva .LSEH_end_sqr_382x
-.rva .LSEH_info_sqr_382x_epilogue
-
-.rva .LSEH_begin_mul_384
-.rva .LSEH_body_mul_384
-.rva .LSEH_info_mul_384_prologue
-
-.rva .LSEH_body_mul_384
-.rva .LSEH_epilogue_mul_384
-.rva .LSEH_info_mul_384_body
-
-.rva .LSEH_epilogue_mul_384
-.rva .LSEH_end_mul_384
-.rva .LSEH_info_mul_384_epilogue
-
-.rva .LSEH_begin_sqr_384
-.rva .LSEH_body_sqr_384
-.rva .LSEH_info_sqr_384_prologue
-
-.rva .LSEH_body_sqr_384
-.rva .LSEH_epilogue_sqr_384
-.rva .LSEH_info_sqr_384_body
-
-.rva .LSEH_epilogue_sqr_384
-.rva .LSEH_end_sqr_384
-.rva .LSEH_info_sqr_384_epilogue
-
-.rva .LSEH_begin_sqr_mont_384
-.rva .LSEH_body_sqr_mont_384
-.rva .LSEH_info_sqr_mont_384_prologue
-
-.rva .LSEH_body_sqr_mont_384
-.rva .LSEH_epilogue_sqr_mont_384
-.rva .LSEH_info_sqr_mont_384_body
-
-.rva .LSEH_epilogue_sqr_mont_384
-.rva .LSEH_end_sqr_mont_384
-.rva .LSEH_info_sqr_mont_384_epilogue
-
-.rva .LSEH_begin_redc_mont_384
-.rva .LSEH_body_redc_mont_384
-.rva .LSEH_info_redc_mont_384_prologue
-
-.rva .LSEH_body_redc_mont_384
-.rva .LSEH_epilogue_redc_mont_384
-.rva .LSEH_info_redc_mont_384_body
-
-.rva .LSEH_epilogue_redc_mont_384
-.rva .LSEH_end_redc_mont_384
-.rva .LSEH_info_redc_mont_384_epilogue
-
-.rva .LSEH_begin_from_mont_384
-.rva .LSEH_body_from_mont_384
-.rva .LSEH_info_from_mont_384_prologue
-
-.rva .LSEH_body_from_mont_384
-.rva .LSEH_epilogue_from_mont_384
-.rva .LSEH_info_from_mont_384_body
-
-.rva .LSEH_epilogue_from_mont_384
-.rva .LSEH_end_from_mont_384
-.rva .LSEH_info_from_mont_384_epilogue
-
-.rva .LSEH_begin_sgn0_pty_mont_384
-.rva .LSEH_body_sgn0_pty_mont_384
-.rva .LSEH_info_sgn0_pty_mont_384_prologue
-
-.rva .LSEH_body_sgn0_pty_mont_384
-.rva .LSEH_epilogue_sgn0_pty_mont_384
-.rva .LSEH_info_sgn0_pty_mont_384_body
-
-.rva .LSEH_epilogue_sgn0_pty_mont_384
-.rva .LSEH_end_sgn0_pty_mont_384
-.rva .LSEH_info_sgn0_pty_mont_384_epilogue
-
-.rva .LSEH_begin_sgn0_pty_mont_384x
-.rva .LSEH_body_sgn0_pty_mont_384x
-.rva .LSEH_info_sgn0_pty_mont_384x_prologue
-
-.rva .LSEH_body_sgn0_pty_mont_384x
-.rva .LSEH_epilogue_sgn0_pty_mont_384x
-.rva .LSEH_info_sgn0_pty_mont_384x_body
-
-.rva .LSEH_epilogue_sgn0_pty_mont_384x
-.rva .LSEH_end_sgn0_pty_mont_384x
-.rva .LSEH_info_sgn0_pty_mont_384x_epilogue
-
-.rva .LSEH_begin_mul_mont_384
-.rva .LSEH_body_mul_mont_384
-.rva .LSEH_info_mul_mont_384_prologue
-
-.rva .LSEH_body_mul_mont_384
-.rva .LSEH_epilogue_mul_mont_384
-.rva .LSEH_info_mul_mont_384_body
-
-.rva .LSEH_epilogue_mul_mont_384
-.rva .LSEH_end_mul_mont_384
-.rva .LSEH_info_mul_mont_384_epilogue
-
-.rva .LSEH_begin_sqr_n_mul_mont_384
-.rva .LSEH_body_sqr_n_mul_mont_384
-.rva .LSEH_info_sqr_n_mul_mont_384_prologue
-
-.rva .LSEH_body_sqr_n_mul_mont_384
-.rva .LSEH_epilogue_sqr_n_mul_mont_384
-.rva .LSEH_info_sqr_n_mul_mont_384_body
-
-.rva .LSEH_epilogue_sqr_n_mul_mont_384
-.rva .LSEH_end_sqr_n_mul_mont_384
-.rva .LSEH_info_sqr_n_mul_mont_384_epilogue
-
-.rva .LSEH_begin_sqr_n_mul_mont_383
-.rva .LSEH_body_sqr_n_mul_mont_383
-.rva .LSEH_info_sqr_n_mul_mont_383_prologue
-
-.rva .LSEH_body_sqr_n_mul_mont_383
-.rva .LSEH_epilogue_sqr_n_mul_mont_383
-.rva .LSEH_info_sqr_n_mul_mont_383_body
-
-.rva .LSEH_epilogue_sqr_n_mul_mont_383
-.rva .LSEH_end_sqr_n_mul_mont_383
-.rva .LSEH_info_sqr_n_mul_mont_383_epilogue
-
-.rva .LSEH_begin_sqr_mont_382x
-.rva .LSEH_body_sqr_mont_382x
-.rva .LSEH_info_sqr_mont_382x_prologue
-
-.rva .LSEH_body_sqr_mont_382x
-.rva .LSEH_epilogue_sqr_mont_382x
-.rva .LSEH_info_sqr_mont_382x_body
-
-.rva .LSEH_epilogue_sqr_mont_382x
-.rva .LSEH_end_sqr_mont_382x
-.rva .LSEH_info_sqr_mont_382x_epilogue
-
-.section .xdata
-.p2align 3
-.LSEH_info_mul_mont_384x_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_mul_mont_384x_body:
-.byte 1,0,18,0
-.byte 0x00,0xf4,0x29,0x00
-.byte 0x00,0xe4,0x2a,0x00
-.byte 0x00,0xd4,0x2b,0x00
-.byte 0x00,0xc4,0x2c,0x00
-.byte 0x00,0x34,0x2d,0x00
-.byte 0x00,0x54,0x2e,0x00
-.byte 0x00,0x74,0x30,0x00
-.byte 0x00,0x64,0x31,0x00
-.byte 0x00,0x01,0x2f,0x00
-.byte 0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_mul_mont_384x_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_sqr_mont_384x_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_sqr_mont_384x_body:
-.byte 1,0,18,0
-.byte 0x00,0xf4,0x11,0x00
-.byte 0x00,0xe4,0x12,0x00
-.byte 0x00,0xd4,0x13,0x00
-.byte 0x00,0xc4,0x14,0x00
-.byte 0x00,0x34,0x15,0x00
-.byte 0x00,0x54,0x16,0x00
-.byte 0x00,0x74,0x18,0x00
-.byte 0x00,0x64,0x19,0x00
-.byte 0x00,0x01,0x17,0x00
-.byte 0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_sqr_mont_384x_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_mul_382x_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_mul_382x_body:
-.byte 1,0,18,0
-.byte 0x00,0xf4,0x11,0x00
-.byte 0x00,0xe4,0x12,0x00
-.byte 0x00,0xd4,0x13,0x00
-.byte 0x00,0xc4,0x14,0x00
-.byte 0x00,0x34,0x15,0x00
-.byte 0x00,0x54,0x16,0x00
-.byte 0x00,0x74,0x18,0x00
-.byte 0x00,0x64,0x19,0x00
-.byte 0x00,0x01,0x17,0x00
-.byte 0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_mul_382x_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_sqr_382x_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_sqr_382x_body:
-.byte 1,0,17,0
-.byte 0x00,0xf4,0x01,0x00
-.byte 0x00,0xe4,0x02,0x00
-.byte 0x00,0xd4,0x03,0x00
-.byte 0x00,0xc4,0x04,0x00
-.byte 0x00,0x34,0x05,0x00
-.byte 0x00,0x54,0x06,0x00
-.byte 0x00,0x74,0x08,0x00
-.byte 0x00,0x64,0x09,0x00
-.byte 0x00,0x62
-.byte 0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_sqr_382x_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_mul_384_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_mul_384_body:
-.byte 1,0,11,0
-.byte 0x00,0xc4,0x00,0x00
-.byte 0x00,0x34,0x01,0x00
-.byte 0x00,0x54,0x02,0x00
-.byte 0x00,0x74,0x04,0x00
-.byte 0x00,0x64,0x05,0x00
-.byte 0x00,0x22
-.byte 0x00,0x00,0x00,0x00,0x00,0x00
-.LSEH_info_mul_384_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_sqr_384_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_sqr_384_body:
-.byte 1,0,17,0
-.byte 0x00,0xf4,0x01,0x00
-.byte 0x00,0xe4,0x02,0x00
-.byte 0x00,0xd4,0x03,0x00
-.byte 0x00,0xc4,0x04,0x00
-.byte 0x00,0x34,0x05,0x00
-.byte 0x00,0x54,0x06,0x00
-.byte 0x00,0x74,0x08,0x00
-.byte 0x00,0x64,0x09,0x00
-.byte 0x00,0x62
-.byte 0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_sqr_384_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_sqr_mont_384_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_sqr_mont_384_body:
-.byte 1,0,18,0
-.byte 0x00,0xf4,0x0f,0x00
-.byte 0x00,0xe4,0x10,0x00
-.byte 0x00,0xd4,0x11,0x00
-.byte 0x00,0xc4,0x12,0x00
-.byte 0x00,0x34,0x13,0x00
-.byte 0x00,0x54,0x14,0x00
-.byte 0x00,0x74,0x16,0x00
-.byte 0x00,0x64,0x17,0x00
-.byte 0x00,0x01,0x15,0x00
-.byte 0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_sqr_mont_384_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_redc_mont_384_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_redc_mont_384_body:
-.byte 1,0,17,0
-.byte 0x00,0xf4,0x01,0x00
-.byte 0x00,0xe4,0x02,0x00
-.byte 0x00,0xd4,0x03,0x00
-.byte 0x00,0xc4,0x04,0x00
-.byte 0x00,0x34,0x05,0x00
-.byte 0x00,0x54,0x06,0x00
-.byte 0x00,0x74,0x08,0x00
-.byte 0x00,0x64,0x09,0x00
-.byte 0x00,0x62
-.byte 0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_redc_mont_384_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_from_mont_384_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_from_mont_384_body:
-.byte 1,0,17,0
-.byte 0x00,0xf4,0x01,0x00
-.byte 0x00,0xe4,0x02,0x00
-.byte 0x00,0xd4,0x03,0x00
-.byte 0x00,0xc4,0x04,0x00
-.byte 0x00,0x34,0x05,0x00
-.byte 0x00,0x54,0x06,0x00
-.byte 0x00,0x74,0x08,0x00
-.byte 0x00,0x64,0x09,0x00
-.byte 0x00,0x62
-.byte 0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_from_mont_384_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_sgn0_pty_mont_384_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_sgn0_pty_mont_384_body:
-.byte 1,0,17,0
-.byte 0x00,0xf4,0x01,0x00
-.byte 0x00,0xe4,0x02,0x00
-.byte 0x00,0xd4,0x03,0x00
-.byte 0x00,0xc4,0x04,0x00
-.byte 0x00,0x34,0x05,0x00
-.byte 0x00,0x54,0x06,0x00
-.byte 0x00,0x74,0x08,0x00
-.byte 0x00,0x64,0x09,0x00
-.byte 0x00,0x62
-.byte 0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_sgn0_pty_mont_384_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_sgn0_pty_mont_384x_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_sgn0_pty_mont_384x_body:
-.byte 1,0,17,0
-.byte 0x00,0xf4,0x01,0x00
-.byte 0x00,0xe4,0x02,0x00
-.byte 0x00,0xd4,0x03,0x00
-.byte 0x00,0xc4,0x04,0x00
-.byte 0x00,0x34,0x05,0x00
-.byte 0x00,0x54,0x06,0x00
-.byte 0x00,0x74,0x08,0x00
-.byte 0x00,0x64,0x09,0x00
-.byte 0x00,0x62
-.byte 0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_sgn0_pty_mont_384x_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_mul_mont_384_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_mul_mont_384_body:
-.byte 1,0,17,0
-.byte 0x00,0xf4,0x03,0x00
-.byte 0x00,0xe4,0x04,0x00
-.byte 0x00,0xd4,0x05,0x00
-.byte 0x00,0xc4,0x06,0x00
-.byte 0x00,0x34,0x07,0x00
-.byte 0x00,0x54,0x08,0x00
-.byte 0x00,0x74,0x0a,0x00
-.byte 0x00,0x64,0x0b,0x00
-.byte 0x00,0x82
-.byte 0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_mul_mont_384_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_sqr_n_mul_mont_384_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_sqr_n_mul_mont_384_body:
-.byte 1,0,18,0
-.byte 0x00,0xf4,0x11,0x00
-.byte 0x00,0xe4,0x12,0x00
-.byte 0x00,0xd4,0x13,0x00
-.byte 0x00,0xc4,0x14,0x00
-.byte 0x00,0x34,0x15,0x00
-.byte 0x00,0x54,0x16,0x00
-.byte 0x00,0x74,0x18,0x00
-.byte 0x00,0x64,0x19,0x00
-.byte 0x00,0x01,0x17,0x00
-.byte 0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_sqr_n_mul_mont_384_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_sqr_n_mul_mont_383_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_sqr_n_mul_mont_383_body:
-.byte 1,0,18,0
-.byte 0x00,0xf4,0x11,0x00
-.byte 0x00,0xe4,0x12,0x00
-.byte 0x00,0xd4,0x13,0x00
-.byte 0x00,0xc4,0x14,0x00
-.byte 0x00,0x34,0x15,0x00
-.byte 0x00,0x54,0x16,0x00
-.byte 0x00,0x74,0x18,0x00
-.byte 0x00,0x64,0x19,0x00
-.byte 0x00,0x01,0x17,0x00
-.byte 0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_sqr_n_mul_mont_383_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_sqr_mont_382x_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_sqr_mont_382x_body:
-.byte 1,0,18,0
-.byte 0x00,0xf4,0x11,0x00
-.byte 0x00,0xe4,0x12,0x00
-.byte 0x00,0xd4,0x13,0x00
-.byte 0x00,0xc4,0x14,0x00
-.byte 0x00,0x34,0x15,0x00
-.byte 0x00,0x54,0x16,0x00
-.byte 0x00,0x74,0x18,0x00
-.byte 0x00,0x64,0x19,0x00
-.byte 0x00,0x01,0x17,0x00
-.byte 0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_sqr_mont_382x_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
diff --git a/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s b/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s
deleted file mode 100644
index cba65569c52..00000000000
--- a/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s
+++ /dev/null
@@ -1,796 +0,0 @@
-.text
-
-.globl mulx_mont_sparse_256
-
-.def mulx_mont_sparse_256; .scl 2; .type 32; .endef
-.p2align 5
-mulx_mont_sparse_256:
- .byte 0xf3,0x0f,0x1e,0xfa
- movq %rdi,8(%rsp)
- movq %rsi,16(%rsp)
- movq %rsp,%r11
-.LSEH_begin_mulx_mont_sparse_256:
-
-
- movq %rcx,%rdi
- movq %rdx,%rsi
- movq %r8,%rdx
- movq %r9,%rcx
- movq 40(%rsp),%r8
-mul_mont_sparse_256$1:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- subq $8,%rsp
-
-.LSEH_body_mulx_mont_sparse_256:
-
-
- movq %rdx,%rbx
- movq 0(%rdx),%rdx
- movq 0(%rsi),%r14
- movq 8(%rsi),%r15
- movq 16(%rsi),%rbp
- movq 24(%rsi),%r9
- leaq -128(%rsi),%rsi
- leaq -128(%rcx),%rcx
-
- mulxq %r14,%rax,%r11
- call __mulx_mont_sparse_256
-
- movq 8(%rsp),%r15
-
- movq 16(%rsp),%r14
-
- movq 24(%rsp),%r13
-
- movq 32(%rsp),%r12
-
- movq 40(%rsp),%rbx
-
- movq 48(%rsp),%rbp
-
- leaq 56(%rsp),%rsp
-
-.LSEH_epilogue_mulx_mont_sparse_256:
- mov 8(%rsp),%rdi
- mov 16(%rsp),%rsi
-
- .byte 0xf3,0xc3
-
-.LSEH_end_mulx_mont_sparse_256:
-
-.globl sqrx_mont_sparse_256
-
-.def sqrx_mont_sparse_256; .scl 2; .type 32; .endef
-.p2align 5
-sqrx_mont_sparse_256:
- .byte 0xf3,0x0f,0x1e,0xfa
- movq %rdi,8(%rsp)
- movq %rsi,16(%rsp)
- movq %rsp,%r11
-.LSEH_begin_sqrx_mont_sparse_256:
-
-
- movq %rcx,%rdi
- movq %rdx,%rsi
- movq %r8,%rdx
- movq %r9,%rcx
-sqr_mont_sparse_256$1:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- subq $8,%rsp
-
-.LSEH_body_sqrx_mont_sparse_256:
-
-
- movq %rsi,%rbx
- movq %rcx,%r8
- movq %rdx,%rcx
- movq 0(%rsi),%rdx
- movq 8(%rsi),%r15
- movq 16(%rsi),%rbp
- movq 24(%rsi),%r9
- leaq -128(%rbx),%rsi
- leaq -128(%rcx),%rcx
-
- mulxq %rdx,%rax,%r11
- call __mulx_mont_sparse_256
-
- movq 8(%rsp),%r15
-
- movq 16(%rsp),%r14
-
- movq 24(%rsp),%r13
-
- movq 32(%rsp),%r12
-
- movq 40(%rsp),%rbx
-
- movq 48(%rsp),%rbp
-
- leaq 56(%rsp),%rsp
-
-.LSEH_epilogue_sqrx_mont_sparse_256:
- mov 8(%rsp),%rdi
- mov 16(%rsp),%rsi
-
- .byte 0xf3,0xc3
-
-.LSEH_end_sqrx_mont_sparse_256:
-.def __mulx_mont_sparse_256; .scl 3; .type 32; .endef
-.p2align 5
-__mulx_mont_sparse_256:
- .byte 0xf3,0x0f,0x1e,0xfa
-
- mulxq %r15,%r15,%r12
- mulxq %rbp,%rbp,%r13
- addq %r15,%r11
- mulxq %r9,%r9,%r14
- movq 8(%rbx),%rdx
- adcq %rbp,%r12
- adcq %r9,%r13
- adcq $0,%r14
-
- movq %rax,%r10
- imulq %r8,%rax
-
-
- xorq %r15,%r15
- mulxq 0+128(%rsi),%rbp,%r9
- adoxq %rbp,%r11
- adcxq %r9,%r12
-
- mulxq 8+128(%rsi),%rbp,%r9
- adoxq %rbp,%r12
- adcxq %r9,%r13
-
- mulxq 16+128(%rsi),%rbp,%r9
- adoxq %rbp,%r13
- adcxq %r9,%r14
-
- mulxq 24+128(%rsi),%rbp,%r9
- movq %rax,%rdx
- adoxq %rbp,%r14
- adcxq %r15,%r9
- adoxq %r9,%r15
-
-
- mulxq 0+128(%rcx),%rbp,%rax
- adcxq %rbp,%r10
- adoxq %r11,%rax
-
- mulxq 8+128(%rcx),%rbp,%r9
- adcxq %rbp,%rax
- adoxq %r9,%r12
-
- mulxq 16+128(%rcx),%rbp,%r9
- adcxq %rbp,%r12
- adoxq %r9,%r13
-
- mulxq 24+128(%rcx),%rbp,%r9
- movq 16(%rbx),%rdx
- adcxq %rbp,%r13
- adoxq %r9,%r14
- adcxq %r10,%r14
- adoxq %r10,%r15
- adcxq %r10,%r15
- adoxq %r10,%r10
- adcq $0,%r10
- movq %rax,%r11
- imulq %r8,%rax
-
-
- xorq %rbp,%rbp
- mulxq 0+128(%rsi),%rbp,%r9
- adoxq %rbp,%r12
- adcxq %r9,%r13
-
- mulxq 8+128(%rsi),%rbp,%r9
- adoxq %rbp,%r13
- adcxq %r9,%r14
-
- mulxq 16+128(%rsi),%rbp,%r9
- adoxq %rbp,%r14
- adcxq %r9,%r15
-
- mulxq 24+128(%rsi),%rbp,%r9
- movq %rax,%rdx
- adoxq %rbp,%r15
- adcxq %r10,%r9
- adoxq %r9,%r10
-
-
- mulxq 0+128(%rcx),%rbp,%rax
- adcxq %rbp,%r11
- adoxq %r12,%rax
-
- mulxq 8+128(%rcx),%rbp,%r9
- adcxq %rbp,%rax
- adoxq %r9,%r13
-
- mulxq 16+128(%rcx),%rbp,%r9
- adcxq %rbp,%r13
- adoxq %r9,%r14
-
- mulxq 24+128(%rcx),%rbp,%r9
- movq 24(%rbx),%rdx
- adcxq %rbp,%r14
- adoxq %r9,%r15
- adcxq %r11,%r15
- adoxq %r11,%r10
- adcxq %r11,%r10
- adoxq %r11,%r11
- adcq $0,%r11
- movq %rax,%r12
- imulq %r8,%rax
-
-
- xorq %rbp,%rbp
- mulxq 0+128(%rsi),%rbp,%r9
- adoxq %rbp,%r13
- adcxq %r9,%r14
-
- mulxq 8+128(%rsi),%rbp,%r9
- adoxq %rbp,%r14
- adcxq %r9,%r15
-
- mulxq 16+128(%rsi),%rbp,%r9
- adoxq %rbp,%r15
- adcxq %r9,%r10
-
- mulxq 24+128(%rsi),%rbp,%r9
- movq %rax,%rdx
- adoxq %rbp,%r10
- adcxq %r11,%r9
- adoxq %r9,%r11
-
-
- mulxq 0+128(%rcx),%rbp,%rax
- adcxq %rbp,%r12
- adoxq %r13,%rax
-
- mulxq 8+128(%rcx),%rbp,%r9
- adcxq %rbp,%rax
- adoxq %r9,%r14
-
- mulxq 16+128(%rcx),%rbp,%r9
- adcxq %rbp,%r14
- adoxq %r9,%r15
-
- mulxq 24+128(%rcx),%rbp,%r9
- movq %rax,%rdx
- adcxq %rbp,%r15
- adoxq %r9,%r10
- adcxq %r12,%r10
- adoxq %r12,%r11
- adcxq %r12,%r11
- adoxq %r12,%r12
- adcq $0,%r12
- imulq %r8,%rdx
-
-
- xorq %rbp,%rbp
- mulxq 0+128(%rcx),%r13,%r9
- adcxq %rax,%r13
- adoxq %r9,%r14
-
- mulxq 8+128(%rcx),%rbp,%r9
- adcxq %rbp,%r14
- adoxq %r9,%r15
-
- mulxq 16+128(%rcx),%rbp,%r9
- adcxq %rbp,%r15
- adoxq %r9,%r10
-
- mulxq 24+128(%rcx),%rbp,%r9
- movq %r14,%rdx
- leaq 128(%rcx),%rcx
- adcxq %rbp,%r10
- adoxq %r9,%r11
- movq %r15,%rax
- adcxq %r13,%r11
- adoxq %r13,%r12
- adcq $0,%r12
-
-
-
-
- movq %r10,%rbp
- subq 0(%rcx),%r14
- sbbq 8(%rcx),%r15
- sbbq 16(%rcx),%r10
- movq %r11,%r9
- sbbq 24(%rcx),%r11
- sbbq $0,%r12
-
- cmovcq %rdx,%r14
- cmovcq %rax,%r15
- cmovcq %rbp,%r10
- movq %r14,0(%rdi)
- cmovcq %r9,%r11
- movq %r15,8(%rdi)
- movq %r10,16(%rdi)
- movq %r11,24(%rdi)
-
- .byte 0xf3,0xc3
-
-.globl fromx_mont_256
-
-.def fromx_mont_256; .scl 2; .type 32; .endef
-.p2align 5
-fromx_mont_256:
- .byte 0xf3,0x0f,0x1e,0xfa
- movq %rdi,8(%rsp)
- movq %rsi,16(%rsp)
- movq %rsp,%r11
-.LSEH_begin_fromx_mont_256:
-
-
- movq %rcx,%rdi
- movq %rdx,%rsi
- movq %r8,%rdx
- movq %r9,%rcx
-from_mont_256$1:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- subq $8,%rsp
-
-.LSEH_body_fromx_mont_256:
-
-
- movq %rdx,%rbx
- call __mulx_by_1_mont_256
-
-
-
-
-
- movq %r15,%rdx
- movq %r10,%r12
- movq %r11,%r13
-
- subq 0(%rbx),%r14
- sbbq 8(%rbx),%r15
- sbbq 16(%rbx),%r10
- sbbq 24(%rbx),%r11
-
- cmovncq %r14,%rax
- cmovncq %r15,%rdx
- cmovncq %r10,%r12
- movq %rax,0(%rdi)
- cmovncq %r11,%r13
- movq %rdx,8(%rdi)
- movq %r12,16(%rdi)
- movq %r13,24(%rdi)
-
- movq 8(%rsp),%r15
-
- movq 16(%rsp),%r14
-
- movq 24(%rsp),%r13
-
- movq 32(%rsp),%r12
-
- movq 40(%rsp),%rbx
-
- movq 48(%rsp),%rbp
-
- leaq 56(%rsp),%rsp
-
-.LSEH_epilogue_fromx_mont_256:
- mov 8(%rsp),%rdi
- mov 16(%rsp),%rsi
-
- .byte 0xf3,0xc3
-
-.LSEH_end_fromx_mont_256:
-
-.globl redcx_mont_256
-
-.def redcx_mont_256; .scl 2; .type 32; .endef
-.p2align 5
-redcx_mont_256:
- .byte 0xf3,0x0f,0x1e,0xfa
- movq %rdi,8(%rsp)
- movq %rsi,16(%rsp)
- movq %rsp,%r11
-.LSEH_begin_redcx_mont_256:
-
-
- movq %rcx,%rdi
- movq %rdx,%rsi
- movq %r8,%rdx
- movq %r9,%rcx
-redc_mont_256$1:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- subq $8,%rsp
-
-.LSEH_body_redcx_mont_256:
-
-
- movq %rdx,%rbx
- call __mulx_by_1_mont_256
-
- addq 32(%rsi),%r14
- adcq 40(%rsi),%r15
- movq %r14,%rax
- adcq 48(%rsi),%r10
- movq %r15,%rdx
- adcq 56(%rsi),%r11
- sbbq %rsi,%rsi
-
-
-
-
- movq %r10,%r12
- subq 0(%rbx),%r14
- sbbq 8(%rbx),%r15
- sbbq 16(%rbx),%r10
- movq %r11,%r13
- sbbq 24(%rbx),%r11
- sbbq $0,%rsi
-
- cmovncq %r14,%rax
- cmovncq %r15,%rdx
- cmovncq %r10,%r12
- movq %rax,0(%rdi)
- cmovncq %r11,%r13
- movq %rdx,8(%rdi)
- movq %r12,16(%rdi)
- movq %r13,24(%rdi)
-
- movq 8(%rsp),%r15
-
- movq 16(%rsp),%r14
-
- movq 24(%rsp),%r13
-
- movq 32(%rsp),%r12
-
- movq 40(%rsp),%rbx
-
- movq 48(%rsp),%rbp
-
- leaq 56(%rsp),%rsp
-
-.LSEH_epilogue_redcx_mont_256:
- mov 8(%rsp),%rdi
- mov 16(%rsp),%rsi
-
- .byte 0xf3,0xc3
-
-.LSEH_end_redcx_mont_256:
-.def __mulx_by_1_mont_256; .scl 3; .type 32; .endef
-.p2align 5
-__mulx_by_1_mont_256:
- .byte 0xf3,0x0f,0x1e,0xfa
-
- movq 0(%rsi),%rax
- movq 8(%rsi),%r11
- movq 16(%rsi),%r12
- movq 24(%rsi),%r13
-
- movq %rax,%r14
- imulq %rcx,%rax
- movq %rax,%r10
-
- mulq 0(%rbx)
- addq %rax,%r14
- movq %r10,%rax
- adcq %rdx,%r14
-
- mulq 8(%rbx)
- addq %rax,%r11
- movq %r10,%rax
- adcq $0,%rdx
- addq %r14,%r11
- adcq $0,%rdx
- movq %rdx,%r14
-
- mulq 16(%rbx)
- movq %r11,%r15
- imulq %rcx,%r11
- addq %rax,%r12
- movq %r10,%rax
- adcq $0,%rdx
- addq %r14,%r12
- adcq $0,%rdx
- movq %rdx,%r14
-
- mulq 24(%rbx)
- addq %rax,%r13
- movq %r11,%rax
- adcq $0,%rdx
- addq %r14,%r13
- adcq $0,%rdx
- movq %rdx,%r14
-
- mulq 0(%rbx)
- addq %rax,%r15
- movq %r11,%rax
- adcq %rdx,%r15
-
- mulq 8(%rbx)
- addq %rax,%r12
- movq %r11,%rax
- adcq $0,%rdx
- addq %r15,%r12
- adcq $0,%rdx
- movq %rdx,%r15
-
- mulq 16(%rbx)
- movq %r12,%r10
- imulq %rcx,%r12
- addq %rax,%r13
- movq %r11,%rax
- adcq $0,%rdx
- addq %r15,%r13
- adcq $0,%rdx
- movq %rdx,%r15
-
- mulq 24(%rbx)
- addq %rax,%r14
- movq %r12,%rax
- adcq $0,%rdx
- addq %r15,%r14
- adcq $0,%rdx
- movq %rdx,%r15
-
- mulq 0(%rbx)
- addq %rax,%r10
- movq %r12,%rax
- adcq %rdx,%r10
-
- mulq 8(%rbx)
- addq %rax,%r13
- movq %r12,%rax
- adcq $0,%rdx
- addq %r10,%r13
- adcq $0,%rdx
- movq %rdx,%r10
-
- mulq 16(%rbx)
- movq %r13,%r11
- imulq %rcx,%r13
- addq %rax,%r14
- movq %r12,%rax
- adcq $0,%rdx
- addq %r10,%r14
- adcq $0,%rdx
- movq %rdx,%r10
-
- mulq 24(%rbx)
- addq %rax,%r15
- movq %r13,%rax
- adcq $0,%rdx
- addq %r10,%r15
- adcq $0,%rdx
- movq %rdx,%r10
-
- mulq 0(%rbx)
- addq %rax,%r11
- movq %r13,%rax
- adcq %rdx,%r11
-
- mulq 8(%rbx)
- addq %rax,%r14
- movq %r13,%rax
- adcq $0,%rdx
- addq %r11,%r14
- adcq $0,%rdx
- movq %rdx,%r11
-
- mulq 16(%rbx)
- addq %rax,%r15
- movq %r13,%rax
- adcq $0,%rdx
- addq %r11,%r15
- adcq $0,%rdx
- movq %rdx,%r11
-
- mulq 24(%rbx)
- addq %rax,%r10
- movq %r14,%rax
- adcq $0,%rdx
- addq %r11,%r10
- adcq $0,%rdx
- movq %rdx,%r11
- .byte 0xf3,0xc3
-
-.section .pdata
-.p2align 2
-.rva .LSEH_begin_mulx_mont_sparse_256
-.rva .LSEH_body_mulx_mont_sparse_256
-.rva .LSEH_info_mulx_mont_sparse_256_prologue
-
-.rva .LSEH_body_mulx_mont_sparse_256
-.rva .LSEH_epilogue_mulx_mont_sparse_256
-.rva .LSEH_info_mulx_mont_sparse_256_body
-
-.rva .LSEH_epilogue_mulx_mont_sparse_256
-.rva .LSEH_end_mulx_mont_sparse_256
-.rva .LSEH_info_mulx_mont_sparse_256_epilogue
-
-.rva .LSEH_begin_sqrx_mont_sparse_256
-.rva .LSEH_body_sqrx_mont_sparse_256
-.rva .LSEH_info_sqrx_mont_sparse_256_prologue
-
-.rva .LSEH_body_sqrx_mont_sparse_256
-.rva .LSEH_epilogue_sqrx_mont_sparse_256
-.rva .LSEH_info_sqrx_mont_sparse_256_body
-
-.rva .LSEH_epilogue_sqrx_mont_sparse_256
-.rva .LSEH_end_sqrx_mont_sparse_256
-.rva .LSEH_info_sqrx_mont_sparse_256_epilogue
-
-.rva .LSEH_begin_fromx_mont_256
-.rva .LSEH_body_fromx_mont_256
-.rva .LSEH_info_fromx_mont_256_prologue
-
-.rva .LSEH_body_fromx_mont_256
-.rva .LSEH_epilogue_fromx_mont_256
-.rva .LSEH_info_fromx_mont_256_body
-
-.rva .LSEH_epilogue_fromx_mont_256
-.rva .LSEH_end_fromx_mont_256
-.rva .LSEH_info_fromx_mont_256_epilogue
-
-.rva .LSEH_begin_redcx_mont_256
-.rva .LSEH_body_redcx_mont_256
-.rva .LSEH_info_redcx_mont_256_prologue
-
-.rva .LSEH_body_redcx_mont_256
-.rva .LSEH_epilogue_redcx_mont_256
-.rva .LSEH_info_redcx_mont_256_body
-
-.rva .LSEH_epilogue_redcx_mont_256
-.rva .LSEH_end_redcx_mont_256
-.rva .LSEH_info_redcx_mont_256_epilogue
-
-.section .xdata
-.p2align 3
-.LSEH_info_mulx_mont_sparse_256_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_mulx_mont_sparse_256_body:
-.byte 1,0,17,0
-.byte 0x00,0xf4,0x01,0x00
-.byte 0x00,0xe4,0x02,0x00
-.byte 0x00,0xd4,0x03,0x00
-.byte 0x00,0xc4,0x04,0x00
-.byte 0x00,0x34,0x05,0x00
-.byte 0x00,0x54,0x06,0x00
-.byte 0x00,0x74,0x08,0x00
-.byte 0x00,0x64,0x09,0x00
-.byte 0x00,0x62
-.byte 0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_mulx_mont_sparse_256_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_sqrx_mont_sparse_256_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_sqrx_mont_sparse_256_body:
-.byte 1,0,17,0
-.byte 0x00,0xf4,0x01,0x00
-.byte 0x00,0xe4,0x02,0x00
-.byte 0x00,0xd4,0x03,0x00
-.byte 0x00,0xc4,0x04,0x00
-.byte 0x00,0x34,0x05,0x00
-.byte 0x00,0x54,0x06,0x00
-.byte 0x00,0x74,0x08,0x00
-.byte 0x00,0x64,0x09,0x00
-.byte 0x00,0x62
-.byte 0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_sqrx_mont_sparse_256_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_fromx_mont_256_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_fromx_mont_256_body:
-.byte 1,0,17,0
-.byte 0x00,0xf4,0x01,0x00
-.byte 0x00,0xe4,0x02,0x00
-.byte 0x00,0xd4,0x03,0x00
-.byte 0x00,0xc4,0x04,0x00
-.byte 0x00,0x34,0x05,0x00
-.byte 0x00,0x54,0x06,0x00
-.byte 0x00,0x74,0x08,0x00
-.byte 0x00,0x64,0x09,0x00
-.byte 0x00,0x62
-.byte 0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_fromx_mont_256_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
-.LSEH_info_redcx_mont_256_prologue:
-.byte 1,0,5,0x0b
-.byte 0,0x74,1,0
-.byte 0,0x64,2,0
-.byte 0,0xb3
-.byte 0,0
-.long 0,0
-.LSEH_info_redcx_mont_256_body:
-.byte 1,0,17,0
-.byte 0x00,0xf4,0x01,0x00
-.byte 0x00,0xe4,0x02,0x00
-.byte 0x00,0xd4,0x03,0x00
-.byte 0x00,0xc4,0x04,0x00
-.byte 0x00,0x34,0x05,0x00
-.byte 0x00,0x54,0x06,0x00
-.byte 0x00,0x74,0x08,0x00
-.byte 0x00,0x64,0x09,0x00
-.byte 0x00,0x62
-.byte 0x00,0x00,0x00,0x00,0x00,0x00
-.byte 0x00,0x00,0x00,0x00
-.LSEH_info_redcx_mont_256_epilogue:
-.byte 1,0,4,0
-.byte 0x00,0x74,0x01,0x00
-.byte 0x00,0x64,0x02,0x00
-.byte 0x00,0x00,0x00,0x00
-
diff --git a/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s b/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s
deleted file mode 100644
index ce1354f46b4..00000000000
--- a/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s
+++ /dev/null
@@ -1,3608 +0,0 @@
-.text
-
-
-
-
-
-
-
-.def __subx_mod_384x384; .scl 3; .type 32; .endef
-.p2align 5
-__subx_mod_384x384:
- .byte 0xf3,0x0f,0x1e,0xfa
-
- movq 0(%rsi),%r8
- movq 8(%rsi),%r9
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
- movq 32(%rsi),%r12
- movq 40(%rsi),%r13
- movq 48(%rsi),%r14
-
- subq 0(%rdx),%r8
- movq 56(%rsi),%r15
- sbbq 8(%rdx),%r9
- movq 64(%rsi),%rax
- sbbq 16(%rdx),%r10
- movq 72(%rsi),%rbx
- sbbq 24(%rdx),%r11
- movq 80(%rsi),%rbp
- sbbq 32(%rdx),%r12
- movq 88(%rsi),%rsi
- sbbq 40(%rdx),%r13
- movq %r8,0(%rdi)
- sbbq 48(%rdx),%r14
- movq 0(%rcx),%r8
- movq %r9,8(%rdi)
- sbbq 56(%rdx),%r15
- movq 8(%rcx),%r9
- movq %r10,16(%rdi)
- sbbq 64(%rdx),%rax
- movq 16(%rcx),%r10
- movq %r11,24(%rdi)
- sbbq 72(%rdx),%rbx
- movq 24(%rcx),%r11
- movq %r12,32(%rdi)
- sbbq 80(%rdx),%rbp
- movq 32(%rcx),%r12
- movq %r13,40(%rdi)
- sbbq 88(%rdx),%rsi
- movq 40(%rcx),%r13
- sbbq %rdx,%rdx
-
- andq %rdx,%r8
- andq %rdx,%r9
- andq %rdx,%r10
- andq %rdx,%r11
- andq %rdx,%r12
- andq %rdx,%r13
-
- addq %r8,%r14
- adcq %r9,%r15
- movq %r14,48(%rdi)
- adcq %r10,%rax
- movq %r15,56(%rdi)
- adcq %r11,%rbx
- movq %rax,64(%rdi)
- adcq %r12,%rbp
- movq %rbx,72(%rdi)
- adcq %r13,%rsi
- movq %rbp,80(%rdi)
- movq %rsi,88(%rdi)
-
- .byte 0xf3,0xc3
-
-
-.def __addx_mod_384; .scl 3; .type 32; .endef
-.p2align 5
-__addx_mod_384:
- .byte 0xf3,0x0f,0x1e,0xfa
-
- movq 0(%rsi),%r8
- movq 8(%rsi),%r9
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
- movq 32(%rsi),%r12
- movq 40(%rsi),%r13
-
- addq 0(%rdx),%r8
- adcq 8(%rdx),%r9
- adcq 16(%rdx),%r10
- movq %r8,%r14
- adcq 24(%rdx),%r11
- movq %r9,%r15
- adcq 32(%rdx),%r12
- movq %r10,%rax
- adcq 40(%rdx),%r13
- movq %r11,%rbx
- sbbq %rdx,%rdx
-
- subq 0(%rcx),%r8
- sbbq 8(%rcx),%r9
- movq %r12,%rbp
- sbbq 16(%rcx),%r10
- sbbq 24(%rcx),%r11
- sbbq 32(%rcx),%r12
- movq %r13,%rsi
- sbbq 40(%rcx),%r13
- sbbq $0,%rdx
-
- cmovcq %r14,%r8
- cmovcq %r15,%r9
- cmovcq %rax,%r10
- movq %r8,0(%rdi)
- cmovcq %rbx,%r11
- movq %r9,8(%rdi)
- cmovcq %rbp,%r12
- movq %r10,16(%rdi)
- cmovcq %rsi,%r13
- movq %r11,24(%rdi)
- movq %r12,32(%rdi)
- movq %r13,40(%rdi)
-
- .byte 0xf3,0xc3
-
-
-.def __subx_mod_384; .scl 3; .type 32; .endef
-.p2align 5
-__subx_mod_384:
- .byte 0xf3,0x0f,0x1e,0xfa
-
- movq 0(%rsi),%r8
- movq 8(%rsi),%r9
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
- movq 32(%rsi),%r12
- movq 40(%rsi),%r13
-
-__subx_mod_384_a_is_loaded:
- subq 0(%rdx),%r8
- movq 0(%rcx),%r14
- sbbq 8(%rdx),%r9
- movq 8(%rcx),%r15
- sbbq 16(%rdx),%r10
- movq 16(%rcx),%rax
- sbbq 24(%rdx),%r11
- movq 24(%rcx),%rbx
- sbbq 32(%rdx),%r12
- movq 32(%rcx),%rbp
- sbbq 40(%rdx),%r13
- movq 40(%rcx),%rsi
- sbbq %rdx,%rdx
-
- andq %rdx,%r14
- andq %rdx,%r15
- andq %rdx,%rax
- andq %rdx,%rbx
- andq %rdx,%rbp
- andq %rdx,%rsi
-
- addq %r14,%r8
- adcq %r15,%r9
- movq %r8,0(%rdi)
- adcq %rax,%r10
- movq %r9,8(%rdi)
- adcq %rbx,%r11
- movq %r10,16(%rdi)
- adcq %rbp,%r12
- movq %r11,24(%rdi)
- adcq %rsi,%r13
- movq %r12,32(%rdi)
- movq %r13,40(%rdi)
-
- .byte 0xf3,0xc3
-
-.globl mulx_mont_384x
-
-.def mulx_mont_384x; .scl 2; .type 32; .endef
-.p2align 5
-mulx_mont_384x:
- .byte 0xf3,0x0f,0x1e,0xfa
- movq %rdi,8(%rsp)
- movq %rsi,16(%rsp)
- movq %rsp,%r11
-.LSEH_begin_mulx_mont_384x:
-
-
- movq %rcx,%rdi
- movq %rdx,%rsi
- movq %r8,%rdx
- movq %r9,%rcx
- movq 40(%rsp),%r8
-mul_mont_384x$1:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- subq $328,%rsp
-
-.LSEH_body_mulx_mont_384x:
-
-
- movq %rdx,%rbx
- movq %rdi,32(%rsp)
- movq %rsi,24(%rsp)
- movq %rdx,16(%rsp)
- movq %rcx,8(%rsp)
- movq %r8,0(%rsp)
-
-
-
-
- leaq 40(%rsp),%rdi
- call __mulx_384
-
-
- leaq 48(%rbx),%rbx
- leaq 128+48(%rsi),%rsi
- leaq 96(%rdi),%rdi
- call __mulx_384
-
-
- movq 8(%rsp),%rcx
- leaq (%rbx),%rsi
- leaq -48(%rbx),%rdx
- leaq 40+192+48(%rsp),%rdi
- call __addx_mod_384
-
- movq 24(%rsp),%rsi
- leaq 48(%rsi),%rdx
- leaq -48(%rdi),%rdi
- call __addx_mod_384
-
- leaq (%rdi),%rbx
- leaq 48(%rdi),%rsi
- call __mulx_384
-
-
- leaq (%rdi),%rsi
- leaq 40(%rsp),%rdx
- movq 8(%rsp),%rcx
- call __subx_mod_384x384
-
- leaq (%rdi),%rsi
- leaq -96(%rdi),%rdx
- call __subx_mod_384x384
-
-
- leaq 40(%rsp),%rsi
- leaq 40+96(%rsp),%rdx
- leaq 40(%rsp),%rdi
- call __subx_mod_384x384
-
- leaq (%rcx),%rbx
-
-
- leaq 40(%rsp),%rsi
- movq 0(%rsp),%rcx
- movq 32(%rsp),%rdi
- call __mulx_by_1_mont_384
- call __redx_tail_mont_384
-
-
- leaq 40+192(%rsp),%rsi
- movq 0(%rsp),%rcx
- leaq 48(%rdi),%rdi
- call __mulx_by_1_mont_384
- call __redx_tail_mont_384
-
- leaq 328(%rsp),%r8
- movq 0(%r8),%r15
-
- movq 8(%r8),%r14
-
- movq 16(%r8),%r13
-
- movq 24(%r8),%r12
-
- movq 32(%r8),%rbx
-
- movq 40(%r8),%rbp
-
- leaq 48(%r8),%rsp
-
-.LSEH_epilogue_mulx_mont_384x:
- mov 8(%rsp),%rdi
- mov 16(%rsp),%rsi
-
- .byte 0xf3,0xc3
-
-.LSEH_end_mulx_mont_384x:
-.globl sqrx_mont_384x
-
-.def sqrx_mont_384x; .scl 2; .type 32; .endef
-.p2align 5
-sqrx_mont_384x:
- .byte 0xf3,0x0f,0x1e,0xfa
- movq %rdi,8(%rsp)
- movq %rsi,16(%rsp)
- movq %rsp,%r11
-.LSEH_begin_sqrx_mont_384x:
-
-
- movq %rcx,%rdi
- movq %rdx,%rsi
- movq %r8,%rdx
- movq %r9,%rcx
-sqr_mont_384x$1:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- subq $136,%rsp
-
-.LSEH_body_sqrx_mont_384x:
-
-
- movq %rcx,0(%rsp)
- movq %rdx,%rcx
-
- movq %rdi,16(%rsp)
- movq %rsi,24(%rsp)
-
-
- leaq 48(%rsi),%rdx
- leaq 32(%rsp),%rdi
- call __addx_mod_384
-
-
- movq 24(%rsp),%rsi
- leaq 48(%rsi),%rdx
- leaq 32+48(%rsp),%rdi
- call __subx_mod_384
-
-
- movq 24(%rsp),%rsi
- leaq 48(%rsi),%rbx
-
- movq 48(%rsi),%rdx
- movq 0(%rsi),%r14
- movq 8(%rsi),%r15
- movq 16(%rsi),%rax
- movq 24(%rsi),%r12
- movq 32(%rsi),%rdi
- movq 40(%rsi),%rbp
- leaq -128(%rsi),%rsi
- leaq -128(%rcx),%rcx
-
- mulxq %r14,%r8,%r9
- call __mulx_mont_384
- addq %rdx,%rdx
- adcq %r15,%r15
- adcq %rax,%rax
- movq %rdx,%r8
- adcq %r12,%r12
- movq %r15,%r9
- adcq %rdi,%rdi
- movq %rax,%r10
- adcq %rbp,%rbp
- movq %r12,%r11
- sbbq %rsi,%rsi
-
- subq 0(%rcx),%rdx
- sbbq 8(%rcx),%r15
- movq %rdi,%r13
- sbbq 16(%rcx),%rax
- sbbq 24(%rcx),%r12
- sbbq 32(%rcx),%rdi
- movq %rbp,%r14
- sbbq 40(%rcx),%rbp
- sbbq $0,%rsi
-
- cmovcq %r8,%rdx
- cmovcq %r9,%r15
- cmovcq %r10,%rax
- movq %rdx,48(%rbx)
- cmovcq %r11,%r12
- movq %r15,56(%rbx)
- cmovcq %r13,%rdi
- movq %rax,64(%rbx)
- cmovcq %r14,%rbp
- movq %r12,72(%rbx)
- movq %rdi,80(%rbx)
- movq %rbp,88(%rbx)
-
- leaq 32(%rsp),%rsi
- leaq 32+48(%rsp),%rbx
-
- movq 32+48(%rsp),%rdx
- movq 32+0(%rsp),%r14
- movq 32+8(%rsp),%r15
- movq 32+16(%rsp),%rax
- movq 32+24(%rsp),%r12
- movq 32+32(%rsp),%rdi
- movq 32+40(%rsp),%rbp
- leaq -128(%rsi),%rsi
- leaq -128(%rcx),%rcx
-
- mulxq %r14,%r8,%r9
- call __mulx_mont_384
-
- leaq 136(%rsp),%r8
- movq 0(%r8),%r15
-
- movq 8(%r8),%r14
-
- movq 16(%r8),%r13
-
- movq 24(%r8),%r12
-
- movq 32(%r8),%rbx
-
- movq 40(%r8),%rbp
-
- leaq 48(%r8),%rsp
-
-.LSEH_epilogue_sqrx_mont_384x:
- mov 8(%rsp),%rdi
- mov 16(%rsp),%rsi
-
- .byte 0xf3,0xc3
-
-.LSEH_end_sqrx_mont_384x:
-
-.globl mulx_382x
-
-.def mulx_382x; .scl 2; .type 32; .endef
-.p2align 5
-mulx_382x:
- .byte 0xf3,0x0f,0x1e,0xfa
- movq %rdi,8(%rsp)
- movq %rsi,16(%rsp)
- movq %rsp,%r11
-.LSEH_begin_mulx_382x:
-
-
- movq %rcx,%rdi
- movq %rdx,%rsi
- movq %r8,%rdx
- movq %r9,%rcx
-mul_382x$1:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- subq $136,%rsp
-
-.LSEH_body_mulx_382x:
-
-
- leaq 96(%rdi),%rdi
- movq %rsi,0(%rsp)
- movq %rdx,8(%rsp)
- movq %rdi,16(%rsp)
- movq %rcx,24(%rsp)
-
-
- movq 0(%rsi),%r8
- movq 8(%rsi),%r9
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
- movq 32(%rsi),%r12
- movq 40(%rsi),%r13
-
- addq 48(%rsi),%r8
- adcq 56(%rsi),%r9
- adcq 64(%rsi),%r10
- adcq 72(%rsi),%r11
- adcq 80(%rsi),%r12
- adcq 88(%rsi),%r13
-
- movq %r8,32+0(%rsp)
- movq %r9,32+8(%rsp)
- movq %r10,32+16(%rsp)
- movq %r11,32+24(%rsp)
- movq %r12,32+32(%rsp)
- movq %r13,32+40(%rsp)
-
-
- movq 0(%rdx),%r8
- movq 8(%rdx),%r9
- movq 16(%rdx),%r10
- movq 24(%rdx),%r11
- movq 32(%rdx),%r12
- movq 40(%rdx),%r13
-
- addq 48(%rdx),%r8
- adcq 56(%rdx),%r9
- adcq 64(%rdx),%r10
- adcq 72(%rdx),%r11
- adcq 80(%rdx),%r12
- adcq 88(%rdx),%r13
-
- movq %r8,32+48(%rsp)
- movq %r9,32+56(%rsp)
- movq %r10,32+64(%rsp)
- movq %r11,32+72(%rsp)
- movq %r12,32+80(%rsp)
- movq %r13,32+88(%rsp)
-
-
- leaq 32+0(%rsp),%rsi
- leaq 32+48(%rsp),%rbx
- call __mulx_384
-
-
- movq 0(%rsp),%rsi
- movq 8(%rsp),%rbx
- leaq -96(%rdi),%rdi
- call __mulx_384
-
-
- leaq 48+128(%rsi),%rsi
- leaq 48(%rbx),%rbx
- leaq 32(%rsp),%rdi
- call __mulx_384
-
-
- movq 16(%rsp),%rsi
- leaq 32(%rsp),%rdx
- movq 24(%rsp),%rcx
- movq %rsi,%rdi
- call __subx_mod_384x384
-
-
- leaq 0(%rdi),%rsi
- leaq -96(%rdi),%rdx
- call __subx_mod_384x384
-
-
- leaq -96(%rdi),%rsi
- leaq 32(%rsp),%rdx
- leaq -96(%rdi),%rdi
- call __subx_mod_384x384
-
- leaq 136(%rsp),%r8
- movq 0(%r8),%r15
-
- movq 8(%r8),%r14
-
- movq 16(%r8),%r13
-
- movq 24(%r8),%r12
-
- movq 32(%r8),%rbx
-
- movq 40(%r8),%rbp
-
- leaq 48(%r8),%rsp
-
-.LSEH_epilogue_mulx_382x:
- mov 8(%rsp),%rdi
- mov 16(%rsp),%rsi
-
- .byte 0xf3,0xc3
-
-.LSEH_end_mulx_382x:
-.globl sqrx_382x
-
-.def sqrx_382x; .scl 2; .type 32; .endef
-.p2align 5
-sqrx_382x:
- .byte 0xf3,0x0f,0x1e,0xfa
- movq %rdi,8(%rsp)
- movq %rsi,16(%rsp)
- movq %rsp,%r11
-.LSEH_begin_sqrx_382x:
-
-
- movq %rcx,%rdi
- movq %rdx,%rsi
- movq %r8,%rdx
-sqr_382x$1:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- pushq %rsi
-
-.LSEH_body_sqrx_382x:
-
-
- movq %rdx,%rcx
-
-
- movq 0(%rsi),%r14
- movq 8(%rsi),%r15
- movq 16(%rsi),%rax
- movq 24(%rsi),%rbx
- movq 32(%rsi),%rbp
- movq 40(%rsi),%rdx
-
- movq %r14,%r8
- addq 48(%rsi),%r14
- movq %r15,%r9
- adcq 56(%rsi),%r15
- movq %rax,%r10
- adcq 64(%rsi),%rax
- movq %rbx,%r11
- adcq 72(%rsi),%rbx
- movq %rbp,%r12
- adcq 80(%rsi),%rbp
- movq %rdx,%r13
- adcq 88(%rsi),%rdx
-
- movq %r14,0(%rdi)
- movq %r15,8(%rdi)
- movq %rax,16(%rdi)
- movq %rbx,24(%rdi)
- movq %rbp,32(%rdi)
- movq %rdx,40(%rdi)
-
-
- leaq 48(%rsi),%rdx
- leaq 48(%rdi),%rdi
- call __subx_mod_384_a_is_loaded
-
-
- leaq (%rdi),%rsi
- leaq -48(%rdi),%rbx
- leaq -48(%rdi),%rdi
- call __mulx_384
-
-
- movq (%rsp),%rsi
- leaq 48(%rsi),%rbx
- leaq 96(%rdi),%rdi
- call __mulx_384
-
- movq 0(%rdi),%r8
- movq 8(%rdi),%r9
- movq 16(%rdi),%r10
- movq 24(%rdi),%r11
- movq 32(%rdi),%r12
- movq 40(%rdi),%r13
- movq 48(%rdi),%r14
- movq 56(%rdi),%r15
- movq 64(%rdi),%rax
- movq 72(%rdi),%rbx
- movq 80(%rdi),%rbp
- addq %r8,%r8
- movq 88(%rdi),%rdx
- adcq %r9,%r9
- movq %r8,0(%rdi)
- adcq %r10,%r10
- movq %r9,8(%rdi)
- adcq %r11,%r11
- movq %r10,16(%rdi)
- adcq %r12,%r12
- movq %r11,24(%rdi)
- adcq %r13,%r13
- movq %r12,32(%rdi)
- adcq %r14,%r14
- movq %r13,40(%rdi)
- adcq %r15,%r15
- movq %r14,48(%rdi)
- adcq %rax,%rax
- movq %r15,56(%rdi)
- adcq %rbx,%rbx
- movq %rax,64(%rdi)
- adcq %rbp,%rbp
- movq %rbx,72(%rdi)
- adcq %rdx,%rdx
- movq %rbp,80(%rdi)
- movq %rdx,88(%rdi)
-
- movq 8(%rsp),%r15
-
- movq 16(%rsp),%r14
-
- movq 24(%rsp),%r13
-
- movq 32(%rsp),%r12
-
- movq 40(%rsp),%rbx
-
- movq 48(%rsp),%rbp
-
- leaq 56(%rsp),%rsp
-
-.LSEH_epilogue_sqrx_382x:
- mov 8(%rsp),%rdi
- mov 16(%rsp),%rsi
-
- .byte 0xf3,0xc3
-
-.LSEH_end_sqrx_382x:
-.globl mulx_384
-
-.def mulx_384; .scl 2; .type 32; .endef
-.p2align 5
-mulx_384:
- .byte 0xf3,0x0f,0x1e,0xfa
- movq %rdi,8(%rsp)
- movq %rsi,16(%rsp)
- movq %rsp,%r11
-.LSEH_begin_mulx_384:
-
-
- movq %rcx,%rdi
- movq %rdx,%rsi
- movq %r8,%rdx
-mul_384$1:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
-.LSEH_body_mulx_384:
-
-
- movq %rdx,%rbx
- call __mulx_384
-
- movq 0(%rsp),%r15
-
- movq 8(%rsp),%r14
-
- movq 16(%rsp),%r13
-
- movq 24(%rsp),%r12
-
- movq 32(%rsp),%rbx
-
- movq 40(%rsp),%rbp
-
- leaq 48(%rsp),%rsp
-
-.LSEH_epilogue_mulx_384:
- mov 8(%rsp),%rdi
- mov 16(%rsp),%rsi
-
- .byte 0xf3,0xc3
-
-.LSEH_end_mulx_384:
-
-.def __mulx_384; .scl 3; .type 32; .endef
-.p2align 5
-__mulx_384:
- .byte 0xf3,0x0f,0x1e,0xfa
-
- movq 0(%rbx),%rdx
- movq 0(%rsi),%r14
- movq 8(%rsi),%r15
- movq 16(%rsi),%r10
- movq 24(%rsi),%r11
- movq 32(%rsi),%r12
- movq 40(%rsi),%r13
- leaq -128(%rsi),%rsi
-
- mulxq %r14,%r9,%rcx
- xorq %rbp,%rbp
-
- mulxq %r15,%r8,%rax
- adcxq %rcx,%r8
- movq %r9,0(%rdi)
-
- mulxq %r10,%r9,%rcx
- adcxq %rax,%r9
-
- mulxq %r11,%r10,%rax
- adcxq %rcx,%r10
-
- mulxq %r12,%r11,%rcx
- adcxq %rax,%r11
-
- mulxq %r13,%r12,%r13
- movq 8(%rbx),%rdx
- adcxq %rcx,%r12
- adcxq %rbp,%r13
- mulxq %r14,%rax,%rcx
- adcxq %r8,%rax
- adoxq %rcx,%r9
- movq %rax,8(%rdi)
-
- mulxq %r15,%r8,%rcx
- adcxq %r9,%r8
- adoxq %rcx,%r10
-
- mulxq 128+16(%rsi),%r9,%rax
- adcxq %r10,%r9
- adoxq %rax,%r11
-
- mulxq 128+24(%rsi),%r10,%rcx
- adcxq %r11,%r10
- adoxq %rcx,%r12
-
- mulxq 128+32(%rsi),%r11,%rax
- adcxq %r12,%r11
- adoxq %r13,%rax
-
- mulxq 128+40(%rsi),%r12,%r13
- movq 16(%rbx),%rdx
- adcxq %rax,%r12
- adoxq %rbp,%r13
- adcxq %rbp,%r13
- mulxq %r14,%rax,%rcx
- adcxq %r8,%rax
- adoxq %rcx,%r9
- movq %rax,16(%rdi)
-
- mulxq %r15,%r8,%rcx
- adcxq %r9,%r8
- adoxq %rcx,%r10
-
- mulxq 128+16(%rsi),%r9,%rax
- adcxq %r10,%r9
- adoxq %rax,%r11
-
- mulxq 128+24(%rsi),%r10,%rcx
- adcxq %r11,%r10
- adoxq %rcx,%r12
-
- mulxq 128+32(%rsi),%r11,%rax
- adcxq %r12,%r11
- adoxq %r13,%rax
-
- mulxq 128+40(%rsi),%r12,%r13
- movq 24(%rbx),%rdx
- adcxq %rax,%r12
- adoxq %rbp,%r13
- adcxq %rbp,%r13
- mulxq %r14,%rax,%rcx
- adcxq %r8,%rax
- adoxq %rcx,%r9
- movq %rax,24(%rdi)
-
- mulxq %r15,%r8,%rcx
- adcxq %r9,%r8
- adoxq %rcx,%r10
-
- mulxq 128+16(%rsi),%r9,%rax
- adcxq %r10,%r9
- adoxq %rax,%r11
-
- mulxq 128+24(%rsi),%r10,%rcx
- adcxq %r11,%r10
- adoxq %rcx,%r12
-
- mulxq 128+32(%rsi),%r11,%rax
- adcxq %r12,%r11
- adoxq %r13,%rax
-
- mulxq 128+40(%rsi),%r12,%r13
- movq 32(%rbx),%rdx
- adcxq %rax,%r12
- adoxq %rbp,%r13
- adcxq %rbp,%r13
- mulxq %r14,%rax,%rcx
- adcxq %r8,%rax
- adoxq %rcx,%r9
- movq %rax,32(%rdi)
-
- mulxq %r15,%r8,%rcx
- adcxq %r9,%r8
- adoxq %rcx,%r10
-
- mulxq 128+16(%rsi),%r9,%rax
- adcxq %r10,%r9
- adoxq %rax,%r11
-
- mulxq 128+24(%rsi),%r10,%rcx
- adcxq %r11,%r10
- adoxq %rcx,%r12
-
- mulxq 128+32(%rsi),%r11,%rax
- adcxq %r12,%r11
- adoxq %r13,%rax
-
- mulxq 128+40(%rsi),%r12,%r13
- movq 40(%rbx),%rdx
- adcxq %rax,%r12
- adoxq %rbp,%r13
- adcxq %rbp,%r13
- mulxq %r14,%rax,%rcx
- adcxq %r8,%rax
- adoxq %rcx,%r9
- movq %rax,40(%rdi)
-
- mulxq %r15,%r8,%rcx
- adcxq %r9,%r8
- adoxq %rcx,%r10
-
- mulxq 128+16(%rsi),%r9,%rax
- adcxq %r10,%r9
- adoxq %rax,%r11
-
- mulxq 128+24(%rsi),%r10,%rcx
- adcxq %r11,%r10
- adoxq %rcx,%r12
-
- mulxq 128+32(%rsi),%r11,%rax
- adcxq %r12,%r11
- adoxq %r13,%rax
-
- mulxq 128+40(%rsi),%r12,%r13
- movq %rax,%rdx
- adcxq %rax,%r12
- adoxq %rbp,%r13
- adcxq %rbp,%r13
- movq %r8,48(%rdi)
- movq %r9,56(%rdi)
- movq %r10,64(%rdi)
- movq %r11,72(%rdi)
- movq %r12,80(%rdi)
- movq %r13,88(%rdi)
-
- .byte 0xf3,0xc3
-
-.globl sqrx_384
-
-.def sqrx_384; .scl 2; .type 32; .endef
-.p2align 5
-sqrx_384:
- .byte 0xf3,0x0f,0x1e,0xfa
- movq %rdi,8(%rsp)
- movq %rsi,16(%rsp)
- movq %rsp,%r11
-.LSEH_begin_sqrx_384:
-
-
- movq %rcx,%rdi
- movq %rdx,%rsi
-sqr_384$1:
- pushq %rbp
-
- pushq %rbx
-
- pushq %r12
-
- pushq %r13
-
- pushq %r14
-
- pushq %r15
-
- pushq %rdi
-
-.LSEH_body_sqrx_384:
-
-
- call __sqrx_384
-
- movq 8(%rsp),%r15
-
- movq 16(%rsp),%r14
-
- movq 24(%rsp),%r13
-
- movq 32(%rsp),%r12
-
- movq 40(%rsp),%rbx
-
- movq 48(%rsp),%rbp
-
- leaq 56(%rsp),%rsp
-
-.LSEH_epilogue_sqrx_384:
- mov 8(%rsp),%rdi
- mov 16(%rsp),%rsi
-
- .byte 0xf3,0xc3
-
-.LSEH_end_sqrx_384:
-.def __sqrx_384; .scl 3; .type 32; .endef
-.p2align 5
-__sqrx_384:
- .byte 0xf3,0x0f,0x1e,0xfa
-
- movq 0(%rsi),%rdx
- movq 8(%rsi),%r14
- movq 16(%rsi),%r15
- movq 24(%rsi),%rcx
- movq 32(%rsi),%rbx
-
-
- mulxq %r14,%r8,%rdi
- movq 40(%rsi),%rbp
- mulxq
%r15,%r9,%rax - addq %rdi,%r9 - mulxq %rcx,%r10,%rdi - adcq %rax,%r10 - mulxq %rbx,%r11,%rax - adcq %rdi,%r11 - mulxq %rbp,%r12,%r13 - movq %r14,%rdx - adcq %rax,%r12 - adcq $0,%r13 - - - xorq %r14,%r14 - mulxq %r15,%rdi,%rax - adcxq %rdi,%r10 - adoxq %rax,%r11 - - mulxq %rcx,%rdi,%rax - adcxq %rdi,%r11 - adoxq %rax,%r12 - - mulxq %rbx,%rdi,%rax - adcxq %rdi,%r12 - adoxq %rax,%r13 - - mulxq %rbp,%rdi,%rax - movq %r15,%rdx - adcxq %rdi,%r13 - adoxq %r14,%rax - adcxq %rax,%r14 - - - xorq %r15,%r15 - mulxq %rcx,%rdi,%rax - adcxq %rdi,%r12 - adoxq %rax,%r13 - - mulxq %rbx,%rdi,%rax - adcxq %rdi,%r13 - adoxq %rax,%r14 - - mulxq %rbp,%rdi,%rax - movq %rcx,%rdx - adcxq %rdi,%r14 - adoxq %r15,%rax - adcxq %rax,%r15 - - - xorq %rcx,%rcx - mulxq %rbx,%rdi,%rax - adcxq %rdi,%r14 - adoxq %rax,%r15 - - mulxq %rbp,%rdi,%rax - movq %rbx,%rdx - adcxq %rdi,%r15 - adoxq %rcx,%rax - adcxq %rax,%rcx - - - mulxq %rbp,%rdi,%rbx - movq 0(%rsi),%rdx - addq %rdi,%rcx - movq 8(%rsp),%rdi - adcq $0,%rbx - - - xorq %rbp,%rbp - adcxq %r8,%r8 - adcxq %r9,%r9 - adcxq %r10,%r10 - adcxq %r11,%r11 - adcxq %r12,%r12 - - - mulxq %rdx,%rdx,%rax - movq %rdx,0(%rdi) - movq 8(%rsi),%rdx - adoxq %rax,%r8 - movq %r8,8(%rdi) - - mulxq %rdx,%r8,%rax - movq 16(%rsi),%rdx - adoxq %r8,%r9 - adoxq %rax,%r10 - movq %r9,16(%rdi) - movq %r10,24(%rdi) - - mulxq %rdx,%r8,%r9 - movq 24(%rsi),%rdx - adoxq %r8,%r11 - adoxq %r9,%r12 - adcxq %r13,%r13 - adcxq %r14,%r14 - movq %r11,32(%rdi) - movq %r12,40(%rdi) - - mulxq %rdx,%r8,%r9 - movq 32(%rsi),%rdx - adoxq %r8,%r13 - adoxq %r9,%r14 - adcxq %r15,%r15 - adcxq %rcx,%rcx - movq %r13,48(%rdi) - movq %r14,56(%rdi) - - mulxq %rdx,%r8,%r9 - movq 40(%rsi),%rdx - adoxq %r8,%r15 - adoxq %r9,%rcx - adcxq %rbx,%rbx - adcxq %rbp,%rbp - movq %r15,64(%rdi) - movq %rcx,72(%rdi) - - mulxq %rdx,%r8,%r9 - adoxq %r8,%rbx - adoxq %r9,%rbp - - movq %rbx,80(%rdi) - movq %rbp,88(%rdi) - - .byte 0xf3,0xc3 - - - - -.globl redcx_mont_384 - -.def redcx_mont_384; .scl 2; .type 32; .endef -.p2align 5 -redcx_mont_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_redcx_mont_384: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx -redc_mont_384$1: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $8,%rsp - -.LSEH_body_redcx_mont_384: - - - movq %rdx,%rbx - call __mulx_by_1_mont_384 - call __redx_tail_mont_384 - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_redcx_mont_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_redcx_mont_384: - - - - -.globl fromx_mont_384 - -.def fromx_mont_384; .scl 2; .type 32; .endef -.p2align 5 -fromx_mont_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_fromx_mont_384: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx -from_mont_384$1: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $8,%rsp - -.LSEH_body_fromx_mont_384: - - - movq %rdx,%rbx - call __mulx_by_1_mont_384 - - - - - movq %r14,%rax - movq %r15,%rcx - movq %r8,%rdx - movq %r9,%rbp - - subq 0(%rbx),%r14 - sbbq 8(%rbx),%r15 - movq %r10,%r13 - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - sbbq 32(%rbx),%r10 - movq %r11,%rsi - sbbq 40(%rbx),%r11 - - cmovcq %rax,%r14 - cmovcq %rcx,%r15 - cmovcq %rdx,%r8 - movq %r14,0(%rdi) - cmovcq %rbp,%r9 - movq 
%r15,8(%rdi) - cmovcq %r13,%r10 - movq %r8,16(%rdi) - cmovcq %rsi,%r11 - movq %r9,24(%rdi) - movq %r10,32(%rdi) - movq %r11,40(%rdi) - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_fromx_mont_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_fromx_mont_384: -.def __mulx_by_1_mont_384; .scl 3; .type 32; .endef -.p2align 5 -__mulx_by_1_mont_384: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq %rcx,%rdx - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - imulq %r8,%rdx - - - xorq %r14,%r14 - mulxq 0(%rbx),%rax,%rbp - adcxq %rax,%r8 - adoxq %rbp,%r9 - - mulxq 8(%rbx),%rax,%rbp - adcxq %rax,%r9 - adoxq %rbp,%r10 - - mulxq 16(%rbx),%rax,%rbp - adcxq %rax,%r10 - adoxq %rbp,%r11 - - mulxq 24(%rbx),%rax,%rbp - adcxq %rax,%r11 - adoxq %rbp,%r12 - - mulxq 32(%rbx),%rax,%rbp - adcxq %rax,%r12 - adoxq %rbp,%r13 - - mulxq 40(%rbx),%rax,%rbp - movq %rcx,%rdx - adcxq %rax,%r13 - adoxq %r14,%rbp - adcxq %rbp,%r14 - imulq %r9,%rdx - - - xorq %r15,%r15 - mulxq 0(%rbx),%rax,%rbp - adcxq %rax,%r9 - adoxq %rbp,%r10 - - mulxq 8(%rbx),%rax,%rbp - adcxq %rax,%r10 - adoxq %rbp,%r11 - - mulxq 16(%rbx),%rax,%rbp - adcxq %rax,%r11 - adoxq %rbp,%r12 - - mulxq 24(%rbx),%rax,%rbp - adcxq %rax,%r12 - adoxq %rbp,%r13 - - mulxq 32(%rbx),%rax,%rbp - adcxq %rax,%r13 - adoxq %rbp,%r14 - - mulxq 40(%rbx),%rax,%rbp - movq %rcx,%rdx - adcxq %rax,%r14 - adoxq %r15,%rbp - adcxq %rbp,%r15 - imulq %r10,%rdx - - - xorq %r8,%r8 - mulxq 0(%rbx),%rax,%rbp - adcxq %rax,%r10 - adoxq %rbp,%r11 - - mulxq 8(%rbx),%rax,%rbp - adcxq %rax,%r11 - adoxq %rbp,%r12 - - mulxq 16(%rbx),%rax,%rbp - adcxq %rax,%r12 - adoxq %rbp,%r13 - - mulxq 24(%rbx),%rax,%rbp - adcxq %rax,%r13 - adoxq %rbp,%r14 - - mulxq 32(%rbx),%rax,%rbp - adcxq %rax,%r14 - adoxq %rbp,%r15 - - mulxq 40(%rbx),%rax,%rbp - movq %rcx,%rdx - adcxq %rax,%r15 - adoxq %r8,%rbp - adcxq %rbp,%r8 - imulq %r11,%rdx - - - xorq %r9,%r9 - mulxq 0(%rbx),%rax,%rbp - adcxq %rax,%r11 - adoxq %rbp,%r12 - - mulxq 8(%rbx),%rax,%rbp - adcxq %rax,%r12 - adoxq %rbp,%r13 - - mulxq 16(%rbx),%rax,%rbp - adcxq %rax,%r13 - adoxq %rbp,%r14 - - mulxq 24(%rbx),%rax,%rbp - adcxq %rax,%r14 - adoxq %rbp,%r15 - - mulxq 32(%rbx),%rax,%rbp - adcxq %rax,%r15 - adoxq %rbp,%r8 - - mulxq 40(%rbx),%rax,%rbp - movq %rcx,%rdx - adcxq %rax,%r8 - adoxq %r9,%rbp - adcxq %rbp,%r9 - imulq %r12,%rdx - - - xorq %r10,%r10 - mulxq 0(%rbx),%rax,%rbp - adcxq %rax,%r12 - adoxq %rbp,%r13 - - mulxq 8(%rbx),%rax,%rbp - adcxq %rax,%r13 - adoxq %rbp,%r14 - - mulxq 16(%rbx),%rax,%rbp - adcxq %rax,%r14 - adoxq %rbp,%r15 - - mulxq 24(%rbx),%rax,%rbp - adcxq %rax,%r15 - adoxq %rbp,%r8 - - mulxq 32(%rbx),%rax,%rbp - adcxq %rax,%r8 - adoxq %rbp,%r9 - - mulxq 40(%rbx),%rax,%rbp - movq %rcx,%rdx - adcxq %rax,%r9 - adoxq %r10,%rbp - adcxq %rbp,%r10 - imulq %r13,%rdx - - - xorq %r11,%r11 - mulxq 0(%rbx),%rax,%rbp - adcxq %rax,%r13 - adoxq %rbp,%r14 - - mulxq 8(%rbx),%rax,%rbp - adcxq %rax,%r14 - adoxq %rbp,%r15 - - mulxq 16(%rbx),%rax,%rbp - adcxq %rax,%r15 - adoxq %rbp,%r8 - - mulxq 24(%rbx),%rax,%rbp - adcxq %rax,%r8 - adoxq %rbp,%r9 - - mulxq 32(%rbx),%rax,%rbp - adcxq %rax,%r9 - adoxq %rbp,%r10 - - mulxq 40(%rbx),%rax,%rbp - movq %rcx,%rdx - adcxq %rax,%r10 - adoxq %r11,%rbp - adcxq %rbp,%r11 - .byte 0xf3,0xc3 - - -.def __redx_tail_mont_384; .scl 3; .type 32; .endef -.p2align 5 -__redx_tail_mont_384: - .byte 0xf3,0x0f,0x1e,0xfa - - addq 
48(%rsi),%r14 - movq %r14,%rax - adcq 56(%rsi),%r15 - adcq 64(%rsi),%r8 - adcq 72(%rsi),%r9 - movq %r15,%rcx - adcq 80(%rsi),%r10 - adcq 88(%rsi),%r11 - sbbq %r12,%r12 - - - - - movq %r8,%rdx - movq %r9,%rbp - - subq 0(%rbx),%r14 - sbbq 8(%rbx),%r15 - movq %r10,%r13 - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - sbbq 32(%rbx),%r10 - movq %r11,%rsi - sbbq 40(%rbx),%r11 - sbbq $0,%r12 - - cmovcq %rax,%r14 - cmovcq %rcx,%r15 - cmovcq %rdx,%r8 - movq %r14,0(%rdi) - cmovcq %rbp,%r9 - movq %r15,8(%rdi) - cmovcq %r13,%r10 - movq %r8,16(%rdi) - cmovcq %rsi,%r11 - movq %r9,24(%rdi) - movq %r10,32(%rdi) - movq %r11,40(%rdi) - - .byte 0xf3,0xc3 - - -.globl sgn0x_pty_mont_384 - -.def sgn0x_pty_mont_384; .scl 2; .type 32; .endef -.p2align 5 -sgn0x_pty_mont_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sgn0x_pty_mont_384: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx -sgn0_pty_mont_384$1: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $8,%rsp - -.LSEH_body_sgn0x_pty_mont_384: - - - movq %rsi,%rbx - leaq 0(%rdi),%rsi - movq %rdx,%rcx - call __mulx_by_1_mont_384 - - xorq %rax,%rax - movq %r14,%r13 - addq %r14,%r14 - adcq %r15,%r15 - adcq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq $0,%rax - - subq 0(%rbx),%r14 - sbbq 8(%rbx),%r15 - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - sbbq 32(%rbx),%r10 - sbbq 40(%rbx),%r11 - sbbq $0,%rax - - notq %rax - andq $1,%r13 - andq $2,%rax - orq %r13,%rax - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp - - leaq 56(%rsp),%rsp - -.LSEH_epilogue_sgn0x_pty_mont_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_sgn0x_pty_mont_384: - -.globl sgn0x_pty_mont_384x - -.def sgn0x_pty_mont_384x; .scl 2; .type 32; .endef -.p2align 5 -sgn0x_pty_mont_384x: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sgn0x_pty_mont_384x: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx -sgn0_pty_mont_384x$1: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $8,%rsp - -.LSEH_body_sgn0x_pty_mont_384x: - - - movq %rsi,%rbx - leaq 48(%rdi),%rsi - movq %rdx,%rcx - call __mulx_by_1_mont_384 - - movq %r14,%r12 - orq %r15,%r14 - orq %r8,%r14 - orq %r9,%r14 - orq %r10,%r14 - orq %r11,%r14 - - leaq 0(%rdi),%rsi - xorq %rdi,%rdi - movq %r12,%r13 - addq %r12,%r12 - adcq %r15,%r15 - adcq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq $0,%rdi - - subq 0(%rbx),%r12 - sbbq 8(%rbx),%r15 - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - sbbq 32(%rbx),%r10 - sbbq 40(%rbx),%r11 - sbbq $0,%rdi - - movq %r14,0(%rsp) - notq %rdi - andq $1,%r13 - andq $2,%rdi - orq %r13,%rdi - - call __mulx_by_1_mont_384 - - movq %r14,%r12 - orq %r15,%r14 - orq %r8,%r14 - orq %r9,%r14 - orq %r10,%r14 - orq %r11,%r14 - - xorq %rax,%rax - movq %r12,%r13 - addq %r12,%r12 - adcq %r15,%r15 - adcq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq $0,%rax - - subq 0(%rbx),%r12 - sbbq 8(%rbx),%r15 - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - sbbq 32(%rbx),%r10 - sbbq 40(%rbx),%r11 - sbbq $0,%rax - - movq 0(%rsp),%r12 - - notq %rax - - testq %r14,%r14 - cmovzq %rdi,%r13 - - testq %r12,%r12 - cmovnzq %rdi,%rax - - andq $1,%r13 - andq $2,%rax - orq %r13,%rax - - movq 8(%rsp),%r15 - - movq 16(%rsp),%r14 - - movq 24(%rsp),%r13 - - movq 32(%rsp),%r12 - - movq 40(%rsp),%rbx - - movq 48(%rsp),%rbp 
- - leaq 56(%rsp),%rsp - -.LSEH_epilogue_sgn0x_pty_mont_384x: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_sgn0x_pty_mont_384x: -.globl mulx_mont_384 - -.def mulx_mont_384; .scl 2; .type 32; .endef -.p2align 5 -mulx_mont_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_mulx_mont_384: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - movq 40(%rsp),%r8 -mul_mont_384$1: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - leaq -24(%rsp),%rsp - -.LSEH_body_mulx_mont_384: - - - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r14 - movq 8(%rsi),%r15 - movq 16(%rsi),%rax - movq 24(%rsi),%r12 - movq %rdi,16(%rsp) - movq 32(%rsi),%rdi - movq 40(%rsi),%rbp - leaq -128(%rsi),%rsi - leaq -128(%rcx),%rcx - movq %r8,(%rsp) - - mulxq %r14,%r8,%r9 - call __mulx_mont_384 - - movq 24(%rsp),%r15 - - movq 32(%rsp),%r14 - - movq 40(%rsp),%r13 - - movq 48(%rsp),%r12 - - movq 56(%rsp),%rbx - - movq 64(%rsp),%rbp - - leaq 72(%rsp),%rsp - -.LSEH_epilogue_mulx_mont_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_mulx_mont_384: -.def __mulx_mont_384; .scl 3; .type 32; .endef -.p2align 5 -__mulx_mont_384: - .byte 0xf3,0x0f,0x1e,0xfa - - - mulxq %r15,%r14,%r10 - mulxq %rax,%r15,%r11 - addq %r14,%r9 - mulxq %r12,%rax,%r12 - adcq %r15,%r10 - mulxq %rdi,%rdi,%r13 - adcq %rax,%r11 - mulxq %rbp,%rbp,%r14 - movq 8(%rbx),%rdx - adcq %rdi,%r12 - adcq %rbp,%r13 - adcq $0,%r14 - xorq %r15,%r15 - - movq %r8,16(%rsp) - imulq 8(%rsp),%r8 - - - xorq %rax,%rax - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r9 - adcxq %rbp,%r10 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r10 - adcxq %rbp,%r11 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r11 - adcxq %rbp,%r12 - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%r12 - adcxq %rbp,%r13 - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r8,%rdx - adoxq %rdi,%r14 - adcxq %rbp,%r15 - adoxq %rax,%r15 - adoxq %rax,%rax - - - xorq %r8,%r8 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq 16(%rsp),%rdi - adoxq %rbp,%r9 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r9 - adoxq %rbp,%r10 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r10 - adoxq %rbp,%r11 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r11 - adoxq %rbp,%r12 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r12 - adoxq %rbp,%r13 - - mulxq 40+128(%rcx),%rdi,%rbp - movq 16(%rbx),%rdx - adcxq %rdi,%r13 - adoxq %rbp,%r14 - adcxq %r8,%r14 - adoxq %r8,%r15 - adcxq %r8,%r15 - adoxq %r8,%rax - adcxq %r8,%rax - movq %r9,16(%rsp) - imulq 8(%rsp),%r9 - - - xorq %r8,%r8 - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r10 - adcxq %rbp,%r11 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r11 - adcxq %rbp,%r12 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r12 - adcxq %rbp,%r13 - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%r14 - adcxq %rbp,%r15 - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r9,%rdx - adoxq %rdi,%r15 - adcxq %rbp,%rax - adoxq %r8,%rax - adoxq %r8,%r8 - - - xorq %r9,%r9 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq 16(%rsp),%rdi - adoxq %rbp,%r10 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r12 - adoxq %rbp,%r13 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 40+128(%rcx),%rdi,%rbp - movq 24(%rbx),%rdx - 
adcxq %rdi,%r14 - adoxq %rbp,%r15 - adcxq %r9,%r15 - adoxq %r9,%rax - adcxq %r9,%rax - adoxq %r9,%r8 - adcxq %r9,%r8 - movq %r10,16(%rsp) - imulq 8(%rsp),%r10 - - - xorq %r9,%r9 - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r11 - adcxq %rbp,%r12 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r12 - adcxq %rbp,%r13 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%r14 - adcxq %rbp,%r15 - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%r15 - adcxq %rbp,%rax - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r10,%rdx - adoxq %rdi,%rax - adcxq %rbp,%r8 - adoxq %r9,%r8 - adoxq %r9,%r9 - - - xorq %r10,%r10 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq 16(%rsp),%rdi - adoxq %rbp,%r11 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r14 - adoxq %rbp,%r15 - - mulxq 40+128(%rcx),%rdi,%rbp - movq 32(%rbx),%rdx - adcxq %rdi,%r15 - adoxq %rbp,%rax - adcxq %r10,%rax - adoxq %r10,%r8 - adcxq %r10,%r8 - adoxq %r10,%r9 - adcxq %r10,%r9 - movq %r11,16(%rsp) - imulq 8(%rsp),%r11 - - - xorq %r10,%r10 - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r12 - adcxq %rbp,%r13 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r14 - adcxq %rbp,%r15 - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%r15 - adcxq %rbp,%rax - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%rax - adcxq %rbp,%r8 - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r11,%rdx - adoxq %rdi,%r8 - adcxq %rbp,%r9 - adoxq %r10,%r9 - adoxq %r10,%r10 - - - xorq %r11,%r11 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq 16(%rsp),%rdi - adoxq %rbp,%r12 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r14 - adoxq %rbp,%r15 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r15 - adoxq %rbp,%rax - - mulxq 40+128(%rcx),%rdi,%rbp - movq 40(%rbx),%rdx - adcxq %rdi,%rax - adoxq %rbp,%r8 - adcxq %r11,%r8 - adoxq %r11,%r9 - adcxq %r11,%r9 - adoxq %r11,%r10 - adcxq %r11,%r10 - movq %r12,16(%rsp) - imulq 8(%rsp),%r12 - - - xorq %r11,%r11 - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r14 - adcxq %rbp,%r15 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r15 - adcxq %rbp,%rax - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%rax - adcxq %rbp,%r8 - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%r8 - adcxq %rbp,%r9 - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r12,%rdx - adoxq %rdi,%r9 - adcxq %rbp,%r10 - adoxq %r11,%r10 - adoxq %r11,%r11 - - - xorq %r12,%r12 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq 16(%rsp),%rdi - adoxq %rbp,%r13 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r14 - adoxq %rbp,%r15 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r15 - adoxq %rbp,%rax - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%rax - adoxq %rbp,%r8 - - mulxq 40+128(%rcx),%rdi,%rbp - movq %r13,%rdx - adcxq %rdi,%r8 - adoxq %rbp,%r9 - adcxq %r12,%r9 - adoxq %r12,%r10 - adcxq %r12,%r10 - adoxq %r12,%r11 - adcxq %r12,%r11 - imulq 8(%rsp),%rdx - movq 24(%rsp),%rbx - - - xorq %r12,%r12 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r14 - adoxq %rbp,%r15 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r15 - 
adoxq %rbp,%rax - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%rax - adoxq %rbp,%r8 - movq %r15,%r13 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r8 - adoxq %rbp,%r9 - movq %rax,%rsi - - mulxq 40+128(%rcx),%rdi,%rbp - adcxq %rdi,%r9 - adoxq %rbp,%r10 - movq %r14,%rdx - adcxq %r12,%r10 - adoxq %r12,%r11 - leaq 128(%rcx),%rcx - movq %r8,%r12 - adcq $0,%r11 - - - - - subq 0(%rcx),%r14 - sbbq 8(%rcx),%r15 - movq %r9,%rdi - sbbq 16(%rcx),%rax - sbbq 24(%rcx),%r8 - sbbq 32(%rcx),%r9 - movq %r10,%rbp - sbbq 40(%rcx),%r10 - sbbq $0,%r11 - - cmovncq %r14,%rdx - cmovcq %r13,%r15 - cmovcq %rsi,%rax - cmovncq %r8,%r12 - movq %rdx,0(%rbx) - cmovncq %r9,%rdi - movq %r15,8(%rbx) - cmovncq %r10,%rbp - movq %rax,16(%rbx) - movq %r12,24(%rbx) - movq %rdi,32(%rbx) - movq %rbp,40(%rbx) - - .byte 0xf3,0xc3 - - -.globl sqrx_mont_384 - -.def sqrx_mont_384; .scl 2; .type 32; .endef -.p2align 5 -sqrx_mont_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sqrx_mont_384: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx -sqr_mont_384$1: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - leaq -24(%rsp),%rsp - -.LSEH_body_sqrx_mont_384: - - - movq %rcx,%r8 - leaq -128(%rdx),%rcx - movq 0(%rsi),%rdx - movq 8(%rsi),%r15 - movq 16(%rsi),%rax - movq 24(%rsi),%r12 - movq %rdi,16(%rsp) - movq 32(%rsi),%rdi - movq 40(%rsi),%rbp - - leaq (%rsi),%rbx - movq %r8,(%rsp) - leaq -128(%rsi),%rsi - - mulxq %rdx,%r8,%r9 - call __mulx_mont_384 - - movq 24(%rsp),%r15 - - movq 32(%rsp),%r14 - - movq 40(%rsp),%r13 - - movq 48(%rsp),%r12 - - movq 56(%rsp),%rbx - - movq 64(%rsp),%rbp - - leaq 72(%rsp),%rsp - -.LSEH_epilogue_sqrx_mont_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_sqrx_mont_384: - -.globl sqrx_n_mul_mont_384 - -.def sqrx_n_mul_mont_384; .scl 2; .type 32; .endef -.p2align 5 -sqrx_n_mul_mont_384: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sqrx_n_mul_mont_384: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - movq 40(%rsp),%r8 - movq 48(%rsp),%r9 -sqr_n_mul_mont_384$1: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - leaq -40(%rsp),%rsp - -.LSEH_body_sqrx_n_mul_mont_384: - - - movq %rdx,%r10 - movq 0(%rsi),%rdx - movq 8(%rsi),%r15 - movq 16(%rsi),%rax - movq %rsi,%rbx - movq 24(%rsi),%r12 - movq %rdi,16(%rsp) - movq 32(%rsi),%rdi - movq 40(%rsi),%rbp - - movq %r8,(%rsp) - movq %r9,24(%rsp) - movq 0(%r9),%xmm2 - -.Loop_sqrx_384: - movd %r10d,%xmm1 - leaq -128(%rbx),%rsi - leaq -128(%rcx),%rcx - - mulxq %rdx,%r8,%r9 - call __mulx_mont_384 - - movd %xmm1,%r10d - decl %r10d - jnz .Loop_sqrx_384 - - movq %rdx,%r14 -.byte 102,72,15,126,210 - leaq -128(%rbx),%rsi - movq 24(%rsp),%rbx - leaq -128(%rcx),%rcx - - mulxq %r14,%r8,%r9 - call __mulx_mont_384 - - movq 40(%rsp),%r15 - - movq 48(%rsp),%r14 - - movq 56(%rsp),%r13 - - movq 64(%rsp),%r12 - - movq 72(%rsp),%rbx - - movq 80(%rsp),%rbp - - leaq 88(%rsp),%rsp - -.LSEH_epilogue_sqrx_n_mul_mont_384: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_sqrx_n_mul_mont_384: - -.globl sqrx_n_mul_mont_383 - -.def sqrx_n_mul_mont_383; .scl 2; .type 32; .endef -.p2align 5 -sqrx_n_mul_mont_383: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sqrx_n_mul_mont_383: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx - movq 40(%rsp),%r8 - movq 
48(%rsp),%r9 -sqr_n_mul_mont_383$1: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - leaq -40(%rsp),%rsp - -.LSEH_body_sqrx_n_mul_mont_383: - - - movq %rdx,%r10 - movq 0(%rsi),%rdx - movq 8(%rsi),%r15 - movq 16(%rsi),%rax - movq %rsi,%rbx - movq 24(%rsi),%r12 - movq %rdi,16(%rsp) - movq 32(%rsi),%rdi - movq 40(%rsi),%rbp - - movq %r8,(%rsp) - movq %r9,24(%rsp) - movq 0(%r9),%xmm2 - leaq -128(%rcx),%rcx - -.Loop_sqrx_383: - movd %r10d,%xmm1 - leaq -128(%rbx),%rsi - - mulxq %rdx,%r8,%r9 - call __mulx_mont_383_nonred - - movd %xmm1,%r10d - decl %r10d - jnz .Loop_sqrx_383 - - movq %rdx,%r14 -.byte 102,72,15,126,210 - leaq -128(%rbx),%rsi - movq 24(%rsp),%rbx - - mulxq %r14,%r8,%r9 - call __mulx_mont_384 - - movq 40(%rsp),%r15 - - movq 48(%rsp),%r14 - - movq 56(%rsp),%r13 - - movq 64(%rsp),%r12 - - movq 72(%rsp),%rbx - - movq 80(%rsp),%rbp - - leaq 88(%rsp),%rsp - -.LSEH_epilogue_sqrx_n_mul_mont_383: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_sqrx_n_mul_mont_383: -.def __mulx_mont_383_nonred; .scl 3; .type 32; .endef -.p2align 5 -__mulx_mont_383_nonred: - .byte 0xf3,0x0f,0x1e,0xfa - - - mulxq %r15,%r14,%r10 - mulxq %rax,%r15,%r11 - addq %r14,%r9 - mulxq %r12,%rax,%r12 - adcq %r15,%r10 - mulxq %rdi,%rdi,%r13 - adcq %rax,%r11 - mulxq %rbp,%rbp,%r14 - movq 8(%rbx),%rdx - adcq %rdi,%r12 - adcq %rbp,%r13 - adcq $0,%r14 - movq %r8,%rax - imulq 8(%rsp),%r8 - - - xorq %r15,%r15 - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r9 - adcxq %rbp,%r10 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r10 - adcxq %rbp,%r11 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r11 - adcxq %rbp,%r12 - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%r12 - adcxq %rbp,%r13 - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r8,%rdx - adoxq %rdi,%r14 - adcxq %r15,%rbp - adoxq %rbp,%r15 - - - xorq %r8,%r8 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq %rdi,%rax - adoxq %rbp,%r9 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r9 - adoxq %rbp,%r10 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r10 - adoxq %rbp,%r11 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r11 - adoxq %rbp,%r12 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r12 - adoxq %rbp,%r13 - - mulxq 40+128(%rcx),%rdi,%rbp - movq 16(%rbx),%rdx - adcxq %rdi,%r13 - adoxq %rbp,%r14 - adcxq %rax,%r14 - adoxq %rax,%r15 - adcxq %rax,%r15 - movq %r9,%r8 - imulq 8(%rsp),%r9 - - - xorq %rax,%rax - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r10 - adcxq %rbp,%r11 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r11 - adcxq %rbp,%r12 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r12 - adcxq %rbp,%r13 - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%r14 - adcxq %rbp,%r15 - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r9,%rdx - adoxq %rdi,%r15 - adcxq %rax,%rbp - adoxq %rbp,%rax - - - xorq %r9,%r9 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq %rdi,%r8 - adoxq %rbp,%r10 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r12 - adoxq %rbp,%r13 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 40+128(%rcx),%rdi,%rbp - movq 24(%rbx),%rdx - adcxq %rdi,%r14 - adoxq %rbp,%r15 - adcxq %r8,%r15 - adoxq %r8,%rax - adcxq %r8,%rax - movq %r10,%r9 - imulq 8(%rsp),%r10 - - - xorq %r8,%r8 - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r11 - adcxq %rbp,%r12 - - mulxq 
8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r12 - adcxq %rbp,%r13 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%r14 - adcxq %rbp,%r15 - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%r15 - adcxq %rbp,%rax - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r10,%rdx - adoxq %rdi,%rax - adcxq %r8,%rbp - adoxq %rbp,%r8 - - - xorq %r10,%r10 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq %rdi,%r9 - adoxq %rbp,%r11 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r14 - adoxq %rbp,%r15 - - mulxq 40+128(%rcx),%rdi,%rbp - movq 32(%rbx),%rdx - adcxq %rdi,%r15 - adoxq %rbp,%rax - adcxq %r9,%rax - adoxq %r9,%r8 - adcxq %r9,%r8 - movq %r11,%r10 - imulq 8(%rsp),%r11 - - - xorq %r9,%r9 - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r12 - adcxq %rbp,%r13 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r14 - adcxq %rbp,%r15 - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%r15 - adcxq %rbp,%rax - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%rax - adcxq %rbp,%r8 - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r11,%rdx - adoxq %rdi,%r8 - adcxq %r9,%rbp - adoxq %rbp,%r9 - - - xorq %r11,%r11 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq %rdi,%r10 - adoxq %rbp,%r12 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r14 - adoxq %rbp,%r15 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r15 - adoxq %rbp,%rax - - mulxq 40+128(%rcx),%rdi,%rbp - movq 40(%rbx),%rdx - adcxq %rdi,%rax - adoxq %rbp,%r8 - adcxq %r10,%r8 - adoxq %r10,%r9 - adcxq %r10,%r9 - movq %r12,%r11 - imulq 8(%rsp),%r12 - - - xorq %r10,%r10 - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r14 - adcxq %rbp,%r15 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r15 - adcxq %rbp,%rax - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%rax - adcxq %rbp,%r8 - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%r8 - adcxq %rbp,%r9 - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r12,%rdx - adoxq %rdi,%r9 - adcxq %r10,%rbp - adoxq %rbp,%r10 - - - xorq %r12,%r12 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq %rdi,%r11 - adoxq %rbp,%r13 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r14 - adoxq %rbp,%r15 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r15 - adoxq %rbp,%rax - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%rax - adoxq %rbp,%r8 - - mulxq 40+128(%rcx),%rdi,%rbp - movq %r13,%rdx - adcxq %rdi,%r8 - adoxq %rbp,%r9 - adcxq %r11,%r9 - adoxq %r11,%r10 - adcxq %r11,%r10 - imulq 8(%rsp),%rdx - movq 24(%rsp),%rbx - - - xorq %r12,%r12 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r14 - adoxq %rbp,%r15 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r15 - adoxq %rbp,%rax - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%rax - adoxq %rbp,%r8 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r8 - adoxq %rbp,%r9 - - mulxq 40+128(%rcx),%rdi,%rbp - movq %r14,%rdx - adcxq %rdi,%r9 - adoxq %rbp,%r10 - adcq $0,%r10 - movq %r8,%r12 - - movq %r14,0(%rbx) - movq %r15,8(%rbx) - movq %rax,16(%rbx) - movq %r9,%rdi - movq %r8,24(%rbx) - movq %r9,32(%rbx) - movq %r10,40(%rbx) - movq %r10,%rbp - - .byte 
0xf3,0xc3 - - -.globl sqrx_mont_382x - -.def sqrx_mont_382x; .scl 2; .type 32; .endef -.p2align 5 -sqrx_mont_382x: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_sqrx_mont_382x: - - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - movq %r9,%rcx -sqr_mont_382x$1: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $136,%rsp - -.LSEH_body_sqrx_mont_382x: - - - movq %rcx,0(%rsp) - movq %rdx,%rcx - movq %rdi,16(%rsp) - movq %rsi,24(%rsp) - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %r8,%r14 - addq 48(%rsi),%r8 - movq %r9,%r15 - adcq 56(%rsi),%r9 - movq %r10,%rax - adcq 64(%rsi),%r10 - movq %r11,%rdx - adcq 72(%rsi),%r11 - movq %r12,%rbx - adcq 80(%rsi),%r12 - movq %r13,%rbp - adcq 88(%rsi),%r13 - - subq 48(%rsi),%r14 - sbbq 56(%rsi),%r15 - sbbq 64(%rsi),%rax - sbbq 72(%rsi),%rdx - sbbq 80(%rsi),%rbx - sbbq 88(%rsi),%rbp - sbbq %rdi,%rdi - - movq %r8,32+0(%rsp) - movq %r9,32+8(%rsp) - movq %r10,32+16(%rsp) - movq %r11,32+24(%rsp) - movq %r12,32+32(%rsp) - movq %r13,32+40(%rsp) - - movq %r14,32+48(%rsp) - movq %r15,32+56(%rsp) - movq %rax,32+64(%rsp) - movq %rdx,32+72(%rsp) - movq %rbx,32+80(%rsp) - movq %rbp,32+88(%rsp) - movq %rdi,32+96(%rsp) - - - - leaq 48(%rsi),%rbx - - movq 48(%rsi),%rdx - movq 0(%rsi),%r14 - movq 8(%rsi),%r15 - movq 16(%rsi),%rax - movq 24(%rsi),%r12 - movq 32(%rsi),%rdi - movq 40(%rsi),%rbp - leaq -128(%rsi),%rsi - leaq -128(%rcx),%rcx - - mulxq %r14,%r8,%r9 - call __mulx_mont_383_nonred - addq %rdx,%rdx - adcq %r15,%r15 - adcq %rax,%rax - adcq %r12,%r12 - adcq %rdi,%rdi - adcq %rbp,%rbp - - movq %rdx,48(%rbx) - movq %r15,56(%rbx) - movq %rax,64(%rbx) - movq %r12,72(%rbx) - movq %rdi,80(%rbx) - movq %rbp,88(%rbx) - - leaq 32-128(%rsp),%rsi - leaq 32+48(%rsp),%rbx - - movq 32+48(%rsp),%rdx - movq 32+0(%rsp),%r14 - movq 32+8(%rsp),%r15 - movq 32+16(%rsp),%rax - movq 32+24(%rsp),%r12 - movq 32+32(%rsp),%rdi - movq 32+40(%rsp),%rbp - - - - mulxq %r14,%r8,%r9 - call __mulx_mont_383_nonred - movq 32+96(%rsp),%r14 - leaq 128(%rcx),%rcx - movq 32+0(%rsp),%r8 - andq %r14,%r8 - movq 32+8(%rsp),%r9 - andq %r14,%r9 - movq 32+16(%rsp),%r10 - andq %r14,%r10 - movq 32+24(%rsp),%r11 - andq %r14,%r11 - movq 32+32(%rsp),%r13 - andq %r14,%r13 - andq 32+40(%rsp),%r14 - - subq %r8,%rdx - movq 0(%rcx),%r8 - sbbq %r9,%r15 - movq 8(%rcx),%r9 - sbbq %r10,%rax - movq 16(%rcx),%r10 - sbbq %r11,%r12 - movq 24(%rcx),%r11 - sbbq %r13,%rdi - movq 32(%rcx),%r13 - sbbq %r14,%rbp - sbbq %r14,%r14 - - andq %r14,%r8 - andq %r14,%r9 - andq %r14,%r10 - andq %r14,%r11 - andq %r14,%r13 - andq 40(%rcx),%r14 - - addq %r8,%rdx - adcq %r9,%r15 - adcq %r10,%rax - adcq %r11,%r12 - adcq %r13,%rdi - adcq %r14,%rbp - - movq %rdx,0(%rbx) - movq %r15,8(%rbx) - movq %rax,16(%rbx) - movq %r12,24(%rbx) - movq %rdi,32(%rbx) - movq %rbp,40(%rbx) - leaq 136(%rsp),%r8 - movq 0(%r8),%r15 - - movq 8(%r8),%r14 - - movq 16(%r8),%r13 - - movq 24(%r8),%r12 - - movq 32(%r8),%rbx - - movq 40(%r8),%rbp - - leaq 48(%r8),%rsp - -.LSEH_epilogue_sqrx_mont_382x: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_sqrx_mont_382x: -.section .pdata -.p2align 2 -.rva .LSEH_begin_mulx_mont_384x -.rva .LSEH_body_mulx_mont_384x -.rva .LSEH_info_mulx_mont_384x_prologue - -.rva .LSEH_body_mulx_mont_384x -.rva .LSEH_epilogue_mulx_mont_384x -.rva .LSEH_info_mulx_mont_384x_body - -.rva .LSEH_epilogue_mulx_mont_384x -.rva .LSEH_end_mulx_mont_384x 
-.rva .LSEH_info_mulx_mont_384x_epilogue - -.rva .LSEH_begin_sqrx_mont_384x -.rva .LSEH_body_sqrx_mont_384x -.rva .LSEH_info_sqrx_mont_384x_prologue - -.rva .LSEH_body_sqrx_mont_384x -.rva .LSEH_epilogue_sqrx_mont_384x -.rva .LSEH_info_sqrx_mont_384x_body - -.rva .LSEH_epilogue_sqrx_mont_384x -.rva .LSEH_end_sqrx_mont_384x -.rva .LSEH_info_sqrx_mont_384x_epilogue - -.rva .LSEH_begin_mulx_382x -.rva .LSEH_body_mulx_382x -.rva .LSEH_info_mulx_382x_prologue - -.rva .LSEH_body_mulx_382x -.rva .LSEH_epilogue_mulx_382x -.rva .LSEH_info_mulx_382x_body - -.rva .LSEH_epilogue_mulx_382x -.rva .LSEH_end_mulx_382x -.rva .LSEH_info_mulx_382x_epilogue - -.rva .LSEH_begin_sqrx_382x -.rva .LSEH_body_sqrx_382x -.rva .LSEH_info_sqrx_382x_prologue - -.rva .LSEH_body_sqrx_382x -.rva .LSEH_epilogue_sqrx_382x -.rva .LSEH_info_sqrx_382x_body - -.rva .LSEH_epilogue_sqrx_382x -.rva .LSEH_end_sqrx_382x -.rva .LSEH_info_sqrx_382x_epilogue - -.rva .LSEH_begin_mulx_384 -.rva .LSEH_body_mulx_384 -.rva .LSEH_info_mulx_384_prologue - -.rva .LSEH_body_mulx_384 -.rva .LSEH_epilogue_mulx_384 -.rva .LSEH_info_mulx_384_body - -.rva .LSEH_epilogue_mulx_384 -.rva .LSEH_end_mulx_384 -.rva .LSEH_info_mulx_384_epilogue - -.rva .LSEH_begin_sqrx_384 -.rva .LSEH_body_sqrx_384 -.rva .LSEH_info_sqrx_384_prologue - -.rva .LSEH_body_sqrx_384 -.rva .LSEH_epilogue_sqrx_384 -.rva .LSEH_info_sqrx_384_body - -.rva .LSEH_epilogue_sqrx_384 -.rva .LSEH_end_sqrx_384 -.rva .LSEH_info_sqrx_384_epilogue - -.rva .LSEH_begin_redcx_mont_384 -.rva .LSEH_body_redcx_mont_384 -.rva .LSEH_info_redcx_mont_384_prologue - -.rva .LSEH_body_redcx_mont_384 -.rva .LSEH_epilogue_redcx_mont_384 -.rva .LSEH_info_redcx_mont_384_body - -.rva .LSEH_epilogue_redcx_mont_384 -.rva .LSEH_end_redcx_mont_384 -.rva .LSEH_info_redcx_mont_384_epilogue - -.rva .LSEH_begin_fromx_mont_384 -.rva .LSEH_body_fromx_mont_384 -.rva .LSEH_info_fromx_mont_384_prologue - -.rva .LSEH_body_fromx_mont_384 -.rva .LSEH_epilogue_fromx_mont_384 -.rva .LSEH_info_fromx_mont_384_body - -.rva .LSEH_epilogue_fromx_mont_384 -.rva .LSEH_end_fromx_mont_384 -.rva .LSEH_info_fromx_mont_384_epilogue - -.rva .LSEH_begin_sgn0x_pty_mont_384 -.rva .LSEH_body_sgn0x_pty_mont_384 -.rva .LSEH_info_sgn0x_pty_mont_384_prologue - -.rva .LSEH_body_sgn0x_pty_mont_384 -.rva .LSEH_epilogue_sgn0x_pty_mont_384 -.rva .LSEH_info_sgn0x_pty_mont_384_body - -.rva .LSEH_epilogue_sgn0x_pty_mont_384 -.rva .LSEH_end_sgn0x_pty_mont_384 -.rva .LSEH_info_sgn0x_pty_mont_384_epilogue - -.rva .LSEH_begin_sgn0x_pty_mont_384x -.rva .LSEH_body_sgn0x_pty_mont_384x -.rva .LSEH_info_sgn0x_pty_mont_384x_prologue - -.rva .LSEH_body_sgn0x_pty_mont_384x -.rva .LSEH_epilogue_sgn0x_pty_mont_384x -.rva .LSEH_info_sgn0x_pty_mont_384x_body - -.rva .LSEH_epilogue_sgn0x_pty_mont_384x -.rva .LSEH_end_sgn0x_pty_mont_384x -.rva .LSEH_info_sgn0x_pty_mont_384x_epilogue - -.rva .LSEH_begin_mulx_mont_384 -.rva .LSEH_body_mulx_mont_384 -.rva .LSEH_info_mulx_mont_384_prologue - -.rva .LSEH_body_mulx_mont_384 -.rva .LSEH_epilogue_mulx_mont_384 -.rva .LSEH_info_mulx_mont_384_body - -.rva .LSEH_epilogue_mulx_mont_384 -.rva .LSEH_end_mulx_mont_384 -.rva .LSEH_info_mulx_mont_384_epilogue - -.rva .LSEH_begin_sqrx_mont_384 -.rva .LSEH_body_sqrx_mont_384 -.rva .LSEH_info_sqrx_mont_384_prologue - -.rva .LSEH_body_sqrx_mont_384 -.rva .LSEH_epilogue_sqrx_mont_384 -.rva .LSEH_info_sqrx_mont_384_body - -.rva .LSEH_epilogue_sqrx_mont_384 -.rva .LSEH_end_sqrx_mont_384 -.rva .LSEH_info_sqrx_mont_384_epilogue - -.rva .LSEH_begin_sqrx_n_mul_mont_384 -.rva 
.LSEH_body_sqrx_n_mul_mont_384 -.rva .LSEH_info_sqrx_n_mul_mont_384_prologue - -.rva .LSEH_body_sqrx_n_mul_mont_384 -.rva .LSEH_epilogue_sqrx_n_mul_mont_384 -.rva .LSEH_info_sqrx_n_mul_mont_384_body - -.rva .LSEH_epilogue_sqrx_n_mul_mont_384 -.rva .LSEH_end_sqrx_n_mul_mont_384 -.rva .LSEH_info_sqrx_n_mul_mont_384_epilogue - -.rva .LSEH_begin_sqrx_n_mul_mont_383 -.rva .LSEH_body_sqrx_n_mul_mont_383 -.rva .LSEH_info_sqrx_n_mul_mont_383_prologue - -.rva .LSEH_body_sqrx_n_mul_mont_383 -.rva .LSEH_epilogue_sqrx_n_mul_mont_383 -.rva .LSEH_info_sqrx_n_mul_mont_383_body - -.rva .LSEH_epilogue_sqrx_n_mul_mont_383 -.rva .LSEH_end_sqrx_n_mul_mont_383 -.rva .LSEH_info_sqrx_n_mul_mont_383_epilogue - -.rva .LSEH_begin_sqrx_mont_382x -.rva .LSEH_body_sqrx_mont_382x -.rva .LSEH_info_sqrx_mont_382x_prologue - -.rva .LSEH_body_sqrx_mont_382x -.rva .LSEH_epilogue_sqrx_mont_382x -.rva .LSEH_info_sqrx_mont_382x_body - -.rva .LSEH_epilogue_sqrx_mont_382x -.rva .LSEH_end_sqrx_mont_382x -.rva .LSEH_info_sqrx_mont_382x_epilogue - -.section .xdata -.p2align 3 -.LSEH_info_mulx_mont_384x_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_mulx_mont_384x_body: -.byte 1,0,18,0 -.byte 0x00,0xf4,0x29,0x00 -.byte 0x00,0xe4,0x2a,0x00 -.byte 0x00,0xd4,0x2b,0x00 -.byte 0x00,0xc4,0x2c,0x00 -.byte 0x00,0x34,0x2d,0x00 -.byte 0x00,0x54,0x2e,0x00 -.byte 0x00,0x74,0x30,0x00 -.byte 0x00,0x64,0x31,0x00 -.byte 0x00,0x01,0x2f,0x00 -.byte 0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_mulx_mont_384x_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_sqrx_mont_384x_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_sqrx_mont_384x_body: -.byte 1,0,18,0 -.byte 0x00,0xf4,0x11,0x00 -.byte 0x00,0xe4,0x12,0x00 -.byte 0x00,0xd4,0x13,0x00 -.byte 0x00,0xc4,0x14,0x00 -.byte 0x00,0x34,0x15,0x00 -.byte 0x00,0x54,0x16,0x00 -.byte 0x00,0x74,0x18,0x00 -.byte 0x00,0x64,0x19,0x00 -.byte 0x00,0x01,0x17,0x00 -.byte 0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_sqrx_mont_384x_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_mulx_382x_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_mulx_382x_body: -.byte 1,0,18,0 -.byte 0x00,0xf4,0x11,0x00 -.byte 0x00,0xe4,0x12,0x00 -.byte 0x00,0xd4,0x13,0x00 -.byte 0x00,0xc4,0x14,0x00 -.byte 0x00,0x34,0x15,0x00 -.byte 0x00,0x54,0x16,0x00 -.byte 0x00,0x74,0x18,0x00 -.byte 0x00,0x64,0x19,0x00 -.byte 0x00,0x01,0x17,0x00 -.byte 0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_mulx_382x_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_sqrx_382x_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_sqrx_382x_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_sqrx_382x_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_mulx_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 
-.byte 0,0 -.long 0,0 -.LSEH_info_mulx_384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x00,0x00 -.byte 0x00,0xe4,0x01,0x00 -.byte 0x00,0xd4,0x02,0x00 -.byte 0x00,0xc4,0x03,0x00 -.byte 0x00,0x34,0x04,0x00 -.byte 0x00,0x54,0x05,0x00 -.byte 0x00,0x74,0x07,0x00 -.byte 0x00,0x64,0x08,0x00 -.byte 0x00,0x52 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_mulx_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_sqrx_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_sqrx_384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_sqrx_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_redcx_mont_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_redcx_mont_384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_redcx_mont_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_fromx_mont_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_fromx_mont_384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_fromx_mont_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_sgn0x_pty_mont_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_sgn0x_pty_mont_384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_sgn0x_pty_mont_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_sgn0x_pty_mont_384x_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_sgn0x_pty_mont_384x_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x01,0x00 -.byte 0x00,0xe4,0x02,0x00 -.byte 0x00,0xd4,0x03,0x00 -.byte 0x00,0xc4,0x04,0x00 -.byte 0x00,0x34,0x05,0x00 -.byte 0x00,0x54,0x06,0x00 -.byte 0x00,0x74,0x08,0x00 -.byte 0x00,0x64,0x09,0x00 -.byte 0x00,0x62 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_sgn0x_pty_mont_384x_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - 
-.LSEH_info_mulx_mont_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_mulx_mont_384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x03,0x00 -.byte 0x00,0xe4,0x04,0x00 -.byte 0x00,0xd4,0x05,0x00 -.byte 0x00,0xc4,0x06,0x00 -.byte 0x00,0x34,0x07,0x00 -.byte 0x00,0x54,0x08,0x00 -.byte 0x00,0x74,0x0a,0x00 -.byte 0x00,0x64,0x0b,0x00 -.byte 0x00,0x82 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_mulx_mont_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_sqrx_mont_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_sqrx_mont_384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x03,0x00 -.byte 0x00,0xe4,0x04,0x00 -.byte 0x00,0xd4,0x05,0x00 -.byte 0x00,0xc4,0x06,0x00 -.byte 0x00,0x34,0x07,0x00 -.byte 0x00,0x54,0x08,0x00 -.byte 0x00,0x74,0x0a,0x00 -.byte 0x00,0x64,0x0b,0x00 -.byte 0x00,0x82 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_sqrx_mont_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_sqrx_n_mul_mont_384_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_sqrx_n_mul_mont_384_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x05,0x00 -.byte 0x00,0xe4,0x06,0x00 -.byte 0x00,0xd4,0x07,0x00 -.byte 0x00,0xc4,0x08,0x00 -.byte 0x00,0x34,0x09,0x00 -.byte 0x00,0x54,0x0a,0x00 -.byte 0x00,0x74,0x0c,0x00 -.byte 0x00,0x64,0x0d,0x00 -.byte 0x00,0xa2 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_sqrx_n_mul_mont_384_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_sqrx_n_mul_mont_383_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_sqrx_n_mul_mont_383_body: -.byte 1,0,17,0 -.byte 0x00,0xf4,0x05,0x00 -.byte 0x00,0xe4,0x06,0x00 -.byte 0x00,0xd4,0x07,0x00 -.byte 0x00,0xc4,0x08,0x00 -.byte 0x00,0x34,0x09,0x00 -.byte 0x00,0x54,0x0a,0x00 -.byte 0x00,0x74,0x0c,0x00 -.byte 0x00,0x64,0x0d,0x00 -.byte 0x00,0xa2 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_sqrx_n_mul_mont_383_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_sqrx_mont_382x_prologue: -.byte 1,0,5,0x0b -.byte 0,0x74,1,0 -.byte 0,0x64,2,0 -.byte 0,0xb3 -.byte 0,0 -.long 0,0 -.LSEH_info_sqrx_mont_382x_body: -.byte 1,0,18,0 -.byte 0x00,0xf4,0x11,0x00 -.byte 0x00,0xe4,0x12,0x00 -.byte 0x00,0xd4,0x13,0x00 -.byte 0x00,0xc4,0x14,0x00 -.byte 0x00,0x34,0x15,0x00 -.byte 0x00,0x54,0x16,0x00 -.byte 0x00,0x74,0x18,0x00 -.byte 0x00,0x64,0x19,0x00 -.byte 0x00,0x01,0x17,0x00 -.byte 0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_sqrx_mont_382x_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - diff --git a/crypto/blst_src/build/coff/sha256-armv8.S b/crypto/blst_src/build/coff/sha256-armv8.S deleted file mode 100644 index a4cd8090896..00000000000 --- a/crypto/blst_src/build/coff/sha256-armv8.S +++ /dev/null @@ -1,1093 +0,0 @@ -// -// Copyright Supranational LLC -// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
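All of the elided routines above compute in Montgomery form: a field element is stored as a·R mod p with R = 2^384, so reducing after a multiplication is a shift-and-add rather than a division. As a rough illustration of what mulx_mont_384 and fromx_mont_384 compute — not how the deleted assembly computes it, which is limb-by-limb with mulx/adcx/adox and a single conditional subtraction — here is a minimal, runnable Go sketch using math/big; the toy modulus, R and all names are illustrative only:

package main

import (
	"fmt"
	"math/big"
)

// montMul returns a*b*R^{-1} mod m — one Montgomery multiplication.
// R = 2^rBits; mInv = -m^{-1} mod R is precomputed. Illustrative sketch
// only: the deleted code works on 6x64-bit limbs, not math/big.
func montMul(a, b, m, mInv *big.Int, rBits uint) *big.Int {
	r := new(big.Int).Lsh(big.NewInt(1), rBits)
	t := new(big.Int).Mul(a, b)
	// u is chosen so that t + u*m is exactly divisible by R.
	u := new(big.Int).Mod(new(big.Int).Mul(t, mInv), r)
	t.Add(t, u.Mul(u, m))
	t.Rsh(t, rBits) // exact division by R
	// One conditional subtraction, like the cmovc epilogues in the asm.
	if t.Cmp(m) >= 0 {
		t.Sub(t, m)
	}
	return t
}

func main() {
	// Toy 32-bit odd modulus; the deleted routines use the 384-bit
	// BLS12-381 field modulus with R = 2^384.
	m := big.NewInt(4294967291)
	const rBits = 64
	r := new(big.Int).Lsh(big.NewInt(1), rBits)
	mInv := new(big.Int).ModInverse(m, r) // m^{-1} mod R (m is odd)
	mInv.Neg(mInv).Mod(mInv, r)           // n0 = -m^{-1} mod R

	a, b := big.NewInt(123456789), big.NewInt(987654321)
	aR := new(big.Int).Mod(new(big.Int).Mul(a, r), m) // enter Montgomery form
	bR := new(big.Int).Mod(new(big.Int).Mul(b, r), m)

	abR := montMul(aR, bR, m, mInv, rBits)            // a*b*R mod m (cf. mulx_mont_384)
	ab := montMul(abR, big.NewInt(1), m, mInv, rBits) // leave the form (cf. fromx_mont_384)

	want := new(big.Int).Mod(new(big.Int).Mul(a, b), m)
	fmt.Println(ab.Cmp(want) == 0) // true
}

The precomputed mInv plays the role of the n0 word the assembly keeps in a stack slot (the imulq 8(%rsp) inside __mulx_mont_384): besides the modulus itself, it is the only modulus-specific constant the reduction needs.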
diff --git a/crypto/blst_src/build/coff/sha256-armv8.S b/crypto/blst_src/build/coff/sha256-armv8.S
deleted file mode 100644
index a4cd8090896..00000000000
--- a/crypto/blst_src/build/coff/sha256-armv8.S
+++ /dev/null
@@ -1,1093 +0,0 @@
-//
-// Copyright Supranational LLC
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-//
-// ====================================================================
-// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
-// project.
-// ====================================================================
-//
-// sha256_block procedure for ARMv8.
-//
-// This module is stripped of scalar code paths, with rationale that all
-// known processors are NEON-capable.
-//
-// See original module at CRYPTOGAMS for further details.
-
-.comm __blst_platform_cap,4
-.text
[... most of the 1,093 deleted lines elided: the .LK256 round-constant table, blst_sha256_block_armv8 — the hardware path driving the ARMv8 crypto extensions (sha256h, sha256h2, sha256su0, sha256su1, emitted as raw .long opcodes) — and the entry of blst_sha256_block_data_order, which tests bit 0 of __blst_platform_cap and branches to the hardware path, otherwise falling through to a NEON implementation of the message schedule and compression rounds; the NEON loop body resumes below ...]
add v2.4s,v2.4s,v17.4s - orr w12,w12,w15 - eor w11,w11,w4,ror#19 - eor w15,w8,w8,ror#11 - add v4.4s,v4.4s,v2.4s - add w7,w7,w12 - ror w11,w11,#6 - eor w14,w8,w9 - eor w15,w15,w8,ror#20 - add w7,w7,w11 - ldr w12,[sp,#48] - and w13,w13,w14 - ror w15,w15,#2 - add w3,w3,w7 - eor w13,w13,w9 - st1 {v4.4s},[x17], #16 - ext v4.16b,v3.16b,v0.16b,#4 - add w6,w6,w12 - add w7,w7,w15 - and w12,w4,w3 - bic w15,w5,w3 - ext v7.16b,v1.16b,v2.16b,#4 - eor w11,w3,w3,ror#5 - add w7,w7,w13 - mov d19,v2.d[1] - orr w12,w12,w15 - eor w11,w11,w3,ror#19 - ushr v6.4s,v4.4s,#7 - eor w15,w7,w7,ror#11 - ushr v5.4s,v4.4s,#3 - add w6,w6,w12 - add v3.4s,v3.4s,v7.4s - ror w11,w11,#6 - sli v6.4s,v4.4s,#25 - eor w13,w7,w8 - eor w15,w15,w7,ror#20 - ushr v7.4s,v4.4s,#18 - add w6,w6,w11 - ldr w12,[sp,#52] - and w14,w14,w13 - eor v5.16b,v5.16b,v6.16b - ror w15,w15,#2 - add w10,w10,w6 - sli v7.4s,v4.4s,#14 - eor w14,w14,w8 - ushr v16.4s,v19.4s,#17 - add w5,w5,w12 - add w6,w6,w15 - and w12,w3,w10 - eor v5.16b,v5.16b,v7.16b - bic w15,w4,w10 - eor w11,w10,w10,ror#5 - sli v16.4s,v19.4s,#15 - add w6,w6,w14 - orr w12,w12,w15 - ushr v17.4s,v19.4s,#10 - eor w11,w11,w10,ror#19 - eor w15,w6,w6,ror#11 - ushr v7.4s,v19.4s,#19 - add w5,w5,w12 - ror w11,w11,#6 - add v3.4s,v3.4s,v5.4s - eor w14,w6,w7 - eor w15,w15,w6,ror#20 - sli v7.4s,v19.4s,#13 - add w5,w5,w11 - ldr w12,[sp,#56] - and w13,w13,w14 - eor v17.16b,v17.16b,v16.16b - ror w15,w15,#2 - add w9,w9,w5 - eor w13,w13,w7 - eor v17.16b,v17.16b,v7.16b - add w4,w4,w12 - add w5,w5,w15 - and w12,w10,w9 - add v3.4s,v3.4s,v17.4s - bic w15,w3,w9 - eor w11,w9,w9,ror#5 - add w5,w5,w13 - ushr v18.4s,v3.4s,#17 - orr w12,w12,w15 - ushr v19.4s,v3.4s,#10 - eor w11,w11,w9,ror#19 - eor w15,w5,w5,ror#11 - sli v18.4s,v3.4s,#15 - add w4,w4,w12 - ushr v17.4s,v3.4s,#19 - ror w11,w11,#6 - eor w13,w5,w6 - eor v19.16b,v19.16b,v18.16b - eor w15,w15,w5,ror#20 - add w4,w4,w11 - sli v17.4s,v3.4s,#13 - ldr w12,[sp,#60] - and w14,w14,w13 - ror w15,w15,#2 - ld1 {v4.4s},[x16], #16 - add w8,w8,w4 - eor v19.16b,v19.16b,v17.16b - eor w14,w14,w6 - eor v17.16b,v17.16b,v17.16b - add w3,w3,w12 - add w4,w4,w15 - and w12,w9,w8 - mov v17.d[1],v19.d[0] - bic w15,w10,w8 - eor w11,w8,w8,ror#5 - add w4,w4,w14 - add v3.4s,v3.4s,v17.4s - orr w12,w12,w15 - eor w11,w11,w8,ror#19 - eor w15,w4,w4,ror#11 - add v4.4s,v4.4s,v3.4s - add w3,w3,w12 - ror w11,w11,#6 - eor w14,w4,w5 - eor w15,w15,w4,ror#20 - add w3,w3,w11 - ldr w12,[x16] - and w13,w13,w14 - ror w15,w15,#2 - add w7,w7,w3 - eor w13,w13,w5 - st1 {v4.4s},[x17], #16 - cmp w12,#0 // check for K256 terminator - ldr w12,[sp,#0] - sub x17,x17,#64 - bne .L_00_48 - - sub x16,x16,#256 // rewind x16 - cmp x1,x2 - mov x17, #64 - csel x17, x17, xzr, eq - sub x1,x1,x17 // avoid SEGV - mov x17,sp - add w10,w10,w12 - add w3,w3,w15 - and w12,w8,w7 - ld1 {v0.16b},[x1],#16 - bic w15,w9,w7 - eor w11,w7,w7,ror#5 - ld1 {v4.4s},[x16],#16 - add w3,w3,w13 - orr w12,w12,w15 - eor w11,w11,w7,ror#19 - eor w15,w3,w3,ror#11 - rev32 v0.16b,v0.16b - add w10,w10,w12 - ror w11,w11,#6 - eor w13,w3,w4 - eor w15,w15,w3,ror#20 - add v4.4s,v4.4s,v0.4s - add w10,w10,w11 - ldr w12,[sp,#4] - and w14,w14,w13 - ror w15,w15,#2 - add w6,w6,w10 - eor w14,w14,w4 - add w9,w9,w12 - add w10,w10,w15 - and w12,w7,w6 - bic w15,w8,w6 - eor w11,w6,w6,ror#5 - add w10,w10,w14 - orr w12,w12,w15 - eor w11,w11,w6,ror#19 - eor w15,w10,w10,ror#11 - add w9,w9,w12 - ror w11,w11,#6 - eor w14,w10,w3 - eor w15,w15,w10,ror#20 - add w9,w9,w11 - ldr w12,[sp,#8] - and w13,w13,w14 - ror w15,w15,#2 - add w5,w5,w9 - eor w13,w13,w3 - add w8,w8,w12 - add 
w9,w9,w15 - and w12,w6,w5 - bic w15,w7,w5 - eor w11,w5,w5,ror#5 - add w9,w9,w13 - orr w12,w12,w15 - eor w11,w11,w5,ror#19 - eor w15,w9,w9,ror#11 - add w8,w8,w12 - ror w11,w11,#6 - eor w13,w9,w10 - eor w15,w15,w9,ror#20 - add w8,w8,w11 - ldr w12,[sp,#12] - and w14,w14,w13 - ror w15,w15,#2 - add w4,w4,w8 - eor w14,w14,w10 - add w7,w7,w12 - add w8,w8,w15 - and w12,w5,w4 - bic w15,w6,w4 - eor w11,w4,w4,ror#5 - add w8,w8,w14 - orr w12,w12,w15 - eor w11,w11,w4,ror#19 - eor w15,w8,w8,ror#11 - add w7,w7,w12 - ror w11,w11,#6 - eor w14,w8,w9 - eor w15,w15,w8,ror#20 - add w7,w7,w11 - ldr w12,[sp,#16] - and w13,w13,w14 - ror w15,w15,#2 - add w3,w3,w7 - eor w13,w13,w9 - st1 {v4.4s},[x17], #16 - add w6,w6,w12 - add w7,w7,w15 - and w12,w4,w3 - ld1 {v1.16b},[x1],#16 - bic w15,w5,w3 - eor w11,w3,w3,ror#5 - ld1 {v4.4s},[x16],#16 - add w7,w7,w13 - orr w12,w12,w15 - eor w11,w11,w3,ror#19 - eor w15,w7,w7,ror#11 - rev32 v1.16b,v1.16b - add w6,w6,w12 - ror w11,w11,#6 - eor w13,w7,w8 - eor w15,w15,w7,ror#20 - add v4.4s,v4.4s,v1.4s - add w6,w6,w11 - ldr w12,[sp,#20] - and w14,w14,w13 - ror w15,w15,#2 - add w10,w10,w6 - eor w14,w14,w8 - add w5,w5,w12 - add w6,w6,w15 - and w12,w3,w10 - bic w15,w4,w10 - eor w11,w10,w10,ror#5 - add w6,w6,w14 - orr w12,w12,w15 - eor w11,w11,w10,ror#19 - eor w15,w6,w6,ror#11 - add w5,w5,w12 - ror w11,w11,#6 - eor w14,w6,w7 - eor w15,w15,w6,ror#20 - add w5,w5,w11 - ldr w12,[sp,#24] - and w13,w13,w14 - ror w15,w15,#2 - add w9,w9,w5 - eor w13,w13,w7 - add w4,w4,w12 - add w5,w5,w15 - and w12,w10,w9 - bic w15,w3,w9 - eor w11,w9,w9,ror#5 - add w5,w5,w13 - orr w12,w12,w15 - eor w11,w11,w9,ror#19 - eor w15,w5,w5,ror#11 - add w4,w4,w12 - ror w11,w11,#6 - eor w13,w5,w6 - eor w15,w15,w5,ror#20 - add w4,w4,w11 - ldr w12,[sp,#28] - and w14,w14,w13 - ror w15,w15,#2 - add w8,w8,w4 - eor w14,w14,w6 - add w3,w3,w12 - add w4,w4,w15 - and w12,w9,w8 - bic w15,w10,w8 - eor w11,w8,w8,ror#5 - add w4,w4,w14 - orr w12,w12,w15 - eor w11,w11,w8,ror#19 - eor w15,w4,w4,ror#11 - add w3,w3,w12 - ror w11,w11,#6 - eor w14,w4,w5 - eor w15,w15,w4,ror#20 - add w3,w3,w11 - ldr w12,[sp,#32] - and w13,w13,w14 - ror w15,w15,#2 - add w7,w7,w3 - eor w13,w13,w5 - st1 {v4.4s},[x17], #16 - add w10,w10,w12 - add w3,w3,w15 - and w12,w8,w7 - ld1 {v2.16b},[x1],#16 - bic w15,w9,w7 - eor w11,w7,w7,ror#5 - ld1 {v4.4s},[x16],#16 - add w3,w3,w13 - orr w12,w12,w15 - eor w11,w11,w7,ror#19 - eor w15,w3,w3,ror#11 - rev32 v2.16b,v2.16b - add w10,w10,w12 - ror w11,w11,#6 - eor w13,w3,w4 - eor w15,w15,w3,ror#20 - add v4.4s,v4.4s,v2.4s - add w10,w10,w11 - ldr w12,[sp,#36] - and w14,w14,w13 - ror w15,w15,#2 - add w6,w6,w10 - eor w14,w14,w4 - add w9,w9,w12 - add w10,w10,w15 - and w12,w7,w6 - bic w15,w8,w6 - eor w11,w6,w6,ror#5 - add w10,w10,w14 - orr w12,w12,w15 - eor w11,w11,w6,ror#19 - eor w15,w10,w10,ror#11 - add w9,w9,w12 - ror w11,w11,#6 - eor w14,w10,w3 - eor w15,w15,w10,ror#20 - add w9,w9,w11 - ldr w12,[sp,#40] - and w13,w13,w14 - ror w15,w15,#2 - add w5,w5,w9 - eor w13,w13,w3 - add w8,w8,w12 - add w9,w9,w15 - and w12,w6,w5 - bic w15,w7,w5 - eor w11,w5,w5,ror#5 - add w9,w9,w13 - orr w12,w12,w15 - eor w11,w11,w5,ror#19 - eor w15,w9,w9,ror#11 - add w8,w8,w12 - ror w11,w11,#6 - eor w13,w9,w10 - eor w15,w15,w9,ror#20 - add w8,w8,w11 - ldr w12,[sp,#44] - and w14,w14,w13 - ror w15,w15,#2 - add w4,w4,w8 - eor w14,w14,w10 - add w7,w7,w12 - add w8,w8,w15 - and w12,w5,w4 - bic w15,w6,w4 - eor w11,w4,w4,ror#5 - add w8,w8,w14 - orr w12,w12,w15 - eor w11,w11,w4,ror#19 - eor w15,w8,w8,ror#11 - add w7,w7,w12 - ror w11,w11,#6 - eor w14,w8,w9 - eor 
w15,w15,w8,ror#20 - add w7,w7,w11 - ldr w12,[sp,#48] - and w13,w13,w14 - ror w15,w15,#2 - add w3,w3,w7 - eor w13,w13,w9 - st1 {v4.4s},[x17], #16 - add w6,w6,w12 - add w7,w7,w15 - and w12,w4,w3 - ld1 {v3.16b},[x1],#16 - bic w15,w5,w3 - eor w11,w3,w3,ror#5 - ld1 {v4.4s},[x16],#16 - add w7,w7,w13 - orr w12,w12,w15 - eor w11,w11,w3,ror#19 - eor w15,w7,w7,ror#11 - rev32 v3.16b,v3.16b - add w6,w6,w12 - ror w11,w11,#6 - eor w13,w7,w8 - eor w15,w15,w7,ror#20 - add v4.4s,v4.4s,v3.4s - add w6,w6,w11 - ldr w12,[sp,#52] - and w14,w14,w13 - ror w15,w15,#2 - add w10,w10,w6 - eor w14,w14,w8 - add w5,w5,w12 - add w6,w6,w15 - and w12,w3,w10 - bic w15,w4,w10 - eor w11,w10,w10,ror#5 - add w6,w6,w14 - orr w12,w12,w15 - eor w11,w11,w10,ror#19 - eor w15,w6,w6,ror#11 - add w5,w5,w12 - ror w11,w11,#6 - eor w14,w6,w7 - eor w15,w15,w6,ror#20 - add w5,w5,w11 - ldr w12,[sp,#56] - and w13,w13,w14 - ror w15,w15,#2 - add w9,w9,w5 - eor w13,w13,w7 - add w4,w4,w12 - add w5,w5,w15 - and w12,w10,w9 - bic w15,w3,w9 - eor w11,w9,w9,ror#5 - add w5,w5,w13 - orr w12,w12,w15 - eor w11,w11,w9,ror#19 - eor w15,w5,w5,ror#11 - add w4,w4,w12 - ror w11,w11,#6 - eor w13,w5,w6 - eor w15,w15,w5,ror#20 - add w4,w4,w11 - ldr w12,[sp,#60] - and w14,w14,w13 - ror w15,w15,#2 - add w8,w8,w4 - eor w14,w14,w6 - add w3,w3,w12 - add w4,w4,w15 - and w12,w9,w8 - bic w15,w10,w8 - eor w11,w8,w8,ror#5 - add w4,w4,w14 - orr w12,w12,w15 - eor w11,w11,w8,ror#19 - eor w15,w4,w4,ror#11 - add w3,w3,w12 - ror w11,w11,#6 - eor w14,w4,w5 - eor w15,w15,w4,ror#20 - add w3,w3,w11 - and w13,w13,w14 - ror w15,w15,#2 - add w7,w7,w3 - eor w13,w13,w5 - st1 {v4.4s},[x17], #16 - add w3,w3,w15 // h+=Sigma0(a) from the past - ldp w11,w12,[x0,#0] - add w3,w3,w13 // h+=Maj(a,b,c) from the past - ldp w13,w14,[x0,#8] - add w3,w3,w11 // accumulate - add w4,w4,w12 - ldp w11,w12,[x0,#16] - add w5,w5,w13 - add w6,w6,w14 - ldp w13,w14,[x0,#24] - add w7,w7,w11 - add w8,w8,w12 - ldr w12,[sp,#0] - stp w3,w4,[x0,#0] - add w9,w9,w13 - mov w13,wzr - stp w5,w6,[x0,#8] - add w10,w10,w14 - stp w7,w8,[x0,#16] - eor w14,w4,w5 - stp w9,w10,[x0,#24] - mov w15,wzr - mov x17,sp - b.ne .L_00_48 - - ldr x29,[x29] - add sp,sp,#16*4+16 - ret - -.globl blst_sha256_emit - -.def blst_sha256_emit; -.type 32; -.endef -.p2align 4 -blst_sha256_emit: - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] -#ifndef __AARCH64EB__ - rev x4,x4 - rev x5,x5 - rev x6,x6 - rev x7,x7 -#endif - str w4,[x0,#4] - lsr x4,x4,#32 - str w5,[x0,#12] - lsr x5,x5,#32 - str w6,[x0,#20] - lsr x6,x6,#32 - str w7,[x0,#28] - lsr x7,x7,#32 - str w4,[x0,#0] - str w5,[x0,#8] - str w6,[x0,#16] - str w7,[x0,#24] - ret - - -.globl blst_sha256_bcopy - -.def blst_sha256_bcopy; -.type 32; -.endef -.p2align 4 -blst_sha256_bcopy: -.Loop_bcopy: - ldrb w3,[x1],#1 - sub x2,x2,#1 - strb w3,[x0],#1 - cbnz x2,.Loop_bcopy - ret - - -.globl blst_sha256_hcopy - -.def blst_sha256_hcopy; -.type 32; -.endef -.p2align 4 -blst_sha256_hcopy: - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - stp x4,x5,[x0] - stp x6,x7,[x0,#16] - ret - diff --git a/crypto/blst_src/build/coff/sha256-portable-x86_64.s b/crypto/blst_src/build/coff/sha256-portable-x86_64.s deleted file mode 100644 index 603e46c53d7..00000000000 --- a/crypto/blst_src/build/coff/sha256-portable-x86_64.s +++ /dev/null @@ -1,1792 +0,0 @@ -.comm __blst_platform_cap,4 -.text - -.globl blst_sha256_block_data_order -.def blst_sha256_block_data_order; .scl 2; .type 32; .endef -.p2align 4 -blst_sha256_block_data_order: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 
-.LSEH_begin_blst_sha256_block_data_order: - - - pushq %rbp - - movq %rsp,%rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx -#ifdef __BLST_PORTABLE__ - testl $2,__blst_platform_cap(%rip) - jnz .Lblst_sha256_block_data_order$2 -#endif - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - shlq $4,%rdx - subq $64+24,%rsp - - -.LSEH_body_blst_sha256_block_data_order: - - leaq (%rsi,%rdx,4),%rdx - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - jmp .Lloop - -.p2align 4 -.Lloop: - movl %ebx,%edi - leaq K256(%rip),%rbp - xorl %ecx,%edi - movl 0(%rsi),%r12d - movl %r8d,%r13d - movl %eax,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,0(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl 0(%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - addl %r14d,%r11d - movl 4(%rsi),%r12d - movl %edx,%r13d - movl %r11d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,4(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl 4(%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - addl %r14d,%r10d - movl 8(%rsi),%r12d - movl %ecx,%r13d - movl %r10d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,8(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl 8(%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - addl %r14d,%r9d - movl 12(%rsi),%r12d - movl %ebx,%r13d - movl %r9d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,12(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl 12(%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - addl %r14d,%r8d - movl 16(%rsi),%r12d - movl %eax,%r13d - movl %r8d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,16(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl 16(%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - 
xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - addl %r14d,%edx - movl 20(%rsi),%r12d - movl %r11d,%r13d - movl %edx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,20(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl 20(%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - addl %r14d,%ecx - movl 24(%rsi),%r12d - movl %r10d,%r13d - movl %ecx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,24(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl 24(%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - addl %r14d,%ebx - movl 28(%rsi),%r12d - movl %r9d,%r13d - movl %ebx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,28(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl 28(%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - addl %r14d,%eax - movl 32(%rsi),%r12d - movl %r8d,%r13d - movl %eax,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,32(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl 32(%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - addl %r14d,%r11d - movl 36(%rsi),%r12d - movl %edx,%r13d - movl %r11d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,36(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl 36(%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - addl %r14d,%r10d - movl 40(%rsi),%r12d - movl %ecx,%r13d - movl %r10d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,40(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl 40(%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - addl %r14d,%r9d - movl 44(%rsi),%r12d - movl 
%ebx,%r13d - movl %r9d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,44(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl 44(%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - addl %r14d,%r8d - movl 48(%rsi),%r12d - movl %eax,%r13d - movl %r8d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,48(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl 48(%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - addl %r14d,%edx - movl 52(%rsi),%r12d - movl %r11d,%r13d - movl %edx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,52(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl 52(%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - addl %r14d,%ecx - movl 56(%rsi),%r12d - movl %r10d,%r13d - movl %ecx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,56(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl 56(%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - addl %r14d,%ebx - movl 60(%rsi),%r12d - movl %r9d,%r13d - movl %ebx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,60(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl 60(%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - jmp .Lrounds_16_xx -.p2align 4 -.Lrounds_16_xx: - movl 4(%rsp),%r13d - movl 56(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%eax - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 36(%rsp),%r12d - - addl 0(%rsp),%r12d - movl %r8d,%r13d - addl %r15d,%r12d - movl %eax,%r14d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,0(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl 64(%rbp),%r12d - 
xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - movl 8(%rsp),%r13d - movl 60(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r11d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 40(%rsp),%r12d - - addl 4(%rsp),%r12d - movl %edx,%r13d - addl %edi,%r12d - movl %r11d,%r14d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,4(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl 68(%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - movl 12(%rsp),%r13d - movl 0(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r10d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 44(%rsp),%r12d - - addl 8(%rsp),%r12d - movl %ecx,%r13d - addl %r15d,%r12d - movl %r10d,%r14d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,8(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl 72(%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - movl 16(%rsp),%r13d - movl 4(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r9d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 48(%rsp),%r12d - - addl 12(%rsp),%r12d - movl %ebx,%r13d - addl %edi,%r12d - movl %r9d,%r14d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,12(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl 76(%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - movl 20(%rsp),%r13d - movl 8(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r8d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 52(%rsp),%r12d - - addl 16(%rsp),%r12d - movl %eax,%r13d - addl %r15d,%r12d - movl %r8d,%r14d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,16(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl 80(%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl 
%r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - movl 24(%rsp),%r13d - movl 12(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%edx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 56(%rsp),%r12d - - addl 20(%rsp),%r12d - movl %r11d,%r13d - addl %edi,%r12d - movl %edx,%r14d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,20(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl 84(%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - movl 28(%rsp),%r13d - movl 16(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ecx - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 60(%rsp),%r12d - - addl 24(%rsp),%r12d - movl %r10d,%r13d - addl %r15d,%r12d - movl %ecx,%r14d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,24(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl 88(%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - movl 32(%rsp),%r13d - movl 20(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ebx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 0(%rsp),%r12d - - addl 28(%rsp),%r12d - movl %r9d,%r13d - addl %edi,%r12d - movl %ebx,%r14d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,28(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl 92(%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - movl 36(%rsp),%r13d - movl 24(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%eax - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 4(%rsp),%r12d - - addl 32(%rsp),%r12d - movl %r8d,%r13d - addl %r15d,%r12d - movl %eax,%r14d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,32(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl 96(%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - movl 40(%rsp),%r13d - movl 
28(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r11d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 8(%rsp),%r12d - - addl 36(%rsp),%r12d - movl %edx,%r13d - addl %edi,%r12d - movl %r11d,%r14d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,36(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl 100(%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - movl 44(%rsp),%r13d - movl 32(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r10d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 12(%rsp),%r12d - - addl 40(%rsp),%r12d - movl %ecx,%r13d - addl %r15d,%r12d - movl %r10d,%r14d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,40(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl 104(%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - movl 48(%rsp),%r13d - movl 36(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r9d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 16(%rsp),%r12d - - addl 44(%rsp),%r12d - movl %ebx,%r13d - addl %edi,%r12d - movl %r9d,%r14d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,44(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl 108(%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - movl 52(%rsp),%r13d - movl 40(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r8d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 20(%rsp),%r12d - - addl 48(%rsp),%r12d - movl %eax,%r13d - addl %r15d,%r12d - movl %r8d,%r14d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,48(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl 112(%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - movl 56(%rsp),%r13d - movl 44(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%edx - movl %edi,%r14d - rorl $2,%edi 
- - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 24(%rsp),%r12d - - addl 52(%rsp),%r12d - movl %r11d,%r13d - addl %edi,%r12d - movl %edx,%r14d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,52(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl 116(%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - movl 60(%rsp),%r13d - movl 48(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ecx - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 28(%rsp),%r12d - - addl 56(%rsp),%r12d - movl %r10d,%r13d - addl %r15d,%r12d - movl %ecx,%r14d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,56(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl 120(%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - movl 0(%rsp),%r13d - movl 52(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ebx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 32(%rsp),%r12d - - addl 60(%rsp),%r12d - movl %r9d,%r13d - addl %edi,%r12d - movl %ebx,%r14d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,60(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl 124(%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - leaq 64(%rbp),%rbp - cmpb $0x19,3(%rbp) - jnz .Lrounds_16_xx - - movq 64+0(%rsp),%rdi - addl %r14d,%eax - leaq 64(%rsi),%rsi - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb .Lloop - - leaq 64+24+48(%rsp),%r11 - - movq 64+24(%rsp),%r15 - movq -40(%r11),%r14 - movq -32(%r11),%r13 - movq -24(%r11),%r12 - movq -16(%r11),%rbx - movq -8(%r11),%rbp -.LSEH_epilogue_blst_sha256_block_data_order: - mov 8(%r11),%rdi - mov 16(%r11),%rsi - - leaq (%r11),%rsp - .byte 0xf3,0xc3 - -.LSEH_end_blst_sha256_block_data_order: - -#ifndef __BLST_PORTABLE__ -.p2align 6 - -K256: -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 
0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 -.globl blst_sha256_emit - -.def blst_sha256_emit; .scl 2; .type 32; .endef -.p2align 4 -blst_sha256_emit: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rdx),%r8 - movq 8(%rdx),%r9 - movq 16(%rdx),%r10 - bswapq %r8 - movq 24(%rdx),%r11 - bswapq %r9 - movl %r8d,4(%rcx) - bswapq %r10 - movl %r9d,12(%rcx) - bswapq %r11 - movl %r10d,20(%rcx) - shrq $32,%r8 - movl %r11d,28(%rcx) - shrq $32,%r9 - movl %r8d,0(%rcx) - shrq $32,%r10 - movl %r9d,8(%rcx) - shrq $32,%r11 - movl %r10d,16(%rcx) - movl %r11d,24(%rcx) - .byte 0xf3,0xc3 - - -.globl blst_sha256_bcopy - -.def blst_sha256_bcopy; .scl 2; .type 32; .endef -.p2align 4 -blst_sha256_bcopy: - .byte 0xf3,0x0f,0x1e,0xfa - - subq %rdx,%rcx -.Loop_bcopy: - movzbl (%rdx),%eax - leaq 1(%rdx),%rdx - movb %al,-1(%rcx,%rdx,1) - decq %r8 - jnz .Loop_bcopy - .byte 0xf3,0xc3 - - -.globl blst_sha256_hcopy - -.def blst_sha256_hcopy; .scl 2; .type 32; .endef -.p2align 4 -blst_sha256_hcopy: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rdx),%r8 - movq 8(%rdx),%r9 - movq 16(%rdx),%r10 - movq 24(%rdx),%r11 - movq %r8,0(%rcx) - movq %r9,8(%rcx) - movq %r10,16(%rcx) - movq %r11,24(%rcx) - .byte 0xf3,0xc3 - -#endif -.section .pdata -.p2align 2 -.rva .LSEH_begin_blst_sha256_block_data_order -.rva .LSEH_body_blst_sha256_block_data_order -.rva .LSEH_info_blst_sha256_block_data_order_prologue - -.rva .LSEH_body_blst_sha256_block_data_order -.rva .LSEH_epilogue_blst_sha256_block_data_order -.rva .LSEH_info_blst_sha256_block_data_order_body - -.rva .LSEH_epilogue_blst_sha256_block_data_order -.rva .LSEH_end_blst_sha256_block_data_order -.rva .LSEH_info_blst_sha256_block_data_order_epilogue - -.section .xdata -.p2align 3 -.LSEH_info_blst_sha256_block_data_order_prologue: -.byte 1,4,6,0x05 -.byte 4,0x74,2,0 -.byte 4,0x64,3,0 -.byte 4,0x53 -.byte 1,0x50 -.long 0,0 -.LSEH_info_blst_sha256_block_data_order_body: -.byte 1,0,18,0 -.byte 0x00,0xf4,0x0b,0x00 -.byte 0x00,0xe4,0x0c,0x00 -.byte 0x00,0xd4,0x0d,0x00 -.byte 0x00,0xc4,0x0e,0x00 -.byte 0x00,0x34,0x0f,0x00 -.byte 0x00,0x54,0x10,0x00 -.byte 0x00,0x74,0x12,0x00 -.byte 0x00,0x64,0x13,0x00 -.byte 0x00,0x01,0x11,0x00 -.byte 0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_blst_sha256_block_data_order_epilogue: -.byte 1,0,5,11 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0xb3 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 - diff --git a/crypto/blst_src/build/coff/sha256-x86_64.s b/crypto/blst_src/build/coff/sha256-x86_64.s deleted file mode 100644 index d65df5d0d4d..00000000000 --- a/crypto/blst_src/build/coff/sha256-x86_64.s +++ /dev/null @@ -1,1562 +0,0 @@ -.comm __blst_platform_cap,4 -.text - -.p2align 6 - -K256: -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 
0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff -.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 -.globl blst_sha256_block_data_order_shaext - -.def blst_sha256_block_data_order_shaext; .scl 2; .type 32; .endef -.p2align 6 -blst_sha256_block_data_order_shaext: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_blst_sha256_block_data_order_shaext: - - - pushq %rbp - - movq %rsp,%rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx -.Lblst_sha256_block_data_order$2: - subq $0x50,%rsp - - movaps %xmm6,-80(%rbp) - movaps %xmm7,-64(%rbp) - movaps %xmm8,-48(%rbp) - movaps %xmm9,-32(%rbp) - movaps %xmm10,-16(%rbp) - -.LSEH_body_blst_sha256_block_data_order_shaext: - - leaq K256+128(%rip),%rcx - movdqu (%rdi),%xmm1 - movdqu 16(%rdi),%xmm2 - movdqa 256-128(%rcx),%xmm7 - - pshufd $0x1b,%xmm1,%xmm0 - pshufd $0xb1,%xmm1,%xmm1 - pshufd $0x1b,%xmm2,%xmm2 - movdqa %xmm7,%xmm8 -.byte 102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - jmp .Loop_shaext - -.p2align 4 -.Loop_shaext: - movdqu (%rsi),%xmm3 - movdqu 16(%rsi),%xmm4 - movdqu 32(%rsi),%xmm5 -.byte 102,15,56,0,223 - movdqu 48(%rsi),%xmm6 - - movdqa 0-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 102,15,56,0,231 - movdqa %xmm2,%xmm10 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - nop - movdqa %xmm1,%xmm9 -.byte 15,56,203,202 - - movdqa 16-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 102,15,56,0,239 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - leaq 64(%rsi),%rsi -.byte 15,56,204,220 -.byte 15,56,203,202 - - movdqa 32-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 102,15,56,0,247 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - - movdqa 48-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 64-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 80-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 96-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 
15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 112-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 128-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 144-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 160-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 176-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 192-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 208-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 -.byte 15,56,203,202 - paddd %xmm7,%xmm6 - - movdqa 224-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 -.byte 15,56,205,245 - movdqa %xmm8,%xmm7 -.byte 15,56,203,202 - - movdqa 240-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 - nop -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - decq %rdx - nop -.byte 15,56,203,202 - - paddd %xmm10,%xmm2 - paddd %xmm9,%xmm1 - jnz .Loop_shaext - - pshufd $0xb1,%xmm2,%xmm2 - pshufd $0x1b,%xmm1,%xmm7 - pshufd $0xb1,%xmm1,%xmm1 - punpckhqdq %xmm2,%xmm1 -.byte 102,15,58,15,215,8 - - movdqu %xmm1,(%rdi) - movdqu %xmm2,16(%rdi) - movaps -80(%rbp),%xmm6 - movaps -64(%rbp),%xmm7 - movaps -48(%rbp),%xmm8 - movaps -32(%rbp),%xmm9 - movaps -16(%rbp),%xmm10 - movq %rbp,%rsp - - popq %rbp - -.LSEH_epilogue_blst_sha256_block_data_order_shaext: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_blst_sha256_block_data_order_shaext: -.globl blst_sha256_block_data_order - -.def blst_sha256_block_data_order; .scl 2; .type 32; .endef -.p2align 6 -blst_sha256_block_data_order: - .byte 0xf3,0x0f,0x1e,0xfa - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - movq %rsp,%r11 -.LSEH_begin_blst_sha256_block_data_order: - - - pushq %rbp - - movq %rsp,%rbp - - movq %rcx,%rdi - movq %rdx,%rsi - movq %r8,%rdx - testl $2,__blst_platform_cap(%rip) - jnz .Lblst_sha256_block_data_order$2 - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - shlq $4,%rdx - subq $88,%rsp - - leaq (%rsi,%rdx,4),%rdx - movq %rdi,-64(%rbp) - - movq %rdx,-48(%rbp) - movaps %xmm6,-128(%rbp) - movaps %xmm7,-112(%rbp) - movaps %xmm8,-96(%rbp) - movaps %xmm9,-80(%rbp) - -.LSEH_body_blst_sha256_block_data_order: - - - leaq -64(%rsp),%rsp - movl 0(%rdi),%eax - andq $-64,%rsp - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 
20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - - - jmp .Lloop_ssse3 -.p2align 4 -.Lloop_ssse3: - movdqa K256+256(%rip),%xmm7 - movq %rsi,-56(%rbp) - movdqu 0(%rsi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 -.byte 102,15,56,0,199 - movdqu 48(%rsi),%xmm3 - leaq K256(%rip),%rsi -.byte 102,15,56,0,207 - movdqa 0(%rsi),%xmm4 - movdqa 16(%rsi),%xmm5 -.byte 102,15,56,0,215 - paddd %xmm0,%xmm4 - movdqa 32(%rsi),%xmm6 -.byte 102,15,56,0,223 - movdqa 48(%rsi),%xmm7 - paddd %xmm1,%xmm5 - paddd %xmm2,%xmm6 - paddd %xmm3,%xmm7 - movdqa %xmm4,0(%rsp) - movl %eax,%r14d - movdqa %xmm5,16(%rsp) - movl %ebx,%edi - movdqa %xmm6,32(%rsp) - xorl %ecx,%edi - movdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp .Lssse3_00_47 - -.p2align 4 -.Lssse3_00_47: - subq $-64,%rsi - rorl $14,%r13d - movdqa %xmm1,%xmm4 - movl %r14d,%eax - movl %r9d,%r12d - movdqa %xmm3,%xmm7 - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d -.byte 102,15,58,15,224,4 - andl %r8d,%r12d - xorl %r8d,%r13d -.byte 102,15,58,15,250,4 - addl 0(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %ebx,%r15d - addl %r12d,%r11d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - paddd %xmm7,%xmm0 - rorl $2,%r14d - addl %r11d,%edx - psrld $7,%xmm6 - addl %edi,%r11d - movl %edx,%r13d - pshufd $250,%xmm3,%xmm7 - addl %r11d,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%r11d - movl %r8d,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %r11d,%r14d - pxor %xmm5,%xmm4 - andl %edx,%r12d - xorl %edx,%r13d - pslld $11,%xmm5 - addl 4(%rsp),%r10d - movl %r11d,%edi - pxor %xmm6,%xmm4 - xorl %r9d,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %eax,%edi - addl %r12d,%r10d - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - psrld $10,%xmm7 - addl %r13d,%r10d - xorl %eax,%r15d - paddd %xmm4,%xmm0 - rorl $2,%r14d - addl %r10d,%ecx - psrlq $17,%xmm6 - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %ecx,%r13d - xorl %r8d,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - psrldq $8,%xmm7 - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - paddd %xmm7,%xmm0 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - pshufd $80,%xmm0,%xmm7 - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - movdqa %xmm7,%xmm6 - addl %edi,%r9d - movl %ebx,%r13d - psrld $10,%xmm7 - addl %r9d,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%r9d - movl %ecx,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - psrlq $2,%xmm6 - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - pxor %xmm6,%xmm7 - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %r10d,%edi - addl %r12d,%r8d - movdqa 0(%rsi),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - paddd %xmm7,%xmm0 - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - paddd %xmm0,%xmm6 - movl %eax,%r13d - addl %r8d,%r14d - movdqa %xmm6,0(%rsp) - rorl $14,%r13d - movdqa %xmm2,%xmm4 - movl %r14d,%r8d - movl %ebx,%r12d - movdqa %xmm0,%xmm7 - rorl $9,%r14d - 
xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d -.byte 102,15,58,15,225,4 - andl %eax,%r12d - xorl %eax,%r13d -.byte 102,15,58,15,251,4 - addl 16(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %r9d,%r15d - addl %r12d,%edx - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - paddd %xmm7,%xmm1 - rorl $2,%r14d - addl %edx,%r11d - psrld $7,%xmm6 - addl %edi,%edx - movl %r11d,%r13d - pshufd $250,%xmm0,%xmm7 - addl %edx,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%edx - movl %eax,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %edx,%r14d - pxor %xmm5,%xmm4 - andl %r11d,%r12d - xorl %r11d,%r13d - pslld $11,%xmm5 - addl 20(%rsp),%ecx - movl %edx,%edi - pxor %xmm6,%xmm4 - xorl %ebx,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %r8d,%edi - addl %r12d,%ecx - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - psrld $10,%xmm7 - addl %r13d,%ecx - xorl %r8d,%r15d - paddd %xmm4,%xmm1 - rorl $2,%r14d - addl %ecx,%r10d - psrlq $17,%xmm6 - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %r10d,%r13d - xorl %eax,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - psrldq $8,%xmm7 - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - paddd %xmm7,%xmm1 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - pshufd $80,%xmm1,%xmm7 - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - movdqa %xmm7,%xmm6 - addl %edi,%ebx - movl %r9d,%r13d - psrld $10,%xmm7 - addl %ebx,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%ebx - movl %r10d,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - psrlq $2,%xmm6 - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - pxor %xmm6,%xmm7 - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %ecx,%edi - addl %r12d,%eax - movdqa 16(%rsi),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - paddd %xmm7,%xmm1 - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - paddd %xmm1,%xmm6 - movl %r8d,%r13d - addl %eax,%r14d - movdqa %xmm6,16(%rsp) - rorl $14,%r13d - movdqa %xmm3,%xmm4 - movl %r14d,%eax - movl %r9d,%r12d - movdqa %xmm1,%xmm7 - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d -.byte 102,15,58,15,226,4 - andl %r8d,%r12d - xorl %r8d,%r13d -.byte 102,15,58,15,248,4 - addl 32(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %ebx,%r15d - addl %r12d,%r11d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - paddd %xmm7,%xmm2 - rorl $2,%r14d - addl %r11d,%edx - psrld $7,%xmm6 - addl %edi,%r11d - movl %edx,%r13d - pshufd $250,%xmm1,%xmm7 - addl %r11d,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%r11d - movl %r8d,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %r11d,%r14d - pxor %xmm5,%xmm4 - andl %edx,%r12d - xorl %edx,%r13d - pslld $11,%xmm5 - addl 36(%rsp),%r10d - movl %r11d,%edi - pxor %xmm6,%xmm4 - 
xorl %r9d,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %eax,%edi - addl %r12d,%r10d - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - psrld $10,%xmm7 - addl %r13d,%r10d - xorl %eax,%r15d - paddd %xmm4,%xmm2 - rorl $2,%r14d - addl %r10d,%ecx - psrlq $17,%xmm6 - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %ecx,%r13d - xorl %r8d,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - psrldq $8,%xmm7 - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - paddd %xmm7,%xmm2 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - pshufd $80,%xmm2,%xmm7 - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - movdqa %xmm7,%xmm6 - addl %edi,%r9d - movl %ebx,%r13d - psrld $10,%xmm7 - addl %r9d,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%r9d - movl %ecx,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - psrlq $2,%xmm6 - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - pxor %xmm6,%xmm7 - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %r10d,%edi - addl %r12d,%r8d - movdqa 32(%rsi),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - paddd %xmm7,%xmm2 - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - paddd %xmm2,%xmm6 - movl %eax,%r13d - addl %r8d,%r14d - movdqa %xmm6,32(%rsp) - rorl $14,%r13d - movdqa %xmm0,%xmm4 - movl %r14d,%r8d - movl %ebx,%r12d - movdqa %xmm2,%xmm7 - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d -.byte 102,15,58,15,227,4 - andl %eax,%r12d - xorl %eax,%r13d -.byte 102,15,58,15,249,4 - addl 48(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %r9d,%r15d - addl %r12d,%edx - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - paddd %xmm7,%xmm3 - rorl $2,%r14d - addl %edx,%r11d - psrld $7,%xmm6 - addl %edi,%edx - movl %r11d,%r13d - pshufd $250,%xmm2,%xmm7 - addl %edx,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%edx - movl %eax,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %edx,%r14d - pxor %xmm5,%xmm4 - andl %r11d,%r12d - xorl %r11d,%r13d - pslld $11,%xmm5 - addl 52(%rsp),%ecx - movl %edx,%edi - pxor %xmm6,%xmm4 - xorl %ebx,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %r8d,%edi - addl %r12d,%ecx - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - psrld $10,%xmm7 - addl %r13d,%ecx - xorl %r8d,%r15d - paddd %xmm4,%xmm3 - rorl $2,%r14d - addl %ecx,%r10d - psrlq $17,%xmm6 - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %r10d,%r13d - xorl %eax,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - psrldq $8,%xmm7 - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - paddd %xmm7,%xmm3 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - pshufd $80,%xmm3,%xmm7 - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d 
- movdqa %xmm7,%xmm6 - addl %edi,%ebx - movl %r9d,%r13d - psrld $10,%xmm7 - addl %ebx,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%ebx - movl %r10d,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - psrlq $2,%xmm6 - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - pxor %xmm6,%xmm7 - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %ecx,%edi - addl %r12d,%eax - movdqa 48(%rsi),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - paddd %xmm7,%xmm3 - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - paddd %xmm3,%xmm6 - movl %r8d,%r13d - addl %eax,%r14d - movdqa %xmm6,48(%rsp) - cmpb $0,67(%rsi) - jne .Lssse3_00_47 - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - rorl $6,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - rorl $2,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - rorl $11,%r14d - xorl %eax,%edi - addl %r12d,%r10d - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - rorl $2,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - rorl $6,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - rorl $6,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - rorl $2,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - rorl $11,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - rorl $2,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d 
- addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - xorl %ecx,%edi - addl %r12d,%eax - rorl $6,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - rorl $6,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - rorl $2,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - rorl $11,%r14d - xorl %eax,%edi - addl %r12d,%r10d - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - rorl $2,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - rorl $6,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - rorl $6,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - rorl $2,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - rorl $11,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - rorl 
$6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - rorl $2,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - xorl %ecx,%edi - addl %r12d,%eax - rorl $6,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq -64(%rbp),%rdi - movl %r14d,%eax - movq -56(%rbp),%rsi - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - leaq 64(%rsi),%rsi - cmpq -48(%rbp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb .Lloop_ssse3 - - xorps %xmm0,%xmm0 - movaps %xmm0,0(%rsp) - movaps %xmm0,16(%rsp) - movaps %xmm0,32(%rsp) - movaps %xmm0,48(%rsp) - movaps -128(%rbp),%xmm6 - movaps -112(%rbp),%xmm7 - movaps -96(%rbp),%xmm8 - movaps -80(%rbp),%xmm9 - movq -40(%rbp),%r15 - movq -32(%rbp),%r14 - movq -24(%rbp),%r13 - movq -16(%rbp),%r12 - movq -8(%rbp),%rbx - movq %rbp,%rsp - - popq %rbp - -.LSEH_epilogue_blst_sha256_block_data_order: - mov 8(%rsp),%rdi - mov 16(%rsp),%rsi - - .byte 0xf3,0xc3 - -.LSEH_end_blst_sha256_block_data_order: -.globl blst_sha256_emit - -.def blst_sha256_emit; .scl 2; .type 32; .endef -.p2align 4 -blst_sha256_emit: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rdx),%r8 - movq 8(%rdx),%r9 - movq 16(%rdx),%r10 - bswapq %r8 - movq 24(%rdx),%r11 - bswapq %r9 - movl %r8d,4(%rcx) - bswapq %r10 - movl %r9d,12(%rcx) - bswapq %r11 - movl %r10d,20(%rcx) - shrq $32,%r8 - movl %r11d,28(%rcx) - shrq $32,%r9 - movl %r8d,0(%rcx) - shrq $32,%r10 - movl %r9d,8(%rcx) - shrq $32,%r11 - movl %r10d,16(%rcx) - movl %r11d,24(%rcx) - .byte 0xf3,0xc3 - - -.globl blst_sha256_bcopy - -.def blst_sha256_bcopy; .scl 2; .type 32; .endef -.p2align 4 -blst_sha256_bcopy: - .byte 0xf3,0x0f,0x1e,0xfa - - subq %rdx,%rcx -.Loop_bcopy: - movzbl (%rdx),%eax - leaq 1(%rdx),%rdx - movb %al,-1(%rcx,%rdx,1) - decq %r8 - jnz .Loop_bcopy - .byte 0xf3,0xc3 - - -.globl blst_sha256_hcopy - -.def blst_sha256_hcopy; .scl 2; .type 32; .endef -.p2align 4 -blst_sha256_hcopy: - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rdx),%r8 - movq 8(%rdx),%r9 - movq 16(%rdx),%r10 - movq 24(%rdx),%r11 - movq %r8,0(%rcx) - movq %r9,8(%rcx) - movq %r10,16(%rcx) - movq %r11,24(%rcx) - .byte 0xf3,0xc3 - -.section .pdata -.p2align 2 -.rva .LSEH_begin_blst_sha256_block_data_order_shaext -.rva .LSEH_body_blst_sha256_block_data_order_shaext -.rva .LSEH_info_blst_sha256_block_data_order_shaext_prologue - -.rva .LSEH_body_blst_sha256_block_data_order_shaext -.rva .LSEH_epilogue_blst_sha256_block_data_order_shaext -.rva 
.LSEH_info_blst_sha256_block_data_order_shaext_body - -.rva .LSEH_epilogue_blst_sha256_block_data_order_shaext -.rva .LSEH_end_blst_sha256_block_data_order_shaext -.rva .LSEH_info_blst_sha256_block_data_order_shaext_epilogue - -.rva .LSEH_begin_blst_sha256_block_data_order -.rva .LSEH_body_blst_sha256_block_data_order -.rva .LSEH_info_blst_sha256_block_data_order_prologue - -.rva .LSEH_body_blst_sha256_block_data_order -.rva .LSEH_epilogue_blst_sha256_block_data_order -.rva .LSEH_info_blst_sha256_block_data_order_body - -.rva .LSEH_epilogue_blst_sha256_block_data_order -.rva .LSEH_end_blst_sha256_block_data_order -.rva .LSEH_info_blst_sha256_block_data_order_epilogue - -.section .xdata -.p2align 3 -.LSEH_info_blst_sha256_block_data_order_shaext_prologue: -.byte 1,4,6,0x05 -.byte 4,0x74,2,0 -.byte 4,0x64,3,0 -.byte 4,0x53 -.byte 1,0x50 -.long 0,0 -.LSEH_info_blst_sha256_block_data_order_shaext_body: -.byte 1,0,17,85 -.byte 0x00,0x68,0x00,0x00 -.byte 0x00,0x78,0x01,0x00 -.byte 0x00,0x88,0x02,0x00 -.byte 0x00,0x98,0x03,0x00 -.byte 0x00,0xa8,0x04,0x00 -.byte 0x00,0x74,0x0c,0x00 -.byte 0x00,0x64,0x0d,0x00 -.byte 0x00,0x53 -.byte 0x00,0x92 -.byte 0x00,0x50 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_blst_sha256_block_data_order_shaext_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - -.LSEH_info_blst_sha256_block_data_order_prologue: -.byte 1,4,6,0x05 -.byte 4,0x74,2,0 -.byte 4,0x64,3,0 -.byte 4,0x53 -.byte 1,0x50 -.long 0,0 -.LSEH_info_blst_sha256_block_data_order_body: -.byte 1,0,25,133 -.byte 0x00,0x68,0x00,0x00 -.byte 0x00,0x78,0x01,0x00 -.byte 0x00,0x88,0x02,0x00 -.byte 0x00,0x98,0x03,0x00 -.byte 0x00,0xf4,0x0b,0x00 -.byte 0x00,0xe4,0x0c,0x00 -.byte 0x00,0xd4,0x0d,0x00 -.byte 0x00,0xc4,0x0e,0x00 -.byte 0x00,0x34,0x0f,0x00 -.byte 0x00,0x74,0x12,0x00 -.byte 0x00,0x64,0x13,0x00 -.byte 0x00,0x53 -.byte 0x00,0xf2 -.byte 0x00,0x50 -.byte 0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0x00,0x00,0x00,0x00 -.LSEH_info_blst_sha256_block_data_order_epilogue: -.byte 1,0,4,0 -.byte 0x00,0x74,0x01,0x00 -.byte 0x00,0x64,0x02,0x00 -.byte 0x00,0x00,0x00,0x00 - diff --git a/crypto/blst_src/build/elf/add_mod_256-armv8.S b/crypto/blst_src/build/elf/add_mod_256-armv8.S deleted file mode 100644 index 57476aaa1da..00000000000 --- a/crypto/blst_src/build/elf/add_mod_256-armv8.S +++ /dev/null @@ -1,379 +0,0 @@ -.text - -.globl add_mod_256 -.hidden add_mod_256 -.type add_mod_256,%function -.align 5 -add_mod_256: - ldp x8,x9,[x1] - ldp x12,x13,[x2] - - ldp x10,x11,[x1,#16] - adds x8,x8,x12 - ldp x14,x15,[x2,#16] - adcs x9,x9,x13 - ldp x4,x5,[x3] - adcs x10,x10,x14 - ldp x6,x7,[x3,#16] - adcs x11,x11,x15 - adc x3,xzr,xzr - - subs x16,x8,x4 - sbcs x17,x9,x5 - sbcs x1,x10,x6 - sbcs x2,x11,x7 - sbcs xzr,x3,xzr - - csel x8,x8,x16,lo - csel x9,x9,x17,lo - csel x10,x10,x1,lo - stp x8,x9,[x0] - csel x11,x11,x2,lo - stp x10,x11,[x0,#16] - - ret -.size add_mod_256,.-add_mod_256 - -.globl mul_by_3_mod_256 -.hidden mul_by_3_mod_256 -.type mul_by_3_mod_256,%function -.align 5 -mul_by_3_mod_256: - ldp x12,x13,[x1] - ldp x14,x15,[x1,#16] - - adds x8,x12,x12 - ldp x4,x5,[x2] - adcs x9,x13,x13 - ldp x6,x7,[x2,#16] - adcs x10,x14,x14 - adcs x11,x15,x15 - adc x3,xzr,xzr - - subs x16,x8,x4 - sbcs x17,x9,x5 - sbcs x1,x10,x6 - sbcs x2,x11,x7 - sbcs xzr,x3,xzr - - csel x8,x8,x16,lo - csel x9,x9,x17,lo - csel x10,x10,x1,lo - csel x11,x11,x2,lo - - adds x8,x8,x12 - adcs x9,x9,x13 - adcs x10,x10,x14 - adcs x11,x11,x15 - adc x3,xzr,xzr - - subs x16,x8,x4 - sbcs 
x17,x9,x5 - sbcs x1,x10,x6 - sbcs x2,x11,x7 - sbcs xzr,x3,xzr - - csel x8,x8,x16,lo - csel x9,x9,x17,lo - csel x10,x10,x1,lo - stp x8,x9,[x0] - csel x11,x11,x2,lo - stp x10,x11,[x0,#16] - - ret -.size mul_by_3_mod_256,.-mul_by_3_mod_256 - -.globl lshift_mod_256 -.hidden lshift_mod_256 -.type lshift_mod_256,%function -.align 5 -lshift_mod_256: - ldp x8,x9,[x1] - ldp x10,x11,[x1,#16] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - -.Loop_lshift_mod_256: - adds x8,x8,x8 - sub x2,x2,#1 - adcs x9,x9,x9 - adcs x10,x10,x10 - adcs x11,x11,x11 - adc x3,xzr,xzr - - subs x12,x8,x4 - sbcs x13,x9,x5 - sbcs x14,x10,x6 - sbcs x15,x11,x7 - sbcs xzr,x3,xzr - - csel x8,x8,x12,lo - csel x9,x9,x13,lo - csel x10,x10,x14,lo - csel x11,x11,x15,lo - - cbnz x2,.Loop_lshift_mod_256 - - stp x8,x9,[x0] - stp x10,x11,[x0,#16] - - ret -.size lshift_mod_256,.-lshift_mod_256 - -.globl rshift_mod_256 -.hidden rshift_mod_256 -.type rshift_mod_256,%function -.align 5 -rshift_mod_256: - ldp x8,x9,[x1] - ldp x10,x11,[x1,#16] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - -.Loop_rshift: - adds x12,x8,x4 - sub x2,x2,#1 - adcs x13,x9,x5 - adcs x14,x10,x6 - adcs x15,x11,x7 - adc x3,xzr,xzr - tst x8,#1 - - csel x12,x12,x8,ne - csel x13,x13,x9,ne - csel x14,x14,x10,ne - csel x15,x15,x11,ne - csel x3,x3,xzr,ne - - extr x8,x13,x12,#1 - extr x9,x14,x13,#1 - extr x10,x15,x14,#1 - extr x11,x3,x15,#1 - - cbnz x2,.Loop_rshift - - stp x8,x9,[x0] - stp x10,x11,[x0,#16] - - ret -.size rshift_mod_256,.-rshift_mod_256 - -.globl cneg_mod_256 -.hidden cneg_mod_256 -.type cneg_mod_256,%function -.align 5 -cneg_mod_256: - ldp x8,x9,[x1] - ldp x4,x5,[x3] - - ldp x10,x11,[x1,#16] - subs x12,x4,x8 - ldp x6,x7,[x3,#16] - orr x4,x8,x9 - sbcs x13,x5,x9 - orr x5,x10,x11 - sbcs x14,x6,x10 - orr x3,x4,x5 - sbc x15,x7,x11 - - cmp x3,#0 - csetm x3,ne - ands x2,x2,x3 - - csel x8,x8,x12,eq - csel x9,x9,x13,eq - csel x10,x10,x14,eq - stp x8,x9,[x0] - csel x11,x11,x15,eq - stp x10,x11,[x0,#16] - - ret -.size cneg_mod_256,.-cneg_mod_256 - -.globl sub_mod_256 -.hidden sub_mod_256 -.type sub_mod_256,%function -.align 5 -sub_mod_256: - ldp x8,x9,[x1] - ldp x12,x13,[x2] - - ldp x10,x11,[x1,#16] - subs x8,x8,x12 - ldp x14,x15,[x2,#16] - sbcs x9,x9,x13 - ldp x4,x5,[x3] - sbcs x10,x10,x14 - ldp x6,x7,[x3,#16] - sbcs x11,x11,x15 - sbc x3,xzr,xzr - - and x4,x4,x3 - and x5,x5,x3 - adds x8,x8,x4 - and x6,x6,x3 - adcs x9,x9,x5 - and x7,x7,x3 - adcs x10,x10,x6 - stp x8,x9,[x0] - adc x11,x11,x7 - stp x10,x11,[x0,#16] - - ret -.size sub_mod_256,.-sub_mod_256 - -.globl check_mod_256 -.hidden check_mod_256 -.type check_mod_256,%function -.align 5 -check_mod_256: - ldp x8,x9,[x0] - ldp x10,x11,[x0,#16] - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x9,x9 - rev x10,x10 - rev x11,x11 -#endif - - subs xzr,x8,x4 - sbcs xzr,x9,x5 - orr x8,x8,x9 - sbcs xzr,x10,x6 - orr x8,x8,x10 - sbcs xzr,x11,x7 - orr x8,x8,x11 - sbc x1,xzr,xzr - - cmp x8,#0 - mov x0,#1 - csel x0,x0,xzr,ne - and x0,x0,x1 - - ret -.size check_mod_256,.-check_mod_256 - -.globl add_n_check_mod_256 -.hidden add_n_check_mod_256 -.type add_n_check_mod_256,%function -.align 5 -add_n_check_mod_256: - ldp x8,x9,[x1] - ldp x12,x13,[x2] - ldp x10,x11,[x1,#16] - ldp x14,x15,[x2,#16] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 - rev x10,x10 - rev x14,x14 - rev x11,x11 - rev x15,x15 -#endif - - adds x8,x8,x12 - ldp x4,x5,[x3] - adcs x9,x9,x13 - ldp x6,x7,[x3,#16] - adcs x10,x10,x14 - adcs x11,x11,x15 - adc x3,xzr,xzr - - subs x16,x8,x4 - sbcs x17,x9,x5 - sbcs x1,x10,x6 - sbcs 
x2,x11,x7 - sbcs xzr,x3,xzr - - csel x8,x8,x16,lo - csel x9,x9,x17,lo - csel x10,x10,x1,lo - csel x11,x11,x2,lo - - orr x16, x8, x9 - orr x17, x10, x11 - orr x16, x16, x17 - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x9,x9 - rev x10,x10 - rev x11,x11 -#endif - - stp x8,x9,[x0] - stp x10,x11,[x0,#16] - - mov x17, #1 - cmp x16, #0 - csel x0, x17, xzr, ne - - ret -.size add_n_check_mod_256,.-add_n_check_mod_256 - -.globl sub_n_check_mod_256 -.hidden sub_n_check_mod_256 -.type sub_n_check_mod_256,%function -.align 5 -sub_n_check_mod_256: - ldp x8,x9,[x1] - ldp x12,x13,[x2] - ldp x10,x11,[x1,#16] - ldp x14,x15,[x2,#16] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 - rev x10,x10 - rev x14,x14 - rev x11,x11 - rev x15,x15 -#endif - - subs x8,x8,x12 - sbcs x9,x9,x13 - ldp x4,x5,[x3] - sbcs x10,x10,x14 - ldp x6,x7,[x3,#16] - sbcs x11,x11,x15 - sbc x3,xzr,xzr - - and x4,x4,x3 - and x5,x5,x3 - adds x8,x8,x4 - and x6,x6,x3 - adcs x9,x9,x5 - and x7,x7,x3 - adcs x10,x10,x6 - adc x11,x11,x7 - - orr x16, x8, x9 - orr x17, x10, x11 - orr x16, x16, x17 - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x9,x9 - rev x10,x10 - rev x11,x11 -#endif - - stp x8,x9,[x0] - stp x10,x11,[x0,#16] - - mov x17, #1 - cmp x16, #0 - csel x0, x17, xzr, ne - - ret -.size sub_n_check_mod_256,.-sub_n_check_mod_256 diff --git a/crypto/blst_src/build/elf/add_mod_256-x86_64.s b/crypto/blst_src/build/elf/add_mod_256-x86_64.s deleted file mode 100644 index 2f41781959c..00000000000 --- a/crypto/blst_src/build/elf/add_mod_256-x86_64.s +++ /dev/null @@ -1,572 +0,0 @@ -.text - -.globl add_mod_256 -.hidden add_mod_256 -.type add_mod_256,@function -.align 32 -add_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - -.Loaded_a_add_mod_256: - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - movq %r8,%rax - adcq 16(%rdx),%r10 - movq %r9,%rsi - adcq 24(%rdx),%r11 - sbbq %rdx,%rdx - - movq %r10,%rbx - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - sbbq 16(%rcx),%r10 - movq %r11,%rbp - sbbq 24(%rcx),%r11 - sbbq $0,%rdx - - cmovcq %rax,%r8 - cmovcq %rsi,%r9 - movq %r8,0(%rdi) - cmovcq %rbx,%r10 - movq %r9,8(%rdi) - cmovcq %rbp,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc -.size add_mod_256,.-add_mod_256 - - -.globl mul_by_3_mod_256 -.hidden mul_by_3_mod_256 -.type mul_by_3_mod_256,@function -.align 32 -mul_by_3_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - - - movq %rdx,%rcx - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq %rsi,%rdx - movq 24(%rsi),%r11 - - call __lshift_mod_256 - movq 0(%rsp),%r12 -.cfi_restore %r12 - jmp .Loaded_a_add_mod_256 - - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc -.size mul_by_3_mod_256,.-mul_by_3_mod_256 - -.type __lshift_mod_256,@function -.align 32 -__lshift_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - addq %r8,%r8 - adcq %r9,%r9 - movq %r8,%rax - adcq %r10,%r10 - movq 
%r9,%rsi - adcq %r11,%r11 - sbbq %r12,%r12 - - movq %r10,%rbx - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - sbbq 16(%rcx),%r10 - movq %r11,%rbp - sbbq 24(%rcx),%r11 - sbbq $0,%r12 - - cmovcq %rax,%r8 - cmovcq %rsi,%r9 - cmovcq %rbx,%r10 - cmovcq %rbp,%r11 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __lshift_mod_256,.-__lshift_mod_256 - - -.globl lshift_mod_256 -.hidden lshift_mod_256 -.type lshift_mod_256,@function -.align 32 -lshift_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - -.Loop_lshift_mod_256: - call __lshift_mod_256 - decl %edx - jnz .Loop_lshift_mod_256 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - movq 0(%rsp),%r12 -.cfi_restore %r12 - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc -.size lshift_mod_256,.-lshift_mod_256 - - -.globl rshift_mod_256 -.hidden rshift_mod_256 -.type rshift_mod_256,@function -.align 32 -rshift_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%rbp - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - -.Loop_rshift_mod_256: - movq %rbp,%r8 - andq $1,%rbp - movq 0(%rcx),%rax - negq %rbp - movq 8(%rcx),%rsi - movq 16(%rcx),%rbx - - andq %rbp,%rax - andq %rbp,%rsi - andq %rbp,%rbx - andq 24(%rcx),%rbp - - addq %rax,%r8 - adcq %rsi,%r9 - adcq %rbx,%r10 - adcq %rbp,%r11 - sbbq %rax,%rax - - shrq $1,%r8 - movq %r9,%rbp - shrq $1,%r9 - movq %r10,%rbx - shrq $1,%r10 - movq %r11,%rsi - shrq $1,%r11 - - shlq $63,%rbp - shlq $63,%rbx - orq %r8,%rbp - shlq $63,%rsi - orq %rbx,%r9 - shlq $63,%rax - orq %rsi,%r10 - orq %rax,%r11 - - decl %edx - jnz .Loop_rshift_mod_256 - - movq %rbp,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc -.size rshift_mod_256,.-rshift_mod_256 - - -.globl cneg_mod_256 -.hidden cneg_mod_256 -.type cneg_mod_256,@function -.align 32 -cneg_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - - - movq 0(%rsi),%r12 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq %r12,%r8 - movq 24(%rsi),%r11 - orq %r9,%r12 - orq %r10,%r12 - orq %r11,%r12 - movq $-1,%rbp - - movq 0(%rcx),%rax - cmovnzq %rbp,%r12 - movq 8(%rcx),%rsi - movq 16(%rcx),%rbx - andq %r12,%rax - movq 24(%rcx),%rbp - andq %r12,%rsi - andq %r12,%rbx - andq %r12,%rbp - - subq %r8,%rax - sbbq %r9,%rsi - sbbq %r10,%rbx - sbbq %r11,%rbp - - orq %rdx,%rdx - - cmovzq %r8,%rax - cmovzq %r9,%rsi - movq %rax,0(%rdi) - cmovzq %r10,%rbx - movq %rsi,8(%rdi) - cmovzq %r11,%rbp - movq %rbx,16(%rdi) - movq %rbp,24(%rdi) - - movq 0(%rsp),%r12 -.cfi_restore %r12 - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc -.size 
cneg_mod_256,.-cneg_mod_256 - - -.globl sub_mod_256 -.hidden sub_mod_256 -.type sub_mod_256,@function -.align 32 -sub_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - - subq 0(%rdx),%r8 - movq 0(%rcx),%rax - sbbq 8(%rdx),%r9 - movq 8(%rcx),%rsi - sbbq 16(%rdx),%r10 - movq 16(%rcx),%rbx - sbbq 24(%rdx),%r11 - movq 24(%rcx),%rbp - sbbq %rdx,%rdx - - andq %rdx,%rax - andq %rdx,%rsi - andq %rdx,%rbx - andq %rdx,%rbp - - addq %rax,%r8 - adcq %rsi,%r9 - movq %r8,0(%rdi) - adcq %rbx,%r10 - movq %r9,8(%rdi) - adcq %rbp,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc -.size sub_mod_256,.-sub_mod_256 - - -.globl check_mod_256 -.hidden check_mod_256 -.type check_mod_256,@function -.align 32 -check_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - movq 0(%rdi),%rax - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - - movq %rax,%r8 - orq %r9,%rax - orq %r10,%rax - orq %r11,%rax - - subq 0(%rsi),%r8 - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - sbbq 24(%rsi),%r11 - sbbq %rsi,%rsi - - movq $1,%rdx - cmpq $0,%rax - cmovneq %rdx,%rax - andq %rsi,%rax - - .byte 0xf3,0xc3 -.cfi_endproc -.size check_mod_256,.-check_mod_256 - - -.globl add_n_check_mod_256 -.hidden add_n_check_mod_256 -.type add_n_check_mod_256,@function -.align 32 -add_n_check_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - movq %r8,%rax - adcq 16(%rdx),%r10 - movq %r9,%rsi - adcq 24(%rdx),%r11 - sbbq %rdx,%rdx - - movq %r10,%rbx - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - sbbq 16(%rcx),%r10 - movq %r11,%rbp - sbbq 24(%rcx),%r11 - sbbq $0,%rdx - - cmovcq %rax,%r8 - cmovcq %rsi,%r9 - movq %r8,0(%rdi) - cmovcq %rbx,%r10 - movq %r9,8(%rdi) - cmovcq %rbp,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - orq %r9,%r8 - orq %r11,%r10 - orq %r10,%r8 - movq $1,%rax - cmovzq %r8,%rax - - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc -.size add_n_check_mod_256,.-add_n_check_mod_256 - - -.globl sub_n_check_mod_256 -.hidden sub_n_check_mod_256 -.type sub_n_check_mod_256,@function -.align 32 -sub_n_check_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - - subq 0(%rdx),%r8 - movq 0(%rcx),%rax - sbbq 8(%rdx),%r9 - movq 8(%rcx),%rsi - sbbq 16(%rdx),%r10 - movq 16(%rcx),%rbx - sbbq 24(%rdx),%r11 - movq 24(%rcx),%rbp - sbbq %rdx,%rdx - - andq %rdx,%rax - andq %rdx,%rsi - andq %rdx,%rbx - andq %rdx,%rbp - - addq %rax,%r8 - adcq %rsi,%r9 - movq %r8,0(%rdi) - adcq %rbx,%r10 - movq %r9,8(%rdi) - adcq %rbp,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - orq %r9,%r8 - orq %r11,%r10 - 
orq %r10,%r8 - movq $1,%rax - cmovzq %r8,%rax - - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc -.size sub_n_check_mod_256,.-sub_n_check_mod_256 - -.section .note.GNU-stack,"",@progbits -.section .note.gnu.property,"a",@note - .long 4,2f-1f,5 - .byte 0x47,0x4E,0x55,0 -1: .long 0xc0000002,4,3 -.align 8 -2: diff --git a/crypto/blst_src/build/elf/add_mod_384-armv8.S b/crypto/blst_src/build/elf/add_mod_384-armv8.S deleted file mode 100644 index 5c18d7fe892..00000000000 --- a/crypto/blst_src/build/elf/add_mod_384-armv8.S +++ /dev/null @@ -1,1000 +0,0 @@ -.text - -.globl add_mod_384 -.hidden add_mod_384 -.type add_mod_384,%function -.align 5 -add_mod_384: - .inst 0xd503233f - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - - bl __add_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - .inst 0xd50323bf - ret -.size add_mod_384,.-add_mod_384 - -.type __add_mod_384,%function -.align 5 -__add_mod_384: - ldp x10,x11,[x1] - ldp x16,x17,[x2] - ldp x12,x13,[x1,#16] - ldp x19,x20,[x2,#16] - ldp x14,x15,[x1,#32] - ldp x21,x22,[x2,#32] - -__add_mod_384_ab_are_loaded: - adds x10,x10,x16 - adcs x11,x11,x17 - adcs x12,x12,x19 - adcs x13,x13,x20 - adcs x14,x14,x21 - adcs x15,x15,x22 - adc x3,xzr,xzr - - subs x16,x10,x4 - sbcs x17,x11,x5 - sbcs x19,x12,x6 - sbcs x20,x13,x7 - sbcs x21,x14,x8 - sbcs x22,x15,x9 - sbcs xzr,x3,xzr - - csel x10,x10,x16,lo - csel x11,x11,x17,lo - csel x12,x12,x19,lo - csel x13,x13,x20,lo - csel x14,x14,x21,lo - csel x15,x15,x22,lo - - ret -.size __add_mod_384,.-__add_mod_384 - -.globl add_mod_384x -.hidden add_mod_384x -.type add_mod_384x,%function -.align 5 -add_mod_384x: - .inst 0xd503233f - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - - bl __add_mod_384 - - stp x10,x11,[x0] - add x1,x1,#48 - stp x12,x13,[x0,#16] - add x2,x2,#48 - stp x14,x15,[x0,#32] - - bl __add_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - .inst 0xd50323bf - ret -.size add_mod_384x,.-add_mod_384x - -.globl rshift_mod_384 -.hidden rshift_mod_384 -.type rshift_mod_384,%function -.align 5 -rshift_mod_384: - .inst 0xd503233f - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - -.Loop_rshift_mod_384: - sub x2,x2,#1 - bl __rshift_mod_384 - cbnz x2,.Loop_rshift_mod_384 - - ldr x30,[sp,#8] - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - .inst 0xd50323bf - ret -.size rshift_mod_384,.-rshift_mod_384 - -.type __rshift_mod_384,%function -.align 5 -__rshift_mod_384: - sbfx x22,x10,#0,#1 - and x16,x22,x4 - and x17,x22,x5 - adds x10,x10,x16 - and x19,x22,x6 - adcs x11,x11,x17 - and x20,x22,x7 - adcs x12,x12,x19 - and x21,x22,x8 - adcs x13,x13,x20 - and x22,x22,x9 - adcs x14,x14,x21 - extr x10,x11,x10,#1 // a[0:5] >>= 1 - adcs x15,x15,x22 - extr x11,x12,x11,#1 - adc x22,xzr,xzr - extr x12,x13,x12,#1 - extr x13,x14,x13,#1 - extr x14,x15,x14,#1 - extr x15,x22,x15,#1 - ret -.size __rshift_mod_384,.-__rshift_mod_384 - -.globl div_by_2_mod_384 -.hidden div_by_2_mod_384 -.type div_by_2_mod_384,%function -.align 5 -div_by_2_mod_384: - .inst 0xd503233f - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __rshift_mod_384 - - ldr x30,[sp,#8] - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - .inst 0xd50323bf - ret -.size div_by_2_mod_384,.-div_by_2_mod_384 - -.globl lshift_mod_384 -.hidden lshift_mod_384 -.type lshift_mod_384,%function -.align 5 -lshift_mod_384: - .inst 0xd503233f - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - -.Loop_lshift_mod_384: - sub x2,x2,#1 - bl __lshift_mod_384 - cbnz x2,.Loop_lshift_mod_384 - - ldr x30,[sp,#8] - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - .inst 0xd50323bf - ret -.size lshift_mod_384,.-lshift_mod_384 - -.type __lshift_mod_384,%function -.align 5 -__lshift_mod_384: - adds x10,x10,x10 - adcs x11,x11,x11 - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adc x3,xzr,xzr - - subs x16,x10,x4 - sbcs x17,x11,x5 - sbcs x19,x12,x6 - sbcs x20,x13,x7 - sbcs x21,x14,x8 - sbcs x22,x15,x9 - sbcs xzr,x3,xzr - - csel x10,x10,x16,lo - csel x11,x11,x17,lo - csel x12,x12,x19,lo - csel x13,x13,x20,lo - csel x14,x14,x21,lo - csel x15,x15,x22,lo - - ret -.size __lshift_mod_384,.-__lshift_mod_384 - -.globl mul_by_3_mod_384 -.hidden mul_by_3_mod_384 -.type mul_by_3_mod_384,%function -.align 5 -mul_by_3_mod_384: - .inst 0xd503233f - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __lshift_mod_384 - - ldp x16,x17,[x1] - ldp x19,x20,[x1,#16] - ldp x21,x22,[x1,#32] - - bl __add_mod_384_ab_are_loaded - ldr x30,[sp,#8] - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - .inst 0xd50323bf - ret -.size mul_by_3_mod_384,.-mul_by_3_mod_384 - -.globl mul_by_8_mod_384 -.hidden mul_by_8_mod_384 -.type mul_by_8_mod_384,%function -.align 5 -mul_by_8_mod_384: - .inst 0xd503233f - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - .inst 0xd50323bf - ret -.size mul_by_8_mod_384,.-mul_by_8_mod_384 - -.globl mul_by_3_mod_384x -.hidden mul_by_3_mod_384x -.type mul_by_3_mod_384x,%function -.align 5 -mul_by_3_mod_384x: - .inst 0xd503233f - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __lshift_mod_384 - - ldp x16,x17,[x1] - ldp x19,x20,[x1,#16] - ldp x21,x22,[x1,#32] - - bl __add_mod_384_ab_are_loaded - - stp x10,x11,[x0] - ldp x10,x11,[x1,#48] - stp x12,x13,[x0,#16] - ldp x12,x13,[x1,#64] - stp x14,x15,[x0,#32] - ldp x14,x15,[x1,#80] - - bl __lshift_mod_384 - - ldp x16,x17,[x1,#48] - ldp x19,x20,[x1,#64] - ldp x21,x22,[x1,#80] - - bl __add_mod_384_ab_are_loaded - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - .inst 0xd50323bf - ret -.size mul_by_3_mod_384x,.-mul_by_3_mod_384x - -.globl mul_by_8_mod_384x -.hidden mul_by_8_mod_384x -.type mul_by_8_mod_384x,%function -.align 5 -mul_by_8_mod_384x: - .inst 0xd503233f - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - - stp x10,x11,[x0] - ldp x10,x11,[x1,#48] - stp x12,x13,[x0,#16] - ldp x12,x13,[x1,#64] - stp x14,x15,[x0,#32] - ldp x14,x15,[x1,#80] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - .inst 0xd50323bf - ret -.size mul_by_8_mod_384x,.-mul_by_8_mod_384x - -.globl cneg_mod_384 -.hidden cneg_mod_384 -.type cneg_mod_384,%function -.align 5 -cneg_mod_384: - .inst 0xd503233f - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x4,x5,[x3] - ldp x12,x13,[x1,#16] - ldp x6,x7,[x3,#16] - - subs x16,x4,x10 - ldp x14,x15,[x1,#32] - ldp x8,x9,[x3,#32] - orr x3,x10,x11 - sbcs x17,x5,x11 - orr x3,x3,x12 - sbcs x19,x6,x12 - orr x3,x3,x13 - sbcs x20,x7,x13 - orr x3,x3,x14 - sbcs x21,x8,x14 - orr x3,x3,x15 - sbc x22,x9,x15 - - cmp x3,#0 - csetm x3,ne - ands x2,x2,x3 - - csel x10,x10,x16,eq - csel x11,x11,x17,eq - csel x12,x12,x19,eq - csel x13,x13,x20,eq - stp x10,x11,[x0] - csel x14,x14,x21,eq - stp x12,x13,[x0,#16] - csel x15,x15,x22,eq - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - .inst 0xd50323bf - ret -.size cneg_mod_384,.-cneg_mod_384 - -.globl sub_mod_384 -.hidden sub_mod_384 -.type sub_mod_384,%function -.align 5 -sub_mod_384: - .inst 0xd503233f - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - - bl __sub_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - .inst 0xd50323bf - ret -.size sub_mod_384,.-sub_mod_384 - -.type __sub_mod_384,%function -.align 5 -__sub_mod_384: - ldp x10,x11,[x1] - ldp x16,x17,[x2] - ldp x12,x13,[x1,#16] - ldp x19,x20,[x2,#16] - ldp x14,x15,[x1,#32] - ldp x21,x22,[x2,#32] - - subs x10,x10,x16 - sbcs x11,x11,x17 - sbcs x12,x12,x19 - sbcs x13,x13,x20 - sbcs x14,x14,x21 - sbcs x15,x15,x22 - sbc x3,xzr,xzr - - and x16,x4,x3 - and x17,x5,x3 - adds x10,x10,x16 - and x19,x6,x3 - adcs x11,x11,x17 - and x20,x7,x3 - adcs x12,x12,x19 - and x21,x8,x3 - adcs x13,x13,x20 - and x22,x9,x3 - adcs x14,x14,x21 - adc x15,x15,x22 - - ret -.size __sub_mod_384,.-__sub_mod_384 - -.globl sub_mod_384x -.hidden sub_mod_384x -.type sub_mod_384x,%function -.align 5 -sub_mod_384x: - .inst 0xd503233f - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - - bl __sub_mod_384 - - stp x10,x11,[x0] - add x1,x1,#48 - stp x12,x13,[x0,#16] - add x2,x2,#48 - stp x14,x15,[x0,#32] - - bl __sub_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - .inst 0xd50323bf - ret -.size sub_mod_384x,.-sub_mod_384x - -.globl mul_by_1_plus_i_mod_384x -.hidden mul_by_1_plus_i_mod_384x -.type mul_by_1_plus_i_mod_384x,%function -.align 5 -mul_by_1_plus_i_mod_384x: - .inst 0xd503233f - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - add x2,x1,#48 - - bl __sub_mod_384 // a->re - a->im - - ldp x16,x17,[x1] - ldp x19,x20,[x1,#16] - ldp x21,x22,[x1,#32] - stp x10,x11,[x0] - ldp x10,x11,[x1,#48] - stp x12,x13,[x0,#16] - ldp x12,x13,[x1,#64] - stp x14,x15,[x0,#32] - ldp x14,x15,[x1,#80] - - bl __add_mod_384_ab_are_loaded // a->re + a->im - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - .inst 0xd50323bf - ret -.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x - -.globl sgn0_pty_mod_384 -.hidden sgn0_pty_mod_384 -.type sgn0_pty_mod_384,%function -.align 5 -sgn0_pty_mod_384: - ldp x10,x11,[x0] - ldp x12,x13,[x0,#16] - ldp x14,x15,[x0,#32] - - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - ldp x8,x9,[x1,#32] - - and x0,x10,#1 - adds x10,x10,x10 - adcs x11,x11,x11 - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adc x3,xzr,xzr - - subs x10,x10,x4 - sbcs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbc x3,x3,xzr - - mvn x3,x3 - and x3,x3,#2 - orr x0,x0,x3 - - ret -.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 - -.globl sgn0_pty_mod_384x -.hidden sgn0_pty_mod_384x -.type sgn0_pty_mod_384x,%function -.align 5 -sgn0_pty_mod_384x: - ldp x10,x11,[x0] - ldp x12,x13,[x0,#16] - ldp x14,x15,[x0,#32] - - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - ldp x8,x9,[x1,#32] - - and x2,x10,#1 - orr x3,x10,x11 - adds x10,x10,x10 - orr x3,x3,x12 - adcs x11,x11,x11 - orr x3,x3,x13 - adcs x12,x12,x12 - orr x3,x3,x14 - adcs x13,x13,x13 - orr x3,x3,x15 - adcs x14,x14,x14 - adcs x15,x15,x15 - adc x16,xzr,xzr - - subs x10,x10,x4 - sbcs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbc x16,x16,xzr - - ldp x10,x11,[x0,#48] - ldp x12,x13,[x0,#64] - ldp x14,x15,[x0,#80] - - mvn x16,x16 - and x16,x16,#2 - orr x2,x2,x16 - - and x0,x10,#1 - orr x1,x10,x11 - adds x10,x10,x10 - orr x1,x1,x12 - adcs x11,x11,x11 - orr x1,x1,x13 - adcs x12,x12,x12 - orr x1,x1,x14 - adcs x13,x13,x13 - orr x1,x1,x15 - adcs x14,x14,x14 - adcs x15,x15,x15 - adc x16,xzr,xzr - - subs x10,x10,x4 - sbcs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbc x16,x16,xzr - - mvn x16,x16 - and x16,x16,#2 - orr x0,x0,x16 - - cmp x3,#0 - csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) - - cmp x1,#0 - csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) - - and x3,x3,#1 - and x1,x1,#2 - orr x0,x1,x3 // pack sign and parity - - ret -.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x -.globl vec_select_32 -.hidden vec_select_32 -.type vec_select_32,%function -.align 5 -vec_select_32: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - bit v1.16b, v4.16b, v6.16b - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0] - ret -.size vec_select_32,.-vec_select_32 -.globl vec_select_48 -.hidden vec_select_48 -.type vec_select_48,%function -.align 5 -vec_select_48: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - bit v1.16b, v4.16b, v6.16b - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0] - ret -.size vec_select_48,.-vec_select_48 -.globl vec_select_96 -.hidden vec_select_96 -.type vec_select_96,%function -.align 5 -vec_select_96: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - bit v17.16b, v20.16b, v6.16b - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0] - ret -.size vec_select_96,.-vec_select_96 -.globl vec_select_192 -.hidden vec_select_192 -.type vec_select_192,%function -.align 5 -vec_select_192: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - bit v17.16b, v20.16b, v6.16b - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - bit v17.16b, v20.16b, v6.16b - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0] - ret -.size vec_select_192,.-vec_select_192 -.globl vec_select_144 -.hidden vec_select_144 -.type vec_select_144,%function -.align 5 -vec_select_144: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - bit v17.16b, v20.16b, v6.16b - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 - bit v0.16b, v3.16b, v6.16b - bit v1.16b, v4.16b, v6.16b - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0] - ret -.size vec_select_144,.-vec_select_144 -.globl vec_select_288 -.hidden vec_select_288 -.type vec_select_288,%function -.align 5 -vec_select_288: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, 
[x2],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - bit v17.16b, v20.16b, v6.16b - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - bit v17.16b, v20.16b, v6.16b - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - bit v17.16b, v20.16b, v6.16b - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0] - ret -.size vec_select_288,.-vec_select_288 -.globl vec_prefetch -.hidden vec_prefetch -.type vec_prefetch,%function -.align 5 -vec_prefetch: - add x1, x1, x0 - sub x1, x1, #1 - mov x2, #64 - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - csel x2, xzr, x2, hi - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - csel x2, xzr, x2, hi - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - csel x2, xzr, x2, hi - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - csel x2, xzr, x2, hi - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - csel x2, xzr, x2, hi - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - prfm pldl1keep, [x0] - ret -.size vec_prefetch,.-vec_prefetch -.globl vec_is_zero_16x -.hidden vec_is_zero_16x -.type vec_is_zero_16x,%function -.align 5 -vec_is_zero_16x: - ld1 {v0.2d}, [x0], #16 - lsr x1, x1, #4 - sub x1, x1, #1 - cbz x1, .Loop_is_zero_done - -.Loop_is_zero: - ld1 {v1.2d}, [x0], #16 - orr v0.16b, v0.16b, v1.16b - sub x1, x1, #1 - cbnz x1, .Loop_is_zero - -.Loop_is_zero_done: - dup v1.2d, v0.d[1] - orr v0.16b, v0.16b, v1.16b - mov x1, v0.d[0] - mov x0, #1 - cmp x1, #0 - csel x0, x0, xzr, eq - ret -.size vec_is_zero_16x,.-vec_is_zero_16x -.globl vec_is_equal_16x -.hidden vec_is_equal_16x -.type vec_is_equal_16x,%function -.align 5 -vec_is_equal_16x: - ld1 {v0.2d}, [x0], #16 - ld1 {v1.2d}, [x1], #16 - lsr x2, x2, #4 - eor v0.16b, v0.16b, v1.16b - -.Loop_is_equal: - sub x2, x2, #1 - cbz x2, .Loop_is_equal_done - ld1 {v1.2d}, [x0], #16 - ld1 {v2.2d}, [x1], #16 - eor v1.16b, v1.16b, v2.16b - orr v0.16b, v0.16b, v1.16b - b .Loop_is_equal - nop - -.Loop_is_equal_done: - dup v1.2d, v0.d[1] - orr v0.16b, v0.16b, v1.16b - mov x1, v0.d[0] - mov x0, #1 - cmp x1, #0 - csel x0, x0, xzr, eq - ret -.size vec_is_equal_16x,.-vec_is_equal_16x diff --git a/crypto/blst_src/build/elf/add_mod_384-x86_64.s b/crypto/blst_src/build/elf/add_mod_384-x86_64.s deleted file mode 100644 index 39eee6d1752..00000000000 --- a/crypto/blst_src/build/elf/add_mod_384-x86_64.s +++ /dev/null @@ -1,1907 +0,0 @@ -.text - -.globl add_mod_384 -.hidden add_mod_384 -.type add_mod_384,@function -.align 32 -add_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset 
%rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - call __add_mod_384 - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size add_mod_384,.-add_mod_384 - -.type __add_mod_384,@function -.align 32 -__add_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - -__add_mod_384_a_is_loaded: - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - adcq 16(%rdx),%r10 - movq %r8,%r14 - adcq 24(%rdx),%r11 - movq %r9,%r15 - adcq 32(%rdx),%r12 - movq %r10,%rax - adcq 40(%rdx),%r13 - movq %r11,%rbx - sbbq %rdx,%rdx - - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - movq %r12,%rbp - sbbq 16(%rcx),%r10 - sbbq 24(%rcx),%r11 - sbbq 32(%rcx),%r12 - movq %r13,%rsi - sbbq 40(%rcx),%r13 - sbbq $0,%rdx - - cmovcq %r14,%r8 - cmovcq %r15,%r9 - cmovcq %rax,%r10 - movq %r8,0(%rdi) - cmovcq %rbx,%r11 - movq %r9,8(%rdi) - cmovcq %rbp,%r12 - movq %r10,16(%rdi) - cmovcq %rsi,%r13 - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __add_mod_384,.-__add_mod_384 - -.globl add_mod_384x -.hidden add_mod_384x -.type add_mod_384x,@function -.align 32 -add_mod_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $24,%rsp -.cfi_adjust_cfa_offset 24 - - - movq %rsi,0(%rsp) - movq %rdx,8(%rsp) - leaq 48(%rsi),%rsi - leaq 48(%rdx),%rdx - leaq 48(%rdi),%rdi - call __add_mod_384 - - movq 0(%rsp),%rsi - movq 8(%rsp),%rdx - leaq -48(%rdi),%rdi - call __add_mod_384 - - movq 24+0(%rsp),%r15 -.cfi_restore %r15 - movq 24+8(%rsp),%r14 -.cfi_restore %r14 - movq 24+16(%rsp),%r13 -.cfi_restore %r13 - movq 24+24(%rsp),%r12 -.cfi_restore %r12 - movq 24+32(%rsp),%rbx -.cfi_restore %rbx - movq 24+40(%rsp),%rbp -.cfi_restore %rbp - leaq 24+48(%rsp),%rsp -.cfi_adjust_cfa_offset -24-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc -.size add_mod_384x,.-add_mod_384x - - -.globl rshift_mod_384 -.hidden rshift_mod_384 -.type rshift_mod_384,@function -.align 32 -rshift_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rdi -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - -.Loop_rshift_mod_384: - 
call __rshift_mod_384 - decl %edx - jnz .Loop_rshift_mod_384 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size rshift_mod_384,.-rshift_mod_384 - -.type __rshift_mod_384,@function -.align 32 -__rshift_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq $1,%rsi - movq 0(%rcx),%r14 - andq %r8,%rsi - movq 8(%rcx),%r15 - negq %rsi - movq 16(%rcx),%rax - andq %rsi,%r14 - movq 24(%rcx),%rbx - andq %rsi,%r15 - movq 32(%rcx),%rbp - andq %rsi,%rax - andq %rsi,%rbx - andq %rsi,%rbp - andq 40(%rcx),%rsi - - addq %r8,%r14 - adcq %r9,%r15 - adcq %r10,%rax - adcq %r11,%rbx - adcq %r12,%rbp - adcq %r13,%rsi - sbbq %r13,%r13 - - shrq $1,%r14 - movq %r15,%r8 - shrq $1,%r15 - movq %rax,%r9 - shrq $1,%rax - movq %rbx,%r10 - shrq $1,%rbx - movq %rbp,%r11 - shrq $1,%rbp - movq %rsi,%r12 - shrq $1,%rsi - shlq $63,%r8 - shlq $63,%r9 - orq %r14,%r8 - shlq $63,%r10 - orq %r15,%r9 - shlq $63,%r11 - orq %rax,%r10 - shlq $63,%r12 - orq %rbx,%r11 - shlq $63,%r13 - orq %rbp,%r12 - orq %rsi,%r13 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __rshift_mod_384,.-__rshift_mod_384 - -.globl div_by_2_mod_384 -.hidden div_by_2_mod_384 -.type div_by_2_mod_384,@function -.align 32 -div_by_2_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rdi -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq %rdx,%rcx - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - call __rshift_mod_384 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size div_by_2_mod_384,.-div_by_2_mod_384 - - -.globl lshift_mod_384 -.hidden lshift_mod_384 -.type lshift_mod_384,@function -.align 32 -lshift_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rdi -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - -.Loop_lshift_mod_384: - addq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - movq %r8,%r14 - adcq %r11,%r11 - movq %r9,%r15 - adcq %r12,%r12 - movq %r10,%rax - adcq 
%r13,%r13 - movq %r11,%rbx - sbbq %rdi,%rdi - - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - movq %r12,%rbp - sbbq 16(%rcx),%r10 - sbbq 24(%rcx),%r11 - sbbq 32(%rcx),%r12 - movq %r13,%rsi - sbbq 40(%rcx),%r13 - sbbq $0,%rdi - - movq (%rsp),%rdi - cmovcq %r14,%r8 - cmovcq %r15,%r9 - cmovcq %rax,%r10 - cmovcq %rbx,%r11 - cmovcq %rbp,%r12 - cmovcq %rsi,%r13 - - decl %edx - jnz .Loop_lshift_mod_384 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size lshift_mod_384,.-lshift_mod_384 - -.type __lshift_mod_384,@function -.align 32 -__lshift_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - addq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - movq %r8,%r14 - adcq %r11,%r11 - movq %r9,%r15 - adcq %r12,%r12 - movq %r10,%rax - adcq %r13,%r13 - movq %r11,%rbx - sbbq %rdx,%rdx - - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - movq %r12,%rbp - sbbq 16(%rcx),%r10 - sbbq 24(%rcx),%r11 - sbbq 32(%rcx),%r12 - movq %r13,%rsi - sbbq 40(%rcx),%r13 - sbbq $0,%rdx - - cmovcq %r14,%r8 - cmovcq %r15,%r9 - cmovcq %rax,%r10 - cmovcq %rbx,%r11 - cmovcq %rbp,%r12 - cmovcq %rsi,%r13 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __lshift_mod_384,.-__lshift_mod_384 - - -.globl mul_by_3_mod_384 -.hidden mul_by_3_mod_384 -.type mul_by_3_mod_384,@function -.align 32 -mul_by_3_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rsi -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq %rdx,%rcx - - call __lshift_mod_384 - - movq (%rsp),%rdx - call __add_mod_384_a_is_loaded - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size mul_by_3_mod_384,.-mul_by_3_mod_384 - -.globl mul_by_8_mod_384 -.hidden mul_by_8_mod_384 -.type mul_by_8_mod_384,@function -.align 32 -mul_by_8_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq %rdx,%rcx - - call __lshift_mod_384 - call __lshift_mod_384 - call __lshift_mod_384 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq 
%r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size mul_by_8_mod_384,.-mul_by_8_mod_384 - - -.globl mul_by_3_mod_384x -.hidden mul_by_3_mod_384x -.type mul_by_3_mod_384x,@function -.align 32 -mul_by_3_mod_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rsi -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq %rdx,%rcx - - call __lshift_mod_384 - - movq (%rsp),%rdx - call __add_mod_384_a_is_loaded - - movq (%rsp),%rsi - leaq 48(%rdi),%rdi - - movq 48(%rsi),%r8 - movq 56(%rsi),%r9 - movq 64(%rsi),%r10 - movq 72(%rsi),%r11 - movq 80(%rsi),%r12 - movq 88(%rsi),%r13 - - call __lshift_mod_384 - - movq $48,%rdx - addq (%rsp),%rdx - call __add_mod_384_a_is_loaded - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size mul_by_3_mod_384x,.-mul_by_3_mod_384x - -.globl mul_by_8_mod_384x -.hidden mul_by_8_mod_384x -.type mul_by_8_mod_384x,@function -.align 32 -mul_by_8_mod_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rsi -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq %rdx,%rcx - - call __lshift_mod_384 - call __lshift_mod_384 - call __lshift_mod_384 - - movq (%rsp),%rsi - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - movq 48+0(%rsi),%r8 - movq 48+8(%rsi),%r9 - movq 48+16(%rsi),%r10 - movq 48+24(%rsi),%r11 - movq 48+32(%rsi),%r12 - movq 48+40(%rsi),%r13 - - call __lshift_mod_384 - call __lshift_mod_384 - call __lshift_mod_384 - - movq %r8,48+0(%rdi) - movq %r9,48+8(%rdi) - movq %r10,48+16(%rdi) - movq %r11,48+24(%rdi) - movq %r12,48+32(%rdi) - movq %r13,48+40(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size mul_by_8_mod_384x,.-mul_by_8_mod_384x - - -.globl cneg_mod_384 -.hidden 
cneg_mod_384 -.type cneg_mod_384,@function -.align 32 -cneg_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rdx -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%rdx - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq %rdx,%r8 - movq 24(%rsi),%r11 - orq %r9,%rdx - movq 32(%rsi),%r12 - orq %r10,%rdx - movq 40(%rsi),%r13 - orq %r11,%rdx - movq $-1,%rsi - orq %r12,%rdx - orq %r13,%rdx - - movq 0(%rcx),%r14 - cmovnzq %rsi,%rdx - movq 8(%rcx),%r15 - movq 16(%rcx),%rax - andq %rdx,%r14 - movq 24(%rcx),%rbx - andq %rdx,%r15 - movq 32(%rcx),%rbp - andq %rdx,%rax - movq 40(%rcx),%rsi - andq %rdx,%rbx - movq 0(%rsp),%rcx - andq %rdx,%rbp - andq %rdx,%rsi - - subq %r8,%r14 - sbbq %r9,%r15 - sbbq %r10,%rax - sbbq %r11,%rbx - sbbq %r12,%rbp - sbbq %r13,%rsi - - orq %rcx,%rcx - - cmovzq %r8,%r14 - cmovzq %r9,%r15 - cmovzq %r10,%rax - movq %r14,0(%rdi) - cmovzq %r11,%rbx - movq %r15,8(%rdi) - cmovzq %r12,%rbp - movq %rax,16(%rdi) - cmovzq %r13,%rsi - movq %rbx,24(%rdi) - movq %rbp,32(%rdi) - movq %rsi,40(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size cneg_mod_384,.-cneg_mod_384 - - -.globl sub_mod_384 -.hidden sub_mod_384 -.type sub_mod_384,@function -.align 32 -sub_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - call __sub_mod_384 - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size sub_mod_384,.-sub_mod_384 - -.type __sub_mod_384,@function -.align 32 -__sub_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - subq 0(%rdx),%r8 - movq 0(%rcx),%r14 - sbbq 8(%rdx),%r9 - movq 8(%rcx),%r15 - sbbq 16(%rdx),%r10 - movq 16(%rcx),%rax - sbbq 24(%rdx),%r11 - movq 24(%rcx),%rbx - sbbq 32(%rdx),%r12 - movq 32(%rcx),%rbp - sbbq 40(%rdx),%r13 - movq 40(%rcx),%rsi - sbbq %rdx,%rdx - - andq %rdx,%r14 - andq %rdx,%r15 - andq %rdx,%rax - andq %rdx,%rbx - andq %rdx,%rbp - andq %rdx,%rsi - - addq %r14,%r8 - adcq %r15,%r9 - movq %r8,0(%rdi) - adcq %rax,%r10 - movq %r9,8(%rdi) - adcq %rbx,%r11 - movq %r10,16(%rdi) - adcq %rbp,%r12 - movq %r11,24(%rdi) - adcq %rsi,%r13 - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size 
__sub_mod_384,.-__sub_mod_384 - -.globl sub_mod_384x -.hidden sub_mod_384x -.type sub_mod_384x,@function -.align 32 -sub_mod_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $24,%rsp -.cfi_adjust_cfa_offset 24 - - - movq %rsi,0(%rsp) - movq %rdx,8(%rsp) - leaq 48(%rsi),%rsi - leaq 48(%rdx),%rdx - leaq 48(%rdi),%rdi - call __sub_mod_384 - - movq 0(%rsp),%rsi - movq 8(%rsp),%rdx - leaq -48(%rdi),%rdi - call __sub_mod_384 - - movq 24+0(%rsp),%r15 -.cfi_restore %r15 - movq 24+8(%rsp),%r14 -.cfi_restore %r14 - movq 24+16(%rsp),%r13 -.cfi_restore %r13 - movq 24+24(%rsp),%r12 -.cfi_restore %r12 - movq 24+32(%rsp),%rbx -.cfi_restore %rbx - movq 24+40(%rsp),%rbp -.cfi_restore %rbp - leaq 24+48(%rsp),%rsp -.cfi_adjust_cfa_offset -24-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc -.size sub_mod_384x,.-sub_mod_384x -.globl mul_by_1_plus_i_mod_384x -.hidden mul_by_1_plus_i_mod_384x -.type mul_by_1_plus_i_mod_384x,@function -.align 32 -mul_by_1_plus_i_mod_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $56,%rsp -.cfi_adjust_cfa_offset 56 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %r8,%r14 - addq 48(%rsi),%r8 - movq %r9,%r15 - adcq 56(%rsi),%r9 - movq %r10,%rax - adcq 64(%rsi),%r10 - movq %r11,%rbx - adcq 72(%rsi),%r11 - movq %r12,%rcx - adcq 80(%rsi),%r12 - movq %r13,%rbp - adcq 88(%rsi),%r13 - movq %rdi,48(%rsp) - sbbq %rdi,%rdi - - subq 48(%rsi),%r14 - sbbq 56(%rsi),%r15 - sbbq 64(%rsi),%rax - sbbq 72(%rsi),%rbx - sbbq 80(%rsi),%rcx - sbbq 88(%rsi),%rbp - sbbq %rsi,%rsi - - movq %r8,0(%rsp) - movq 0(%rdx),%r8 - movq %r9,8(%rsp) - movq 8(%rdx),%r9 - movq %r10,16(%rsp) - movq 16(%rdx),%r10 - movq %r11,24(%rsp) - movq 24(%rdx),%r11 - movq %r12,32(%rsp) - andq %rsi,%r8 - movq 32(%rdx),%r12 - movq %r13,40(%rsp) - andq %rsi,%r9 - movq 40(%rdx),%r13 - andq %rsi,%r10 - andq %rsi,%r11 - andq %rsi,%r12 - andq %rsi,%r13 - movq 48(%rsp),%rsi - - addq %r8,%r14 - movq 0(%rsp),%r8 - adcq %r9,%r15 - movq 8(%rsp),%r9 - adcq %r10,%rax - movq 16(%rsp),%r10 - adcq %r11,%rbx - movq 24(%rsp),%r11 - adcq %r12,%rcx - movq 32(%rsp),%r12 - adcq %r13,%rbp - movq 40(%rsp),%r13 - - movq %r14,0(%rsi) - movq %r8,%r14 - movq %r15,8(%rsi) - movq %rax,16(%rsi) - movq %r9,%r15 - movq %rbx,24(%rsi) - movq %rcx,32(%rsi) - movq %r10,%rax - movq %rbp,40(%rsi) - - subq 0(%rdx),%r8 - movq %r11,%rbx - sbbq 8(%rdx),%r9 - sbbq 16(%rdx),%r10 - movq %r12,%rcx - sbbq 24(%rdx),%r11 - sbbq 32(%rdx),%r12 - movq %r13,%rbp - sbbq 40(%rdx),%r13 - sbbq $0,%rdi - - cmovcq %r14,%r8 - cmovcq %r15,%r9 - cmovcq %rax,%r10 - movq %r8,48(%rsi) - cmovcq %rbx,%r11 - movq %r9,56(%rsi) - cmovcq %rcx,%r12 - movq %r10,64(%rsi) - cmovcq %rbp,%r13 - movq %r11,72(%rsi) - movq %r12,80(%rsi) - movq %r13,88(%rsi) - - movq 56+0(%rsp),%r15 
-.cfi_restore %r15 - movq 56+8(%rsp),%r14 -.cfi_restore %r14 - movq 56+16(%rsp),%r13 -.cfi_restore %r13 - movq 56+24(%rsp),%r12 -.cfi_restore %r12 - movq 56+32(%rsp),%rbx -.cfi_restore %rbx - movq 56+40(%rsp),%rbp -.cfi_restore %rbp - leaq 56+48(%rsp),%rsp -.cfi_adjust_cfa_offset -56-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc -.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x -.globl sgn0_pty_mod_384 -.hidden sgn0_pty_mod_384 -.type sgn0_pty_mod_384,@function -.align 32 -sgn0_pty_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%rcx - movq 40(%rdi),%rdx - - xorq %rax,%rax - movq %r8,%rdi - addq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq %rcx,%rcx - adcq %rdx,%rdx - adcq $0,%rax - - subq 0(%rsi),%r8 - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - sbbq 24(%rsi),%r11 - sbbq 32(%rsi),%rcx - sbbq 40(%rsi),%rdx - sbbq $0,%rax - - notq %rax - andq $1,%rdi - andq $2,%rax - orq %rdi,%rax - - - .byte 0xf3,0xc3 -.cfi_endproc -.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 - -.globl sgn0_pty_mod_384x -.hidden sgn0_pty_mod_384x -.type sgn0_pty_mod_384x,@function -.align 32 -sgn0_pty_mod_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq 48(%rdi),%r8 - movq 56(%rdi),%r9 - movq 64(%rdi),%r10 - movq 72(%rdi),%r11 - movq 80(%rdi),%rcx - movq 88(%rdi),%rdx - - movq %r8,%rbx - orq %r9,%r8 - orq %r10,%r8 - orq %r11,%r8 - orq %rcx,%r8 - orq %rdx,%r8 - - leaq 0(%rdi),%rax - xorq %rdi,%rdi - movq %rbx,%rbp - addq %rbx,%rbx - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq %rcx,%rcx - adcq %rdx,%rdx - adcq $0,%rdi - - subq 0(%rsi),%rbx - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - sbbq 24(%rsi),%r11 - sbbq 32(%rsi),%rcx - sbbq 40(%rsi),%rdx - sbbq $0,%rdi - - movq %r8,0(%rsp) - notq %rdi - andq $1,%rbp - andq $2,%rdi - orq %rbp,%rdi - - movq 0(%rax),%r8 - movq 8(%rax),%r9 - movq 16(%rax),%r10 - movq 24(%rax),%r11 - movq 32(%rax),%rcx - movq 40(%rax),%rdx - - movq %r8,%rbx - orq %r9,%r8 - orq %r10,%r8 - orq %r11,%r8 - orq %rcx,%r8 - orq %rdx,%r8 - - xorq %rax,%rax - movq %rbx,%rbp - addq %rbx,%rbx - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq %rcx,%rcx - adcq %rdx,%rdx - adcq $0,%rax - - subq 0(%rsi),%rbx - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - sbbq 24(%rsi),%r11 - sbbq 32(%rsi),%rcx - sbbq 40(%rsi),%rdx - sbbq $0,%rax - - movq 0(%rsp),%rbx - - notq %rax - - testq %r8,%r8 - cmovzq %rdi,%rbp - - testq %rbx,%rbx - cmovnzq %rdi,%rax - - andq $1,%rbp - andq $2,%rax - orq %rbp,%rax - - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc -.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x -.globl vec_select_32 -.hidden vec_select_32 -.type vec_select_32,@function -.align 32 -vec_select_32: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movd %ecx,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rsi),%xmm0 - leaq 16(%rsi),%rsi - pcmpeqd %xmm4,%xmm5 - movdqu (%rdx),%xmm1 - leaq 16(%rdx),%rdx - pcmpeqd %xmm5,%xmm4 - leaq 16(%rdi),%rdi - pand %xmm4,%xmm0 - movdqu 0+16-16(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-16(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-16(%rdi) - pand %xmm4,%xmm2 - pand %xmm5,%xmm3 - por %xmm3,%xmm2 - movdqu %xmm2,16-16(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size 
vec_select_32,.-vec_select_32 -.globl vec_select_48 -.hidden vec_select_48 -.type vec_select_48,@function -.align 32 -vec_select_48: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movd %ecx,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rsi),%xmm0 - leaq 24(%rsi),%rsi - pcmpeqd %xmm4,%xmm5 - movdqu (%rdx),%xmm1 - leaq 24(%rdx),%rdx - pcmpeqd %xmm5,%xmm4 - leaq 24(%rdi),%rdi - pand %xmm4,%xmm0 - movdqu 0+16-24(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-24(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-24(%rdi) - pand %xmm4,%xmm2 - movdqu 16+16-24(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 16+16-24(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,16-24(%rdi) - pand %xmm4,%xmm0 - pand %xmm5,%xmm1 - por %xmm1,%xmm0 - movdqu %xmm0,32-24(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size vec_select_48,.-vec_select_48 -.globl vec_select_96 -.hidden vec_select_96 -.type vec_select_96,@function -.align 32 -vec_select_96: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movd %ecx,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rsi),%xmm0 - leaq 48(%rsi),%rsi - pcmpeqd %xmm4,%xmm5 - movdqu (%rdx),%xmm1 - leaq 48(%rdx),%rdx - pcmpeqd %xmm5,%xmm4 - leaq 48(%rdi),%rdi - pand %xmm4,%xmm0 - movdqu 0+16-48(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-48(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-48(%rdi) - pand %xmm4,%xmm2 - movdqu 16+16-48(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 16+16-48(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,16-48(%rdi) - pand %xmm4,%xmm0 - movdqu 32+16-48(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 32+16-48(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,32-48(%rdi) - pand %xmm4,%xmm2 - movdqu 48+16-48(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 48+16-48(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,48-48(%rdi) - pand %xmm4,%xmm0 - movdqu 64+16-48(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 64+16-48(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,64-48(%rdi) - pand %xmm4,%xmm2 - pand %xmm5,%xmm3 - por %xmm3,%xmm2 - movdqu %xmm2,80-48(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size vec_select_96,.-vec_select_96 -.globl vec_select_192 -.hidden vec_select_192 -.type vec_select_192,@function -.align 32 -vec_select_192: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movd %ecx,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rsi),%xmm0 - leaq 96(%rsi),%rsi - pcmpeqd %xmm4,%xmm5 - movdqu (%rdx),%xmm1 - leaq 96(%rdx),%rdx - pcmpeqd %xmm5,%xmm4 - leaq 96(%rdi),%rdi - pand %xmm4,%xmm0 - movdqu 0+16-96(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-96(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-96(%rdi) - pand %xmm4,%xmm2 - movdqu 16+16-96(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 16+16-96(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,16-96(%rdi) - pand %xmm4,%xmm0 - movdqu 32+16-96(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 32+16-96(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,32-96(%rdi) - pand %xmm4,%xmm2 - movdqu 48+16-96(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 48+16-96(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,48-96(%rdi) - pand %xmm4,%xmm0 - movdqu 64+16-96(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 64+16-96(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,64-96(%rdi) - pand %xmm4,%xmm2 - movdqu 80+16-96(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 80+16-96(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,80-96(%rdi) - pand %xmm4,%xmm0 - movdqu 96+16-96(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 96+16-96(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,96-96(%rdi) - pand %xmm4,%xmm2 - movdqu 112+16-96(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 112+16-96(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu 
%xmm2,112-96(%rdi) - pand %xmm4,%xmm0 - movdqu 128+16-96(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 128+16-96(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,128-96(%rdi) - pand %xmm4,%xmm2 - movdqu 144+16-96(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 144+16-96(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,144-96(%rdi) - pand %xmm4,%xmm0 - movdqu 160+16-96(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 160+16-96(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,160-96(%rdi) - pand %xmm4,%xmm2 - pand %xmm5,%xmm3 - por %xmm3,%xmm2 - movdqu %xmm2,176-96(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size vec_select_192,.-vec_select_192 -.globl vec_select_144 -.hidden vec_select_144 -.type vec_select_144,@function -.align 32 -vec_select_144: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movd %ecx,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rsi),%xmm0 - leaq 72(%rsi),%rsi - pcmpeqd %xmm4,%xmm5 - movdqu (%rdx),%xmm1 - leaq 72(%rdx),%rdx - pcmpeqd %xmm5,%xmm4 - leaq 72(%rdi),%rdi - pand %xmm4,%xmm0 - movdqu 0+16-72(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-72(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-72(%rdi) - pand %xmm4,%xmm2 - movdqu 16+16-72(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 16+16-72(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,16-72(%rdi) - pand %xmm4,%xmm0 - movdqu 32+16-72(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 32+16-72(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,32-72(%rdi) - pand %xmm4,%xmm2 - movdqu 48+16-72(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 48+16-72(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,48-72(%rdi) - pand %xmm4,%xmm0 - movdqu 64+16-72(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 64+16-72(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,64-72(%rdi) - pand %xmm4,%xmm2 - movdqu 80+16-72(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 80+16-72(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,80-72(%rdi) - pand %xmm4,%xmm0 - movdqu 96+16-72(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 96+16-72(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,96-72(%rdi) - pand %xmm4,%xmm2 - movdqu 112+16-72(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 112+16-72(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,112-72(%rdi) - pand %xmm4,%xmm0 - pand %xmm5,%xmm1 - por %xmm1,%xmm0 - movdqu %xmm0,128-72(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size vec_select_144,.-vec_select_144 -.globl vec_select_288 -.hidden vec_select_288 -.type vec_select_288,@function -.align 32 -vec_select_288: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movd %ecx,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rsi),%xmm0 - leaq 144(%rsi),%rsi - pcmpeqd %xmm4,%xmm5 - movdqu (%rdx),%xmm1 - leaq 144(%rdx),%rdx - pcmpeqd %xmm5,%xmm4 - leaq 144(%rdi),%rdi - pand %xmm4,%xmm0 - movdqu 0+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-144(%rdi) - pand %xmm4,%xmm2 - movdqu 16+16-144(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 16+16-144(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,16-144(%rdi) - pand %xmm4,%xmm0 - movdqu 32+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 32+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,32-144(%rdi) - pand %xmm4,%xmm2 - movdqu 48+16-144(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 48+16-144(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,48-144(%rdi) - pand %xmm4,%xmm0 - movdqu 64+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 64+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,64-144(%rdi) - pand %xmm4,%xmm2 - movdqu 80+16-144(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 80+16-144(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,80-144(%rdi) - pand %xmm4,%xmm0 - movdqu 
96+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 96+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,96-144(%rdi) - pand %xmm4,%xmm2 - movdqu 112+16-144(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 112+16-144(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,112-144(%rdi) - pand %xmm4,%xmm0 - movdqu 128+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 128+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,128-144(%rdi) - pand %xmm4,%xmm2 - movdqu 144+16-144(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 144+16-144(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,144-144(%rdi) - pand %xmm4,%xmm0 - movdqu 160+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 160+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,160-144(%rdi) - pand %xmm4,%xmm2 - movdqu 176+16-144(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 176+16-144(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,176-144(%rdi) - pand %xmm4,%xmm0 - movdqu 192+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 192+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,192-144(%rdi) - pand %xmm4,%xmm2 - movdqu 208+16-144(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 208+16-144(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,208-144(%rdi) - pand %xmm4,%xmm0 - movdqu 224+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 224+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,224-144(%rdi) - pand %xmm4,%xmm2 - movdqu 240+16-144(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 240+16-144(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,240-144(%rdi) - pand %xmm4,%xmm0 - movdqu 256+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 256+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,256-144(%rdi) - pand %xmm4,%xmm2 - pand %xmm5,%xmm3 - por %xmm3,%xmm2 - movdqu %xmm2,272-144(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size vec_select_288,.-vec_select_288 -.globl vec_prefetch -.hidden vec_prefetch -.type vec_prefetch,@function -.align 32 -vec_prefetch: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - leaq -1(%rdi,%rsi,1),%rsi - movq $64,%rax - xorq %r8,%r8 - prefetchnta (%rdi) - leaq (%rdi,%rax,1),%rdi - cmpq %rsi,%rdi - cmovaq %rsi,%rdi - cmovaq %r8,%rax - prefetchnta (%rdi) - leaq (%rdi,%rax,1),%rdi - cmpq %rsi,%rdi - cmovaq %rsi,%rdi - cmovaq %r8,%rax - prefetchnta (%rdi) - leaq (%rdi,%rax,1),%rdi - cmpq %rsi,%rdi - cmovaq %rsi,%rdi - cmovaq %r8,%rax - prefetchnta (%rdi) - leaq (%rdi,%rax,1),%rdi - cmpq %rsi,%rdi - cmovaq %rsi,%rdi - cmovaq %r8,%rax - prefetchnta (%rdi) - leaq (%rdi,%rax,1),%rdi - cmpq %rsi,%rdi - cmovaq %rsi,%rdi - cmovaq %r8,%rax - prefetchnta (%rdi) - leaq (%rdi,%rax,1),%rdi - cmpq %rsi,%rdi - cmovaq %rsi,%rdi - prefetchnta (%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size vec_prefetch,.-vec_prefetch -.globl vec_is_zero_16x -.hidden vec_is_zero_16x -.type vec_is_zero_16x,@function -.align 32 -vec_is_zero_16x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - shrl $4,%esi - movdqu (%rdi),%xmm0 - leaq 16(%rdi),%rdi - -.Loop_is_zero: - decl %esi - jz .Loop_is_zero_done - movdqu (%rdi),%xmm1 - leaq 16(%rdi),%rdi - por %xmm1,%xmm0 - jmp .Loop_is_zero - -.Loop_is_zero_done: - pshufd $0x4e,%xmm0,%xmm1 - por %xmm1,%xmm0 -.byte 102,72,15,126,192 - incl %esi - testq %rax,%rax - cmovnzl %esi,%eax - xorl $1,%eax - .byte 0xf3,0xc3 -.cfi_endproc -.size vec_is_zero_16x,.-vec_is_zero_16x -.globl vec_is_equal_16x -.hidden vec_is_equal_16x -.type vec_is_equal_16x,@function -.align 32 -vec_is_equal_16x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - shrl $4,%edx - movdqu (%rdi),%xmm0 - movdqu (%rsi),%xmm1 - subq %rdi,%rsi - leaq 16(%rdi),%rdi - pxor %xmm1,%xmm0 - -.Loop_is_equal: - decl %edx - jz 
.Loop_is_equal_done
-	movdqu (%rdi),%xmm1
-	movdqu (%rdi,%rsi,1),%xmm2
-	leaq 16(%rdi),%rdi
-	pxor %xmm2,%xmm1
-	por %xmm1,%xmm0
-	jmp .Loop_is_equal
-
-.Loop_is_equal_done:
-	pshufd $0x4e,%xmm0,%xmm1
-	por %xmm1,%xmm0
-.byte 102,72,15,126,192
-	incl %edx
-	testq %rax,%rax
-	cmovnzl %edx,%eax
-	xorl $1,%eax
-	.byte 0xf3,0xc3
-.cfi_endproc
-.size vec_is_equal_16x,.-vec_is_equal_16x
-
-.section .note.GNU-stack,"",@progbits
-.section .note.gnu.property,"a",@note
-	.long 4,2f-1f,5
-	.byte 0x47,0x4E,0x55,0
-1:	.long 0xc0000002,4,3
-.align 8
-2:
diff --git a/crypto/blst_src/build/elf/add_mod_384x384-x86_64.s b/crypto/blst_src/build/elf/add_mod_384x384-x86_64.s
deleted file mode 100644
index 084f3d8262d..00000000000
--- a/crypto/blst_src/build/elf/add_mod_384x384-x86_64.s
+++ /dev/null
@@ -1,252 +0,0 @@
-.text
-
-.type __add_mod_384x384,@function
-.align 32
-__add_mod_384x384:
-.cfi_startproc
-	.byte 0xf3,0x0f,0x1e,0xfa
-
-	movq 0(%rsi),%r8
-	movq 8(%rsi),%r9
-	movq 16(%rsi),%r10
-	movq 24(%rsi),%r11
-	movq 32(%rsi),%r12
-	movq 40(%rsi),%r13
-	movq 48(%rsi),%r14
-
-	addq 0(%rdx),%r8
-	movq 56(%rsi),%r15
-	adcq 8(%rdx),%r9
-	movq 64(%rsi),%rax
-	adcq 16(%rdx),%r10
-	movq 72(%rsi),%rbx
-	adcq 24(%rdx),%r11
-	movq 80(%rsi),%rbp
-	adcq 32(%rdx),%r12
-	movq 88(%rsi),%rsi
-	adcq 40(%rdx),%r13
-	movq %r8,0(%rdi)
-	adcq 48(%rdx),%r14
-	movq %r9,8(%rdi)
-	adcq 56(%rdx),%r15
-	movq %r10,16(%rdi)
-	adcq 64(%rdx),%rax
-	movq %r12,32(%rdi)
-	movq %r14,%r8
-	adcq 72(%rdx),%rbx
-	movq %r11,24(%rdi)
-	movq %r15,%r9
-	adcq 80(%rdx),%rbp
-	movq %r13,40(%rdi)
-	movq %rax,%r10
-	adcq 88(%rdx),%rsi
-	movq %rbx,%r11
-	sbbq %rdx,%rdx
-
-	subq 0(%rcx),%r14
-	sbbq 8(%rcx),%r15
-	movq %rbp,%r12
-	sbbq 16(%rcx),%rax
-	sbbq 24(%rcx),%rbx
-	sbbq 32(%rcx),%rbp
-	movq %rsi,%r13
-	sbbq 40(%rcx),%rsi
-	sbbq $0,%rdx
-
-	cmovcq %r8,%r14
-	cmovcq %r9,%r15
-	cmovcq %r10,%rax
-	movq %r14,48(%rdi)
-	cmovcq %r11,%rbx
-	movq %r15,56(%rdi)
-	cmovcq %r12,%rbp
-	movq %rax,64(%rdi)
-	cmovcq %r13,%rsi
-	movq %rbx,72(%rdi)
-	movq %rbp,80(%rdi)
-	movq %rsi,88(%rdi)
-
-	.byte 0xf3,0xc3
-.cfi_endproc
-.size __add_mod_384x384,.-__add_mod_384x384
-
-.type __sub_mod_384x384,@function
-.align 32
-__sub_mod_384x384:
-.cfi_startproc
-	.byte 0xf3,0x0f,0x1e,0xfa
-
-	movq 0(%rsi),%r8
-	movq 8(%rsi),%r9
-	movq 16(%rsi),%r10
-	movq 24(%rsi),%r11
-	movq 32(%rsi),%r12
-	movq 40(%rsi),%r13
-	movq 48(%rsi),%r14
-
-	subq 0(%rdx),%r8
-	movq 56(%rsi),%r15
-	sbbq 8(%rdx),%r9
-	movq 64(%rsi),%rax
-	sbbq 16(%rdx),%r10
-	movq 72(%rsi),%rbx
-	sbbq 24(%rdx),%r11
-	movq 80(%rsi),%rbp
-	sbbq 32(%rdx),%r12
-	movq 88(%rsi),%rsi
-	sbbq 40(%rdx),%r13
-	movq %r8,0(%rdi)
-	sbbq 48(%rdx),%r14
-	movq 0(%rcx),%r8
-	movq %r9,8(%rdi)
-	sbbq 56(%rdx),%r15
-	movq 8(%rcx),%r9
-	movq %r10,16(%rdi)
-	sbbq 64(%rdx),%rax
-	movq 16(%rcx),%r10
-	movq %r11,24(%rdi)
-	sbbq 72(%rdx),%rbx
-	movq 24(%rcx),%r11
-	movq %r12,32(%rdi)
-	sbbq 80(%rdx),%rbp
-	movq 32(%rcx),%r12
-	movq %r13,40(%rdi)
-	sbbq 88(%rdx),%rsi
-	movq 40(%rcx),%r13
-	sbbq %rdx,%rdx
-
-	andq %rdx,%r8
-	andq %rdx,%r9
-	andq %rdx,%r10
-	andq %rdx,%r11
-	andq %rdx,%r12
-	andq %rdx,%r13
-
-	addq %r8,%r14
-	adcq %r9,%r15
-	movq %r14,48(%rdi)
-	adcq %r10,%rax
-	movq %r15,56(%rdi)
-	adcq %r11,%rbx
-	movq %rax,64(%rdi)
-	adcq %r12,%rbp
-	movq %rbx,72(%rdi)
-	adcq %r13,%rsi
-	movq %rbp,80(%rdi)
-	movq %rsi,88(%rdi)
-
-	.byte 0xf3,0xc3
-.cfi_endproc
-.size __sub_mod_384x384,.-__sub_mod_384x384
-
-.globl add_mod_384x384
-.hidden add_mod_384x384
-.type add_mod_384x384,@function
-.align 32
-add_mod_384x384:
-.cfi_startproc
-	.byte 0xf3,0x0f,0x1e,0xfa
-
-
-	pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-16
-	pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-24
-	pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
-	pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
-	pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
-	pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
-	subq $8,%rsp
-.cfi_adjust_cfa_offset 8
-
-
-	call __add_mod_384x384
-
-	movq 8(%rsp),%r15
-.cfi_restore %r15
-	movq 16(%rsp),%r14
-.cfi_restore %r14
-	movq 24(%rsp),%r13
-.cfi_restore %r13
-	movq 32(%rsp),%r12
-.cfi_restore %r12
-	movq 40(%rsp),%rbx
-.cfi_restore %rbx
-	movq 48(%rsp),%rbp
-.cfi_restore %rbp
-	leaq 56(%rsp),%rsp
-.cfi_adjust_cfa_offset -56
-
-	.byte 0xf3,0xc3
-.cfi_endproc
-.size add_mod_384x384,.-add_mod_384x384
-
-.globl sub_mod_384x384
-.hidden sub_mod_384x384
-.type sub_mod_384x384,@function
-.align 32
-sub_mod_384x384:
-.cfi_startproc
-	.byte 0xf3,0x0f,0x1e,0xfa
-
-
-	pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-16
-	pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-24
-	pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
-	pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
-	pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
-	pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
-	subq $8,%rsp
-.cfi_adjust_cfa_offset 8
-
-
-	call __sub_mod_384x384
-
-	movq 8(%rsp),%r15
-.cfi_restore %r15
-	movq 16(%rsp),%r14
-.cfi_restore %r14
-	movq 24(%rsp),%r13
-.cfi_restore %r13
-	movq 32(%rsp),%r12
-.cfi_restore %r12
-	movq 40(%rsp),%rbx
-.cfi_restore %rbx
-	movq 48(%rsp),%rbp
-.cfi_restore %rbp
-	leaq 56(%rsp),%rsp
-.cfi_adjust_cfa_offset -56
-
-	.byte 0xf3,0xc3
-.cfi_endproc
-.size sub_mod_384x384,.-sub_mod_384x384
-
-.section .note.GNU-stack,"",@progbits
-.section .note.gnu.property,"a",@note
-	.long 4,2f-1f,5
-	.byte 0x47,0x4E,0x55,0
-1:	.long 0xc0000002,4,3
-.align 8
-2:
diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S
deleted file mode 100644
index 0c5ac5b882d..00000000000
--- a/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S
+++ /dev/null
@@ -1,785 +0,0 @@
-.text
-
-.globl ct_inverse_mod_256
-.hidden ct_inverse_mod_256
-.type ct_inverse_mod_256, %function
-.align 5
-ct_inverse_mod_256:
-	.inst 0xd503233f
-	stp x29, x30, [sp,#-80]!
-	add x29, sp, #0
-	stp x19, x20, [sp,#16]
-	stp x21, x22, [sp,#32]
-	stp x23, x24, [sp,#48]
-	stp x25, x26, [sp,#64]
-	sub sp, sp, #1040
-
-	ldp x4, x5, [x1,#8*0]
-	ldp x6, x7, [x1,#8*2]
-
-	add x1, sp, #16+511 // find closest 512-byte-aligned spot
-	and x1, x1, #-512 // in the frame...
- str x0, [sp] - - ldp x8, x9, [x2,#8*0] - ldp x10, x11, [x2,#8*2] - - stp x4, x5, [x1,#8*0] // copy input to |a| - stp x6, x7, [x1,#8*2] - stp x8, x9, [x1,#8*4] // copy modulus to |b| - stp x10, x11, [x1,#8*6] - - ////////////////////////////////////////// first iteration - bl .Lab_approximation_31_256_loaded - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - str x12,[x0,#8*8] // initialize |u| with |f0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to dst |b| - bl __smul_256_n_shift_by_31 - str x12, [x0,#8*9] // initialize |v| with |f1| - - ////////////////////////////////////////// second iteration - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - ldr x8, [x1,#8*8] // |u| - ldr x9, [x1,#8*13] // |v| - madd x4, x16, x8, xzr // |u|*|f0| - madd x4, x17, x9, x4 // |v|*|g0| - str x4, [x0,#8*4] - asr x5, x4, #63 // sign extension - stp x5, x5, [x0,#8*5] - stp x5, x5, [x0,#8*7] - - madd x4, x12, x8, xzr // |u|*|f1| - madd x4, x13, x9, x4 // |v|*|g1| - str x4, [x0,#8*9] - asr x5, x4, #63 // sign extension - stp x5, x5, [x0,#8*10] - stp x5, x5, [x0,#8*12] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst 
|a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc 
x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - ////////////////////////////////////////// two[!] 
last iterations - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #47 // 31 + 512 % 31 - //bl __ab_approximation_62_256 // |a| and |b| are exact, - ldr x7, [x1,#8*0] // just load - ldr x11, [x1,#8*4] - bl __inner_loop_62_256 - - mov x16, x14 - mov x17, x15 - ldr x0, [sp] // original out_ptr - bl __smul_256x63 - bl __smul_512x63_tail - ldr x30, [x29,#8] - - smulh x20, x7, x17 // figure out top-most limb - ldp x8, x9, [x3,#8*0] - adc x23, x23, x25 - ldp x10, x11, [x3,#8*2] - - add x20, x20, x23 // x20 is 1, 0 or -1 - asr x19, x20, #63 // sign as mask - - and x23, x8, x19 // add mod<<256 conditionally - and x24, x9, x19 - adds x4, x4, x23 - and x25, x10, x19 - adcs x5, x5, x24 - and x26, x11, x19 - adcs x6, x6, x25 - adcs x7, x22, x26 - adc x20, x20, xzr // x20 is 1, 0 or -1 - - neg x19, x20 - orr x20, x20, x19 // excess bit or sign as mask - asr x19, x19, #63 // excess bit as mask - - and x8, x8, x20 // mask |mod| - and x9, x9, x20 - and x10, x10, x20 - and x11, x11, x20 - - eor x8, x8, x19 // conditionally negate |mod| - eor x9, x9, x19 - adds x8, x8, x19, lsr#63 - eor x10, x10, x19 - adcs x9, x9, xzr - eor x11, x11, x19 - adcs x10, x10, xzr - adc x11, x11, xzr - - adds x4, x4, x8 // final adjustment for |mod|<<256 - adcs x5, x5, x9 - adcs x6, x6, x10 - stp x4, x5, [x0,#8*4] - adc x7, x7, x11 - stp x6, x7, [x0,#8*6] - - add sp, sp, #1040 - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldr x29, [sp],#80 - .inst 0xd50323bf - ret -.size ct_inverse_mod_256,.-ct_inverse_mod_256 - -//////////////////////////////////////////////////////////////////////// -.type __smul_256x63, %function -.align 5 -__smul_256x63: - ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) - asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) - ldp x6, x7, [x1,#8*2+64] - eor x16, x16, x14 // conditionally negate |f_| (or |g_|) - ldr x22, [x1,#8*4+64] - - eor x4, x4, x14 // conditionally negate |u| (or |v|) - sub x16, x16, x14 - eor x5, x5, x14 - adds x4, x4, x14, lsr#63 - eor x6, x6, x14 - adcs x5, x5, xzr - eor x7, x7, x14 - adcs x6, x6, xzr - eor x22, x22, x14 - umulh x19, x4, x16 - adcs x7, x7, xzr - umulh x20, x5, x16 - adcs x22, x22, xzr - umulh x21, x6, x16 - mul x4, x4, x16 - cmp x16, #0 - mul x5, x5, x16 - csel x22, x22, xzr, ne - mul x6, x6, x16 - adds x5, x5, x19 - mul x24, x7, x16 - adcs x6, x6, x20 - adcs x24, x24, x21 - adc x26, xzr, xzr - ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) - asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) - ldp x10, x11, [x1,#8*2+104] - eor x17, x17, x14 // conditionally negate |f_| (or |g_|) - ldr x23, [x1,#8*4+104] - - eor x8, x8, x14 // conditionally negate |u| (or |v|) - sub x17, x17, x14 - eor x9, x9, x14 - adds x8, x8, x14, lsr#63 - eor x10, x10, x14 - adcs x9, x9, xzr - eor x11, x11, x14 - adcs x10, x10, xzr - eor x23, x23, x14 - umulh x19, x8, x17 - adcs x11, x11, xzr - umulh x20, x9, x17 - adcs x23, x23, xzr - umulh x21, x10, x17 - adc x15, xzr, xzr // used in __smul_512x63_tail - mul x8, x8, x17 - cmp x17, #0 - mul x9, x9, x17 - csel x23, x23, xzr, ne - mul x10, x10, x17 - adds x9, x9, x19 - mul x25, x11, x17 - adcs x10, x10, x20 - adcs x25, x25, x21 - adc x26, x26, xzr - - adds x4, x4, x8 - adcs x5, x5, x9 - adcs x6, x6, x10 - stp x4, x5, [x0,#8*0] - adcs x24, x24, x25 - stp x6, x24, [x0,#8*2] - - ret -.size __smul_256x63,.-__smul_256x63 - -.type __smul_512x63_tail, %function -.align 5 -__smul_512x63_tail: - umulh x24, x7, x16 - ldp x5, x6, [x1,#8*18] // load rest of |v| - adc x26, x26, xzr - ldr x7, 
[x1,#8*20] - and x22, x22, x16 - - umulh x11, x11, x17 // resume |v|*|g1| chain - - sub x24, x24, x22 // tie up |u|*|f1| chain - asr x25, x24, #63 - - eor x5, x5, x14 // conditionally negate rest of |v| - eor x6, x6, x14 - adds x5, x5, x15 - eor x7, x7, x14 - adcs x6, x6, xzr - umulh x19, x23, x17 - adc x7, x7, xzr - umulh x20, x5, x17 - add x11, x11, x26 - umulh x21, x6, x17 - - mul x4, x23, x17 - mul x5, x5, x17 - adds x4, x4, x11 - mul x6, x6, x17 - adcs x5, x5, x19 - mul x22, x7, x17 - adcs x6, x6, x20 - adcs x22, x22, x21 - adc x23, xzr, xzr // used in the final step - - adds x4, x4, x24 - adcs x5, x5, x25 - adcs x6, x6, x25 - stp x4, x5, [x0,#8*4] - adcs x22, x22, x25 // carry is used in the final step - stp x6, x22, [x0,#8*6] - - ret -.size __smul_512x63_tail,.-__smul_512x63_tail - -.type __smul_256_n_shift_by_31, %function -.align 5 -__smul_256_n_shift_by_31: - ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) - asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) - ldp x6, x7, [x1,#8*2+0] - eor x25, x12, x24 // conditionally negate |f0| (or |g0|) - - eor x4, x4, x24 // conditionally negate |a| (or |b|) - sub x25, x25, x24 - eor x5, x5, x24 - adds x4, x4, x24, lsr#63 - eor x6, x6, x24 - adcs x5, x5, xzr - eor x7, x7, x24 - umulh x19, x4, x25 - adcs x6, x6, xzr - umulh x20, x5, x25 - adc x7, x7, xzr - umulh x21, x6, x25 - and x24, x24, x25 - umulh x22, x7, x25 - neg x24, x24 - - mul x4, x4, x25 - mul x5, x5, x25 - mul x6, x6, x25 - adds x5, x5, x19 - mul x7, x7, x25 - adcs x6, x6, x20 - adcs x7, x7, x21 - adc x22, x22, x24 - ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) - asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) - ldp x10, x11, [x1,#8*2+32] - eor x25, x13, x24 // conditionally negate |f0| (or |g0|) - - eor x8, x8, x24 // conditionally negate |a| (or |b|) - sub x25, x25, x24 - eor x9, x9, x24 - adds x8, x8, x24, lsr#63 - eor x10, x10, x24 - adcs x9, x9, xzr - eor x11, x11, x24 - umulh x19, x8, x25 - adcs x10, x10, xzr - umulh x20, x9, x25 - adc x11, x11, xzr - umulh x21, x10, x25 - and x24, x24, x25 - umulh x23, x11, x25 - neg x24, x24 - - mul x8, x8, x25 - mul x9, x9, x25 - mul x10, x10, x25 - adds x9, x9, x19 - mul x11, x11, x25 - adcs x10, x10, x20 - adcs x11, x11, x21 - adc x23, x23, x24 - adds x4, x4, x8 - adcs x5, x5, x9 - adcs x6, x6, x10 - adcs x7, x7, x11 - adc x8, x22, x23 - - extr x4, x5, x4, #31 - extr x5, x6, x5, #31 - extr x6, x7, x6, #31 - asr x23, x8, #63 // result's sign as mask - extr x7, x8, x7, #31 - - eor x4, x4, x23 // ensure the result is positive - eor x5, x5, x23 - adds x4, x4, x23, lsr#63 - eor x6, x6, x23 - adcs x5, x5, xzr - eor x7, x7, x23 - adcs x6, x6, xzr - stp x4, x5, [x0,#8*0] - adc x7, x7, xzr - stp x6, x7, [x0,#8*2] - - eor x12, x12, x23 // adjust |f/g| accordingly - eor x13, x13, x23 - sub x12, x12, x23 - sub x13, x13, x23 - - ret -.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 -.type __ab_approximation_31_256, %function -.align 4 -__ab_approximation_31_256: - ldp x6, x7, [x1,#8*2] - ldp x10, x11, [x1,#8*6] - ldp x4, x5, [x1,#8*0] - ldp x8, x9, [x1,#8*4] - -.Lab_approximation_31_256_loaded: - orr x19, x7, x11 // check top-most limbs, ... - cmp x19, #0 - csel x7, x7, x6, ne - csel x11, x11, x10, ne - csel x6, x6, x5, ne - orr x19, x7, x11 // and ones before top-most, ... - csel x10, x10, x9, ne - - cmp x19, #0 - csel x7, x7, x6, ne - csel x11, x11, x10, ne - csel x6, x6, x4, ne - orr x19, x7, x11 // and one more, ... 
- csel x10, x10, x8, ne - - clz x19, x19 - cmp x19, #64 - csel x19, x19, xzr, ne - csel x7, x7, x6, ne - csel x11, x11, x10, ne - neg x20, x19 - - lslv x7, x7, x19 // align high limbs to the left - lslv x11, x11, x19 - lsrv x6, x6, x20 - lsrv x10, x10, x20 - and x6, x6, x20, asr#6 - and x10, x10, x20, asr#6 - orr x7, x7, x6 - orr x11, x11, x10 - - bfxil x7, x4, #0, #31 - bfxil x11, x8, #0, #31 - - b __inner_loop_31_256 - ret -.size __ab_approximation_31_256,.-__ab_approximation_31_256 - -.type __inner_loop_31_256, %function -.align 4 -__inner_loop_31_256: - mov x2, #31 - mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 - mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 - mov x23,#0x7FFFFFFF7FFFFFFF - -.Loop_31_256: - sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting - sub x2, x2, #1 - and x19, x11, x22 - sub x20, x11, x7 // |b_|-|a_| - subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) - mov x19, x15 - csel x11, x11, x7, hs // |b_| = |a_| - csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel x15, x15, x13, hs // exchange |fg0| and |fg1| - csel x13, x13, x19, hs - lsr x7, x7, #1 - and x19, x15, x22 - and x20, x23, x22 - sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - add x15, x15, x15 // |f1|<<=1 - add x13, x13, x20 - sub x15, x15, x23 - cbnz x2, .Loop_31_256 - - mov x23, #0x7FFFFFFF - ubfx x12, x13, #0, #32 - ubfx x13, x13, #32, #32 - ubfx x14, x15, #0, #32 - ubfx x15, x15, #32, #32 - sub x12, x12, x23 // remove bias - sub x13, x13, x23 - sub x14, x14, x23 - sub x15, x15, x23 - - ret -.size __inner_loop_31_256,.-__inner_loop_31_256 - -.type __inner_loop_62_256, %function -.align 4 -__inner_loop_62_256: - mov x12, #1 // |f0|=1 - mov x13, #0 // |g0|=0 - mov x14, #0 // |f1|=0 - mov x15, #1 // |g1|=1 - -.Loop_62_256: - sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting - sub x2, x2, #1 - and x19, x11, x22 - sub x20, x11, x7 // |b_|-|a_| - subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) - mov x19, x12 - csel x11, x11, x7, hs // |b_| = |a_| - csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - mov x20, x13 - csel x12, x12, x14, hs // exchange |f0| and |f1| - csel x14, x14, x19, hs - csel x13, x13, x15, hs // exchange |g0| and |g1| - csel x15, x15, x20, hs - lsr x7, x7, #1 - and x19, x14, x22 - and x20, x15, x22 - add x14, x14, x14 // |f1|<<=1 - add x15, x15, x15 // |g1|<<=1 - sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
-	cbnz x2, .Loop_62_256
-
-	ret
-.size __inner_loop_62_256,.-__inner_loop_62_256
diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s
deleted file mode 100644
index 0f0ca4923d7..00000000000
--- a/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s
+++ /dev/null
@@ -1,1186 +0,0 @@
-.text
-
-.globl ct_inverse_mod_256
-.hidden ct_inverse_mod_256
-.type ct_inverse_mod_256,@function
-.align 32
-ct_inverse_mod_256:
-.cfi_startproc
-	.byte 0xf3,0x0f,0x1e,0xfa
-
-
-	pushq %rbp
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbp,-16
-	pushq %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset %rbx,-24
-	pushq %r12
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r12,-32
-	pushq %r13
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r13,-40
-	pushq %r14
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r14,-48
-	pushq %r15
-.cfi_adjust_cfa_offset 8
-.cfi_offset %r15,-56
-	subq $1072,%rsp
-.cfi_adjust_cfa_offset 1072
-
-
-	leaq 48+511(%rsp),%rax
-	andq $-512,%rax
-	movq %rdi,32(%rsp)
-	movq %rcx,40(%rsp)
-
-	movq 0(%rsi),%r8
-	movq 8(%rsi),%r9
-	movq 16(%rsi),%r10
-	movq 24(%rsi),%r11
-
-	movq 0(%rdx),%r12
-	movq 8(%rdx),%r13
-	movq 16(%rdx),%r14
-	movq 24(%rdx),%r15
-
-	movq %r8,0(%rax)
-	movq %r9,8(%rax)
-	movq %r10,16(%rax)
-	movq %r11,24(%rax)
-
-	movq %r12,32(%rax)
-	movq %r13,40(%rax)
-	movq %r14,48(%rax)
-	movq %r15,56(%rax)
-	movq %rax,%rsi
-
-
-	movl $31,%edx
-	call __ab_approximation_31_256
-
-
-	movq %r12,16(%rsp)
-	movq %r13,24(%rsp)
-
-	movq $256,%rdi
-	xorq %rsi,%rdi
-	call __smulq_256_n_shift_by_31
-
-
-	movq %rdx,64(%rdi)
-
-	movq 16(%rsp),%rdx
-	movq 24(%rsp),%rcx
-	leaq 32(%rdi),%rdi
-	call __smulq_256_n_shift_by_31
-
-
-	movq %rdx,72(%rdi)
-
-
-	xorq $256,%rsi
-	movl $31,%edx
-	call __ab_approximation_31_256
-
-
-	movq %r12,16(%rsp)
-	movq %r13,24(%rsp)
-
-	movq $256,%rdi
-	xorq %rsi,%rdi
-	call __smulq_256_n_shift_by_31
-	movq %rdx,0(%rsp)
-	movq %rcx,8(%rsp)
-
-	movq 16(%rsp),%rdx
-	movq 24(%rsp),%rcx
-	leaq 32(%rdi),%rdi
-	call __smulq_256_n_shift_by_31
-
-
-
-	movq 64(%rsi),%r8
-	movq 104(%rsi),%r12
-	movq %r8,%r9
-	imulq 0(%rsp),%r8
-	movq %r12,%r13
-	imulq 8(%rsp),%r12
-	addq %r12,%r8
-	movq %r8,32(%rdi)
-	sarq $63,%r8
-	movq %r8,40(%rdi)
-	movq %r8,48(%rdi)
-	movq %r8,56(%rdi)
-	movq %r8,64(%rdi)
-	leaq 64(%rsi),%rsi
-
-	imulq %rdx,%r9
-	imulq %rcx,%r13
-	addq %r13,%r9
-	movq %r9,72(%rdi)
-	sarq $63,%r9
-	movq %r9,80(%rdi)
-	movq %r9,88(%rdi)
-	movq %r9,96(%rdi)
-	movq %r9,104(%rdi)
-	xorq $256+64,%rsi
-	movl $31,%edx
-	call __ab_approximation_31_256
-
-
-	movq %r12,16(%rsp)
-	movq %r13,24(%rsp)
-
-	movq $256,%rdi
-	xorq %rsi,%rdi
-	call __smulq_256_n_shift_by_31
-	movq %rdx,0(%rsp)
-	movq %rcx,8(%rsp)
-
-	movq 16(%rsp),%rdx
-	movq 24(%rsp),%rcx
-	leaq 32(%rdi),%rdi
-	call __smulq_256_n_shift_by_31
-	movq %rdx,16(%rsp)
-	movq %rcx,24(%rsp)
-
-	movq 0(%rsp),%rdx
-	movq 8(%rsp),%rcx
-	leaq 64(%rsi),%rsi
-	leaq 32(%rdi),%rdi
-	call __smulq_256x63
-
-	movq 16(%rsp),%rdx
-	movq 24(%rsp),%rcx
-	leaq
40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - sarq $63,%rbp - movq %rbp,40(%rdi) - movq %rbp,48(%rdi) - movq %rbp,56(%rdi) - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - 
movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - - xorq $256+64,%rsi - movl $47,%edx - - movq 0(%rsi),%r8 - - movq 32(%rsi),%r10 - - call __inner_loop_62_256 - - - - - - - - leaq 64(%rsi),%rsi - - - - - - movq %r12,%rdx - movq %r13,%rcx - movq 32(%rsp),%rdi - call __smulq_512x63 - adcq %rbp,%rdx - - movq 40(%rsp),%rsi - movq %rdx,%rax - sarq $63,%rdx - - movq %rdx,%r8 - movq %rdx,%r9 - andq 0(%rsi),%r8 - movq %rdx,%r10 - andq 8(%rsi),%r9 - andq 16(%rsi),%r10 - andq 24(%rsi),%rdx - - addq %r8,%r12 - adcq %r9,%r13 - adcq %r10,%r14 - adcq %rdx,%r15 - adcq $0,%rax - - movq %rax,%rdx - negq %rax - orq %rax,%rdx - sarq $63,%rax - - movq %rdx,%r8 - movq %rdx,%r9 - andq 0(%rsi),%r8 - movq %rdx,%r10 - andq 8(%rsi),%r9 - andq 16(%rsi),%r10 - andq 24(%rsi),%rdx - - xorq %rax,%r8 - xorq %rcx,%rcx - xorq %rax,%r9 - subq %rax,%rcx - xorq %rax,%r10 - xorq %rax,%rdx - addq %rcx,%r8 - adcq $0,%r9 - adcq $0,%r10 - 
adcq $0,%rdx - - addq %r8,%r12 - adcq %r9,%r13 - adcq %r10,%r14 - adcq %rdx,%r15 - - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - leaq 1072(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -1072-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc -.size ct_inverse_mod_256,.-ct_inverse_mod_256 -.type __smulq_512x63,@function -.align 32 -__smulq_512x63: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%rbp - - movq %rdx,%rbx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbx - addq %rax,%rbx - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%rbp - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%rbp - - mulq %rbx - movq %rax,0(%rdi) - movq %r9,%rax - movq %rdx,%r9 - mulq %rbx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %r9,8(%rdi) - movq %rdx,%r10 - mulq %rbx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %r10,16(%rdi) - movq %rdx,%r11 - andq %rbx,%rbp - negq %rbp - mulq %rbx - addq %rax,%r11 - adcq %rdx,%rbp - movq %r11,24(%rdi) - - movq 40(%rsi),%r8 - movq 48(%rsi),%r9 - movq 56(%rsi),%r10 - movq 64(%rsi),%r11 - movq 72(%rsi),%r12 - movq 80(%rsi),%r13 - movq 88(%rsi),%r14 - movq 96(%rsi),%r15 - - movq %rcx,%rdx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rcx - addq %rax,%rcx - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - xorq %rdx,%r14 - xorq %rdx,%r15 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - - mulq %rcx - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rcx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rcx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rcx - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rcx - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - mulq %rcx - addq %rax,%r13 - movq %r14,%rax - adcq $0,%rdx - movq %rdx,%r14 - mulq %rcx - addq %rax,%r14 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r15 - imulq %rcx - addq %rax,%r15 - adcq $0,%rdx - - movq %rbp,%rbx - sarq $63,%rbp - - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq %rbx,%r12 - adcq %rbp,%r13 - adcq %rbp,%r14 - adcq %rbp,%r15 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __smulq_512x63,.-__smulq_512x63 - -.type __smulq_256x63,@function -.align 32 -__smulq_256x63: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0+0(%rsi),%r8 - movq 0+8(%rsi),%r9 - movq 0+16(%rsi),%r10 - movq 0+24(%rsi),%r11 - movq 0+32(%rsi),%rbp - - movq %rdx,%rbx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbx - addq %rax,%rbx - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%rbp - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%rbp - - mulq %rbx - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbx - 
addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - andq %rbx,%rbp - negq %rbp - mulq %rbx - addq %rax,%r11 - adcq %rdx,%rbp - movq %rcx,%rdx - movq 40+0(%rsi),%r12 - movq 40+8(%rsi),%r13 - movq 40+16(%rsi),%r14 - movq 40+24(%rsi),%r15 - movq 40+32(%rsi),%rcx - - movq %rdx,%rbx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbx - addq %rax,%rbx - - xorq %rdx,%r12 - xorq %rdx,%r13 - xorq %rdx,%r14 - xorq %rdx,%r15 - xorq %rdx,%rcx - addq %r12,%rax - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - adcq $0,%rcx - - mulq %rbx - movq %rax,%r12 - movq %r13,%rax - movq %rdx,%r13 - mulq %rbx - addq %rax,%r13 - movq %r14,%rax - adcq $0,%rdx - movq %rdx,%r14 - mulq %rbx - addq %rax,%r14 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r15 - andq %rbx,%rcx - negq %rcx - mulq %rbx - addq %rax,%r15 - adcq %rdx,%rcx - addq %r12,%r8 - adcq %r13,%r9 - adcq %r14,%r10 - adcq %r15,%r11 - adcq %rcx,%rbp - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %rbp,32(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __smulq_256x63,.-__smulq_256x63 -.type __smulq_256_n_shift_by_31,@function -.align 32 -__smulq_256_n_shift_by_31: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq %rdx,0(%rdi) - movq %rcx,8(%rdi) - movq %rdx,%rbp - movq 0+0(%rsi),%r8 - movq 0+8(%rsi),%r9 - movq 0+16(%rsi),%r10 - movq 0+24(%rsi),%r11 - - movq %rbp,%rbx - sarq $63,%rbp - xorq %rax,%rax - subq %rbp,%rax - - xorq %rbp,%rbx - addq %rax,%rbx - - xorq %rbp,%r8 - xorq %rbp,%r9 - xorq %rbp,%r10 - xorq %rbp,%r11 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - - mulq %rbx - movq %rax,%r8 - movq %r9,%rax - andq %rbx,%rbp - negq %rbp - movq %rdx,%r9 - mulq %rbx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbx - addq %rax,%r11 - adcq %rdx,%rbp - movq 32+0(%rsi),%r12 - movq 32+8(%rsi),%r13 - movq 32+16(%rsi),%r14 - movq 32+24(%rsi),%r15 - - movq %rcx,%rbx - sarq $63,%rcx - xorq %rax,%rax - subq %rcx,%rax - - xorq %rcx,%rbx - addq %rax,%rbx - - xorq %rcx,%r12 - xorq %rcx,%r13 - xorq %rcx,%r14 - xorq %rcx,%r15 - addq %r12,%rax - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - - mulq %rbx - movq %rax,%r12 - movq %r13,%rax - andq %rbx,%rcx - negq %rcx - movq %rdx,%r13 - mulq %rbx - addq %rax,%r13 - movq %r14,%rax - adcq $0,%rdx - movq %rdx,%r14 - mulq %rbx - addq %rax,%r14 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r15 - mulq %rbx - addq %rax,%r15 - adcq %rdx,%rcx - addq %r12,%r8 - adcq %r13,%r9 - adcq %r14,%r10 - adcq %r15,%r11 - adcq %rcx,%rbp - - movq 0(%rdi),%rdx - movq 8(%rdi),%rcx - - shrdq $31,%r9,%r8 - shrdq $31,%r10,%r9 - shrdq $31,%r11,%r10 - shrdq $31,%rbp,%r11 - - sarq $63,%rbp - xorq %rax,%rax - subq %rbp,%rax - - xorq %rbp,%r8 - xorq %rbp,%r9 - xorq %rbp,%r10 - xorq %rbp,%r11 - addq %rax,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - xorq %rbp,%rdx - xorq %rbp,%rcx - addq %rax,%rdx - addq %rax,%rcx - - .byte 0xf3,0xc3 -.cfi_endproc -.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31 -.type __ab_approximation_31_256,@function -.align 32 -__ab_approximation_31_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 24(%rsi),%r9 - movq 56(%rsi),%r11 - movq 16(%rsi),%rbx - movq 48(%rsi),%rbp - movq 8(%rsi),%r8 - movq 40(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - movq 0(%rsi),%r8 - cmovzq %r10,%rbp - movq 
32(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - cmovzq %r10,%rbp - - movq %r9,%rax - orq %r11,%rax - bsrq %rax,%rcx - leaq 1(%rcx),%rcx - cmovzq %r8,%r9 - cmovzq %r10,%r11 - cmovzq %rax,%rcx - negq %rcx - - - shldq %cl,%rbx,%r9 - shldq %cl,%rbp,%r11 - - movl $0x7FFFFFFF,%eax - andq %rax,%r8 - andq %rax,%r10 - notq %rax - andq %rax,%r9 - andq %rax,%r11 - orq %r9,%r8 - orq %r11,%r10 - - jmp __inner_loop_31_256 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ab_approximation_31_256,.-__ab_approximation_31_256 -.type __inner_loop_31_256,@function -.align 32 -__inner_loop_31_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq $0x7FFFFFFF80000000,%rcx - movq $0x800000007FFFFFFF,%r13 - movq $0x7FFFFFFF7FFFFFFF,%r15 - -.Loop_31_256: - cmpq %r10,%r8 - movq %r8,%rax - movq %r10,%rbx - movq %rcx,%rbp - movq %r13,%r14 - cmovbq %r10,%r8 - cmovbq %rax,%r10 - cmovbq %r13,%rcx - cmovbq %rbp,%r13 - - subq %r10,%r8 - subq %r13,%rcx - addq %r15,%rcx - - testq $1,%rax - cmovzq %rax,%r8 - cmovzq %rbx,%r10 - cmovzq %rbp,%rcx - cmovzq %r14,%r13 - - shrq $1,%r8 - addq %r13,%r13 - subq %r15,%r13 - subl $1,%edx - jnz .Loop_31_256 - - shrq $32,%r15 - movl %ecx,%edx - movl %r13d,%r12d - shrq $32,%rcx - shrq $32,%r13 - subq %r15,%rdx - subq %r15,%rcx - subq %r15,%r12 - subq %r15,%r13 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __inner_loop_31_256,.-__inner_loop_31_256 - -.type __inner_loop_62_256,@function -.align 32 -__inner_loop_62_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movl %edx,%r15d - movq $1,%rdx - xorq %rcx,%rcx - xorq %r12,%r12 - movq %rdx,%r13 - movq %rdx,%r14 - -.Loop_62_256: - xorq %rax,%rax - testq %r14,%r8 - movq %r10,%rbx - cmovnzq %r10,%rax - subq %r8,%rbx - movq %r8,%rbp - subq %rax,%r8 - cmovcq %rbx,%r8 - cmovcq %rbp,%r10 - movq %rdx,%rax - cmovcq %r12,%rdx - cmovcq %rax,%r12 - movq %rcx,%rbx - cmovcq %r13,%rcx - cmovcq %rbx,%r13 - xorq %rax,%rax - xorq %rbx,%rbx - shrq $1,%r8 - testq %r14,%rbp - cmovnzq %r12,%rax - cmovnzq %r13,%rbx - addq %r12,%r12 - addq %r13,%r13 - subq %rax,%rdx - subq %rbx,%rcx - subl $1,%r15d - jnz .Loop_62_256 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __inner_loop_62_256,.-__inner_loop_62_256 - -.section .note.GNU-stack,"",@progbits -.section .note.gnu.property,"a",@note - .long 4,2f-1f,5 - .byte 0x47,0x4E,0x55,0 -1: .long 0xc0000002,4,3 -.align 8 -2: diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S deleted file mode 100644 index 99bb9def767..00000000000 --- a/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S +++ /dev/null @@ -1,718 +0,0 @@ -.text - -.globl ct_inverse_mod_383 -.hidden ct_inverse_mod_383 -.type ct_inverse_mod_383, %function -.align 5 -ct_inverse_mod_383: - .inst 0xd503233f - stp x29, x30, [sp,#-128]! - add x29, sp, #0 - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - stp x27, x28, [sp,#80] - sub sp, sp, #1040 - - ldp x22, x4, [x1,#8*0] - ldp x5, x6, [x1,#8*2] - ldp x7, x8, [x1,#8*4] - - add x1, sp, #16+511 // find closest 512-byte-aligned spot - and x1, x1, #-512 // in the frame... 
- stp x0, x3, [sp] - - ldp x9, x10, [x2,#8*0] - ldp x11, x12, [x2,#8*2] - ldp x13, x14, [x2,#8*4] - - stp x22, x4, [x1,#8*0] // copy input to |a| - stp x5, x6, [x1,#8*2] - stp x7, x8, [x1,#8*4] - stp x9, x10, [x1,#8*6] // copy modulus to |b| - stp x11, x12, [x1,#8*8] - stp x13, x14, [x1,#8*10] - - ////////////////////////////////////////// first iteration - mov x2, #62 - bl .Lab_approximation_62_loaded - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - str x15,[x0,#8*12] // initialize |u| with |f0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to dst |b| - bl __smul_383_n_shift_by_62 - str x15, [x0,#8*12] // initialize |v| with |f1| - - ////////////////////////////////////////// second iteration - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - ldr x7, [x1,#8*12] // |u| - ldr x8, [x1,#8*18] // |v| - mul x3, x20, x7 // |u|*|f0| - smulh x4, x20, x7 - mul x5, x21, x8 // |v|*|g0| - smulh x6, x21, x8 - adds x3, x3, x5 - adc x4, x4, x6 - stp x3, x4, [x0,#8*6] - asr x5, x4, #63 // sign extension - stp x5, x5, [x0,#8*8] - stp x5, x5, [x0,#8*10] - - mul x3, x15, x7 // |u|*|f1| - smulh x4, x15, x7 - mul x5, x16, x8 // |v|*|g1| - smulh x6, x16, x8 - adds x3, x3, x5 - adc x4, x4, x6 - stp x3, x4, [x0,#8*12] - asr x5, x4, #63 // sign extension - stp x5, x5, [x0,#8*14] - stp x5, x5, [x0,#8*16] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // 
corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - asr x27, x27, #63 // sign extension - stp x27, x27, [x0,#8*6] - stp x27, x27, [x0,#8*8] - stp x27, x27, [x0,#8*10] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - ////////////////////////////////////////// iteration before last - eor x1, x1, #256 // flip-flop src 
|a|b|u|v| - mov x2, #62 - //bl __ab_approximation_62 // |a| and |b| are exact, - ldp x3, x8, [x1,#8*0] // just load - ldp x9, x14, [x1,#8*6] - bl __inner_loop_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - str x3, [x0,#8*0] - str x9, [x0,#8*6] - - mov x20, x15 // exact |f0| - mov x21, x16 // exact |g0| - mov x15, x17 - mov x16, x19 - add x0, x0, #8*12 // pointer to dst |u| - bl __smul_383x63 - - mov x20, x15 // exact |f1| - mov x21, x16 // exact |g1| - add x0, x0, #8*6 // pointer to dst |v| - bl __smul_383x63 - bl __smul_767x63_tail - - ////////////////////////////////////////// last iteration - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #22 // 766 % 62 - //bl __ab_approximation_62 // |a| and |b| are exact, - ldr x3, [x1,#8*0] // just load - eor x8, x8, x8 - ldr x9, [x1,#8*6] - eor x14, x14, x14 - bl __inner_loop_62 - - mov x20, x17 - mov x21, x19 - ldp x0, x15, [sp] // original out_ptr and n_ptr - bl __smul_383x63 - bl __smul_767x63_tail - ldr x30, [x29,#8] - - asr x22, x8, #63 // sign as mask - ldp x9, x10, [x15,#8*0] - ldp x11, x12, [x15,#8*2] - ldp x13, x14, [x15,#8*4] - - and x9, x9, x22 // add mod<<384 conditionally - and x10, x10, x22 - adds x3, x3, x9 - and x11, x11, x22 - adcs x4, x4, x10 - and x12, x12, x22 - adcs x5, x5, x11 - and x13, x13, x22 - adcs x6, x6, x12 - and x14, x14, x22 - stp x3, x4, [x0,#8*6] - adcs x7, x7, x13 - stp x5, x6, [x0,#8*8] - adc x8, x8, x14 - stp x7, x8, [x0,#8*10] - - add sp, sp, #1040 - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldp x27, x28, [x29,#80] - ldr x29, [sp],#128 - .inst 0xd50323bf - ret -.size ct_inverse_mod_383,.-ct_inverse_mod_383 - -//////////////////////////////////////////////////////////////////////// -// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
-.type __smul_383x63, %function -.align 5 -__smul_383x63: - ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) - asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) - ldp x5, x6, [x1,#8*2+96] - eor x20, x20, x17 // conditionally negate |f_| (or |g_|) - ldp x7, x8, [x1,#8*4+96] - - eor x3, x3, x17 // conditionally negate |u| (or |v|) - sub x20, x20, x17 - eor x4, x4, x17 - adds x3, x3, x17, lsr#63 - eor x5, x5, x17 - adcs x4, x4, xzr - eor x6, x6, x17 - adcs x5, x5, xzr - eor x7, x7, x17 - adcs x6, x6, xzr - umulh x22, x3, x20 - eor x8, x8, x17 - umulh x23, x4, x20 - adcs x7, x7, xzr - umulh x24, x5, x20 - adcs x8, x8, xzr - umulh x25, x6, x20 - umulh x26, x7, x20 - mul x3, x3, x20 - mul x4, x4, x20 - mul x5, x5, x20 - adds x4, x4, x22 - mul x6, x6, x20 - adcs x5, x5, x23 - mul x7, x7, x20 - adcs x6, x6, x24 - mul x27,x8, x20 - adcs x7, x7, x25 - adcs x27,x27,x26 - adc x2, xzr, xzr - ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) - asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) - ldp x11, x12, [x1,#8*2+144] - eor x21, x21, x17 // conditionally negate |f_| (or |g_|) - ldp x13, x14, [x1,#8*4+144] - - eor x9, x9, x17 // conditionally negate |u| (or |v|) - sub x21, x21, x17 - eor x10, x10, x17 - adds x9, x9, x17, lsr#63 - eor x11, x11, x17 - adcs x10, x10, xzr - eor x12, x12, x17 - adcs x11, x11, xzr - eor x13, x13, x17 - adcs x12, x12, xzr - umulh x22, x9, x21 - eor x14, x14, x17 - umulh x23, x10, x21 - adcs x13, x13, xzr - umulh x24, x11, x21 - adcs x14, x14, xzr - umulh x25, x12, x21 - adc x19, xzr, xzr // used in __smul_767x63_tail - umulh x26, x13, x21 - mul x9, x9, x21 - mul x10, x10, x21 - mul x11, x11, x21 - adds x10, x10, x22 - mul x12, x12, x21 - adcs x11, x11, x23 - mul x13, x13, x21 - adcs x12, x12, x24 - mul x28,x14, x21 - adcs x13, x13, x25 - adcs x28,x28,x26 - adc x2, x2, xzr - - adds x3, x3, x9 - adcs x4, x4, x10 - adcs x5, x5, x11 - adcs x6, x6, x12 - stp x3, x4, [x0,#8*0] - adcs x7, x7, x13 - stp x5, x6, [x0,#8*2] - adcs x27, x27, x28 - stp x7, x27, [x0,#8*4] - adc x28, x2, xzr // used in __smul_767x63_tail - - ret -.size __smul_383x63,.-__smul_383x63 - -.type __smul_767x63_tail, %function -.align 5 -__smul_767x63_tail: - smulh x27, x8, x20 - ldp x3, x4, [x1,#8*24] // load rest of |v| - umulh x14,x14, x21 - ldp x5, x6, [x1,#8*26] - ldp x7, x8, [x1,#8*28] - - eor x3, x3, x17 // conditionally negate rest of |v| - eor x4, x4, x17 - eor x5, x5, x17 - adds x3, x3, x19 - eor x6, x6, x17 - adcs x4, x4, xzr - eor x7, x7, x17 - adcs x5, x5, xzr - eor x8, x8, x17 - adcs x6, x6, xzr - umulh x22, x3, x21 - adcs x7, x7, xzr - umulh x23, x4, x21 - adc x8, x8, xzr - - umulh x24, x5, x21 - add x14, x14, x28 - umulh x25, x6, x21 - asr x28, x27, #63 - umulh x26, x7, x21 - mul x3, x3, x21 - mul x4, x4, x21 - mul x5, x5, x21 - adds x3, x3, x14 - mul x6, x6, x21 - adcs x4, x4, x22 - mul x7, x7, x21 - adcs x5, x5, x23 - mul x8, x8, x21 - adcs x6, x6, x24 - adcs x7, x7, x25 - adc x8, x8, x26 - - adds x3, x3, x27 - adcs x4, x4, x28 - adcs x5, x5, x28 - adcs x6, x6, x28 - stp x3, x4, [x0,#8*6] - adcs x7, x7, x28 - stp x5, x6, [x0,#8*8] - adc x8, x8, x28 - stp x7, x8, [x0,#8*10] - - ret -.size __smul_767x63_tail,.-__smul_767x63_tail - -.type __smul_383_n_shift_by_62, %function -.align 5 -__smul_383_n_shift_by_62: - ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) - asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) - ldp x5, x6, [x1,#8*2+0] - eor x2, x15, x28 // conditionally negate |f0| (or |g0|) - ldp x7, x8, [x1,#8*4+0] - - eor x3, x3, x28 // conditionally negate |a| (or |b|) - sub x2, x2, x28 - 
eor x4, x4, x28 - adds x3, x3, x28, lsr#63 - eor x5, x5, x28 - adcs x4, x4, xzr - eor x6, x6, x28 - adcs x5, x5, xzr - eor x7, x7, x28 - umulh x22, x3, x2 - adcs x6, x6, xzr - umulh x23, x4, x2 - eor x8, x8, x28 - umulh x24, x5, x2 - adcs x7, x7, xzr - umulh x25, x6, x2 - adc x8, x8, xzr - - umulh x26, x7, x2 - smulh x27, x8, x2 - mul x3, x3, x2 - mul x4, x4, x2 - mul x5, x5, x2 - adds x4, x4, x22 - mul x6, x6, x2 - adcs x5, x5, x23 - mul x7, x7, x2 - adcs x6, x6, x24 - mul x8, x8, x2 - adcs x7, x7, x25 - adcs x8, x8 ,x26 - adc x27, x27, xzr - ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) - asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) - ldp x11, x12, [x1,#8*2+48] - eor x2, x16, x28 // conditionally negate |f0| (or |g0|) - ldp x13, x14, [x1,#8*4+48] - - eor x9, x9, x28 // conditionally negate |a| (or |b|) - sub x2, x2, x28 - eor x10, x10, x28 - adds x9, x9, x28, lsr#63 - eor x11, x11, x28 - adcs x10, x10, xzr - eor x12, x12, x28 - adcs x11, x11, xzr - eor x13, x13, x28 - umulh x22, x9, x2 - adcs x12, x12, xzr - umulh x23, x10, x2 - eor x14, x14, x28 - umulh x24, x11, x2 - adcs x13, x13, xzr - umulh x25, x12, x2 - adc x14, x14, xzr - - umulh x26, x13, x2 - smulh x28, x14, x2 - mul x9, x9, x2 - mul x10, x10, x2 - mul x11, x11, x2 - adds x10, x10, x22 - mul x12, x12, x2 - adcs x11, x11, x23 - mul x13, x13, x2 - adcs x12, x12, x24 - mul x14, x14, x2 - adcs x13, x13, x25 - adcs x14, x14 ,x26 - adc x28, x28, xzr - adds x3, x3, x9 - adcs x4, x4, x10 - adcs x5, x5, x11 - adcs x6, x6, x12 - adcs x7, x7, x13 - adcs x8, x8, x14 - adc x9, x27, x28 - - extr x3, x4, x3, #62 - extr x4, x5, x4, #62 - extr x5, x6, x5, #62 - asr x28, x9, #63 - extr x6, x7, x6, #62 - extr x7, x8, x7, #62 - extr x8, x9, x8, #62 - - eor x3, x3, x28 - eor x4, x4, x28 - adds x3, x3, x28, lsr#63 - eor x5, x5, x28 - adcs x4, x4, xzr - eor x6, x6, x28 - adcs x5, x5, xzr - eor x7, x7, x28 - adcs x6, x6, xzr - eor x8, x8, x28 - stp x3, x4, [x0,#8*0] - adcs x7, x7, xzr - stp x5, x6, [x0,#8*2] - adc x8, x8, xzr - stp x7, x8, [x0,#8*4] - - eor x15, x15, x28 - eor x16, x16, x28 - sub x15, x15, x28 - sub x16, x16, x28 - - ret -.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62 -.type __ab_approximation_62, %function -.align 4 -__ab_approximation_62: - ldp x7, x8, [x1,#8*4] - ldp x13, x14, [x1,#8*10] - ldp x5, x6, [x1,#8*2] - ldp x11, x12, [x1,#8*8] - -.Lab_approximation_62_loaded: - orr x22, x8, x14 // check top-most limbs, ... - cmp x22, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x6, ne - orr x22, x8, x14 // ... ones before top-most, ... - csel x13, x13, x12, ne - - ldp x3, x4, [x1,#8*0] - ldp x9, x10, [x1,#8*6] - - cmp x22, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x5, ne - orr x22, x8, x14 // ... and ones before that ... 
- csel x13, x13, x11, ne - - cmp x22, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x4, ne - orr x22, x8, x14 - csel x13, x13, x10, ne - - clz x22, x22 - cmp x22, #64 - csel x22, x22, xzr, ne - csel x8, x8, x7, ne - csel x14, x14, x13, ne - neg x23, x22 - - lslv x8, x8, x22 // align high limbs to the left - lslv x14, x14, x22 - lsrv x7, x7, x23 - lsrv x13, x13, x23 - and x7, x7, x23, asr#6 - and x13, x13, x23, asr#6 - orr x8, x8, x7 - orr x14, x14, x13 - - b __inner_loop_62 - ret -.size __ab_approximation_62,.-__ab_approximation_62 -.type __inner_loop_62, %function -.align 4 -__inner_loop_62: - mov x15, #1 // |f0|=1 - mov x16, #0 // |g0|=0 - mov x17, #0 // |f1|=0 - mov x19, #1 // |g1|=1 - -.Loop_62: - sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting - sub x2, x2, #1 - subs x24, x9, x3 // |b_|-|a_| - and x22, x9, x28 - sbc x25, x14, x8 - and x23, x14, x28 - subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) - mov x22, x15 - sbcs x27, x8, x23 - mov x23, x16 - csel x9, x9, x3, hs // |b_| = |a_| - csel x14, x14, x8, hs - csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel x8, x27, x25, hs - csel x15, x15, x17, hs // exchange |f0| and |f1| - csel x17, x17, x22, hs - csel x16, x16, x19, hs // exchange |g0| and |g1| - csel x19, x19, x23, hs - extr x3, x8, x3, #1 - lsr x8, x8, #1 - and x22, x17, x28 - and x23, x19, x28 - add x17, x17, x17 // |f1|<<=1 - add x19, x19, x19 // |g1|<<=1 - sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) - cbnz x2, .Loop_62 - - ret -.size __inner_loop_62,.-__inner_loop_62 diff --git a/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S deleted file mode 100644 index 07dd99a8af3..00000000000 --- a/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S +++ /dev/null @@ -1,325 +0,0 @@ -.text - -.globl ct_is_square_mod_384 -.hidden ct_is_square_mod_384 -.type ct_is_square_mod_384, %function -.align 5 -ct_is_square_mod_384: - .inst 0xd503233f - stp x29, x30, [sp,#-128]! - add x29, sp, #0 - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - stp x27, x28, [sp,#80] - sub sp, sp, #512 - - ldp x3, x4, [x0,#8*0] // load input - ldp x5, x6, [x0,#8*2] - ldp x7, x8, [x0,#8*4] - - add x0, sp, #255 // find closest 256-byte-aligned spot - and x0, x0, #-256 // in the frame... 
- - ldp x9, x10, [x1,#8*0] // load modulus - ldp x11, x12, [x1,#8*2] - ldp x13, x14, [x1,#8*4] - - stp x3, x4, [x0,#8*6] // copy input to |a| - stp x5, x6, [x0,#8*8] - stp x7, x8, [x0,#8*10] - stp x9, x10, [x0,#8*0] // copy modulus to |b| - stp x11, x12, [x0,#8*2] - stp x13, x14, [x0,#8*4] - - eor x2, x2, x2 // init the .Legendre symbol - mov x15, #24 // 24 is 768/30-1 - b .Loop_is_square - -.align 4 -.Loop_is_square: - bl __ab_approximation_30 - sub x15, x15, #1 - - eor x1, x0, #128 // pointer to dst |b| - bl __smul_384_n_shift_by_30 - - mov x19, x16 // |f0| - mov x20, x17 // |g0| - add x1, x1, #8*6 // pointer to dst |a| - bl __smul_384_n_shift_by_30 - - ldp x9, x10, [x1,#-8*6] - eor x0, x0, #128 // flip-flop src |a|b| - and x27, x27, x9 // if |a| was negative, - add x2, x2, x27, lsr#1 // adjust |L| - - cbnz x15, .Loop_is_square - - ////////////////////////////////////////// last iteration - //bl __ab_approximation_30 // |a| and |b| are exact, - //ldr x8, [x0,#8*6] // and loaded - //ldr x14, [x0,#8*0] - mov x15, #48 // 48 is 768%30 + 30 - bl __inner_loop_48 - ldr x30, [x29,#8] - - and x0, x2, #1 - eor x0, x0, #1 - - add sp, sp, #512 - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldp x27, x28, [x29,#80] - ldr x29, [sp],#128 - .inst 0xd50323bf - ret -.size ct_is_square_mod_384,.-ct_is_square_mod_384 - -.type __smul_384_n_shift_by_30, %function -.align 5 -__smul_384_n_shift_by_30: - ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) - asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) - ldp x5, x6, [x0,#8*2+0] - eor x20, x20, x27 // conditionally negate |g1| (or |f1|) - ldp x7, x8, [x0,#8*4+0] - - eor x3, x3, x27 // conditionally negate |b| (or |a|) - sub x20, x20, x27 - eor x4, x4, x27 - adds x3, x3, x27, lsr#63 - eor x5, x5, x27 - adcs x4, x4, xzr - eor x6, x6, x27 - adcs x5, x5, xzr - eor x7, x7, x27 - umulh x21, x3, x20 - adcs x6, x6, xzr - umulh x22, x4, x20 - eor x8, x8, x27 - umulh x23, x5, x20 - adcs x7, x7, xzr - umulh x24, x6, x20 - adc x8, x8, xzr - - umulh x25, x7, x20 - and x28, x20, x27 - umulh x26, x8, x20 - neg x28, x28 - mul x3, x3, x20 - mul x4, x4, x20 - mul x5, x5, x20 - adds x4, x4, x21 - mul x6, x6, x20 - adcs x5, x5, x22 - mul x7, x7, x20 - adcs x6, x6, x23 - mul x8, x8, x20 - adcs x7, x7, x24 - adcs x8, x8 ,x25 - adc x26, x26, x28 - ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) - asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) - ldp x11, x12, [x0,#8*2+48] - eor x19, x19, x27 // conditionally negate |g1| (or |f1|) - ldp x13, x14, [x0,#8*4+48] - - eor x9, x9, x27 // conditionally negate |b| (or |a|) - sub x19, x19, x27 - eor x10, x10, x27 - adds x9, x9, x27, lsr#63 - eor x11, x11, x27 - adcs x10, x10, xzr - eor x12, x12, x27 - adcs x11, x11, xzr - eor x13, x13, x27 - umulh x21, x9, x19 - adcs x12, x12, xzr - umulh x22, x10, x19 - eor x14, x14, x27 - umulh x23, x11, x19 - adcs x13, x13, xzr - umulh x24, x12, x19 - adc x14, x14, xzr - - umulh x25, x13, x19 - and x28, x19, x27 - umulh x27, x14, x19 - neg x28, x28 - mul x9, x9, x19 - mul x10, x10, x19 - mul x11, x11, x19 - adds x10, x10, x21 - mul x12, x12, x19 - adcs x11, x11, x22 - mul x13, x13, x19 - adcs x12, x12, x23 - mul x14, x14, x19 - adcs x13, x13, x24 - adcs x14, x14 ,x25 - adc x27, x27, x28 - adds x3, x3, x9 - adcs x4, x4, x10 - adcs x5, x5, x11 - adcs x6, x6, x12 - adcs x7, x7, x13 - adcs x8, x8, x14 - adc x9, x26, x27 - - extr x3, x4, x3, #30 - extr x4, x5, x4, #30 - extr x5, x6, x5, #30 - asr x27, x9, #63 - extr x6, x7, x6, #30 - extr x7, x8, x7, 
#30 - extr x8, x9, x8, #30 - - eor x3, x3, x27 - eor x4, x4, x27 - adds x3, x3, x27, lsr#63 - eor x5, x5, x27 - adcs x4, x4, xzr - eor x6, x6, x27 - adcs x5, x5, xzr - eor x7, x7, x27 - adcs x6, x6, xzr - eor x8, x8, x27 - stp x3, x4, [x1,#8*0] - adcs x7, x7, xzr - stp x5, x6, [x1,#8*2] - adc x8, x8, xzr - stp x7, x8, [x1,#8*4] - - ret -.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 -.type __ab_approximation_30, %function -.align 4 -__ab_approximation_30: - ldp x13, x14, [x0,#8*4] // |a| is still in registers - ldp x11, x12, [x0,#8*2] - - orr x21, x8, x14 // check top-most limbs, ... - cmp x21, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x6, ne - orr x21, x8, x14 // ... ones before top-most, ... - csel x13, x13, x12, ne - - cmp x21, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x5, ne - orr x21, x8, x14 // ... and ones before that ... - csel x13, x13, x11, ne - - cmp x21, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x4, ne - orr x21, x8, x14 // and one more, ... - csel x13, x13, x10, ne - - cmp x21, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x3, ne - orr x21, x8, x14 - csel x13, x13, x9, ne - - clz x21, x21 - cmp x21, #64 - csel x21, x21, xzr, ne - csel x8, x8, x7, ne - csel x14, x14, x13, ne - neg x22, x21 - - lslv x8, x8, x21 // align high limbs to the left - lslv x14, x14, x21 - lsrv x7, x7, x22 - lsrv x13, x13, x22 - and x7, x7, x22, asr#6 - and x13, x13, x22, asr#6 - orr x8, x8, x7 - orr x14, x14, x13 - - bfxil x8, x3, #0, #32 - bfxil x14, x9, #0, #32 - - b __inner_loop_30 - ret -.size __ab_approximation_30,.-__ab_approximation_30 - -.type __inner_loop_30, %function -.align 4 -__inner_loop_30: - mov x28, #30 - mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 - mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 - mov x27,#0x7FFFFFFF7FFFFFFF - -.Loop_30: - sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting - and x25, x8, x14 - sub x28, x28, #1 - and x21, x14, x24 - - sub x22, x14, x8 // |b_|-|a_| - subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) - add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 - mov x21, x20 - csel x14, x14, x8, hs // |b_| = |a_| - csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel x20, x20, x17, hs // exchange |fg0| and |fg1| - csel x17, x17, x21, hs - csel x2, x2, x25, hs - lsr x8, x8, #1 - and x21, x20, x24 - and x22, x27, x24 - add x23, x14, #2 - sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - add x20, x20, x20 // |f1|<<=1 - add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 - add x17, x17, x22 - sub x20, x20, x27 - - cbnz x28, .Loop_30 - - mov x27, #0x7FFFFFFF - ubfx x16, x17, #0, #32 - ubfx x17, x17, #32, #32 - ubfx x19, x20, #0, #32 - ubfx x20, x20, #32, #32 - sub x16, x16, x27 // remove the bias - sub x17, x17, x27 - sub x19, x19, x27 - sub x20, x20, x27 - - ret -.size __inner_loop_30,.-__inner_loop_30 -.type __inner_loop_48, %function -.align 4 -__inner_loop_48: -.Loop_48: - sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting - and x25, x3, x9 - sub x15, x15, #1 - and x21, x9, x24 - sub x22, x9, x3 // |b_|-|a_| - subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) - add x25, x2, x25, lsr#1 - csel x9, x9, x3, hs // |b_| = |a_| - csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel x2, x2, x25, hs - add x23, x9, #2 - lsr x3, x3, #1 - add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 - - cbnz x15, .Loop_48 - - ret -.size 
__inner_loop_48,.-__inner_loop_48 diff --git a/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s deleted file mode 100644 index bf610fa7440..00000000000 --- a/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s +++ /dev/null @@ -1,480 +0,0 @@ -.text - -.globl ct_is_square_mod_384 -.hidden ct_is_square_mod_384 -.type ct_is_square_mod_384,@function -.align 32 -ct_is_square_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $536,%rsp -.cfi_adjust_cfa_offset 536 - - - leaq 24+255(%rsp),%rax - andq $-256,%rax - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%r12 - movq 40(%rdi),%r13 - - movq 0(%rsi),%r14 - movq 8(%rsi),%r15 - movq 16(%rsi),%rbx - movq 24(%rsi),%rcx - movq 32(%rsi),%rdx - movq 40(%rsi),%rdi - movq %rax,%rsi - - movq %r8,0(%rax) - movq %r9,8(%rax) - movq %r10,16(%rax) - movq %r11,24(%rax) - movq %r12,32(%rax) - movq %r13,40(%rax) - - movq %r14,48(%rax) - movq %r15,56(%rax) - movq %rbx,64(%rax) - movq %rcx,72(%rax) - movq %rdx,80(%rax) - movq %rdi,88(%rax) - - xorq %rbp,%rbp - movl $24,%ecx - jmp .Loop_is_square - -.align 32 -.Loop_is_square: - movl %ecx,16(%rsp) - - call __ab_approximation_30 - movq %rax,0(%rsp) - movq %rbx,8(%rsp) - - movq $128+48,%rdi - xorq %rsi,%rdi - call __smulq_384_n_shift_by_30 - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq -48(%rdi),%rdi - call __smulq_384_n_shift_by_30 - - movl 16(%rsp),%ecx - xorq $128,%rsi - - andq 48(%rdi),%r14 - shrq $1,%r14 - addq %r14,%rbp - - subl $1,%ecx - jnz .Loop_is_square - - - - - movq 48(%rsi),%r9 - call __inner_loop_48 - - movq $1,%rax - andq %rbp,%rax - xorq $1,%rax - - leaq 536(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -536-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc -.size ct_is_square_mod_384,.-ct_is_square_mod_384 - -.type __smulq_384_n_shift_by_30,@function -.align 32 -__smulq_384_n_shift_by_30: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbx - addq %rax,%rbx - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - movq %rdx,%r14 - andq %rbx,%r14 - mulq %rbx - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbx - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbx - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - negq %r14 - mulq %rbx - addq %rax,%r13 - adcq %rdx,%r14 - leaq 48(%rsi),%rsi - movq %rcx,%rdx - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq 
%r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbx - addq %rax,%rbx - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - movq %rdx,%r15 - andq %rbx,%r15 - mulq %rbx - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbx - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbx - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - negq %r15 - mulq %rbx - addq %rax,%r13 - adcq %rdx,%r15 - leaq -48(%rsi),%rsi - - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq %r15,%r14 - - shrdq $30,%r9,%r8 - shrdq $30,%r10,%r9 - shrdq $30,%r11,%r10 - shrdq $30,%r12,%r11 - shrdq $30,%r13,%r12 - shrdq $30,%r14,%r13 - - sarq $63,%r14 - xorq %rbx,%rbx - subq %r14,%rbx - - xorq %r14,%r8 - xorq %r14,%r9 - xorq %r14,%r10 - xorq %r14,%r11 - xorq %r14,%r12 - xorq %r14,%r13 - addq %rbx,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30 -.type __ab_approximation_30,@function -.align 32 -__ab_approximation_30: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 88(%rsi),%rbx - movq 80(%rsi),%r15 - movq 72(%rsi),%r14 - - movq %r13,%rax - orq %rbx,%rax - cmovzq %r12,%r13 - cmovzq %r15,%rbx - cmovzq %r11,%r12 - movq 64(%rsi),%r11 - cmovzq %r14,%r15 - - movq %r13,%rax - orq %rbx,%rax - cmovzq %r12,%r13 - cmovzq %r15,%rbx - cmovzq %r10,%r12 - movq 56(%rsi),%r10 - cmovzq %r11,%r15 - - movq %r13,%rax - orq %rbx,%rax - cmovzq %r12,%r13 - cmovzq %r15,%rbx - cmovzq %r9,%r12 - movq 48(%rsi),%r9 - cmovzq %r10,%r15 - - movq %r13,%rax - orq %rbx,%rax - cmovzq %r12,%r13 - cmovzq %r15,%rbx - cmovzq %r8,%r12 - cmovzq %r9,%r15 - - movq %r13,%rax - orq %rbx,%rax - bsrq %rax,%rcx - leaq 1(%rcx),%rcx - cmovzq %r8,%r13 - cmovzq %r9,%rbx - cmovzq %rax,%rcx - negq %rcx - - - shldq %cl,%r12,%r13 - shldq %cl,%r15,%rbx - - movq $0xFFFFFFFF00000000,%rax - movl %r8d,%r8d - movl %r9d,%r9d - andq %rax,%r13 - andq %rax,%rbx - orq %r13,%r8 - orq %rbx,%r9 - - jmp __inner_loop_30 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ab_approximation_30,.-__ab_approximation_30 -.type __inner_loop_30,@function -.align 32 -__inner_loop_30: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq $0x7FFFFFFF80000000,%rbx - movq $0x800000007FFFFFFF,%rcx - leaq -1(%rbx),%r15 - movl $30,%edi - -.Loop_30: - movq %r8,%rax - andq %r9,%rax - shrq $1,%rax - - cmpq %r9,%r8 - movq %r8,%r10 - movq %r9,%r11 - leaq (%rax,%rbp,1),%rax - movq %rbx,%r12 - movq %rcx,%r13 - movq %rbp,%r14 - cmovbq %r9,%r8 - cmovbq %r10,%r9 - cmovbq %rcx,%rbx - cmovbq %r12,%rcx - cmovbq %rax,%rbp - - subq %r9,%r8 - subq %rcx,%rbx - addq %r15,%rbx - - testq $1,%r10 - cmovzq %r10,%r8 - cmovzq %r11,%r9 - cmovzq %r12,%rbx - cmovzq %r13,%rcx - cmovzq %r14,%rbp - - leaq 2(%r9),%rax - shrq $1,%r8 - shrq $2,%rax - addq %rcx,%rcx - 
leaq (%rax,%rbp,1),%rbp - subq %r15,%rcx - - subl $1,%edi - jnz .Loop_30 - - shrq $32,%r15 - movl %ebx,%eax - shrq $32,%rbx - movl %ecx,%edx - shrq $32,%rcx - subq %r15,%rax - subq %r15,%rbx - subq %r15,%rdx - subq %r15,%rcx - - .byte 0xf3,0xc3 -.cfi_endproc -.size __inner_loop_30,.-__inner_loop_30 - -.type __inner_loop_48,@function -.align 32 -__inner_loop_48: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movl $48,%edi - -.Loop_48: - movq %r8,%rax - andq %r9,%rax - shrq $1,%rax - - cmpq %r9,%r8 - movq %r8,%r10 - movq %r9,%r11 - leaq (%rax,%rbp,1),%rax - movq %rbp,%r12 - cmovbq %r9,%r8 - cmovbq %r10,%r9 - cmovbq %rax,%rbp - - subq %r9,%r8 - - testq $1,%r10 - cmovzq %r10,%r8 - cmovzq %r11,%r9 - cmovzq %r12,%rbp - - leaq 2(%r9),%rax - shrq $1,%r8 - shrq $2,%rax - addq %rax,%rbp - - subl $1,%edi - jnz .Loop_48 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __inner_loop_48,.-__inner_loop_48 - -.section .note.GNU-stack,"",@progbits -.section .note.gnu.property,"a",@note - .long 4,2f-1f,5 - .byte 0x47,0x4E,0x55,0 -1: .long 0xc0000002,4,3 -.align 8 -2: diff --git a/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s deleted file mode 100644 index 9cca518721f..00000000000 --- a/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s +++ /dev/null @@ -1,1201 +0,0 @@ -.comm __blst_platform_cap,4 -.text - -.globl ct_inverse_mod_383 -.hidden ct_inverse_mod_383 -.type ct_inverse_mod_383,@function -.align 32 -ct_inverse_mod_383: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,__blst_platform_cap(%rip) - jnz ct_inverse_mod_383$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $1112,%rsp -.cfi_adjust_cfa_offset 1112 - - - leaq 88+511(%rsp),%rax - andq $-512,%rax - movq %rdi,32(%rsp) - movq %rcx,40(%rsp) - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq 0(%rdx),%r14 - movq 8(%rdx),%r15 - movq 16(%rdx),%rbx - movq 24(%rdx),%rbp - movq 32(%rdx),%rsi - movq 40(%rdx),%rdi - - movq %r8,0(%rax) - movq %r9,8(%rax) - movq %r10,16(%rax) - movq %r11,24(%rax) - movq %r12,32(%rax) - movq %r13,40(%rax) - - movq %r14,48(%rax) - movq %r15,56(%rax) - movq %rbx,64(%rax) - movq %rbp,72(%rax) - movq %rsi,80(%rax) - movq %rax,%rsi - movq %rdi,88(%rax) - - - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - - - movq %rdx,96(%rdi) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - - - movq %rdx,96(%rdi) - - - xorq $256,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - - - - movq 96(%rsi),%rax - movq 144(%rsi),%r11 - movq %rdx,%rbx - movq %rax,%r10 - imulq 56(%rsp) - movq %rax,%r8 - movq %r11,%rax - movq %rdx,%r9 - imulq 64(%rsp) - addq %rax,%r8 - adcq %rdx,%r9 - movq %r8,48(%rdi) - movq %r9,56(%rdi) - sarq $63,%r9 - movq 
%r9,64(%rdi) - movq %r9,72(%rdi) - movq %r9,80(%rdi) - movq %r9,88(%rdi) - leaq 96(%rsi),%rsi - - movq %r10,%rax - imulq %rbx - movq %rax,%r8 - movq %r11,%rax - movq %rdx,%r9 - imulq %rcx - addq %rax,%r8 - adcq %rdx,%r9 - movq %r8,96(%rdi) - movq %r9,104(%rdi) - sarq $63,%r9 - movq %r9,112(%rdi) - movq %r9,120(%rdi) - movq %r9,128(%rdi) - movq %r9,136(%rdi) - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383x63 - sarq $63,%r13 - movq %r13,48(%rdi) - movq %r13,56(%rdi) - movq %r13,64(%rdi) - movq %r13,72(%rdi) - movq %r13,80(%rdi) - movq %r13,88(%rdi) - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - 
leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - - xorq $256+96,%rsi - movl $62,%edi - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 48(%rsi),%r10 - movq 56(%rsi),%r11 - call __inner_loop_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - movq %r8,0(%rdi) - movq %r10,48(%rdi) - - - - leaq 96(%rsi),%rsi - leaq 96(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - - - xorq $256+96,%rsi - movl $22,%edi - - movq 0(%rsi),%r8 - xorq %r9,%r9 - movq 48(%rsi),%r10 - xorq %r11,%r11 - call __inner_loop_62 - - - - - - - - leaq 96(%rsi),%rsi - - - - - - movq %r12,%rdx - movq %r13,%rcx - movq 32(%rsp),%rdi - call __smulq_767x63 - - movq 40(%rsp),%rsi - movq %rax,%rdx - sarq $63,%rax - - movq %rax,%r8 - movq %rax,%r9 - movq %rax,%r10 - andq 0(%rsi),%r8 - andq 8(%rsi),%r9 - movq %rax,%r11 - andq 16(%rsi),%r10 - andq 24(%rsi),%r11 - movq %rax,%r12 - andq 32(%rsi),%r12 - andq 40(%rsi),%rax - - addq %r8,%r14 - adcq %r9,%r15 - adcq %r10,%rbx - adcq %r11,%rbp - adcq %r12,%rcx - adcq %rax,%rdx - - movq %r14,48(%rdi) - movq %r15,56(%rdi) - movq %rbx,64(%rdi) - movq %rbp,72(%rdi) - movq %rcx,80(%rdi) - movq %rdx,88(%rdi) - - leaq 1112(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -1112-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc -.size ct_inverse_mod_383,.-ct_inverse_mod_383 -.type 
__smulq_767x63,@function -.align 32 -__smulq_767x63: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - leaq 48(%rsi),%rsi - - xorq %rdx,%rbp - addq %rax,%rbp - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulq %rbp - movq %rax,0(%rdi) - movq %r9,%rax - movq %rdx,%r9 - mulq %rbp - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - movq %r9,8(%rdi) - mulq %rbp - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - movq %r10,16(%rdi) - mulq %rbp - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - movq %r11,24(%rdi) - mulq %rbp - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - movq %r12,32(%rdi) - imulq %rbp - addq %rax,%r13 - adcq $0,%rdx - - movq %r13,40(%rdi) - movq %rdx,48(%rdi) - sarq $63,%rdx - movq %rdx,56(%rdi) - movq %rcx,%rdx - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq 48(%rsi),%r14 - movq 56(%rsi),%r15 - movq 64(%rsi),%rbx - movq 72(%rsi),%rbp - movq 80(%rsi),%rcx - movq 88(%rsi),%rdi - - movq %rdx,%rsi - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rsi - addq %rax,%rsi - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - xorq %rdx,%r14 - xorq %rdx,%r15 - xorq %rdx,%rbx - xorq %rdx,%rbp - xorq %rdx,%rcx - xorq %rdx,%rdi - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - adcq $0,%rbx - adcq $0,%rbp - adcq $0,%rcx - adcq $0,%rdi - - mulq %rsi - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rsi - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rsi - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rsi - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rsi - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - mulq %rsi - addq %rax,%r13 - movq %r14,%rax - adcq $0,%rdx - movq %rdx,%r14 - mulq %rsi - addq %rax,%r14 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r15 - mulq %rsi - addq %rax,%r15 - movq %rbx,%rax - adcq $0,%rdx - movq %rdx,%rbx - mulq %rsi - addq %rax,%rbx - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rbp - mulq %rsi - addq %rax,%rbp - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%rcx - mulq %rsi - addq %rax,%rcx - movq %rdi,%rax - adcq $0,%rdx - movq %rdx,%rdi - movq 8(%rsp),%rdx - imulq %rsi,%rax - movq 16(%rsp),%rsi - addq %rdi,%rax - - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - adcq 16(%rdx),%r10 - adcq 24(%rdx),%r11 - adcq 32(%rdx),%r12 - adcq 40(%rdx),%r13 - adcq 48(%rdx),%r14 - movq 56(%rdx),%rdi - adcq %rdi,%r15 - adcq %rdi,%rbx - adcq %rdi,%rbp - adcq %rdi,%rcx - adcq %rdi,%rax - - movq %rdx,%rdi - - movq %r8,0(%rdx) - movq %r9,8(%rdx) - movq %r10,16(%rdx) - movq %r11,24(%rdx) - movq %r12,32(%rdx) - movq %r13,40(%rdx) - movq %r14,48(%rdx) - movq %r15,56(%rdx) - movq %rbx,64(%rdx) - movq %rbp,72(%rdx) - movq %rcx,80(%rdx) - movq %rax,88(%rdx) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __smulq_767x63,.-__smulq_767x63 -.type __smulq_383x63,@function -.align 32 -__smulq_383x63: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 
0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbp - addq %rax,%rbp - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulq %rbp - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbp - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbp - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbp - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbp - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - imulq %rbp,%rax - addq %rax,%r13 - - leaq 48(%rsi),%rsi - movq %rcx,%rdx - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbp - addq %rax,%rbp - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulq %rbp - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbp - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbp - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbp - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbp - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - imulq %rbp,%rax - addq %rax,%r13 - - leaq -48(%rsi),%rsi - - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __smulq_383x63,.-__smulq_383x63 -.type __smulq_383_n_shift_by_62,@function -.align 32 -__smulq_383_n_shift_by_62: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq %rdx,%rbx - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbp - addq %rax,%rbp - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulq %rbp - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbp - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbp - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbp - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbp - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - imulq %rbp - addq %rax,%r13 - adcq $0,%rdx - - leaq 48(%rsi),%rsi - movq %rdx,%r14 - movq %rcx,%rdx - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbp - addq %rax,%rbp - - xorq 
%rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulq %rbp - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbp - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbp - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbp - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbp - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - imulq %rbp - addq %rax,%r13 - adcq $0,%rdx - - leaq -48(%rsi),%rsi - - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq %rdx,%r14 - movq %rbx,%rdx - - shrdq $62,%r9,%r8 - shrdq $62,%r10,%r9 - shrdq $62,%r11,%r10 - shrdq $62,%r12,%r11 - shrdq $62,%r13,%r12 - shrdq $62,%r14,%r13 - - sarq $63,%r14 - xorq %rbp,%rbp - subq %r14,%rbp - - xorq %r14,%r8 - xorq %r14,%r9 - xorq %r14,%r10 - xorq %r14,%r11 - xorq %r14,%r12 - xorq %r14,%r13 - addq %rbp,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - xorq %r14,%rdx - xorq %r14,%rcx - addq %rbp,%rdx - addq %rbp,%rcx - - .byte 0xf3,0xc3 -.cfi_endproc -.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62 -.type __ab_approximation_62,@function -.align 32 -__ab_approximation_62: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 40(%rsi),%r9 - movq 88(%rsi),%r11 - movq 32(%rsi),%rbx - movq 80(%rsi),%rbp - movq 24(%rsi),%r8 - movq 72(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - cmovzq %r10,%rbp - movq 16(%rsi),%r8 - movq 64(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - cmovzq %r10,%rbp - movq 8(%rsi),%r8 - movq 56(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - cmovzq %r10,%rbp - movq 0(%rsi),%r8 - movq 48(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - bsrq %rax,%rcx - leaq 1(%rcx),%rcx - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %rax,%rcx - negq %rcx - - - shldq %cl,%rbx,%r9 - shldq %cl,%rbp,%r11 - - jmp __inner_loop_62 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ab_approximation_62,.-__ab_approximation_62 -.type __inner_loop_62,@function -.align 8 -.long 0 -__inner_loop_62: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq $1,%rdx - xorq %rcx,%rcx - xorq %r12,%r12 - movq $1,%r13 - movq %rsi,8(%rsp) - -.Loop_62: - xorq %rax,%rax - xorq %rbx,%rbx - testq $1,%r8 - movq %r10,%rbp - movq %r11,%r14 - cmovnzq %r10,%rax - cmovnzq %r11,%rbx - subq %r8,%rbp - sbbq %r9,%r14 - movq %r8,%r15 - movq %r9,%rsi - subq %rax,%r8 - sbbq %rbx,%r9 - cmovcq %rbp,%r8 - cmovcq %r14,%r9 - cmovcq %r15,%r10 - cmovcq %rsi,%r11 - movq %rdx,%rax - cmovcq %r12,%rdx - cmovcq %rax,%r12 - movq %rcx,%rbx - cmovcq %r13,%rcx - cmovcq %rbx,%r13 - xorq %rax,%rax - xorq %rbx,%rbx - shrdq $1,%r9,%r8 - shrq $1,%r9 - testq $1,%r15 - cmovnzq %r12,%rax - cmovnzq %r13,%rbx - addq %r12,%r12 - addq %r13,%r13 - subq %rax,%rdx - subq %rbx,%rcx - subl $1,%edi - jnz .Loop_62 - - movq 8(%rsp),%rsi - .byte 0xf3,0xc3 -.cfi_endproc -.size __inner_loop_62,.-__inner_loop_62 - -.section .note.GNU-stack,"",@progbits -.section .note.gnu.property,"a",@note - .long 4,2f-1f,5 - .byte 0x47,0x4E,0x55,0 -1: .long 0xc0000002,4,3 -.align 8 -2: diff --git 
a/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s deleted file mode 100644 index 9f4d12babd4..00000000000 --- a/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s +++ /dev/null @@ -1,1576 +0,0 @@ -.text - -.globl ctx_inverse_mod_383 -.hidden ctx_inverse_mod_383 -.type ctx_inverse_mod_383,@function -.align 32 -ctx_inverse_mod_383: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -ct_inverse_mod_383$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $1112,%rsp -.cfi_adjust_cfa_offset 1112 - - - leaq 88+511(%rsp),%rax - andq $-512,%rax - movq %rdi,32(%rsp) - movq %rcx,40(%rsp) - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq 0(%rdx),%r14 - movq 8(%rdx),%r15 - movq 16(%rdx),%rbx - movq 24(%rdx),%rbp - movq 32(%rdx),%rsi - movq 40(%rdx),%rdi - - movq %r8,0(%rax) - movq %r9,8(%rax) - movq %r10,16(%rax) - movq %r11,24(%rax) - movq %r12,32(%rax) - movq %r13,40(%rax) - - movq %r14,48(%rax) - movq %r15,56(%rax) - movq %rbx,64(%rax) - movq %rbp,72(%rax) - movq %rsi,80(%rax) - movq %rax,%rsi - movq %rdi,88(%rax) - - - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - - - movq %rdx,96(%rdi) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - - - movq %rdx,96(%rdi) - - - xorq $256,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - - - - movq 96(%rsi),%rax - movq 144(%rsi),%r11 - movq %rdx,%rbx - movq %rax,%r10 - imulq 56(%rsp) - movq %rax,%r8 - movq %r11,%rax - movq %rdx,%r9 - imulq 64(%rsp) - addq %rax,%r8 - adcq %rdx,%r9 - movq %r8,48(%rdi) - movq %r9,56(%rdi) - sarq $63,%r9 - movq %r9,64(%rdi) - movq %r9,72(%rdi) - movq %r9,80(%rdi) - movq %r9,88(%rdi) - leaq 96(%rsi),%rsi - - movq %r10,%rax - imulq %rbx - movq %rax,%r8 - movq %r11,%rax - movq %rdx,%r9 - imulq %rcx - addq %rax,%r8 - adcq %rdx,%r9 - movq %r8,96(%rdi) - movq %r9,104(%rdi) - sarq $63,%r9 - movq %r9,112(%rdi) - movq %r9,120(%rdi) - movq %r9,128(%rdi) - movq %r9,136(%rdi) - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 
80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call 
__smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - sarq $63,%r13 - movq %r13,48(%rdi) - movq %r13,56(%rdi) - movq %r13,64(%rdi) - movq %r13,72(%rdi) - movq %r13,80(%rdi) - movq %r13,88(%rdi) - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 
48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_191_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_191_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_191_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_191_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_191_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_191_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call 
__smulx_191_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_191_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - - xorq $256+96,%rsi - movl $53,%edi - - movq 0(%rsi),%r8 - - movq 48(%rsi),%r10 - - call __tail_loop_53 - - - - - - - - leaq 96(%rsi),%rsi - - - - - - movq %r12,%rdx - movq %r13,%rcx - movq 32(%rsp),%rdi - call __smulx_767x63 - - movq 40(%rsp),%rsi - movq %rax,%rdx - sarq $63,%rax - - movq %rax,%r8 - movq %rax,%r9 - movq %rax,%r10 - andq 0(%rsi),%r8 - andq 8(%rsi),%r9 - movq %rax,%r11 - andq 16(%rsi),%r10 - andq 24(%rsi),%r11 - movq %rax,%r12 - andq 32(%rsi),%r12 - andq 40(%rsi),%rax - - addq %r8,%r14 - adcq %r9,%r15 - adcq %r10,%rbx - adcq %r11,%rbp - adcq %r12,%rcx - adcq %rax,%rdx - - movq %r14,48(%rdi) - movq %r15,56(%rdi) - movq %rbx,64(%rdi) - movq %rbp,72(%rdi) - movq %rcx,80(%rdi) - movq %rdx,88(%rdi) - - leaq 1112(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -1112-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc -.size ctx_inverse_mod_383,.-ctx_inverse_mod_383 -.type __smulx_767x63,@function -.align 32 -__smulx_767x63: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rax - sarq $63,%rax - xorq %rbp,%rbp - subq %rax,%rbp - - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - leaq 48(%rsi),%rsi - - xorq %rax,%rdx - addq %rbp,%rdx - - xorq %rax,%r8 - xorq %rax,%r9 - xorq %rax,%r10 - xorq %rax,%r11 - xorq %rax,%r12 - xorq %r13,%rax - addq %rbp,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%rax - - mulxq %r8,%r8,%rbp - mulxq %r9,%r9,%r13 - addq %rbp,%r9 - mulxq %r10,%r10,%rbp - adcq %r13,%r10 - mulxq %r11,%r11,%r13 - adcq %rbp,%r11 - mulxq %r12,%r12,%rbp - adcq %r13,%r12 - adcq $0,%rbp - imulq %rdx - addq %rbp,%rax - adcq $0,%rdx - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %rax,40(%rdi) - movq %rdx,48(%rdi) - sarq $63,%rdx - movq %rdx,56(%rdi) - movq %rcx,%rdx - movq %rcx,%rax - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq 48(%rsi),%r14 - movq 56(%rsi),%r15 - movq 64(%rsi),%rbx - movq 72(%rsi),%rbp - movq 80(%rsi),%rcx - movq 88(%rsi),%rdi - - sarq $63,%rax - xorq %rsi,%rsi - subq %rax,%rsi - - xorq %rax,%rdx - addq %rsi,%rdx - - xorq %rax,%r8 - xorq %rax,%r9 - xorq %rax,%r10 - xorq %rax,%r11 - xorq %rax,%r12 - xorq %rax,%r13 - xorq %rax,%r14 - xorq %rax,%r15 - xorq %rax,%rbx - xorq %rax,%rbp - xorq %rax,%rcx - xorq %rax,%rdi - addq %rsi,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - adcq $0,%rbx - adcq $0,%rbp - adcq $0,%rcx - adcq $0,%rdi - - mulxq %r8,%r8,%rax - mulxq %r9,%r9,%rsi - addq %rax,%r9 - mulxq %r10,%r10,%rax - adcq %rsi,%r10 - mulxq %r11,%r11,%rsi - adcq %rax,%r11 - mulxq %r12,%r12,%rax - adcq %rsi,%r12 - mulxq %r13,%r13,%rsi - adcq %rax,%r13 - mulxq %r14,%r14,%rax - adcq %rsi,%r14 - mulxq 
%r15,%r15,%rsi - adcq %rax,%r15 - mulxq %rbx,%rbx,%rax - adcq %rsi,%rbx - mulxq %rbp,%rbp,%rsi - adcq %rax,%rbp - mulxq %rcx,%rcx,%rax - adcq %rsi,%rcx - mulxq %rdi,%rdi,%rsi - movq 8(%rsp),%rdx - movq 16(%rsp),%rsi - adcq %rdi,%rax - - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - adcq 16(%rdx),%r10 - adcq 24(%rdx),%r11 - adcq 32(%rdx),%r12 - adcq 40(%rdx),%r13 - adcq 48(%rdx),%r14 - movq 56(%rdx),%rdi - adcq %rdi,%r15 - adcq %rdi,%rbx - adcq %rdi,%rbp - adcq %rdi,%rcx - adcq %rdi,%rax - - movq %rdx,%rdi - - movq %r8,0(%rdx) - movq %r9,8(%rdx) - movq %r10,16(%rdx) - movq %r11,24(%rdx) - movq %r12,32(%rdx) - movq %r13,40(%rdx) - movq %r14,48(%rdx) - movq %r15,56(%rdx) - movq %rbx,64(%rdx) - movq %rbp,72(%rdx) - movq %rcx,80(%rdx) - movq %rax,88(%rdx) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __smulx_767x63,.-__smulx_767x63 -.type __smulx_383x63,@function -.align 32 -__smulx_383x63: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0+0(%rsi),%r8 - movq 0+8(%rsi),%r9 - movq 0+16(%rsi),%r10 - movq 0+24(%rsi),%r11 - movq 0+32(%rsi),%r12 - movq 0+40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rbp - xorq %rax,%rax - subq %rbp,%rax - - xorq %rbp,%rdx - addq %rax,%rdx - - xorq %rbp,%r8 - xorq %rbp,%r9 - xorq %rbp,%r10 - xorq %rbp,%r11 - xorq %rbp,%r12 - xorq %rbp,%r13 - addq %rax,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulxq %r8,%r8,%rbp - mulxq %r9,%r9,%rax - addq %rbp,%r9 - mulxq %r10,%r10,%rbp - adcq %rax,%r10 - mulxq %r11,%r11,%rax - adcq %rbp,%r11 - mulxq %r12,%r12,%rbp - adcq %rax,%r12 - mulxq %r13,%r13,%rax - movq %rcx,%rdx - adcq %rbp,%r13 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq 48+0(%rsi),%r8 - movq 48+8(%rsi),%r9 - movq 48+16(%rsi),%r10 - movq 48+24(%rsi),%r11 - movq 48+32(%rsi),%r12 - movq 48+40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rbp - xorq %rax,%rax - subq %rbp,%rax - - xorq %rbp,%rdx - addq %rax,%rdx - - xorq %rbp,%r8 - xorq %rbp,%r9 - xorq %rbp,%r10 - xorq %rbp,%r11 - xorq %rbp,%r12 - xorq %rbp,%r13 - addq %rax,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulxq %r8,%r8,%rbp - mulxq %r9,%r9,%rax - addq %rbp,%r9 - mulxq %r10,%r10,%rbp - adcq %rax,%r10 - mulxq %r11,%r11,%rax - adcq %rbp,%r11 - mulxq %r12,%r12,%rbp - adcq %rax,%r12 - mulxq %r13,%r13,%rax - adcq %rbp,%r13 - - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __smulx_383x63,.-__smulx_383x63 -.type __smulx_383_n_shift_by_31,@function -.align 32 -__smulx_383_n_shift_by_31: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq %rdx,%rbx - xorq %r14,%r14 - movq 0+0(%rsi),%r8 - movq 0+8(%rsi),%r9 - movq 0+16(%rsi),%r10 - movq 0+24(%rsi),%r11 - movq 0+32(%rsi),%r12 - movq 0+40(%rsi),%r13 - - movq %rdx,%rax - sarq $63,%rax - xorq %rbp,%rbp - subq %rax,%rbp - - xorq %rax,%rdx - addq %rbp,%rdx - - xorq %rax,%r8 - xorq %rax,%r9 - xorq %rax,%r10 - xorq %rax,%r11 - xorq %rax,%r12 - xorq %r13,%rax - addq %rbp,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%rax - - mulxq %r8,%r8,%rbp - mulxq %r9,%r9,%r13 - addq %rbp,%r9 - mulxq %r10,%r10,%rbp - adcq %r13,%r10 - mulxq %r11,%r11,%r13 - adcq %rbp,%r11 - mulxq %r12,%r12,%rbp - adcq %r13,%r12 - adcq $0,%rbp - imulq %rdx - addq %rbp,%rax - adcq %rdx,%r14 - - movq 
%rcx,%rdx - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %rax,40(%rdi) - movq 48+0(%rsi),%r8 - movq 48+8(%rsi),%r9 - movq 48+16(%rsi),%r10 - movq 48+24(%rsi),%r11 - movq 48+32(%rsi),%r12 - movq 48+40(%rsi),%r13 - - movq %rdx,%rax - sarq $63,%rax - xorq %rbp,%rbp - subq %rax,%rbp - - xorq %rax,%rdx - addq %rbp,%rdx - - xorq %rax,%r8 - xorq %rax,%r9 - xorq %rax,%r10 - xorq %rax,%r11 - xorq %rax,%r12 - xorq %r13,%rax - addq %rbp,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%rax - - mulxq %r8,%r8,%rbp - mulxq %r9,%r9,%r13 - addq %rbp,%r9 - mulxq %r10,%r10,%rbp - adcq %r13,%r10 - mulxq %r11,%r11,%r13 - adcq %rbp,%r11 - mulxq %r12,%r12,%rbp - adcq %r13,%r12 - adcq $0,%rbp - imulq %rdx - addq %rbp,%rax - adcq $0,%rdx - - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%rax - adcq %rdx,%r14 - movq %rbx,%rdx - - shrdq $31,%r9,%r8 - shrdq $31,%r10,%r9 - shrdq $31,%r11,%r10 - shrdq $31,%r12,%r11 - shrdq $31,%rax,%r12 - shrdq $31,%r14,%rax - - sarq $63,%r14 - xorq %rbp,%rbp - subq %r14,%rbp - - xorq %r14,%r8 - xorq %r14,%r9 - xorq %r14,%r10 - xorq %r14,%r11 - xorq %r14,%r12 - xorq %r14,%rax - addq %rbp,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%rax - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %rax,40(%rdi) - - xorq %r14,%rdx - xorq %r14,%rcx - addq %rbp,%rdx - addq %rbp,%rcx - - .byte 0xf3,0xc3 -.cfi_endproc -.size __smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31 -.type __smulx_191_n_shift_by_31,@function -.align 32 -__smulx_191_n_shift_by_31: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq %rdx,%rbx - movq 0+0(%rsi),%r8 - movq 0+8(%rsi),%r9 - movq 0+16(%rsi),%r10 - - movq %rdx,%rax - sarq $63,%rax - xorq %rbp,%rbp - subq %rax,%rbp - - xorq %rax,%rdx - addq %rbp,%rdx - - xorq %rax,%r8 - xorq %rax,%r9 - xorq %r10,%rax - addq %rbp,%r8 - adcq $0,%r9 - adcq $0,%rax - - mulxq %r8,%r8,%rbp - mulxq %r9,%r9,%r10 - addq %rbp,%r9 - adcq $0,%r10 - imulq %rdx - addq %rax,%r10 - adcq $0,%rdx - movq %rdx,%r14 - movq %rcx,%rdx - movq 48+0(%rsi),%r11 - movq 48+8(%rsi),%r12 - movq 48+16(%rsi),%r13 - - movq %rdx,%rax - sarq $63,%rax - xorq %rbp,%rbp - subq %rax,%rbp - - xorq %rax,%rdx - addq %rbp,%rdx - - xorq %rax,%r11 - xorq %rax,%r12 - xorq %r13,%rax - addq %rbp,%r11 - adcq $0,%r12 - adcq $0,%rax - - mulxq %r11,%r11,%rbp - mulxq %r12,%r12,%r13 - addq %rbp,%r12 - adcq $0,%r13 - imulq %rdx - addq %rax,%r13 - adcq $0,%rdx - addq %r8,%r11 - adcq %r9,%r12 - adcq %r10,%r13 - adcq %rdx,%r14 - movq %rbx,%rdx - - shrdq $31,%r12,%r11 - shrdq $31,%r13,%r12 - shrdq $31,%r14,%r13 - - sarq $63,%r14 - xorq %rbp,%rbp - subq %r14,%rbp - - xorq %r14,%r11 - xorq %r14,%r12 - xorq %r14,%r13 - addq %rbp,%r11 - adcq $0,%r12 - adcq $0,%r13 - - movq %r11,0(%rdi) - movq %r12,8(%rdi) - movq %r13,16(%rdi) - - xorq %r14,%rdx - xorq %r14,%rcx - addq %rbp,%rdx - addq %rbp,%rcx - - .byte 0xf3,0xc3 -.cfi_endproc -.size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31 -.type __ab_approximation_31,@function -.align 32 -__ab_approximation_31: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 40(%rsi),%r9 - movq 88(%rsi),%r11 - movq 32(%rsi),%rbx - movq 80(%rsi),%rbp - movq 24(%rsi),%r8 - movq 72(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - movq 16(%rsi),%r8 - cmovzq %r10,%rbp - movq 64(%rsi),%r10 - - movq %r9,%rax - orq 
%r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - movq 8(%rsi),%r8 - cmovzq %r10,%rbp - movq 56(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - movq 0(%rsi),%r8 - cmovzq %r10,%rbp - movq 48(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - cmovzq %r10,%rbp - - movq %r9,%rax - orq %r11,%rax - bsrq %rax,%rcx - leaq 1(%rcx),%rcx - cmovzq %r8,%r9 - cmovzq %r10,%r11 - cmovzq %rax,%rcx - negq %rcx - - - shldq %cl,%rbx,%r9 - shldq %cl,%rbp,%r11 - - movl $0x7FFFFFFF,%eax - andq %rax,%r8 - andq %rax,%r10 - andnq %r9,%rax,%r9 - andnq %r11,%rax,%r11 - orq %r9,%r8 - orq %r11,%r10 - - jmp __inner_loop_31 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ab_approximation_31,.-__ab_approximation_31 -.type __inner_loop_31,@function -.align 32 -__inner_loop_31: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq $0x7FFFFFFF80000000,%rcx - movq $0x800000007FFFFFFF,%r13 - movq $0x7FFFFFFF7FFFFFFF,%r15 - -.Loop_31: - cmpq %r10,%r8 - movq %r8,%rax - movq %r10,%rbx - movq %rcx,%rbp - movq %r13,%r14 - cmovbq %r10,%r8 - cmovbq %rax,%r10 - cmovbq %r13,%rcx - cmovbq %rbp,%r13 - - subq %r10,%r8 - subq %r13,%rcx - addq %r15,%rcx - - testq $1,%rax - cmovzq %rax,%r8 - cmovzq %rbx,%r10 - cmovzq %rbp,%rcx - cmovzq %r14,%r13 - - shrq $1,%r8 - addq %r13,%r13 - subq %r15,%r13 - subl $1,%edi - jnz .Loop_31 - - shrq $32,%r15 - movl %ecx,%edx - movl %r13d,%r12d - shrq $32,%rcx - shrq $32,%r13 - subq %r15,%rdx - subq %r15,%rcx - subq %r15,%r12 - subq %r15,%r13 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __inner_loop_31,.-__inner_loop_31 - -.type __tail_loop_53,@function -.align 32 -__tail_loop_53: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq $1,%rdx - xorq %rcx,%rcx - xorq %r12,%r12 - movq $1,%r13 - -.Loop_53: - xorq %rax,%rax - testq $1,%r8 - movq %r10,%rbx - cmovnzq %r10,%rax - subq %r8,%rbx - movq %r8,%rbp - subq %rax,%r8 - cmovcq %rbx,%r8 - cmovcq %rbp,%r10 - movq %rdx,%rax - cmovcq %r12,%rdx - cmovcq %rax,%r12 - movq %rcx,%rbx - cmovcq %r13,%rcx - cmovcq %rbx,%r13 - xorq %rax,%rax - xorq %rbx,%rbx - shrq $1,%r8 - testq $1,%rbp - cmovnzq %r12,%rax - cmovnzq %r13,%rbx - addq %r12,%r12 - addq %r13,%r13 - subq %rax,%rdx - subq %rbx,%rcx - subl $1,%edi - jnz .Loop_53 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __tail_loop_53,.-__tail_loop_53 - -.section .note.GNU-stack,"",@progbits -.section .note.gnu.property,"a",@note - .long 4,2f-1f,5 - .byte 0x47,0x4E,0x55,0 -1: .long 0xc0000002,4,3 -.align 8 -2: diff --git a/crypto/blst_src/build/elf/div3w-armv8.S b/crypto/blst_src/build/elf/div3w-armv8.S deleted file mode 100644 index 37621bee415..00000000000 --- a/crypto/blst_src/build/elf/div3w-armv8.S +++ /dev/null @@ -1,88 +0,0 @@ -.text - -.globl div_3_limbs -.type div_3_limbs,%function -.align 5 -div_3_limbs: - ldp x4,x5,[x0] // load R - eor x0,x0,x0 // Q = 0 - mov x3,#64 // loop counter - nop - -.Loop: - subs x6,x4,x1 // R - D - add x0,x0,x0 // Q <<= 1 - sbcs x7,x5,x2 - add x0,x0,#1 // Q + speculative bit - csel x4,x4,x6,lo // select between R and R - D - extr x1,x2,x1,#1 // D >>= 1 - csel x5,x5,x7,lo - lsr x2,x2,#1 - sbc x0,x0,xzr // subtract speculative bit - sub x3,x3,#1 - cbnz x3,.Loop - - asr x3,x0,#63 // top bit -> mask - add x0,x0,x0 // Q <<= 1 - subs x6,x4,x1 // R - D - add x0,x0,#1 // Q + speculative bit - sbcs x7,x5,x2 - sbc x0,x0,xzr // subtract speculative bit - - orr x0,x0,x3 // all ones if overflow - - ret -.size div_3_limbs,.-div_3_limbs -.globl quot_rem_128 -.type quot_rem_128,%function -.align 5 
-quot_rem_128: - ldp x3,x4,[x1] - - mul x5,x3,x2 // divisor[0:1} * quotient - umulh x6,x3,x2 - mul x11, x4,x2 - umulh x7,x4,x2 - - ldp x8,x9,[x0] // load 3 limbs of the dividend - ldr x10,[x0,#16] - - adds x6,x6,x11 - adc x7,x7,xzr - - subs x8,x8,x5 // dividend - divisor * quotient - sbcs x9,x9,x6 - sbcs x10,x10,x7 - sbc x5,xzr,xzr // borrow -> mask - - add x2,x2,x5 // if borrowed, adjust the quotient ... - and x3,x3,x5 - and x4,x4,x5 - adds x8,x8,x3 // ... and add divisor - adc x9,x9,x4 - - stp x8,x9,[x0] // save 2 limbs of the remainder - str x2,[x0,#16] // and one limb of the quotient - - mov x0,x2 // return adjusted quotient - - ret -.size quot_rem_128,.-quot_rem_128 - -.globl quot_rem_64 -.type quot_rem_64,%function -.align 5 -quot_rem_64: - ldr x3,[x1] - ldr x8,[x0] // load 1 limb of the dividend - - mul x5,x3,x2 // divisor * quotient - - sub x8,x8,x5 // dividend - divisor * quotient - - stp x8,x2,[x0] // save remainder and quotient - - mov x0,x2 // return quotient - - ret -.size quot_rem_64,.-quot_rem_64 diff --git a/crypto/blst_src/build/elf/div3w-x86_64.s b/crypto/blst_src/build/elf/div3w-x86_64.s deleted file mode 100644 index 5d9fd8a9139..00000000000 --- a/crypto/blst_src/build/elf/div3w-x86_64.s +++ /dev/null @@ -1,132 +0,0 @@ -.text - -.globl div_3_limbs -.hidden div_3_limbs -.type div_3_limbs,@function -.align 32 -div_3_limbs: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - - movq (%rdi),%r8 - movq 8(%rdi),%r9 - xorq %rax,%rax - movl $64,%ecx - -.Loop: - movq %r8,%r10 - subq %rsi,%r8 - movq %r9,%r11 - sbbq %rdx,%r9 - leaq 1(%rax,%rax,1),%rax - movq %rdx,%rdi - cmovcq %r10,%r8 - cmovcq %r11,%r9 - sbbq $0,%rax - shlq $63,%rdi - shrq $1,%rsi - shrq $1,%rdx - orq %rdi,%rsi - subl $1,%ecx - jnz .Loop - - leaq 1(%rax,%rax,1),%rcx - sarq $63,%rax - - subq %rsi,%r8 - sbbq %rdx,%r9 - sbbq $0,%rcx - - orq %rcx,%rax - - - .byte 0xf3,0xc3 -.cfi_endproc -.size div_3_limbs,.-div_3_limbs -.globl quot_rem_128 -.hidden quot_rem_128 -.type quot_rem_128,@function -.align 32 -quot_rem_128: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - - movq %rdx,%rax - movq %rdx,%rcx - - mulq 0(%rsi) - movq %rax,%r8 - movq %rcx,%rax - movq %rdx,%r9 - - mulq 8(%rsi) - addq %rax,%r9 - adcq $0,%rdx - - movq 0(%rdi),%r10 - movq 8(%rdi),%r11 - movq 16(%rdi),%rax - - subq %r8,%r10 - sbbq %r9,%r11 - sbbq %rdx,%rax - sbbq %r8,%r8 - - addq %r8,%rcx - movq %r8,%r9 - andq 0(%rsi),%r8 - andq 8(%rsi),%r9 - addq %r8,%r10 - adcq %r9,%r11 - - movq %r10,0(%rdi) - movq %r11,8(%rdi) - movq %rcx,16(%rdi) - - movq %rcx,%rax - - - .byte 0xf3,0xc3 -.cfi_endproc -.size quot_rem_128,.-quot_rem_128 - - - - - -.globl quot_rem_64 -.hidden quot_rem_64 -.type quot_rem_64,@function -.align 32 -quot_rem_64: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - - movq %rdx,%rax - imulq 0(%rsi),%rdx - - movq 0(%rdi),%r10 - - subq %rdx,%r10 - - movq %r10,0(%rdi) - movq %rax,8(%rdi) - - - .byte 0xf3,0xc3 -.cfi_endproc -.size quot_rem_64,.-quot_rem_64 - -.section .note.GNU-stack,"",@progbits -.section .note.gnu.property,"a",@note - .long 4,2f-1f,5 - .byte 0x47,0x4E,0x55,0 -1: .long 0xc0000002,4,3 -.align 8 -2: diff --git a/crypto/blst_src/build/elf/mul_mont_256-armv8.S b/crypto/blst_src/build/elf/mul_mont_256-armv8.S deleted file mode 100644 index 8bb1197f464..00000000000 --- a/crypto/blst_src/build/elf/mul_mont_256-armv8.S +++ /dev/null @@ -1,464 +0,0 @@ -.text - -.globl mul_mont_sparse_256 -.hidden mul_mont_sparse_256 -.type mul_mont_sparse_256,%function -.align 5 -mul_mont_sparse_256: - stp x29,x30,[sp,#-64]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldp x10,x11,[x1] - ldr x9, [x2] - ldp x12,x13,[x1,#16] - - mul x19,x10,x9 - ldp x5,x6,[x3] - mul x20,x11,x9 - ldp x7,x8,[x3,#16] - mul x21,x12,x9 - mul x22,x13,x9 - - umulh x14,x10,x9 - umulh x15,x11,x9 - mul x3,x4,x19 - umulh x16,x12,x9 - umulh x17,x13,x9 - adds x20,x20,x14 - //mul x14,x5,x3 - adcs x21,x21,x15 - mul x15,x6,x3 - adcs x22,x22,x16 - mul x16,x7,x3 - adc x23,xzr, x17 - mul x17,x8,x3 - ldr x9,[x2,8*1] - subs xzr,x19,#1 //adds x19,x19,x14 - umulh x14,x5,x3 - adcs x20,x20,x15 - umulh x15,x6,x3 - adcs x21,x21,x16 - umulh x16,x7,x3 - adcs x22,x22,x17 - umulh x17,x8,x3 - adc x23,x23,xzr - - adds x19,x20,x14 - mul x14,x10,x9 - adcs x20,x21,x15 - mul x15,x11,x9 - adcs x21,x22,x16 - mul x16,x12,x9 - adcs x22,x23,x17 - mul x17,x13,x9 - adc x23,xzr,xzr - - adds x19,x19,x14 - umulh x14,x10,x9 - adcs x20,x20,x15 - umulh x15,x11,x9 - adcs x21,x21,x16 - mul x3,x4,x19 - umulh x16,x12,x9 - adcs x22,x22,x17 - umulh x17,x13,x9 - adc x23,x23,xzr - - adds x20,x20,x14 - //mul x14,x5,x3 - adcs x21,x21,x15 - mul x15,x6,x3 - adcs x22,x22,x16 - mul x16,x7,x3 - adc x23,x23,x17 - mul x17,x8,x3 - ldr x9,[x2,8*2] - subs xzr,x19,#1 //adds x19,x19,x14 - umulh x14,x5,x3 - adcs x20,x20,x15 - umulh x15,x6,x3 - adcs x21,x21,x16 - umulh x16,x7,x3 - adcs x22,x22,x17 - umulh x17,x8,x3 - adc x23,x23,xzr - - adds x19,x20,x14 - mul x14,x10,x9 - adcs x20,x21,x15 - mul x15,x11,x9 - adcs x21,x22,x16 - mul x16,x12,x9 - adcs x22,x23,x17 - mul x17,x13,x9 - adc x23,xzr,xzr - - adds x19,x19,x14 - umulh x14,x10,x9 - adcs x20,x20,x15 - umulh x15,x11,x9 - adcs x21,x21,x16 - mul x3,x4,x19 - umulh x16,x12,x9 - adcs x22,x22,x17 - umulh x17,x13,x9 - adc x23,x23,xzr - - adds x20,x20,x14 - //mul x14,x5,x3 - adcs x21,x21,x15 - mul x15,x6,x3 - adcs x22,x22,x16 - mul x16,x7,x3 - adc x23,x23,x17 - mul x17,x8,x3 - ldr x9,[x2,8*3] - subs xzr,x19,#1 //adds x19,x19,x14 - umulh x14,x5,x3 - adcs x20,x20,x15 - umulh x15,x6,x3 - adcs x21,x21,x16 - umulh x16,x7,x3 - adcs x22,x22,x17 - umulh x17,x8,x3 - adc x23,x23,xzr - - adds x19,x20,x14 - mul x14,x10,x9 - adcs x20,x21,x15 - mul x15,x11,x9 - adcs x21,x22,x16 - mul x16,x12,x9 - adcs x22,x23,x17 - mul x17,x13,x9 - adc x23,xzr,xzr - - adds x19,x19,x14 - umulh x14,x10,x9 - adcs x20,x20,x15 - umulh x15,x11,x9 - adcs x21,x21,x16 - mul x3,x4,x19 - umulh x16,x12,x9 - adcs x22,x22,x17 - umulh x17,x13,x9 - adc x23,x23,xzr - - adds x20,x20,x14 - //mul x14,x5,x3 - adcs x21,x21,x15 - mul x15,x6,x3 - adcs x22,x22,x16 - mul x16,x7,x3 - adc x23,x23,x17 - mul x17,x8,x3 - subs xzr,x19,#1 //adds x19,x19,x14 - umulh x14,x5,x3 - adcs x20,x20,x15 - umulh x15,x6,x3 - adcs x21,x21,x16 - umulh x16,x7,x3 - adcs x22,x22,x17 - umulh x17,x8,x3 - adc x23,x23,xzr - - adds x19,x20,x14 - adcs x20,x21,x15 - adcs x21,x22,x16 - adcs x22,x23,x17 - adc x23,xzr,xzr - - subs x14,x19,x5 - sbcs x15,x20,x6 - sbcs x16,x21,x7 - sbcs x17,x22,x8 - sbcs xzr, x23,xzr - - csel x19,x19,x14,lo - csel x20,x20,x15,lo - csel x21,x21,x16,lo - csel x22,x22,x17,lo - - stp x19,x20,[x0] - stp x21,x22,[x0,#16] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 - ret -.size mul_mont_sparse_256,.-mul_mont_sparse_256 -.globl sqr_mont_sparse_256 -.hidden sqr_mont_sparse_256 -.type sqr_mont_sparse_256,%function -.align 5 -sqr_mont_sparse_256: - .inst 0xd503233f - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x5,x6,[x1] - ldp x7,x8,[x1,#16] - mov x4,x3 - - //////////////////////////////////////////////////////////////// - // | | | | | |a1*a0| | - // | | | | |a2*a0| | | - // | |a3*a2|a3*a0| | | | - // | | | |a2*a1| | | | - // | | |a3*a1| | | | | - // *| | | | | | | | 2| - // +|a3*a3|a2*a2|a1*a1|a0*a0| - // |--+--+--+--+--+--+--+--| - // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 - // - // "can't overflow" below mark carrying into high part of - // multiplication result, which can't overflow, because it - // can never be all ones. - - mul x11,x6,x5 // a[1]*a[0] - umulh x15,x6,x5 - mul x12,x7,x5 // a[2]*a[0] - umulh x16,x7,x5 - mul x13,x8,x5 // a[3]*a[0] - umulh x19,x8,x5 - - adds x12,x12,x15 // accumulate high parts of multiplication - mul x14,x7,x6 // a[2]*a[1] - umulh x15,x7,x6 - adcs x13,x13,x16 - mul x16,x8,x6 // a[3]*a[1] - umulh x17,x8,x6 - adc x19,x19,xzr // can't overflow - - mul x20,x8,x7 // a[3]*a[2] - umulh x21,x8,x7 - - adds x15,x15,x16 // accumulate high parts of multiplication - mul x10,x5,x5 // a[0]*a[0] - adc x16,x17,xzr // can't overflow - - adds x13,x13,x14 // accumulate low parts of multiplication - umulh x5,x5,x5 - adcs x19,x19,x15 - mul x15,x6,x6 // a[1]*a[1] - adcs x20,x20,x16 - umulh x6,x6,x6 - adc x21,x21,xzr // can't overflow - - adds x11,x11,x11 // acc[1-6]*=2 - mul x16,x7,x7 // a[2]*a[2] - adcs x12,x12,x12 - umulh x7,x7,x7 - adcs x13,x13,x13 - mul x17,x8,x8 // a[3]*a[3] - adcs x19,x19,x19 - umulh x8,x8,x8 - adcs x20,x20,x20 - adcs x21,x21,x21 - adc x22,xzr,xzr - - adds x11,x11,x5 // +a[i]*a[i] - adcs x12,x12,x15 - adcs x13,x13,x6 - adcs x19,x19,x16 - adcs x20,x20,x7 - adcs x21,x21,x17 - adc x22,x22,x8 - - bl __mul_by_1_mont_256 - ldr x30,[x29,#8] - - adds x10,x10,x19 // accumulate upper half - adcs x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - adc x19,xzr,xzr - - subs x14,x10,x5 - sbcs x15,x11,x6 - sbcs x16,x12,x7 - sbcs x17,x13,x8 - sbcs xzr, x19,xzr - - csel x10,x10,x14,lo - csel x11,x11,x15,lo - csel x12,x12,x16,lo - csel x13,x13,x17,lo - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - .inst 0xd50323bf - ret -.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 -.globl from_mont_256 -.hidden from_mont_256 -.type from_mont_256,%function -.align 5 -from_mont_256: - .inst 0xd503233f - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - mov x4,x3 - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - - bl __mul_by_1_mont_256 - ldr x30,[x29,#8] - - subs x14,x10,x5 - sbcs x15,x11,x6 - sbcs x16,x12,x7 - sbcs x17,x13,x8 - - csel x10,x10,x14,lo - csel x11,x11,x15,lo - csel x12,x12,x16,lo - csel x13,x13,x17,lo - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - - ldr x29,[sp],#16 - .inst 0xd50323bf - ret -.size from_mont_256,.-from_mont_256 - -.globl redc_mont_256 -.hidden redc_mont_256 -.type redc_mont_256,%function -.align 5 -redc_mont_256: - .inst 0xd503233f - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - - mov x4,x3 - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - - bl __mul_by_1_mont_256 - ldr x30,[x29,#8] - - ldp x14,x15,[x1,#32] - ldp x16,x17,[x1,#48] - - adds x10,x10,x14 - adcs x11,x11,x15 - adcs x12,x12,x16 - adcs x13,x13,x17 - adc x9,xzr,xzr - - subs x14,x10,x5 - sbcs x15,x11,x6 - sbcs x16,x12,x7 - sbcs x17,x13,x8 - sbcs xzr, x9,xzr - - csel x10,x10,x14,lo - csel x11,x11,x15,lo - csel x12,x12,x16,lo - csel x13,x13,x17,lo - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - - ldr x29,[sp],#16 - .inst 0xd50323bf - ret -.size redc_mont_256,.-redc_mont_256 - -.type __mul_by_1_mont_256,%function -.align 5 -__mul_by_1_mont_256: - mul x3,x4,x10 - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - //mul x14,x5,x3 - mul x15,x6,x3 - mul x16,x7,x3 - mul x17,x8,x3 - subs xzr,x10,#1 //adds x10,x10,x14 - umulh x14,x5,x3 - adcs x11,x11,x15 - umulh x15,x6,x3 - adcs x12,x12,x16 - umulh x16,x7,x3 - adcs x13,x13,x17 - umulh x17,x8,x3 - adc x9,xzr,xzr - - adds x10,x11,x14 - adcs x11,x12,x15 - adcs x12,x13,x16 - mul x3,x4,x10 - adc x13,x9,x17 - //mul x14,x5,x3 - mul x15,x6,x3 - mul x16,x7,x3 - mul x17,x8,x3 - subs xzr,x10,#1 //adds x10,x10,x14 - umulh x14,x5,x3 - adcs x11,x11,x15 - umulh x15,x6,x3 - adcs x12,x12,x16 - umulh x16,x7,x3 - adcs x13,x13,x17 - umulh x17,x8,x3 - adc x9,xzr,xzr - - adds x10,x11,x14 - adcs x11,x12,x15 - adcs x12,x13,x16 - mul x3,x4,x10 - adc x13,x9,x17 - //mul x14,x5,x3 - mul x15,x6,x3 - mul x16,x7,x3 - mul x17,x8,x3 - subs xzr,x10,#1 //adds x10,x10,x14 - umulh x14,x5,x3 - adcs x11,x11,x15 - umulh x15,x6,x3 - adcs x12,x12,x16 - umulh x16,x7,x3 - adcs x13,x13,x17 - umulh x17,x8,x3 - adc x9,xzr,xzr - - adds x10,x11,x14 - adcs x11,x12,x15 - adcs x12,x13,x16 - mul x3,x4,x10 - adc x13,x9,x17 - //mul x14,x5,x3 - mul x15,x6,x3 - mul x16,x7,x3 - mul x17,x8,x3 - subs xzr,x10,#1 //adds x10,x10,x14 - umulh x14,x5,x3 - adcs x11,x11,x15 - umulh x15,x6,x3 - adcs x12,x12,x16 - umulh x16,x7,x3 - adcs x13,x13,x17 - umulh x17,x8,x3 - adc x9,xzr,xzr - - adds x10,x11,x14 - adcs x11,x12,x15 - adcs x12,x13,x16 - adc x13,x9,x17 - - ret -.size __mul_by_1_mont_256,.-__mul_by_1_mont_256 diff --git a/crypto/blst_src/build/elf/mul_mont_384-armv8.S b/crypto/blst_src/build/elf/mul_mont_384-armv8.S deleted file mode 100644 index c048e816b85..00000000000 --- a/crypto/blst_src/build/elf/mul_mont_384-armv8.S +++ /dev/null @@ -1,2372 +0,0 @@ -.text - -.globl add_mod_384x384 -.type add_mod_384x384,%function -.align 5 -add_mod_384x384: - .inst 0xd503233f - stp x29,x30,[sp,#-64]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - bl __add_mod_384x384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 - .inst 0xd50323bf - ret -.size add_mod_384x384,.-add_mod_384x384 - -.type __add_mod_384x384,%function -.align 5 -__add_mod_384x384: - ldp x11, x12, [x1] - ldp x19,x20,[x2] - ldp x13, x14, [x1,#16] - adds x11,x11,x19 - ldp x21,x22,[x2,#16] - adcs x12,x12,x20 - ldp x15, x16, [x1,#32] - adcs x13,x13,x21 - ldp x23,x24,[x2,#32] - adcs x14,x14,x22 - stp x11, x12, [x0] - adcs x15,x15,x23 - ldp x11, x12, [x1,#48] - adcs x16,x16,x24 - - ldp x19,x20,[x2,#48] - stp x13, x14, [x0,#16] - ldp x13, x14, [x1,#64] - ldp x21,x22,[x2,#64] - - adcs x11,x11,x19 - stp x15, x16, [x0,#32] - adcs x12,x12,x20 - ldp x15, x16, [x1,#80] - adcs x13,x13,x21 - ldp x23,x24,[x2,#80] - adcs x14,x14,x22 - adcs x15,x15,x23 - adcs x16,x16,x24 - adc x17,xzr,xzr - - subs x19,x11,x5 - sbcs x20,x12,x6 - sbcs x21,x13,x7 - sbcs x22,x14,x8 - sbcs x23,x15,x9 - sbcs x24,x16,x10 - sbcs xzr,x17,xzr - - csel x11,x11,x19,lo - csel x12,x12,x20,lo - csel x13,x13,x21,lo - csel x14,x14,x22,lo - stp x11,x12,[x0,#48] - csel x15,x15,x23,lo - stp x13,x14,[x0,#64] - csel x16,x16,x24,lo - stp x15,x16,[x0,#80] - - ret -.size __add_mod_384x384,.-__add_mod_384x384 - -.globl sub_mod_384x384 -.type sub_mod_384x384,%function -.align 5 -sub_mod_384x384: - .inst 0xd503233f - stp x29,x30,[sp,#-64]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - bl __sub_mod_384x384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 - .inst 0xd50323bf - ret -.size sub_mod_384x384,.-sub_mod_384x384 - -.type __sub_mod_384x384,%function -.align 5 -__sub_mod_384x384: - ldp x11, x12, [x1] - ldp x19,x20,[x2] - ldp x13, x14, [x1,#16] - subs x11,x11,x19 - ldp x21,x22,[x2,#16] - sbcs x12,x12,x20 - ldp x15, x16, [x1,#32] - sbcs x13,x13,x21 - ldp x23,x24,[x2,#32] - sbcs x14,x14,x22 - stp x11, x12, [x0] - sbcs x15,x15,x23 - ldp x11, x12, [x1,#48] - sbcs x16,x16,x24 - - ldp x19,x20,[x2,#48] - stp x13, x14, [x0,#16] - ldp x13, x14, [x1,#64] - ldp x21,x22,[x2,#64] - - sbcs x11,x11,x19 - stp x15, x16, [x0,#32] - sbcs x12,x12,x20 - ldp x15, x16, [x1,#80] - sbcs x13,x13,x21 - ldp x23,x24,[x2,#80] - sbcs x14,x14,x22 - sbcs x15,x15,x23 - sbcs x16,x16,x24 - sbc x17,xzr,xzr - - and x19,x5,x17 - and x20,x6,x17 - adds x11,x11,x19 - and x21,x7,x17 - adcs x12,x12,x20 - and x22,x8,x17 - adcs x13,x13,x21 - and x23,x9,x17 - adcs x14,x14,x22 - and x24,x10,x17 - adcs x15,x15,x23 - stp x11,x12,[x0,#48] - adc x16,x16,x24 - stp x13,x14,[x0,#64] - stp x15,x16,[x0,#80] - - ret -.size __sub_mod_384x384,.-__sub_mod_384x384 - -.type __add_mod_384,%function -.align 5 -__add_mod_384: - ldp x11, x12, [x1] - ldp x19,x20,[x2] - ldp x13, x14, [x1,#16] - adds x11,x11,x19 - ldp x21,x22,[x2,#16] - adcs x12,x12,x20 - ldp x15, x16, [x1,#32] - adcs x13,x13,x21 - ldp x23,x24,[x2,#32] - adcs x14,x14,x22 - adcs x15,x15,x23 - adcs x16,x16,x24 - adc x17,xzr,xzr - - subs x19,x11,x5 - sbcs x20,x12,x6 - sbcs x21,x13,x7 - sbcs x22,x14,x8 - sbcs x23,x15,x9 - sbcs x24,x16,x10 - sbcs xzr,x17,xzr - - csel x11,x11,x19,lo - csel x12,x12,x20,lo - csel x13,x13,x21,lo - csel x14,x14,x22,lo - csel x15,x15,x23,lo - stp x11,x12,[x0] - csel x16,x16,x24,lo - stp x13,x14,[x0,#16] - stp x15,x16,[x0,#32] - - 
ret -.size __add_mod_384,.-__add_mod_384 - -.type __sub_mod_384,%function -.align 5 -__sub_mod_384: - ldp x11, x12, [x1] - ldp x19,x20,[x2] - ldp x13, x14, [x1,#16] - subs x11,x11,x19 - ldp x21,x22,[x2,#16] - sbcs x12,x12,x20 - ldp x15, x16, [x1,#32] - sbcs x13,x13,x21 - ldp x23,x24,[x2,#32] - sbcs x14,x14,x22 - sbcs x15,x15,x23 - sbcs x16,x16,x24 - sbc x17,xzr,xzr - - and x19,x5,x17 - and x20,x6,x17 - adds x11,x11,x19 - and x21,x7,x17 - adcs x12,x12,x20 - and x22,x8,x17 - adcs x13,x13,x21 - and x23,x9,x17 - adcs x14,x14,x22 - and x24,x10,x17 - adcs x15,x15,x23 - stp x11,x12,[x0] - adc x16,x16,x24 - stp x13,x14,[x0,#16] - stp x15,x16,[x0,#32] - - ret -.size __sub_mod_384,.-__sub_mod_384 - -.globl mul_mont_384x -.hidden mul_mont_384x -.type mul_mont_384x,%function -.align 5 -mul_mont_384x: - .inst 0xd503233f - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#288 // space for 3 768-bit vectors - - mov x26,x0 // save r_ptr - mov x27,x1 // save b_ptr - mov x28,x2 // save b_ptr - - sub x0,sp,#0 // mul_384(t0, a->re, b->re) - bl __mul_384 - - add x1,x1,#48 // mul_384(t1, a->im, b->im) - add x2,x2,#48 - add x0,sp,#96 - bl __mul_384 - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - sub x2,x1,#48 - add x0,sp,#240 - bl __add_mod_384 - - add x1,x28,#0 - add x2,x28,#48 - add x0,sp,#192 // t2 - bl __add_mod_384 - - add x1,x0,#0 - add x2,x0,#48 - bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - mov x1,x0 - add x2,sp,#0 - bl __sub_mod_384x384 - - add x2,sp,#96 - bl __sub_mod_384x384 // t2 = t2-t0-t1 - - add x1,sp,#0 - add x2,sp,#96 - add x0,sp,#0 - bl __sub_mod_384x384 // t0 = t0-t1 - - add x1,sp,#0 // ret->re = redc(t0) - add x0,x26,#0 - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - - add x1,sp,#192 // ret->im = redc(t2) - add x0,x0,#48 - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - ldr x30,[x29,#8] - - add sp,sp,#288 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - .inst 0xd50323bf - ret -.size mul_mont_384x,.-mul_mont_384x - -.globl sqr_mont_384x -.hidden sqr_mont_384x -.type sqr_mont_384x,%function -.align 5 -sqr_mont_384x: - .inst 0xd503233f - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x3,x0,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#96 // space for 2 384-bit vectors - mov x4,x3 // adjust for missing b_ptr - - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - ldp x9,x10,[x2,#32] - - add x2,x1,#48 - add x0,sp,#0 - bl __add_mod_384 // t0 = a->re + a->im - - add x0,sp,#48 - bl __sub_mod_384 // t1 = a->re - a->im - - ldp x11,x12,[x1] - ldr x17, [x2] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) - - adds x11,x11,x11 // add with itself - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adcs x16,x16,x16 - adc x25,xzr,xzr - - subs x19,x11,x5 - sbcs x20,x12,x6 - sbcs x21,x13,x7 - sbcs x22,x14,x8 - sbcs x23,x15,x9 - sbcs x24,x16,x10 - sbcs xzr,x25,xzr - - csel x19,x11,x19,lo - csel x20,x12,x20,lo - csel x21,x13,x21,lo - ldp x11,x12,[sp] - csel x22,x14,x22,lo - ldr x17, [sp,#48] - csel x23,x15,x23,lo - ldp x13,x14,[sp,#16] - csel x24,x16,x24,lo - ldp x15,x16,[sp,#32] - - stp x19,x20,[x2,#48] - stp x21,x22,[x2,#64] - stp x23,x24,[x2,#80] - - add x2,sp,#48 - bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) - ldr x30,[x29,#8] - - stp x11,x12,[x2] - stp x13,x14,[x2,#16] - stp x15,x16,[x2,#32] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - .inst 0xd50323bf - ret -.size sqr_mont_384x,.-sqr_mont_384x - -.globl mul_mont_384 -.hidden mul_mont_384 -.type mul_mont_384,%function -.align 5 -mul_mont_384: - .inst 0xd503233f - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x4,x0,[sp,#96] // __mul_mont_384 wants them there - - ldp x11,x12,[x1] - ldr x17, [x2] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - bl __mul_mont_384 - ldr x30,[x29,#8] - - stp x11,x12,[x2] - stp x13,x14,[x2,#16] - stp x15,x16,[x2,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - .inst 0xd50323bf - ret -.size mul_mont_384,.-mul_mont_384 - -.type __mul_mont_384,%function -.align 5 -__mul_mont_384: - mul x19,x11,x17 - mul x20,x12,x17 - mul x21,x13,x17 - mul x22,x14,x17 - mul x23,x15,x17 - mul x24,x16,x17 - mul x4,x4,x19 - - umulh x26,x11,x17 - umulh x27,x12,x17 - umulh x28,x13,x17 - umulh x0,x14,x17 - umulh x1,x15,x17 - umulh x3,x16,x17 - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,xzr, x3 - mul x3,x10,x4 - mov x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - adc x4,x17,xzr - ldr x17,[x2,8*1] - - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,x4,xzr - ldr x4,[x29,#96] - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh 
x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adcs x25,x25,xzr - adc x17,xzr,xzr - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adcs x25,x25,x3 - mul x3,x10,x4 - adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - adc x4,x17,xzr - ldr x17,[x2,8*2] - - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,x4,xzr - ldr x4,[x29,#96] - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adcs x25,x25,xzr - adc x17,xzr,xzr - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adcs x25,x25,x3 - mul x3,x10,x4 - adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - adc x4,x17,xzr - ldr x17,[x2,8*3] - - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,x4,xzr - ldr x4,[x29,#96] - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adcs x25,x25,xzr - adc x17,xzr,xzr - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adcs x25,x25,x3 - mul x3,x10,x4 - adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - adc x4,x17,xzr - ldr x17,[x2,8*4] - - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,x4,xzr - ldr x4,[x29,#96] - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adcs x25,x25,xzr - adc x17,xzr,xzr - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adcs x25,x25,x3 - mul x3,x10,x4 - adc 
x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - adc x4,x17,xzr - ldr x17,[x2,8*5] - - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,x4,xzr - ldr x4,[x29,#96] - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adcs x25,x25,xzr - adc x17,xzr,xzr - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adcs x25,x25,x3 - mul x3,x10,x4 - adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - ldp x4,x2,[x29,#96] // pull r_ptr - adc x17,x17,xzr - - adds x19,x20,x26 - adcs x20,x21,x27 - adcs x21,x22,x28 - adcs x22,x23,x0 - adcs x23,x24,x1 - adcs x24,x25,x3 - adc x25,x17,xzr - - subs x26,x19,x5 - sbcs x27,x20,x6 - sbcs x28,x21,x7 - sbcs x0,x22,x8 - sbcs x1,x23,x9 - sbcs x3,x24,x10 - sbcs xzr, x25,xzr - - csel x11,x19,x26,lo - csel x12,x20,x27,lo - csel x13,x21,x28,lo - csel x14,x22,x0,lo - csel x15,x23,x1,lo - csel x16,x24,x3,lo - ret -.size __mul_mont_384,.-__mul_mont_384 - -.globl sqr_mont_384 -.hidden sqr_mont_384 -.type sqr_mont_384,%function -.align 5 -sqr_mont_384: - .inst 0xd503233f - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#96 // space for 768-bit vector - mov x4,x3 // adjust for missing b_ptr - - mov x3,x0 // save r_ptr - mov x0,sp - - ldp x11,x12,[x1] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - bl __sqr_384 - - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - ldp x9,x10,[x2,#32] - - mov x1,sp - mov x0,x3 // restore r_ptr - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - ldr x30,[x29,#8] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - .inst 0xd50323bf - ret -.size sqr_mont_384,.-sqr_mont_384 - -.globl sqr_n_mul_mont_383 -.hidden sqr_n_mul_mont_383 -.type sqr_n_mul_mont_383,%function -.align 5 -sqr_n_mul_mont_383: - .inst 0xd503233f - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x4,x0,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#96 // space for 768-bit vector - mov x17,x5 // save b_ptr - - ldp x11,x12,[x1] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - mov x0,sp -.Loop_sqr_383: - bl __sqr_384 - sub x2,x2,#1 // counter - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - mov x1,sp - bl __mul_by_1_mont_384 - - ldp x19,x20,[x1,#48] - ldp x21,x22,[x1,#64] - ldp x23,x24,[x1,#80] - - adds x11,x11,x19 // just accumulate upper half - adcs x12,x12,x20 - adcs x13,x13,x21 - adcs x14,x14,x22 - adcs x15,x15,x23 - adc x16,x16,x24 - - cbnz x2,.Loop_sqr_383 - - mov x2,x17 - ldr x17,[x17] - bl __mul_mont_384 - ldr x30,[x29,#8] - - stp x11,x12,[x2] - stp x13,x14,[x2,#16] - stp x15,x16,[x2,#32] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - .inst 0xd50323bf - ret -.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 -.type __sqr_384,%function -.align 5 -__sqr_384: - mul x19,x12,x11 - mul x20,x13,x11 - mul x21,x14,x11 - mul x22,x15,x11 - mul x23,x16,x11 - - umulh x6,x12,x11 - umulh x7,x13,x11 - umulh x8,x14,x11 - umulh x9,x15,x11 - adds x20,x20,x6 - umulh x10,x16,x11 - adcs x21,x21,x7 - mul x7,x13,x12 - adcs x22,x22,x8 - mul x8,x14,x12 - adcs x23,x23,x9 - mul x9,x15,x12 - adc x24,xzr, x10 - mul x10,x16,x12 - - adds x21,x21,x7 - umulh x7,x13,x12 - adcs x22,x22,x8 - umulh x8,x14,x12 - adcs x23,x23,x9 - umulh x9,x15,x12 - adcs x24,x24,x10 - umulh x10,x16,x12 - adc x25,xzr,xzr - - mul x5,x11,x11 - adds x22,x22,x7 - umulh x11, x11,x11 - adcs x23,x23,x8 - mul x8,x14,x13 - adcs x24,x24,x9 - mul x9,x15,x13 - adc x25,x25,x10 - mul x10,x16,x13 - - adds x23,x23,x8 - umulh x8,x14,x13 - adcs x24,x24,x9 - umulh x9,x15,x13 - adcs x25,x25,x10 - umulh x10,x16,x13 - adc x26,xzr,xzr - - mul x6,x12,x12 - adds x24,x24,x8 - umulh x12, x12,x12 - adcs x25,x25,x9 - mul x9,x15,x14 - adc x26,x26,x10 - mul x10,x16,x14 - - adds x25,x25,x9 - umulh x9,x15,x14 - adcs x26,x26,x10 - umulh x10,x16,x14 - adc x27,xzr,xzr - mul x7,x13,x13 - adds x26,x26,x9 - umulh x13, x13,x13 - adc x27,x27,x10 - mul x8,x14,x14 - - mul x10,x16,x15 - umulh x14, x14,x14 - adds x27,x27,x10 - umulh x10,x16,x15 - mul x9,x15,x15 - adc x28,x10,xzr - - adds x19,x19,x19 - adcs x20,x20,x20 - adcs x21,x21,x21 - adcs x22,x22,x22 - adcs x23,x23,x23 - adcs x24,x24,x24 - adcs x25,x25,x25 - adcs x26,x26,x26 - umulh x15, x15,x15 - adcs x27,x27,x27 - mul x10,x16,x16 - adcs x28,x28,x28 - umulh x16, x16,x16 - adc x1,xzr,xzr - - adds x19,x19,x11 - adcs x20,x20,x6 - adcs x21,x21,x12 - adcs x22,x22,x7 - adcs x23,x23,x13 - adcs x24,x24,x8 - adcs x25,x25,x14 - stp x5,x19,[x0] - adcs x26,x26,x9 - stp x20,x21,[x0,#16] - adcs x27,x27,x15 - stp x22,x23,[x0,#32] - adcs x28,x28,x10 - stp x24,x25,[x0,#48] - adc x16,x16,x1 - stp x26,x27,[x0,#64] - stp x28,x16,[x0,#80] - - ret -.size __sqr_384,.-__sqr_384 -.globl sqr_384 -.hidden sqr_384 -.type sqr_384,%function -.align 5 -sqr_384: - .inst 0xd503233f - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - ldp x11,x12,[x1] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - bl __sqr_384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - .inst 0xd50323bf - ret -.size sqr_384,.-sqr_384 - -.globl redc_mont_384 -.hidden redc_mont_384 -.type redc_mont_384,%function -.align 5 -redc_mont_384: - .inst 0xd503233f - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - mov x4,x3 // adjust for missing b_ptr - - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - ldp x9,x10,[x2,#32] - - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - .inst 0xd50323bf - ret -.size redc_mont_384,.-redc_mont_384 - -.globl from_mont_384 -.hidden from_mont_384 -.type from_mont_384,%function -.align 5 -from_mont_384: - .inst 0xd503233f - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - mov x4,x3 // adjust for missing b_ptr - - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - ldp x9,x10,[x2,#32] - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - subs x19,x11,x5 - sbcs x20,x12,x6 - sbcs x21,x13,x7 - sbcs x22,x14,x8 - sbcs x23,x15,x9 - sbcs x24,x16,x10 - - csel x11,x11,x19,lo - csel x12,x12,x20,lo - csel x13,x13,x21,lo - csel x14,x14,x22,lo - csel x15,x15,x23,lo - csel x16,x16,x24,lo - - stp x11,x12,[x0] - stp x13,x14,[x0,#16] - stp x15,x16,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - .inst 0xd50323bf - ret -.size from_mont_384,.-from_mont_384 - -.type __mul_by_1_mont_384,%function -.align 5 -__mul_by_1_mont_384: - ldp x11,x12,[x1] - ldp x13,x14,[x1,#16] - mul x26,x4,x11 - ldp x15,x16,[x1,#32] - - // mul x19,x5,x26 - mul x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - mul x26,x4,x11 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - // mul x19,x5,x26 - mul x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - mul x26,x4,x11 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - // mul x19,x5,x26 - mul x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc 
x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - mul x26,x4,x11 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - // mul x19,x5,x26 - mul x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - mul x26,x4,x11 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - // mul x19,x5,x26 - mul x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - mul x26,x4,x11 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - // mul x19,x5,x26 - mul x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - ret -.size __mul_by_1_mont_384,.-__mul_by_1_mont_384 - -.type __redc_tail_mont_384,%function -.align 5 -__redc_tail_mont_384: - ldp x19,x20,[x1,#48] - ldp x21,x22,[x1,#64] - ldp x23,x24,[x1,#80] - - adds x11,x11,x19 // accumulate upper half - adcs x12,x12,x20 - adcs x13,x13,x21 - adcs x14,x14,x22 - adcs x15,x15,x23 - adcs x16,x16,x24 - adc x25,xzr,xzr - - subs x19,x11,x5 - sbcs x20,x12,x6 - sbcs x21,x13,x7 - sbcs x22,x14,x8 - sbcs x23,x15,x9 - sbcs x24,x16,x10 - sbcs xzr,x25,xzr - - csel x11,x11,x19,lo - csel x12,x12,x20,lo - csel x13,x13,x21,lo - csel x14,x14,x22,lo - csel x15,x15,x23,lo - csel x16,x16,x24,lo - - stp x11,x12,[x0] - stp x13,x14,[x0,#16] - stp x15,x16,[x0,#32] - - ret -.size __redc_tail_mont_384,.-__redc_tail_mont_384 - -.globl mul_384 -.hidden mul_384 -.type mul_384,%function -.align 5 -mul_384: - .inst 0xd503233f - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - bl __mul_384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - .inst 0xd50323bf - ret -.size mul_384,.-mul_384 - -.type __mul_384,%function -.align 5 -__mul_384: - ldp x11,x12,[x1] - ldr x17, [x2] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - mul x19,x11,x17 - mul x20,x12,x17 - mul x21,x13,x17 - mul x22,x14,x17 - mul x23,x15,x17 - mul x24,x16,x17 - - umulh x5,x11,x17 - umulh x6,x12,x17 - umulh x7,x13,x17 - umulh x8,x14,x17 - umulh x9,x15,x17 - umulh x10,x16,x17 - ldr x17,[x2,8*1] - - str x19,[x0] - adds x19,x20,x5 - mul x5,x11,x17 - adcs x20,x21,x6 - mul x6,x12,x17 - adcs x21,x22,x7 - mul x7,x13,x17 - adcs x22,x23,x8 - mul x8,x14,x17 - adcs x23,x24,x9 - mul x9,x15,x17 - adc x24,xzr, x10 - mul x10,x16,x17 - adds x19,x19,x5 - umulh x5,x11,x17 - adcs x20,x20,x6 - umulh x6,x12,x17 - adcs x21,x21,x7 - umulh x7,x13,x17 - adcs x22,x22,x8 - umulh x8,x14,x17 - adcs x23,x23,x9 - umulh x9,x15,x17 - adcs x24,x24,x10 - umulh x10,x16,x17 - ldr x17,[x2,#8*(1+1)] - adc x25,xzr,xzr - - str x19,[x0,8*1] - adds x19,x20,x5 - mul x5,x11,x17 - adcs x20,x21,x6 - mul x6,x12,x17 - adcs x21,x22,x7 - mul x7,x13,x17 - adcs x22,x23,x8 - mul x8,x14,x17 - adcs x23,x24,x9 - mul x9,x15,x17 - adc x24,x25,x10 - mul x10,x16,x17 - adds x19,x19,x5 - umulh x5,x11,x17 - adcs x20,x20,x6 - umulh x6,x12,x17 - adcs x21,x21,x7 - umulh x7,x13,x17 - adcs x22,x22,x8 - umulh x8,x14,x17 - adcs x23,x23,x9 - umulh x9,x15,x17 - adcs x24,x24,x10 - umulh x10,x16,x17 - ldr x17,[x2,#8*(2+1)] - adc x25,xzr,xzr - - str x19,[x0,8*2] - adds x19,x20,x5 - mul x5,x11,x17 - adcs x20,x21,x6 - mul x6,x12,x17 - adcs x21,x22,x7 - mul x7,x13,x17 - adcs x22,x23,x8 - mul x8,x14,x17 - adcs x23,x24,x9 - mul x9,x15,x17 - adc x24,x25,x10 - mul x10,x16,x17 - adds x19,x19,x5 - umulh x5,x11,x17 - adcs x20,x20,x6 - umulh x6,x12,x17 - adcs x21,x21,x7 - umulh x7,x13,x17 - adcs x22,x22,x8 - umulh x8,x14,x17 - adcs x23,x23,x9 - umulh x9,x15,x17 - adcs x24,x24,x10 - umulh x10,x16,x17 - ldr x17,[x2,#8*(3+1)] - adc x25,xzr,xzr - - str x19,[x0,8*3] - adds x19,x20,x5 - mul x5,x11,x17 - adcs x20,x21,x6 - mul x6,x12,x17 - adcs x21,x22,x7 - mul x7,x13,x17 - adcs x22,x23,x8 - mul x8,x14,x17 - adcs x23,x24,x9 - mul x9,x15,x17 - adc x24,x25,x10 - mul x10,x16,x17 - adds x19,x19,x5 - umulh x5,x11,x17 - adcs x20,x20,x6 - umulh x6,x12,x17 - adcs x21,x21,x7 - umulh x7,x13,x17 - adcs x22,x22,x8 - umulh x8,x14,x17 - adcs x23,x23,x9 - umulh x9,x15,x17 - adcs x24,x24,x10 - umulh x10,x16,x17 - ldr x17,[x2,#8*(4+1)] - adc x25,xzr,xzr - - str x19,[x0,8*4] - adds x19,x20,x5 - mul x5,x11,x17 - adcs x20,x21,x6 - mul x6,x12,x17 - adcs x21,x22,x7 - mul x7,x13,x17 - adcs x22,x23,x8 - mul x8,x14,x17 - adcs x23,x24,x9 - mul x9,x15,x17 - adc x24,x25,x10 - mul x10,x16,x17 - adds x19,x19,x5 - umulh x5,x11,x17 - adcs x20,x20,x6 - umulh x6,x12,x17 - adcs x21,x21,x7 - umulh x7,x13,x17 - adcs x22,x22,x8 - umulh x8,x14,x17 - adcs x23,x23,x9 - umulh x9,x15,x17 - adcs x24,x24,x10 - umulh x10,x16,x17 - adc x25,xzr,xzr - - str x19,[x0,8*5] - adds x19,x20,x5 - adcs x20,x21,x6 - adcs x21,x22,x7 - adcs x22,x23,x8 - adcs x23,x24,x9 - adc x24,x25,x10 - - stp x19,x20,[x0,#48] - stp x21,x22,[x0,#64] - stp x23,x24,[x0,#80] - - ret -.size __mul_384,.-__mul_384 - -.globl mul_382x -.hidden mul_382x -.type mul_382x,%function -.align 5 -mul_382x: - .inst 0xd503233f - stp 
x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#96 // space for two 384-bit vectors - - ldp x11,x12,[x1] - mov x26,x0 // save r_ptr - ldp x19,x20,[x1,#48] - mov x27,x1 // save a_ptr - ldp x13,x14,[x1,#16] - mov x28,x2 // save b_ptr - ldp x21,x22,[x1,#64] - ldp x15,x16,[x1,#32] - adds x5,x11,x19 // t0 = a->re + a->im - ldp x23,x24,[x1,#80] - adcs x6,x12,x20 - ldp x11,x12,[x2] - adcs x7,x13,x21 - ldp x19,x20,[x2,#48] - adcs x8,x14,x22 - ldp x13,x14,[x2,#16] - adcs x9,x15,x23 - ldp x21,x22,[x2,#64] - adc x10,x16,x24 - ldp x15,x16,[x2,#32] - - stp x5,x6,[sp] - adds x5,x11,x19 // t1 = b->re + b->im - ldp x23,x24,[x2,#80] - adcs x6,x12,x20 - stp x7,x8,[sp,#16] - adcs x7,x13,x21 - adcs x8,x14,x22 - stp x9,x10,[sp,#32] - adcs x9,x15,x23 - stp x5,x6,[sp,#48] - adc x10,x16,x24 - stp x7,x8,[sp,#64] - stp x9,x10,[sp,#80] - - bl __mul_384 // mul_384(ret->re, a->re, b->re) - - add x1,sp,#0 // mul_384(ret->im, t0, t1) - add x2,sp,#48 - add x0,x26,#96 - bl __mul_384 - - add x1,x27,#48 // mul_384(tx, a->im, b->im) - add x2,x28,#48 - add x0,sp,#0 - bl __mul_384 - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - add x1,x26,#96 // ret->im -= tx - add x2,sp,#0 - add x0,x26,#96 - bl __sub_mod_384x384 - - add x2,x26,#0 // ret->im -= ret->re - bl __sub_mod_384x384 - - add x1,x26,#0 // ret->re -= tx - add x2,sp,#0 - add x0,x26,#0 - bl __sub_mod_384x384 - ldr x30,[x29,#8] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - .inst 0xd50323bf - ret -.size mul_382x,.-mul_382x - -.globl sqr_382x -.hidden sqr_382x -.type sqr_382x,%function -.align 5 -sqr_382x: - .inst 0xd503233f - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - ldp x11,x12,[x1] - ldp x19,x20,[x1,#48] - ldp x13,x14,[x1,#16] - adds x5,x11,x19 // t0 = a->re + a->im - ldp x21,x22,[x1,#64] - adcs x6,x12,x20 - ldp x15,x16,[x1,#32] - adcs x7,x13,x21 - ldp x23,x24,[x1,#80] - adcs x8,x14,x22 - stp x5,x6,[x0] - adcs x9,x15,x23 - ldp x5,x6,[x2] - adc x10,x16,x24 - stp x7,x8,[x0,#16] - - subs x11,x11,x19 // t1 = a->re - a->im - ldp x7,x8,[x2,#16] - sbcs x12,x12,x20 - stp x9,x10,[x0,#32] - sbcs x13,x13,x21 - ldp x9,x10,[x2,#32] - sbcs x14,x14,x22 - sbcs x15,x15,x23 - sbcs x16,x16,x24 - sbc x25,xzr,xzr - - and x19,x5,x25 - and x20,x6,x25 - adds x11,x11,x19 - and x21,x7,x25 - adcs x12,x12,x20 - and x22,x8,x25 - adcs x13,x13,x21 - and x23,x9,x25 - adcs x14,x14,x22 - and x24,x10,x25 - adcs x15,x15,x23 - stp x11,x12,[x0,#48] - adc x16,x16,x24 - stp x13,x14,[x0,#64] - stp x15,x16,[x0,#80] - - mov x4,x1 // save a_ptr - add x1,x0,#0 // mul_384(ret->re, t0, t1) - add x2,x0,#48 - bl __mul_384 - - add x1,x4,#0 // mul_384(ret->im, a->re, a->im) - add x2,x4,#48 - add x0,x0,#96 - bl __mul_384 - ldr x30,[x29,#8] - - ldp x11,x12,[x0] - ldp x13,x14,[x0,#16] - adds x11,x11,x11 // add with itself - ldp x15,x16,[x0,#32] - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adcs x16,x16,x16 - adcs x19,x19,x19 - adcs x20,x20,x20 - stp x11,x12,[x0] - adcs x21,x21,x21 - stp x13,x14,[x0,#16] - adcs x22,x22,x22 - stp x15,x16,[x0,#32] - adcs x23,x23,x23 - stp x19,x20,[x0,#48] - adc x24,x24,x24 - stp x21,x22,[x0,#64] - stp x23,x24,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - .inst 0xd50323bf - ret -.size sqr_382x,.-sqr_382x - -.globl sqr_mont_382x -.hidden sqr_mont_382x -.type sqr_mont_382x,%function -.align 5 -sqr_mont_382x: - .inst 0xd503233f - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x3,x0,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#112 // space for two 384-bit vectors + word - mov x4,x3 // adjust for missing b_ptr - - ldp x11,x12,[x1] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - ldp x17,x20,[x1,#48] - ldp x21,x22,[x1,#64] - ldp x23,x24,[x1,#80] - - adds x5,x11,x17 // t0 = a->re + a->im - adcs x6,x12,x20 - adcs x7,x13,x21 - adcs x8,x14,x22 - adcs x9,x15,x23 - adc x10,x16,x24 - - subs x19,x11,x17 // t1 = a->re - a->im - sbcs x20,x12,x20 - sbcs x21,x13,x21 - sbcs x22,x14,x22 - sbcs x23,x15,x23 - sbcs x24,x16,x24 - sbc x25,xzr,xzr // borrow flag as mask - - stp x5,x6,[sp] - stp x7,x8,[sp,#16] - stp x9,x10,[sp,#32] - stp x19,x20,[sp,#48] - stp x21,x22,[sp,#64] - stp x23,x24,[sp,#80] - str x25,[sp,#96] - - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - ldp x9,x10,[x2,#32] - - add x2,x1,#48 - bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) - - adds x19,x11,x11 // add with itself - adcs x20,x12,x12 - adcs x21,x13,x13 - adcs x22,x14,x14 - adcs x23,x15,x15 - adc x24,x16,x16 - - stp x19,x20,[x2,#48] - stp x21,x22,[x2,#64] - stp x23,x24,[x2,#80] - - ldp x11,x12,[sp] - ldr x17,[sp,#48] - ldp x13,x14,[sp,#16] - ldp x15,x16,[sp,#32] - - add x2,sp,#48 - bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) - ldr x30,[x29,#8] - - ldr x25,[sp,#96] // account for sign from a->re - a->im - ldp x19,x20,[sp] - ldp x21,x22,[sp,#16] - ldp x23,x24,[sp,#32] - - and x19,x19,x25 - and x20,x20,x25 - and x21,x21,x25 - and x22,x22,x25 - and x23,x23,x25 - and x24,x24,x25 - - subs x11,x11,x19 - sbcs x12,x12,x20 - sbcs x13,x13,x21 - sbcs x14,x14,x22 - sbcs x15,x15,x23 - sbcs x16,x16,x24 - sbc x25,xzr,xzr - - and x19,x5,x25 - and x20,x6,x25 - and x21,x7,x25 - and x22,x8,x25 - and x23,x9,x25 - and x24,x10,x25 - - adds x11,x11,x19 - adcs x12,x12,x20 - adcs x13,x13,x21 - adcs x14,x14,x22 - adcs x15,x15,x23 - adc x16,x16,x24 - - stp x11,x12,[x2] - stp x13,x14,[x2,#16] - stp x15,x16,[x2,#32] - - add sp,sp,#112 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - .inst 0xd50323bf - ret -.size sqr_mont_382x,.-sqr_mont_382x - -.type __mul_mont_383_nonred,%function -.align 5 -__mul_mont_383_nonred: - mul x19,x11,x17 - mul x20,x12,x17 - mul x21,x13,x17 - mul x22,x14,x17 - mul x23,x15,x17 - mul x24,x16,x17 - mul x4,x4,x19 - - umulh x26,x11,x17 - umulh x27,x12,x17 - umulh x28,x13,x17 - umulh x0,x14,x17 - umulh x1,x15,x17 - umulh x3,x16,x17 - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,xzr, x3 - mul x3,x10,x4 - ldr x17,[x2,8*1] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 
- umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - ldr x17,[x2,8*2] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - ldr x17,[x2,8*3] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - ldr x17,[x2,8*4] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - ldr x17,[x2,8*5] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul 
x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - ldp x4,x2,[x29,#96] // pull r_ptr - - adds x11,x20,x26 - adcs x12,x21,x27 - adcs x13,x22,x28 - adcs x14,x23,x0 - adcs x15,x24,x1 - adcs x16,x25,x3 - - ret -.size __mul_mont_383_nonred,.-__mul_mont_383_nonred - -.globl sgn0_pty_mont_384 -.hidden sgn0_pty_mont_384 -.type sgn0_pty_mont_384,%function -.align 5 -sgn0_pty_mont_384: - .inst 0xd503233f - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - mov x4,x2 - ldp x5,x6,[x1] - ldp x7,x8,[x1,#16] - ldp x9,x10,[x1,#32] - mov x1,x0 - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - and x0,x11,#1 - adds x11,x11,x11 - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adcs x16,x16,x16 - adc x17,xzr,xzr - - subs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbcs x16,x16,x10 - sbc x17,x17,xzr - - mvn x17,x17 - and x17,x17,#2 - orr x0,x0,x17 - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - .inst 0xd50323bf - ret -.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 - -.globl sgn0_pty_mont_384x -.hidden sgn0_pty_mont_384x -.type sgn0_pty_mont_384x,%function -.align 5 -sgn0_pty_mont_384x: - .inst 0xd503233f - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - mov x4,x2 - ldp x5,x6,[x1] - ldp x7,x8,[x1,#16] - ldp x9,x10,[x1,#32] - mov x1,x0 - - bl __mul_by_1_mont_384 - add x1,x1,#48 - - and x2,x11,#1 - orr x3,x11,x12 - adds x11,x11,x11 - orr x3,x3,x13 - adcs x12,x12,x12 - orr x3,x3,x14 - adcs x13,x13,x13 - orr x3,x3,x15 - adcs x14,x14,x14 - orr x3,x3,x16 - adcs x15,x15,x15 - adcs x16,x16,x16 - adc x17,xzr,xzr - - subs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbcs x16,x16,x10 - sbc x17,x17,xzr - - mvn x17,x17 - and x17,x17,#2 - orr x2,x2,x17 - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - and x0,x11,#1 - orr x1,x11,x12 - adds x11,x11,x11 - orr x1,x1,x13 - adcs x12,x12,x12 - orr x1,x1,x14 - adcs x13,x13,x13 - orr x1,x1,x15 - adcs x14,x14,x14 - orr x1,x1,x16 - adcs x15,x15,x15 - adcs x16,x16,x16 - adc x17,xzr,xzr - - subs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbcs x16,x16,x10 - sbc x17,x17,xzr - - mvn x17,x17 - and x17,x17,#2 - orr x0,x0,x17 - - cmp x3,#0 - csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) - - cmp x1,#0 - csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) - - and x3,x3,#1 - and x1,x1,#2 - orr x0,x1,x3 // pack sign and parity - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - .inst 0xd50323bf - ret -.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x diff --git a/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s b/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s deleted file mode 100644 index 10b1b56cb50..00000000000 --- a/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s +++ /dev/null @@ -1,731 +0,0 @@ -.comm __blst_platform_cap,4 -.text - -.globl mul_mont_sparse_256 -.hidden mul_mont_sparse_256 -.type mul_mont_sparse_256,@function -.align 32 -mul_mont_sparse_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,__blst_platform_cap(%rip) - jnz mul_mont_sparse_256$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rdi -.cfi_adjust_cfa_offset 8 - - - movq 0(%rdx),%rax - movq 0(%rsi),%r13 - movq 8(%rsi),%r14 - movq 16(%rsi),%r12 - movq 24(%rsi),%rbp - movq %rdx,%rbx - - movq %rax,%r15 - mulq %r13 - movq %rax,%r9 - movq %r15,%rax - movq %rdx,%r10 - call __mulq_mont_sparse_256 - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size mul_mont_sparse_256,.-mul_mont_sparse_256 - -.globl sqr_mont_sparse_256 -.hidden sqr_mont_sparse_256 -.type sqr_mont_sparse_256,@function -.align 32 -sqr_mont_sparse_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,__blst_platform_cap(%rip) - jnz sqr_mont_sparse_256$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rdi -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%rax - movq %rcx,%r8 - movq 8(%rsi),%r14 - movq %rdx,%rcx - movq 16(%rsi),%r12 - leaq (%rsi),%rbx - movq 24(%rsi),%rbp - - movq %rax,%r15 - mulq %rax - movq %rax,%r9 - movq %r15,%rax - movq %rdx,%r10 - call __mulq_mont_sparse_256 - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 -.type __mulq_mont_sparse_256,@function -.align 32 -__mulq_mont_sparse_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - mulq %r14 - addq %rax,%r10 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq %r12 - addq %rax,%r11 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r12 - - mulq %rbp - addq %rax,%r12 - movq 8(%rbx),%rax - adcq $0,%rdx - xorq 
%r14,%r14 - movq %rdx,%r13 - - movq %r9,%rdi - imulq %r8,%r9 - - - movq %rax,%r15 - mulq 0(%rsi) - addq %rax,%r10 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 8(%rsi) - addq %rax,%r11 - movq %r15,%rax - adcq $0,%rdx - addq %rbp,%r11 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rsi) - addq %rax,%r12 - movq %r15,%rax - adcq $0,%rdx - addq %rbp,%r12 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rsi) - addq %rax,%r13 - movq %r9,%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq %rdx,%r14 - xorq %r15,%r15 - - - mulq 0(%rcx) - addq %rax,%rdi - movq %r9,%rax - adcq %rdx,%rdi - - mulq 8(%rcx) - addq %rax,%r10 - movq %r9,%rax - adcq $0,%rdx - addq %rdi,%r10 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r11 - movq %r9,%rax - adcq $0,%rdx - addq %rbp,%r11 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - addq %rax,%r12 - movq 16(%rbx),%rax - adcq $0,%rdx - addq %rbp,%r12 - adcq $0,%rdx - addq %rdx,%r13 - adcq $0,%r14 - adcq $0,%r15 - movq %r10,%rdi - imulq %r8,%r10 - - - movq %rax,%r9 - mulq 0(%rsi) - addq %rax,%r11 - movq %r9,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 8(%rsi) - addq %rax,%r12 - movq %r9,%rax - adcq $0,%rdx - addq %rbp,%r12 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rsi) - addq %rax,%r13 - movq %r9,%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rsi) - addq %rax,%r14 - movq %r10,%rax - adcq $0,%rdx - addq %rbp,%r14 - adcq %rdx,%r15 - xorq %r9,%r9 - - - mulq 0(%rcx) - addq %rax,%rdi - movq %r10,%rax - adcq %rdx,%rdi - - mulq 8(%rcx) - addq %rax,%r11 - movq %r10,%rax - adcq $0,%rdx - addq %rdi,%r11 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r12 - movq %r10,%rax - adcq $0,%rdx - addq %rbp,%r12 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - addq %rax,%r13 - movq 24(%rbx),%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq $0,%rdx - addq %rdx,%r14 - adcq $0,%r15 - adcq $0,%r9 - movq %r11,%rdi - imulq %r8,%r11 - - - movq %rax,%r10 - mulq 0(%rsi) - addq %rax,%r12 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 8(%rsi) - addq %rax,%r13 - movq %r10,%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rsi) - addq %rax,%r14 - movq %r10,%rax - adcq $0,%rdx - addq %rbp,%r14 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rsi) - addq %rax,%r15 - movq %r11,%rax - adcq $0,%rdx - addq %rbp,%r15 - adcq %rdx,%r9 - xorq %r10,%r10 - - - mulq 0(%rcx) - addq %rax,%rdi - movq %r11,%rax - adcq %rdx,%rdi - - mulq 8(%rcx) - addq %rax,%r12 - movq %r11,%rax - adcq $0,%rdx - addq %rdi,%r12 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r13 - movq %r11,%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - addq %rax,%r14 - movq %r12,%rax - adcq $0,%rdx - addq %rbp,%r14 - adcq $0,%rdx - addq %rdx,%r15 - adcq $0,%r9 - adcq $0,%r10 - imulq %r8,%rax - movq 8(%rsp),%rsi - - - movq %rax,%r11 - mulq 0(%rcx) - addq %rax,%r12 - movq %r11,%rax - adcq %rdx,%r12 - - mulq 8(%rcx) - addq %rax,%r13 - movq %r11,%rax - adcq $0,%rdx - addq %r12,%r13 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r14 - movq %r11,%rax - adcq $0,%rdx - addq %rbp,%r14 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - movq %r14,%rbx - addq %rbp,%r15 - adcq $0,%rdx - addq %rax,%r15 - movq %r13,%rax - adcq $0,%rdx - addq %rdx,%r9 - adcq $0,%r10 - - - - - movq %r15,%r12 - subq 0(%rcx),%r13 - sbbq 8(%rcx),%r14 - sbbq 16(%rcx),%r15 - movq %r9,%rbp - sbbq 24(%rcx),%r9 - sbbq $0,%r10 - - cmovcq %rax,%r13 - cmovcq %rbx,%r14 - cmovcq %r12,%r15 - movq %r13,0(%rsi) - cmovcq %rbp,%r9 
- movq %r14,8(%rsi) - movq %r15,16(%rsi) - movq %r9,24(%rsi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 -.globl from_mont_256 -.hidden from_mont_256 -.type from_mont_256,@function -.align 32 -from_mont_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,__blst_platform_cap(%rip) - jnz from_mont_256$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq %rdx,%rbx - call __mulq_by_1_mont_256 - - - - - - movq %r14,%r10 - movq %r15,%r11 - movq %r9,%r12 - - subq 0(%rbx),%r13 - sbbq 8(%rbx),%r14 - sbbq 16(%rbx),%r15 - sbbq 24(%rbx),%r9 - - cmovncq %r13,%rax - cmovncq %r14,%r10 - cmovncq %r15,%r11 - movq %rax,0(%rdi) - cmovncq %r9,%r12 - movq %r10,8(%rdi) - movq %r11,16(%rdi) - movq %r12,24(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size from_mont_256,.-from_mont_256 - -.globl redc_mont_256 -.hidden redc_mont_256 -.type redc_mont_256,@function -.align 32 -redc_mont_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,__blst_platform_cap(%rip) - jnz redc_mont_256$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq %rdx,%rbx - call __mulq_by_1_mont_256 - - addq 32(%rsi),%r13 - adcq 40(%rsi),%r14 - movq %r13,%rax - adcq 48(%rsi),%r15 - movq %r14,%r10 - adcq 56(%rsi),%r9 - sbbq %rsi,%rsi - - - - - movq %r15,%r11 - subq 0(%rbx),%r13 - sbbq 8(%rbx),%r14 - sbbq 16(%rbx),%r15 - movq %r9,%r12 - sbbq 24(%rbx),%r9 - sbbq $0,%rsi - - cmovncq %r13,%rax - cmovncq %r14,%r10 - cmovncq %r15,%r11 - movq %rax,0(%rdi) - cmovncq %r9,%r12 - movq %r10,8(%rdi) - movq %r11,16(%rdi) - movq %r12,24(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size redc_mont_256,.-redc_mont_256 -.type __mulq_by_1_mont_256,@function -.align 32 -__mulq_by_1_mont_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%rax - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - - movq %rax,%r13 - imulq %rcx,%rax - movq %rax,%r9 - - mulq 0(%rbx) - addq %rax,%r13 - movq %r9,%rax - adcq %rdx,%r13 - - mulq 8(%rbx) - addq %rax,%r10 - movq %r9,%rax - adcq $0,%rdx - addq %r13,%r10 - adcq $0,%rdx - movq %rdx,%r13 - - mulq 16(%rbx) - movq %r10,%r14 - imulq %rcx,%r10 - addq %rax,%r11 - movq %r9,%rax - adcq $0,%rdx - 
addq %r13,%r11 - adcq $0,%rdx - movq %rdx,%r13 - - mulq 24(%rbx) - addq %rax,%r12 - movq %r10,%rax - adcq $0,%rdx - addq %r13,%r12 - adcq $0,%rdx - movq %rdx,%r13 - - mulq 0(%rbx) - addq %rax,%r14 - movq %r10,%rax - adcq %rdx,%r14 - - mulq 8(%rbx) - addq %rax,%r11 - movq %r10,%rax - adcq $0,%rdx - addq %r14,%r11 - adcq $0,%rdx - movq %rdx,%r14 - - mulq 16(%rbx) - movq %r11,%r15 - imulq %rcx,%r11 - addq %rax,%r12 - movq %r10,%rax - adcq $0,%rdx - addq %r14,%r12 - adcq $0,%rdx - movq %rdx,%r14 - - mulq 24(%rbx) - addq %rax,%r13 - movq %r11,%rax - adcq $0,%rdx - addq %r14,%r13 - adcq $0,%rdx - movq %rdx,%r14 - - mulq 0(%rbx) - addq %rax,%r15 - movq %r11,%rax - adcq %rdx,%r15 - - mulq 8(%rbx) - addq %rax,%r12 - movq %r11,%rax - adcq $0,%rdx - addq %r15,%r12 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 16(%rbx) - movq %r12,%r9 - imulq %rcx,%r12 - addq %rax,%r13 - movq %r11,%rax - adcq $0,%rdx - addq %r15,%r13 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 24(%rbx) - addq %rax,%r14 - movq %r12,%rax - adcq $0,%rdx - addq %r15,%r14 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 0(%rbx) - addq %rax,%r9 - movq %r12,%rax - adcq %rdx,%r9 - - mulq 8(%rbx) - addq %rax,%r13 - movq %r12,%rax - adcq $0,%rdx - addq %r9,%r13 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 16(%rbx) - addq %rax,%r14 - movq %r12,%rax - adcq $0,%rdx - addq %r9,%r14 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 24(%rbx) - addq %rax,%r15 - movq %r13,%rax - adcq $0,%rdx - addq %r9,%r15 - adcq $0,%rdx - movq %rdx,%r9 - .byte 0xf3,0xc3 -.cfi_endproc -.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 - -.section .note.GNU-stack,"",@progbits -.section .note.gnu.property,"a",@note - .long 4,2f-1f,5 - .byte 0x47,0x4E,0x55,0 -1: .long 0xc0000002,4,3 -.align 8 -2: diff --git a/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s b/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s deleted file mode 100644 index 903ba23b12c..00000000000 --- a/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s +++ /dev/null @@ -1,3681 +0,0 @@ -.comm __blst_platform_cap,4 -.text - - - - - - - -.type __subq_mod_384x384,@function -.align 32 -__subq_mod_384x384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq 48(%rsi),%r14 - - subq 0(%rdx),%r8 - movq 56(%rsi),%r15 - sbbq 8(%rdx),%r9 - movq 64(%rsi),%rax - sbbq 16(%rdx),%r10 - movq 72(%rsi),%rbx - sbbq 24(%rdx),%r11 - movq 80(%rsi),%rbp - sbbq 32(%rdx),%r12 - movq 88(%rsi),%rsi - sbbq 40(%rdx),%r13 - movq %r8,0(%rdi) - sbbq 48(%rdx),%r14 - movq 0(%rcx),%r8 - movq %r9,8(%rdi) - sbbq 56(%rdx),%r15 - movq 8(%rcx),%r9 - movq %r10,16(%rdi) - sbbq 64(%rdx),%rax - movq 16(%rcx),%r10 - movq %r11,24(%rdi) - sbbq 72(%rdx),%rbx - movq 24(%rcx),%r11 - movq %r12,32(%rdi) - sbbq 80(%rdx),%rbp - movq 32(%rcx),%r12 - movq %r13,40(%rdi) - sbbq 88(%rdx),%rsi - movq 40(%rcx),%r13 - sbbq %rdx,%rdx - - andq %rdx,%r8 - andq %rdx,%r9 - andq %rdx,%r10 - andq %rdx,%r11 - andq %rdx,%r12 - andq %rdx,%r13 - - addq %r8,%r14 - adcq %r9,%r15 - movq %r14,48(%rdi) - adcq %r10,%rax - movq %r15,56(%rdi) - adcq %r11,%rbx - movq %rax,64(%rdi) - adcq %r12,%rbp - movq %rbx,72(%rdi) - adcq %r13,%rsi - movq %rbp,80(%rdi) - movq %rsi,88(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __subq_mod_384x384,.-__subq_mod_384x384 - -.type __addq_mod_384,@function -.align 32 -__addq_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - addq 
-[... remainder of the deleted generated x86-64 assembly in crypto/blst_src/build/elf/mul_mont_384-x86_64.s, flattened in extraction, elided: the span holds the 384-bit Montgomery arithmetic routines __addq_mod_384, __subq_mod_384, mul_mont_384x, sqr_mont_384x, mul_382x, sqr_382x, mul_384, __mulq_384, sqr_384, __sqrq_384, sqr_mont_384, redc_mont_384, from_mont_384, __mulq_by_1_mont_384, __redq_tail_mont_384, sgn0_pty_mont_384, sgn0_pty_mont_384x, mul_mont_384, __mulq_mont_384, sqr_n_mul_mont_384, sqr_n_mul_mont_383, __mulq_mont_383_nonred, and sqr_mont_382x ...]
-	leaq	48(%r8),%rsp
-.cfi_adjust_cfa_offset	-136-8*6
-
-	.byte	0xf3,0xc3
-.cfi_endproc
-.size	sqr_mont_382x,.-sqr_mont_382x
-
-.section	.note.GNU-stack,"",@progbits
-.section	.note.gnu.property,"a",@note
-	.long	4,2f-1f,5
-	.byte	0x47,0x4E,0x55,0
-1:	.long	0xc0000002,4,3
-.align	8
-2:
diff --git a/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s b/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s
deleted file mode 100644
index 42e89134cff..00000000000
--- a/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s
+++ /dev/null
@@ -1,631 +0,0 @@
-[... 631 deleted lines of generated x86-64 assembly elided: mulx_mont_sparse_256, sqrx_mont_sparse_256, __mulx_mont_sparse_256, fromx_mont_256, redcx_mont_256, and __mulx_by_1_mont_256 ...]
diff --git a/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s b/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s
deleted file mode 100644
index 5c67d918d22..00000000000
--- a/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s
+++ /dev/null
@@ -1,2983 +0,0 @@
-[... 2983 deleted lines of generated x86-64 assembly (MULX/ADX code path) elided: __subx_mod_384x384, __addx_mod_384, __subx_mod_384,
40(%rcx),%rsi - sbbq %rdx,%rdx - - andq %rdx,%r14 - andq %rdx,%r15 - andq %rdx,%rax - andq %rdx,%rbx - andq %rdx,%rbp - andq %rdx,%rsi - - addq %r14,%r8 - adcq %r15,%r9 - movq %r8,0(%rdi) - adcq %rax,%r10 - movq %r9,8(%rdi) - adcq %rbx,%r11 - movq %r10,16(%rdi) - adcq %rbp,%r12 - movq %r11,24(%rdi) - adcq %rsi,%r13 - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __subx_mod_384,.-__subx_mod_384 -.globl mulx_mont_384x -.hidden mulx_mont_384x -.type mulx_mont_384x,@function -.align 32 -mulx_mont_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -mul_mont_384x$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $328,%rsp -.cfi_adjust_cfa_offset 328 - - - movq %rdx,%rbx - movq %rdi,32(%rsp) - movq %rsi,24(%rsp) - movq %rdx,16(%rsp) - movq %rcx,8(%rsp) - movq %r8,0(%rsp) - - - - - leaq 40(%rsp),%rdi - call __mulx_384 - - - leaq 48(%rbx),%rbx - leaq 128+48(%rsi),%rsi - leaq 96(%rdi),%rdi - call __mulx_384 - - - movq 8(%rsp),%rcx - leaq (%rbx),%rsi - leaq -48(%rbx),%rdx - leaq 40+192+48(%rsp),%rdi - call __addx_mod_384 - - movq 24(%rsp),%rsi - leaq 48(%rsi),%rdx - leaq -48(%rdi),%rdi - call __addx_mod_384 - - leaq (%rdi),%rbx - leaq 48(%rdi),%rsi - call __mulx_384 - - - leaq (%rdi),%rsi - leaq 40(%rsp),%rdx - movq 8(%rsp),%rcx - call __subx_mod_384x384 - - leaq (%rdi),%rsi - leaq -96(%rdi),%rdx - call __subx_mod_384x384 - - - leaq 40(%rsp),%rsi - leaq 40+96(%rsp),%rdx - leaq 40(%rsp),%rdi - call __subx_mod_384x384 - - leaq (%rcx),%rbx - - - leaq 40(%rsp),%rsi - movq 0(%rsp),%rcx - movq 32(%rsp),%rdi - call __mulx_by_1_mont_384 - call __redx_tail_mont_384 - - - leaq 40+192(%rsp),%rsi - movq 0(%rsp),%rcx - leaq 48(%rdi),%rdi - call __mulx_by_1_mont_384 - call __redx_tail_mont_384 - - leaq 328(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -328-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc -.size mulx_mont_384x,.-mulx_mont_384x -.globl sqrx_mont_384x -.hidden sqrx_mont_384x -.type sqrx_mont_384x,@function -.align 32 -sqrx_mont_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -sqr_mont_384x$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $136,%rsp -.cfi_adjust_cfa_offset 136 - - - movq %rcx,0(%rsp) - movq %rdx,%rcx - - movq %rdi,16(%rsp) - movq %rsi,24(%rsp) - - - leaq 48(%rsi),%rdx - leaq 32(%rsp),%rdi - call __addx_mod_384 - - - movq 24(%rsp),%rsi - leaq 48(%rsi),%rdx - leaq 32+48(%rsp),%rdi - call __subx_mod_384 - - - movq 24(%rsp),%rsi - leaq 48(%rsi),%rbx - - movq 48(%rsi),%rdx - movq 0(%rsi),%r14 - movq 8(%rsi),%r15 - movq 16(%rsi),%rax - movq 24(%rsi),%r12 - movq 32(%rsi),%rdi - movq 40(%rsi),%rbp - leaq -128(%rsi),%rsi - leaq -128(%rcx),%rcx - - mulxq %r14,%r8,%r9 - call 
__mulx_mont_384 - addq %rdx,%rdx - adcq %r15,%r15 - adcq %rax,%rax - movq %rdx,%r8 - adcq %r12,%r12 - movq %r15,%r9 - adcq %rdi,%rdi - movq %rax,%r10 - adcq %rbp,%rbp - movq %r12,%r11 - sbbq %rsi,%rsi - - subq 0(%rcx),%rdx - sbbq 8(%rcx),%r15 - movq %rdi,%r13 - sbbq 16(%rcx),%rax - sbbq 24(%rcx),%r12 - sbbq 32(%rcx),%rdi - movq %rbp,%r14 - sbbq 40(%rcx),%rbp - sbbq $0,%rsi - - cmovcq %r8,%rdx - cmovcq %r9,%r15 - cmovcq %r10,%rax - movq %rdx,48(%rbx) - cmovcq %r11,%r12 - movq %r15,56(%rbx) - cmovcq %r13,%rdi - movq %rax,64(%rbx) - cmovcq %r14,%rbp - movq %r12,72(%rbx) - movq %rdi,80(%rbx) - movq %rbp,88(%rbx) - - leaq 32(%rsp),%rsi - leaq 32+48(%rsp),%rbx - - movq 32+48(%rsp),%rdx - movq 32+0(%rsp),%r14 - movq 32+8(%rsp),%r15 - movq 32+16(%rsp),%rax - movq 32+24(%rsp),%r12 - movq 32+32(%rsp),%rdi - movq 32+40(%rsp),%rbp - leaq -128(%rsi),%rsi - leaq -128(%rcx),%rcx - - mulxq %r14,%r8,%r9 - call __mulx_mont_384 - - leaq 136(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -136-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc -.size sqrx_mont_384x,.-sqrx_mont_384x - -.globl mulx_382x -.hidden mulx_382x -.type mulx_382x,@function -.align 32 -mulx_382x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -mul_382x$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $136,%rsp -.cfi_adjust_cfa_offset 136 - - - leaq 96(%rdi),%rdi - movq %rsi,0(%rsp) - movq %rdx,8(%rsp) - movq %rdi,16(%rsp) - movq %rcx,24(%rsp) - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - addq 48(%rsi),%r8 - adcq 56(%rsi),%r9 - adcq 64(%rsi),%r10 - adcq 72(%rsi),%r11 - adcq 80(%rsi),%r12 - adcq 88(%rsi),%r13 - - movq %r8,32+0(%rsp) - movq %r9,32+8(%rsp) - movq %r10,32+16(%rsp) - movq %r11,32+24(%rsp) - movq %r12,32+32(%rsp) - movq %r13,32+40(%rsp) - - - movq 0(%rdx),%r8 - movq 8(%rdx),%r9 - movq 16(%rdx),%r10 - movq 24(%rdx),%r11 - movq 32(%rdx),%r12 - movq 40(%rdx),%r13 - - addq 48(%rdx),%r8 - adcq 56(%rdx),%r9 - adcq 64(%rdx),%r10 - adcq 72(%rdx),%r11 - adcq 80(%rdx),%r12 - adcq 88(%rdx),%r13 - - movq %r8,32+48(%rsp) - movq %r9,32+56(%rsp) - movq %r10,32+64(%rsp) - movq %r11,32+72(%rsp) - movq %r12,32+80(%rsp) - movq %r13,32+88(%rsp) - - - leaq 32+0(%rsp),%rsi - leaq 32+48(%rsp),%rbx - call __mulx_384 - - - movq 0(%rsp),%rsi - movq 8(%rsp),%rbx - leaq -96(%rdi),%rdi - call __mulx_384 - - - leaq 48+128(%rsi),%rsi - leaq 48(%rbx),%rbx - leaq 32(%rsp),%rdi - call __mulx_384 - - - movq 16(%rsp),%rsi - leaq 32(%rsp),%rdx - movq 24(%rsp),%rcx - movq %rsi,%rdi - call __subx_mod_384x384 - - - leaq 0(%rdi),%rsi - leaq -96(%rdi),%rdx - call __subx_mod_384x384 - - - leaq -96(%rdi),%rsi - leaq 32(%rsp),%rdx - leaq -96(%rdi),%rdi - call __subx_mod_384x384 - - leaq 136(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 
48(%r8),%rsp -.cfi_adjust_cfa_offset -136-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc -.size mulx_382x,.-mulx_382x -.globl sqrx_382x -.hidden sqrx_382x -.type sqrx_382x,@function -.align 32 -sqrx_382x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -sqr_382x$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rsi -.cfi_adjust_cfa_offset 8 - - - movq %rdx,%rcx - - - movq 0(%rsi),%r14 - movq 8(%rsi),%r15 - movq 16(%rsi),%rax - movq 24(%rsi),%rbx - movq 32(%rsi),%rbp - movq 40(%rsi),%rdx - - movq %r14,%r8 - addq 48(%rsi),%r14 - movq %r15,%r9 - adcq 56(%rsi),%r15 - movq %rax,%r10 - adcq 64(%rsi),%rax - movq %rbx,%r11 - adcq 72(%rsi),%rbx - movq %rbp,%r12 - adcq 80(%rsi),%rbp - movq %rdx,%r13 - adcq 88(%rsi),%rdx - - movq %r14,0(%rdi) - movq %r15,8(%rdi) - movq %rax,16(%rdi) - movq %rbx,24(%rdi) - movq %rbp,32(%rdi) - movq %rdx,40(%rdi) - - - leaq 48(%rsi),%rdx - leaq 48(%rdi),%rdi - call __subx_mod_384_a_is_loaded - - - leaq (%rdi),%rsi - leaq -48(%rdi),%rbx - leaq -48(%rdi),%rdi - call __mulx_384 - - - movq (%rsp),%rsi - leaq 48(%rsi),%rbx - leaq 96(%rdi),%rdi - call __mulx_384 - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%r12 - movq 40(%rdi),%r13 - movq 48(%rdi),%r14 - movq 56(%rdi),%r15 - movq 64(%rdi),%rax - movq 72(%rdi),%rbx - movq 80(%rdi),%rbp - addq %r8,%r8 - movq 88(%rdi),%rdx - adcq %r9,%r9 - movq %r8,0(%rdi) - adcq %r10,%r10 - movq %r9,8(%rdi) - adcq %r11,%r11 - movq %r10,16(%rdi) - adcq %r12,%r12 - movq %r11,24(%rdi) - adcq %r13,%r13 - movq %r12,32(%rdi) - adcq %r14,%r14 - movq %r13,40(%rdi) - adcq %r15,%r15 - movq %r14,48(%rdi) - adcq %rax,%rax - movq %r15,56(%rdi) - adcq %rbx,%rbx - movq %rax,64(%rdi) - adcq %rbp,%rbp - movq %rbx,72(%rdi) - adcq %rdx,%rdx - movq %rbp,80(%rdi) - movq %rdx,88(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -8*7 - - .byte 0xf3,0xc3 -.cfi_endproc -.size sqrx_382x,.-sqrx_382x -.globl mulx_384 -.hidden mulx_384 -.type mulx_384,@function -.align 32 -mulx_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -mul_384$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - - - movq %rdx,%rbx - call __mulx_384 - - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 32(%rsp),%rbx -.cfi_restore %rbx - movq 40(%rsp),%rbp -.cfi_restore %rbp - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 - - .byte 0xf3,0xc3 -.cfi_endproc -.size mulx_384,.-mulx_384 - -.type __mulx_384,@function -.align 32 -__mulx_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rbx),%rdx - movq 0(%rsi),%r14 - movq 8(%rsi),%r15 - movq 16(%rsi),%r10 - movq 
24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - leaq -128(%rsi),%rsi - - mulxq %r14,%r9,%rcx - xorq %rbp,%rbp - - mulxq %r15,%r8,%rax - adcxq %rcx,%r8 - movq %r9,0(%rdi) - - mulxq %r10,%r9,%rcx - adcxq %rax,%r9 - - mulxq %r11,%r10,%rax - adcxq %rcx,%r10 - - mulxq %r12,%r11,%rcx - adcxq %rax,%r11 - - mulxq %r13,%r12,%r13 - movq 8(%rbx),%rdx - adcxq %rcx,%r12 - adcxq %rbp,%r13 - mulxq %r14,%rax,%rcx - adcxq %r8,%rax - adoxq %rcx,%r9 - movq %rax,8(%rdi) - - mulxq %r15,%r8,%rcx - adcxq %r9,%r8 - adoxq %rcx,%r10 - - mulxq 128+16(%rsi),%r9,%rax - adcxq %r10,%r9 - adoxq %rax,%r11 - - mulxq 128+24(%rsi),%r10,%rcx - adcxq %r11,%r10 - adoxq %rcx,%r12 - - mulxq 128+32(%rsi),%r11,%rax - adcxq %r12,%r11 - adoxq %r13,%rax - - mulxq 128+40(%rsi),%r12,%r13 - movq 16(%rbx),%rdx - adcxq %rax,%r12 - adoxq %rbp,%r13 - adcxq %rbp,%r13 - mulxq %r14,%rax,%rcx - adcxq %r8,%rax - adoxq %rcx,%r9 - movq %rax,16(%rdi) - - mulxq %r15,%r8,%rcx - adcxq %r9,%r8 - adoxq %rcx,%r10 - - mulxq 128+16(%rsi),%r9,%rax - adcxq %r10,%r9 - adoxq %rax,%r11 - - mulxq 128+24(%rsi),%r10,%rcx - adcxq %r11,%r10 - adoxq %rcx,%r12 - - mulxq 128+32(%rsi),%r11,%rax - adcxq %r12,%r11 - adoxq %r13,%rax - - mulxq 128+40(%rsi),%r12,%r13 - movq 24(%rbx),%rdx - adcxq %rax,%r12 - adoxq %rbp,%r13 - adcxq %rbp,%r13 - mulxq %r14,%rax,%rcx - adcxq %r8,%rax - adoxq %rcx,%r9 - movq %rax,24(%rdi) - - mulxq %r15,%r8,%rcx - adcxq %r9,%r8 - adoxq %rcx,%r10 - - mulxq 128+16(%rsi),%r9,%rax - adcxq %r10,%r9 - adoxq %rax,%r11 - - mulxq 128+24(%rsi),%r10,%rcx - adcxq %r11,%r10 - adoxq %rcx,%r12 - - mulxq 128+32(%rsi),%r11,%rax - adcxq %r12,%r11 - adoxq %r13,%rax - - mulxq 128+40(%rsi),%r12,%r13 - movq 32(%rbx),%rdx - adcxq %rax,%r12 - adoxq %rbp,%r13 - adcxq %rbp,%r13 - mulxq %r14,%rax,%rcx - adcxq %r8,%rax - adoxq %rcx,%r9 - movq %rax,32(%rdi) - - mulxq %r15,%r8,%rcx - adcxq %r9,%r8 - adoxq %rcx,%r10 - - mulxq 128+16(%rsi),%r9,%rax - adcxq %r10,%r9 - adoxq %rax,%r11 - - mulxq 128+24(%rsi),%r10,%rcx - adcxq %r11,%r10 - adoxq %rcx,%r12 - - mulxq 128+32(%rsi),%r11,%rax - adcxq %r12,%r11 - adoxq %r13,%rax - - mulxq 128+40(%rsi),%r12,%r13 - movq 40(%rbx),%rdx - adcxq %rax,%r12 - adoxq %rbp,%r13 - adcxq %rbp,%r13 - mulxq %r14,%rax,%rcx - adcxq %r8,%rax - adoxq %rcx,%r9 - movq %rax,40(%rdi) - - mulxq %r15,%r8,%rcx - adcxq %r9,%r8 - adoxq %rcx,%r10 - - mulxq 128+16(%rsi),%r9,%rax - adcxq %r10,%r9 - adoxq %rax,%r11 - - mulxq 128+24(%rsi),%r10,%rcx - adcxq %r11,%r10 - adoxq %rcx,%r12 - - mulxq 128+32(%rsi),%r11,%rax - adcxq %r12,%r11 - adoxq %r13,%rax - - mulxq 128+40(%rsi),%r12,%r13 - movq %rax,%rdx - adcxq %rax,%r12 - adoxq %rbp,%r13 - adcxq %rbp,%r13 - movq %r8,48(%rdi) - movq %r9,56(%rdi) - movq %r10,64(%rdi) - movq %r11,72(%rdi) - movq %r12,80(%rdi) - movq %r13,88(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __mulx_384,.-__mulx_384 -.globl sqrx_384 -.hidden sqrx_384 -.type sqrx_384,@function -.align 32 -sqrx_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -sqr_384$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rdi -.cfi_adjust_cfa_offset 8 - - - call __sqrx_384 - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 
40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size sqrx_384,.-sqrx_384 -.type __sqrx_384,@function -.align 32 -__sqrx_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%rdx - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 24(%rsi),%rcx - movq 32(%rsi),%rbx - - - mulxq %r14,%r8,%rdi - movq 40(%rsi),%rbp - mulxq %r15,%r9,%rax - addq %rdi,%r9 - mulxq %rcx,%r10,%rdi - adcq %rax,%r10 - mulxq %rbx,%r11,%rax - adcq %rdi,%r11 - mulxq %rbp,%r12,%r13 - movq %r14,%rdx - adcq %rax,%r12 - adcq $0,%r13 - - - xorq %r14,%r14 - mulxq %r15,%rdi,%rax - adcxq %rdi,%r10 - adoxq %rax,%r11 - - mulxq %rcx,%rdi,%rax - adcxq %rdi,%r11 - adoxq %rax,%r12 - - mulxq %rbx,%rdi,%rax - adcxq %rdi,%r12 - adoxq %rax,%r13 - - mulxq %rbp,%rdi,%rax - movq %r15,%rdx - adcxq %rdi,%r13 - adoxq %r14,%rax - adcxq %rax,%r14 - - - xorq %r15,%r15 - mulxq %rcx,%rdi,%rax - adcxq %rdi,%r12 - adoxq %rax,%r13 - - mulxq %rbx,%rdi,%rax - adcxq %rdi,%r13 - adoxq %rax,%r14 - - mulxq %rbp,%rdi,%rax - movq %rcx,%rdx - adcxq %rdi,%r14 - adoxq %r15,%rax - adcxq %rax,%r15 - - - xorq %rcx,%rcx - mulxq %rbx,%rdi,%rax - adcxq %rdi,%r14 - adoxq %rax,%r15 - - mulxq %rbp,%rdi,%rax - movq %rbx,%rdx - adcxq %rdi,%r15 - adoxq %rcx,%rax - adcxq %rax,%rcx - - - mulxq %rbp,%rdi,%rbx - movq 0(%rsi),%rdx - addq %rdi,%rcx - movq 8(%rsp),%rdi - adcq $0,%rbx - - - xorq %rbp,%rbp - adcxq %r8,%r8 - adcxq %r9,%r9 - adcxq %r10,%r10 - adcxq %r11,%r11 - adcxq %r12,%r12 - - - mulxq %rdx,%rdx,%rax - movq %rdx,0(%rdi) - movq 8(%rsi),%rdx - adoxq %rax,%r8 - movq %r8,8(%rdi) - - mulxq %rdx,%r8,%rax - movq 16(%rsi),%rdx - adoxq %r8,%r9 - adoxq %rax,%r10 - movq %r9,16(%rdi) - movq %r10,24(%rdi) - - mulxq %rdx,%r8,%r9 - movq 24(%rsi),%rdx - adoxq %r8,%r11 - adoxq %r9,%r12 - adcxq %r13,%r13 - adcxq %r14,%r14 - movq %r11,32(%rdi) - movq %r12,40(%rdi) - - mulxq %rdx,%r8,%r9 - movq 32(%rsi),%rdx - adoxq %r8,%r13 - adoxq %r9,%r14 - adcxq %r15,%r15 - adcxq %rcx,%rcx - movq %r13,48(%rdi) - movq %r14,56(%rdi) - - mulxq %rdx,%r8,%r9 - movq 40(%rsi),%rdx - adoxq %r8,%r15 - adoxq %r9,%rcx - adcxq %rbx,%rbx - adcxq %rbp,%rbp - movq %r15,64(%rdi) - movq %rcx,72(%rdi) - - mulxq %rdx,%r8,%r9 - adoxq %r8,%rbx - adoxq %r9,%rbp - - movq %rbx,80(%rdi) - movq %rbp,88(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __sqrx_384,.-__sqrx_384 - - - -.globl redcx_mont_384 -.hidden redcx_mont_384 -.type redcx_mont_384,@function -.align 32 -redcx_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -redc_mont_384$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq %rdx,%rbx - call __mulx_by_1_mont_384 - call __redx_tail_mont_384 - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size redcx_mont_384,.-redcx_mont_384 - - - - -.globl fromx_mont_384 -.hidden fromx_mont_384 -.type fromx_mont_384,@function -.align 32 -fromx_mont_384: -.cfi_startproc - 
.byte 0xf3,0x0f,0x1e,0xfa - - -from_mont_384$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq %rdx,%rbx - call __mulx_by_1_mont_384 - - - - - movq %r14,%rax - movq %r15,%rcx - movq %r8,%rdx - movq %r9,%rbp - - subq 0(%rbx),%r14 - sbbq 8(%rbx),%r15 - movq %r10,%r13 - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - sbbq 32(%rbx),%r10 - movq %r11,%rsi - sbbq 40(%rbx),%r11 - - cmovcq %rax,%r14 - cmovcq %rcx,%r15 - cmovcq %rdx,%r8 - movq %r14,0(%rdi) - cmovcq %rbp,%r9 - movq %r15,8(%rdi) - cmovcq %r13,%r10 - movq %r8,16(%rdi) - cmovcq %rsi,%r11 - movq %r9,24(%rdi) - movq %r10,32(%rdi) - movq %r11,40(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size fromx_mont_384,.-fromx_mont_384 -.type __mulx_by_1_mont_384,@function -.align 32 -__mulx_by_1_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq %rcx,%rdx - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - imulq %r8,%rdx - - - xorq %r14,%r14 - mulxq 0(%rbx),%rax,%rbp - adcxq %rax,%r8 - adoxq %rbp,%r9 - - mulxq 8(%rbx),%rax,%rbp - adcxq %rax,%r9 - adoxq %rbp,%r10 - - mulxq 16(%rbx),%rax,%rbp - adcxq %rax,%r10 - adoxq %rbp,%r11 - - mulxq 24(%rbx),%rax,%rbp - adcxq %rax,%r11 - adoxq %rbp,%r12 - - mulxq 32(%rbx),%rax,%rbp - adcxq %rax,%r12 - adoxq %rbp,%r13 - - mulxq 40(%rbx),%rax,%rbp - movq %rcx,%rdx - adcxq %rax,%r13 - adoxq %r14,%rbp - adcxq %rbp,%r14 - imulq %r9,%rdx - - - xorq %r15,%r15 - mulxq 0(%rbx),%rax,%rbp - adcxq %rax,%r9 - adoxq %rbp,%r10 - - mulxq 8(%rbx),%rax,%rbp - adcxq %rax,%r10 - adoxq %rbp,%r11 - - mulxq 16(%rbx),%rax,%rbp - adcxq %rax,%r11 - adoxq %rbp,%r12 - - mulxq 24(%rbx),%rax,%rbp - adcxq %rax,%r12 - adoxq %rbp,%r13 - - mulxq 32(%rbx),%rax,%rbp - adcxq %rax,%r13 - adoxq %rbp,%r14 - - mulxq 40(%rbx),%rax,%rbp - movq %rcx,%rdx - adcxq %rax,%r14 - adoxq %r15,%rbp - adcxq %rbp,%r15 - imulq %r10,%rdx - - - xorq %r8,%r8 - mulxq 0(%rbx),%rax,%rbp - adcxq %rax,%r10 - adoxq %rbp,%r11 - - mulxq 8(%rbx),%rax,%rbp - adcxq %rax,%r11 - adoxq %rbp,%r12 - - mulxq 16(%rbx),%rax,%rbp - adcxq %rax,%r12 - adoxq %rbp,%r13 - - mulxq 24(%rbx),%rax,%rbp - adcxq %rax,%r13 - adoxq %rbp,%r14 - - mulxq 32(%rbx),%rax,%rbp - adcxq %rax,%r14 - adoxq %rbp,%r15 - - mulxq 40(%rbx),%rax,%rbp - movq %rcx,%rdx - adcxq %rax,%r15 - adoxq %r8,%rbp - adcxq %rbp,%r8 - imulq %r11,%rdx - - - xorq %r9,%r9 - mulxq 0(%rbx),%rax,%rbp - adcxq %rax,%r11 - adoxq %rbp,%r12 - - mulxq 8(%rbx),%rax,%rbp - adcxq %rax,%r12 - adoxq %rbp,%r13 - - mulxq 16(%rbx),%rax,%rbp - adcxq %rax,%r13 - adoxq %rbp,%r14 - - mulxq 24(%rbx),%rax,%rbp - adcxq %rax,%r14 - adoxq %rbp,%r15 - - mulxq 32(%rbx),%rax,%rbp - adcxq %rax,%r15 - adoxq %rbp,%r8 - - mulxq 40(%rbx),%rax,%rbp - movq %rcx,%rdx - adcxq %rax,%r8 - adoxq %r9,%rbp - adcxq %rbp,%r9 - imulq %r12,%rdx - - - xorq %r10,%r10 - mulxq 0(%rbx),%rax,%rbp - adcxq %rax,%r12 - adoxq %rbp,%r13 - - mulxq 8(%rbx),%rax,%rbp - adcxq 
%rax,%r13 - adoxq %rbp,%r14 - - mulxq 16(%rbx),%rax,%rbp - adcxq %rax,%r14 - adoxq %rbp,%r15 - - mulxq 24(%rbx),%rax,%rbp - adcxq %rax,%r15 - adoxq %rbp,%r8 - - mulxq 32(%rbx),%rax,%rbp - adcxq %rax,%r8 - adoxq %rbp,%r9 - - mulxq 40(%rbx),%rax,%rbp - movq %rcx,%rdx - adcxq %rax,%r9 - adoxq %r10,%rbp - adcxq %rbp,%r10 - imulq %r13,%rdx - - - xorq %r11,%r11 - mulxq 0(%rbx),%rax,%rbp - adcxq %rax,%r13 - adoxq %rbp,%r14 - - mulxq 8(%rbx),%rax,%rbp - adcxq %rax,%r14 - adoxq %rbp,%r15 - - mulxq 16(%rbx),%rax,%rbp - adcxq %rax,%r15 - adoxq %rbp,%r8 - - mulxq 24(%rbx),%rax,%rbp - adcxq %rax,%r8 - adoxq %rbp,%r9 - - mulxq 32(%rbx),%rax,%rbp - adcxq %rax,%r9 - adoxq %rbp,%r10 - - mulxq 40(%rbx),%rax,%rbp - movq %rcx,%rdx - adcxq %rax,%r10 - adoxq %r11,%rbp - adcxq %rbp,%r11 - .byte 0xf3,0xc3 -.cfi_endproc -.size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 - -.type __redx_tail_mont_384,@function -.align 32 -__redx_tail_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - addq 48(%rsi),%r14 - movq %r14,%rax - adcq 56(%rsi),%r15 - adcq 64(%rsi),%r8 - adcq 72(%rsi),%r9 - movq %r15,%rcx - adcq 80(%rsi),%r10 - adcq 88(%rsi),%r11 - sbbq %r12,%r12 - - - - - movq %r8,%rdx - movq %r9,%rbp - - subq 0(%rbx),%r14 - sbbq 8(%rbx),%r15 - movq %r10,%r13 - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - sbbq 32(%rbx),%r10 - movq %r11,%rsi - sbbq 40(%rbx),%r11 - sbbq $0,%r12 - - cmovcq %rax,%r14 - cmovcq %rcx,%r15 - cmovcq %rdx,%r8 - movq %r14,0(%rdi) - cmovcq %rbp,%r9 - movq %r15,8(%rdi) - cmovcq %r13,%r10 - movq %r8,16(%rdi) - cmovcq %rsi,%r11 - movq %r9,24(%rdi) - movq %r10,32(%rdi) - movq %r11,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __redx_tail_mont_384,.-__redx_tail_mont_384 - -.globl sgn0x_pty_mont_384 -.hidden sgn0x_pty_mont_384 -.type sgn0x_pty_mont_384,@function -.align 32 -sgn0x_pty_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -sgn0_pty_mont_384$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq %rsi,%rbx - leaq 0(%rdi),%rsi - movq %rdx,%rcx - call __mulx_by_1_mont_384 - - xorq %rax,%rax - movq %r14,%r13 - addq %r14,%r14 - adcq %r15,%r15 - adcq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq $0,%rax - - subq 0(%rbx),%r14 - sbbq 8(%rbx),%r15 - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - sbbq 32(%rbx),%r10 - sbbq 40(%rbx),%r11 - sbbq $0,%rax - - notq %rax - andq $1,%r13 - andq $2,%rax - orq %r13,%rax - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 - -.globl sgn0x_pty_mont_384x -.hidden sgn0x_pty_mont_384x -.type sgn0x_pty_mont_384x,@function -.align 32 -sgn0x_pty_mont_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -sgn0_pty_mont_384x$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 
-.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq %rsi,%rbx - leaq 48(%rdi),%rsi - movq %rdx,%rcx - call __mulx_by_1_mont_384 - - movq %r14,%r12 - orq %r15,%r14 - orq %r8,%r14 - orq %r9,%r14 - orq %r10,%r14 - orq %r11,%r14 - - leaq 0(%rdi),%rsi - xorq %rdi,%rdi - movq %r12,%r13 - addq %r12,%r12 - adcq %r15,%r15 - adcq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq $0,%rdi - - subq 0(%rbx),%r12 - sbbq 8(%rbx),%r15 - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - sbbq 32(%rbx),%r10 - sbbq 40(%rbx),%r11 - sbbq $0,%rdi - - movq %r14,0(%rsp) - notq %rdi - andq $1,%r13 - andq $2,%rdi - orq %r13,%rdi - - call __mulx_by_1_mont_384 - - movq %r14,%r12 - orq %r15,%r14 - orq %r8,%r14 - orq %r9,%r14 - orq %r10,%r14 - orq %r11,%r14 - - xorq %rax,%rax - movq %r12,%r13 - addq %r12,%r12 - adcq %r15,%r15 - adcq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq $0,%rax - - subq 0(%rbx),%r12 - sbbq 8(%rbx),%r15 - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - sbbq 32(%rbx),%r10 - sbbq 40(%rbx),%r11 - sbbq $0,%rax - - movq 0(%rsp),%r12 - - notq %rax - - testq %r14,%r14 - cmovzq %rdi,%r13 - - testq %r12,%r12 - cmovnzq %rdi,%rax - - andq $1,%r13 - andq $2,%rax - orq %r13,%rax - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc -.size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x -.globl mulx_mont_384 -.hidden mulx_mont_384 -.type mulx_mont_384,@function -.align 32 -mulx_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -mul_mont_384$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - leaq -24(%rsp),%rsp -.cfi_adjust_cfa_offset 8*3 - - - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r14 - movq 8(%rsi),%r15 - movq 16(%rsi),%rax - movq 24(%rsi),%r12 - movq %rdi,16(%rsp) - movq 32(%rsi),%rdi - movq 40(%rsi),%rbp - leaq -128(%rsi),%rsi - leaq -128(%rcx),%rcx - movq %r8,(%rsp) - - mulxq %r14,%r8,%r9 - call __mulx_mont_384 - - movq 24(%rsp),%r15 -.cfi_restore %r15 - movq 32(%rsp),%r14 -.cfi_restore %r14 - movq 40(%rsp),%r13 -.cfi_restore %r13 - movq 48(%rsp),%r12 -.cfi_restore %r12 - movq 56(%rsp),%rbx -.cfi_restore %rbx - movq 64(%rsp),%rbp -.cfi_restore %rbp - leaq 72(%rsp),%rsp -.cfi_adjust_cfa_offset -8*9 - - .byte 0xf3,0xc3 -.cfi_endproc -.size mulx_mont_384,.-mulx_mont_384 -.type __mulx_mont_384,@function -.align 32 -__mulx_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - mulxq %r15,%r14,%r10 - mulxq %rax,%r15,%r11 - addq %r14,%r9 - mulxq %r12,%rax,%r12 - adcq %r15,%r10 - mulxq %rdi,%rdi,%r13 - adcq %rax,%r11 - mulxq %rbp,%rbp,%r14 - movq 8(%rbx),%rdx - adcq %rdi,%r12 - adcq %rbp,%r13 - adcq $0,%r14 - xorq %r15,%r15 - - movq %r8,16(%rsp) - imulq 8(%rsp),%r8 - - - xorq %rax,%rax - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r9 - adcxq %rbp,%r10 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r10 - adcxq %rbp,%r11 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r11 - adcxq %rbp,%r12 - - mulxq 
24+128(%rsi),%rdi,%rbp - adoxq %rdi,%r12 - adcxq %rbp,%r13 - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r8,%rdx - adoxq %rdi,%r14 - adcxq %rbp,%r15 - adoxq %rax,%r15 - adoxq %rax,%rax - - - xorq %r8,%r8 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq 16(%rsp),%rdi - adoxq %rbp,%r9 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r9 - adoxq %rbp,%r10 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r10 - adoxq %rbp,%r11 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r11 - adoxq %rbp,%r12 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r12 - adoxq %rbp,%r13 - - mulxq 40+128(%rcx),%rdi,%rbp - movq 16(%rbx),%rdx - adcxq %rdi,%r13 - adoxq %rbp,%r14 - adcxq %r8,%r14 - adoxq %r8,%r15 - adcxq %r8,%r15 - adoxq %r8,%rax - adcxq %r8,%rax - movq %r9,16(%rsp) - imulq 8(%rsp),%r9 - - - xorq %r8,%r8 - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r10 - adcxq %rbp,%r11 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r11 - adcxq %rbp,%r12 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r12 - adcxq %rbp,%r13 - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%r14 - adcxq %rbp,%r15 - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r9,%rdx - adoxq %rdi,%r15 - adcxq %rbp,%rax - adoxq %r8,%rax - adoxq %r8,%r8 - - - xorq %r9,%r9 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq 16(%rsp),%rdi - adoxq %rbp,%r10 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r12 - adoxq %rbp,%r13 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 40+128(%rcx),%rdi,%rbp - movq 24(%rbx),%rdx - adcxq %rdi,%r14 - adoxq %rbp,%r15 - adcxq %r9,%r15 - adoxq %r9,%rax - adcxq %r9,%rax - adoxq %r9,%r8 - adcxq %r9,%r8 - movq %r10,16(%rsp) - imulq 8(%rsp),%r10 - - - xorq %r9,%r9 - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r11 - adcxq %rbp,%r12 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r12 - adcxq %rbp,%r13 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%r14 - adcxq %rbp,%r15 - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%r15 - adcxq %rbp,%rax - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r10,%rdx - adoxq %rdi,%rax - adcxq %rbp,%r8 - adoxq %r9,%r8 - adoxq %r9,%r9 - - - xorq %r10,%r10 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq 16(%rsp),%rdi - adoxq %rbp,%r11 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r14 - adoxq %rbp,%r15 - - mulxq 40+128(%rcx),%rdi,%rbp - movq 32(%rbx),%rdx - adcxq %rdi,%r15 - adoxq %rbp,%rax - adcxq %r10,%rax - adoxq %r10,%r8 - adcxq %r10,%r8 - adoxq %r10,%r9 - adcxq %r10,%r9 - movq %r11,16(%rsp) - imulq 8(%rsp),%r11 - - - xorq %r10,%r10 - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r12 - adcxq %rbp,%r13 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r14 - adcxq %rbp,%r15 - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%r15 - adcxq %rbp,%rax - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%rax - adcxq %rbp,%r8 - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r11,%rdx - adoxq %rdi,%r8 - adcxq %rbp,%r9 - adoxq %r10,%r9 - adoxq %r10,%r10 - - - xorq %r11,%r11 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq 16(%rsp),%rdi - adoxq %rbp,%r12 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq 
%rdi,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r14 - adoxq %rbp,%r15 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r15 - adoxq %rbp,%rax - - mulxq 40+128(%rcx),%rdi,%rbp - movq 40(%rbx),%rdx - adcxq %rdi,%rax - adoxq %rbp,%r8 - adcxq %r11,%r8 - adoxq %r11,%r9 - adcxq %r11,%r9 - adoxq %r11,%r10 - adcxq %r11,%r10 - movq %r12,16(%rsp) - imulq 8(%rsp),%r12 - - - xorq %r11,%r11 - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r14 - adcxq %rbp,%r15 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r15 - adcxq %rbp,%rax - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%rax - adcxq %rbp,%r8 - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%r8 - adcxq %rbp,%r9 - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r12,%rdx - adoxq %rdi,%r9 - adcxq %rbp,%r10 - adoxq %r11,%r10 - adoxq %r11,%r11 - - - xorq %r12,%r12 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq 16(%rsp),%rdi - adoxq %rbp,%r13 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r14 - adoxq %rbp,%r15 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r15 - adoxq %rbp,%rax - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%rax - adoxq %rbp,%r8 - - mulxq 40+128(%rcx),%rdi,%rbp - movq %r13,%rdx - adcxq %rdi,%r8 - adoxq %rbp,%r9 - adcxq %r12,%r9 - adoxq %r12,%r10 - adcxq %r12,%r10 - adoxq %r12,%r11 - adcxq %r12,%r11 - imulq 8(%rsp),%rdx - movq 24(%rsp),%rbx - - - xorq %r12,%r12 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r14 - adoxq %rbp,%r15 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r15 - adoxq %rbp,%rax - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%rax - adoxq %rbp,%r8 - movq %r15,%r13 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r8 - adoxq %rbp,%r9 - movq %rax,%rsi - - mulxq 40+128(%rcx),%rdi,%rbp - adcxq %rdi,%r9 - adoxq %rbp,%r10 - movq %r14,%rdx - adcxq %r12,%r10 - adoxq %r12,%r11 - leaq 128(%rcx),%rcx - movq %r8,%r12 - adcq $0,%r11 - - - - - subq 0(%rcx),%r14 - sbbq 8(%rcx),%r15 - movq %r9,%rdi - sbbq 16(%rcx),%rax - sbbq 24(%rcx),%r8 - sbbq 32(%rcx),%r9 - movq %r10,%rbp - sbbq 40(%rcx),%r10 - sbbq $0,%r11 - - cmovncq %r14,%rdx - cmovcq %r13,%r15 - cmovcq %rsi,%rax - cmovncq %r8,%r12 - movq %rdx,0(%rbx) - cmovncq %r9,%rdi - movq %r15,8(%rbx) - cmovncq %r10,%rbp - movq %rax,16(%rbx) - movq %r12,24(%rbx) - movq %rdi,32(%rbx) - movq %rbp,40(%rbx) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __mulx_mont_384,.-__mulx_mont_384 -.globl sqrx_mont_384 -.hidden sqrx_mont_384 -.type sqrx_mont_384,@function -.align 32 -sqrx_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -sqr_mont_384$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - leaq -24(%rsp),%rsp -.cfi_adjust_cfa_offset 8*3 - - - movq %rcx,%r8 - leaq -128(%rdx),%rcx - movq 0(%rsi),%rdx - movq 8(%rsi),%r15 - movq 16(%rsi),%rax - movq 24(%rsi),%r12 - movq %rdi,16(%rsp) - movq 32(%rsi),%rdi - movq 40(%rsi),%rbp - - leaq (%rsi),%rbx - movq %r8,(%rsp) - leaq -128(%rsi),%rsi - - mulxq %rdx,%r8,%r9 - call __mulx_mont_384 - - movq 24(%rsp),%r15 -.cfi_restore %r15 - movq 32(%rsp),%r14 -.cfi_restore %r14 - movq 
40(%rsp),%r13 -.cfi_restore %r13 - movq 48(%rsp),%r12 -.cfi_restore %r12 - movq 56(%rsp),%rbx -.cfi_restore %rbx - movq 64(%rsp),%rbp -.cfi_restore %rbp - leaq 72(%rsp),%rsp -.cfi_adjust_cfa_offset -8*9 - - .byte 0xf3,0xc3 -.cfi_endproc -.size sqrx_mont_384,.-sqrx_mont_384 - -.globl sqrx_n_mul_mont_384 -.hidden sqrx_n_mul_mont_384 -.type sqrx_n_mul_mont_384,@function -.align 32 -sqrx_n_mul_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -sqr_n_mul_mont_384$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - leaq -40(%rsp),%rsp -.cfi_adjust_cfa_offset 8*5 - - - movq %rdx,%r10 - movq 0(%rsi),%rdx - movq 8(%rsi),%r15 - movq 16(%rsi),%rax - movq %rsi,%rbx - movq 24(%rsi),%r12 - movq %rdi,16(%rsp) - movq 32(%rsi),%rdi - movq 40(%rsi),%rbp - - movq %r8,(%rsp) - movq %r9,24(%rsp) - movq 0(%r9),%xmm2 - -.Loop_sqrx_384: - movd %r10d,%xmm1 - leaq -128(%rbx),%rsi - leaq -128(%rcx),%rcx - - mulxq %rdx,%r8,%r9 - call __mulx_mont_384 - - movd %xmm1,%r10d - decl %r10d - jnz .Loop_sqrx_384 - - movq %rdx,%r14 -.byte 102,72,15,126,210 - leaq -128(%rbx),%rsi - movq 24(%rsp),%rbx - leaq -128(%rcx),%rcx - - mulxq %r14,%r8,%r9 - call __mulx_mont_384 - - movq 40(%rsp),%r15 -.cfi_restore %r15 - movq 48(%rsp),%r14 -.cfi_restore %r14 - movq 56(%rsp),%r13 -.cfi_restore %r13 - movq 64(%rsp),%r12 -.cfi_restore %r12 - movq 72(%rsp),%rbx -.cfi_restore %rbx - movq 80(%rsp),%rbp -.cfi_restore %rbp - leaq 88(%rsp),%rsp -.cfi_adjust_cfa_offset -8*11 - - .byte 0xf3,0xc3 -.cfi_endproc -.size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 - -.globl sqrx_n_mul_mont_383 -.hidden sqrx_n_mul_mont_383 -.type sqrx_n_mul_mont_383,@function -.align 32 -sqrx_n_mul_mont_383: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -sqr_n_mul_mont_383$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - leaq -40(%rsp),%rsp -.cfi_adjust_cfa_offset 8*5 - - - movq %rdx,%r10 - movq 0(%rsi),%rdx - movq 8(%rsi),%r15 - movq 16(%rsi),%rax - movq %rsi,%rbx - movq 24(%rsi),%r12 - movq %rdi,16(%rsp) - movq 32(%rsi),%rdi - movq 40(%rsi),%rbp - - movq %r8,(%rsp) - movq %r9,24(%rsp) - movq 0(%r9),%xmm2 - leaq -128(%rcx),%rcx - -.Loop_sqrx_383: - movd %r10d,%xmm1 - leaq -128(%rbx),%rsi - - mulxq %rdx,%r8,%r9 - call __mulx_mont_383_nonred - - movd %xmm1,%r10d - decl %r10d - jnz .Loop_sqrx_383 - - movq %rdx,%r14 -.byte 102,72,15,126,210 - leaq -128(%rbx),%rsi - movq 24(%rsp),%rbx - - mulxq %r14,%r8,%r9 - call __mulx_mont_384 - - movq 40(%rsp),%r15 -.cfi_restore %r15 - movq 48(%rsp),%r14 -.cfi_restore %r14 - movq 56(%rsp),%r13 -.cfi_restore %r13 - movq 64(%rsp),%r12 -.cfi_restore %r12 - movq 72(%rsp),%rbx -.cfi_restore %rbx - movq 80(%rsp),%rbp -.cfi_restore %rbp - leaq 88(%rsp),%rsp -.cfi_adjust_cfa_offset -8*11 - - .byte 0xf3,0xc3 -.cfi_endproc -.size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 -.type __mulx_mont_383_nonred,@function -.align 32 -__mulx_mont_383_nonred: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - mulxq 
%r15,%r14,%r10 - mulxq %rax,%r15,%r11 - addq %r14,%r9 - mulxq %r12,%rax,%r12 - adcq %r15,%r10 - mulxq %rdi,%rdi,%r13 - adcq %rax,%r11 - mulxq %rbp,%rbp,%r14 - movq 8(%rbx),%rdx - adcq %rdi,%r12 - adcq %rbp,%r13 - adcq $0,%r14 - movq %r8,%rax - imulq 8(%rsp),%r8 - - - xorq %r15,%r15 - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r9 - adcxq %rbp,%r10 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r10 - adcxq %rbp,%r11 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r11 - adcxq %rbp,%r12 - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%r12 - adcxq %rbp,%r13 - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r8,%rdx - adoxq %rdi,%r14 - adcxq %r15,%rbp - adoxq %rbp,%r15 - - - xorq %r8,%r8 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq %rdi,%rax - adoxq %rbp,%r9 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r9 - adoxq %rbp,%r10 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r10 - adoxq %rbp,%r11 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r11 - adoxq %rbp,%r12 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r12 - adoxq %rbp,%r13 - - mulxq 40+128(%rcx),%rdi,%rbp - movq 16(%rbx),%rdx - adcxq %rdi,%r13 - adoxq %rbp,%r14 - adcxq %rax,%r14 - adoxq %rax,%r15 - adcxq %rax,%r15 - movq %r9,%r8 - imulq 8(%rsp),%r9 - - - xorq %rax,%rax - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r10 - adcxq %rbp,%r11 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r11 - adcxq %rbp,%r12 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r12 - adcxq %rbp,%r13 - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%r14 - adcxq %rbp,%r15 - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r9,%rdx - adoxq %rdi,%r15 - adcxq %rax,%rbp - adoxq %rbp,%rax - - - xorq %r9,%r9 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq %rdi,%r8 - adoxq %rbp,%r10 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r12 - adoxq %rbp,%r13 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 40+128(%rcx),%rdi,%rbp - movq 24(%rbx),%rdx - adcxq %rdi,%r14 - adoxq %rbp,%r15 - adcxq %r8,%r15 - adoxq %r8,%rax - adcxq %r8,%rax - movq %r10,%r9 - imulq 8(%rsp),%r10 - - - xorq %r8,%r8 - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r11 - adcxq %rbp,%r12 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r12 - adcxq %rbp,%r13 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%r14 - adcxq %rbp,%r15 - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%r15 - adcxq %rbp,%rax - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r10,%rdx - adoxq %rdi,%rax - adcxq %r8,%rbp - adoxq %rbp,%r8 - - - xorq %r10,%r10 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq %rdi,%r9 - adoxq %rbp,%r11 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r14 - adoxq %rbp,%r15 - - mulxq 40+128(%rcx),%rdi,%rbp - movq 32(%rbx),%rdx - adcxq %rdi,%r15 - adoxq %rbp,%rax - adcxq %r9,%rax - adoxq %r9,%r8 - adcxq %r9,%r8 - movq %r11,%r10 - imulq 8(%rsp),%r11 - - - xorq %r9,%r9 - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r12 - adcxq %rbp,%r13 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r14 - adcxq %rbp,%r15 - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%r15 - adcxq 
%rbp,%rax - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%rax - adcxq %rbp,%r8 - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r11,%rdx - adoxq %rdi,%r8 - adcxq %r9,%rbp - adoxq %rbp,%r9 - - - xorq %r11,%r11 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq %rdi,%r10 - adoxq %rbp,%r12 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r14 - adoxq %rbp,%r15 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r15 - adoxq %rbp,%rax - - mulxq 40+128(%rcx),%rdi,%rbp - movq 40(%rbx),%rdx - adcxq %rdi,%rax - adoxq %rbp,%r8 - adcxq %r10,%r8 - adoxq %r10,%r9 - adcxq %r10,%r9 - movq %r12,%r11 - imulq 8(%rsp),%r12 - - - xorq %r10,%r10 - mulxq 0+128(%rsi),%rdi,%rbp - adoxq %rdi,%r13 - adcxq %rbp,%r14 - - mulxq 8+128(%rsi),%rdi,%rbp - adoxq %rdi,%r14 - adcxq %rbp,%r15 - - mulxq 16+128(%rsi),%rdi,%rbp - adoxq %rdi,%r15 - adcxq %rbp,%rax - - mulxq 24+128(%rsi),%rdi,%rbp - adoxq %rdi,%rax - adcxq %rbp,%r8 - - mulxq 32+128(%rsi),%rdi,%rbp - adoxq %rdi,%r8 - adcxq %rbp,%r9 - - mulxq 40+128(%rsi),%rdi,%rbp - movq %r12,%rdx - adoxq %rdi,%r9 - adcxq %r10,%rbp - adoxq %rbp,%r10 - - - xorq %r12,%r12 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq %rdi,%r11 - adoxq %rbp,%r13 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r14 - adoxq %rbp,%r15 - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%r15 - adoxq %rbp,%rax - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%rax - adoxq %rbp,%r8 - - mulxq 40+128(%rcx),%rdi,%rbp - movq %r13,%rdx - adcxq %rdi,%r8 - adoxq %rbp,%r9 - adcxq %r11,%r9 - adoxq %r11,%r10 - adcxq %r11,%r10 - imulq 8(%rsp),%rdx - movq 24(%rsp),%rbx - - - xorq %r12,%r12 - mulxq 0+128(%rcx),%rdi,%rbp - adcxq %rdi,%r13 - adoxq %rbp,%r14 - - mulxq 8+128(%rcx),%rdi,%rbp - adcxq %rdi,%r14 - adoxq %rbp,%r15 - - mulxq 16+128(%rcx),%rdi,%rbp - adcxq %rdi,%r15 - adoxq %rbp,%rax - - mulxq 24+128(%rcx),%rdi,%rbp - adcxq %rdi,%rax - adoxq %rbp,%r8 - - mulxq 32+128(%rcx),%rdi,%rbp - adcxq %rdi,%r8 - adoxq %rbp,%r9 - - mulxq 40+128(%rcx),%rdi,%rbp - movq %r14,%rdx - adcxq %rdi,%r9 - adoxq %rbp,%r10 - adcq $0,%r10 - movq %r8,%r12 - - movq %r14,0(%rbx) - movq %r15,8(%rbx) - movq %rax,16(%rbx) - movq %r9,%rdi - movq %r8,24(%rbx) - movq %r9,32(%rbx) - movq %r10,40(%rbx) - movq %r10,%rbp - - .byte 0xf3,0xc3 -.cfi_endproc -.size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred -.globl sqrx_mont_382x -.hidden sqrx_mont_382x -.type sqrx_mont_382x,@function -.align 32 -sqrx_mont_382x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -sqr_mont_382x$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $136,%rsp -.cfi_adjust_cfa_offset 136 - - - movq %rcx,0(%rsp) - movq %rdx,%rcx - movq %rdi,16(%rsp) - movq %rsi,24(%rsp) - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %r8,%r14 - addq 48(%rsi),%r8 - movq %r9,%r15 - adcq 56(%rsi),%r9 - movq %r10,%rax - adcq 64(%rsi),%r10 - movq %r11,%rdx - adcq 72(%rsi),%r11 - movq %r12,%rbx - adcq 80(%rsi),%r12 - movq %r13,%rbp - adcq 88(%rsi),%r13 - - subq 48(%rsi),%r14 - sbbq 56(%rsi),%r15 - sbbq 64(%rsi),%rax - sbbq 72(%rsi),%rdx - sbbq 
80(%rsi),%rbx - sbbq 88(%rsi),%rbp - sbbq %rdi,%rdi - - movq %r8,32+0(%rsp) - movq %r9,32+8(%rsp) - movq %r10,32+16(%rsp) - movq %r11,32+24(%rsp) - movq %r12,32+32(%rsp) - movq %r13,32+40(%rsp) - - movq %r14,32+48(%rsp) - movq %r15,32+56(%rsp) - movq %rax,32+64(%rsp) - movq %rdx,32+72(%rsp) - movq %rbx,32+80(%rsp) - movq %rbp,32+88(%rsp) - movq %rdi,32+96(%rsp) - - - - leaq 48(%rsi),%rbx - - movq 48(%rsi),%rdx - movq 0(%rsi),%r14 - movq 8(%rsi),%r15 - movq 16(%rsi),%rax - movq 24(%rsi),%r12 - movq 32(%rsi),%rdi - movq 40(%rsi),%rbp - leaq -128(%rsi),%rsi - leaq -128(%rcx),%rcx - - mulxq %r14,%r8,%r9 - call __mulx_mont_383_nonred - addq %rdx,%rdx - adcq %r15,%r15 - adcq %rax,%rax - adcq %r12,%r12 - adcq %rdi,%rdi - adcq %rbp,%rbp - - movq %rdx,48(%rbx) - movq %r15,56(%rbx) - movq %rax,64(%rbx) - movq %r12,72(%rbx) - movq %rdi,80(%rbx) - movq %rbp,88(%rbx) - - leaq 32-128(%rsp),%rsi - leaq 32+48(%rsp),%rbx - - movq 32+48(%rsp),%rdx - movq 32+0(%rsp),%r14 - movq 32+8(%rsp),%r15 - movq 32+16(%rsp),%rax - movq 32+24(%rsp),%r12 - movq 32+32(%rsp),%rdi - movq 32+40(%rsp),%rbp - - - - mulxq %r14,%r8,%r9 - call __mulx_mont_383_nonred - movq 32+96(%rsp),%r14 - leaq 128(%rcx),%rcx - movq 32+0(%rsp),%r8 - andq %r14,%r8 - movq 32+8(%rsp),%r9 - andq %r14,%r9 - movq 32+16(%rsp),%r10 - andq %r14,%r10 - movq 32+24(%rsp),%r11 - andq %r14,%r11 - movq 32+32(%rsp),%r13 - andq %r14,%r13 - andq 32+40(%rsp),%r14 - - subq %r8,%rdx - movq 0(%rcx),%r8 - sbbq %r9,%r15 - movq 8(%rcx),%r9 - sbbq %r10,%rax - movq 16(%rcx),%r10 - sbbq %r11,%r12 - movq 24(%rcx),%r11 - sbbq %r13,%rdi - movq 32(%rcx),%r13 - sbbq %r14,%rbp - sbbq %r14,%r14 - - andq %r14,%r8 - andq %r14,%r9 - andq %r14,%r10 - andq %r14,%r11 - andq %r14,%r13 - andq 40(%rcx),%r14 - - addq %r8,%rdx - adcq %r9,%r15 - adcq %r10,%rax - adcq %r11,%r12 - adcq %r13,%rdi - adcq %r14,%rbp - - movq %rdx,0(%rbx) - movq %r15,8(%rbx) - movq %rax,16(%rbx) - movq %r12,24(%rbx) - movq %rdi,32(%rbx) - movq %rbp,40(%rbx) - leaq 136(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -136-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc -.size sqrx_mont_382x,.-sqrx_mont_382x - -.section .note.GNU-stack,"",@progbits -.section .note.gnu.property,"a",@note - .long 4,2f-1f,5 - .byte 0x47,0x4E,0x55,0 -1: .long 0xc0000002,4,3 -.align 8 -2: diff --git a/crypto/blst_src/build/elf/sha256-armv8.S b/crypto/blst_src/build/elf/sha256-armv8.S deleted file mode 100644 index 45c1162c467..00000000000 --- a/crypto/blst_src/build/elf/sha256-armv8.S +++ /dev/null @@ -1,1083 +0,0 @@ -// -// Copyright Supranational LLC -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -// -// ==================================================================== -// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL -// project. -// ==================================================================== -// -// sha256_block procedure for ARMv8. -// -// This module is stripped of scalar code paths, with rationale that all -// known processors are NEON-capable. -// -// See original module at CRYPTOGAMS for further details. 
- -.comm __blst_platform_cap,4 -.text - -.align 6 -.type .LK256,%object -.LK256: -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.long 0 //terminator -.size .LK256,.-.LK256 -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 -.align 2 -.align 2 -.globl blst_sha256_block_armv8 -.type blst_sha256_block_armv8,%function -.align 6 -blst_sha256_block_armv8: -.Lv8_entry: - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ld1 {v0.4s,v1.4s},[x0] - adr x3,.LK256 - -.Loop_hw: - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 - sub x2,x2,#1 - ld1 {v16.4s},[x3],#16 - rev32 v4.16b,v4.16b - rev32 v5.16b,v5.16b - rev32 v6.16b,v6.16b - rev32 v7.16b,v7.16b - orr v18.16b,v0.16b,v0.16b // offload - orr v19.16b,v1.16b,v1.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s -.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s -.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v6.4s -.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v7.4s -.inst 0x5e282887 //sha256su0 v7.16b,v4.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s -.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s -.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v6.4s -.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 - add 
v17.4s,v17.4s,v7.4s -.inst 0x5e282887 //sha256su0 v7.16b,v4.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s -.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s -.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v6.4s -.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v7.4s -.inst 0x5e282887 //sha256su0 v7.16b,v4.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s - orr v2.16b,v0.16b,v0.16b -.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s - orr v2.16b,v0.16b,v0.16b -.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - - ld1 {v17.4s},[x3] - add v16.4s,v16.4s,v6.4s - sub x3,x3,#64*4-16 // rewind - orr v2.16b,v0.16b,v0.16b -.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - - add v17.4s,v17.4s,v7.4s - orr v2.16b,v0.16b,v0.16b -.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - - add v0.4s,v0.4s,v18.4s - add v1.4s,v1.4s,v19.4s - - cbnz x2,.Loop_hw - - st1 {v0.4s,v1.4s},[x0] - - ldr x29,[sp],#16 - ret -.size blst_sha256_block_armv8,.-blst_sha256_block_armv8 -.globl blst_sha256_block_data_order -.type blst_sha256_block_data_order,%function -.align 4 -blst_sha256_block_data_order: - adrp x16,__blst_platform_cap - ldr w16,[x16,#:lo12:__blst_platform_cap] - tst w16,#1 - b.ne .Lv8_entry - - stp x29, x30, [sp, #-16]! 
- mov x29, sp - sub sp,sp,#16*4 - - adr x16,.LK256 - add x2,x1,x2,lsl#6 // len to point at the end of inp - - ld1 {v0.16b},[x1], #16 - ld1 {v1.16b},[x1], #16 - ld1 {v2.16b},[x1], #16 - ld1 {v3.16b},[x1], #16 - ld1 {v4.4s},[x16], #16 - ld1 {v5.4s},[x16], #16 - ld1 {v6.4s},[x16], #16 - ld1 {v7.4s},[x16], #16 - rev32 v0.16b,v0.16b // yes, even on - rev32 v1.16b,v1.16b // big-endian - rev32 v2.16b,v2.16b - rev32 v3.16b,v3.16b - mov x17,sp - add v4.4s,v4.4s,v0.4s - add v5.4s,v5.4s,v1.4s - add v6.4s,v6.4s,v2.4s - st1 {v4.4s,v5.4s},[x17], #32 - add v7.4s,v7.4s,v3.4s - st1 {v6.4s,v7.4s},[x17] - sub x17,x17,#32 - - ldp w3,w4,[x0] - ldp w5,w6,[x0,#8] - ldp w7,w8,[x0,#16] - ldp w9,w10,[x0,#24] - ldr w12,[sp,#0] - mov w13,wzr - eor w14,w4,w5 - mov w15,wzr - b .L_00_48 - -.align 4 -.L_00_48: - ext v4.16b,v0.16b,v1.16b,#4 - add w10,w10,w12 - add w3,w3,w15 - and w12,w8,w7 - bic w15,w9,w7 - ext v7.16b,v2.16b,v3.16b,#4 - eor w11,w7,w7,ror#5 - add w3,w3,w13 - mov d19,v3.d[1] - orr w12,w12,w15 - eor w11,w11,w7,ror#19 - ushr v6.4s,v4.4s,#7 - eor w15,w3,w3,ror#11 - ushr v5.4s,v4.4s,#3 - add w10,w10,w12 - add v0.4s,v0.4s,v7.4s - ror w11,w11,#6 - sli v6.4s,v4.4s,#25 - eor w13,w3,w4 - eor w15,w15,w3,ror#20 - ushr v7.4s,v4.4s,#18 - add w10,w10,w11 - ldr w12,[sp,#4] - and w14,w14,w13 - eor v5.16b,v5.16b,v6.16b - ror w15,w15,#2 - add w6,w6,w10 - sli v7.4s,v4.4s,#14 - eor w14,w14,w4 - ushr v16.4s,v19.4s,#17 - add w9,w9,w12 - add w10,w10,w15 - and w12,w7,w6 - eor v5.16b,v5.16b,v7.16b - bic w15,w8,w6 - eor w11,w6,w6,ror#5 - sli v16.4s,v19.4s,#15 - add w10,w10,w14 - orr w12,w12,w15 - ushr v17.4s,v19.4s,#10 - eor w11,w11,w6,ror#19 - eor w15,w10,w10,ror#11 - ushr v7.4s,v19.4s,#19 - add w9,w9,w12 - ror w11,w11,#6 - add v0.4s,v0.4s,v5.4s - eor w14,w10,w3 - eor w15,w15,w10,ror#20 - sli v7.4s,v19.4s,#13 - add w9,w9,w11 - ldr w12,[sp,#8] - and w13,w13,w14 - eor v17.16b,v17.16b,v16.16b - ror w15,w15,#2 - add w5,w5,w9 - eor w13,w13,w3 - eor v17.16b,v17.16b,v7.16b - add w8,w8,w12 - add w9,w9,w15 - and w12,w6,w5 - add v0.4s,v0.4s,v17.4s - bic w15,w7,w5 - eor w11,w5,w5,ror#5 - add w9,w9,w13 - ushr v18.4s,v0.4s,#17 - orr w12,w12,w15 - ushr v19.4s,v0.4s,#10 - eor w11,w11,w5,ror#19 - eor w15,w9,w9,ror#11 - sli v18.4s,v0.4s,#15 - add w8,w8,w12 - ushr v17.4s,v0.4s,#19 - ror w11,w11,#6 - eor w13,w9,w10 - eor v19.16b,v19.16b,v18.16b - eor w15,w15,w9,ror#20 - add w8,w8,w11 - sli v17.4s,v0.4s,#13 - ldr w12,[sp,#12] - and w14,w14,w13 - ror w15,w15,#2 - ld1 {v4.4s},[x16], #16 - add w4,w4,w8 - eor v19.16b,v19.16b,v17.16b - eor w14,w14,w10 - eor v17.16b,v17.16b,v17.16b - add w7,w7,w12 - add w8,w8,w15 - and w12,w5,w4 - mov v17.d[1],v19.d[0] - bic w15,w6,w4 - eor w11,w4,w4,ror#5 - add w8,w8,w14 - add v0.4s,v0.4s,v17.4s - orr w12,w12,w15 - eor w11,w11,w4,ror#19 - eor w15,w8,w8,ror#11 - add v4.4s,v4.4s,v0.4s - add w7,w7,w12 - ror w11,w11,#6 - eor w14,w8,w9 - eor w15,w15,w8,ror#20 - add w7,w7,w11 - ldr w12,[sp,#16] - and w13,w13,w14 - ror w15,w15,#2 - add w3,w3,w7 - eor w13,w13,w9 - st1 {v4.4s},[x17], #16 - ext v4.16b,v1.16b,v2.16b,#4 - add w6,w6,w12 - add w7,w7,w15 - and w12,w4,w3 - bic w15,w5,w3 - ext v7.16b,v3.16b,v0.16b,#4 - eor w11,w3,w3,ror#5 - add w7,w7,w13 - mov d19,v0.d[1] - orr w12,w12,w15 - eor w11,w11,w3,ror#19 - ushr v6.4s,v4.4s,#7 - eor w15,w7,w7,ror#11 - ushr v5.4s,v4.4s,#3 - add w6,w6,w12 - add v1.4s,v1.4s,v7.4s - ror w11,w11,#6 - sli v6.4s,v4.4s,#25 - eor w13,w7,w8 - eor w15,w15,w7,ror#20 - ushr v7.4s,v4.4s,#18 - add w6,w6,w11 - ldr w12,[sp,#20] - and w14,w14,w13 - eor v5.16b,v5.16b,v6.16b - ror w15,w15,#2 - add w10,w10,w6 - sli 
v7.4s,v4.4s,#14 - eor w14,w14,w8 - ushr v16.4s,v19.4s,#17 - add w5,w5,w12 - add w6,w6,w15 - and w12,w3,w10 - eor v5.16b,v5.16b,v7.16b - bic w15,w4,w10 - eor w11,w10,w10,ror#5 - sli v16.4s,v19.4s,#15 - add w6,w6,w14 - orr w12,w12,w15 - ushr v17.4s,v19.4s,#10 - eor w11,w11,w10,ror#19 - eor w15,w6,w6,ror#11 - ushr v7.4s,v19.4s,#19 - add w5,w5,w12 - ror w11,w11,#6 - add v1.4s,v1.4s,v5.4s - eor w14,w6,w7 - eor w15,w15,w6,ror#20 - sli v7.4s,v19.4s,#13 - add w5,w5,w11 - ldr w12,[sp,#24] - and w13,w13,w14 - eor v17.16b,v17.16b,v16.16b - ror w15,w15,#2 - add w9,w9,w5 - eor w13,w13,w7 - eor v17.16b,v17.16b,v7.16b - add w4,w4,w12 - add w5,w5,w15 - and w12,w10,w9 - add v1.4s,v1.4s,v17.4s - bic w15,w3,w9 - eor w11,w9,w9,ror#5 - add w5,w5,w13 - ushr v18.4s,v1.4s,#17 - orr w12,w12,w15 - ushr v19.4s,v1.4s,#10 - eor w11,w11,w9,ror#19 - eor w15,w5,w5,ror#11 - sli v18.4s,v1.4s,#15 - add w4,w4,w12 - ushr v17.4s,v1.4s,#19 - ror w11,w11,#6 - eor w13,w5,w6 - eor v19.16b,v19.16b,v18.16b - eor w15,w15,w5,ror#20 - add w4,w4,w11 - sli v17.4s,v1.4s,#13 - ldr w12,[sp,#28] - and w14,w14,w13 - ror w15,w15,#2 - ld1 {v4.4s},[x16], #16 - add w8,w8,w4 - eor v19.16b,v19.16b,v17.16b - eor w14,w14,w6 - eor v17.16b,v17.16b,v17.16b - add w3,w3,w12 - add w4,w4,w15 - and w12,w9,w8 - mov v17.d[1],v19.d[0] - bic w15,w10,w8 - eor w11,w8,w8,ror#5 - add w4,w4,w14 - add v1.4s,v1.4s,v17.4s - orr w12,w12,w15 - eor w11,w11,w8,ror#19 - eor w15,w4,w4,ror#11 - add v4.4s,v4.4s,v1.4s - add w3,w3,w12 - ror w11,w11,#6 - eor w14,w4,w5 - eor w15,w15,w4,ror#20 - add w3,w3,w11 - ldr w12,[sp,#32] - and w13,w13,w14 - ror w15,w15,#2 - add w7,w7,w3 - eor w13,w13,w5 - st1 {v4.4s},[x17], #16 - ext v4.16b,v2.16b,v3.16b,#4 - add w10,w10,w12 - add w3,w3,w15 - and w12,w8,w7 - bic w15,w9,w7 - ext v7.16b,v0.16b,v1.16b,#4 - eor w11,w7,w7,ror#5 - add w3,w3,w13 - mov d19,v1.d[1] - orr w12,w12,w15 - eor w11,w11,w7,ror#19 - ushr v6.4s,v4.4s,#7 - eor w15,w3,w3,ror#11 - ushr v5.4s,v4.4s,#3 - add w10,w10,w12 - add v2.4s,v2.4s,v7.4s - ror w11,w11,#6 - sli v6.4s,v4.4s,#25 - eor w13,w3,w4 - eor w15,w15,w3,ror#20 - ushr v7.4s,v4.4s,#18 - add w10,w10,w11 - ldr w12,[sp,#36] - and w14,w14,w13 - eor v5.16b,v5.16b,v6.16b - ror w15,w15,#2 - add w6,w6,w10 - sli v7.4s,v4.4s,#14 - eor w14,w14,w4 - ushr v16.4s,v19.4s,#17 - add w9,w9,w12 - add w10,w10,w15 - and w12,w7,w6 - eor v5.16b,v5.16b,v7.16b - bic w15,w8,w6 - eor w11,w6,w6,ror#5 - sli v16.4s,v19.4s,#15 - add w10,w10,w14 - orr w12,w12,w15 - ushr v17.4s,v19.4s,#10 - eor w11,w11,w6,ror#19 - eor w15,w10,w10,ror#11 - ushr v7.4s,v19.4s,#19 - add w9,w9,w12 - ror w11,w11,#6 - add v2.4s,v2.4s,v5.4s - eor w14,w10,w3 - eor w15,w15,w10,ror#20 - sli v7.4s,v19.4s,#13 - add w9,w9,w11 - ldr w12,[sp,#40] - and w13,w13,w14 - eor v17.16b,v17.16b,v16.16b - ror w15,w15,#2 - add w5,w5,w9 - eor w13,w13,w3 - eor v17.16b,v17.16b,v7.16b - add w8,w8,w12 - add w9,w9,w15 - and w12,w6,w5 - add v2.4s,v2.4s,v17.4s - bic w15,w7,w5 - eor w11,w5,w5,ror#5 - add w9,w9,w13 - ushr v18.4s,v2.4s,#17 - orr w12,w12,w15 - ushr v19.4s,v2.4s,#10 - eor w11,w11,w5,ror#19 - eor w15,w9,w9,ror#11 - sli v18.4s,v2.4s,#15 - add w8,w8,w12 - ushr v17.4s,v2.4s,#19 - ror w11,w11,#6 - eor w13,w9,w10 - eor v19.16b,v19.16b,v18.16b - eor w15,w15,w9,ror#20 - add w8,w8,w11 - sli v17.4s,v2.4s,#13 - ldr w12,[sp,#44] - and w14,w14,w13 - ror w15,w15,#2 - ld1 {v4.4s},[x16], #16 - add w4,w4,w8 - eor v19.16b,v19.16b,v17.16b - eor w14,w14,w10 - eor v17.16b,v17.16b,v17.16b - add w7,w7,w12 - add w8,w8,w15 - and w12,w5,w4 - mov v17.d[1],v19.d[0] - bic w15,w6,w4 - eor w11,w4,w4,ror#5 - add w8,w8,w14 - 
add v2.4s,v2.4s,v17.4s - orr w12,w12,w15 - eor w11,w11,w4,ror#19 - eor w15,w8,w8,ror#11 - add v4.4s,v4.4s,v2.4s - add w7,w7,w12 - ror w11,w11,#6 - eor w14,w8,w9 - eor w15,w15,w8,ror#20 - add w7,w7,w11 - ldr w12,[sp,#48] - and w13,w13,w14 - ror w15,w15,#2 - add w3,w3,w7 - eor w13,w13,w9 - st1 {v4.4s},[x17], #16 - ext v4.16b,v3.16b,v0.16b,#4 - add w6,w6,w12 - add w7,w7,w15 - and w12,w4,w3 - bic w15,w5,w3 - ext v7.16b,v1.16b,v2.16b,#4 - eor w11,w3,w3,ror#5 - add w7,w7,w13 - mov d19,v2.d[1] - orr w12,w12,w15 - eor w11,w11,w3,ror#19 - ushr v6.4s,v4.4s,#7 - eor w15,w7,w7,ror#11 - ushr v5.4s,v4.4s,#3 - add w6,w6,w12 - add v3.4s,v3.4s,v7.4s - ror w11,w11,#6 - sli v6.4s,v4.4s,#25 - eor w13,w7,w8 - eor w15,w15,w7,ror#20 - ushr v7.4s,v4.4s,#18 - add w6,w6,w11 - ldr w12,[sp,#52] - and w14,w14,w13 - eor v5.16b,v5.16b,v6.16b - ror w15,w15,#2 - add w10,w10,w6 - sli v7.4s,v4.4s,#14 - eor w14,w14,w8 - ushr v16.4s,v19.4s,#17 - add w5,w5,w12 - add w6,w6,w15 - and w12,w3,w10 - eor v5.16b,v5.16b,v7.16b - bic w15,w4,w10 - eor w11,w10,w10,ror#5 - sli v16.4s,v19.4s,#15 - add w6,w6,w14 - orr w12,w12,w15 - ushr v17.4s,v19.4s,#10 - eor w11,w11,w10,ror#19 - eor w15,w6,w6,ror#11 - ushr v7.4s,v19.4s,#19 - add w5,w5,w12 - ror w11,w11,#6 - add v3.4s,v3.4s,v5.4s - eor w14,w6,w7 - eor w15,w15,w6,ror#20 - sli v7.4s,v19.4s,#13 - add w5,w5,w11 - ldr w12,[sp,#56] - and w13,w13,w14 - eor v17.16b,v17.16b,v16.16b - ror w15,w15,#2 - add w9,w9,w5 - eor w13,w13,w7 - eor v17.16b,v17.16b,v7.16b - add w4,w4,w12 - add w5,w5,w15 - and w12,w10,w9 - add v3.4s,v3.4s,v17.4s - bic w15,w3,w9 - eor w11,w9,w9,ror#5 - add w5,w5,w13 - ushr v18.4s,v3.4s,#17 - orr w12,w12,w15 - ushr v19.4s,v3.4s,#10 - eor w11,w11,w9,ror#19 - eor w15,w5,w5,ror#11 - sli v18.4s,v3.4s,#15 - add w4,w4,w12 - ushr v17.4s,v3.4s,#19 - ror w11,w11,#6 - eor w13,w5,w6 - eor v19.16b,v19.16b,v18.16b - eor w15,w15,w5,ror#20 - add w4,w4,w11 - sli v17.4s,v3.4s,#13 - ldr w12,[sp,#60] - and w14,w14,w13 - ror w15,w15,#2 - ld1 {v4.4s},[x16], #16 - add w8,w8,w4 - eor v19.16b,v19.16b,v17.16b - eor w14,w14,w6 - eor v17.16b,v17.16b,v17.16b - add w3,w3,w12 - add w4,w4,w15 - and w12,w9,w8 - mov v17.d[1],v19.d[0] - bic w15,w10,w8 - eor w11,w8,w8,ror#5 - add w4,w4,w14 - add v3.4s,v3.4s,v17.4s - orr w12,w12,w15 - eor w11,w11,w8,ror#19 - eor w15,w4,w4,ror#11 - add v4.4s,v4.4s,v3.4s - add w3,w3,w12 - ror w11,w11,#6 - eor w14,w4,w5 - eor w15,w15,w4,ror#20 - add w3,w3,w11 - ldr w12,[x16] - and w13,w13,w14 - ror w15,w15,#2 - add w7,w7,w3 - eor w13,w13,w5 - st1 {v4.4s},[x17], #16 - cmp w12,#0 // check for K256 terminator - ldr w12,[sp,#0] - sub x17,x17,#64 - bne .L_00_48 - - sub x16,x16,#256 // rewind x16 - cmp x1,x2 - mov x17, #64 - csel x17, x17, xzr, eq - sub x1,x1,x17 // avoid SEGV - mov x17,sp - add w10,w10,w12 - add w3,w3,w15 - and w12,w8,w7 - ld1 {v0.16b},[x1],#16 - bic w15,w9,w7 - eor w11,w7,w7,ror#5 - ld1 {v4.4s},[x16],#16 - add w3,w3,w13 - orr w12,w12,w15 - eor w11,w11,w7,ror#19 - eor w15,w3,w3,ror#11 - rev32 v0.16b,v0.16b - add w10,w10,w12 - ror w11,w11,#6 - eor w13,w3,w4 - eor w15,w15,w3,ror#20 - add v4.4s,v4.4s,v0.4s - add w10,w10,w11 - ldr w12,[sp,#4] - and w14,w14,w13 - ror w15,w15,#2 - add w6,w6,w10 - eor w14,w14,w4 - add w9,w9,w12 - add w10,w10,w15 - and w12,w7,w6 - bic w15,w8,w6 - eor w11,w6,w6,ror#5 - add w10,w10,w14 - orr w12,w12,w15 - eor w11,w11,w6,ror#19 - eor w15,w10,w10,ror#11 - add w9,w9,w12 - ror w11,w11,#6 - eor w14,w10,w3 - eor w15,w15,w10,ror#20 - add w9,w9,w11 - ldr w12,[sp,#8] - and w13,w13,w14 - ror w15,w15,#2 - add w5,w5,w9 - eor w13,w13,w3 - add w8,w8,w12 - add 
w9,w9,w15 - and w12,w6,w5 - bic w15,w7,w5 - eor w11,w5,w5,ror#5 - add w9,w9,w13 - orr w12,w12,w15 - eor w11,w11,w5,ror#19 - eor w15,w9,w9,ror#11 - add w8,w8,w12 - ror w11,w11,#6 - eor w13,w9,w10 - eor w15,w15,w9,ror#20 - add w8,w8,w11 - ldr w12,[sp,#12] - and w14,w14,w13 - ror w15,w15,#2 - add w4,w4,w8 - eor w14,w14,w10 - add w7,w7,w12 - add w8,w8,w15 - and w12,w5,w4 - bic w15,w6,w4 - eor w11,w4,w4,ror#5 - add w8,w8,w14 - orr w12,w12,w15 - eor w11,w11,w4,ror#19 - eor w15,w8,w8,ror#11 - add w7,w7,w12 - ror w11,w11,#6 - eor w14,w8,w9 - eor w15,w15,w8,ror#20 - add w7,w7,w11 - ldr w12,[sp,#16] - and w13,w13,w14 - ror w15,w15,#2 - add w3,w3,w7 - eor w13,w13,w9 - st1 {v4.4s},[x17], #16 - add w6,w6,w12 - add w7,w7,w15 - and w12,w4,w3 - ld1 {v1.16b},[x1],#16 - bic w15,w5,w3 - eor w11,w3,w3,ror#5 - ld1 {v4.4s},[x16],#16 - add w7,w7,w13 - orr w12,w12,w15 - eor w11,w11,w3,ror#19 - eor w15,w7,w7,ror#11 - rev32 v1.16b,v1.16b - add w6,w6,w12 - ror w11,w11,#6 - eor w13,w7,w8 - eor w15,w15,w7,ror#20 - add v4.4s,v4.4s,v1.4s - add w6,w6,w11 - ldr w12,[sp,#20] - and w14,w14,w13 - ror w15,w15,#2 - add w10,w10,w6 - eor w14,w14,w8 - add w5,w5,w12 - add w6,w6,w15 - and w12,w3,w10 - bic w15,w4,w10 - eor w11,w10,w10,ror#5 - add w6,w6,w14 - orr w12,w12,w15 - eor w11,w11,w10,ror#19 - eor w15,w6,w6,ror#11 - add w5,w5,w12 - ror w11,w11,#6 - eor w14,w6,w7 - eor w15,w15,w6,ror#20 - add w5,w5,w11 - ldr w12,[sp,#24] - and w13,w13,w14 - ror w15,w15,#2 - add w9,w9,w5 - eor w13,w13,w7 - add w4,w4,w12 - add w5,w5,w15 - and w12,w10,w9 - bic w15,w3,w9 - eor w11,w9,w9,ror#5 - add w5,w5,w13 - orr w12,w12,w15 - eor w11,w11,w9,ror#19 - eor w15,w5,w5,ror#11 - add w4,w4,w12 - ror w11,w11,#6 - eor w13,w5,w6 - eor w15,w15,w5,ror#20 - add w4,w4,w11 - ldr w12,[sp,#28] - and w14,w14,w13 - ror w15,w15,#2 - add w8,w8,w4 - eor w14,w14,w6 - add w3,w3,w12 - add w4,w4,w15 - and w12,w9,w8 - bic w15,w10,w8 - eor w11,w8,w8,ror#5 - add w4,w4,w14 - orr w12,w12,w15 - eor w11,w11,w8,ror#19 - eor w15,w4,w4,ror#11 - add w3,w3,w12 - ror w11,w11,#6 - eor w14,w4,w5 - eor w15,w15,w4,ror#20 - add w3,w3,w11 - ldr w12,[sp,#32] - and w13,w13,w14 - ror w15,w15,#2 - add w7,w7,w3 - eor w13,w13,w5 - st1 {v4.4s},[x17], #16 - add w10,w10,w12 - add w3,w3,w15 - and w12,w8,w7 - ld1 {v2.16b},[x1],#16 - bic w15,w9,w7 - eor w11,w7,w7,ror#5 - ld1 {v4.4s},[x16],#16 - add w3,w3,w13 - orr w12,w12,w15 - eor w11,w11,w7,ror#19 - eor w15,w3,w3,ror#11 - rev32 v2.16b,v2.16b - add w10,w10,w12 - ror w11,w11,#6 - eor w13,w3,w4 - eor w15,w15,w3,ror#20 - add v4.4s,v4.4s,v2.4s - add w10,w10,w11 - ldr w12,[sp,#36] - and w14,w14,w13 - ror w15,w15,#2 - add w6,w6,w10 - eor w14,w14,w4 - add w9,w9,w12 - add w10,w10,w15 - and w12,w7,w6 - bic w15,w8,w6 - eor w11,w6,w6,ror#5 - add w10,w10,w14 - orr w12,w12,w15 - eor w11,w11,w6,ror#19 - eor w15,w10,w10,ror#11 - add w9,w9,w12 - ror w11,w11,#6 - eor w14,w10,w3 - eor w15,w15,w10,ror#20 - add w9,w9,w11 - ldr w12,[sp,#40] - and w13,w13,w14 - ror w15,w15,#2 - add w5,w5,w9 - eor w13,w13,w3 - add w8,w8,w12 - add w9,w9,w15 - and w12,w6,w5 - bic w15,w7,w5 - eor w11,w5,w5,ror#5 - add w9,w9,w13 - orr w12,w12,w15 - eor w11,w11,w5,ror#19 - eor w15,w9,w9,ror#11 - add w8,w8,w12 - ror w11,w11,#6 - eor w13,w9,w10 - eor w15,w15,w9,ror#20 - add w8,w8,w11 - ldr w12,[sp,#44] - and w14,w14,w13 - ror w15,w15,#2 - add w4,w4,w8 - eor w14,w14,w10 - add w7,w7,w12 - add w8,w8,w15 - and w12,w5,w4 - bic w15,w6,w4 - eor w11,w4,w4,ror#5 - add w8,w8,w14 - orr w12,w12,w15 - eor w11,w11,w4,ror#19 - eor w15,w8,w8,ror#11 - add w7,w7,w12 - ror w11,w11,#6 - eor w14,w8,w9 - eor 
w15,w15,w8,ror#20 - add w7,w7,w11 - ldr w12,[sp,#48] - and w13,w13,w14 - ror w15,w15,#2 - add w3,w3,w7 - eor w13,w13,w9 - st1 {v4.4s},[x17], #16 - add w6,w6,w12 - add w7,w7,w15 - and w12,w4,w3 - ld1 {v3.16b},[x1],#16 - bic w15,w5,w3 - eor w11,w3,w3,ror#5 - ld1 {v4.4s},[x16],#16 - add w7,w7,w13 - orr w12,w12,w15 - eor w11,w11,w3,ror#19 - eor w15,w7,w7,ror#11 - rev32 v3.16b,v3.16b - add w6,w6,w12 - ror w11,w11,#6 - eor w13,w7,w8 - eor w15,w15,w7,ror#20 - add v4.4s,v4.4s,v3.4s - add w6,w6,w11 - ldr w12,[sp,#52] - and w14,w14,w13 - ror w15,w15,#2 - add w10,w10,w6 - eor w14,w14,w8 - add w5,w5,w12 - add w6,w6,w15 - and w12,w3,w10 - bic w15,w4,w10 - eor w11,w10,w10,ror#5 - add w6,w6,w14 - orr w12,w12,w15 - eor w11,w11,w10,ror#19 - eor w15,w6,w6,ror#11 - add w5,w5,w12 - ror w11,w11,#6 - eor w14,w6,w7 - eor w15,w15,w6,ror#20 - add w5,w5,w11 - ldr w12,[sp,#56] - and w13,w13,w14 - ror w15,w15,#2 - add w9,w9,w5 - eor w13,w13,w7 - add w4,w4,w12 - add w5,w5,w15 - and w12,w10,w9 - bic w15,w3,w9 - eor w11,w9,w9,ror#5 - add w5,w5,w13 - orr w12,w12,w15 - eor w11,w11,w9,ror#19 - eor w15,w5,w5,ror#11 - add w4,w4,w12 - ror w11,w11,#6 - eor w13,w5,w6 - eor w15,w15,w5,ror#20 - add w4,w4,w11 - ldr w12,[sp,#60] - and w14,w14,w13 - ror w15,w15,#2 - add w8,w8,w4 - eor w14,w14,w6 - add w3,w3,w12 - add w4,w4,w15 - and w12,w9,w8 - bic w15,w10,w8 - eor w11,w8,w8,ror#5 - add w4,w4,w14 - orr w12,w12,w15 - eor w11,w11,w8,ror#19 - eor w15,w4,w4,ror#11 - add w3,w3,w12 - ror w11,w11,#6 - eor w14,w4,w5 - eor w15,w15,w4,ror#20 - add w3,w3,w11 - and w13,w13,w14 - ror w15,w15,#2 - add w7,w7,w3 - eor w13,w13,w5 - st1 {v4.4s},[x17], #16 - add w3,w3,w15 // h+=Sigma0(a) from the past - ldp w11,w12,[x0,#0] - add w3,w3,w13 // h+=Maj(a,b,c) from the past - ldp w13,w14,[x0,#8] - add w3,w3,w11 // accumulate - add w4,w4,w12 - ldp w11,w12,[x0,#16] - add w5,w5,w13 - add w6,w6,w14 - ldp w13,w14,[x0,#24] - add w7,w7,w11 - add w8,w8,w12 - ldr w12,[sp,#0] - stp w3,w4,[x0,#0] - add w9,w9,w13 - mov w13,wzr - stp w5,w6,[x0,#8] - add w10,w10,w14 - stp w7,w8,[x0,#16] - eor w14,w4,w5 - stp w9,w10,[x0,#24] - mov w15,wzr - mov x17,sp - b.ne .L_00_48 - - ldr x29,[x29] - add sp,sp,#16*4+16 - ret -.size blst_sha256_block_data_order,.-blst_sha256_block_data_order -.globl blst_sha256_emit -.hidden blst_sha256_emit -.type blst_sha256_emit,%function -.align 4 -blst_sha256_emit: - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] -#ifndef __AARCH64EB__ - rev x4,x4 - rev x5,x5 - rev x6,x6 - rev x7,x7 -#endif - str w4,[x0,#4] - lsr x4,x4,#32 - str w5,[x0,#12] - lsr x5,x5,#32 - str w6,[x0,#20] - lsr x6,x6,#32 - str w7,[x0,#28] - lsr x7,x7,#32 - str w4,[x0,#0] - str w5,[x0,#8] - str w6,[x0,#16] - str w7,[x0,#24] - ret -.size blst_sha256_emit,.-blst_sha256_emit - -.globl blst_sha256_bcopy -.hidden blst_sha256_bcopy -.type blst_sha256_bcopy,%function -.align 4 -blst_sha256_bcopy: -.Loop_bcopy: - ldrb w3,[x1],#1 - sub x2,x2,#1 - strb w3,[x0],#1 - cbnz x2,.Loop_bcopy - ret -.size blst_sha256_bcopy,.-blst_sha256_bcopy - -.globl blst_sha256_hcopy -.hidden blst_sha256_hcopy -.type blst_sha256_hcopy,%function -.align 4 -blst_sha256_hcopy: - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - stp x4,x5,[x0] - stp x6,x7,[x0,#16] - ret -.size blst_sha256_hcopy,.-blst_sha256_hcopy diff --git a/crypto/blst_src/build/elf/sha256-portable-x86_64.s b/crypto/blst_src/build/elf/sha256-portable-x86_64.s deleted file mode 100644 index 2fd6a770917..00000000000 --- a/crypto/blst_src/build/elf/sha256-portable-x86_64.s +++ /dev/null @@ -1,1758 +0,0 @@ -.comm __blst_platform_cap,4 -.text - -.globl 
blst_sha256_block_data_order -.type blst_sha256_block_data_order,@function -.align 16 -blst_sha256_block_data_order: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - movq %rsp,%rbp -.cfi_def_cfa_register %rbp -#ifdef __BLST_PORTABLE__ - testl $2,__blst_platform_cap(%rip) - jnz .Lblst_sha256_block_data_order$2 -#endif - pushq %rbx -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - shlq $4,%rdx - subq $64+24,%rsp - -.cfi_def_cfa %rsp,144 - - leaq (%rsi,%rdx,4),%rdx - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - jmp .Lloop - -.align 16 -.Lloop: - movl %ebx,%edi - leaq K256(%rip),%rbp - xorl %ecx,%edi - movl 0(%rsi),%r12d - movl %r8d,%r13d - movl %eax,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,0(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl 0(%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - addl %r14d,%r11d - movl 4(%rsi),%r12d - movl %edx,%r13d - movl %r11d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,4(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl 4(%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - addl %r14d,%r10d - movl 8(%rsi),%r12d - movl %ecx,%r13d - movl %r10d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,8(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl 8(%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - addl %r14d,%r9d - movl 12(%rsi),%r12d - movl %ebx,%r13d - movl %r9d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,12(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl 12(%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - addl %r14d,%r8d - movl 16(%rsi),%r12d - movl %eax,%r13d - movl %r8d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,16(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl 
$11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl 16(%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - addl %r14d,%edx - movl 20(%rsi),%r12d - movl %r11d,%r13d - movl %edx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,20(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl 20(%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - addl %r14d,%ecx - movl 24(%rsi),%r12d - movl %r10d,%r13d - movl %ecx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,24(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl 24(%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - addl %r14d,%ebx - movl 28(%rsi),%r12d - movl %r9d,%r13d - movl %ebx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,28(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl 28(%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - addl %r14d,%eax - movl 32(%rsi),%r12d - movl %r8d,%r13d - movl %eax,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,32(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl 32(%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - addl %r14d,%r11d - movl 36(%rsi),%r12d - movl %edx,%r13d - movl %r11d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,36(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl 36(%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - addl %r14d,%r10d - movl 40(%rsi),%r12d - movl %ecx,%r13d - movl %r10d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,40(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl 40(%rbp),%r12d - xorl 
%r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - addl %r14d,%r9d - movl 44(%rsi),%r12d - movl %ebx,%r13d - movl %r9d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,44(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl 44(%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - addl %r14d,%r8d - movl 48(%rsi),%r12d - movl %eax,%r13d - movl %r8d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,48(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl 48(%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - addl %r14d,%edx - movl 52(%rsi),%r12d - movl %r11d,%r13d - movl %edx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,52(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl 52(%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - addl %r14d,%ecx - movl 56(%rsi),%r12d - movl %r10d,%r13d - movl %ecx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,56(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl 56(%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - addl %r14d,%ebx - movl 60(%rsi),%r12d - movl %r9d,%r13d - movl %ebx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,60(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl 60(%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - jmp .Lrounds_16_xx -.align 16 -.Lrounds_16_xx: - movl 4(%rsp),%r13d - movl 56(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%eax - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 36(%rsp),%r12d - - addl 0(%rsp),%r12d - movl %r8d,%r13d - addl %r15d,%r12d - movl %eax,%r14d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl 
%r10d,%r15d - - movl %r12d,0(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl 64(%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - movl 8(%rsp),%r13d - movl 60(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r11d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 40(%rsp),%r12d - - addl 4(%rsp),%r12d - movl %edx,%r13d - addl %edi,%r12d - movl %r11d,%r14d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,4(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl 68(%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - movl 12(%rsp),%r13d - movl 0(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r10d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 44(%rsp),%r12d - - addl 8(%rsp),%r12d - movl %ecx,%r13d - addl %r15d,%r12d - movl %r10d,%r14d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,8(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl 72(%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - movl 16(%rsp),%r13d - movl 4(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r9d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 48(%rsp),%r12d - - addl 12(%rsp),%r12d - movl %ebx,%r13d - addl %edi,%r12d - movl %r9d,%r14d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,12(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl 76(%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - movl 20(%rsp),%r13d - movl 8(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r8d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 52(%rsp),%r12d - - addl 16(%rsp),%r12d - movl %eax,%r13d - addl %r15d,%r12d - movl %r8d,%r14d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,16(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - 
xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl 80(%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - movl 24(%rsp),%r13d - movl 12(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%edx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 56(%rsp),%r12d - - addl 20(%rsp),%r12d - movl %r11d,%r13d - addl %edi,%r12d - movl %edx,%r14d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,20(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl 84(%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - movl 28(%rsp),%r13d - movl 16(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ecx - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 60(%rsp),%r12d - - addl 24(%rsp),%r12d - movl %r10d,%r13d - addl %r15d,%r12d - movl %ecx,%r14d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,24(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl 88(%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - movl 32(%rsp),%r13d - movl 20(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ebx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 0(%rsp),%r12d - - addl 28(%rsp),%r12d - movl %r9d,%r13d - addl %edi,%r12d - movl %ebx,%r14d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,28(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl 92(%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - movl 36(%rsp),%r13d - movl 24(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%eax - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 4(%rsp),%r12d - - addl 32(%rsp),%r12d - movl %r8d,%r13d - addl %r15d,%r12d - movl %eax,%r14d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,32(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl 
96(%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - movl 40(%rsp),%r13d - movl 28(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r11d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 8(%rsp),%r12d - - addl 36(%rsp),%r12d - movl %edx,%r13d - addl %edi,%r12d - movl %r11d,%r14d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,36(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl 100(%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - movl 44(%rsp),%r13d - movl 32(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r10d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 12(%rsp),%r12d - - addl 40(%rsp),%r12d - movl %ecx,%r13d - addl %r15d,%r12d - movl %r10d,%r14d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,40(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl 104(%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - movl 48(%rsp),%r13d - movl 36(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r9d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 16(%rsp),%r12d - - addl 44(%rsp),%r12d - movl %ebx,%r13d - addl %edi,%r12d - movl %r9d,%r14d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,44(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl 108(%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - movl 52(%rsp),%r13d - movl 40(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r8d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 20(%rsp),%r12d - - addl 48(%rsp),%r12d - movl %eax,%r13d - addl %r15d,%r12d - movl %r8d,%r14d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,48(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl 112(%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl 
%r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - movl 56(%rsp),%r13d - movl 44(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%edx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 24(%rsp),%r12d - - addl 52(%rsp),%r12d - movl %r11d,%r13d - addl %edi,%r12d - movl %edx,%r14d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,52(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl 116(%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - movl 60(%rsp),%r13d - movl 48(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ecx - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 28(%rsp),%r12d - - addl 56(%rsp),%r12d - movl %r10d,%r13d - addl %r15d,%r12d - movl %ecx,%r14d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,56(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl 120(%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - movl 0(%rsp),%r13d - movl 52(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ebx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 32(%rsp),%r12d - - addl 60(%rsp),%r12d - movl %r9d,%r13d - addl %edi,%r12d - movl %ebx,%r14d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,60(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl 124(%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - leaq 64(%rbp),%rbp - cmpb $0x19,3(%rbp) - jnz .Lrounds_16_xx - - movq 64+0(%rsp),%rdi - addl %r14d,%eax - leaq 64(%rsi),%rsi - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb .Lloop - - leaq 64+24+48(%rsp),%r11 -.cfi_def_cfa %r11,8 - movq 64+24(%rsp),%r15 - movq -40(%r11),%r14 - movq -32(%r11),%r13 - movq -24(%r11),%r12 - movq -16(%r11),%rbx - movq -8(%r11),%rbp -.cfi_restore %r12 -.cfi_restore %r13 -.cfi_restore %r14 -.cfi_restore %r15 -.cfi_restore %rbp -.cfi_restore %rbx - leaq (%r11),%rsp - .byte 0xf3,0xc3 -.cfi_endproc -.size 
blst_sha256_block_data_order,.-blst_sha256_block_data_order - -#ifndef __BLST_PORTABLE__ -.align 64 -.type K256,@object -K256: -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 -.globl blst_sha256_emit -.hidden blst_sha256_emit -.type blst_sha256_emit,@function -.align 16 -blst_sha256_emit: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - bswapq %r8 - movq 24(%rsi),%r11 - bswapq %r9 - movl %r8d,4(%rdi) - bswapq %r10 - movl %r9d,12(%rdi) - bswapq %r11 - movl %r10d,20(%rdi) - shrq $32,%r8 - movl %r11d,28(%rdi) - shrq $32,%r9 - movl %r8d,0(%rdi) - shrq $32,%r10 - movl %r9d,8(%rdi) - shrq $32,%r11 - movl %r10d,16(%rdi) - movl %r11d,24(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size blst_sha256_emit,.-blst_sha256_emit - -.globl blst_sha256_bcopy -.hidden blst_sha256_bcopy -.type blst_sha256_bcopy,@function -.align 16 -blst_sha256_bcopy: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - subq %rsi,%rdi -.Loop_bcopy: - movzbl (%rsi),%eax - leaq 1(%rsi),%rsi - movb %al,-1(%rdi,%rsi,1) - decq %rdx - jnz .Loop_bcopy - .byte 0xf3,0xc3 -.cfi_endproc -.size blst_sha256_bcopy,.-blst_sha256_bcopy - -.globl blst_sha256_hcopy -.hidden blst_sha256_hcopy -.type blst_sha256_hcopy,@function -.align 16 -blst_sha256_hcopy: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size blst_sha256_hcopy,.-blst_sha256_hcopy -#endif - -.section .note.GNU-stack,"",@progbits -.section .note.gnu.property,"a",@note - .long 4,2f-1f,5 - .byte 0x47,0x4E,0x55,0 -1: .long 0xc0000002,4,3 -.align 8 -2: diff --git a/crypto/blst_src/build/elf/sha256-x86_64.s b/crypto/blst_src/build/elf/sha256-x86_64.s deleted file mode 100644 index 940051aab16..00000000000 --- a/crypto/blst_src/build/elf/sha256-x86_64.s +++ /dev/null @@ -1,1455 +0,0 @@ -.comm __blst_platform_cap,4 -.text - -.align 64 -.type K256,@object -K256: -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 
0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff -.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 -.globl blst_sha256_block_data_order_shaext -.hidden blst_sha256_block_data_order_shaext -.type blst_sha256_block_data_order_shaext,@function -.align 64 -blst_sha256_block_data_order_shaext: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - movq %rsp,%rbp -.cfi_def_cfa_register %rbp -.Lblst_sha256_block_data_order$2: - - leaq K256+128(%rip),%rcx - movdqu (%rdi),%xmm1 - movdqu 16(%rdi),%xmm2 - movdqa 256-128(%rcx),%xmm7 - - pshufd $0x1b,%xmm1,%xmm0 - pshufd $0xb1,%xmm1,%xmm1 - pshufd $0x1b,%xmm2,%xmm2 - movdqa %xmm7,%xmm8 -.byte 102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - jmp .Loop_shaext - -.align 16 -.Loop_shaext: - movdqu (%rsi),%xmm3 - movdqu 16(%rsi),%xmm4 - movdqu 32(%rsi),%xmm5 -.byte 102,15,56,0,223 - movdqu 48(%rsi),%xmm6 - - movdqa 0-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 102,15,56,0,231 - movdqa %xmm2,%xmm10 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - nop - movdqa %xmm1,%xmm9 -.byte 15,56,203,202 - - movdqa 16-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 102,15,56,0,239 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - leaq 64(%rsi),%rsi -.byte 15,56,204,220 -.byte 15,56,203,202 - - movdqa 32-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 102,15,56,0,247 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - - movdqa 48-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 64-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 80-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 96-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 112-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 128-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 144-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 
- pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 160-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 176-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 192-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 208-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 -.byte 15,56,203,202 - paddd %xmm7,%xmm6 - - movdqa 224-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 -.byte 15,56,205,245 - movdqa %xmm8,%xmm7 -.byte 15,56,203,202 - - movdqa 240-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 - nop -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - decq %rdx - nop -.byte 15,56,203,202 - - paddd %xmm10,%xmm2 - paddd %xmm9,%xmm1 - jnz .Loop_shaext - - pshufd $0xb1,%xmm2,%xmm2 - pshufd $0x1b,%xmm1,%xmm7 - pshufd $0xb1,%xmm1,%xmm1 - punpckhqdq %xmm2,%xmm1 -.byte 102,15,58,15,215,8 - - movdqu %xmm1,(%rdi) - movdqu %xmm2,16(%rdi) -.cfi_def_cfa_register %rsp - popq %rbp -.cfi_adjust_cfa_offset -8 -.cfi_restore %rbp - - .byte 0xf3,0xc3 -.cfi_endproc -.size blst_sha256_block_data_order_shaext,.-blst_sha256_block_data_order_shaext -.globl blst_sha256_block_data_order -.hidden blst_sha256_block_data_order -.type blst_sha256_block_data_order,@function -.align 64 -blst_sha256_block_data_order: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - movq %rsp,%rbp -.cfi_def_cfa_register %rbp - testl $2,__blst_platform_cap(%rip) - jnz .Lblst_sha256_block_data_order$2 - pushq %rbx -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - shlq $4,%rdx - subq $24,%rsp - - leaq (%rsi,%rdx,4),%rdx - movq %rdi,-64(%rbp) - - movq %rdx,-48(%rbp) - - - leaq -64(%rsp),%rsp - movl 0(%rdi),%eax - andq $-64,%rsp - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - - - jmp .Lloop_ssse3 -.align 16 -.Lloop_ssse3: - movdqa K256+256(%rip),%xmm7 - movq %rsi,-56(%rbp) - movdqu 0(%rsi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 -.byte 102,15,56,0,199 - movdqu 48(%rsi),%xmm3 - leaq K256(%rip),%rsi -.byte 102,15,56,0,207 - movdqa 0(%rsi),%xmm4 - movdqa 16(%rsi),%xmm5 -.byte 102,15,56,0,215 - paddd %xmm0,%xmm4 - movdqa 32(%rsi),%xmm6 -.byte 102,15,56,0,223 - movdqa 48(%rsi),%xmm7 - paddd %xmm1,%xmm5 - paddd %xmm2,%xmm6 - paddd %xmm3,%xmm7 - movdqa %xmm4,0(%rsp) - movl %eax,%r14d - movdqa %xmm5,16(%rsp) - movl %ebx,%edi - movdqa %xmm6,32(%rsp) - xorl %ecx,%edi - movdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp .Lssse3_00_47 - -.align 16 -.Lssse3_00_47: - subq $-64,%rsi - rorl $14,%r13d - movdqa %xmm1,%xmm4 - movl %r14d,%eax - movl %r9d,%r12d - movdqa %xmm3,%xmm7 - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl 
$5,%r13d - xorl %eax,%r14d -.byte 102,15,58,15,224,4 - andl %r8d,%r12d - xorl %r8d,%r13d -.byte 102,15,58,15,250,4 - addl 0(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %ebx,%r15d - addl %r12d,%r11d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - paddd %xmm7,%xmm0 - rorl $2,%r14d - addl %r11d,%edx - psrld $7,%xmm6 - addl %edi,%r11d - movl %edx,%r13d - pshufd $250,%xmm3,%xmm7 - addl %r11d,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%r11d - movl %r8d,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %r11d,%r14d - pxor %xmm5,%xmm4 - andl %edx,%r12d - xorl %edx,%r13d - pslld $11,%xmm5 - addl 4(%rsp),%r10d - movl %r11d,%edi - pxor %xmm6,%xmm4 - xorl %r9d,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %eax,%edi - addl %r12d,%r10d - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - psrld $10,%xmm7 - addl %r13d,%r10d - xorl %eax,%r15d - paddd %xmm4,%xmm0 - rorl $2,%r14d - addl %r10d,%ecx - psrlq $17,%xmm6 - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %ecx,%r13d - xorl %r8d,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - psrldq $8,%xmm7 - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - paddd %xmm7,%xmm0 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - pshufd $80,%xmm0,%xmm7 - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - movdqa %xmm7,%xmm6 - addl %edi,%r9d - movl %ebx,%r13d - psrld $10,%xmm7 - addl %r9d,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%r9d - movl %ecx,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - psrlq $2,%xmm6 - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - pxor %xmm6,%xmm7 - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %r10d,%edi - addl %r12d,%r8d - movdqa 0(%rsi),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - paddd %xmm7,%xmm0 - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - paddd %xmm0,%xmm6 - movl %eax,%r13d - addl %r8d,%r14d - movdqa %xmm6,0(%rsp) - rorl $14,%r13d - movdqa %xmm2,%xmm4 - movl %r14d,%r8d - movl %ebx,%r12d - movdqa %xmm0,%xmm7 - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d -.byte 102,15,58,15,225,4 - andl %eax,%r12d - xorl %eax,%r13d -.byte 102,15,58,15,251,4 - addl 16(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %r9d,%r15d - addl %r12d,%edx - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - paddd %xmm7,%xmm1 - rorl $2,%r14d - addl %edx,%r11d - psrld $7,%xmm6 - addl %edi,%edx - movl %r11d,%r13d - pshufd $250,%xmm0,%xmm7 - addl %edx,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%edx - movl %eax,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %edx,%r14d - pxor %xmm5,%xmm4 - andl %r11d,%r12d - xorl %r11d,%r13d - pslld $11,%xmm5 - addl 20(%rsp),%ecx - movl %edx,%edi - pxor %xmm6,%xmm4 - xorl %ebx,%r12d - rorl $11,%r14d - movdqa 
%xmm7,%xmm6 - xorl %r8d,%edi - addl %r12d,%ecx - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - psrld $10,%xmm7 - addl %r13d,%ecx - xorl %r8d,%r15d - paddd %xmm4,%xmm1 - rorl $2,%r14d - addl %ecx,%r10d - psrlq $17,%xmm6 - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %r10d,%r13d - xorl %eax,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - psrldq $8,%xmm7 - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - paddd %xmm7,%xmm1 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - pshufd $80,%xmm1,%xmm7 - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - movdqa %xmm7,%xmm6 - addl %edi,%ebx - movl %r9d,%r13d - psrld $10,%xmm7 - addl %ebx,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%ebx - movl %r10d,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - psrlq $2,%xmm6 - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - pxor %xmm6,%xmm7 - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %ecx,%edi - addl %r12d,%eax - movdqa 16(%rsi),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - paddd %xmm7,%xmm1 - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - paddd %xmm1,%xmm6 - movl %r8d,%r13d - addl %eax,%r14d - movdqa %xmm6,16(%rsp) - rorl $14,%r13d - movdqa %xmm3,%xmm4 - movl %r14d,%eax - movl %r9d,%r12d - movdqa %xmm1,%xmm7 - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d -.byte 102,15,58,15,226,4 - andl %r8d,%r12d - xorl %r8d,%r13d -.byte 102,15,58,15,248,4 - addl 32(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %ebx,%r15d - addl %r12d,%r11d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - paddd %xmm7,%xmm2 - rorl $2,%r14d - addl %r11d,%edx - psrld $7,%xmm6 - addl %edi,%r11d - movl %edx,%r13d - pshufd $250,%xmm1,%xmm7 - addl %r11d,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%r11d - movl %r8d,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %r11d,%r14d - pxor %xmm5,%xmm4 - andl %edx,%r12d - xorl %edx,%r13d - pslld $11,%xmm5 - addl 36(%rsp),%r10d - movl %r11d,%edi - pxor %xmm6,%xmm4 - xorl %r9d,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %eax,%edi - addl %r12d,%r10d - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - psrld $10,%xmm7 - addl %r13d,%r10d - xorl %eax,%r15d - paddd %xmm4,%xmm2 - rorl $2,%r14d - addl %r10d,%ecx - psrlq $17,%xmm6 - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %ecx,%r13d - xorl %r8d,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - psrldq $8,%xmm7 - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - paddd %xmm7,%xmm2 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - pshufd $80,%xmm2,%xmm7 - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - movdqa %xmm7,%xmm6 - addl 
%edi,%r9d - movl %ebx,%r13d - psrld $10,%xmm7 - addl %r9d,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%r9d - movl %ecx,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - psrlq $2,%xmm6 - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - pxor %xmm6,%xmm7 - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %r10d,%edi - addl %r12d,%r8d - movdqa 32(%rsi),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - paddd %xmm7,%xmm2 - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - paddd %xmm2,%xmm6 - movl %eax,%r13d - addl %r8d,%r14d - movdqa %xmm6,32(%rsp) - rorl $14,%r13d - movdqa %xmm0,%xmm4 - movl %r14d,%r8d - movl %ebx,%r12d - movdqa %xmm2,%xmm7 - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d -.byte 102,15,58,15,227,4 - andl %eax,%r12d - xorl %eax,%r13d -.byte 102,15,58,15,249,4 - addl 48(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %r9d,%r15d - addl %r12d,%edx - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - paddd %xmm7,%xmm3 - rorl $2,%r14d - addl %edx,%r11d - psrld $7,%xmm6 - addl %edi,%edx - movl %r11d,%r13d - pshufd $250,%xmm2,%xmm7 - addl %edx,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%edx - movl %eax,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %edx,%r14d - pxor %xmm5,%xmm4 - andl %r11d,%r12d - xorl %r11d,%r13d - pslld $11,%xmm5 - addl 52(%rsp),%ecx - movl %edx,%edi - pxor %xmm6,%xmm4 - xorl %ebx,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %r8d,%edi - addl %r12d,%ecx - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - psrld $10,%xmm7 - addl %r13d,%ecx - xorl %r8d,%r15d - paddd %xmm4,%xmm3 - rorl $2,%r14d - addl %ecx,%r10d - psrlq $17,%xmm6 - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %r10d,%r13d - xorl %eax,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - psrldq $8,%xmm7 - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - paddd %xmm7,%xmm3 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - pshufd $80,%xmm3,%xmm7 - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - movdqa %xmm7,%xmm6 - addl %edi,%ebx - movl %r9d,%r13d - psrld $10,%xmm7 - addl %ebx,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%ebx - movl %r10d,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - psrlq $2,%xmm6 - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - pxor %xmm6,%xmm7 - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %ecx,%edi - addl %r12d,%eax - movdqa 48(%rsi),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - paddd %xmm7,%xmm3 - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - paddd %xmm3,%xmm6 - movl %r8d,%r13d - addl %eax,%r14d - movdqa %xmm6,48(%rsp) - cmpb $0,67(%rsi) - jne .Lssse3_00_47 - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl 
%eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - rorl $6,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - rorl $2,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - rorl $11,%r14d - xorl %eax,%edi - addl %r12d,%r10d - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - rorl $2,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - rorl $6,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - rorl $6,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - rorl $2,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - rorl $11,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - rorl $2,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - xorl %ecx,%edi - addl %r12d,%eax - rorl $6,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d 
- addl %eax,%r14d - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - rorl $6,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - rorl $2,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - rorl $11,%r14d - xorl %eax,%edi - addl %r12d,%r10d - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - rorl $2,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - rorl $6,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - rorl $6,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - rorl $2,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - rorl $11,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - rorl $2,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - xorl %ecx,%edi - addl %r12d,%eax - rorl 
$6,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq -64(%rbp),%rdi - movl %r14d,%eax - movq -56(%rbp),%rsi - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - leaq 64(%rsi),%rsi - cmpq -48(%rbp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb .Lloop_ssse3 - - xorps %xmm0,%xmm0 - movaps %xmm0,0(%rsp) - movaps %xmm0,16(%rsp) - movaps %xmm0,32(%rsp) - movaps %xmm0,48(%rsp) - movq -40(%rbp),%r15 - movq -32(%rbp),%r14 - movq -24(%rbp),%r13 - movq -16(%rbp),%r12 - movq -8(%rbp),%rbx - movq %rbp,%rsp -.cfi_def_cfa_register %rsp - popq %rbp -.cfi_adjust_cfa_offset -8 -.cfi_restore %rbp -.cfi_restore %r12 -.cfi_restore %r13 -.cfi_restore %r14 -.cfi_restore %r15 -.cfi_restore %rbx - .byte 0xf3,0xc3 -.cfi_endproc -.size blst_sha256_block_data_order,.-blst_sha256_block_data_order -.globl blst_sha256_emit -.hidden blst_sha256_emit -.type blst_sha256_emit,@function -.align 16 -blst_sha256_emit: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - bswapq %r8 - movq 24(%rsi),%r11 - bswapq %r9 - movl %r8d,4(%rdi) - bswapq %r10 - movl %r9d,12(%rdi) - bswapq %r11 - movl %r10d,20(%rdi) - shrq $32,%r8 - movl %r11d,28(%rdi) - shrq $32,%r9 - movl %r8d,0(%rdi) - shrq $32,%r10 - movl %r9d,8(%rdi) - shrq $32,%r11 - movl %r10d,16(%rdi) - movl %r11d,24(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size blst_sha256_emit,.-blst_sha256_emit - -.globl blst_sha256_bcopy -.hidden blst_sha256_bcopy -.type blst_sha256_bcopy,@function -.align 16 -blst_sha256_bcopy: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - subq %rsi,%rdi -.Loop_bcopy: - movzbl (%rsi),%eax - leaq 1(%rsi),%rsi - movb %al,-1(%rdi,%rsi,1) - decq %rdx - jnz .Loop_bcopy - .byte 0xf3,0xc3 -.cfi_endproc -.size blst_sha256_bcopy,.-blst_sha256_bcopy - -.globl blst_sha256_hcopy -.hidden blst_sha256_hcopy -.type blst_sha256_hcopy,@function -.align 16 -blst_sha256_hcopy: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size blst_sha256_hcopy,.-blst_sha256_hcopy - -.section .note.GNU-stack,"",@progbits -.section .note.gnu.property,"a",@note - .long 4,2f-1f,5 - .byte 0x47,0x4E,0x55,0 -1: .long 0xc0000002,4,3 -.align 8 -2: diff --git a/crypto/blst_src/build/mach-o/add_mod_256-armv8.S b/crypto/blst_src/build/mach-o/add_mod_256-armv8.S deleted file mode 100644 index 198d65aef69..00000000000 --- a/crypto/blst_src/build/mach-o/add_mod_256-armv8.S +++ /dev/null @@ -1,379 +0,0 @@ -.text - -.globl _add_mod_256 -.private_extern _add_mod_256 - -.align 5 -_add_mod_256: - ldp x8,x9,[x1] - ldp x12,x13,[x2] - - ldp x10,x11,[x1,#16] - adds x8,x8,x12 - ldp x14,x15,[x2,#16] - adcs x9,x9,x13 - ldp x4,x5,[x3] - adcs x10,x10,x14 - ldp x6,x7,[x3,#16] - adcs x11,x11,x15 - adc x3,xzr,xzr - - subs x16,x8,x4 - sbcs x17,x9,x5 - sbcs x1,x10,x6 - sbcs x2,x11,x7 - sbcs xzr,x3,xzr - - csel x8,x8,x16,lo - csel x9,x9,x17,lo - csel x10,x10,x1,lo - stp x8,x9,[x0] - csel x11,x11,x2,lo - stp x10,x11,[x0,#16] - - ret - - -.globl _mul_by_3_mod_256 -.private_extern _mul_by_3_mod_256 - -.align 5 
-_mul_by_3_mod_256: - ldp x12,x13,[x1] - ldp x14,x15,[x1,#16] - - adds x8,x12,x12 - ldp x4,x5,[x2] - adcs x9,x13,x13 - ldp x6,x7,[x2,#16] - adcs x10,x14,x14 - adcs x11,x15,x15 - adc x3,xzr,xzr - - subs x16,x8,x4 - sbcs x17,x9,x5 - sbcs x1,x10,x6 - sbcs x2,x11,x7 - sbcs xzr,x3,xzr - - csel x8,x8,x16,lo - csel x9,x9,x17,lo - csel x10,x10,x1,lo - csel x11,x11,x2,lo - - adds x8,x8,x12 - adcs x9,x9,x13 - adcs x10,x10,x14 - adcs x11,x11,x15 - adc x3,xzr,xzr - - subs x16,x8,x4 - sbcs x17,x9,x5 - sbcs x1,x10,x6 - sbcs x2,x11,x7 - sbcs xzr,x3,xzr - - csel x8,x8,x16,lo - csel x9,x9,x17,lo - csel x10,x10,x1,lo - stp x8,x9,[x0] - csel x11,x11,x2,lo - stp x10,x11,[x0,#16] - - ret - - -.globl _lshift_mod_256 -.private_extern _lshift_mod_256 - -.align 5 -_lshift_mod_256: - ldp x8,x9,[x1] - ldp x10,x11,[x1,#16] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - -Loop_lshift_mod_256: - adds x8,x8,x8 - sub x2,x2,#1 - adcs x9,x9,x9 - adcs x10,x10,x10 - adcs x11,x11,x11 - adc x3,xzr,xzr - - subs x12,x8,x4 - sbcs x13,x9,x5 - sbcs x14,x10,x6 - sbcs x15,x11,x7 - sbcs xzr,x3,xzr - - csel x8,x8,x12,lo - csel x9,x9,x13,lo - csel x10,x10,x14,lo - csel x11,x11,x15,lo - - cbnz x2,Loop_lshift_mod_256 - - stp x8,x9,[x0] - stp x10,x11,[x0,#16] - - ret - - -.globl _rshift_mod_256 -.private_extern _rshift_mod_256 - -.align 5 -_rshift_mod_256: - ldp x8,x9,[x1] - ldp x10,x11,[x1,#16] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - -Loop_rshift: - adds x12,x8,x4 - sub x2,x2,#1 - adcs x13,x9,x5 - adcs x14,x10,x6 - adcs x15,x11,x7 - adc x3,xzr,xzr - tst x8,#1 - - csel x12,x12,x8,ne - csel x13,x13,x9,ne - csel x14,x14,x10,ne - csel x15,x15,x11,ne - csel x3,x3,xzr,ne - - extr x8,x13,x12,#1 - extr x9,x14,x13,#1 - extr x10,x15,x14,#1 - extr x11,x3,x15,#1 - - cbnz x2,Loop_rshift - - stp x8,x9,[x0] - stp x10,x11,[x0,#16] - - ret - - -.globl _cneg_mod_256 -.private_extern _cneg_mod_256 - -.align 5 -_cneg_mod_256: - ldp x8,x9,[x1] - ldp x4,x5,[x3] - - ldp x10,x11,[x1,#16] - subs x12,x4,x8 - ldp x6,x7,[x3,#16] - orr x4,x8,x9 - sbcs x13,x5,x9 - orr x5,x10,x11 - sbcs x14,x6,x10 - orr x3,x4,x5 - sbc x15,x7,x11 - - cmp x3,#0 - csetm x3,ne - ands x2,x2,x3 - - csel x8,x8,x12,eq - csel x9,x9,x13,eq - csel x10,x10,x14,eq - stp x8,x9,[x0] - csel x11,x11,x15,eq - stp x10,x11,[x0,#16] - - ret - - -.globl _sub_mod_256 -.private_extern _sub_mod_256 - -.align 5 -_sub_mod_256: - ldp x8,x9,[x1] - ldp x12,x13,[x2] - - ldp x10,x11,[x1,#16] - subs x8,x8,x12 - ldp x14,x15,[x2,#16] - sbcs x9,x9,x13 - ldp x4,x5,[x3] - sbcs x10,x10,x14 - ldp x6,x7,[x3,#16] - sbcs x11,x11,x15 - sbc x3,xzr,xzr - - and x4,x4,x3 - and x5,x5,x3 - adds x8,x8,x4 - and x6,x6,x3 - adcs x9,x9,x5 - and x7,x7,x3 - adcs x10,x10,x6 - stp x8,x9,[x0] - adc x11,x11,x7 - stp x10,x11,[x0,#16] - - ret - - -.globl _check_mod_256 -.private_extern _check_mod_256 - -.align 5 -_check_mod_256: - ldp x8,x9,[x0] - ldp x10,x11,[x0,#16] - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x9,x9 - rev x10,x10 - rev x11,x11 -#endif - - subs xzr,x8,x4 - sbcs xzr,x9,x5 - orr x8,x8,x9 - sbcs xzr,x10,x6 - orr x8,x8,x10 - sbcs xzr,x11,x7 - orr x8,x8,x11 - sbc x1,xzr,xzr - - cmp x8,#0 - mov x0,#1 - csel x0,x0,xzr,ne - and x0,x0,x1 - - ret - - -.globl _add_n_check_mod_256 -.private_extern _add_n_check_mod_256 - -.align 5 -_add_n_check_mod_256: - ldp x8,x9,[x1] - ldp x12,x13,[x2] - ldp x10,x11,[x1,#16] - ldp x14,x15,[x2,#16] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 - rev x10,x10 - rev x14,x14 - rev x11,x11 - rev x15,x15 -#endif - - adds x8,x8,x12 - ldp x4,x5,[x3] - 
adcs x9,x9,x13 - ldp x6,x7,[x3,#16] - adcs x10,x10,x14 - adcs x11,x11,x15 - adc x3,xzr,xzr - - subs x16,x8,x4 - sbcs x17,x9,x5 - sbcs x1,x10,x6 - sbcs x2,x11,x7 - sbcs xzr,x3,xzr - - csel x8,x8,x16,lo - csel x9,x9,x17,lo - csel x10,x10,x1,lo - csel x11,x11,x2,lo - - orr x16, x8, x9 - orr x17, x10, x11 - orr x16, x16, x17 - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x9,x9 - rev x10,x10 - rev x11,x11 -#endif - - stp x8,x9,[x0] - stp x10,x11,[x0,#16] - - mov x17, #1 - cmp x16, #0 - csel x0, x17, xzr, ne - - ret - - -.globl _sub_n_check_mod_256 -.private_extern _sub_n_check_mod_256 - -.align 5 -_sub_n_check_mod_256: - ldp x8,x9,[x1] - ldp x12,x13,[x2] - ldp x10,x11,[x1,#16] - ldp x14,x15,[x2,#16] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 - rev x10,x10 - rev x14,x14 - rev x11,x11 - rev x15,x15 -#endif - - subs x8,x8,x12 - sbcs x9,x9,x13 - ldp x4,x5,[x3] - sbcs x10,x10,x14 - ldp x6,x7,[x3,#16] - sbcs x11,x11,x15 - sbc x3,xzr,xzr - - and x4,x4,x3 - and x5,x5,x3 - adds x8,x8,x4 - and x6,x6,x3 - adcs x9,x9,x5 - and x7,x7,x3 - adcs x10,x10,x6 - adc x11,x11,x7 - - orr x16, x8, x9 - orr x17, x10, x11 - orr x16, x16, x17 - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x9,x9 - rev x10,x10 - rev x11,x11 -#endif - - stp x8,x9,[x0] - stp x10,x11,[x0,#16] - - mov x17, #1 - cmp x16, #0 - csel x0, x17, xzr, ne - - ret - diff --git a/crypto/blst_src/build/mach-o/add_mod_256-x86_64.s b/crypto/blst_src/build/mach-o/add_mod_256-x86_64.s deleted file mode 100644 index 19e5ba9834f..00000000000 --- a/crypto/blst_src/build/mach-o/add_mod_256-x86_64.s +++ /dev/null @@ -1,564 +0,0 @@ -.text - -.globl _add_mod_256 -.private_extern _add_mod_256 - -.p2align 5 -_add_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - -L$oaded_a_add_mod_256: - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - movq %r8,%rax - adcq 16(%rdx),%r10 - movq %r9,%rsi - adcq 24(%rdx),%r11 - sbbq %rdx,%rdx - - movq %r10,%rbx - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - sbbq 16(%rcx),%r10 - movq %r11,%rbp - sbbq 24(%rcx),%r11 - sbbq $0,%rdx - - cmovcq %rax,%r8 - cmovcq %rsi,%r9 - movq %r8,0(%rdi) - cmovcq %rbx,%r10 - movq %r9,8(%rdi) - cmovcq %rbp,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.globl _mul_by_3_mod_256 -.private_extern _mul_by_3_mod_256 - -.p2align 5 -_mul_by_3_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - - - movq %rdx,%rcx - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq %rsi,%rdx - movq 24(%rsi),%r11 - - call __lshift_mod_256 - movq 0(%rsp),%r12 -.cfi_restore %r12 - jmp L$oaded_a_add_mod_256 - - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__lshift_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - addq %r8,%r8 - adcq %r9,%r9 - movq %r8,%rax - adcq %r10,%r10 - movq %r9,%rsi - adcq %r11,%r11 - sbbq %r12,%r12 - - movq %r10,%rbx - subq 0(%rcx),%r8 - sbbq 
8(%rcx),%r9 - sbbq 16(%rcx),%r10 - movq %r11,%rbp - sbbq 24(%rcx),%r11 - sbbq $0,%r12 - - cmovcq %rax,%r8 - cmovcq %rsi,%r9 - cmovcq %rbx,%r10 - cmovcq %rbp,%r11 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.globl _lshift_mod_256 -.private_extern _lshift_mod_256 - -.p2align 5 -_lshift_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - -L$oop_lshift_mod_256: - call __lshift_mod_256 - decl %edx - jnz L$oop_lshift_mod_256 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - movq 0(%rsp),%r12 -.cfi_restore %r12 - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.globl _rshift_mod_256 -.private_extern _rshift_mod_256 - -.p2align 5 -_rshift_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%rbp - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - -L$oop_rshift_mod_256: - movq %rbp,%r8 - andq $1,%rbp - movq 0(%rcx),%rax - negq %rbp - movq 8(%rcx),%rsi - movq 16(%rcx),%rbx - - andq %rbp,%rax - andq %rbp,%rsi - andq %rbp,%rbx - andq 24(%rcx),%rbp - - addq %rax,%r8 - adcq %rsi,%r9 - adcq %rbx,%r10 - adcq %rbp,%r11 - sbbq %rax,%rax - - shrq $1,%r8 - movq %r9,%rbp - shrq $1,%r9 - movq %r10,%rbx - shrq $1,%r10 - movq %r11,%rsi - shrq $1,%r11 - - shlq $63,%rbp - shlq $63,%rbx - orq %r8,%rbp - shlq $63,%rsi - orq %rbx,%r9 - shlq $63,%rax - orq %rsi,%r10 - orq %rax,%r11 - - decl %edx - jnz L$oop_rshift_mod_256 - - movq %rbp,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.globl _cneg_mod_256 -.private_extern _cneg_mod_256 - -.p2align 5 -_cneg_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - - - movq 0(%rsi),%r12 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq %r12,%r8 - movq 24(%rsi),%r11 - orq %r9,%r12 - orq %r10,%r12 - orq %r11,%r12 - movq $-1,%rbp - - movq 0(%rcx),%rax - cmovnzq %rbp,%r12 - movq 8(%rcx),%rsi - movq 16(%rcx),%rbx - andq %r12,%rax - movq 24(%rcx),%rbp - andq %r12,%rsi - andq %r12,%rbx - andq %r12,%rbp - - subq %r8,%rax - sbbq %r9,%rsi - sbbq %r10,%rbx - sbbq %r11,%rbp - - orq %rdx,%rdx - - cmovzq %r8,%rax - cmovzq %r9,%rsi - movq %rax,0(%rdi) - cmovzq %r10,%rbx - movq %rsi,8(%rdi) - cmovzq %r11,%rbp - movq %rbx,16(%rdi) - movq %rbp,24(%rdi) - - movq 0(%rsp),%r12 -.cfi_restore %r12 - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.globl _sub_mod_256 -.private_extern _sub_mod_256 - -.p2align 5 -_sub_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - subq 
$8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - - subq 0(%rdx),%r8 - movq 0(%rcx),%rax - sbbq 8(%rdx),%r9 - movq 8(%rcx),%rsi - sbbq 16(%rdx),%r10 - movq 16(%rcx),%rbx - sbbq 24(%rdx),%r11 - movq 24(%rcx),%rbp - sbbq %rdx,%rdx - - andq %rdx,%rax - andq %rdx,%rsi - andq %rdx,%rbx - andq %rdx,%rbp - - addq %rax,%r8 - adcq %rsi,%r9 - movq %r8,0(%rdi) - adcq %rbx,%r10 - movq %r9,8(%rdi) - adcq %rbp,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.globl _check_mod_256 -.private_extern _check_mod_256 - -.p2align 5 -_check_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - movq 0(%rdi),%rax - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - - movq %rax,%r8 - orq %r9,%rax - orq %r10,%rax - orq %r11,%rax - - subq 0(%rsi),%r8 - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - sbbq 24(%rsi),%r11 - sbbq %rsi,%rsi - - movq $1,%rdx - cmpq $0,%rax - cmovneq %rdx,%rax - andq %rsi,%rax - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.globl _add_n_check_mod_256 -.private_extern _add_n_check_mod_256 - -.p2align 5 -_add_n_check_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - movq %r8,%rax - adcq 16(%rdx),%r10 - movq %r9,%rsi - adcq 24(%rdx),%r11 - sbbq %rdx,%rdx - - movq %r10,%rbx - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - sbbq 16(%rcx),%r10 - movq %r11,%rbp - sbbq 24(%rcx),%r11 - sbbq $0,%rdx - - cmovcq %rax,%r8 - cmovcq %rsi,%r9 - movq %r8,0(%rdi) - cmovcq %rbx,%r10 - movq %r9,8(%rdi) - cmovcq %rbp,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - orq %r9,%r8 - orq %r11,%r10 - orq %r10,%r8 - movq $1,%rax - cmovzq %r8,%rax - - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.globl _sub_n_check_mod_256 -.private_extern _sub_n_check_mod_256 - -.p2align 5 -_sub_n_check_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - - subq 0(%rdx),%r8 - movq 0(%rcx),%rax - sbbq 8(%rdx),%r9 - movq 8(%rcx),%rsi - sbbq 16(%rdx),%r10 - movq 16(%rcx),%rbx - sbbq 24(%rdx),%r11 - movq 24(%rcx),%rbp - sbbq %rdx,%rdx - - andq %rdx,%rax - andq %rdx,%rsi - andq %rdx,%rbx - andq %rdx,%rbp - - addq %rax,%r8 - adcq %rsi,%r9 - movq %r8,0(%rdi) - adcq %rbx,%r10 - movq %r9,8(%rdi) - adcq %rbp,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - orq %r9,%r8 - orq %r11,%r10 - orq %r10,%r8 - movq $1,%rax - cmovzq %r8,%rax - - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc - diff --git a/crypto/blst_src/build/mach-o/add_mod_384-armv8.S b/crypto/blst_src/build/mach-o/add_mod_384-armv8.S deleted file mode 100644 index a62995f2bed..00000000000 --- a/crypto/blst_src/build/mach-o/add_mod_384-armv8.S +++ /dev/null @@ -1,1000 +0,0 @@ -.text - 
-.globl _add_mod_384 -.private_extern _add_mod_384 - -.align 5 -_add_mod_384: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - - bl __add_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - - -.align 5 -__add_mod_384: - ldp x10,x11,[x1] - ldp x16,x17,[x2] - ldp x12,x13,[x1,#16] - ldp x19,x20,[x2,#16] - ldp x14,x15,[x1,#32] - ldp x21,x22,[x2,#32] - -__add_mod_384_ab_are_loaded: - adds x10,x10,x16 - adcs x11,x11,x17 - adcs x12,x12,x19 - adcs x13,x13,x20 - adcs x14,x14,x21 - adcs x15,x15,x22 - adc x3,xzr,xzr - - subs x16,x10,x4 - sbcs x17,x11,x5 - sbcs x19,x12,x6 - sbcs x20,x13,x7 - sbcs x21,x14,x8 - sbcs x22,x15,x9 - sbcs xzr,x3,xzr - - csel x10,x10,x16,lo - csel x11,x11,x17,lo - csel x12,x12,x19,lo - csel x13,x13,x20,lo - csel x14,x14,x21,lo - csel x15,x15,x22,lo - - ret - - -.globl _add_mod_384x -.private_extern _add_mod_384x - -.align 5 -_add_mod_384x: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - - bl __add_mod_384 - - stp x10,x11,[x0] - add x1,x1,#48 - stp x12,x13,[x0,#16] - add x2,x2,#48 - stp x14,x15,[x0,#32] - - bl __add_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl _rshift_mod_384 -.private_extern _rshift_mod_384 - -.align 5 -_rshift_mod_384: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - -Loop_rshift_mod_384: - sub x2,x2,#1 - bl __rshift_mod_384 - cbnz x2,Loop_rshift_mod_384 - - ldr x30,[sp,#8] - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - - -.align 5 -__rshift_mod_384: - sbfx x22,x10,#0,#1 - and x16,x22,x4 - and x17,x22,x5 - adds x10,x10,x16 - and x19,x22,x6 - adcs x11,x11,x17 - and x20,x22,x7 - adcs x12,x12,x19 - and x21,x22,x8 - adcs x13,x13,x20 - and x22,x22,x9 - adcs x14,x14,x21 - extr x10,x11,x10,#1 // a[0:5] >>= 1 - adcs x15,x15,x22 - extr x11,x12,x11,#1 - adc x22,xzr,xzr - extr x12,x13,x12,#1 - extr x13,x14,x13,#1 - extr x14,x15,x14,#1 - extr x15,x22,x15,#1 - ret - - -.globl _div_by_2_mod_384 -.private_extern _div_by_2_mod_384 - -.align 5 -_div_by_2_mod_384: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __rshift_mod_384 - - ldr x30,[sp,#8] - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl _lshift_mod_384 -.private_extern _lshift_mod_384 - -.align 5 -_lshift_mod_384: -.long 3573752639 - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - -Loop_lshift_mod_384: - sub x2,x2,#1 - bl __lshift_mod_384 - cbnz x2,Loop_lshift_mod_384 - - ldr x30,[sp,#8] - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - - -.align 5 -__lshift_mod_384: - adds x10,x10,x10 - adcs x11,x11,x11 - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adc x3,xzr,xzr - - subs x16,x10,x4 - sbcs x17,x11,x5 - sbcs x19,x12,x6 - sbcs x20,x13,x7 - sbcs x21,x14,x8 - sbcs x22,x15,x9 - sbcs xzr,x3,xzr - - csel x10,x10,x16,lo - csel x11,x11,x17,lo - csel x12,x12,x19,lo - csel x13,x13,x20,lo - csel x14,x14,x21,lo - csel x15,x15,x22,lo - - ret - - -.globl _mul_by_3_mod_384 -.private_extern _mul_by_3_mod_384 - -.align 5 -_mul_by_3_mod_384: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __lshift_mod_384 - - ldp x16,x17,[x1] - ldp x19,x20,[x1,#16] - ldp x21,x22,[x1,#32] - - bl __add_mod_384_ab_are_loaded - ldr x30,[sp,#8] - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl _mul_by_8_mod_384 -.private_extern _mul_by_8_mod_384 - -.align 5 -_mul_by_8_mod_384: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl _mul_by_3_mod_384x -.private_extern _mul_by_3_mod_384x - -.align 5 -_mul_by_3_mod_384x: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __lshift_mod_384 - - ldp x16,x17,[x1] - ldp x19,x20,[x1,#16] - ldp x21,x22,[x1,#32] - - bl __add_mod_384_ab_are_loaded - - stp x10,x11,[x0] - ldp x10,x11,[x1,#48] - stp x12,x13,[x0,#16] - ldp x12,x13,[x1,#64] - stp x14,x15,[x0,#32] - ldp x14,x15,[x1,#80] - - bl __lshift_mod_384 - - ldp x16,x17,[x1,#48] - ldp x19,x20,[x1,#64] - ldp x21,x22,[x1,#80] - - bl __add_mod_384_ab_are_loaded - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl _mul_by_8_mod_384x -.private_extern _mul_by_8_mod_384x - -.align 5 -_mul_by_8_mod_384x: -.long 3573752639 - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - - stp x10,x11,[x0] - ldp x10,x11,[x1,#48] - stp x12,x13,[x0,#16] - ldp x12,x13,[x1,#64] - stp x14,x15,[x0,#32] - ldp x14,x15,[x1,#80] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl _cneg_mod_384 -.private_extern _cneg_mod_384 - -.align 5 -_cneg_mod_384: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x4,x5,[x3] - ldp x12,x13,[x1,#16] - ldp x6,x7,[x3,#16] - - subs x16,x4,x10 - ldp x14,x15,[x1,#32] - ldp x8,x9,[x3,#32] - orr x3,x10,x11 - sbcs x17,x5,x11 - orr x3,x3,x12 - sbcs x19,x6,x12 - orr x3,x3,x13 - sbcs x20,x7,x13 - orr x3,x3,x14 - sbcs x21,x8,x14 - orr x3,x3,x15 - sbc x22,x9,x15 - - cmp x3,#0 - csetm x3,ne - ands x2,x2,x3 - - csel x10,x10,x16,eq - csel x11,x11,x17,eq - csel x12,x12,x19,eq - csel x13,x13,x20,eq - stp x10,x11,[x0] - csel x14,x14,x21,eq - stp x12,x13,[x0,#16] - csel x15,x15,x22,eq - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl _sub_mod_384 -.private_extern _sub_mod_384 - -.align 5 -_sub_mod_384: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - - bl __sub_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - - -.align 5 -__sub_mod_384: - ldp x10,x11,[x1] - ldp x16,x17,[x2] - ldp x12,x13,[x1,#16] - ldp x19,x20,[x2,#16] - ldp x14,x15,[x1,#32] - ldp x21,x22,[x2,#32] - - subs x10,x10,x16 - sbcs x11,x11,x17 - sbcs x12,x12,x19 - sbcs x13,x13,x20 - sbcs x14,x14,x21 - sbcs x15,x15,x22 - sbc x3,xzr,xzr - - and x16,x4,x3 - and x17,x5,x3 - adds x10,x10,x16 - and x19,x6,x3 - adcs x11,x11,x17 - and x20,x7,x3 - adcs x12,x12,x19 - and x21,x8,x3 - adcs x13,x13,x20 - and x22,x9,x3 - adcs x14,x14,x21 - adc x15,x15,x22 - - ret - - -.globl _sub_mod_384x -.private_extern _sub_mod_384x - -.align 5 -_sub_mod_384x: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - - bl __sub_mod_384 - - stp x10,x11,[x0] - add x1,x1,#48 - stp x12,x13,[x0,#16] - add x2,x2,#48 - stp x14,x15,[x0,#32] - - bl __sub_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl _mul_by_1_plus_i_mod_384x -.private_extern _mul_by_1_plus_i_mod_384x - -.align 5 -_mul_by_1_plus_i_mod_384x: -.long 3573752639 - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - add x2,x1,#48 - - bl __sub_mod_384 // a->re - a->im - - ldp x16,x17,[x1] - ldp x19,x20,[x1,#16] - ldp x21,x22,[x1,#32] - stp x10,x11,[x0] - ldp x10,x11,[x1,#48] - stp x12,x13,[x0,#16] - ldp x12,x13,[x1,#64] - stp x14,x15,[x0,#32] - ldp x14,x15,[x1,#80] - - bl __add_mod_384_ab_are_loaded // a->re + a->im - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - - -.globl _sgn0_pty_mod_384 -.private_extern _sgn0_pty_mod_384 - -.align 5 -_sgn0_pty_mod_384: - ldp x10,x11,[x0] - ldp x12,x13,[x0,#16] - ldp x14,x15,[x0,#32] - - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - ldp x8,x9,[x1,#32] - - and x0,x10,#1 - adds x10,x10,x10 - adcs x11,x11,x11 - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adc x3,xzr,xzr - - subs x10,x10,x4 - sbcs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbc x3,x3,xzr - - mvn x3,x3 - and x3,x3,#2 - orr x0,x0,x3 - - ret - - -.globl _sgn0_pty_mod_384x -.private_extern _sgn0_pty_mod_384x - -.align 5 -_sgn0_pty_mod_384x: - ldp x10,x11,[x0] - ldp x12,x13,[x0,#16] - ldp x14,x15,[x0,#32] - - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - ldp x8,x9,[x1,#32] - - and x2,x10,#1 - orr x3,x10,x11 - adds x10,x10,x10 - orr x3,x3,x12 - adcs x11,x11,x11 - orr x3,x3,x13 - adcs x12,x12,x12 - orr x3,x3,x14 - adcs x13,x13,x13 - orr x3,x3,x15 - adcs x14,x14,x14 - adcs x15,x15,x15 - adc x16,xzr,xzr - - subs x10,x10,x4 - sbcs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbc x16,x16,xzr - - ldp x10,x11,[x0,#48] - ldp x12,x13,[x0,#64] - ldp x14,x15,[x0,#80] - - mvn x16,x16 - and x16,x16,#2 - orr x2,x2,x16 - - and x0,x10,#1 - orr x1,x10,x11 - adds x10,x10,x10 - orr x1,x1,x12 - adcs x11,x11,x11 - orr x1,x1,x13 - adcs x12,x12,x12 - orr x1,x1,x14 - adcs x13,x13,x13 - orr x1,x1,x15 - adcs x14,x14,x14 - adcs x15,x15,x15 - adc x16,xzr,xzr - - subs x10,x10,x4 - sbcs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbc x16,x16,xzr - - mvn x16,x16 - and x16,x16,#2 - orr x0,x0,x16 - - cmp x3,#0 - csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) - - cmp x1,#0 - csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) - - and x3,x3,#1 - and x1,x1,#2 - orr x0,x1,x3 // pack sign and parity - - ret - -.globl _vec_select_32 -.private_extern _vec_select_32 - -.align 5 -_vec_select_32: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - bit v1.16b, v4.16b, v6.16b - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0] - ret - -.globl _vec_select_48 -.private_extern _vec_select_48 - -.align 5 -_vec_select_48: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - bit v1.16b, v4.16b, v6.16b - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0] - ret - -.globl _vec_select_96 -.private_extern _vec_select_96 - -.align 5 -_vec_select_96: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - bit v17.16b, v20.16b, v6.16b - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0] - ret - -.globl _vec_select_192 -.private_extern _vec_select_192 - -.align 5 -_vec_select_192: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - bit v17.16b, v20.16b, v6.16b - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - bit v17.16b, v20.16b, v6.16b - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0] - ret - -.globl _vec_select_144 -.private_extern _vec_select_144 - -.align 5 -_vec_select_144: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - bit v17.16b, v20.16b, v6.16b - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 - bit v0.16b, v3.16b, v6.16b - bit v1.16b, v4.16b, v6.16b - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0] - ret - -.globl _vec_select_288 -.private_extern _vec_select_288 - -.align 5 -_vec_select_288: - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - bit v17.16b, v20.16b, v6.16b - ld1 {v3.2d, v4.2d, 
v5.2d}, [x2],#48 - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - bit v17.16b, v20.16b, v6.16b - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - bit v17.16b, v20.16b, v6.16b - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0] - ret - -.globl _vec_prefetch -.private_extern _vec_prefetch - -.align 5 -_vec_prefetch: - add x1, x1, x0 - sub x1, x1, #1 - mov x2, #64 - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - csel x2, xzr, x2, hi - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - csel x2, xzr, x2, hi - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - csel x2, xzr, x2, hi - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - csel x2, xzr, x2, hi - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - csel x2, xzr, x2, hi - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - csel x0, x1, x0, hi - prfm pldl1keep, [x0] - ret - -.globl _vec_is_zero_16x -.private_extern _vec_is_zero_16x - -.align 5 -_vec_is_zero_16x: - ld1 {v0.2d}, [x0], #16 - lsr x1, x1, #4 - sub x1, x1, #1 - cbz x1, Loop_is_zero_done - -Loop_is_zero: - ld1 {v1.2d}, [x0], #16 - orr v0.16b, v0.16b, v1.16b - sub x1, x1, #1 - cbnz x1, Loop_is_zero - -Loop_is_zero_done: - dup v1.2d, v0.d[1] - orr v0.16b, v0.16b, v1.16b - mov x1, v0.d[0] - mov x0, #1 - cmp x1, #0 - csel x0, x0, xzr, eq - ret - -.globl _vec_is_equal_16x -.private_extern _vec_is_equal_16x - -.align 5 -_vec_is_equal_16x: - ld1 {v0.2d}, [x0], #16 - ld1 {v1.2d}, [x1], #16 - lsr x2, x2, #4 - eor v0.16b, v0.16b, v1.16b - -Loop_is_equal: - sub x2, x2, #1 - cbz x2, Loop_is_equal_done - ld1 {v1.2d}, [x0], #16 - ld1 {v2.2d}, [x1], #16 - eor v1.16b, v1.16b, v2.16b - orr v0.16b, v0.16b, v1.16b - b Loop_is_equal - nop - -Loop_is_equal_done: - dup v1.2d, v0.d[1] - orr v0.16b, v0.16b, v1.16b - mov x1, v0.d[0] - mov x0, #1 - cmp x1, #0 - csel x0, x0, xzr, eq - ret - diff --git a/crypto/blst_src/build/mach-o/add_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/add_mod_384-x86_64.s deleted file mode 100644 index 974978e3425..00000000000 --- a/crypto/blst_src/build/mach-o/add_mod_384-x86_64.s +++ /dev/null @@ -1,1899 +0,0 @@ -.text - -.globl _add_mod_384 -.private_extern _add_mod_384 - -.p2align 5 -_add_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - call __add_mod_384 - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 
40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__add_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - -__add_mod_384_a_is_loaded: - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - adcq 16(%rdx),%r10 - movq %r8,%r14 - adcq 24(%rdx),%r11 - movq %r9,%r15 - adcq 32(%rdx),%r12 - movq %r10,%rax - adcq 40(%rdx),%r13 - movq %r11,%rbx - sbbq %rdx,%rdx - - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - movq %r12,%rbp - sbbq 16(%rcx),%r10 - sbbq 24(%rcx),%r11 - sbbq 32(%rcx),%r12 - movq %r13,%rsi - sbbq 40(%rcx),%r13 - sbbq $0,%rdx - - cmovcq %r14,%r8 - cmovcq %r15,%r9 - cmovcq %rax,%r10 - movq %r8,0(%rdi) - cmovcq %rbx,%r11 - movq %r9,8(%rdi) - cmovcq %rbp,%r12 - movq %r10,16(%rdi) - cmovcq %rsi,%r13 - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _add_mod_384x -.private_extern _add_mod_384x - -.p2align 5 -_add_mod_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $24,%rsp -.cfi_adjust_cfa_offset 24 - - - movq %rsi,0(%rsp) - movq %rdx,8(%rsp) - leaq 48(%rsi),%rsi - leaq 48(%rdx),%rdx - leaq 48(%rdi),%rdi - call __add_mod_384 - - movq 0(%rsp),%rsi - movq 8(%rsp),%rdx - leaq -48(%rdi),%rdi - call __add_mod_384 - - movq 24+0(%rsp),%r15 -.cfi_restore %r15 - movq 24+8(%rsp),%r14 -.cfi_restore %r14 - movq 24+16(%rsp),%r13 -.cfi_restore %r13 - movq 24+24(%rsp),%r12 -.cfi_restore %r12 - movq 24+32(%rsp),%rbx -.cfi_restore %rbx - movq 24+40(%rsp),%rbp -.cfi_restore %rbp - leaq 24+48(%rsp),%rsp -.cfi_adjust_cfa_offset -24-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.globl _rshift_mod_384 -.private_extern _rshift_mod_384 - -.p2align 5 -_rshift_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rdi -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - -L$oop_rshift_mod_384: - call __rshift_mod_384 - decl %edx - jnz L$oop_rshift_mod_384 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__rshift_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq $1,%rsi - movq 0(%rcx),%r14 - andq %r8,%rsi - movq 8(%rcx),%r15 - negq %rsi - movq 16(%rcx),%rax - andq %rsi,%r14 
- movq 24(%rcx),%rbx - andq %rsi,%r15 - movq 32(%rcx),%rbp - andq %rsi,%rax - andq %rsi,%rbx - andq %rsi,%rbp - andq 40(%rcx),%rsi - - addq %r8,%r14 - adcq %r9,%r15 - adcq %r10,%rax - adcq %r11,%rbx - adcq %r12,%rbp - adcq %r13,%rsi - sbbq %r13,%r13 - - shrq $1,%r14 - movq %r15,%r8 - shrq $1,%r15 - movq %rax,%r9 - shrq $1,%rax - movq %rbx,%r10 - shrq $1,%rbx - movq %rbp,%r11 - shrq $1,%rbp - movq %rsi,%r12 - shrq $1,%rsi - shlq $63,%r8 - shlq $63,%r9 - orq %r14,%r8 - shlq $63,%r10 - orq %r15,%r9 - shlq $63,%r11 - orq %rax,%r10 - shlq $63,%r12 - orq %rbx,%r11 - shlq $63,%r13 - orq %rbp,%r12 - orq %rsi,%r13 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _div_by_2_mod_384 -.private_extern _div_by_2_mod_384 - -.p2align 5 -_div_by_2_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rdi -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq %rdx,%rcx - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - call __rshift_mod_384 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.globl _lshift_mod_384 -.private_extern _lshift_mod_384 - -.p2align 5 -_lshift_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rdi -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - -L$oop_lshift_mod_384: - addq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - movq %r8,%r14 - adcq %r11,%r11 - movq %r9,%r15 - adcq %r12,%r12 - movq %r10,%rax - adcq %r13,%r13 - movq %r11,%rbx - sbbq %rdi,%rdi - - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - movq %r12,%rbp - sbbq 16(%rcx),%r10 - sbbq 24(%rcx),%r11 - sbbq 32(%rcx),%r12 - movq %r13,%rsi - sbbq 40(%rcx),%r13 - sbbq $0,%rdi - - movq (%rsp),%rdi - cmovcq %r14,%r8 - cmovcq %r15,%r9 - cmovcq %rax,%r10 - cmovcq %rbx,%r11 - cmovcq %rbp,%r12 - cmovcq %rsi,%r13 - - decl %edx - jnz L$oop_lshift_mod_384 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__lshift_mod_384: -.cfi_startproc - .byte 
0xf3,0x0f,0x1e,0xfa - - addq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - movq %r8,%r14 - adcq %r11,%r11 - movq %r9,%r15 - adcq %r12,%r12 - movq %r10,%rax - adcq %r13,%r13 - movq %r11,%rbx - sbbq %rdx,%rdx - - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - movq %r12,%rbp - sbbq 16(%rcx),%r10 - sbbq 24(%rcx),%r11 - sbbq 32(%rcx),%r12 - movq %r13,%rsi - sbbq 40(%rcx),%r13 - sbbq $0,%rdx - - cmovcq %r14,%r8 - cmovcq %r15,%r9 - cmovcq %rax,%r10 - cmovcq %rbx,%r11 - cmovcq %rbp,%r12 - cmovcq %rsi,%r13 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.globl _mul_by_3_mod_384 -.private_extern _mul_by_3_mod_384 - -.p2align 5 -_mul_by_3_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rsi -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq %rdx,%rcx - - call __lshift_mod_384 - - movq (%rsp),%rdx - call __add_mod_384_a_is_loaded - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _mul_by_8_mod_384 -.private_extern _mul_by_8_mod_384 - -.p2align 5 -_mul_by_8_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq %rdx,%rcx - - call __lshift_mod_384 - call __lshift_mod_384 - call __lshift_mod_384 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.globl _mul_by_3_mod_384x -.private_extern _mul_by_3_mod_384x - -.p2align 5 -_mul_by_3_mod_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rsi -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq %rdx,%rcx - - call 
__lshift_mod_384 - - movq (%rsp),%rdx - call __add_mod_384_a_is_loaded - - movq (%rsp),%rsi - leaq 48(%rdi),%rdi - - movq 48(%rsi),%r8 - movq 56(%rsi),%r9 - movq 64(%rsi),%r10 - movq 72(%rsi),%r11 - movq 80(%rsi),%r12 - movq 88(%rsi),%r13 - - call __lshift_mod_384 - - movq $48,%rdx - addq (%rsp),%rdx - call __add_mod_384_a_is_loaded - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _mul_by_8_mod_384x -.private_extern _mul_by_8_mod_384x - -.p2align 5 -_mul_by_8_mod_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rsi -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq %rdx,%rcx - - call __lshift_mod_384 - call __lshift_mod_384 - call __lshift_mod_384 - - movq (%rsp),%rsi - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - movq 48+0(%rsi),%r8 - movq 48+8(%rsi),%r9 - movq 48+16(%rsi),%r10 - movq 48+24(%rsi),%r11 - movq 48+32(%rsi),%r12 - movq 48+40(%rsi),%r13 - - call __lshift_mod_384 - call __lshift_mod_384 - call __lshift_mod_384 - - movq %r8,48+0(%rdi) - movq %r9,48+8(%rdi) - movq %r10,48+16(%rdi) - movq %r11,48+24(%rdi) - movq %r12,48+32(%rdi) - movq %r13,48+40(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.globl _cneg_mod_384 -.private_extern _cneg_mod_384 - -.p2align 5 -_cneg_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rdx -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%rdx - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq %rdx,%r8 - movq 24(%rsi),%r11 - orq %r9,%rdx - movq 32(%rsi),%r12 - orq %r10,%rdx - movq 40(%rsi),%r13 - orq %r11,%rdx - movq $-1,%rsi - orq %r12,%rdx - orq %r13,%rdx - - movq 0(%rcx),%r14 - cmovnzq %rsi,%rdx - movq 8(%rcx),%r15 - movq 16(%rcx),%rax - andq %rdx,%r14 - movq 24(%rcx),%rbx - andq %rdx,%r15 - movq 32(%rcx),%rbp - andq %rdx,%rax - movq 40(%rcx),%rsi - andq %rdx,%rbx - movq 0(%rsp),%rcx - andq %rdx,%rbp - andq %rdx,%rsi - - subq %r8,%r14 - sbbq %r9,%r15 - sbbq %r10,%rax - sbbq %r11,%rbx - sbbq %r12,%rbp - sbbq %r13,%rsi - - orq %rcx,%rcx - - cmovzq %r8,%r14 - cmovzq %r9,%r15 - cmovzq %r10,%rax - movq %r14,0(%rdi) - cmovzq %r11,%rbx - movq %r15,8(%rdi) - cmovzq 
%r12,%rbp - movq %rax,16(%rdi) - cmovzq %r13,%rsi - movq %rbx,24(%rdi) - movq %rbp,32(%rdi) - movq %rsi,40(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.globl _sub_mod_384 -.private_extern _sub_mod_384 - -.p2align 5 -_sub_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - call __sub_mod_384 - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__sub_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - subq 0(%rdx),%r8 - movq 0(%rcx),%r14 - sbbq 8(%rdx),%r9 - movq 8(%rcx),%r15 - sbbq 16(%rdx),%r10 - movq 16(%rcx),%rax - sbbq 24(%rdx),%r11 - movq 24(%rcx),%rbx - sbbq 32(%rdx),%r12 - movq 32(%rcx),%rbp - sbbq 40(%rdx),%r13 - movq 40(%rcx),%rsi - sbbq %rdx,%rdx - - andq %rdx,%r14 - andq %rdx,%r15 - andq %rdx,%rax - andq %rdx,%rbx - andq %rdx,%rbp - andq %rdx,%rsi - - addq %r14,%r8 - adcq %r15,%r9 - movq %r8,0(%rdi) - adcq %rax,%r10 - movq %r9,8(%rdi) - adcq %rbx,%r11 - movq %r10,16(%rdi) - adcq %rbp,%r12 - movq %r11,24(%rdi) - adcq %rsi,%r13 - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _sub_mod_384x -.private_extern _sub_mod_384x - -.p2align 5 -_sub_mod_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $24,%rsp -.cfi_adjust_cfa_offset 24 - - - movq %rsi,0(%rsp) - movq %rdx,8(%rsp) - leaq 48(%rsi),%rsi - leaq 48(%rdx),%rdx - leaq 48(%rdi),%rdi - call __sub_mod_384 - - movq 0(%rsp),%rsi - movq 8(%rsp),%rdx - leaq -48(%rdi),%rdi - call __sub_mod_384 - - movq 24+0(%rsp),%r15 -.cfi_restore %r15 - movq 24+8(%rsp),%r14 -.cfi_restore %r14 - movq 24+16(%rsp),%r13 -.cfi_restore %r13 - movq 24+24(%rsp),%r12 -.cfi_restore %r12 - movq 24+32(%rsp),%rbx -.cfi_restore %rbx - movq 24+40(%rsp),%rbp -.cfi_restore %rbp - leaq 24+48(%rsp),%rsp -.cfi_adjust_cfa_offset -24-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _mul_by_1_plus_i_mod_384x -.private_extern _mul_by_1_plus_i_mod_384x - -.p2align 5 -_mul_by_1_plus_i_mod_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - 
pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $56,%rsp -.cfi_adjust_cfa_offset 56 - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %r8,%r14 - addq 48(%rsi),%r8 - movq %r9,%r15 - adcq 56(%rsi),%r9 - movq %r10,%rax - adcq 64(%rsi),%r10 - movq %r11,%rbx - adcq 72(%rsi),%r11 - movq %r12,%rcx - adcq 80(%rsi),%r12 - movq %r13,%rbp - adcq 88(%rsi),%r13 - movq %rdi,48(%rsp) - sbbq %rdi,%rdi - - subq 48(%rsi),%r14 - sbbq 56(%rsi),%r15 - sbbq 64(%rsi),%rax - sbbq 72(%rsi),%rbx - sbbq 80(%rsi),%rcx - sbbq 88(%rsi),%rbp - sbbq %rsi,%rsi - - movq %r8,0(%rsp) - movq 0(%rdx),%r8 - movq %r9,8(%rsp) - movq 8(%rdx),%r9 - movq %r10,16(%rsp) - movq 16(%rdx),%r10 - movq %r11,24(%rsp) - movq 24(%rdx),%r11 - movq %r12,32(%rsp) - andq %rsi,%r8 - movq 32(%rdx),%r12 - movq %r13,40(%rsp) - andq %rsi,%r9 - movq 40(%rdx),%r13 - andq %rsi,%r10 - andq %rsi,%r11 - andq %rsi,%r12 - andq %rsi,%r13 - movq 48(%rsp),%rsi - - addq %r8,%r14 - movq 0(%rsp),%r8 - adcq %r9,%r15 - movq 8(%rsp),%r9 - adcq %r10,%rax - movq 16(%rsp),%r10 - adcq %r11,%rbx - movq 24(%rsp),%r11 - adcq %r12,%rcx - movq 32(%rsp),%r12 - adcq %r13,%rbp - movq 40(%rsp),%r13 - - movq %r14,0(%rsi) - movq %r8,%r14 - movq %r15,8(%rsi) - movq %rax,16(%rsi) - movq %r9,%r15 - movq %rbx,24(%rsi) - movq %rcx,32(%rsi) - movq %r10,%rax - movq %rbp,40(%rsi) - - subq 0(%rdx),%r8 - movq %r11,%rbx - sbbq 8(%rdx),%r9 - sbbq 16(%rdx),%r10 - movq %r12,%rcx - sbbq 24(%rdx),%r11 - sbbq 32(%rdx),%r12 - movq %r13,%rbp - sbbq 40(%rdx),%r13 - sbbq $0,%rdi - - cmovcq %r14,%r8 - cmovcq %r15,%r9 - cmovcq %rax,%r10 - movq %r8,48(%rsi) - cmovcq %rbx,%r11 - movq %r9,56(%rsi) - cmovcq %rcx,%r12 - movq %r10,64(%rsi) - cmovcq %rbp,%r13 - movq %r11,72(%rsi) - movq %r12,80(%rsi) - movq %r13,88(%rsi) - - movq 56+0(%rsp),%r15 -.cfi_restore %r15 - movq 56+8(%rsp),%r14 -.cfi_restore %r14 - movq 56+16(%rsp),%r13 -.cfi_restore %r13 - movq 56+24(%rsp),%r12 -.cfi_restore %r12 - movq 56+32(%rsp),%rbx -.cfi_restore %rbx - movq 56+40(%rsp),%rbp -.cfi_restore %rbp - leaq 56+48(%rsp),%rsp -.cfi_adjust_cfa_offset -56-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _sgn0_pty_mod_384 -.private_extern _sgn0_pty_mod_384 - -.p2align 5 -_sgn0_pty_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%rcx - movq 40(%rdi),%rdx - - xorq %rax,%rax - movq %r8,%rdi - addq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq %rcx,%rcx - adcq %rdx,%rdx - adcq $0,%rax - - subq 0(%rsi),%r8 - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - sbbq 24(%rsi),%r11 - sbbq 32(%rsi),%rcx - sbbq 40(%rsi),%rdx - sbbq $0,%rax - - notq %rax - andq $1,%rdi - andq $2,%rax - orq %rdi,%rax - - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _sgn0_pty_mod_384x -.private_extern _sgn0_pty_mod_384x - -.p2align 5 -_sgn0_pty_mod_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq 48(%rdi),%r8 - movq 56(%rdi),%r9 - movq 64(%rdi),%r10 - movq 72(%rdi),%r11 - movq 80(%rdi),%rcx - movq 88(%rdi),%rdx - - movq %r8,%rbx - orq %r9,%r8 - orq %r10,%r8 - orq %r11,%r8 - orq %rcx,%r8 - orq %rdx,%r8 - - 
leaq 0(%rdi),%rax - xorq %rdi,%rdi - movq %rbx,%rbp - addq %rbx,%rbx - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq %rcx,%rcx - adcq %rdx,%rdx - adcq $0,%rdi - - subq 0(%rsi),%rbx - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - sbbq 24(%rsi),%r11 - sbbq 32(%rsi),%rcx - sbbq 40(%rsi),%rdx - sbbq $0,%rdi - - movq %r8,0(%rsp) - notq %rdi - andq $1,%rbp - andq $2,%rdi - orq %rbp,%rdi - - movq 0(%rax),%r8 - movq 8(%rax),%r9 - movq 16(%rax),%r10 - movq 24(%rax),%r11 - movq 32(%rax),%rcx - movq 40(%rax),%rdx - - movq %r8,%rbx - orq %r9,%r8 - orq %r10,%r8 - orq %r11,%r8 - orq %rcx,%r8 - orq %rdx,%r8 - - xorq %rax,%rax - movq %rbx,%rbp - addq %rbx,%rbx - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq %rcx,%rcx - adcq %rdx,%rdx - adcq $0,%rax - - subq 0(%rsi),%rbx - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - sbbq 24(%rsi),%r11 - sbbq 32(%rsi),%rcx - sbbq 40(%rsi),%rdx - sbbq $0,%rax - - movq 0(%rsp),%rbx - - notq %rax - - testq %r8,%r8 - cmovzq %rdi,%rbp - - testq %rbx,%rbx - cmovnzq %rdi,%rax - - andq $1,%rbp - andq $2,%rax - orq %rbp,%rax - - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _vec_select_32 -.private_extern _vec_select_32 - -.p2align 5 -_vec_select_32: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movd %ecx,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rsi),%xmm0 - leaq 16(%rsi),%rsi - pcmpeqd %xmm4,%xmm5 - movdqu (%rdx),%xmm1 - leaq 16(%rdx),%rdx - pcmpeqd %xmm5,%xmm4 - leaq 16(%rdi),%rdi - pand %xmm4,%xmm0 - movdqu 0+16-16(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-16(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-16(%rdi) - pand %xmm4,%xmm2 - pand %xmm5,%xmm3 - por %xmm3,%xmm2 - movdqu %xmm2,16-16(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _vec_select_48 -.private_extern _vec_select_48 - -.p2align 5 -_vec_select_48: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movd %ecx,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rsi),%xmm0 - leaq 24(%rsi),%rsi - pcmpeqd %xmm4,%xmm5 - movdqu (%rdx),%xmm1 - leaq 24(%rdx),%rdx - pcmpeqd %xmm5,%xmm4 - leaq 24(%rdi),%rdi - pand %xmm4,%xmm0 - movdqu 0+16-24(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-24(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-24(%rdi) - pand %xmm4,%xmm2 - movdqu 16+16-24(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 16+16-24(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,16-24(%rdi) - pand %xmm4,%xmm0 - pand %xmm5,%xmm1 - por %xmm1,%xmm0 - movdqu %xmm0,32-24(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _vec_select_96 -.private_extern _vec_select_96 - -.p2align 5 -_vec_select_96: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movd %ecx,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rsi),%xmm0 - leaq 48(%rsi),%rsi - pcmpeqd %xmm4,%xmm5 - movdqu (%rdx),%xmm1 - leaq 48(%rdx),%rdx - pcmpeqd %xmm5,%xmm4 - leaq 48(%rdi),%rdi - pand %xmm4,%xmm0 - movdqu 0+16-48(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-48(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-48(%rdi) - pand %xmm4,%xmm2 - movdqu 16+16-48(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 16+16-48(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,16-48(%rdi) - pand %xmm4,%xmm0 - movdqu 32+16-48(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 32+16-48(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,32-48(%rdi) - pand %xmm4,%xmm2 - movdqu 48+16-48(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 48+16-48(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,48-48(%rdi) - pand %xmm4,%xmm0 - movdqu 64+16-48(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 
64+16-48(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,64-48(%rdi) - pand %xmm4,%xmm2 - pand %xmm5,%xmm3 - por %xmm3,%xmm2 - movdqu %xmm2,80-48(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _vec_select_192 -.private_extern _vec_select_192 - -.p2align 5 -_vec_select_192: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movd %ecx,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rsi),%xmm0 - leaq 96(%rsi),%rsi - pcmpeqd %xmm4,%xmm5 - movdqu (%rdx),%xmm1 - leaq 96(%rdx),%rdx - pcmpeqd %xmm5,%xmm4 - leaq 96(%rdi),%rdi - pand %xmm4,%xmm0 - movdqu 0+16-96(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-96(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-96(%rdi) - pand %xmm4,%xmm2 - movdqu 16+16-96(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 16+16-96(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,16-96(%rdi) - pand %xmm4,%xmm0 - movdqu 32+16-96(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 32+16-96(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,32-96(%rdi) - pand %xmm4,%xmm2 - movdqu 48+16-96(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 48+16-96(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,48-96(%rdi) - pand %xmm4,%xmm0 - movdqu 64+16-96(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 64+16-96(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,64-96(%rdi) - pand %xmm4,%xmm2 - movdqu 80+16-96(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 80+16-96(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,80-96(%rdi) - pand %xmm4,%xmm0 - movdqu 96+16-96(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 96+16-96(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,96-96(%rdi) - pand %xmm4,%xmm2 - movdqu 112+16-96(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 112+16-96(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,112-96(%rdi) - pand %xmm4,%xmm0 - movdqu 128+16-96(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 128+16-96(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,128-96(%rdi) - pand %xmm4,%xmm2 - movdqu 144+16-96(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 144+16-96(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,144-96(%rdi) - pand %xmm4,%xmm0 - movdqu 160+16-96(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 160+16-96(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,160-96(%rdi) - pand %xmm4,%xmm2 - pand %xmm5,%xmm3 - por %xmm3,%xmm2 - movdqu %xmm2,176-96(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _vec_select_144 -.private_extern _vec_select_144 - -.p2align 5 -_vec_select_144: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movd %ecx,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rsi),%xmm0 - leaq 72(%rsi),%rsi - pcmpeqd %xmm4,%xmm5 - movdqu (%rdx),%xmm1 - leaq 72(%rdx),%rdx - pcmpeqd %xmm5,%xmm4 - leaq 72(%rdi),%rdi - pand %xmm4,%xmm0 - movdqu 0+16-72(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-72(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-72(%rdi) - pand %xmm4,%xmm2 - movdqu 16+16-72(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 16+16-72(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,16-72(%rdi) - pand %xmm4,%xmm0 - movdqu 32+16-72(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 32+16-72(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,32-72(%rdi) - pand %xmm4,%xmm2 - movdqu 48+16-72(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 48+16-72(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,48-72(%rdi) - pand %xmm4,%xmm0 - movdqu 64+16-72(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 64+16-72(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,64-72(%rdi) - pand %xmm4,%xmm2 - movdqu 80+16-72(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 80+16-72(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,80-72(%rdi) - pand %xmm4,%xmm0 - movdqu 96+16-72(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 96+16-72(%rdx),%xmm3 - por %xmm1,%xmm0 - 
movdqu %xmm0,96-72(%rdi) - pand %xmm4,%xmm2 - movdqu 112+16-72(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 112+16-72(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,112-72(%rdi) - pand %xmm4,%xmm0 - pand %xmm5,%xmm1 - por %xmm1,%xmm0 - movdqu %xmm0,128-72(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _vec_select_288 -.private_extern _vec_select_288 - -.p2align 5 -_vec_select_288: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movd %ecx,%xmm5 - pxor %xmm4,%xmm4 - pshufd $0,%xmm5,%xmm5 - movdqu (%rsi),%xmm0 - leaq 144(%rsi),%rsi - pcmpeqd %xmm4,%xmm5 - movdqu (%rdx),%xmm1 - leaq 144(%rdx),%rdx - pcmpeqd %xmm5,%xmm4 - leaq 144(%rdi),%rdi - pand %xmm4,%xmm0 - movdqu 0+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 0+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,0-144(%rdi) - pand %xmm4,%xmm2 - movdqu 16+16-144(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 16+16-144(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,16-144(%rdi) - pand %xmm4,%xmm0 - movdqu 32+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 32+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,32-144(%rdi) - pand %xmm4,%xmm2 - movdqu 48+16-144(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 48+16-144(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,48-144(%rdi) - pand %xmm4,%xmm0 - movdqu 64+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 64+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,64-144(%rdi) - pand %xmm4,%xmm2 - movdqu 80+16-144(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 80+16-144(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,80-144(%rdi) - pand %xmm4,%xmm0 - movdqu 96+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 96+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,96-144(%rdi) - pand %xmm4,%xmm2 - movdqu 112+16-144(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 112+16-144(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,112-144(%rdi) - pand %xmm4,%xmm0 - movdqu 128+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 128+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,128-144(%rdi) - pand %xmm4,%xmm2 - movdqu 144+16-144(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 144+16-144(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,144-144(%rdi) - pand %xmm4,%xmm0 - movdqu 160+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 160+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,160-144(%rdi) - pand %xmm4,%xmm2 - movdqu 176+16-144(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 176+16-144(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,176-144(%rdi) - pand %xmm4,%xmm0 - movdqu 192+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 192+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,192-144(%rdi) - pand %xmm4,%xmm2 - movdqu 208+16-144(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 208+16-144(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,208-144(%rdi) - pand %xmm4,%xmm0 - movdqu 224+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 224+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,224-144(%rdi) - pand %xmm4,%xmm2 - movdqu 240+16-144(%rsi),%xmm0 - pand %xmm5,%xmm3 - movdqu 240+16-144(%rdx),%xmm1 - por %xmm3,%xmm2 - movdqu %xmm2,240-144(%rdi) - pand %xmm4,%xmm0 - movdqu 256+16-144(%rsi),%xmm2 - pand %xmm5,%xmm1 - movdqu 256+16-144(%rdx),%xmm3 - por %xmm1,%xmm0 - movdqu %xmm0,256-144(%rdi) - pand %xmm4,%xmm2 - pand %xmm5,%xmm3 - por %xmm3,%xmm2 - movdqu %xmm2,272-144(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _vec_prefetch -.private_extern _vec_prefetch - -.p2align 5 -_vec_prefetch: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - leaq -1(%rdi,%rsi,1),%rsi - movq $64,%rax - xorq %r8,%r8 - prefetchnta (%rdi) - leaq (%rdi,%rax,1),%rdi - cmpq %rsi,%rdi - cmovaq %rsi,%rdi - cmovaq 
%r8,%rax - prefetchnta (%rdi) - leaq (%rdi,%rax,1),%rdi - cmpq %rsi,%rdi - cmovaq %rsi,%rdi - cmovaq %r8,%rax - prefetchnta (%rdi) - leaq (%rdi,%rax,1),%rdi - cmpq %rsi,%rdi - cmovaq %rsi,%rdi - cmovaq %r8,%rax - prefetchnta (%rdi) - leaq (%rdi,%rax,1),%rdi - cmpq %rsi,%rdi - cmovaq %rsi,%rdi - cmovaq %r8,%rax - prefetchnta (%rdi) - leaq (%rdi,%rax,1),%rdi - cmpq %rsi,%rdi - cmovaq %rsi,%rdi - cmovaq %r8,%rax - prefetchnta (%rdi) - leaq (%rdi,%rax,1),%rdi - cmpq %rsi,%rdi - cmovaq %rsi,%rdi - prefetchnta (%rdi) - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _vec_is_zero_16x -.private_extern _vec_is_zero_16x - -.p2align 5 -_vec_is_zero_16x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - shrl $4,%esi - movdqu (%rdi),%xmm0 - leaq 16(%rdi),%rdi - -L$oop_is_zero: - decl %esi - jz L$oop_is_zero_done - movdqu (%rdi),%xmm1 - leaq 16(%rdi),%rdi - por %xmm1,%xmm0 - jmp L$oop_is_zero - -L$oop_is_zero_done: - pshufd $0x4e,%xmm0,%xmm1 - por %xmm1,%xmm0 -.byte 102,72,15,126,192 - incl %esi - testq %rax,%rax - cmovnzl %esi,%eax - xorl $1,%eax - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _vec_is_equal_16x -.private_extern _vec_is_equal_16x - -.p2align 5 -_vec_is_equal_16x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - shrl $4,%edx - movdqu (%rdi),%xmm0 - movdqu (%rsi),%xmm1 - subq %rdi,%rsi - leaq 16(%rdi),%rdi - pxor %xmm1,%xmm0 - -L$oop_is_equal: - decl %edx - jz L$oop_is_equal_done - movdqu (%rdi),%xmm1 - movdqu (%rdi,%rsi,1),%xmm2 - leaq 16(%rdi),%rdi - pxor %xmm2,%xmm1 - por %xmm1,%xmm0 - jmp L$oop_is_equal - -L$oop_is_equal_done: - pshufd $0x4e,%xmm0,%xmm1 - por %xmm1,%xmm0 -.byte 102,72,15,126,192 - incl %edx - testq %rax,%rax - cmovnzl %edx,%eax - xorl $1,%eax - .byte 0xf3,0xc3 -.cfi_endproc - diff --git a/crypto/blst_src/build/mach-o/add_mod_384x384-x86_64.s b/crypto/blst_src/build/mach-o/add_mod_384x384-x86_64.s deleted file mode 100644 index 2dc58f81608..00000000000 --- a/crypto/blst_src/build/mach-o/add_mod_384x384-x86_64.s +++ /dev/null @@ -1,244 +0,0 @@ -.text - - -.p2align 5 -__add_mod_384x384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq 48(%rsi),%r14 - - addq 0(%rdx),%r8 - movq 56(%rsi),%r15 - adcq 8(%rdx),%r9 - movq 64(%rsi),%rax - adcq 16(%rdx),%r10 - movq 72(%rsi),%rbx - adcq 24(%rdx),%r11 - movq 80(%rsi),%rbp - adcq 32(%rdx),%r12 - movq 88(%rsi),%rsi - adcq 40(%rdx),%r13 - movq %r8,0(%rdi) - adcq 48(%rdx),%r14 - movq %r9,8(%rdi) - adcq 56(%rdx),%r15 - movq %r10,16(%rdi) - adcq 64(%rdx),%rax - movq %r12,32(%rdi) - movq %r14,%r8 - adcq 72(%rdx),%rbx - movq %r11,24(%rdi) - movq %r15,%r9 - adcq 80(%rdx),%rbp - movq %r13,40(%rdi) - movq %rax,%r10 - adcq 88(%rdx),%rsi - movq %rbx,%r11 - sbbq %rdx,%rdx - - subq 0(%rcx),%r14 - sbbq 8(%rcx),%r15 - movq %rbp,%r12 - sbbq 16(%rcx),%rax - sbbq 24(%rcx),%rbx - sbbq 32(%rcx),%rbp - movq %rsi,%r13 - sbbq 40(%rcx),%rsi - sbbq $0,%rdx - - cmovcq %r8,%r14 - cmovcq %r9,%r15 - cmovcq %r10,%rax - movq %r14,48(%rdi) - cmovcq %r11,%rbx - movq %r15,56(%rdi) - cmovcq %r12,%rbp - movq %rax,64(%rdi) - cmovcq %r13,%rsi - movq %rbx,72(%rdi) - movq %rbp,80(%rdi) - movq %rsi,88(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__sub_mod_384x384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq 48(%rsi),%r14 - - subq 0(%rdx),%r8 - movq 56(%rsi),%r15 - sbbq 8(%rdx),%r9 - movq 64(%rsi),%rax - sbbq 16(%rdx),%r10 - 
movq 72(%rsi),%rbx - sbbq 24(%rdx),%r11 - movq 80(%rsi),%rbp - sbbq 32(%rdx),%r12 - movq 88(%rsi),%rsi - sbbq 40(%rdx),%r13 - movq %r8,0(%rdi) - sbbq 48(%rdx),%r14 - movq 0(%rcx),%r8 - movq %r9,8(%rdi) - sbbq 56(%rdx),%r15 - movq 8(%rcx),%r9 - movq %r10,16(%rdi) - sbbq 64(%rdx),%rax - movq 16(%rcx),%r10 - movq %r11,24(%rdi) - sbbq 72(%rdx),%rbx - movq 24(%rcx),%r11 - movq %r12,32(%rdi) - sbbq 80(%rdx),%rbp - movq 32(%rcx),%r12 - movq %r13,40(%rdi) - sbbq 88(%rdx),%rsi - movq 40(%rcx),%r13 - sbbq %rdx,%rdx - - andq %rdx,%r8 - andq %rdx,%r9 - andq %rdx,%r10 - andq %rdx,%r11 - andq %rdx,%r12 - andq %rdx,%r13 - - addq %r8,%r14 - adcq %r9,%r15 - movq %r14,48(%rdi) - adcq %r10,%rax - movq %r15,56(%rdi) - adcq %r11,%rbx - movq %rax,64(%rdi) - adcq %r12,%rbp - movq %rbx,72(%rdi) - adcq %r13,%rsi - movq %rbp,80(%rdi) - movq %rsi,88(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _add_mod_384x384 -.private_extern _add_mod_384x384 - -.p2align 5 -_add_mod_384x384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - call __add_mod_384x384 - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _sub_mod_384x384 -.private_extern _sub_mod_384x384 - -.p2align 5 -_sub_mod_384x384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - call __sub_mod_384x384 - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S deleted file mode 100644 index 2fd4847a496..00000000000 --- a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S +++ /dev/null @@ -1,785 +0,0 @@ -.text - -.globl _ct_inverse_mod_256 -.private_extern _ct_inverse_mod_256 - -.align 5 -_ct_inverse_mod_256: -.long 3573752639 - stp x29, x30, [sp,#-80]! - add x29, sp, #0 - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - sub sp, sp, #1040 - - ldp x4, x5, [x1,#8*0] - ldp x6, x7, [x1,#8*2] - - add x1, sp, #16+511 // find closest 512-byte-aligned spot - and x1, x1, #-512 // in the frame... 
- str x0, [sp] - - ldp x8, x9, [x2,#8*0] - ldp x10, x11, [x2,#8*2] - - stp x4, x5, [x1,#8*0] // copy input to |a| - stp x6, x7, [x1,#8*2] - stp x8, x9, [x1,#8*4] // copy modulus to |b| - stp x10, x11, [x1,#8*6] - - ////////////////////////////////////////// first iteration - bl Lab_approximation_31_256_loaded - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - str x12,[x0,#8*8] // initialize |u| with |f0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to dst |b| - bl __smul_256_n_shift_by_31 - str x12, [x0,#8*9] // initialize |v| with |f1| - - ////////////////////////////////////////// second iteration - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - ldr x8, [x1,#8*8] // |u| - ldr x9, [x1,#8*13] // |v| - madd x4, x16, x8, xzr // |u|*|f0| - madd x4, x17, x9, x4 // |v|*|g0| - str x4, [x0,#8*4] - asr x5, x4, #63 // sign extension - stp x5, x5, [x0,#8*5] - stp x5, x5, [x0,#8*7] - - madd x4, x12, x8, xzr // |u|*|f1| - madd x4, x13, x9, x4 // |v|*|g1| - str x4, [x0,#8*9] - asr x5, x4, #63 // sign extension - stp x5, x5, [x0,#8*10] - stp x5, x5, [x0,#8*12] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst 
|a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - adc x22, x22, x23 - stp x22, x22, [x0,#8*4] - stp x22, x22, [x0,#8*6] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc 
x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - bl __ab_approximation_31_256 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_256_n_shift_by_31 - mov x16, x12 // corrected |f0| - mov x17, x13 // corrected |g0| - - mov x12, x14 // |f1| - mov x13, x15 // |g1| - add x0, x0, #8*4 // pointer to destination |b| - bl __smul_256_n_shift_by_31 - - add x0, x0, #8*4 // pointer to destination |u| - bl __smul_256x63 - adc x22, x22, x23 - str x22, [x0,#8*4] - - mov x16, x12 // corrected |f1| - mov x17, x13 // corrected |g1| - add x0, x0, #8*5 // pointer to destination |v| - bl __smul_256x63 - bl __smul_512x63_tail - ////////////////////////////////////////// two[!] 
last iterations - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #47 // 31 + 512 % 31 - //bl __ab_approximation_62_256 // |a| and |b| are exact, - ldr x7, [x1,#8*0] // just load - ldr x11, [x1,#8*4] - bl __inner_loop_62_256 - - mov x16, x14 - mov x17, x15 - ldr x0, [sp] // original out_ptr - bl __smul_256x63 - bl __smul_512x63_tail - ldr x30, [x29,#8] - - smulh x20, x7, x17 // figure out top-most limb - ldp x8, x9, [x3,#8*0] - adc x23, x23, x25 - ldp x10, x11, [x3,#8*2] - - add x20, x20, x23 // x20 is 1, 0 or -1 - asr x19, x20, #63 // sign as mask - - and x23, x8, x19 // add mod<<256 conditionally - and x24, x9, x19 - adds x4, x4, x23 - and x25, x10, x19 - adcs x5, x5, x24 - and x26, x11, x19 - adcs x6, x6, x25 - adcs x7, x22, x26 - adc x20, x20, xzr // x20 is 1, 0 or -1 - - neg x19, x20 - orr x20, x20, x19 // excess bit or sign as mask - asr x19, x19, #63 // excess bit as mask - - and x8, x8, x20 // mask |mod| - and x9, x9, x20 - and x10, x10, x20 - and x11, x11, x20 - - eor x8, x8, x19 // conditionally negate |mod| - eor x9, x9, x19 - adds x8, x8, x19, lsr#63 - eor x10, x10, x19 - adcs x9, x9, xzr - eor x11, x11, x19 - adcs x10, x10, xzr - adc x11, x11, xzr - - adds x4, x4, x8 // final adjustment for |mod|<<256 - adcs x5, x5, x9 - adcs x6, x6, x10 - stp x4, x5, [x0,#8*4] - adc x7, x7, x11 - stp x6, x7, [x0,#8*6] - - add sp, sp, #1040 - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldr x29, [sp],#80 -.long 3573752767 - ret - - -//////////////////////////////////////////////////////////////////////// - -.align 5 -__smul_256x63: - ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) - asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) - ldp x6, x7, [x1,#8*2+64] - eor x16, x16, x14 // conditionally negate |f_| (or |g_|) - ldr x22, [x1,#8*4+64] - - eor x4, x4, x14 // conditionally negate |u| (or |v|) - sub x16, x16, x14 - eor x5, x5, x14 - adds x4, x4, x14, lsr#63 - eor x6, x6, x14 - adcs x5, x5, xzr - eor x7, x7, x14 - adcs x6, x6, xzr - eor x22, x22, x14 - umulh x19, x4, x16 - adcs x7, x7, xzr - umulh x20, x5, x16 - adcs x22, x22, xzr - umulh x21, x6, x16 - mul x4, x4, x16 - cmp x16, #0 - mul x5, x5, x16 - csel x22, x22, xzr, ne - mul x6, x6, x16 - adds x5, x5, x19 - mul x24, x7, x16 - adcs x6, x6, x20 - adcs x24, x24, x21 - adc x26, xzr, xzr - ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) - asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) - ldp x10, x11, [x1,#8*2+104] - eor x17, x17, x14 // conditionally negate |f_| (or |g_|) - ldr x23, [x1,#8*4+104] - - eor x8, x8, x14 // conditionally negate |u| (or |v|) - sub x17, x17, x14 - eor x9, x9, x14 - adds x8, x8, x14, lsr#63 - eor x10, x10, x14 - adcs x9, x9, xzr - eor x11, x11, x14 - adcs x10, x10, xzr - eor x23, x23, x14 - umulh x19, x8, x17 - adcs x11, x11, xzr - umulh x20, x9, x17 - adcs x23, x23, xzr - umulh x21, x10, x17 - adc x15, xzr, xzr // used in __smul_512x63_tail - mul x8, x8, x17 - cmp x17, #0 - mul x9, x9, x17 - csel x23, x23, xzr, ne - mul x10, x10, x17 - adds x9, x9, x19 - mul x25, x11, x17 - adcs x10, x10, x20 - adcs x25, x25, x21 - adc x26, x26, xzr - - adds x4, x4, x8 - adcs x5, x5, x9 - adcs x6, x6, x10 - stp x4, x5, [x0,#8*0] - adcs x24, x24, x25 - stp x6, x24, [x0,#8*2] - - ret - - - -.align 5 -__smul_512x63_tail: - umulh x24, x7, x16 - ldp x5, x6, [x1,#8*18] // load rest of |v| - adc x26, x26, xzr - ldr x7, [x1,#8*20] - and x22, x22, x16 - - umulh x11, x11, x17 // resume |v|*|g1| chain - - sub x24, x24, x22 // tie up |u|*|f1| chain - asr x25, x24, #63 - - 
eor x5, x5, x14 // conditionally negate rest of |v| - eor x6, x6, x14 - adds x5, x5, x15 - eor x7, x7, x14 - adcs x6, x6, xzr - umulh x19, x23, x17 - adc x7, x7, xzr - umulh x20, x5, x17 - add x11, x11, x26 - umulh x21, x6, x17 - - mul x4, x23, x17 - mul x5, x5, x17 - adds x4, x4, x11 - mul x6, x6, x17 - adcs x5, x5, x19 - mul x22, x7, x17 - adcs x6, x6, x20 - adcs x22, x22, x21 - adc x23, xzr, xzr // used in the final step - - adds x4, x4, x24 - adcs x5, x5, x25 - adcs x6, x6, x25 - stp x4, x5, [x0,#8*4] - adcs x22, x22, x25 // carry is used in the final step - stp x6, x22, [x0,#8*6] - - ret - - - -.align 5 -__smul_256_n_shift_by_31: - ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) - asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) - ldp x6, x7, [x1,#8*2+0] - eor x25, x12, x24 // conditionally negate |f0| (or |g0|) - - eor x4, x4, x24 // conditionally negate |a| (or |b|) - sub x25, x25, x24 - eor x5, x5, x24 - adds x4, x4, x24, lsr#63 - eor x6, x6, x24 - adcs x5, x5, xzr - eor x7, x7, x24 - umulh x19, x4, x25 - adcs x6, x6, xzr - umulh x20, x5, x25 - adc x7, x7, xzr - umulh x21, x6, x25 - and x24, x24, x25 - umulh x22, x7, x25 - neg x24, x24 - - mul x4, x4, x25 - mul x5, x5, x25 - mul x6, x6, x25 - adds x5, x5, x19 - mul x7, x7, x25 - adcs x6, x6, x20 - adcs x7, x7, x21 - adc x22, x22, x24 - ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) - asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) - ldp x10, x11, [x1,#8*2+32] - eor x25, x13, x24 // conditionally negate |f0| (or |g0|) - - eor x8, x8, x24 // conditionally negate |a| (or |b|) - sub x25, x25, x24 - eor x9, x9, x24 - adds x8, x8, x24, lsr#63 - eor x10, x10, x24 - adcs x9, x9, xzr - eor x11, x11, x24 - umulh x19, x8, x25 - adcs x10, x10, xzr - umulh x20, x9, x25 - adc x11, x11, xzr - umulh x21, x10, x25 - and x24, x24, x25 - umulh x23, x11, x25 - neg x24, x24 - - mul x8, x8, x25 - mul x9, x9, x25 - mul x10, x10, x25 - adds x9, x9, x19 - mul x11, x11, x25 - adcs x10, x10, x20 - adcs x11, x11, x21 - adc x23, x23, x24 - adds x4, x4, x8 - adcs x5, x5, x9 - adcs x6, x6, x10 - adcs x7, x7, x11 - adc x8, x22, x23 - - extr x4, x5, x4, #31 - extr x5, x6, x5, #31 - extr x6, x7, x6, #31 - asr x23, x8, #63 // result's sign as mask - extr x7, x8, x7, #31 - - eor x4, x4, x23 // ensure the result is positive - eor x5, x5, x23 - adds x4, x4, x23, lsr#63 - eor x6, x6, x23 - adcs x5, x5, xzr - eor x7, x7, x23 - adcs x6, x6, xzr - stp x4, x5, [x0,#8*0] - adc x7, x7, xzr - stp x6, x7, [x0,#8*2] - - eor x12, x12, x23 // adjust |f/g| accordingly - eor x13, x13, x23 - sub x12, x12, x23 - sub x13, x13, x23 - - ret - - -.align 4 -__ab_approximation_31_256: - ldp x6, x7, [x1,#8*2] - ldp x10, x11, [x1,#8*6] - ldp x4, x5, [x1,#8*0] - ldp x8, x9, [x1,#8*4] - -Lab_approximation_31_256_loaded: - orr x19, x7, x11 // check top-most limbs, ... - cmp x19, #0 - csel x7, x7, x6, ne - csel x11, x11, x10, ne - csel x6, x6, x5, ne - orr x19, x7, x11 // and ones before top-most, ... - csel x10, x10, x9, ne - - cmp x19, #0 - csel x7, x7, x6, ne - csel x11, x11, x10, ne - csel x6, x6, x4, ne - orr x19, x7, x11 // and one more, ... 
- csel x10, x10, x8, ne - - clz x19, x19 - cmp x19, #64 - csel x19, x19, xzr, ne - csel x7, x7, x6, ne - csel x11, x11, x10, ne - neg x20, x19 - - lslv x7, x7, x19 // align high limbs to the left - lslv x11, x11, x19 - lsrv x6, x6, x20 - lsrv x10, x10, x20 - and x6, x6, x20, asr#6 - and x10, x10, x20, asr#6 - orr x7, x7, x6 - orr x11, x11, x10 - - bfxil x7, x4, #0, #31 - bfxil x11, x8, #0, #31 - - b __inner_loop_31_256 - ret - - - -.align 4 -__inner_loop_31_256: - mov x2, #31 - mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 - mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 - mov x23,#0x7FFFFFFF7FFFFFFF - -Loop_31_256: - sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting - sub x2, x2, #1 - and x19, x11, x22 - sub x20, x11, x7 // |b_|-|a_| - subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) - mov x19, x15 - csel x11, x11, x7, hs // |b_| = |a_| - csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel x15, x15, x13, hs // exchange |fg0| and |fg1| - csel x13, x13, x19, hs - lsr x7, x7, #1 - and x19, x15, x22 - and x20, x23, x22 - sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - add x15, x15, x15 // |f1|<<=1 - add x13, x13, x20 - sub x15, x15, x23 - cbnz x2, Loop_31_256 - - mov x23, #0x7FFFFFFF - ubfx x12, x13, #0, #32 - ubfx x13, x13, #32, #32 - ubfx x14, x15, #0, #32 - ubfx x15, x15, #32, #32 - sub x12, x12, x23 // remove bias - sub x13, x13, x23 - sub x14, x14, x23 - sub x15, x15, x23 - - ret - - - -.align 4 -__inner_loop_62_256: - mov x12, #1 // |f0|=1 - mov x13, #0 // |g0|=0 - mov x14, #0 // |f1|=0 - mov x15, #1 // |g1|=1 - -Loop_62_256: - sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting - sub x2, x2, #1 - and x19, x11, x22 - sub x20, x11, x7 // |b_|-|a_| - subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) - mov x19, x12 - csel x11, x11, x7, hs // |b_| = |a_| - csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - mov x20, x13 - csel x12, x12, x14, hs // exchange |f0| and |f1| - csel x14, x14, x19, hs - csel x13, x13, x15, hs // exchange |g0| and |g1| - csel x15, x15, x20, hs - lsr x7, x7, #1 - and x19, x14, x22 - and x20, x15, x22 - add x14, x14, x14 // |f1|<<=1 - add x15, x15, x15 // |g1|<<=1 - sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
- cbnz x2, Loop_62_256 - - ret - diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s deleted file mode 100644 index bf0ad8986e7..00000000000 --- a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s +++ /dev/null @@ -1,1178 +0,0 @@ -.text - -.globl _ct_inverse_mod_256 -.private_extern _ct_inverse_mod_256 - -.p2align 5 -_ct_inverse_mod_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $1072,%rsp -.cfi_adjust_cfa_offset 1072 - - - leaq 48+511(%rsp),%rax - andq $-512,%rax - movq %rdi,32(%rsp) - movq %rcx,40(%rsp) - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - - movq 0(%rdx),%r12 - movq 8(%rdx),%r13 - movq 16(%rdx),%r14 - movq 24(%rdx),%r15 - - movq %r8,0(%rax) - movq %r9,8(%rax) - movq %r10,16(%rax) - movq %r11,24(%rax) - - movq %r12,32(%rax) - movq %r13,40(%rax) - movq %r14,48(%rax) - movq %r15,56(%rax) - movq %rax,%rsi - - - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - - - movq %rdx,64(%rdi) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - - - movq %rdx,72(%rdi) - - - xorq $256,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - - - - movq 64(%rsi),%r8 - movq 104(%rsi),%r12 - movq %r8,%r9 - imulq 0(%rsp),%r8 - movq %r12,%r13 - imulq 8(%rsp),%r12 - addq %r12,%r8 - movq %r8,32(%rdi) - sarq $63,%r8 - movq %r8,40(%rdi) - movq %r8,48(%rdi) - movq %r8,56(%rdi) - movq %r8,64(%rdi) - leaq 64(%rsi),%rsi - - imulq %rdx,%r9 - imulq %rcx,%r13 - addq %r13,%r9 - movq %r9,72(%rdi) - sarq $63,%r9 - movq %r9,80(%rdi) - movq %r9,88(%rdi) - movq %r9,96(%rdi) - movq %r9,104(%rdi) - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl 
$31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_256x63 - sarq $63,%rbp - movq %rbp,40(%rdi) - movq %rbp,48(%rdi) - movq %rbp,56(%rdi) - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call 
__smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - xorq $256+64,%rsi - movl $31,%edx - call __ab_approximation_31_256 - - - movq %r12,16(%rsp) - movq %r13,24(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,0(%rsp) - movq %rcx,8(%rsp) - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 32(%rdi),%rdi - call __smulq_256_n_shift_by_31 - movq %rdx,16(%rsp) - movq %rcx,24(%rsp) - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq 64(%rsi),%rsi - leaq 32(%rdi),%rdi - call __smulq_256x63 - - movq 16(%rsp),%rdx - movq 24(%rsp),%rcx - leaq 40(%rdi),%rdi - call __smulq_512x63 - - xorq $256+64,%rsi - movl $47,%edx - - movq 0(%rsi),%r8 - - movq 32(%rsi),%r10 - - call __inner_loop_62_256 - - - - - - - - leaq 64(%rsi),%rsi - - - - - - movq %r12,%rdx - movq %r13,%rcx - movq 32(%rsp),%rdi - call __smulq_512x63 - adcq %rbp,%rdx - - movq 40(%rsp),%rsi - movq %rdx,%rax - sarq $63,%rdx - - movq %rdx,%r8 - movq %rdx,%r9 - andq 0(%rsi),%r8 - movq %rdx,%r10 - andq 8(%rsi),%r9 - andq 16(%rsi),%r10 - andq 24(%rsi),%rdx - - addq %r8,%r12 - adcq %r9,%r13 - adcq %r10,%r14 - adcq %rdx,%r15 - adcq $0,%rax - - movq %rax,%rdx - negq %rax - orq %rax,%rdx - sarq $63,%rax - - movq %rdx,%r8 - movq %rdx,%r9 - andq 0(%rsi),%r8 - movq %rdx,%r10 - andq 8(%rsi),%r9 - andq 16(%rsi),%r10 - andq 24(%rsi),%rdx - - xorq %rax,%r8 - xorq %rcx,%rcx - xorq %rax,%r9 - subq %rax,%rcx - xorq %rax,%r10 - xorq %rax,%rdx - addq %rcx,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%rdx - - addq %r8,%r12 - adcq %r9,%r13 - adcq %r10,%r14 
- adcq %rdx,%r15 - - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - leaq 1072(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -1072-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__smulq_512x63: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%rbp - - movq %rdx,%rbx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbx - addq %rax,%rbx - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%rbp - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%rbp - - mulq %rbx - movq %rax,0(%rdi) - movq %r9,%rax - movq %rdx,%r9 - mulq %rbx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %r9,8(%rdi) - movq %rdx,%r10 - mulq %rbx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %r10,16(%rdi) - movq %rdx,%r11 - andq %rbx,%rbp - negq %rbp - mulq %rbx - addq %rax,%r11 - adcq %rdx,%rbp - movq %r11,24(%rdi) - - movq 40(%rsi),%r8 - movq 48(%rsi),%r9 - movq 56(%rsi),%r10 - movq 64(%rsi),%r11 - movq 72(%rsi),%r12 - movq 80(%rsi),%r13 - movq 88(%rsi),%r14 - movq 96(%rsi),%r15 - - movq %rcx,%rdx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rcx - addq %rax,%rcx - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - xorq %rdx,%r14 - xorq %rdx,%r15 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - - mulq %rcx - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rcx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rcx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rcx - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rcx - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - mulq %rcx - addq %rax,%r13 - movq %r14,%rax - adcq $0,%rdx - movq %rdx,%r14 - mulq %rcx - addq %rax,%r14 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r15 - imulq %rcx - addq %rax,%r15 - adcq $0,%rdx - - movq %rbp,%rbx - sarq $63,%rbp - - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq %rbx,%r12 - adcq %rbp,%r13 - adcq %rbp,%r14 - adcq %rbp,%r15 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__smulq_256x63: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0+0(%rsi),%r8 - movq 0+8(%rsi),%r9 - movq 0+16(%rsi),%r10 - movq 0+24(%rsi),%r11 - movq 0+32(%rsi),%rbp - - movq %rdx,%rbx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbx - addq %rax,%rbx - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%rbp - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%rbp - - mulq %rbx - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - andq %rbx,%rbp - negq %rbp - mulq %rbx - addq %rax,%r11 - adcq %rdx,%rbp - movq %rcx,%rdx - movq 40+0(%rsi),%r12 - movq 40+8(%rsi),%r13 - 
movq 40+16(%rsi),%r14 - movq 40+24(%rsi),%r15 - movq 40+32(%rsi),%rcx - - movq %rdx,%rbx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbx - addq %rax,%rbx - - xorq %rdx,%r12 - xorq %rdx,%r13 - xorq %rdx,%r14 - xorq %rdx,%r15 - xorq %rdx,%rcx - addq %r12,%rax - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - adcq $0,%rcx - - mulq %rbx - movq %rax,%r12 - movq %r13,%rax - movq %rdx,%r13 - mulq %rbx - addq %rax,%r13 - movq %r14,%rax - adcq $0,%rdx - movq %rdx,%r14 - mulq %rbx - addq %rax,%r14 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r15 - andq %rbx,%rcx - negq %rcx - mulq %rbx - addq %rax,%r15 - adcq %rdx,%rcx - addq %r12,%r8 - adcq %r13,%r9 - adcq %r14,%r10 - adcq %r15,%r11 - adcq %rcx,%rbp - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %rbp,32(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__smulq_256_n_shift_by_31: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq %rdx,0(%rdi) - movq %rcx,8(%rdi) - movq %rdx,%rbp - movq 0+0(%rsi),%r8 - movq 0+8(%rsi),%r9 - movq 0+16(%rsi),%r10 - movq 0+24(%rsi),%r11 - - movq %rbp,%rbx - sarq $63,%rbp - xorq %rax,%rax - subq %rbp,%rax - - xorq %rbp,%rbx - addq %rax,%rbx - - xorq %rbp,%r8 - xorq %rbp,%r9 - xorq %rbp,%r10 - xorq %rbp,%r11 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - - mulq %rbx - movq %rax,%r8 - movq %r9,%rax - andq %rbx,%rbp - negq %rbp - movq %rdx,%r9 - mulq %rbx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbx - addq %rax,%r11 - adcq %rdx,%rbp - movq 32+0(%rsi),%r12 - movq 32+8(%rsi),%r13 - movq 32+16(%rsi),%r14 - movq 32+24(%rsi),%r15 - - movq %rcx,%rbx - sarq $63,%rcx - xorq %rax,%rax - subq %rcx,%rax - - xorq %rcx,%rbx - addq %rax,%rbx - - xorq %rcx,%r12 - xorq %rcx,%r13 - xorq %rcx,%r14 - xorq %rcx,%r15 - addq %r12,%rax - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - - mulq %rbx - movq %rax,%r12 - movq %r13,%rax - andq %rbx,%rcx - negq %rcx - movq %rdx,%r13 - mulq %rbx - addq %rax,%r13 - movq %r14,%rax - adcq $0,%rdx - movq %rdx,%r14 - mulq %rbx - addq %rax,%r14 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r15 - mulq %rbx - addq %rax,%r15 - adcq %rdx,%rcx - addq %r12,%r8 - adcq %r13,%r9 - adcq %r14,%r10 - adcq %r15,%r11 - adcq %rcx,%rbp - - movq 0(%rdi),%rdx - movq 8(%rdi),%rcx - - shrdq $31,%r9,%r8 - shrdq $31,%r10,%r9 - shrdq $31,%r11,%r10 - shrdq $31,%rbp,%r11 - - sarq $63,%rbp - xorq %rax,%rax - subq %rbp,%rax - - xorq %rbp,%r8 - xorq %rbp,%r9 - xorq %rbp,%r10 - xorq %rbp,%r11 - addq %rax,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - xorq %rbp,%rdx - xorq %rbp,%rcx - addq %rax,%rdx - addq %rax,%rcx - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__ab_approximation_31_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 24(%rsi),%r9 - movq 56(%rsi),%r11 - movq 16(%rsi),%rbx - movq 48(%rsi),%rbp - movq 8(%rsi),%r8 - movq 40(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - movq 0(%rsi),%r8 - cmovzq %r10,%rbp - movq 32(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - cmovzq %r10,%rbp - - movq %r9,%rax - orq %r11,%rax - bsrq %rax,%rcx - leaq 1(%rcx),%rcx - cmovzq %r8,%r9 - cmovzq %r10,%r11 - cmovzq %rax,%rcx - negq %rcx - - - shldq %cl,%rbx,%r9 - shldq %cl,%rbp,%r11 - - movl $0x7FFFFFFF,%eax - andq %rax,%r8 - andq %rax,%r10 - notq %rax - andq 
%rax,%r9 - andq %rax,%r11 - orq %r9,%r8 - orq %r11,%r10 - - jmp __inner_loop_31_256 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__inner_loop_31_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq $0x7FFFFFFF80000000,%rcx - movq $0x800000007FFFFFFF,%r13 - movq $0x7FFFFFFF7FFFFFFF,%r15 - -L$oop_31_256: - cmpq %r10,%r8 - movq %r8,%rax - movq %r10,%rbx - movq %rcx,%rbp - movq %r13,%r14 - cmovbq %r10,%r8 - cmovbq %rax,%r10 - cmovbq %r13,%rcx - cmovbq %rbp,%r13 - - subq %r10,%r8 - subq %r13,%rcx - addq %r15,%rcx - - testq $1,%rax - cmovzq %rax,%r8 - cmovzq %rbx,%r10 - cmovzq %rbp,%rcx - cmovzq %r14,%r13 - - shrq $1,%r8 - addq %r13,%r13 - subq %r15,%r13 - subl $1,%edx - jnz L$oop_31_256 - - shrq $32,%r15 - movl %ecx,%edx - movl %r13d,%r12d - shrq $32,%rcx - shrq $32,%r13 - subq %r15,%rdx - subq %r15,%rcx - subq %r15,%r12 - subq %r15,%r13 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__inner_loop_62_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movl %edx,%r15d - movq $1,%rdx - xorq %rcx,%rcx - xorq %r12,%r12 - movq %rdx,%r13 - movq %rdx,%r14 - -L$oop_62_256: - xorq %rax,%rax - testq %r14,%r8 - movq %r10,%rbx - cmovnzq %r10,%rax - subq %r8,%rbx - movq %r8,%rbp - subq %rax,%r8 - cmovcq %rbx,%r8 - cmovcq %rbp,%r10 - movq %rdx,%rax - cmovcq %r12,%rdx - cmovcq %rax,%r12 - movq %rcx,%rbx - cmovcq %r13,%rcx - cmovcq %rbx,%r13 - xorq %rax,%rax - xorq %rbx,%rbx - shrq $1,%r8 - testq %r14,%rbp - cmovnzq %r12,%rax - cmovnzq %r13,%rbx - addq %r12,%r12 - addq %r13,%r13 - subq %rax,%rdx - subq %rbx,%rcx - subl $1,%r15d - jnz L$oop_62_256 - - .byte 0xf3,0xc3 -.cfi_endproc - diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S deleted file mode 100644 index b9c3acde200..00000000000 --- a/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S +++ /dev/null @@ -1,718 +0,0 @@ -.text - -.globl _ct_inverse_mod_383 -.private_extern _ct_inverse_mod_383 - -.align 5 -_ct_inverse_mod_383: -.long 3573752639 - stp x29, x30, [sp,#-128]! - add x29, sp, #0 - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - stp x27, x28, [sp,#80] - sub sp, sp, #1040 - - ldp x22, x4, [x1,#8*0] - ldp x5, x6, [x1,#8*2] - ldp x7, x8, [x1,#8*4] - - add x1, sp, #16+511 // find closest 512-byte-aligned spot - and x1, x1, #-512 // in the frame... 
- stp x0, x3, [sp] - - ldp x9, x10, [x2,#8*0] - ldp x11, x12, [x2,#8*2] - ldp x13, x14, [x2,#8*4] - - stp x22, x4, [x1,#8*0] // copy input to |a| - stp x5, x6, [x1,#8*2] - stp x7, x8, [x1,#8*4] - stp x9, x10, [x1,#8*6] // copy modulus to |b| - stp x11, x12, [x1,#8*8] - stp x13, x14, [x1,#8*10] - - ////////////////////////////////////////// first iteration - mov x2, #62 - bl Lab_approximation_62_loaded - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - str x15,[x0,#8*12] // initialize |u| with |f0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to dst |b| - bl __smul_383_n_shift_by_62 - str x15, [x0,#8*12] // initialize |v| with |f1| - - ////////////////////////////////////////// second iteration - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - ldr x7, [x1,#8*12] // |u| - ldr x8, [x1,#8*18] // |v| - mul x3, x20, x7 // |u|*|f0| - smulh x4, x20, x7 - mul x5, x21, x8 // |v|*|g0| - smulh x6, x21, x8 - adds x3, x3, x5 - adc x4, x4, x6 - stp x3, x4, [x0,#8*6] - asr x5, x4, #63 // sign extension - stp x5, x5, [x0,#8*8] - stp x5, x5, [x0,#8*10] - - mul x3, x15, x7 // |u|*|f1| - smulh x4, x15, x7 - mul x5, x16, x8 // |v|*|g1| - smulh x6, x16, x8 - adds x3, x3, x5 - adc x4, x4, x6 - stp x3, x4, [x0,#8*12] - asr x5, x4, #63 // sign extension - stp x5, x5, [x0,#8*14] - stp x5, x5, [x0,#8*16] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // 
corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - asr x27, x27, #63 // sign extension - stp x27, x27, [x0,#8*6] - stp x27, x27, [x0,#8*8] - stp x27, x27, [x0,#8*10] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - ////////////////////////////////////////// iteration before last - eor x1, x1, #256 // flip-flop src 
|a|b|u|v| - mov x2, #62 - //bl __ab_approximation_62 // |a| and |b| are exact, - ldp x3, x8, [x1,#8*0] // just load - ldp x9, x14, [x1,#8*6] - bl __inner_loop_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - str x3, [x0,#8*0] - str x9, [x0,#8*6] - - mov x20, x15 // exact |f0| - mov x21, x16 // exact |g0| - mov x15, x17 - mov x16, x19 - add x0, x0, #8*12 // pointer to dst |u| - bl __smul_383x63 - - mov x20, x15 // exact |f1| - mov x21, x16 // exact |g1| - add x0, x0, #8*6 // pointer to dst |v| - bl __smul_383x63 - bl __smul_767x63_tail - - ////////////////////////////////////////// last iteration - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #22 // 766 % 62 - //bl __ab_approximation_62 // |a| and |b| are exact, - ldr x3, [x1,#8*0] // just load - eor x8, x8, x8 - ldr x9, [x1,#8*6] - eor x14, x14, x14 - bl __inner_loop_62 - - mov x20, x17 - mov x21, x19 - ldp x0, x15, [sp] // original out_ptr and n_ptr - bl __smul_383x63 - bl __smul_767x63_tail - ldr x30, [x29,#8] - - asr x22, x8, #63 // sign as mask - ldp x9, x10, [x15,#8*0] - ldp x11, x12, [x15,#8*2] - ldp x13, x14, [x15,#8*4] - - and x9, x9, x22 // add mod<<384 conditionally - and x10, x10, x22 - adds x3, x3, x9 - and x11, x11, x22 - adcs x4, x4, x10 - and x12, x12, x22 - adcs x5, x5, x11 - and x13, x13, x22 - adcs x6, x6, x12 - and x14, x14, x22 - stp x3, x4, [x0,#8*6] - adcs x7, x7, x13 - stp x5, x6, [x0,#8*8] - adc x8, x8, x14 - stp x7, x8, [x0,#8*10] - - add sp, sp, #1040 - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldp x27, x28, [x29,#80] - ldr x29, [sp],#128 -.long 3573752767 - ret - - -//////////////////////////////////////////////////////////////////////// -// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
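The routine deleted above is a Bernstein-Yang-style constant-time extended GCD: each pass condenses |a| and |b| to double-limb approximations (__ab_approximation_62), runs a fixed batch of 62 branchless division steps (__inner_loop_62) to obtain transition factors |f0|, |g0|, |f1|, |g1|, and then applies those factors to the full-width values (__smul_383_n_shift_by_62, __smul_383x63). As a rough illustration only, not the deleted code, here is a single-limb Go sketch of one such batch; the variable names mirror the register comments above, and the sel helper is introduced here purely for the example.

```go
package main

import (
	"fmt"
	"math/bits"
)

// innerLoop62 runs n branchless division steps on single-limb a and b,
// mirroring the register comments in __inner_loop_62. The returned factors
// describe the batch as a linear map: conceptually,
//   a' = (f0*a + g0*b) >> n  and  b' = (f1*a + g1*b) >> n,
// which is what the full-width __smul_* helpers then apply.
func innerLoop62(a, b uint64, n uint) (f0, g0, f1, g1 int64) {
	f0, g0, f1, g1 = 1, 0, 0, 1
	for ; n > 0; n-- {
		odd := -(a & 1)                      // all-ones mask iff |a_| is odd
		t, borrow := bits.Sub64(a, b&odd, 0) // |a_|-|b_| (or |a_|-0 if |a_| was even)
		swap := -borrow                      // borrow means |a_| < |b_|
		aOld := a
		a = (t &^ swap) | ((b - aOld) & swap) // replace with |b_|-|a_| on borrow
		b = (b &^ swap) | (aOld & swap)       // |b_| = |a_| on borrow
		f0, f1 = sel(f0, f1, swap), sel(f1, f0, swap) // exchange |f0| and |f1|
		g0, g1 = sel(g0, g1, swap), sel(g1, g0, swap) // exchange |g0| and |g1|
		a >>= 1
		f0 -= f1 & int64(odd) // |f0| -= |f1| (or |f0| -= 0 if |a_| was even)
		g0 -= g1 & int64(odd) // |g0| -= |g1| likewise
		f1 <<= 1              // |f1| <<= 1
		g1 <<= 1              // |g1| <<= 1
	}
	return
}

// sel returns x when mask is zero and y when mask is all-ones, branch-free.
func sel(x, y int64, mask uint64) int64 {
	m := int64(mask)
	return (x &^ m) | (y & m)
}

func main() {
	fmt.Println(innerLoop62(0x789abcdef1234567, 0x0fedcba987654321, 62))
}
```

The batch size of 62 is what the x63 suffix on the multiply helpers reflects: 62 doublings keep each factor within a signed 64-bit limb, so a whole batch can be applied with one signed multiply per limb.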
- -.align 5 -__smul_383x63: - ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) - asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) - ldp x5, x6, [x1,#8*2+96] - eor x20, x20, x17 // conditionally negate |f_| (or |g_|) - ldp x7, x8, [x1,#8*4+96] - - eor x3, x3, x17 // conditionally negate |u| (or |v|) - sub x20, x20, x17 - eor x4, x4, x17 - adds x3, x3, x17, lsr#63 - eor x5, x5, x17 - adcs x4, x4, xzr - eor x6, x6, x17 - adcs x5, x5, xzr - eor x7, x7, x17 - adcs x6, x6, xzr - umulh x22, x3, x20 - eor x8, x8, x17 - umulh x23, x4, x20 - adcs x7, x7, xzr - umulh x24, x5, x20 - adcs x8, x8, xzr - umulh x25, x6, x20 - umulh x26, x7, x20 - mul x3, x3, x20 - mul x4, x4, x20 - mul x5, x5, x20 - adds x4, x4, x22 - mul x6, x6, x20 - adcs x5, x5, x23 - mul x7, x7, x20 - adcs x6, x6, x24 - mul x27,x8, x20 - adcs x7, x7, x25 - adcs x27,x27,x26 - adc x2, xzr, xzr - ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) - asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) - ldp x11, x12, [x1,#8*2+144] - eor x21, x21, x17 // conditionally negate |f_| (or |g_|) - ldp x13, x14, [x1,#8*4+144] - - eor x9, x9, x17 // conditionally negate |u| (or |v|) - sub x21, x21, x17 - eor x10, x10, x17 - adds x9, x9, x17, lsr#63 - eor x11, x11, x17 - adcs x10, x10, xzr - eor x12, x12, x17 - adcs x11, x11, xzr - eor x13, x13, x17 - adcs x12, x12, xzr - umulh x22, x9, x21 - eor x14, x14, x17 - umulh x23, x10, x21 - adcs x13, x13, xzr - umulh x24, x11, x21 - adcs x14, x14, xzr - umulh x25, x12, x21 - adc x19, xzr, xzr // used in __smul_767x63_tail - umulh x26, x13, x21 - mul x9, x9, x21 - mul x10, x10, x21 - mul x11, x11, x21 - adds x10, x10, x22 - mul x12, x12, x21 - adcs x11, x11, x23 - mul x13, x13, x21 - adcs x12, x12, x24 - mul x28,x14, x21 - adcs x13, x13, x25 - adcs x28,x28,x26 - adc x2, x2, xzr - - adds x3, x3, x9 - adcs x4, x4, x10 - adcs x5, x5, x11 - adcs x6, x6, x12 - stp x3, x4, [x0,#8*0] - adcs x7, x7, x13 - stp x5, x6, [x0,#8*2] - adcs x27, x27, x28 - stp x7, x27, [x0,#8*4] - adc x28, x2, xzr // used in __smul_767x63_tail - - ret - - - -.align 5 -__smul_767x63_tail: - smulh x27, x8, x20 - ldp x3, x4, [x1,#8*24] // load rest of |v| - umulh x14,x14, x21 - ldp x5, x6, [x1,#8*26] - ldp x7, x8, [x1,#8*28] - - eor x3, x3, x17 // conditionally negate rest of |v| - eor x4, x4, x17 - eor x5, x5, x17 - adds x3, x3, x19 - eor x6, x6, x17 - adcs x4, x4, xzr - eor x7, x7, x17 - adcs x5, x5, xzr - eor x8, x8, x17 - adcs x6, x6, xzr - umulh x22, x3, x21 - adcs x7, x7, xzr - umulh x23, x4, x21 - adc x8, x8, xzr - - umulh x24, x5, x21 - add x14, x14, x28 - umulh x25, x6, x21 - asr x28, x27, #63 - umulh x26, x7, x21 - mul x3, x3, x21 - mul x4, x4, x21 - mul x5, x5, x21 - adds x3, x3, x14 - mul x6, x6, x21 - adcs x4, x4, x22 - mul x7, x7, x21 - adcs x5, x5, x23 - mul x8, x8, x21 - adcs x6, x6, x24 - adcs x7, x7, x25 - adc x8, x8, x26 - - adds x3, x3, x27 - adcs x4, x4, x28 - adcs x5, x5, x28 - adcs x6, x6, x28 - stp x3, x4, [x0,#8*6] - adcs x7, x7, x28 - stp x5, x6, [x0,#8*8] - adc x8, x8, x28 - stp x7, x8, [x0,#8*10] - - ret - - - -.align 5 -__smul_383_n_shift_by_62: - ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) - asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) - ldp x5, x6, [x1,#8*2+0] - eor x2, x15, x28 // conditionally negate |f0| (or |g0|) - ldp x7, x8, [x1,#8*4+0] - - eor x3, x3, x28 // conditionally negate |a| (or |b|) - sub x2, x2, x28 - eor x4, x4, x28 - adds x3, x3, x28, lsr#63 - eor x5, x5, x28 - adcs x4, x4, xzr - eor x6, x6, x28 - adcs x5, x5, xzr - eor x7, x7, x28 - umulh x22, x3, x2 - adcs x6, x6, xzr - umulh x23, 
x4, x2 - eor x8, x8, x28 - umulh x24, x5, x2 - adcs x7, x7, xzr - umulh x25, x6, x2 - adc x8, x8, xzr - - umulh x26, x7, x2 - smulh x27, x8, x2 - mul x3, x3, x2 - mul x4, x4, x2 - mul x5, x5, x2 - adds x4, x4, x22 - mul x6, x6, x2 - adcs x5, x5, x23 - mul x7, x7, x2 - adcs x6, x6, x24 - mul x8, x8, x2 - adcs x7, x7, x25 - adcs x8, x8 ,x26 - adc x27, x27, xzr - ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) - asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) - ldp x11, x12, [x1,#8*2+48] - eor x2, x16, x28 // conditionally negate |f0| (or |g0|) - ldp x13, x14, [x1,#8*4+48] - - eor x9, x9, x28 // conditionally negate |a| (or |b|) - sub x2, x2, x28 - eor x10, x10, x28 - adds x9, x9, x28, lsr#63 - eor x11, x11, x28 - adcs x10, x10, xzr - eor x12, x12, x28 - adcs x11, x11, xzr - eor x13, x13, x28 - umulh x22, x9, x2 - adcs x12, x12, xzr - umulh x23, x10, x2 - eor x14, x14, x28 - umulh x24, x11, x2 - adcs x13, x13, xzr - umulh x25, x12, x2 - adc x14, x14, xzr - - umulh x26, x13, x2 - smulh x28, x14, x2 - mul x9, x9, x2 - mul x10, x10, x2 - mul x11, x11, x2 - adds x10, x10, x22 - mul x12, x12, x2 - adcs x11, x11, x23 - mul x13, x13, x2 - adcs x12, x12, x24 - mul x14, x14, x2 - adcs x13, x13, x25 - adcs x14, x14 ,x26 - adc x28, x28, xzr - adds x3, x3, x9 - adcs x4, x4, x10 - adcs x5, x5, x11 - adcs x6, x6, x12 - adcs x7, x7, x13 - adcs x8, x8, x14 - adc x9, x27, x28 - - extr x3, x4, x3, #62 - extr x4, x5, x4, #62 - extr x5, x6, x5, #62 - asr x28, x9, #63 - extr x6, x7, x6, #62 - extr x7, x8, x7, #62 - extr x8, x9, x8, #62 - - eor x3, x3, x28 - eor x4, x4, x28 - adds x3, x3, x28, lsr#63 - eor x5, x5, x28 - adcs x4, x4, xzr - eor x6, x6, x28 - adcs x5, x5, xzr - eor x7, x7, x28 - adcs x6, x6, xzr - eor x8, x8, x28 - stp x3, x4, [x0,#8*0] - adcs x7, x7, xzr - stp x5, x6, [x0,#8*2] - adc x8, x8, xzr - stp x7, x8, [x0,#8*4] - - eor x15, x15, x28 - eor x16, x16, x28 - sub x15, x15, x28 - sub x16, x16, x28 - - ret - - -.align 4 -__ab_approximation_62: - ldp x7, x8, [x1,#8*4] - ldp x13, x14, [x1,#8*10] - ldp x5, x6, [x1,#8*2] - ldp x11, x12, [x1,#8*8] - -Lab_approximation_62_loaded: - orr x22, x8, x14 // check top-most limbs, ... - cmp x22, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x6, ne - orr x22, x8, x14 // ... ones before top-most, ... - csel x13, x13, x12, ne - - ldp x3, x4, [x1,#8*0] - ldp x9, x10, [x1,#8*6] - - cmp x22, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x5, ne - orr x22, x8, x14 // ... and ones before that ... 
- csel x13, x13, x11, ne - - cmp x22, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x4, ne - orr x22, x8, x14 - csel x13, x13, x10, ne - - clz x22, x22 - cmp x22, #64 - csel x22, x22, xzr, ne - csel x8, x8, x7, ne - csel x14, x14, x13, ne - neg x23, x22 - - lslv x8, x8, x22 // align high limbs to the left - lslv x14, x14, x22 - lsrv x7, x7, x23 - lsrv x13, x13, x23 - and x7, x7, x23, asr#6 - and x13, x13, x23, asr#6 - orr x8, x8, x7 - orr x14, x14, x13 - - b __inner_loop_62 - ret - - -.align 4 -__inner_loop_62: - mov x15, #1 // |f0|=1 - mov x16, #0 // |g0|=0 - mov x17, #0 // |f1|=0 - mov x19, #1 // |g1|=1 - -Loop_62: - sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting - sub x2, x2, #1 - subs x24, x9, x3 // |b_|-|a_| - and x22, x9, x28 - sbc x25, x14, x8 - and x23, x14, x28 - subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) - mov x22, x15 - sbcs x27, x8, x23 - mov x23, x16 - csel x9, x9, x3, hs // |b_| = |a_| - csel x14, x14, x8, hs - csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel x8, x27, x25, hs - csel x15, x15, x17, hs // exchange |f0| and |f1| - csel x17, x17, x22, hs - csel x16, x16, x19, hs // exchange |g0| and |g1| - csel x19, x19, x23, hs - extr x3, x8, x3, #1 - lsr x8, x8, #1 - and x22, x17, x28 - and x23, x19, x28 - add x17, x17, x17 // |f1|<<=1 - add x19, x19, x19 // |g1|<<=1 - sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) - cbnz x2, Loop_62 - - ret - diff --git a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S deleted file mode 100644 index 9fe0df88b59..00000000000 --- a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S +++ /dev/null @@ -1,325 +0,0 @@ -.text - -.globl _ct_is_square_mod_384 -.private_extern _ct_is_square_mod_384 - -.align 5 -_ct_is_square_mod_384: -.long 3573752639 - stp x29, x30, [sp,#-128]! - add x29, sp, #0 - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - stp x27, x28, [sp,#80] - sub sp, sp, #512 - - ldp x3, x4, [x0,#8*0] // load input - ldp x5, x6, [x0,#8*2] - ldp x7, x8, [x0,#8*4] - - add x0, sp, #255 // find closest 256-byte-aligned spot - and x0, x0, #-256 // in the frame... 
- - ldp x9, x10, [x1,#8*0] // load modulus - ldp x11, x12, [x1,#8*2] - ldp x13, x14, [x1,#8*4] - - stp x3, x4, [x0,#8*6] // copy input to |a| - stp x5, x6, [x0,#8*8] - stp x7, x8, [x0,#8*10] - stp x9, x10, [x0,#8*0] // copy modulus to |b| - stp x11, x12, [x0,#8*2] - stp x13, x14, [x0,#8*4] - - eor x2, x2, x2 // init the Legendre symbol - mov x15, #24 // 24 is 768/30-1 - b Loop_is_square - -.align 4 -Loop_is_square: - bl __ab_approximation_30 - sub x15, x15, #1 - - eor x1, x0, #128 // pointer to dst |b| - bl __smul_384_n_shift_by_30 - - mov x19, x16 // |f0| - mov x20, x17 // |g0| - add x1, x1, #8*6 // pointer to dst |a| - bl __smul_384_n_shift_by_30 - - ldp x9, x10, [x1,#-8*6] - eor x0, x0, #128 // flip-flop src |a|b| - and x27, x27, x9 // if |a| was negative, - add x2, x2, x27, lsr#1 // adjust |L| - - cbnz x15, Loop_is_square - - ////////////////////////////////////////// last iteration - //bl __ab_approximation_30 // |a| and |b| are exact, - //ldr x8, [x0,#8*6] // and loaded - //ldr x14, [x0,#8*0] - mov x15, #48 // 48 is 768%30 + 30 - bl __inner_loop_48 - ldr x30, [x29,#8] - - and x0, x2, #1 - eor x0, x0, #1 - - add sp, sp, #512 - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldp x27, x28, [x29,#80] - ldr x29, [sp],#128 -.long 3573752767 - ret - - - -.align 5 -__smul_384_n_shift_by_30: - ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) - asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) - ldp x5, x6, [x0,#8*2+0] - eor x20, x20, x27 // conditionally negate |g1| (or |f1|) - ldp x7, x8, [x0,#8*4+0] - - eor x3, x3, x27 // conditionally negate |b| (or |a|) - sub x20, x20, x27 - eor x4, x4, x27 - adds x3, x3, x27, lsr#63 - eor x5, x5, x27 - adcs x4, x4, xzr - eor x6, x6, x27 - adcs x5, x5, xzr - eor x7, x7, x27 - umulh x21, x3, x20 - adcs x6, x6, xzr - umulh x22, x4, x20 - eor x8, x8, x27 - umulh x23, x5, x20 - adcs x7, x7, xzr - umulh x24, x6, x20 - adc x8, x8, xzr - - umulh x25, x7, x20 - and x28, x20, x27 - umulh x26, x8, x20 - neg x28, x28 - mul x3, x3, x20 - mul x4, x4, x20 - mul x5, x5, x20 - adds x4, x4, x21 - mul x6, x6, x20 - adcs x5, x5, x22 - mul x7, x7, x20 - adcs x6, x6, x23 - mul x8, x8, x20 - adcs x7, x7, x24 - adcs x8, x8 ,x25 - adc x26, x26, x28 - ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) - asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) - ldp x11, x12, [x0,#8*2+48] - eor x19, x19, x27 // conditionally negate |g1| (or |f1|) - ldp x13, x14, [x0,#8*4+48] - - eor x9, x9, x27 // conditionally negate |b| (or |a|) - sub x19, x19, x27 - eor x10, x10, x27 - adds x9, x9, x27, lsr#63 - eor x11, x11, x27 - adcs x10, x10, xzr - eor x12, x12, x27 - adcs x11, x11, xzr - eor x13, x13, x27 - umulh x21, x9, x19 - adcs x12, x12, xzr - umulh x22, x10, x19 - eor x14, x14, x27 - umulh x23, x11, x19 - adcs x13, x13, xzr - umulh x24, x12, x19 - adc x14, x14, xzr - - umulh x25, x13, x19 - and x28, x19, x27 - umulh x27, x14, x19 - neg x28, x28 - mul x9, x9, x19 - mul x10, x10, x19 - mul x11, x11, x19 - adds x10, x10, x21 - mul x12, x12, x19 - adcs x11, x11, x22 - mul x13, x13, x19 - adcs x12, x12, x23 - mul x14, x14, x19 - adcs x13, x13, x24 - adcs x14, x14 ,x25 - adc x27, x27, x28 - adds x3, x3, x9 - adcs x4, x4, x10 - adcs x5, x5, x11 - adcs x6, x6, x12 - adcs x7, x7, x13 - adcs x8, x8, x14 - adc x9, x26, x27 - - extr x3, x4, x3, #30 - extr x4, x5, x4, #30 - extr x5, x6, x5, #30 - asr x27, x9, #63 - extr x6, x7, x6, #30 - extr x7, x8, x7, #30 - extr x8, x9, x8, #30 - - eor x3, x3, x27 - eor x4, x4, x27 - adds x3, x3, x27, lsr#63 - eor 
x5, x5, x27 - adcs x4, x4, xzr - eor x6, x6, x27 - adcs x5, x5, xzr - eor x7, x7, x27 - adcs x6, x6, xzr - eor x8, x8, x27 - stp x3, x4, [x1,#8*0] - adcs x7, x7, xzr - stp x5, x6, [x1,#8*2] - adc x8, x8, xzr - stp x7, x8, [x1,#8*4] - - ret - - -.align 4 -__ab_approximation_30: - ldp x13, x14, [x0,#8*4] // |a| is still in registers - ldp x11, x12, [x0,#8*2] - - orr x21, x8, x14 // check top-most limbs, ... - cmp x21, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x6, ne - orr x21, x8, x14 // ... ones before top-most, ... - csel x13, x13, x12, ne - - cmp x21, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x5, ne - orr x21, x8, x14 // ... and ones before that ... - csel x13, x13, x11, ne - - cmp x21, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x4, ne - orr x21, x8, x14 // and one more, ... - csel x13, x13, x10, ne - - cmp x21, #0 - csel x8, x8, x7, ne - csel x14, x14, x13, ne - csel x7, x7, x3, ne - orr x21, x8, x14 - csel x13, x13, x9, ne - - clz x21, x21 - cmp x21, #64 - csel x21, x21, xzr, ne - csel x8, x8, x7, ne - csel x14, x14, x13, ne - neg x22, x21 - - lslv x8, x8, x21 // align high limbs to the left - lslv x14, x14, x21 - lsrv x7, x7, x22 - lsrv x13, x13, x22 - and x7, x7, x22, asr#6 - and x13, x13, x22, asr#6 - orr x8, x8, x7 - orr x14, x14, x13 - - bfxil x8, x3, #0, #32 - bfxil x14, x9, #0, #32 - - b __inner_loop_30 - ret - - - -.align 4 -__inner_loop_30: - mov x28, #30 - mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 - mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 - mov x27,#0x7FFFFFFF7FFFFFFF - -Loop_30: - sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting - and x25, x8, x14 - sub x28, x28, #1 - and x21, x14, x24 - - sub x22, x14, x8 // |b_|-|a_| - subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) - add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 - mov x21, x20 - csel x14, x14, x8, hs // |b_| = |a_| - csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel x20, x20, x17, hs // exchange |fg0| and |fg1| - csel x17, x17, x21, hs - csel x2, x2, x25, hs - lsr x8, x8, #1 - and x21, x20, x24 - and x22, x27, x24 - add x23, x14, #2 - sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - add x20, x20, x20 // |f1|<<=1 - add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 - add x17, x17, x22 - sub x20, x20, x27 - - cbnz x28, Loop_30 - - mov x27, #0x7FFFFFFF - ubfx x16, x17, #0, #32 - ubfx x17, x17, #32, #32 - ubfx x19, x20, #0, #32 - ubfx x20, x20, #32, #32 - sub x16, x16, x27 // remove the bias - sub x17, x17, x27 - sub x19, x19, x27 - sub x20, x20, x27 - - ret - - -.align 4 -__inner_loop_48: -Loop_48: - sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting - and x25, x3, x9 - sub x15, x15, #1 - and x21, x9, x24 - sub x22, x9, x3 // |b_|-|a_| - subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) - add x25, x2, x25, lsr#1 - csel x9, x9, x3, hs // |b_| = |a_| - csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| - csel x2, x2, x25, hs - add x23, x9, #2 - lsr x3, x3, #1 - add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 - - cbnz x15, Loop_48 - - ret - diff --git a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s deleted file mode 100644 index 5faadb8dbff..00000000000 --- a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s +++ /dev/null @@ -1,472 +0,0 @@ -.text - -.globl _ct_is_square_mod_384 -.private_extern _ct_is_square_mod_384 - -.p2align 5 
-_ct_is_square_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $536,%rsp -.cfi_adjust_cfa_offset 536 - - - leaq 24+255(%rsp),%rax - andq $-256,%rax - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%r12 - movq 40(%rdi),%r13 - - movq 0(%rsi),%r14 - movq 8(%rsi),%r15 - movq 16(%rsi),%rbx - movq 24(%rsi),%rcx - movq 32(%rsi),%rdx - movq 40(%rsi),%rdi - movq %rax,%rsi - - movq %r8,0(%rax) - movq %r9,8(%rax) - movq %r10,16(%rax) - movq %r11,24(%rax) - movq %r12,32(%rax) - movq %r13,40(%rax) - - movq %r14,48(%rax) - movq %r15,56(%rax) - movq %rbx,64(%rax) - movq %rcx,72(%rax) - movq %rdx,80(%rax) - movq %rdi,88(%rax) - - xorq %rbp,%rbp - movl $24,%ecx - jmp L$oop_is_square - -.p2align 5 -L$oop_is_square: - movl %ecx,16(%rsp) - - call __ab_approximation_30 - movq %rax,0(%rsp) - movq %rbx,8(%rsp) - - movq $128+48,%rdi - xorq %rsi,%rdi - call __smulq_384_n_shift_by_30 - - movq 0(%rsp),%rdx - movq 8(%rsp),%rcx - leaq -48(%rdi),%rdi - call __smulq_384_n_shift_by_30 - - movl 16(%rsp),%ecx - xorq $128,%rsi - - andq 48(%rdi),%r14 - shrq $1,%r14 - addq %r14,%rbp - - subl $1,%ecx - jnz L$oop_is_square - - - - - movq 48(%rsi),%r9 - call __inner_loop_48 - - movq $1,%rax - andq %rbp,%rax - xorq $1,%rax - - leaq 536(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -536-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__smulq_384_n_shift_by_30: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbx - addq %rax,%rbx - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - movq %rdx,%r14 - andq %rbx,%r14 - mulq %rbx - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbx - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbx - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - negq %r14 - mulq %rbx - addq %rax,%r13 - adcq %rdx,%r14 - leaq 48(%rsi),%rsi - movq %rcx,%rdx - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbx - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbx - addq %rax,%rbx - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - movq %rdx,%r15 - 
andq %rbx,%r15 - mulq %rbx - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbx - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbx - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbx - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbx - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - negq %r15 - mulq %rbx - addq %rax,%r13 - adcq %rdx,%r15 - leaq -48(%rsi),%rsi - - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq %r15,%r14 - - shrdq $30,%r9,%r8 - shrdq $30,%r10,%r9 - shrdq $30,%r11,%r10 - shrdq $30,%r12,%r11 - shrdq $30,%r13,%r12 - shrdq $30,%r14,%r13 - - sarq $63,%r14 - xorq %rbx,%rbx - subq %r14,%rbx - - xorq %r14,%r8 - xorq %r14,%r9 - xorq %r14,%r10 - xorq %r14,%r11 - xorq %r14,%r12 - xorq %r14,%r13 - addq %rbx,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__ab_approximation_30: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 88(%rsi),%rbx - movq 80(%rsi),%r15 - movq 72(%rsi),%r14 - - movq %r13,%rax - orq %rbx,%rax - cmovzq %r12,%r13 - cmovzq %r15,%rbx - cmovzq %r11,%r12 - movq 64(%rsi),%r11 - cmovzq %r14,%r15 - - movq %r13,%rax - orq %rbx,%rax - cmovzq %r12,%r13 - cmovzq %r15,%rbx - cmovzq %r10,%r12 - movq 56(%rsi),%r10 - cmovzq %r11,%r15 - - movq %r13,%rax - orq %rbx,%rax - cmovzq %r12,%r13 - cmovzq %r15,%rbx - cmovzq %r9,%r12 - movq 48(%rsi),%r9 - cmovzq %r10,%r15 - - movq %r13,%rax - orq %rbx,%rax - cmovzq %r12,%r13 - cmovzq %r15,%rbx - cmovzq %r8,%r12 - cmovzq %r9,%r15 - - movq %r13,%rax - orq %rbx,%rax - bsrq %rax,%rcx - leaq 1(%rcx),%rcx - cmovzq %r8,%r13 - cmovzq %r9,%rbx - cmovzq %rax,%rcx - negq %rcx - - - shldq %cl,%r12,%r13 - shldq %cl,%r15,%rbx - - movq $0xFFFFFFFF00000000,%rax - movl %r8d,%r8d - movl %r9d,%r9d - andq %rax,%r13 - andq %rax,%rbx - orq %r13,%r8 - orq %rbx,%r9 - - jmp __inner_loop_30 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__inner_loop_30: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq $0x7FFFFFFF80000000,%rbx - movq $0x800000007FFFFFFF,%rcx - leaq -1(%rbx),%r15 - movl $30,%edi - -L$oop_30: - movq %r8,%rax - andq %r9,%rax - shrq $1,%rax - - cmpq %r9,%r8 - movq %r8,%r10 - movq %r9,%r11 - leaq (%rax,%rbp,1),%rax - movq %rbx,%r12 - movq %rcx,%r13 - movq %rbp,%r14 - cmovbq %r9,%r8 - cmovbq %r10,%r9 - cmovbq %rcx,%rbx - cmovbq %r12,%rcx - cmovbq %rax,%rbp - - subq %r9,%r8 - subq %rcx,%rbx - addq %r15,%rbx - - testq $1,%r10 - cmovzq %r10,%r8 - cmovzq %r11,%r9 - cmovzq %r12,%rbx - cmovzq %r13,%rcx - cmovzq %r14,%rbp - - leaq 2(%r9),%rax - shrq $1,%r8 - shrq $2,%rax - addq %rcx,%rcx - leaq (%rax,%rbp,1),%rbp - subq %r15,%rcx - - subl $1,%edi - jnz L$oop_30 - - shrq $32,%r15 - movl %ebx,%eax - shrq $32,%rbx - movl %ecx,%edx - shrq $32,%rcx - subq %r15,%rax - subq %r15,%rbx - subq %r15,%rdx - subq %r15,%rcx - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__inner_loop_48: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movl $48,%edi - -L$oop_48: - movq %r8,%rax - andq %r9,%rax - shrq $1,%rax - - cmpq %r9,%r8 - movq %r8,%r10 - movq %r9,%r11 - leaq (%rax,%rbp,1),%rax - movq %rbp,%r12 - cmovbq %r9,%r8 - cmovbq %r10,%r9 - cmovbq %rax,%rbp - - subq %r9,%r8 - - testq $1,%r10 - cmovzq %r10,%r8 - cmovzq %r11,%r9 - cmovzq %r12,%rbp - - leaq 2(%r9),%rax - shrq $1,%r8 - shrq 
$2,%rax - addq %rax,%rbp - - subl $1,%edi - jnz L$oop_48 - - .byte 0xf3,0xc3 -.cfi_endproc - diff --git a/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s deleted file mode 100644 index eebe131d0cb..00000000000 --- a/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s +++ /dev/null @@ -1,1193 +0,0 @@ -.comm ___blst_platform_cap,4 -.text - -.globl _ct_inverse_mod_383 -.private_extern _ct_inverse_mod_383 - -.p2align 5 -_ct_inverse_mod_383: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz ct_inverse_mod_383$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $1112,%rsp -.cfi_adjust_cfa_offset 1112 - - - leaq 88+511(%rsp),%rax - andq $-512,%rax - movq %rdi,32(%rsp) - movq %rcx,40(%rsp) - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq 0(%rdx),%r14 - movq 8(%rdx),%r15 - movq 16(%rdx),%rbx - movq 24(%rdx),%rbp - movq 32(%rdx),%rsi - movq 40(%rdx),%rdi - - movq %r8,0(%rax) - movq %r9,8(%rax) - movq %r10,16(%rax) - movq %r11,24(%rax) - movq %r12,32(%rax) - movq %r13,40(%rax) - - movq %r14,48(%rax) - movq %r15,56(%rax) - movq %rbx,64(%rax) - movq %rbp,72(%rax) - movq %rsi,80(%rax) - movq %rax,%rsi - movq %rdi,88(%rax) - - - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - - - movq %rdx,96(%rdi) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - - - movq %rdx,96(%rdi) - - - xorq $256,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - - - - movq 96(%rsi),%rax - movq 144(%rsi),%r11 - movq %rdx,%rbx - movq %rax,%r10 - imulq 56(%rsp) - movq %rax,%r8 - movq %r11,%rax - movq %rdx,%r9 - imulq 64(%rsp) - addq %rax,%r8 - adcq %rdx,%r9 - movq %r8,48(%rdi) - movq %r9,56(%rdi) - sarq $63,%r9 - movq %r9,64(%rdi) - movq %r9,72(%rdi) - movq %r9,80(%rdi) - movq %r9,88(%rdi) - leaq 96(%rsi),%rsi - - movq %r10,%rax - imulq %rbx - movq %rax,%r8 - movq %r11,%rax - movq %rdx,%r9 - imulq %rcx - addq %rax,%r8 - adcq %rdx,%r9 - movq %r8,96(%rdi) - movq %r9,104(%rdi) - sarq $63,%r9 - movq %r9,112(%rdi) - movq %r9,120(%rdi) - movq %r9,128(%rdi) - movq %r9,136(%rdi) - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383x63 - xorq $256+96,%rsi - movl $62,%edi - call 
__ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383x63 - sarq $63,%r13 - movq %r13,48(%rdi) - movq %r13,56(%rdi) - movq %r13,64(%rdi) - movq %r13,72(%rdi) - movq %r13,80(%rdi) - movq %r13,88(%rdi) - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - 
- movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - xorq $256+96,%rsi - movl $62,%edi - call __ab_approximation_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_383_n_shift_by_62 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - - xorq $256+96,%rsi - movl $62,%edi - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 48(%rsi),%r10 - movq 56(%rsi),%r11 - call __inner_loop_62 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - movq %r8,0(%rdi) - movq %r10,48(%rdi) - - - - leaq 96(%rsi),%rsi - leaq 96(%rdi),%rdi - call __smulq_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulq_767x63 - - - xorq $256+96,%rsi - movl $22,%edi - - movq 0(%rsi),%r8 - xorq %r9,%r9 - movq 48(%rsi),%r10 - xorq %r11,%r11 - call __inner_loop_62 - - - - - - - - leaq 96(%rsi),%rsi - - - - - - movq %r12,%rdx - movq %r13,%rcx - movq 32(%rsp),%rdi - call __smulq_767x63 - - movq 40(%rsp),%rsi - movq %rax,%rdx - sarq $63,%rax - - movq %rax,%r8 - movq %rax,%r9 - movq %rax,%r10 - andq 0(%rsi),%r8 - andq 8(%rsi),%r9 - movq %rax,%r11 - andq 16(%rsi),%r10 - andq 24(%rsi),%r11 - movq %rax,%r12 - andq 32(%rsi),%r12 - andq 40(%rsi),%rax - - addq %r8,%r14 - adcq %r9,%r15 - adcq %r10,%rbx - adcq %r11,%rbp - adcq %r12,%rcx - adcq %rax,%rdx - - movq %r14,48(%rdi) - movq %r15,56(%rdi) - movq %rbx,64(%rdi) - movq %rbp,72(%rdi) - movq %rcx,80(%rdi) - movq %rdx,88(%rdi) - - leaq 1112(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -1112-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__smulq_767x63: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - leaq 48(%rsi),%rsi - - xorq %rdx,%rbp - addq %rax,%rbp - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulq %rbp - movq %rax,0(%rdi) - movq %r9,%rax - movq %rdx,%r9 - mulq %rbp - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - movq %r9,8(%rdi) - mulq %rbp - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - movq %r10,16(%rdi) - mulq %rbp - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - movq %r11,24(%rdi) - mulq %rbp - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - movq %r12,32(%rdi) - imulq %rbp - 
addq %rax,%r13 - adcq $0,%rdx - - movq %r13,40(%rdi) - movq %rdx,48(%rdi) - sarq $63,%rdx - movq %rdx,56(%rdi) - movq %rcx,%rdx - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq 48(%rsi),%r14 - movq 56(%rsi),%r15 - movq 64(%rsi),%rbx - movq 72(%rsi),%rbp - movq 80(%rsi),%rcx - movq 88(%rsi),%rdi - - movq %rdx,%rsi - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rsi - addq %rax,%rsi - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - xorq %rdx,%r14 - xorq %rdx,%r15 - xorq %rdx,%rbx - xorq %rdx,%rbp - xorq %rdx,%rcx - xorq %rdx,%rdi - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - adcq $0,%rbx - adcq $0,%rbp - adcq $0,%rcx - adcq $0,%rdi - - mulq %rsi - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rsi - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rsi - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rsi - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rsi - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - mulq %rsi - addq %rax,%r13 - movq %r14,%rax - adcq $0,%rdx - movq %rdx,%r14 - mulq %rsi - addq %rax,%r14 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r15 - mulq %rsi - addq %rax,%r15 - movq %rbx,%rax - adcq $0,%rdx - movq %rdx,%rbx - mulq %rsi - addq %rax,%rbx - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rbp - mulq %rsi - addq %rax,%rbp - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%rcx - mulq %rsi - addq %rax,%rcx - movq %rdi,%rax - adcq $0,%rdx - movq %rdx,%rdi - movq 8(%rsp),%rdx - imulq %rsi,%rax - movq 16(%rsp),%rsi - addq %rdi,%rax - - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - adcq 16(%rdx),%r10 - adcq 24(%rdx),%r11 - adcq 32(%rdx),%r12 - adcq 40(%rdx),%r13 - adcq 48(%rdx),%r14 - movq 56(%rdx),%rdi - adcq %rdi,%r15 - adcq %rdi,%rbx - adcq %rdi,%rbp - adcq %rdi,%rcx - adcq %rdi,%rax - - movq %rdx,%rdi - - movq %r8,0(%rdx) - movq %r9,8(%rdx) - movq %r10,16(%rdx) - movq %r11,24(%rdx) - movq %r12,32(%rdx) - movq %r13,40(%rdx) - movq %r14,48(%rdx) - movq %r15,56(%rdx) - movq %rbx,64(%rdx) - movq %rbp,72(%rdx) - movq %rcx,80(%rdx) - movq %rax,88(%rdx) - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__smulq_383x63: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbp - addq %rax,%rbp - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulq %rbp - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbp - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbp - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbp - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbp - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - imulq %rbp,%rax - addq %rax,%r13 - - leaq 48(%rsi),%rsi - movq %rcx,%rdx - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbp - 
sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbp - addq %rax,%rbp - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulq %rbp - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbp - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbp - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbp - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbp - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - imulq %rbp,%rax - addq %rax,%r13 - - leaq -48(%rsi),%rsi - - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__smulq_383_n_shift_by_62: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq %rdx,%rbx - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbp - addq %rax,%rbp - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulq %rbp - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbp - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbp - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbp - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbp - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - imulq %rbp - addq %rax,%r13 - adcq $0,%rdx - - leaq 48(%rsi),%rsi - movq %rdx,%r14 - movq %rcx,%rdx - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rdx - xorq %rax,%rax - subq %rdx,%rax - - xorq %rdx,%rbp - addq %rax,%rbp - - xorq %rdx,%r8 - xorq %rdx,%r9 - xorq %rdx,%r10 - xorq %rdx,%r11 - xorq %rdx,%r12 - xorq %rdx,%r13 - addq %r8,%rax - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulq %rbp - movq %rax,%r8 - movq %r9,%rax - movq %rdx,%r9 - mulq %rbp - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r10 - mulq %rbp - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r11 - mulq %rbp - addq %rax,%r11 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r12 - mulq %rbp - addq %rax,%r12 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r13 - imulq %rbp - addq %rax,%r13 - adcq $0,%rdx - - leaq -48(%rsi),%rsi - - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq %rdx,%r14 - movq %rbx,%rdx - - shrdq $62,%r9,%r8 - shrdq $62,%r10,%r9 - shrdq $62,%r11,%r10 - shrdq $62,%r12,%r11 - shrdq $62,%r13,%r12 - shrdq $62,%r14,%r13 - - sarq $63,%r14 - xorq %rbp,%rbp - subq %r14,%rbp - - xorq %r14,%r8 - xorq %r14,%r9 - xorq %r14,%r10 - xorq %r14,%r11 - xorq %r14,%r12 - xorq %r14,%r13 - addq %rbp,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - 
movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - xorq %r14,%rdx - xorq %r14,%rcx - addq %rbp,%rdx - addq %rbp,%rcx - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__ab_approximation_62: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 40(%rsi),%r9 - movq 88(%rsi),%r11 - movq 32(%rsi),%rbx - movq 80(%rsi),%rbp - movq 24(%rsi),%r8 - movq 72(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - cmovzq %r10,%rbp - movq 16(%rsi),%r8 - movq 64(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - cmovzq %r10,%rbp - movq 8(%rsi),%r8 - movq 56(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - cmovzq %r10,%rbp - movq 0(%rsi),%r8 - movq 48(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - bsrq %rax,%rcx - leaq 1(%rcx),%rcx - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %rax,%rcx - negq %rcx - - - shldq %cl,%rbx,%r9 - shldq %cl,%rbp,%r11 - - jmp __inner_loop_62 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 3 -.long 0 -__inner_loop_62: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq $1,%rdx - xorq %rcx,%rcx - xorq %r12,%r12 - movq $1,%r13 - movq %rsi,8(%rsp) - -L$oop_62: - xorq %rax,%rax - xorq %rbx,%rbx - testq $1,%r8 - movq %r10,%rbp - movq %r11,%r14 - cmovnzq %r10,%rax - cmovnzq %r11,%rbx - subq %r8,%rbp - sbbq %r9,%r14 - movq %r8,%r15 - movq %r9,%rsi - subq %rax,%r8 - sbbq %rbx,%r9 - cmovcq %rbp,%r8 - cmovcq %r14,%r9 - cmovcq %r15,%r10 - cmovcq %rsi,%r11 - movq %rdx,%rax - cmovcq %r12,%rdx - cmovcq %rax,%r12 - movq %rcx,%rbx - cmovcq %r13,%rcx - cmovcq %rbx,%r13 - xorq %rax,%rax - xorq %rbx,%rbx - shrdq $1,%r9,%r8 - shrq $1,%r9 - testq $1,%r15 - cmovnzq %r12,%rax - cmovnzq %r13,%rbx - addq %r12,%r12 - addq %r13,%r13 - subq %rax,%rdx - subq %rbx,%rcx - subl $1,%edi - jnz L$oop_62 - - movq 8(%rsp),%rsi - .byte 0xf3,0xc3 -.cfi_endproc - diff --git a/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s deleted file mode 100644 index 3f999075813..00000000000 --- a/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s +++ /dev/null @@ -1,1568 +0,0 @@ -.text - -.globl _ctx_inverse_mod_383 -.private_extern _ctx_inverse_mod_383 - -.p2align 5 -_ctx_inverse_mod_383: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -ct_inverse_mod_383$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $1112,%rsp -.cfi_adjust_cfa_offset 1112 - - - leaq 88+511(%rsp),%rax - andq $-512,%rax - movq %rdi,32(%rsp) - movq %rcx,40(%rsp) - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq 0(%rdx),%r14 - movq 8(%rdx),%r15 - movq 16(%rdx),%rbx - movq 24(%rdx),%rbp - movq 32(%rdx),%rsi - movq 40(%rdx),%rdi - - movq %r8,0(%rax) - movq %r9,8(%rax) - movq %r10,16(%rax) - movq %r11,24(%rax) - movq %r12,32(%rax) - movq %r13,40(%rax) - - movq %r14,48(%rax) - movq %r15,56(%rax) - movq %rbx,64(%rax) - movq %rbp,72(%rax) - movq %rsi,80(%rax) - movq %rax,%rsi - movq %rdi,88(%rax) - - - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi 
- xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - - - movq %rdx,96(%rdi) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - - - movq %rdx,96(%rdi) - - - xorq $256,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - - - - movq 96(%rsi),%rax - movq 144(%rsi),%r11 - movq %rdx,%rbx - movq %rax,%r10 - imulq 56(%rsp) - movq %rax,%r8 - movq %r11,%rax - movq %rdx,%r9 - imulq 64(%rsp) - addq %rax,%r8 - adcq %rdx,%r9 - movq %r8,48(%rdi) - movq %r9,56(%rdi) - sarq $63,%r9 - movq %r9,64(%rdi) - movq %r9,72(%rdi) - movq %r9,80(%rdi) - movq %r9,88(%rdi) - leaq 96(%rsi),%rsi - - movq %r10,%rax - imulq %rbx - movq %rax,%r8 - movq %r11,%rax - movq %rdx,%r9 - imulq %rcx - addq %rax,%r8 - adcq %rdx,%r9 - movq %r8,96(%rdi) - movq %r9,104(%rdi) - sarq $63,%r9 - movq %r9,112(%rdi) - movq %r9,120(%rdi) - movq %r9,128(%rdi) - movq %r9,136(%rdi) - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq 
%rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383x63 - sarq $63,%r13 - movq %r13,48(%rdi) - movq %r13,56(%rdi) - movq %r13,64(%rdi) - movq %r13,72(%rdi) - movq %r13,80(%rdi) - movq %r13,88(%rdi) - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 
80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_383_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call 
__smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_191_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_191_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_191_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_191_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_191_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_191_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - xorq $256+96,%rsi - movl $31,%edi - call __ab_approximation_31 - - - movq %r12,72(%rsp) - movq %r13,80(%rsp) - - movq $256,%rdi - xorq %rsi,%rdi - call __smulx_191_n_shift_by_31 - movq %rdx,56(%rsp) - movq %rcx,64(%rsp) - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_191_n_shift_by_31 - movq %rdx,72(%rsp) - movq %rcx,80(%rsp) - - movq 56(%rsp),%rdx - movq 64(%rsp),%rcx - leaq 96(%rsi),%rsi - leaq 48(%rdi),%rdi - call __smulx_383x63 - - movq 72(%rsp),%rdx - movq 80(%rsp),%rcx - leaq 48(%rdi),%rdi - call __smulx_767x63 - - xorq $256+96,%rsi - movl $53,%edi - - movq 0(%rsi),%r8 - - movq 48(%rsi),%r10 - - call __tail_loop_53 - - - - - - - - leaq 96(%rsi),%rsi - - - - - - movq %r12,%rdx - movq %r13,%rcx - movq 32(%rsp),%rdi - call __smulx_767x63 - - movq 40(%rsp),%rsi - movq %rax,%rdx - sarq $63,%rax - - movq %rax,%r8 - movq %rax,%r9 - movq %rax,%r10 - andq 0(%rsi),%r8 - andq 8(%rsi),%r9 - movq %rax,%r11 - andq 16(%rsi),%r10 - andq 24(%rsi),%r11 - movq %rax,%r12 - andq 32(%rsi),%r12 - andq 40(%rsi),%rax - - addq %r8,%r14 - adcq %r9,%r15 - adcq %r10,%rbx - adcq %r11,%rbp - adcq %r12,%rcx - adcq %rax,%rdx - - movq %r14,48(%rdi) - movq %r15,56(%rdi) - movq %rbx,64(%rdi) - movq %rbp,72(%rdi) - movq %rcx,80(%rdi) - movq %rdx,88(%rdi) - - leaq 1112(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -1112-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__smulx_767x63: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 
- movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rdx,%rax - sarq $63,%rax - xorq %rbp,%rbp - subq %rax,%rbp - - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - leaq 48(%rsi),%rsi - - xorq %rax,%rdx - addq %rbp,%rdx - - xorq %rax,%r8 - xorq %rax,%r9 - xorq %rax,%r10 - xorq %rax,%r11 - xorq %rax,%r12 - xorq %r13,%rax - addq %rbp,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%rax - - mulxq %r8,%r8,%rbp - mulxq %r9,%r9,%r13 - addq %rbp,%r9 - mulxq %r10,%r10,%rbp - adcq %r13,%r10 - mulxq %r11,%r11,%r13 - adcq %rbp,%r11 - mulxq %r12,%r12,%rbp - adcq %r13,%r12 - adcq $0,%rbp - imulq %rdx - addq %rbp,%rax - adcq $0,%rdx - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %rax,40(%rdi) - movq %rdx,48(%rdi) - sarq $63,%rdx - movq %rdx,56(%rdi) - movq %rcx,%rdx - movq %rcx,%rax - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq 48(%rsi),%r14 - movq 56(%rsi),%r15 - movq 64(%rsi),%rbx - movq 72(%rsi),%rbp - movq 80(%rsi),%rcx - movq 88(%rsi),%rdi - - sarq $63,%rax - xorq %rsi,%rsi - subq %rax,%rsi - - xorq %rax,%rdx - addq %rsi,%rdx - - xorq %rax,%r8 - xorq %rax,%r9 - xorq %rax,%r10 - xorq %rax,%r11 - xorq %rax,%r12 - xorq %rax,%r13 - xorq %rax,%r14 - xorq %rax,%r15 - xorq %rax,%rbx - xorq %rax,%rbp - xorq %rax,%rcx - xorq %rax,%rdi - addq %rsi,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - adcq $0,%rbx - adcq $0,%rbp - adcq $0,%rcx - adcq $0,%rdi - - mulxq %r8,%r8,%rax - mulxq %r9,%r9,%rsi - addq %rax,%r9 - mulxq %r10,%r10,%rax - adcq %rsi,%r10 - mulxq %r11,%r11,%rsi - adcq %rax,%r11 - mulxq %r12,%r12,%rax - adcq %rsi,%r12 - mulxq %r13,%r13,%rsi - adcq %rax,%r13 - mulxq %r14,%r14,%rax - adcq %rsi,%r14 - mulxq %r15,%r15,%rsi - adcq %rax,%r15 - mulxq %rbx,%rbx,%rax - adcq %rsi,%rbx - mulxq %rbp,%rbp,%rsi - adcq %rax,%rbp - mulxq %rcx,%rcx,%rax - adcq %rsi,%rcx - mulxq %rdi,%rdi,%rsi - movq 8(%rsp),%rdx - movq 16(%rsp),%rsi - adcq %rdi,%rax - - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - adcq 16(%rdx),%r10 - adcq 24(%rdx),%r11 - adcq 32(%rdx),%r12 - adcq 40(%rdx),%r13 - adcq 48(%rdx),%r14 - movq 56(%rdx),%rdi - adcq %rdi,%r15 - adcq %rdi,%rbx - adcq %rdi,%rbp - adcq %rdi,%rcx - adcq %rdi,%rax - - movq %rdx,%rdi - - movq %r8,0(%rdx) - movq %r9,8(%rdx) - movq %r10,16(%rdx) - movq %r11,24(%rdx) - movq %r12,32(%rdx) - movq %r13,40(%rdx) - movq %r14,48(%rdx) - movq %r15,56(%rdx) - movq %rbx,64(%rdx) - movq %rbp,72(%rdx) - movq %rcx,80(%rdx) - movq %rax,88(%rdx) - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__smulx_383x63: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0+0(%rsi),%r8 - movq 0+8(%rsi),%r9 - movq 0+16(%rsi),%r10 - movq 0+24(%rsi),%r11 - movq 0+32(%rsi),%r12 - movq 0+40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rbp - xorq %rax,%rax - subq %rbp,%rax - - xorq %rbp,%rdx - addq %rax,%rdx - - xorq %rbp,%r8 - xorq %rbp,%r9 - xorq %rbp,%r10 - xorq %rbp,%r11 - xorq %rbp,%r12 - xorq %rbp,%r13 - addq %rax,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulxq %r8,%r8,%rbp - mulxq %r9,%r9,%rax - addq %rbp,%r9 - mulxq %r10,%r10,%rbp - adcq %rax,%r10 - mulxq %r11,%r11,%rax - adcq %rbp,%r11 - mulxq %r12,%r12,%rbp - adcq %rax,%r12 - mulxq %r13,%r13,%rax - movq %rcx,%rdx - adcq %rbp,%r13 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq 48+0(%rsi),%r8 - movq 48+8(%rsi),%r9 - 
movq 48+16(%rsi),%r10 - movq 48+24(%rsi),%r11 - movq 48+32(%rsi),%r12 - movq 48+40(%rsi),%r13 - - movq %rdx,%rbp - sarq $63,%rbp - xorq %rax,%rax - subq %rbp,%rax - - xorq %rbp,%rdx - addq %rax,%rdx - - xorq %rbp,%r8 - xorq %rbp,%r9 - xorq %rbp,%r10 - xorq %rbp,%r11 - xorq %rbp,%r12 - xorq %rbp,%r13 - addq %rax,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - - mulxq %r8,%r8,%rbp - mulxq %r9,%r9,%rax - addq %rbp,%r9 - mulxq %r10,%r10,%rbp - adcq %rax,%r10 - mulxq %r11,%r11,%rax - adcq %rbp,%r11 - mulxq %r12,%r12,%rbp - adcq %rax,%r12 - mulxq %r13,%r13,%rax - adcq %rbp,%r13 - - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__smulx_383_n_shift_by_31: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq %rdx,%rbx - xorq %r14,%r14 - movq 0+0(%rsi),%r8 - movq 0+8(%rsi),%r9 - movq 0+16(%rsi),%r10 - movq 0+24(%rsi),%r11 - movq 0+32(%rsi),%r12 - movq 0+40(%rsi),%r13 - - movq %rdx,%rax - sarq $63,%rax - xorq %rbp,%rbp - subq %rax,%rbp - - xorq %rax,%rdx - addq %rbp,%rdx - - xorq %rax,%r8 - xorq %rax,%r9 - xorq %rax,%r10 - xorq %rax,%r11 - xorq %rax,%r12 - xorq %r13,%rax - addq %rbp,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%rax - - mulxq %r8,%r8,%rbp - mulxq %r9,%r9,%r13 - addq %rbp,%r9 - mulxq %r10,%r10,%rbp - adcq %r13,%r10 - mulxq %r11,%r11,%r13 - adcq %rbp,%r11 - mulxq %r12,%r12,%rbp - adcq %r13,%r12 - adcq $0,%rbp - imulq %rdx - addq %rbp,%rax - adcq %rdx,%r14 - - movq %rcx,%rdx - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %rax,40(%rdi) - movq 48+0(%rsi),%r8 - movq 48+8(%rsi),%r9 - movq 48+16(%rsi),%r10 - movq 48+24(%rsi),%r11 - movq 48+32(%rsi),%r12 - movq 48+40(%rsi),%r13 - - movq %rdx,%rax - sarq $63,%rax - xorq %rbp,%rbp - subq %rax,%rbp - - xorq %rax,%rdx - addq %rbp,%rdx - - xorq %rax,%r8 - xorq %rax,%r9 - xorq %rax,%r10 - xorq %rax,%r11 - xorq %rax,%r12 - xorq %r13,%rax - addq %rbp,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%rax - - mulxq %r8,%r8,%rbp - mulxq %r9,%r9,%r13 - addq %rbp,%r9 - mulxq %r10,%r10,%rbp - adcq %r13,%r10 - mulxq %r11,%r11,%r13 - adcq %rbp,%r11 - mulxq %r12,%r12,%rbp - adcq %r13,%r12 - adcq $0,%rbp - imulq %rdx - addq %rbp,%rax - adcq $0,%rdx - - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%rax - adcq %rdx,%r14 - movq %rbx,%rdx - - shrdq $31,%r9,%r8 - shrdq $31,%r10,%r9 - shrdq $31,%r11,%r10 - shrdq $31,%r12,%r11 - shrdq $31,%rax,%r12 - shrdq $31,%r14,%rax - - sarq $63,%r14 - xorq %rbp,%rbp - subq %r14,%rbp - - xorq %r14,%r8 - xorq %r14,%r9 - xorq %r14,%r10 - xorq %r14,%r11 - xorq %r14,%r12 - xorq %r14,%rax - addq %rbp,%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%rax - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %rax,40(%rdi) - - xorq %r14,%rdx - xorq %r14,%rcx - addq %rbp,%rdx - addq %rbp,%rcx - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__smulx_191_n_shift_by_31: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq %rdx,%rbx - movq 0+0(%rsi),%r8 - movq 0+8(%rsi),%r9 - movq 0+16(%rsi),%r10 - - movq %rdx,%rax - sarq $63,%rax - xorq %rbp,%rbp - subq %rax,%rbp - - xorq %rax,%rdx - addq %rbp,%rdx - - 
xorq %rax,%r8 - xorq %rax,%r9 - xorq %r10,%rax - addq %rbp,%r8 - adcq $0,%r9 - adcq $0,%rax - - mulxq %r8,%r8,%rbp - mulxq %r9,%r9,%r10 - addq %rbp,%r9 - adcq $0,%r10 - imulq %rdx - addq %rax,%r10 - adcq $0,%rdx - movq %rdx,%r14 - movq %rcx,%rdx - movq 48+0(%rsi),%r11 - movq 48+8(%rsi),%r12 - movq 48+16(%rsi),%r13 - - movq %rdx,%rax - sarq $63,%rax - xorq %rbp,%rbp - subq %rax,%rbp - - xorq %rax,%rdx - addq %rbp,%rdx - - xorq %rax,%r11 - xorq %rax,%r12 - xorq %r13,%rax - addq %rbp,%r11 - adcq $0,%r12 - adcq $0,%rax - - mulxq %r11,%r11,%rbp - mulxq %r12,%r12,%r13 - addq %rbp,%r12 - adcq $0,%r13 - imulq %rdx - addq %rax,%r13 - adcq $0,%rdx - addq %r8,%r11 - adcq %r9,%r12 - adcq %r10,%r13 - adcq %rdx,%r14 - movq %rbx,%rdx - - shrdq $31,%r12,%r11 - shrdq $31,%r13,%r12 - shrdq $31,%r14,%r13 - - sarq $63,%r14 - xorq %rbp,%rbp - subq %r14,%rbp - - xorq %r14,%r11 - xorq %r14,%r12 - xorq %r14,%r13 - addq %rbp,%r11 - adcq $0,%r12 - adcq $0,%r13 - - movq %r11,0(%rdi) - movq %r12,8(%rdi) - movq %r13,16(%rdi) - - xorq %r14,%rdx - xorq %r14,%rcx - addq %rbp,%rdx - addq %rbp,%rcx - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__ab_approximation_31: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 40(%rsi),%r9 - movq 88(%rsi),%r11 - movq 32(%rsi),%rbx - movq 80(%rsi),%rbp - movq 24(%rsi),%r8 - movq 72(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - movq 16(%rsi),%r8 - cmovzq %r10,%rbp - movq 64(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - movq 8(%rsi),%r8 - cmovzq %r10,%rbp - movq 56(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - movq 0(%rsi),%r8 - cmovzq %r10,%rbp - movq 48(%rsi),%r10 - - movq %r9,%rax - orq %r11,%rax - cmovzq %rbx,%r9 - cmovzq %rbp,%r11 - cmovzq %r8,%rbx - cmovzq %r10,%rbp - - movq %r9,%rax - orq %r11,%rax - bsrq %rax,%rcx - leaq 1(%rcx),%rcx - cmovzq %r8,%r9 - cmovzq %r10,%r11 - cmovzq %rax,%rcx - negq %rcx - - - shldq %cl,%rbx,%r9 - shldq %cl,%rbp,%r11 - - movl $0x7FFFFFFF,%eax - andq %rax,%r8 - andq %rax,%r10 - andnq %r9,%rax,%r9 - andnq %r11,%rax,%r11 - orq %r9,%r8 - orq %r11,%r10 - - jmp __inner_loop_31 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__inner_loop_31: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq $0x7FFFFFFF80000000,%rcx - movq $0x800000007FFFFFFF,%r13 - movq $0x7FFFFFFF7FFFFFFF,%r15 - -L$oop_31: - cmpq %r10,%r8 - movq %r8,%rax - movq %r10,%rbx - movq %rcx,%rbp - movq %r13,%r14 - cmovbq %r10,%r8 - cmovbq %rax,%r10 - cmovbq %r13,%rcx - cmovbq %rbp,%r13 - - subq %r10,%r8 - subq %r13,%rcx - addq %r15,%rcx - - testq $1,%rax - cmovzq %rax,%r8 - cmovzq %rbx,%r10 - cmovzq %rbp,%rcx - cmovzq %r14,%r13 - - shrq $1,%r8 - addq %r13,%r13 - subq %r15,%r13 - subl $1,%edi - jnz L$oop_31 - - shrq $32,%r15 - movl %ecx,%edx - movl %r13d,%r12d - shrq $32,%rcx - shrq $32,%r13 - subq %r15,%rdx - subq %r15,%rcx - subq %r15,%r12 - subq %r15,%r13 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__tail_loop_53: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq $1,%rdx - xorq %rcx,%rcx - xorq %r12,%r12 - movq $1,%r13 - -L$oop_53: - xorq %rax,%rax - testq $1,%r8 - movq %r10,%rbx - cmovnzq %r10,%rax - subq %r8,%rbx - movq %r8,%rbp - subq %rax,%r8 - cmovcq %rbx,%r8 - cmovcq %rbp,%r10 - movq %rdx,%rax - cmovcq %r12,%rdx - cmovcq %rax,%r12 - movq %rcx,%rbx - cmovcq %r13,%rcx - cmovcq %rbx,%r13 - xorq %rax,%rax - xorq %rbx,%rbx - shrq $1,%r8 - testq $1,%rbp - cmovnzq %r12,%rax - cmovnzq %r13,%rbx - addq 
%r12,%r12 - addq %r13,%r13 - subq %rax,%rdx - subq %rbx,%rcx - subl $1,%edi - jnz L$oop_53 - - .byte 0xf3,0xc3 -.cfi_endproc - diff --git a/crypto/blst_src/build/mach-o/div3w-armv8.S b/crypto/blst_src/build/mach-o/div3w-armv8.S deleted file mode 100644 index 4b130080123..00000000000 --- a/crypto/blst_src/build/mach-o/div3w-armv8.S +++ /dev/null @@ -1,88 +0,0 @@ -.text - -.globl _div_3_limbs - -.align 5 -_div_3_limbs: - ldp x4,x5,[x0] // load R - eor x0,x0,x0 // Q = 0 - mov x3,#64 // loop counter - nop - -Loop: - subs x6,x4,x1 // R - D - add x0,x0,x0 // Q <<= 1 - sbcs x7,x5,x2 - add x0,x0,#1 // Q + speculative bit - csel x4,x4,x6,lo // select between R and R - D - extr x1,x2,x1,#1 // D >>= 1 - csel x5,x5,x7,lo - lsr x2,x2,#1 - sbc x0,x0,xzr // subtract speculative bit - sub x3,x3,#1 - cbnz x3,Loop - - asr x3,x0,#63 // top bit -> mask - add x0,x0,x0 // Q <<= 1 - subs x6,x4,x1 // R - D - add x0,x0,#1 // Q + speculative bit - sbcs x7,x5,x2 - sbc x0,x0,xzr // subtract speculative bit - - orr x0,x0,x3 // all ones if overflow - - ret - -.globl _quot_rem_128 - -.align 5 -_quot_rem_128: - ldp x3,x4,[x1] - - mul x5,x3,x2 // divisor[0:1} * quotient - umulh x6,x3,x2 - mul x11, x4,x2 - umulh x7,x4,x2 - - ldp x8,x9,[x0] // load 3 limbs of the dividend - ldr x10,[x0,#16] - - adds x6,x6,x11 - adc x7,x7,xzr - - subs x8,x8,x5 // dividend - divisor * quotient - sbcs x9,x9,x6 - sbcs x10,x10,x7 - sbc x5,xzr,xzr // borrow -> mask - - add x2,x2,x5 // if borrowed, adjust the quotient ... - and x3,x3,x5 - and x4,x4,x5 - adds x8,x8,x3 // ... and add divisor - adc x9,x9,x4 - - stp x8,x9,[x0] // save 2 limbs of the remainder - str x2,[x0,#16] // and one limb of the quotient - - mov x0,x2 // return adjusted quotient - - ret - - -.globl _quot_rem_64 - -.align 5 -_quot_rem_64: - ldr x3,[x1] - ldr x8,[x0] // load 1 limb of the dividend - - mul x5,x3,x2 // divisor * quotient - - sub x8,x8,x5 // dividend - divisor * quotient - - stp x8,x2,[x0] // save remainder and quotient - - mov x0,x2 // return quotient - - ret - diff --git a/crypto/blst_src/build/mach-o/div3w-x86_64.s b/crypto/blst_src/build/mach-o/div3w-x86_64.s deleted file mode 100644 index 99a94d50a2b..00000000000 --- a/crypto/blst_src/build/mach-o/div3w-x86_64.s +++ /dev/null @@ -1,124 +0,0 @@ -.text - -.globl _div_3_limbs -.private_extern _div_3_limbs - -.p2align 5 -_div_3_limbs: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - - movq (%rdi),%r8 - movq 8(%rdi),%r9 - xorq %rax,%rax - movl $64,%ecx - -L$oop: - movq %r8,%r10 - subq %rsi,%r8 - movq %r9,%r11 - sbbq %rdx,%r9 - leaq 1(%rax,%rax,1),%rax - movq %rdx,%rdi - cmovcq %r10,%r8 - cmovcq %r11,%r9 - sbbq $0,%rax - shlq $63,%rdi - shrq $1,%rsi - shrq $1,%rdx - orq %rdi,%rsi - subl $1,%ecx - jnz L$oop - - leaq 1(%rax,%rax,1),%rcx - sarq $63,%rax - - subq %rsi,%r8 - sbbq %rdx,%r9 - sbbq $0,%rcx - - orq %rcx,%rax - - - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _quot_rem_128 -.private_extern _quot_rem_128 - -.p2align 5 -_quot_rem_128: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - - movq %rdx,%rax - movq %rdx,%rcx - - mulq 0(%rsi) - movq %rax,%r8 - movq %rcx,%rax - movq %rdx,%r9 - - mulq 8(%rsi) - addq %rax,%r9 - adcq $0,%rdx - - movq 0(%rdi),%r10 - movq 8(%rdi),%r11 - movq 16(%rdi),%rax - - subq %r8,%r10 - sbbq %r9,%r11 - sbbq %rdx,%rax - sbbq %r8,%r8 - - addq %r8,%rcx - movq %r8,%r9 - andq 0(%rsi),%r8 - andq 8(%rsi),%r9 - addq %r8,%r10 - adcq %r9,%r11 - - movq %r10,0(%rdi) - movq %r11,8(%rdi) - movq %rcx,16(%rdi) - - movq %rcx,%rax - - - .byte 0xf3,0xc3 -.cfi_endproc - - - - - - -.globl _quot_rem_64 
-.private_extern _quot_rem_64 - -.p2align 5 -_quot_rem_64: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - - movq %rdx,%rax - imulq 0(%rsi),%rdx - - movq 0(%rdi),%r10 - - subq %rdx,%r10 - - movq %r10,0(%rdi) - movq %rax,8(%rdi) - - - .byte 0xf3,0xc3 -.cfi_endproc - diff --git a/crypto/blst_src/build/mach-o/mul_mont_256-armv8.S b/crypto/blst_src/build/mach-o/mul_mont_256-armv8.S deleted file mode 100644 index 4f506b58b0f..00000000000 --- a/crypto/blst_src/build/mach-o/mul_mont_256-armv8.S +++ /dev/null @@ -1,464 +0,0 @@ -.text - -.globl _mul_mont_sparse_256 -.private_extern _mul_mont_sparse_256 - -.align 5 -_mul_mont_sparse_256: - stp x29,x30,[sp,#-64]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldp x10,x11,[x1] - ldr x9, [x2] - ldp x12,x13,[x1,#16] - - mul x19,x10,x9 - ldp x5,x6,[x3] - mul x20,x11,x9 - ldp x7,x8,[x3,#16] - mul x21,x12,x9 - mul x22,x13,x9 - - umulh x14,x10,x9 - umulh x15,x11,x9 - mul x3,x4,x19 - umulh x16,x12,x9 - umulh x17,x13,x9 - adds x20,x20,x14 - //mul x14,x5,x3 - adcs x21,x21,x15 - mul x15,x6,x3 - adcs x22,x22,x16 - mul x16,x7,x3 - adc x23,xzr, x17 - mul x17,x8,x3 - ldr x9,[x2,8*1] - subs xzr,x19,#1 //adds x19,x19,x14 - umulh x14,x5,x3 - adcs x20,x20,x15 - umulh x15,x6,x3 - adcs x21,x21,x16 - umulh x16,x7,x3 - adcs x22,x22,x17 - umulh x17,x8,x3 - adc x23,x23,xzr - - adds x19,x20,x14 - mul x14,x10,x9 - adcs x20,x21,x15 - mul x15,x11,x9 - adcs x21,x22,x16 - mul x16,x12,x9 - adcs x22,x23,x17 - mul x17,x13,x9 - adc x23,xzr,xzr - - adds x19,x19,x14 - umulh x14,x10,x9 - adcs x20,x20,x15 - umulh x15,x11,x9 - adcs x21,x21,x16 - mul x3,x4,x19 - umulh x16,x12,x9 - adcs x22,x22,x17 - umulh x17,x13,x9 - adc x23,x23,xzr - - adds x20,x20,x14 - //mul x14,x5,x3 - adcs x21,x21,x15 - mul x15,x6,x3 - adcs x22,x22,x16 - mul x16,x7,x3 - adc x23,x23,x17 - mul x17,x8,x3 - ldr x9,[x2,8*2] - subs xzr,x19,#1 //adds x19,x19,x14 - umulh x14,x5,x3 - adcs x20,x20,x15 - umulh x15,x6,x3 - adcs x21,x21,x16 - umulh x16,x7,x3 - adcs x22,x22,x17 - umulh x17,x8,x3 - adc x23,x23,xzr - - adds x19,x20,x14 - mul x14,x10,x9 - adcs x20,x21,x15 - mul x15,x11,x9 - adcs x21,x22,x16 - mul x16,x12,x9 - adcs x22,x23,x17 - mul x17,x13,x9 - adc x23,xzr,xzr - - adds x19,x19,x14 - umulh x14,x10,x9 - adcs x20,x20,x15 - umulh x15,x11,x9 - adcs x21,x21,x16 - mul x3,x4,x19 - umulh x16,x12,x9 - adcs x22,x22,x17 - umulh x17,x13,x9 - adc x23,x23,xzr - - adds x20,x20,x14 - //mul x14,x5,x3 - adcs x21,x21,x15 - mul x15,x6,x3 - adcs x22,x22,x16 - mul x16,x7,x3 - adc x23,x23,x17 - mul x17,x8,x3 - ldr x9,[x2,8*3] - subs xzr,x19,#1 //adds x19,x19,x14 - umulh x14,x5,x3 - adcs x20,x20,x15 - umulh x15,x6,x3 - adcs x21,x21,x16 - umulh x16,x7,x3 - adcs x22,x22,x17 - umulh x17,x8,x3 - adc x23,x23,xzr - - adds x19,x20,x14 - mul x14,x10,x9 - adcs x20,x21,x15 - mul x15,x11,x9 - adcs x21,x22,x16 - mul x16,x12,x9 - adcs x22,x23,x17 - mul x17,x13,x9 - adc x23,xzr,xzr - - adds x19,x19,x14 - umulh x14,x10,x9 - adcs x20,x20,x15 - umulh x15,x11,x9 - adcs x21,x21,x16 - mul x3,x4,x19 - umulh x16,x12,x9 - adcs x22,x22,x17 - umulh x17,x13,x9 - adc x23,x23,xzr - - adds x20,x20,x14 - //mul x14,x5,x3 - adcs x21,x21,x15 - mul x15,x6,x3 - adcs x22,x22,x16 - mul x16,x7,x3 - adc x23,x23,x17 - mul x17,x8,x3 - subs xzr,x19,#1 //adds x19,x19,x14 - umulh x14,x5,x3 - adcs x20,x20,x15 - umulh x15,x6,x3 - adcs x21,x21,x16 - umulh x16,x7,x3 - adcs x22,x22,x17 - umulh x17,x8,x3 - adc x23,x23,xzr - - adds x19,x20,x14 - adcs x20,x21,x15 - adcs x21,x22,x16 - adcs x22,x23,x17 - adc x23,xzr,xzr - - subs x14,x19,x5 - sbcs 
x15,x20,x6 - sbcs x16,x21,x7 - sbcs x17,x22,x8 - sbcs xzr, x23,xzr - - csel x19,x19,x14,lo - csel x20,x20,x15,lo - csel x21,x21,x16,lo - csel x22,x22,x17,lo - - stp x19,x20,[x0] - stp x21,x22,[x0,#16] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 - ret - -.globl _sqr_mont_sparse_256 -.private_extern _sqr_mont_sparse_256 - -.align 5 -_sqr_mont_sparse_256: -.long 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x5,x6,[x1] - ldp x7,x8,[x1,#16] - mov x4,x3 - - //////////////////////////////////////////////////////////////// - // | | | | | |a1*a0| | - // | | | | |a2*a0| | | - // | |a3*a2|a3*a0| | | | - // | | | |a2*a1| | | | - // | | |a3*a1| | | | | - // *| | | | | | | | 2| - // +|a3*a3|a2*a2|a1*a1|a0*a0| - // |--+--+--+--+--+--+--+--| - // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 - // - // "can't overflow" below mark carrying into high part of - // multiplication result, which can't overflow, because it - // can never be all ones. - - mul x11,x6,x5 // a[1]*a[0] - umulh x15,x6,x5 - mul x12,x7,x5 // a[2]*a[0] - umulh x16,x7,x5 - mul x13,x8,x5 // a[3]*a[0] - umulh x19,x8,x5 - - adds x12,x12,x15 // accumulate high parts of multiplication - mul x14,x7,x6 // a[2]*a[1] - umulh x15,x7,x6 - adcs x13,x13,x16 - mul x16,x8,x6 // a[3]*a[1] - umulh x17,x8,x6 - adc x19,x19,xzr // can't overflow - - mul x20,x8,x7 // a[3]*a[2] - umulh x21,x8,x7 - - adds x15,x15,x16 // accumulate high parts of multiplication - mul x10,x5,x5 // a[0]*a[0] - adc x16,x17,xzr // can't overflow - - adds x13,x13,x14 // accumulate low parts of multiplication - umulh x5,x5,x5 - adcs x19,x19,x15 - mul x15,x6,x6 // a[1]*a[1] - adcs x20,x20,x16 - umulh x6,x6,x6 - adc x21,x21,xzr // can't overflow - - adds x11,x11,x11 // acc[1-6]*=2 - mul x16,x7,x7 // a[2]*a[2] - adcs x12,x12,x12 - umulh x7,x7,x7 - adcs x13,x13,x13 - mul x17,x8,x8 // a[3]*a[3] - adcs x19,x19,x19 - umulh x8,x8,x8 - adcs x20,x20,x20 - adcs x21,x21,x21 - adc x22,xzr,xzr - - adds x11,x11,x5 // +a[i]*a[i] - adcs x12,x12,x15 - adcs x13,x13,x6 - adcs x19,x19,x16 - adcs x20,x20,x7 - adcs x21,x21,x17 - adc x22,x22,x8 - - bl __mul_by_1_mont_256 - ldr x30,[x29,#8] - - adds x10,x10,x19 // accumulate upper half - adcs x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - adc x19,xzr,xzr - - subs x14,x10,x5 - sbcs x15,x11,x6 - sbcs x16,x12,x7 - sbcs x17,x13,x8 - sbcs xzr, x19,xzr - - csel x10,x10,x14,lo - csel x11,x11,x15,lo - csel x12,x12,x16,lo - csel x13,x13,x17,lo - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 -.long 3573752767 - ret - -.globl _from_mont_256 -.private_extern _from_mont_256 - -.align 5 -_from_mont_256: -.long 3573752639 - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - mov x4,x3 - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - - bl __mul_by_1_mont_256 - ldr x30,[x29,#8] - - subs x14,x10,x5 - sbcs x15,x11,x6 - sbcs x16,x12,x7 - sbcs x17,x13,x8 - - csel x10,x10,x14,lo - csel x11,x11,x15,lo - csel x12,x12,x16,lo - csel x13,x13,x17,lo - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - - ldr x29,[sp],#16 -.long 3573752767 - ret - - -.globl _redc_mont_256 -.private_extern _redc_mont_256 - -.align 5 -_redc_mont_256: -.long 3573752639 - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - - mov x4,x3 - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - - bl __mul_by_1_mont_256 - ldr x30,[x29,#8] - - ldp x14,x15,[x1,#32] - ldp x16,x17,[x1,#48] - - adds x10,x10,x14 - adcs x11,x11,x15 - adcs x12,x12,x16 - adcs x13,x13,x17 - adc x9,xzr,xzr - - subs x14,x10,x5 - sbcs x15,x11,x6 - sbcs x16,x12,x7 - sbcs x17,x13,x8 - sbcs xzr, x9,xzr - - csel x10,x10,x14,lo - csel x11,x11,x15,lo - csel x12,x12,x16,lo - csel x13,x13,x17,lo - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - - ldr x29,[sp],#16 -.long 3573752767 - ret - - - -.align 5 -__mul_by_1_mont_256: - mul x3,x4,x10 - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - //mul x14,x5,x3 - mul x15,x6,x3 - mul x16,x7,x3 - mul x17,x8,x3 - subs xzr,x10,#1 //adds x10,x10,x14 - umulh x14,x5,x3 - adcs x11,x11,x15 - umulh x15,x6,x3 - adcs x12,x12,x16 - umulh x16,x7,x3 - adcs x13,x13,x17 - umulh x17,x8,x3 - adc x9,xzr,xzr - - adds x10,x11,x14 - adcs x11,x12,x15 - adcs x12,x13,x16 - mul x3,x4,x10 - adc x13,x9,x17 - //mul x14,x5,x3 - mul x15,x6,x3 - mul x16,x7,x3 - mul x17,x8,x3 - subs xzr,x10,#1 //adds x10,x10,x14 - umulh x14,x5,x3 - adcs x11,x11,x15 - umulh x15,x6,x3 - adcs x12,x12,x16 - umulh x16,x7,x3 - adcs x13,x13,x17 - umulh x17,x8,x3 - adc x9,xzr,xzr - - adds x10,x11,x14 - adcs x11,x12,x15 - adcs x12,x13,x16 - mul x3,x4,x10 - adc x13,x9,x17 - //mul x14,x5,x3 - mul x15,x6,x3 - mul x16,x7,x3 - mul x17,x8,x3 - subs xzr,x10,#1 //adds x10,x10,x14 - umulh x14,x5,x3 - adcs x11,x11,x15 - umulh x15,x6,x3 - adcs x12,x12,x16 - umulh x16,x7,x3 - adcs x13,x13,x17 - umulh x17,x8,x3 - adc x9,xzr,xzr - - adds x10,x11,x14 - adcs x11,x12,x15 - adcs x12,x13,x16 - mul x3,x4,x10 - adc x13,x9,x17 - //mul x14,x5,x3 - mul x15,x6,x3 - mul x16,x7,x3 - mul x17,x8,x3 - subs xzr,x10,#1 //adds x10,x10,x14 - umulh x14,x5,x3 - adcs x11,x11,x15 - umulh x15,x6,x3 - adcs x12,x12,x16 - umulh x16,x7,x3 - adcs x13,x13,x17 - umulh x17,x8,x3 - adc x9,xzr,xzr - - adds x10,x11,x14 - adcs x11,x12,x15 - adcs x12,x13,x16 - adc x13,x9,x17 - - ret - diff --git a/crypto/blst_src/build/mach-o/mul_mont_384-armv8.S b/crypto/blst_src/build/mach-o/mul_mont_384-armv8.S deleted file mode 100644 index 5aa2e9f3ae7..00000000000 --- a/crypto/blst_src/build/mach-o/mul_mont_384-armv8.S +++ /dev/null @@ -1,2372 +0,0 @@ -.text - -.globl _add_mod_384x384 - -.align 5 -_add_mod_384x384: -.long 3573752639 - stp x29,x30,[sp,#-64]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - bl __add_mod_384x384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 -.long 3573752767 - ret - - - -.align 5 -__add_mod_384x384: - ldp x11, x12, [x1] - ldp x19,x20,[x2] - ldp x13, x14, [x1,#16] - adds x11,x11,x19 - ldp x21,x22,[x2,#16] - adcs x12,x12,x20 - ldp x15, x16, [x1,#32] - adcs x13,x13,x21 - ldp x23,x24,[x2,#32] - adcs x14,x14,x22 - stp x11, x12, [x0] - adcs x15,x15,x23 - ldp x11, x12, [x1,#48] - adcs x16,x16,x24 - - ldp x19,x20,[x2,#48] - stp x13, x14, [x0,#16] - ldp x13, x14, [x1,#64] - ldp x21,x22,[x2,#64] - - adcs x11,x11,x19 - stp x15, x16, [x0,#32] - adcs x12,x12,x20 - ldp x15, x16, [x1,#80] - adcs x13,x13,x21 - ldp x23,x24,[x2,#80] - adcs x14,x14,x22 - adcs x15,x15,x23 - adcs x16,x16,x24 - adc x17,xzr,xzr - - subs x19,x11,x5 - sbcs x20,x12,x6 - sbcs x21,x13,x7 - sbcs x22,x14,x8 - sbcs x23,x15,x9 - sbcs x24,x16,x10 - sbcs xzr,x17,xzr - - csel x11,x11,x19,lo - csel x12,x12,x20,lo - csel x13,x13,x21,lo - csel x14,x14,x22,lo - stp x11,x12,[x0,#48] - csel x15,x15,x23,lo - stp x13,x14,[x0,#64] - csel x16,x16,x24,lo - stp x15,x16,[x0,#80] - - ret - - -.globl _sub_mod_384x384 - -.align 5 -_sub_mod_384x384: -.long 3573752639 - stp x29,x30,[sp,#-64]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - bl __sub_mod_384x384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 -.long 3573752767 - ret - - - -.align 5 -__sub_mod_384x384: - ldp x11, x12, [x1] - ldp x19,x20,[x2] - ldp x13, x14, [x1,#16] - subs x11,x11,x19 - ldp x21,x22,[x2,#16] - sbcs x12,x12,x20 - ldp x15, x16, [x1,#32] - sbcs x13,x13,x21 - ldp x23,x24,[x2,#32] - sbcs x14,x14,x22 - stp x11, x12, [x0] - sbcs x15,x15,x23 - ldp x11, x12, [x1,#48] - sbcs x16,x16,x24 - - ldp x19,x20,[x2,#48] - stp x13, x14, [x0,#16] - ldp x13, x14, [x1,#64] - ldp x21,x22,[x2,#64] - - sbcs x11,x11,x19 - stp x15, x16, [x0,#32] - sbcs x12,x12,x20 - ldp x15, x16, [x1,#80] - sbcs x13,x13,x21 - ldp x23,x24,[x2,#80] - sbcs x14,x14,x22 - sbcs x15,x15,x23 - sbcs x16,x16,x24 - sbc x17,xzr,xzr - - and x19,x5,x17 - and x20,x6,x17 - adds x11,x11,x19 - and x21,x7,x17 - adcs x12,x12,x20 - and x22,x8,x17 - adcs x13,x13,x21 - and x23,x9,x17 - adcs x14,x14,x22 - and x24,x10,x17 - adcs x15,x15,x23 - stp x11,x12,[x0,#48] - adc x16,x16,x24 - stp x13,x14,[x0,#64] - stp x15,x16,[x0,#80] - - ret - - - -.align 5 -__add_mod_384: - ldp x11, x12, [x1] - ldp x19,x20,[x2] - ldp x13, x14, [x1,#16] - adds x11,x11,x19 - ldp x21,x22,[x2,#16] - adcs x12,x12,x20 - ldp x15, x16, [x1,#32] - adcs x13,x13,x21 - ldp x23,x24,[x2,#32] - adcs x14,x14,x22 - adcs x15,x15,x23 - adcs x16,x16,x24 - adc x17,xzr,xzr - - subs x19,x11,x5 - sbcs x20,x12,x6 - sbcs x21,x13,x7 - sbcs x22,x14,x8 - sbcs x23,x15,x9 - sbcs x24,x16,x10 - sbcs xzr,x17,xzr - - csel x11,x11,x19,lo - csel x12,x12,x20,lo - csel x13,x13,x21,lo - csel x14,x14,x22,lo - csel x15,x15,x23,lo - stp x11,x12,[x0] - csel x16,x16,x24,lo - stp x13,x14,[x0,#16] - stp x15,x16,[x0,#32] - - ret - - - -.align 5 -__sub_mod_384: - ldp x11, x12, [x1] - ldp x19,x20,[x2] - ldp x13, x14, [x1,#16] - subs x11,x11,x19 - ldp x21,x22,[x2,#16] - sbcs x12,x12,x20 - ldp x15, x16, [x1,#32] - sbcs x13,x13,x21 - ldp x23,x24,[x2,#32] - sbcs x14,x14,x22 - sbcs x15,x15,x23 - sbcs x16,x16,x24 - sbc 
x17,xzr,xzr - - and x19,x5,x17 - and x20,x6,x17 - adds x11,x11,x19 - and x21,x7,x17 - adcs x12,x12,x20 - and x22,x8,x17 - adcs x13,x13,x21 - and x23,x9,x17 - adcs x14,x14,x22 - and x24,x10,x17 - adcs x15,x15,x23 - stp x11,x12,[x0] - adc x16,x16,x24 - stp x13,x14,[x0,#16] - stp x15,x16,[x0,#32] - - ret - - -.globl _mul_mont_384x -.private_extern _mul_mont_384x - -.align 5 -_mul_mont_384x: -.long 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#288 // space for 3 768-bit vectors - - mov x26,x0 // save r_ptr - mov x27,x1 // save b_ptr - mov x28,x2 // save b_ptr - - sub x0,sp,#0 // mul_384(t0, a->re, b->re) - bl __mul_384 - - add x1,x1,#48 // mul_384(t1, a->im, b->im) - add x2,x2,#48 - add x0,sp,#96 - bl __mul_384 - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - sub x2,x1,#48 - add x0,sp,#240 - bl __add_mod_384 - - add x1,x28,#0 - add x2,x28,#48 - add x0,sp,#192 // t2 - bl __add_mod_384 - - add x1,x0,#0 - add x2,x0,#48 - bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - mov x1,x0 - add x2,sp,#0 - bl __sub_mod_384x384 - - add x2,sp,#96 - bl __sub_mod_384x384 // t2 = t2-t0-t1 - - add x1,sp,#0 - add x2,sp,#96 - add x0,sp,#0 - bl __sub_mod_384x384 // t0 = t0-t1 - - add x1,sp,#0 // ret->re = redc(t0) - add x0,x26,#0 - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - - add x1,sp,#192 // ret->im = redc(t2) - add x0,x0,#48 - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - ldr x30,[x29,#8] - - add sp,sp,#288 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - - -.globl _sqr_mont_384x -.private_extern _sqr_mont_384x - -.align 5 -_sqr_mont_384x: -.long 3573752639 - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x3,x0,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#96 // space for 2 384-bit vectors - mov x4,x3 // adjust for missing b_ptr - - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - ldp x9,x10,[x2,#32] - - add x2,x1,#48 - add x0,sp,#0 - bl __add_mod_384 // t0 = a->re + a->im - - add x0,sp,#48 - bl __sub_mod_384 // t1 = a->re - a->im - - ldp x11,x12,[x1] - ldr x17, [x2] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) - - adds x11,x11,x11 // add with itself - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adcs x16,x16,x16 - adc x25,xzr,xzr - - subs x19,x11,x5 - sbcs x20,x12,x6 - sbcs x21,x13,x7 - sbcs x22,x14,x8 - sbcs x23,x15,x9 - sbcs x24,x16,x10 - sbcs xzr,x25,xzr - - csel x19,x11,x19,lo - csel x20,x12,x20,lo - csel x21,x13,x21,lo - ldp x11,x12,[sp] - csel x22,x14,x22,lo - ldr x17, [sp,#48] - csel x23,x15,x23,lo - ldp x13,x14,[sp,#16] - csel x24,x16,x24,lo - ldp x15,x16,[sp,#32] - - stp x19,x20,[x2,#48] - stp x21,x22,[x2,#64] - stp x23,x24,[x2,#80] - - add x2,sp,#48 - bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) - ldr x30,[x29,#8] - - stp x11,x12,[x2] - stp x13,x14,[x2,#16] - stp x15,x16,[x2,#32] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - - -.globl _mul_mont_384 -.private_extern _mul_mont_384 - -.align 5 -_mul_mont_384: -.long 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x4,x0,[sp,#96] // __mul_mont_384 wants them there - - ldp x11,x12,[x1] - ldr x17, [x2] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - bl __mul_mont_384 - ldr x30,[x29,#8] - - stp x11,x12,[x2] - stp x13,x14,[x2,#16] - stp x15,x16,[x2,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - - - -.align 5 -__mul_mont_384: - mul x19,x11,x17 - mul x20,x12,x17 - mul x21,x13,x17 - mul x22,x14,x17 - mul x23,x15,x17 - mul x24,x16,x17 - mul x4,x4,x19 - - umulh x26,x11,x17 - umulh x27,x12,x17 - umulh x28,x13,x17 - umulh x0,x14,x17 - umulh x1,x15,x17 - umulh x3,x16,x17 - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,xzr, x3 - mul x3,x10,x4 - mov x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - adc x4,x17,xzr - ldr x17,[x2,8*1] - - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,x4,xzr - ldr x4,[x29,#96] - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - 
adcs x25,x25,xzr - adc x17,xzr,xzr - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adcs x25,x25,x3 - mul x3,x10,x4 - adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - adc x4,x17,xzr - ldr x17,[x2,8*2] - - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,x4,xzr - ldr x4,[x29,#96] - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adcs x25,x25,xzr - adc x17,xzr,xzr - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adcs x25,x25,x3 - mul x3,x10,x4 - adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - adc x4,x17,xzr - ldr x17,[x2,8*3] - - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,x4,xzr - ldr x4,[x29,#96] - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adcs x25,x25,xzr - adc x17,xzr,xzr - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adcs x25,x25,x3 - mul x3,x10,x4 - adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - adc x4,x17,xzr - ldr x17,[x2,8*4] - - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,x4,xzr - ldr x4,[x29,#96] - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adcs x25,x25,xzr - adc x17,xzr,xzr - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adcs x25,x25,x3 - mul x3,x10,x4 - adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh 
x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - adc x4,x17,xzr - ldr x17,[x2,8*5] - - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,x4,xzr - ldr x4,[x29,#96] - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adcs x25,x25,xzr - adc x17,xzr,xzr - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adcs x25,x25,x3 - mul x3,x10,x4 - adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - ldp x4,x2,[x29,#96] // pull r_ptr - adc x17,x17,xzr - - adds x19,x20,x26 - adcs x20,x21,x27 - adcs x21,x22,x28 - adcs x22,x23,x0 - adcs x23,x24,x1 - adcs x24,x25,x3 - adc x25,x17,xzr - - subs x26,x19,x5 - sbcs x27,x20,x6 - sbcs x28,x21,x7 - sbcs x0,x22,x8 - sbcs x1,x23,x9 - sbcs x3,x24,x10 - sbcs xzr, x25,xzr - - csel x11,x19,x26,lo - csel x12,x20,x27,lo - csel x13,x21,x28,lo - csel x14,x22,x0,lo - csel x15,x23,x1,lo - csel x16,x24,x3,lo - ret - - -.globl _sqr_mont_384 -.private_extern _sqr_mont_384 - -.align 5 -_sqr_mont_384: -.long 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#96 // space for 768-bit vector - mov x4,x3 // adjust for missing b_ptr - - mov x3,x0 // save r_ptr - mov x0,sp - - ldp x11,x12,[x1] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - bl __sqr_384 - - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - ldp x9,x10,[x2,#32] - - mov x1,sp - mov x0,x3 // restore r_ptr - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - ldr x30,[x29,#8] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - - -.globl _sqr_n_mul_mont_383 -.private_extern _sqr_n_mul_mont_383 - -.align 5 -_sqr_n_mul_mont_383: -.long 3573752639 - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x4,x0,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#96 // space for 768-bit vector - mov x17,x5 // save b_ptr - - ldp x11,x12,[x1] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - mov x0,sp -Loop_sqr_383: - bl __sqr_384 - sub x2,x2,#1 // counter - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - mov x1,sp - bl __mul_by_1_mont_384 - - ldp x19,x20,[x1,#48] - ldp x21,x22,[x1,#64] - ldp x23,x24,[x1,#80] - - adds x11,x11,x19 // just accumulate upper half - adcs x12,x12,x20 - adcs x13,x13,x21 - adcs x14,x14,x22 - adcs x15,x15,x23 - adc x16,x16,x24 - - cbnz x2,Loop_sqr_383 - - mov x2,x17 - ldr x17,[x17] - bl __mul_mont_384 - ldr x30,[x29,#8] - - stp x11,x12,[x2] - stp x13,x14,[x2,#16] - stp x15,x16,[x2,#32] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - - -.align 5 -__sqr_384: - mul x19,x12,x11 - mul x20,x13,x11 - mul x21,x14,x11 - mul x22,x15,x11 - mul x23,x16,x11 - - umulh x6,x12,x11 - umulh x7,x13,x11 - umulh x8,x14,x11 - umulh x9,x15,x11 - adds x20,x20,x6 - umulh x10,x16,x11 - adcs x21,x21,x7 - mul x7,x13,x12 - adcs x22,x22,x8 - mul x8,x14,x12 - adcs x23,x23,x9 - mul x9,x15,x12 - adc x24,xzr, x10 - mul x10,x16,x12 - - adds x21,x21,x7 - umulh x7,x13,x12 - adcs x22,x22,x8 - umulh x8,x14,x12 - adcs x23,x23,x9 - umulh x9,x15,x12 - adcs x24,x24,x10 - umulh x10,x16,x12 - adc x25,xzr,xzr - - mul x5,x11,x11 - adds x22,x22,x7 - umulh x11, x11,x11 - adcs x23,x23,x8 - mul x8,x14,x13 - adcs x24,x24,x9 - mul x9,x15,x13 - adc x25,x25,x10 - mul x10,x16,x13 - - adds x23,x23,x8 - umulh x8,x14,x13 - adcs x24,x24,x9 - umulh x9,x15,x13 - adcs x25,x25,x10 - umulh x10,x16,x13 - adc x26,xzr,xzr - - mul x6,x12,x12 - adds x24,x24,x8 - umulh x12, x12,x12 - adcs x25,x25,x9 - mul x9,x15,x14 - adc x26,x26,x10 - mul x10,x16,x14 - - adds x25,x25,x9 - umulh x9,x15,x14 - adcs x26,x26,x10 - umulh x10,x16,x14 - adc x27,xzr,xzr - mul x7,x13,x13 - adds x26,x26,x9 - umulh x13, x13,x13 - adc x27,x27,x10 - mul x8,x14,x14 - - mul x10,x16,x15 - umulh x14, x14,x14 - adds x27,x27,x10 - umulh x10,x16,x15 - mul x9,x15,x15 - adc x28,x10,xzr - - adds x19,x19,x19 - adcs x20,x20,x20 - adcs x21,x21,x21 - adcs x22,x22,x22 - adcs x23,x23,x23 - adcs x24,x24,x24 - adcs x25,x25,x25 - adcs x26,x26,x26 - umulh x15, x15,x15 - adcs x27,x27,x27 - mul x10,x16,x16 - adcs x28,x28,x28 - umulh x16, x16,x16 - adc x1,xzr,xzr - - adds x19,x19,x11 - adcs x20,x20,x6 - adcs x21,x21,x12 - adcs x22,x22,x7 - adcs x23,x23,x13 - adcs x24,x24,x8 - adcs x25,x25,x14 - stp x5,x19,[x0] - adcs x26,x26,x9 - stp x20,x21,[x0,#16] - adcs x27,x27,x15 - stp x22,x23,[x0,#32] - adcs x28,x28,x10 - stp x24,x25,[x0,#48] - adc x16,x16,x1 - stp x26,x27,[x0,#64] - stp x28,x16,[x0,#80] - - ret - -.globl _sqr_384 -.private_extern _sqr_384 - -.align 5 -_sqr_384: -.long 3573752639 - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - ldp x11,x12,[x1] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - bl __sqr_384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - - -.globl _redc_mont_384 -.private_extern _redc_mont_384 - -.align 5 -_redc_mont_384: -.long 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - mov x4,x3 // adjust for missing b_ptr - - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - ldp x9,x10,[x2,#32] - - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - - -.globl _from_mont_384 -.private_extern _from_mont_384 - -.align 5 -_from_mont_384: -.long 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - mov x4,x3 // adjust for missing b_ptr - - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - ldp x9,x10,[x2,#32] - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - subs x19,x11,x5 - sbcs x20,x12,x6 - sbcs x21,x13,x7 - sbcs x22,x14,x8 - sbcs x23,x15,x9 - sbcs x24,x16,x10 - - csel x11,x11,x19,lo - csel x12,x12,x20,lo - csel x13,x13,x21,lo - csel x14,x14,x22,lo - csel x15,x15,x23,lo - csel x16,x16,x24,lo - - stp x11,x12,[x0] - stp x13,x14,[x0,#16] - stp x15,x16,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - - - -.align 5 -__mul_by_1_mont_384: - ldp x11,x12,[x1] - ldp x13,x14,[x1,#16] - mul x26,x4,x11 - ldp x15,x16,[x1,#32] - - // mul x19,x5,x26 - mul x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - mul x26,x4,x11 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - // mul x19,x5,x26 - mul x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - mul x26,x4,x11 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - // mul x19,x5,x26 - mul x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - mul x26,x4,x11 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - // mul x19,x5,x26 - mul 
x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - mul x26,x4,x11 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - // mul x19,x5,x26 - mul x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - mul x26,x4,x11 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - // mul x19,x5,x26 - mul x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - ret - - - -.align 5 -__redc_tail_mont_384: - ldp x19,x20,[x1,#48] - ldp x21,x22,[x1,#64] - ldp x23,x24,[x1,#80] - - adds x11,x11,x19 // accumulate upper half - adcs x12,x12,x20 - adcs x13,x13,x21 - adcs x14,x14,x22 - adcs x15,x15,x23 - adcs x16,x16,x24 - adc x25,xzr,xzr - - subs x19,x11,x5 - sbcs x20,x12,x6 - sbcs x21,x13,x7 - sbcs x22,x14,x8 - sbcs x23,x15,x9 - sbcs x24,x16,x10 - sbcs xzr,x25,xzr - - csel x11,x11,x19,lo - csel x12,x12,x20,lo - csel x13,x13,x21,lo - csel x14,x14,x22,lo - csel x15,x15,x23,lo - csel x16,x16,x24,lo - - stp x11,x12,[x0] - stp x13,x14,[x0,#16] - stp x15,x16,[x0,#32] - - ret - - -.globl _mul_384 -.private_extern _mul_384 - -.align 5 -_mul_384: -.long 3573752639 - stp x29,x30,[sp,#-128]! 
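
__mul_by_1_mont_384, completed above, is word-wise Montgomery reduction: each of its six identical rounds picks m = n0*t[0] mod 2^64 (the "mul x26,x4,x11" at the top of every round, with x4 holding the constant n0 = -p^-1 mod 2^64), adds m*p so the bottom limb cancels, and steps down one limb; __redc_tail_mont_384 then folds in the upper half and conditionally subtracts p. A compact Go rendering of that textbook REDC with a generic limb count; the final branch is where the assembly uses a branch-free csel:

    package montsketch

    import "math/bits"

    // redc reduces t (2k little-endian limbs, t < p*R, R = 2^(64k))
    // to t*R^-1 mod p, given n0 = -p^-1 mod 2^64. This is the
    // algorithm __mul_by_1_mont_384 / __redc_tail_mont_384 unroll
    // for k = 6.
    func redc(t, p []uint64, n0 uint64) []uint64 {
        k := len(p)
        a := make([]uint64, 2*k+1) // one spare limb for carries
        copy(a, t)
        for i := 0; i < k; i++ {
            m := a[i] * n0 // chosen so limb i of a + m*p becomes zero
            var c uint64
            for j := 0; j < k; j++ {
                hi, lo := bits.Mul64(m, p[j])
                var cr uint64
                lo, cr = bits.Add64(lo, a[i+j], 0)
                hi, _ = bits.Add64(hi, 0, cr)
                lo, cr = bits.Add64(lo, c, 0)
                hi, _ = bits.Add64(hi, 0, cr)
                a[i+j], c = lo, hi
            }
            var cr uint64
            a[i+k], cr = bits.Add64(a[i+k], c, 0)
            for l := i + k + 1; cr != 0 && l < len(a); l++ {
                a[l], cr = bits.Add64(a[l], 0, cr)
            }
        }
        // the surviving value a[k..2k] is < 2p; one conditional
        // subtraction (csel in the assembly) finishes the job
        out := make([]uint64, k)
        var borrow uint64
        for j := 0; j < k; j++ {
            out[j], borrow = bits.Sub64(a[k+j], p[j], borrow)
        }
        if _, borrow = bits.Sub64(a[2*k], 0, borrow); borrow != 0 {
            copy(out, a[k:2*k]) // underflowed: keep the unsubtracted value
        }
        return out
    }
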
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - bl __mul_384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - - - -.align 5 -__mul_384: - ldp x11,x12,[x1] - ldr x17, [x2] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - mul x19,x11,x17 - mul x20,x12,x17 - mul x21,x13,x17 - mul x22,x14,x17 - mul x23,x15,x17 - mul x24,x16,x17 - - umulh x5,x11,x17 - umulh x6,x12,x17 - umulh x7,x13,x17 - umulh x8,x14,x17 - umulh x9,x15,x17 - umulh x10,x16,x17 - ldr x17,[x2,8*1] - - str x19,[x0] - adds x19,x20,x5 - mul x5,x11,x17 - adcs x20,x21,x6 - mul x6,x12,x17 - adcs x21,x22,x7 - mul x7,x13,x17 - adcs x22,x23,x8 - mul x8,x14,x17 - adcs x23,x24,x9 - mul x9,x15,x17 - adc x24,xzr, x10 - mul x10,x16,x17 - adds x19,x19,x5 - umulh x5,x11,x17 - adcs x20,x20,x6 - umulh x6,x12,x17 - adcs x21,x21,x7 - umulh x7,x13,x17 - adcs x22,x22,x8 - umulh x8,x14,x17 - adcs x23,x23,x9 - umulh x9,x15,x17 - adcs x24,x24,x10 - umulh x10,x16,x17 - ldr x17,[x2,#8*(1+1)] - adc x25,xzr,xzr - - str x19,[x0,8*1] - adds x19,x20,x5 - mul x5,x11,x17 - adcs x20,x21,x6 - mul x6,x12,x17 - adcs x21,x22,x7 - mul x7,x13,x17 - adcs x22,x23,x8 - mul x8,x14,x17 - adcs x23,x24,x9 - mul x9,x15,x17 - adc x24,x25,x10 - mul x10,x16,x17 - adds x19,x19,x5 - umulh x5,x11,x17 - adcs x20,x20,x6 - umulh x6,x12,x17 - adcs x21,x21,x7 - umulh x7,x13,x17 - adcs x22,x22,x8 - umulh x8,x14,x17 - adcs x23,x23,x9 - umulh x9,x15,x17 - adcs x24,x24,x10 - umulh x10,x16,x17 - ldr x17,[x2,#8*(2+1)] - adc x25,xzr,xzr - - str x19,[x0,8*2] - adds x19,x20,x5 - mul x5,x11,x17 - adcs x20,x21,x6 - mul x6,x12,x17 - adcs x21,x22,x7 - mul x7,x13,x17 - adcs x22,x23,x8 - mul x8,x14,x17 - adcs x23,x24,x9 - mul x9,x15,x17 - adc x24,x25,x10 - mul x10,x16,x17 - adds x19,x19,x5 - umulh x5,x11,x17 - adcs x20,x20,x6 - umulh x6,x12,x17 - adcs x21,x21,x7 - umulh x7,x13,x17 - adcs x22,x22,x8 - umulh x8,x14,x17 - adcs x23,x23,x9 - umulh x9,x15,x17 - adcs x24,x24,x10 - umulh x10,x16,x17 - ldr x17,[x2,#8*(3+1)] - adc x25,xzr,xzr - - str x19,[x0,8*3] - adds x19,x20,x5 - mul x5,x11,x17 - adcs x20,x21,x6 - mul x6,x12,x17 - adcs x21,x22,x7 - mul x7,x13,x17 - adcs x22,x23,x8 - mul x8,x14,x17 - adcs x23,x24,x9 - mul x9,x15,x17 - adc x24,x25,x10 - mul x10,x16,x17 - adds x19,x19,x5 - umulh x5,x11,x17 - adcs x20,x20,x6 - umulh x6,x12,x17 - adcs x21,x21,x7 - umulh x7,x13,x17 - adcs x22,x22,x8 - umulh x8,x14,x17 - adcs x23,x23,x9 - umulh x9,x15,x17 - adcs x24,x24,x10 - umulh x10,x16,x17 - ldr x17,[x2,#8*(4+1)] - adc x25,xzr,xzr - - str x19,[x0,8*4] - adds x19,x20,x5 - mul x5,x11,x17 - adcs x20,x21,x6 - mul x6,x12,x17 - adcs x21,x22,x7 - mul x7,x13,x17 - adcs x22,x23,x8 - mul x8,x14,x17 - adcs x23,x24,x9 - mul x9,x15,x17 - adc x24,x25,x10 - mul x10,x16,x17 - adds x19,x19,x5 - umulh x5,x11,x17 - adcs x20,x20,x6 - umulh x6,x12,x17 - adcs x21,x21,x7 - umulh x7,x13,x17 - adcs x22,x22,x8 - umulh x8,x14,x17 - adcs x23,x23,x9 - umulh x9,x15,x17 - adcs x24,x24,x10 - umulh x10,x16,x17 - adc x25,xzr,xzr - - str x19,[x0,8*5] - adds x19,x20,x5 - adcs x20,x21,x6 - adcs x21,x22,x7 - adcs x22,x23,x8 - adcs x23,x24,x9 - adc x24,x25,x10 - - stp x19,x20,[x0,#48] - stp x21,x22,[x0,#64] - stp x23,x24,[x0,#80] - - ret - - -.globl _mul_382x -.private_extern _mul_382x - -.align 5 -_mul_382x: -.long 3573752639 - stp x29,x30,[sp,#-128]! 
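
__mul_384 above is a fully unrolled 6x6-limb schoolbook product: for each limb of b it accumulates a*b[j] into a sliding window of the 12-limb result, storing finished low limbs as it goes ("str x19,[x0,8*j]"). The same algorithm with the unrolling folded back into loops:

    package montsketch

    import "math/bits"

    // mul is schoolbook multi-precision multiplication, the algorithm
    // the deleted __mul_384 unrolls: z = x * y, all little-endian limbs.
    func mul(x, y []uint64) []uint64 {
        z := make([]uint64, len(x)+len(y))
        for j, yj := range y {
            var c uint64 // running high limb of the current column
            for i, xi := range x {
                hi, lo := bits.Mul64(xi, yj)
                var cr uint64
                lo, cr = bits.Add64(lo, z[i+j], 0)
                hi, _ = bits.Add64(hi, 0, cr)
                lo, cr = bits.Add64(lo, c, 0)
                hi, _ = bits.Add64(hi, 0, cr)
                z[i+j], c = lo, hi
            }
            z[j+len(x)] = c
        }
        return z
    }
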
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#96 // space for two 384-bit vectors - - ldp x11,x12,[x1] - mov x26,x0 // save r_ptr - ldp x19,x20,[x1,#48] - mov x27,x1 // save a_ptr - ldp x13,x14,[x1,#16] - mov x28,x2 // save b_ptr - ldp x21,x22,[x1,#64] - ldp x15,x16,[x1,#32] - adds x5,x11,x19 // t0 = a->re + a->im - ldp x23,x24,[x1,#80] - adcs x6,x12,x20 - ldp x11,x12,[x2] - adcs x7,x13,x21 - ldp x19,x20,[x2,#48] - adcs x8,x14,x22 - ldp x13,x14,[x2,#16] - adcs x9,x15,x23 - ldp x21,x22,[x2,#64] - adc x10,x16,x24 - ldp x15,x16,[x2,#32] - - stp x5,x6,[sp] - adds x5,x11,x19 // t1 = b->re + b->im - ldp x23,x24,[x2,#80] - adcs x6,x12,x20 - stp x7,x8,[sp,#16] - adcs x7,x13,x21 - adcs x8,x14,x22 - stp x9,x10,[sp,#32] - adcs x9,x15,x23 - stp x5,x6,[sp,#48] - adc x10,x16,x24 - stp x7,x8,[sp,#64] - stp x9,x10,[sp,#80] - - bl __mul_384 // _mul_384(ret->re, a->re, b->re) - - add x1,sp,#0 // _mul_384(ret->im, t0, t1) - add x2,sp,#48 - add x0,x26,#96 - bl __mul_384 - - add x1,x27,#48 // _mul_384(tx, a->im, b->im) - add x2,x28,#48 - add x0,sp,#0 - bl __mul_384 - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - add x1,x26,#96 // ret->im -= tx - add x2,sp,#0 - add x0,x26,#96 - bl __sub_mod_384x384 - - add x2,x26,#0 // ret->im -= ret->re - bl __sub_mod_384x384 - - add x1,x26,#0 // ret->re -= tx - add x2,sp,#0 - add x0,x26,#0 - bl __sub_mod_384x384 - ldr x30,[x29,#8] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - - -.globl _sqr_382x -.private_extern _sqr_382x - -.align 5 -_sqr_382x: -.long 3573752639 - stp x29,x30,[sp,#-128]! 
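
The comments in _mul_382x spell out its schedule: t0 = a->re + a->im, t1 = b->re + b->im, then three base-field products (a.re*b.re, t0*t1, a.im*b.im) and three subtractions. That is the classic three-multiplication complex product over Fp2 with u^2 = -1, written out below in Go with math/big; the fp2 type and function names are illustrative:

    package montsketch

    import "math/big"

    // fp2 is re + im*u with u^2 = -1, as in the deleted _mul_382x.
    type fp2 struct{ re, im *big.Int }

    // mulFp2 follows _mul_382x's three-multiplication schedule.
    func mulFp2(a, b fp2, p *big.Int) fp2 {
        t0 := new(big.Int).Add(a.re, a.im) // t0 = a.re + a.im
        t1 := new(big.Int).Add(b.re, b.im) // t1 = b.re + b.im
        rr := new(big.Int).Mul(a.re, b.re)
        ii := new(big.Int).Mul(a.im, b.im)
        im := t0.Mul(t0, t1)
        im.Sub(im, rr).Sub(im, ii) // ret.im = t0*t1 - rr - ii
        re := rr.Sub(rr, ii)       // ret.re = rr - ii  (u^2 = -1)
        return fp2{re: re.Mod(re, p), im: im.Mod(im, p)}
    }
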
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - ldp x11,x12,[x1] - ldp x19,x20,[x1,#48] - ldp x13,x14,[x1,#16] - adds x5,x11,x19 // t0 = a->re + a->im - ldp x21,x22,[x1,#64] - adcs x6,x12,x20 - ldp x15,x16,[x1,#32] - adcs x7,x13,x21 - ldp x23,x24,[x1,#80] - adcs x8,x14,x22 - stp x5,x6,[x0] - adcs x9,x15,x23 - ldp x5,x6,[x2] - adc x10,x16,x24 - stp x7,x8,[x0,#16] - - subs x11,x11,x19 // t1 = a->re - a->im - ldp x7,x8,[x2,#16] - sbcs x12,x12,x20 - stp x9,x10,[x0,#32] - sbcs x13,x13,x21 - ldp x9,x10,[x2,#32] - sbcs x14,x14,x22 - sbcs x15,x15,x23 - sbcs x16,x16,x24 - sbc x25,xzr,xzr - - and x19,x5,x25 - and x20,x6,x25 - adds x11,x11,x19 - and x21,x7,x25 - adcs x12,x12,x20 - and x22,x8,x25 - adcs x13,x13,x21 - and x23,x9,x25 - adcs x14,x14,x22 - and x24,x10,x25 - adcs x15,x15,x23 - stp x11,x12,[x0,#48] - adc x16,x16,x24 - stp x13,x14,[x0,#64] - stp x15,x16,[x0,#80] - - mov x4,x1 // save a_ptr - add x1,x0,#0 // _mul_384(ret->re, t0, t1) - add x2,x0,#48 - bl __mul_384 - - add x1,x4,#0 // _mul_384(ret->im, a->re, a->im) - add x2,x4,#48 - add x0,x0,#96 - bl __mul_384 - ldr x30,[x29,#8] - - ldp x11,x12,[x0] - ldp x13,x14,[x0,#16] - adds x11,x11,x11 // add with itself - ldp x15,x16,[x0,#32] - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adcs x16,x16,x16 - adcs x19,x19,x19 - adcs x20,x20,x20 - stp x11,x12,[x0] - adcs x21,x21,x21 - stp x13,x14,[x0,#16] - adcs x22,x22,x22 - stp x15,x16,[x0,#32] - adcs x23,x23,x23 - stp x19,x20,[x0,#48] - adc x24,x24,x24 - stp x21,x22,[x0,#64] - stp x23,x24,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - - -.globl _sqr_mont_382x -.private_extern _sqr_mont_382x - -.align 5 -_sqr_mont_382x: -.long 3573752639 - stp x29,x30,[sp,#-128]! 
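
_sqr_382x squares an Fp2 element with only two base-field multiplications, exactly as its comments say: t0 = a->re + a->im, t1 = a->re - a->im, ret->re = t0*t1, and ret->im = a.re*a.im doubled ("add with itself"). The underlying identity as a sketch:

    package montsketch

    import "math/big"

    // sqrFp2 mirrors _sqr_382x: for a = x + y*u with u^2 = -1,
    //   re(a^2) = (x+y)*(x-y)   and   im(a^2) = 2*x*y.
    func sqrFp2(x, y, p *big.Int) (re, im *big.Int) {
        t0 := new(big.Int).Add(x, y)
        t1 := new(big.Int).Sub(x, y)
        re = t0.Mul(t0, t1) // (x+y)(x-y) = x^2 - y^2
        im = new(big.Int).Mul(x, y)
        im.Lsh(im, 1) // "add with itself": 2xy
        return re.Mod(re, p), im.Mod(im, p)
    }
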
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x3,x0,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#112 // space for two 384-bit vectors + word - mov x4,x3 // adjust for missing b_ptr - - ldp x11,x12,[x1] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - ldp x17,x20,[x1,#48] - ldp x21,x22,[x1,#64] - ldp x23,x24,[x1,#80] - - adds x5,x11,x17 // t0 = a->re + a->im - adcs x6,x12,x20 - adcs x7,x13,x21 - adcs x8,x14,x22 - adcs x9,x15,x23 - adc x10,x16,x24 - - subs x19,x11,x17 // t1 = a->re - a->im - sbcs x20,x12,x20 - sbcs x21,x13,x21 - sbcs x22,x14,x22 - sbcs x23,x15,x23 - sbcs x24,x16,x24 - sbc x25,xzr,xzr // borrow flag as mask - - stp x5,x6,[sp] - stp x7,x8,[sp,#16] - stp x9,x10,[sp,#32] - stp x19,x20,[sp,#48] - stp x21,x22,[sp,#64] - stp x23,x24,[sp,#80] - str x25,[sp,#96] - - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - ldp x9,x10,[x2,#32] - - add x2,x1,#48 - bl __mul_mont_383_nonred // _mul_mont_384(ret->im, a->re, a->im) - - adds x19,x11,x11 // add with itself - adcs x20,x12,x12 - adcs x21,x13,x13 - adcs x22,x14,x14 - adcs x23,x15,x15 - adc x24,x16,x16 - - stp x19,x20,[x2,#48] - stp x21,x22,[x2,#64] - stp x23,x24,[x2,#80] - - ldp x11,x12,[sp] - ldr x17,[sp,#48] - ldp x13,x14,[sp,#16] - ldp x15,x16,[sp,#32] - - add x2,sp,#48 - bl __mul_mont_383_nonred // _mul_mont_384(ret->im, t0, t1) - ldr x30,[x29,#8] - - ldr x25,[sp,#96] // account for sign from a->re - a->im - ldp x19,x20,[sp] - ldp x21,x22,[sp,#16] - ldp x23,x24,[sp,#32] - - and x19,x19,x25 - and x20,x20,x25 - and x21,x21,x25 - and x22,x22,x25 - and x23,x23,x25 - and x24,x24,x25 - - subs x11,x11,x19 - sbcs x12,x12,x20 - sbcs x13,x13,x21 - sbcs x14,x14,x22 - sbcs x15,x15,x23 - sbcs x16,x16,x24 - sbc x25,xzr,xzr - - and x19,x5,x25 - and x20,x6,x25 - and x21,x7,x25 - and x22,x8,x25 - and x23,x9,x25 - and x24,x10,x25 - - adds x11,x11,x19 - adcs x12,x12,x20 - adcs x13,x13,x21 - adcs x14,x14,x22 - adcs x15,x15,x23 - adc x16,x16,x24 - - stp x11,x12,[x2] - stp x13,x14,[x2,#16] - stp x15,x16,[x2,#32] - - add sp,sp,#112 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - - - -.align 5 -__mul_mont_383_nonred: - mul x19,x11,x17 - mul x20,x12,x17 - mul x21,x13,x17 - mul x22,x14,x17 - mul x23,x15,x17 - mul x24,x16,x17 - mul x4,x4,x19 - - umulh x26,x11,x17 - umulh x27,x12,x17 - umulh x28,x13,x17 - umulh x0,x14,x17 - umulh x1,x15,x17 - umulh x3,x16,x17 - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,xzr, x3 - mul x3,x10,x4 - ldr x17,[x2,8*1] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul 
x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - ldr x17,[x2,8*2] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - ldr x17,[x2,8*3] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - ldr x17,[x2,8*4] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - ldr x17,[x2,8*5] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 
- mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - ldp x4,x2,[x29,#96] // pull r_ptr - - adds x11,x20,x26 - adcs x12,x21,x27 - adcs x13,x22,x28 - adcs x14,x23,x0 - adcs x15,x24,x1 - adcs x16,x25,x3 - - ret - - -.globl _sgn0_pty_mont_384 -.private_extern _sgn0_pty_mont_384 - -.align 5 -_sgn0_pty_mont_384: -.long 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - mov x4,x2 - ldp x5,x6,[x1] - ldp x7,x8,[x1,#16] - ldp x9,x10,[x1,#32] - mov x1,x0 - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - and x0,x11,#1 - adds x11,x11,x11 - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adcs x16,x16,x16 - adc x17,xzr,xzr - - subs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbcs x16,x16,x10 - sbc x17,x17,xzr - - mvn x17,x17 - and x17,x17,#2 - orr x0,x0,x17 - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - - -.globl _sgn0_pty_mont_384x -.private_extern _sgn0_pty_mont_384x - -.align 5 -_sgn0_pty_mont_384x: -.long 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - mov x4,x2 - ldp x5,x6,[x1] - ldp x7,x8,[x1,#16] - ldp x9,x10,[x1,#32] - mov x1,x0 - - bl __mul_by_1_mont_384 - add x1,x1,#48 - - and x2,x11,#1 - orr x3,x11,x12 - adds x11,x11,x11 - orr x3,x3,x13 - adcs x12,x12,x12 - orr x3,x3,x14 - adcs x13,x13,x13 - orr x3,x3,x15 - adcs x14,x14,x14 - orr x3,x3,x16 - adcs x15,x15,x15 - adcs x16,x16,x16 - adc x17,xzr,xzr - - subs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbcs x16,x16,x10 - sbc x17,x17,xzr - - mvn x17,x17 - and x17,x17,#2 - orr x2,x2,x17 - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - and x0,x11,#1 - orr x1,x11,x12 - adds x11,x11,x11 - orr x1,x1,x13 - adcs x12,x12,x12 - orr x1,x1,x14 - adcs x13,x13,x13 - orr x1,x1,x15 - adcs x14,x14,x14 - orr x1,x1,x16 - adcs x15,x15,x15 - adcs x16,x16,x16 - adc x17,xzr,xzr - - subs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbcs x16,x16,x10 - sbc x17,x17,xzr - - mvn x17,x17 - and x17,x17,#2 - orr x0,x0,x17 - - cmp x3,#0 - csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) - - cmp x1,#0 - csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) - - and x3,x3,#1 - and x1,x1,#2 - orr x0,x1,x3 // pack sign and parity - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 -.long 3573752767 - ret - diff --git a/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s b/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s deleted file mode 100644 index 842c39225b6..00000000000 --- a/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s +++ /dev/null @@ -1,723 +0,0 @@ -.comm ___blst_platform_cap,4 -.text - -.globl _mul_mont_sparse_256 -.private_extern _mul_mont_sparse_256 - -.p2align 5 -_mul_mont_sparse_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz mul_mont_sparse_256$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rdi -.cfi_adjust_cfa_offset 8 - - - movq 0(%rdx),%rax - movq 0(%rsi),%r13 - movq 8(%rsi),%r14 - movq 16(%rsi),%r12 - movq 24(%rsi),%rbp - movq %rdx,%rbx - - movq %rax,%r15 - mulq %r13 - movq %rax,%r9 - movq %r15,%rax - movq %rdx,%r10 - call __mulq_mont_sparse_256 - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _sqr_mont_sparse_256 -.private_extern _sqr_mont_sparse_256 - -.p2align 5 -_sqr_mont_sparse_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz sqr_mont_sparse_256$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rdi -.cfi_adjust_cfa_offset 8 - - - movq 0(%rsi),%rax - movq %rcx,%r8 - movq 8(%rsi),%r14 - movq %rdx,%rcx - movq 16(%rsi),%r12 - leaq (%rsi),%rbx - movq 24(%rsi),%rbp - - movq %rax,%r15 - mulq %rax - movq %rax,%r9 - movq %r15,%rax - movq %rdx,%r10 - call __mulq_mont_sparse_256 - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__mulq_mont_sparse_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - mulq %r14 - addq %rax,%r10 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq %r12 - addq %rax,%r11 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r12 - - mulq %rbp - addq %rax,%r12 - movq 8(%rbx),%rax - adcq $0,%rdx - xorq %r14,%r14 - movq %rdx,%r13 - - movq %r9,%rdi - imulq %r8,%r9 - - - movq %rax,%r15 - mulq 0(%rsi) - addq %rax,%r10 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 8(%rsi) - addq %rax,%r11 - movq %r15,%rax - 
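
Backing up to _sgn0_pty_mont_384 and _sgn0_pty_mont_384x, deleted just before the mulq_mont_256-x86_64.s hunk begins: both convert out of Montgomery form via __mul_by_1_mont_384 and then pack two bits, where bit 0 is the parity of the canonical value and bit 1 is set when 2a >= p (the element sits in the upper half of the field); the csel comments in the 384x variant show it selecting between the real and imaginary coordinates when one of them is zero. A sketch of the packing for one coordinate, assuming the input is already canonical:

    package montsketch

    import "math/big"

    // sgn0Pty packs the two bits _sgn0_pty_mont_384 extracts:
    // bit 0 = parity of a, bit 1 = "sign", set when 2a >= p.
    func sgn0Pty(a, p *big.Int) uint {
        out := uint(a.Bit(0)) // parity
        two := new(big.Int).Lsh(a, 1)
        if two.Cmp(p) >= 0 { // the assembly reads this off the borrow of 2a - p
            out |= 2
        }
        return out
    }
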
adcq $0,%rdx - addq %rbp,%r11 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rsi) - addq %rax,%r12 - movq %r15,%rax - adcq $0,%rdx - addq %rbp,%r12 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rsi) - addq %rax,%r13 - movq %r9,%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq %rdx,%r14 - xorq %r15,%r15 - - - mulq 0(%rcx) - addq %rax,%rdi - movq %r9,%rax - adcq %rdx,%rdi - - mulq 8(%rcx) - addq %rax,%r10 - movq %r9,%rax - adcq $0,%rdx - addq %rdi,%r10 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r11 - movq %r9,%rax - adcq $0,%rdx - addq %rbp,%r11 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - addq %rax,%r12 - movq 16(%rbx),%rax - adcq $0,%rdx - addq %rbp,%r12 - adcq $0,%rdx - addq %rdx,%r13 - adcq $0,%r14 - adcq $0,%r15 - movq %r10,%rdi - imulq %r8,%r10 - - - movq %rax,%r9 - mulq 0(%rsi) - addq %rax,%r11 - movq %r9,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 8(%rsi) - addq %rax,%r12 - movq %r9,%rax - adcq $0,%rdx - addq %rbp,%r12 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rsi) - addq %rax,%r13 - movq %r9,%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rsi) - addq %rax,%r14 - movq %r10,%rax - adcq $0,%rdx - addq %rbp,%r14 - adcq %rdx,%r15 - xorq %r9,%r9 - - - mulq 0(%rcx) - addq %rax,%rdi - movq %r10,%rax - adcq %rdx,%rdi - - mulq 8(%rcx) - addq %rax,%r11 - movq %r10,%rax - adcq $0,%rdx - addq %rdi,%r11 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r12 - movq %r10,%rax - adcq $0,%rdx - addq %rbp,%r12 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - addq %rax,%r13 - movq 24(%rbx),%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq $0,%rdx - addq %rdx,%r14 - adcq $0,%r15 - adcq $0,%r9 - movq %r11,%rdi - imulq %r8,%r11 - - - movq %rax,%r10 - mulq 0(%rsi) - addq %rax,%r12 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 8(%rsi) - addq %rax,%r13 - movq %r10,%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rsi) - addq %rax,%r14 - movq %r10,%rax - adcq $0,%rdx - addq %rbp,%r14 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rsi) - addq %rax,%r15 - movq %r11,%rax - adcq $0,%rdx - addq %rbp,%r15 - adcq %rdx,%r9 - xorq %r10,%r10 - - - mulq 0(%rcx) - addq %rax,%rdi - movq %r11,%rax - adcq %rdx,%rdi - - mulq 8(%rcx) - addq %rax,%r12 - movq %r11,%rax - adcq $0,%rdx - addq %rdi,%r12 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r13 - movq %r11,%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - addq %rax,%r14 - movq %r12,%rax - adcq $0,%rdx - addq %rbp,%r14 - adcq $0,%rdx - addq %rdx,%r15 - adcq $0,%r9 - adcq $0,%r10 - imulq %r8,%rax - movq 8(%rsp),%rsi - - - movq %rax,%r11 - mulq 0(%rcx) - addq %rax,%r12 - movq %r11,%rax - adcq %rdx,%r12 - - mulq 8(%rcx) - addq %rax,%r13 - movq %r11,%rax - adcq $0,%rdx - addq %r12,%r13 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r14 - movq %r11,%rax - adcq $0,%rdx - addq %rbp,%r14 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - movq %r14,%rbx - addq %rbp,%r15 - adcq $0,%rdx - addq %rax,%r15 - movq %r13,%rax - adcq $0,%rdx - addq %rdx,%r9 - adcq $0,%r10 - - - - - movq %r15,%r12 - subq 0(%rcx),%r13 - sbbq 8(%rcx),%r14 - sbbq 16(%rcx),%r15 - movq %r9,%rbp - sbbq 24(%rcx),%r9 - sbbq $0,%r10 - - cmovcq %rax,%r13 - cmovcq %rbx,%r14 - cmovcq %r12,%r15 - movq %r13,0(%rsi) - cmovcq %rbp,%r9 - movq %r14,8(%rsi) - movq %r15,16(%rsi) - movq %r9,24(%rsi) - - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _from_mont_256 -.private_extern _from_mont_256 - -.p2align 5 -_from_mont_256: -.cfi_startproc - .byte 
0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz from_mont_256$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq %rdx,%rbx - call __mulq_by_1_mont_256 - - - - - - movq %r14,%r10 - movq %r15,%r11 - movq %r9,%r12 - - subq 0(%rbx),%r13 - sbbq 8(%rbx),%r14 - sbbq 16(%rbx),%r15 - sbbq 24(%rbx),%r9 - - cmovncq %r13,%rax - cmovncq %r14,%r10 - cmovncq %r15,%r11 - movq %rax,0(%rdi) - cmovncq %r9,%r12 - movq %r10,8(%rdi) - movq %r11,16(%rdi) - movq %r12,24(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _redc_mont_256 -.private_extern _redc_mont_256 - -.p2align 5 -_redc_mont_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz redc_mont_256$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq %rdx,%rbx - call __mulq_by_1_mont_256 - - addq 32(%rsi),%r13 - adcq 40(%rsi),%r14 - movq %r13,%rax - adcq 48(%rsi),%r15 - movq %r14,%r10 - adcq 56(%rsi),%r9 - sbbq %rsi,%rsi - - - - - movq %r15,%r11 - subq 0(%rbx),%r13 - sbbq 8(%rbx),%r14 - sbbq 16(%rbx),%r15 - movq %r9,%r12 - sbbq 24(%rbx),%r9 - sbbq $0,%rsi - - cmovncq %r13,%rax - cmovncq %r14,%r10 - cmovncq %r15,%r11 - movq %rax,0(%rdi) - cmovncq %r9,%r12 - movq %r10,8(%rdi) - movq %r11,16(%rdi) - movq %r12,24(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__mulq_by_1_mont_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%rax - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - - movq %rax,%r13 - imulq %rcx,%rax - movq %rax,%r9 - - mulq 0(%rbx) - addq %rax,%r13 - movq %r9,%rax - adcq %rdx,%r13 - - mulq 8(%rbx) - addq %rax,%r10 - movq %r9,%rax - adcq $0,%rdx - addq %r13,%r10 - adcq $0,%rdx - movq %rdx,%r13 - - mulq 16(%rbx) - movq %r10,%r14 - imulq %rcx,%r10 - addq %rax,%r11 - movq %r9,%rax - adcq $0,%rdx - addq %r13,%r11 - adcq $0,%rdx - movq %rdx,%r13 - - mulq 24(%rbx) - addq %rax,%r12 - movq %r10,%rax - adcq $0,%rdx - addq %r13,%r12 - adcq $0,%rdx - movq %rdx,%r13 - - mulq 0(%rbx) - addq %rax,%r14 - movq %r10,%rax - adcq %rdx,%r14 - - mulq 8(%rbx) - addq %rax,%r11 - movq %r10,%rax - adcq $0,%rdx - addq %r14,%r11 - adcq $0,%rdx - movq %rdx,%r14 - - mulq 16(%rbx) - movq %r11,%r15 - imulq %rcx,%r11 
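
Every public routine in the two x86-64 files deleted here opens the same way under __BLST_PORTABLE__: "testl $1, ___blst_platform_cap(%rip); jnz <name>$1", a run-time capability test that diverts to an ADX (mulx/adcx/adox) twin of the routine. The same dispatch pattern sketched in Go; all names are hypothetical, and a real probe could come from golang.org/x/sys/cpu (X86.HasADX):

    package montsketch

    // Run-time selection between a portable and an ADX code path,
    // the Go analogue of the deleted files' test-and-branch prologue.
    // Hypothetical scaffolding, not BLST's API.

    var haveADX = detectADX()

    // detectADX stands in for a CPUID probe.
    func detectADX() bool { return false }

    func mulMont(z, x, y, p []uint64, n0 uint64) {
        if haveADX {
            mulMontADX(z, x, y, p, n0) // mulx/adcx/adox fast path
            return
        }
        mulMontGeneric(z, x, y, p, n0) // portable mulq/adcq path
    }

    func mulMontADX(z, x, y, p []uint64, n0 uint64)     { /* elided */ }
    func mulMontGeneric(z, x, y, p []uint64, n0 uint64) { /* elided */ }
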
- addq %rax,%r12 - movq %r10,%rax - adcq $0,%rdx - addq %r14,%r12 - adcq $0,%rdx - movq %rdx,%r14 - - mulq 24(%rbx) - addq %rax,%r13 - movq %r11,%rax - adcq $0,%rdx - addq %r14,%r13 - adcq $0,%rdx - movq %rdx,%r14 - - mulq 0(%rbx) - addq %rax,%r15 - movq %r11,%rax - adcq %rdx,%r15 - - mulq 8(%rbx) - addq %rax,%r12 - movq %r11,%rax - adcq $0,%rdx - addq %r15,%r12 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 16(%rbx) - movq %r12,%r9 - imulq %rcx,%r12 - addq %rax,%r13 - movq %r11,%rax - adcq $0,%rdx - addq %r15,%r13 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 24(%rbx) - addq %rax,%r14 - movq %r12,%rax - adcq $0,%rdx - addq %r15,%r14 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 0(%rbx) - addq %rax,%r9 - movq %r12,%rax - adcq %rdx,%r9 - - mulq 8(%rbx) - addq %rax,%r13 - movq %r12,%rax - adcq $0,%rdx - addq %r9,%r13 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 16(%rbx) - addq %rax,%r14 - movq %r12,%rax - adcq $0,%rdx - addq %r9,%r14 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 24(%rbx) - addq %rax,%r15 - movq %r13,%rax - adcq $0,%rdx - addq %r9,%r15 - adcq $0,%rdx - movq %rdx,%r9 - .byte 0xf3,0xc3 -.cfi_endproc - diff --git a/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s b/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s deleted file mode 100644 index 7052343d0ac..00000000000 --- a/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s +++ /dev/null @@ -1,3673 +0,0 @@ -.comm ___blst_platform_cap,4 -.text - - - - - - - - -.p2align 5 -__subq_mod_384x384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq 48(%rsi),%r14 - - subq 0(%rdx),%r8 - movq 56(%rsi),%r15 - sbbq 8(%rdx),%r9 - movq 64(%rsi),%rax - sbbq 16(%rdx),%r10 - movq 72(%rsi),%rbx - sbbq 24(%rdx),%r11 - movq 80(%rsi),%rbp - sbbq 32(%rdx),%r12 - movq 88(%rsi),%rsi - sbbq 40(%rdx),%r13 - movq %r8,0(%rdi) - sbbq 48(%rdx),%r14 - movq 0(%rcx),%r8 - movq %r9,8(%rdi) - sbbq 56(%rdx),%r15 - movq 8(%rcx),%r9 - movq %r10,16(%rdi) - sbbq 64(%rdx),%rax - movq 16(%rcx),%r10 - movq %r11,24(%rdi) - sbbq 72(%rdx),%rbx - movq 24(%rcx),%r11 - movq %r12,32(%rdi) - sbbq 80(%rdx),%rbp - movq 32(%rcx),%r12 - movq %r13,40(%rdi) - sbbq 88(%rdx),%rsi - movq 40(%rcx),%r13 - sbbq %rdx,%rdx - - andq %rdx,%r8 - andq %rdx,%r9 - andq %rdx,%r10 - andq %rdx,%r11 - andq %rdx,%r12 - andq %rdx,%r13 - - addq %r8,%r14 - adcq %r9,%r15 - movq %r14,48(%rdi) - adcq %r10,%rax - movq %r15,56(%rdi) - adcq %r11,%rbx - movq %rax,64(%rdi) - adcq %r12,%rbp - movq %rbx,72(%rdi) - adcq %r13,%rsi - movq %rbp,80(%rdi) - movq %rsi,88(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__addq_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - adcq 16(%rdx),%r10 - movq %r8,%r14 - adcq 24(%rdx),%r11 - movq %r9,%r15 - adcq 32(%rdx),%r12 - movq %r10,%rax - adcq 40(%rdx),%r13 - movq %r11,%rbx - sbbq %rdx,%rdx - - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - movq %r12,%rbp - sbbq 16(%rcx),%r10 - sbbq 24(%rcx),%r11 - sbbq 32(%rcx),%r12 - movq %r13,%rsi - sbbq 40(%rcx),%r13 - sbbq $0,%rdx - - cmovcq %r14,%r8 - cmovcq %r15,%r9 - cmovcq %rax,%r10 - movq %r8,0(%rdi) - cmovcq %rbx,%r11 - movq %r9,8(%rdi) - cmovcq %rbp,%r12 - movq %r10,16(%rdi) - cmovcq %rsi,%r13 - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__subq_mod_384: -.cfi_startproc - .byte 
0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - -__subq_mod_384_a_is_loaded: - subq 0(%rdx),%r8 - movq 0(%rcx),%r14 - sbbq 8(%rdx),%r9 - movq 8(%rcx),%r15 - sbbq 16(%rdx),%r10 - movq 16(%rcx),%rax - sbbq 24(%rdx),%r11 - movq 24(%rcx),%rbx - sbbq 32(%rdx),%r12 - movq 32(%rcx),%rbp - sbbq 40(%rdx),%r13 - movq 40(%rcx),%rsi - sbbq %rdx,%rdx - - andq %rdx,%r14 - andq %rdx,%r15 - andq %rdx,%rax - andq %rdx,%rbx - andq %rdx,%rbp - andq %rdx,%rsi - - addq %r14,%r8 - adcq %r15,%r9 - movq %r8,0(%rdi) - adcq %rax,%r10 - movq %r9,8(%rdi) - adcq %rbx,%r11 - movq %r10,16(%rdi) - adcq %rbp,%r12 - movq %r11,24(%rdi) - adcq %rsi,%r13 - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _mul_mont_384x -.private_extern _mul_mont_384x - -.p2align 5 -_mul_mont_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz mul_mont_384x$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $328,%rsp -.cfi_adjust_cfa_offset 328 - - - movq %rdx,%rbx - movq %rdi,32(%rsp) - movq %rsi,24(%rsp) - movq %rdx,16(%rsp) - movq %rcx,8(%rsp) - movq %r8,0(%rsp) - - - - - leaq 40(%rsp),%rdi - call __mulq_384 - - - leaq 48(%rbx),%rbx - leaq 48(%rsi),%rsi - leaq 40+96(%rsp),%rdi - call __mulq_384 - - - movq 8(%rsp),%rcx - leaq -48(%rsi),%rdx - leaq 40+192+48(%rsp),%rdi - call __addq_mod_384 - - movq 16(%rsp),%rsi - leaq 48(%rsi),%rdx - leaq -48(%rdi),%rdi - call __addq_mod_384 - - leaq (%rdi),%rbx - leaq 48(%rdi),%rsi - call __mulq_384 - - - leaq (%rdi),%rsi - leaq 40(%rsp),%rdx - movq 8(%rsp),%rcx - call __subq_mod_384x384 - - leaq (%rdi),%rsi - leaq -96(%rdi),%rdx - call __subq_mod_384x384 - - - leaq 40(%rsp),%rsi - leaq 40+96(%rsp),%rdx - leaq 40(%rsp),%rdi - call __subq_mod_384x384 - - movq %rcx,%rbx - - - leaq 40(%rsp),%rsi - movq 0(%rsp),%rcx - movq 32(%rsp),%rdi - call __mulq_by_1_mont_384 - call __redq_tail_mont_384 - - - leaq 40+192(%rsp),%rsi - movq 0(%rsp),%rcx - leaq 48(%rdi),%rdi - call __mulq_by_1_mont_384 - call __redq_tail_mont_384 - - leaq 328(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -328-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _sqr_mont_384x -.private_extern _sqr_mont_384x - -.p2align 5 -_sqr_mont_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz sqr_mont_384x$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $136,%rsp -.cfi_adjust_cfa_offset 136 - - - movq %rcx,0(%rsp) - movq %rdx,%rcx - movq %rdi,8(%rsp) - movq %rsi,16(%rsp) - - - leaq 48(%rsi),%rdx 
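
__subq_mod_384, completed earlier in this hunk, shows the branch-free modular-subtraction idiom these files use throughout: subtract limb-wise, smear the final borrow into an all-ones mask ("sbbq %rdx,%rdx"), AND the modulus with that mask, and add it back. In Go with math/bits:

    package montsketch

    import "math/bits"

    // subMod computes (x - y) mod p for inputs already reduced mod p,
    // using the borrow-mask trick of the deleted __subq_mod_384.
    func subMod(x, y, p []uint64) []uint64 {
        k := len(p)
        z := make([]uint64, k)
        var borrow uint64
        for i := 0; i < k; i++ {
            z[i], borrow = bits.Sub64(x[i], y[i], borrow)
        }
        mask := -borrow // 0 if x >= y, all-ones if the subtraction wrapped
        var carry uint64
        for i := 0; i < k; i++ {
            z[i], carry = bits.Add64(z[i], p[i]&mask, carry)
        }
        return z // the carry out cancels the earlier borrow by construction
    }
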
- leaq 32(%rsp),%rdi - call __addq_mod_384 - - - movq 16(%rsp),%rsi - leaq 48(%rsi),%rdx - leaq 32+48(%rsp),%rdi - call __subq_mod_384 - - - movq 16(%rsp),%rsi - leaq 48(%rsi),%rbx - - movq 48(%rsi),%rax - movq 0(%rsi),%r14 - movq 8(%rsi),%r15 - movq 16(%rsi),%r12 - movq 24(%rsi),%r13 - - call __mulq_mont_384 - addq %r14,%r14 - adcq %r15,%r15 - adcq %r8,%r8 - movq %r14,%r12 - adcq %r9,%r9 - movq %r15,%r13 - adcq %r10,%r10 - movq %r8,%rax - adcq %r11,%r11 - movq %r9,%rbx - sbbq %rdx,%rdx - - subq 0(%rcx),%r14 - sbbq 8(%rcx),%r15 - movq %r10,%rbp - sbbq 16(%rcx),%r8 - sbbq 24(%rcx),%r9 - sbbq 32(%rcx),%r10 - movq %r11,%rsi - sbbq 40(%rcx),%r11 - sbbq $0,%rdx - - cmovcq %r12,%r14 - cmovcq %r13,%r15 - cmovcq %rax,%r8 - movq %r14,48(%rdi) - cmovcq %rbx,%r9 - movq %r15,56(%rdi) - cmovcq %rbp,%r10 - movq %r8,64(%rdi) - cmovcq %rsi,%r11 - movq %r9,72(%rdi) - movq %r10,80(%rdi) - movq %r11,88(%rdi) - - leaq 32(%rsp),%rsi - leaq 32+48(%rsp),%rbx - - movq 32+48(%rsp),%rax - movq 32+0(%rsp),%r14 - movq 32+8(%rsp),%r15 - movq 32+16(%rsp),%r12 - movq 32+24(%rsp),%r13 - - call __mulq_mont_384 - - leaq 136(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -136-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _mul_382x -.private_extern _mul_382x - -.p2align 5 -_mul_382x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz mul_382x$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $136,%rsp -.cfi_adjust_cfa_offset 136 - - - leaq 96(%rdi),%rdi - movq %rsi,0(%rsp) - movq %rdx,8(%rsp) - movq %rdi,16(%rsp) - movq %rcx,24(%rsp) - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - addq 48(%rsi),%r8 - adcq 56(%rsi),%r9 - adcq 64(%rsi),%r10 - adcq 72(%rsi),%r11 - adcq 80(%rsi),%r12 - adcq 88(%rsi),%r13 - - movq %r8,32+0(%rsp) - movq %r9,32+8(%rsp) - movq %r10,32+16(%rsp) - movq %r11,32+24(%rsp) - movq %r12,32+32(%rsp) - movq %r13,32+40(%rsp) - - - movq 0(%rdx),%r8 - movq 8(%rdx),%r9 - movq 16(%rdx),%r10 - movq 24(%rdx),%r11 - movq 32(%rdx),%r12 - movq 40(%rdx),%r13 - - addq 48(%rdx),%r8 - adcq 56(%rdx),%r9 - adcq 64(%rdx),%r10 - adcq 72(%rdx),%r11 - adcq 80(%rdx),%r12 - adcq 88(%rdx),%r13 - - movq %r8,32+48(%rsp) - movq %r9,32+56(%rsp) - movq %r10,32+64(%rsp) - movq %r11,32+72(%rsp) - movq %r12,32+80(%rsp) - movq %r13,32+88(%rsp) - - - leaq 32+0(%rsp),%rsi - leaq 32+48(%rsp),%rbx - call __mulq_384 - - - movq 0(%rsp),%rsi - movq 8(%rsp),%rbx - leaq -96(%rdi),%rdi - call __mulq_384 - - - leaq 48(%rsi),%rsi - leaq 48(%rbx),%rbx - leaq 32(%rsp),%rdi - call __mulq_384 - - - movq 16(%rsp),%rsi - leaq 32(%rsp),%rdx - movq 24(%rsp),%rcx - movq %rsi,%rdi - call __subq_mod_384x384 - - - leaq 0(%rdi),%rsi - leaq -96(%rdi),%rdx - call __subq_mod_384x384 - - - leaq -96(%rdi),%rsi - leaq 32(%rsp),%rdx - leaq -96(%rdi),%rdi - call __subq_mod_384x384 - - leaq 136(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 
-.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -136-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _sqr_382x -.private_extern _sqr_382x - -.p2align 5 -_sqr_382x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz sqr_382x$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - pushq %rsi -.cfi_adjust_cfa_offset 8 - - - movq %rdx,%rcx - - - movq 0(%rsi),%r14 - movq 8(%rsi),%r15 - movq 16(%rsi),%rax - movq 24(%rsi),%rbx - movq 32(%rsi),%rbp - movq 40(%rsi),%rdx - - movq %r14,%r8 - addq 48(%rsi),%r14 - movq %r15,%r9 - adcq 56(%rsi),%r15 - movq %rax,%r10 - adcq 64(%rsi),%rax - movq %rbx,%r11 - adcq 72(%rsi),%rbx - movq %rbp,%r12 - adcq 80(%rsi),%rbp - movq %rdx,%r13 - adcq 88(%rsi),%rdx - - movq %r14,0(%rdi) - movq %r15,8(%rdi) - movq %rax,16(%rdi) - movq %rbx,24(%rdi) - movq %rbp,32(%rdi) - movq %rdx,40(%rdi) - - - leaq 48(%rsi),%rdx - leaq 48(%rdi),%rdi - call __subq_mod_384_a_is_loaded - - - leaq (%rdi),%rsi - leaq -48(%rdi),%rbx - leaq -48(%rdi),%rdi - call __mulq_384 - - - movq (%rsp),%rsi - leaq 48(%rsi),%rbx - leaq 96(%rdi),%rdi - call __mulq_384 - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%r12 - movq 40(%rdi),%r13 - movq 48(%rdi),%r14 - movq 56(%rdi),%r15 - movq 64(%rdi),%rax - movq 72(%rdi),%rbx - movq 80(%rdi),%rbp - addq %r8,%r8 - movq 88(%rdi),%rdx - adcq %r9,%r9 - movq %r8,0(%rdi) - adcq %r10,%r10 - movq %r9,8(%rdi) - adcq %r11,%r11 - movq %r10,16(%rdi) - adcq %r12,%r12 - movq %r11,24(%rdi) - adcq %r13,%r13 - movq %r12,32(%rdi) - adcq %r14,%r14 - movq %r13,40(%rdi) - adcq %r15,%r15 - movq %r14,48(%rdi) - adcq %rax,%rax - movq %r15,56(%rdi) - adcq %rbx,%rbx - movq %rax,64(%rdi) - adcq %rbp,%rbp - movq %rbx,72(%rdi) - adcq %rdx,%rdx - movq %rbp,80(%rdi) - movq %rdx,88(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -8*7 - - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _mul_384 -.private_extern _mul_384 - -.p2align 5 -_mul_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz mul_384$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - - - movq %rdx,%rbx - call __mulq_384 - - movq 0(%rsp),%r12 -.cfi_restore %r12 - movq 8(%rsp),%rbx -.cfi_restore %rbx - movq 16(%rsp),%rbp -.cfi_restore %rbp - leaq 24(%rsp),%rsp -.cfi_adjust_cfa_offset -24 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__mulq_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rbx),%rax - - movq %rax,%rbp - mulq 0(%rsi) - movq %rax,0(%rdi) - movq %rbp,%rax - movq %rdx,%rcx - - mulq 8(%rsi) - addq %rax,%rcx - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r8 - - 
mulq 16(%rsi) - addq %rax,%r8 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r9 - - mulq 24(%rsi) - addq %rax,%r9 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq 32(%rsi) - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq 40(%rsi) - addq %rax,%r11 - movq 8(%rbx),%rax - adcq $0,%rdx - movq %rdx,%r12 - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%rcx - movq %rbp,%rax - adcq $0,%rdx - movq %rcx,8(%rdi) - movq %rdx,%rcx - - mulq 8(%rsi) - addq %rax,%r8 - movq %rbp,%rax - adcq $0,%rdx - addq %r8,%rcx - adcq $0,%rdx - movq %rdx,%r8 - - mulq 16(%rsi) - addq %rax,%r9 - movq %rbp,%rax - adcq $0,%rdx - addq %r9,%r8 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 24(%rsi) - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - addq %r10,%r9 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 32(%rsi) - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - addq %r11,%r10 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 40(%rsi) - addq %rax,%r12 - movq 16(%rbx),%rax - adcq $0,%rdx - addq %r12,%r11 - adcq $0,%rdx - movq %rdx,%r12 - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%rcx - movq %rbp,%rax - adcq $0,%rdx - movq %rcx,16(%rdi) - movq %rdx,%rcx - - mulq 8(%rsi) - addq %rax,%r8 - movq %rbp,%rax - adcq $0,%rdx - addq %r8,%rcx - adcq $0,%rdx - movq %rdx,%r8 - - mulq 16(%rsi) - addq %rax,%r9 - movq %rbp,%rax - adcq $0,%rdx - addq %r9,%r8 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 24(%rsi) - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - addq %r10,%r9 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 32(%rsi) - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - addq %r11,%r10 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 40(%rsi) - addq %rax,%r12 - movq 24(%rbx),%rax - adcq $0,%rdx - addq %r12,%r11 - adcq $0,%rdx - movq %rdx,%r12 - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%rcx - movq %rbp,%rax - adcq $0,%rdx - movq %rcx,24(%rdi) - movq %rdx,%rcx - - mulq 8(%rsi) - addq %rax,%r8 - movq %rbp,%rax - adcq $0,%rdx - addq %r8,%rcx - adcq $0,%rdx - movq %rdx,%r8 - - mulq 16(%rsi) - addq %rax,%r9 - movq %rbp,%rax - adcq $0,%rdx - addq %r9,%r8 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 24(%rsi) - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - addq %r10,%r9 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 32(%rsi) - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - addq %r11,%r10 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 40(%rsi) - addq %rax,%r12 - movq 32(%rbx),%rax - adcq $0,%rdx - addq %r12,%r11 - adcq $0,%rdx - movq %rdx,%r12 - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%rcx - movq %rbp,%rax - adcq $0,%rdx - movq %rcx,32(%rdi) - movq %rdx,%rcx - - mulq 8(%rsi) - addq %rax,%r8 - movq %rbp,%rax - adcq $0,%rdx - addq %r8,%rcx - adcq $0,%rdx - movq %rdx,%r8 - - mulq 16(%rsi) - addq %rax,%r9 - movq %rbp,%rax - adcq $0,%rdx - addq %r9,%r8 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 24(%rsi) - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - addq %r10,%r9 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 32(%rsi) - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - addq %r11,%r10 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 40(%rsi) - addq %rax,%r12 - movq 40(%rbx),%rax - adcq $0,%rdx - addq %r12,%r11 - adcq $0,%rdx - movq %rdx,%r12 - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%rcx - movq %rbp,%rax - adcq $0,%rdx - movq %rcx,40(%rdi) - movq %rdx,%rcx - - mulq 8(%rsi) - addq %rax,%r8 - movq %rbp,%rax - adcq $0,%rdx - addq %r8,%rcx - adcq $0,%rdx - movq %rdx,%r8 - - mulq 16(%rsi) - addq %rax,%r9 - movq %rbp,%rax - adcq $0,%rdx - addq %r9,%r8 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 24(%rsi) - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - addq %r10,%r9 - adcq $0,%rdx - movq 
%rdx,%r10 - - mulq 32(%rsi) - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - addq %r11,%r10 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 40(%rsi) - addq %rax,%r12 - movq %rax,%rax - adcq $0,%rdx - addq %r12,%r11 - adcq $0,%rdx - movq %rdx,%r12 - movq %rcx,48(%rdi) - movq %r8,56(%rdi) - movq %r9,64(%rdi) - movq %r10,72(%rdi) - movq %r11,80(%rdi) - movq %r12,88(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _sqr_384 -.private_extern _sqr_384 - -.p2align 5 -_sqr_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz sqr_384$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - call __sqrq_384 - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__sqrq_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%rax - movq 8(%rsi),%r15 - movq 16(%rsi),%rcx - movq 24(%rsi),%rbx - - - movq %rax,%r14 - mulq %r15 - movq %rax,%r9 - movq %r14,%rax - movq 32(%rsi),%rbp - movq %rdx,%r10 - - mulq %rcx - addq %rax,%r10 - movq %r14,%rax - adcq $0,%rdx - movq 40(%rsi),%rsi - movq %rdx,%r11 - - mulq %rbx - addq %rax,%r11 - movq %r14,%rax - adcq $0,%rdx - movq %rdx,%r12 - - mulq %rbp - addq %rax,%r12 - movq %r14,%rax - adcq $0,%rdx - movq %rdx,%r13 - - mulq %rsi - addq %rax,%r13 - movq %r14,%rax - adcq $0,%rdx - movq %rdx,%r14 - - mulq %rax - xorq %r8,%r8 - movq %rax,0(%rdi) - movq %r15,%rax - addq %r9,%r9 - adcq $0,%r8 - addq %rdx,%r9 - adcq $0,%r8 - movq %r9,8(%rdi) - - mulq %rcx - addq %rax,%r11 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r9 - - mulq %rbx - addq %rax,%r12 - movq %r15,%rax - adcq $0,%rdx - addq %r9,%r12 - adcq $0,%rdx - movq %rdx,%r9 - - mulq %rbp - addq %rax,%r13 - movq %r15,%rax - adcq $0,%rdx - addq %r9,%r13 - adcq $0,%rdx - movq %rdx,%r9 - - mulq %rsi - addq %rax,%r14 - movq %r15,%rax - adcq $0,%rdx - addq %r9,%r14 - adcq $0,%rdx - movq %rdx,%r15 - - mulq %rax - xorq %r9,%r9 - addq %rax,%r8 - movq %rcx,%rax - addq %r10,%r10 - adcq %r11,%r11 - adcq $0,%r9 - addq %r8,%r10 - adcq %rdx,%r11 - adcq $0,%r9 - movq %r10,16(%rdi) - - mulq %rbx - addq %rax,%r13 - movq %rcx,%rax - adcq $0,%rdx - movq %r11,24(%rdi) - movq %rdx,%r8 - - mulq %rbp - addq %rax,%r14 - movq %rcx,%rax - adcq $0,%rdx - addq %r8,%r14 - adcq $0,%rdx - movq %rdx,%r8 - - mulq %rsi - addq %rax,%r15 - movq %rcx,%rax - adcq $0,%rdx - addq %r8,%r15 - adcq $0,%rdx - movq %rdx,%rcx - - mulq %rax - xorq %r11,%r11 - addq %rax,%r9 - movq %rbx,%rax - addq %r12,%r12 - adcq %r13,%r13 - adcq $0,%r11 - addq %r9,%r12 - adcq %rdx,%r13 - adcq $0,%r11 - movq %r12,32(%rdi) - - - mulq %rbp - addq %rax,%r15 - movq %rbx,%rax - adcq $0,%rdx - movq %r13,40(%rdi) - movq %rdx,%r8 - - mulq %rsi - addq %rax,%rcx - movq %rbx,%rax - adcq $0,%rdx - addq %r8,%rcx - adcq $0,%rdx - movq %rdx,%rbx - - mulq %rax - xorq %r12,%r12 - addq %rax,%r11 - movq %rbp,%rax - addq %r14,%r14 - adcq %r15,%r15 - adcq $0,%r12 - addq %r11,%r14 - adcq 
%rdx,%r15 - movq %r14,48(%rdi) - adcq $0,%r12 - movq %r15,56(%rdi) - - - mulq %rsi - addq %rax,%rbx - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq %rax - xorq %r13,%r13 - addq %rax,%r12 - movq %rsi,%rax - addq %rcx,%rcx - adcq %rbx,%rbx - adcq $0,%r13 - addq %r12,%rcx - adcq %rdx,%rbx - movq %rcx,64(%rdi) - adcq $0,%r13 - movq %rbx,72(%rdi) - - - mulq %rax - addq %r13,%rax - addq %rbp,%rbp - adcq $0,%rdx - addq %rbp,%rax - adcq $0,%rdx - movq %rax,80(%rdi) - movq %rdx,88(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _sqr_mont_384 -.private_extern _sqr_mont_384 - -.p2align 5 -_sqr_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz sqr_mont_384$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $120,%rsp -.cfi_adjust_cfa_offset 8*15 - - - movq %rcx,96(%rsp) - movq %rdx,104(%rsp) - movq %rdi,112(%rsp) - - movq %rsp,%rdi - call __sqrq_384 - - leaq 0(%rsp),%rsi - movq 96(%rsp),%rcx - movq 104(%rsp),%rbx - movq 112(%rsp),%rdi - call __mulq_by_1_mont_384 - call __redq_tail_mont_384 - - leaq 120(%rsp),%r8 - movq 120(%rsp),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -8*21 - - .byte 0xf3,0xc3 -.cfi_endproc - - - - -.globl _redc_mont_384 -.private_extern _redc_mont_384 - -.p2align 5 -_redc_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz redc_mont_384$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq %rdx,%rbx - call __mulq_by_1_mont_384 - call __redq_tail_mont_384 - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - - - - -.globl _from_mont_384 -.private_extern _from_mont_384 - -.p2align 5 -_from_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz from_mont_384$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq %rdx,%rbx - call __mulq_by_1_mont_384 - - - - - - movq %r15,%rcx - movq %r8,%rdx - movq %r9,%rbp 
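
__sqrq_384, which ends in the hunk above, halves the multiplication count of squaring: it forms each cross product a_i*a_j (i < j) once, doubles the accumulated cross terms, and only then adds the diagonal squares a_i^2. A two-limb sketch of the technique:

    package montsketch

    import "math/bits"

    // sqr2 squares a 2-limb value the way __sqrq_384 squares a 6-limb
    // one: one cross product, doubled, plus the diagonal squares.
    func sqr2(a0, a1 uint64) [4]uint64 {
        var z [4]uint64
        c1hi, c1lo := bits.Mul64(a0, a1) // single cross product a0*a1
        top := c1hi >> 63                // double it: shift the 128-bit
        c1hi = c1hi<<1 | c1lo>>63        // value left by one, keeping
        c1lo <<= 1                       // the bit that spills over
        z[1], z[2], z[3] = c1lo, c1hi, top
        // add the diagonal squares a0^2 (limbs 0-1) and a1^2 (limbs 2-3)
        s0hi, s0lo := bits.Mul64(a0, a0)
        s1hi, s1lo := bits.Mul64(a1, a1)
        var c uint64
        z[0], c = bits.Add64(z[0], s0lo, 0)
        z[1], c = bits.Add64(z[1], s0hi, c)
        z[2], c = bits.Add64(z[2], s1lo, c)
        z[3], _ = bits.Add64(z[3], s1hi, c) // no carry out: result fits
        return z
    }
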
- - subq 0(%rbx),%r14 - sbbq 8(%rbx),%r15 - movq %r10,%r13 - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - sbbq 32(%rbx),%r10 - movq %r11,%rsi - sbbq 40(%rbx),%r11 - - cmovcq %rax,%r14 - cmovcq %rcx,%r15 - cmovcq %rdx,%r8 - movq %r14,0(%rdi) - cmovcq %rbp,%r9 - movq %r15,8(%rdi) - cmovcq %r13,%r10 - movq %r8,16(%rdi) - cmovcq %rsi,%r11 - movq %r9,24(%rdi) - movq %r10,32(%rdi) - movq %r11,40(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__mulq_by_1_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%rax - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %rax,%r14 - imulq %rcx,%rax - movq %rax,%r8 - - mulq 0(%rbx) - addq %rax,%r14 - movq %r8,%rax - adcq %rdx,%r14 - - mulq 8(%rbx) - addq %rax,%r9 - movq %r8,%rax - adcq $0,%rdx - addq %r14,%r9 - adcq $0,%rdx - movq %rdx,%r14 - - mulq 16(%rbx) - addq %rax,%r10 - movq %r8,%rax - adcq $0,%rdx - addq %r14,%r10 - adcq $0,%rdx - movq %rdx,%r14 - - mulq 24(%rbx) - addq %rax,%r11 - movq %r8,%rax - adcq $0,%rdx - movq %r9,%r15 - imulq %rcx,%r9 - addq %r14,%r11 - adcq $0,%rdx - movq %rdx,%r14 - - mulq 32(%rbx) - addq %rax,%r12 - movq %r8,%rax - adcq $0,%rdx - addq %r14,%r12 - adcq $0,%rdx - movq %rdx,%r14 - - mulq 40(%rbx) - addq %rax,%r13 - movq %r9,%rax - adcq $0,%rdx - addq %r14,%r13 - adcq $0,%rdx - movq %rdx,%r14 - - mulq 0(%rbx) - addq %rax,%r15 - movq %r9,%rax - adcq %rdx,%r15 - - mulq 8(%rbx) - addq %rax,%r10 - movq %r9,%rax - adcq $0,%rdx - addq %r15,%r10 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 16(%rbx) - addq %rax,%r11 - movq %r9,%rax - adcq $0,%rdx - addq %r15,%r11 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 24(%rbx) - addq %rax,%r12 - movq %r9,%rax - adcq $0,%rdx - movq %r10,%r8 - imulq %rcx,%r10 - addq %r15,%r12 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 32(%rbx) - addq %rax,%r13 - movq %r9,%rax - adcq $0,%rdx - addq %r15,%r13 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 40(%rbx) - addq %rax,%r14 - movq %r10,%rax - adcq $0,%rdx - addq %r15,%r14 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 0(%rbx) - addq %rax,%r8 - movq %r10,%rax - adcq %rdx,%r8 - - mulq 8(%rbx) - addq %rax,%r11 - movq %r10,%rax - adcq $0,%rdx - addq %r8,%r11 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 16(%rbx) - addq %rax,%r12 - movq %r10,%rax - adcq $0,%rdx - addq %r8,%r12 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 24(%rbx) - addq %rax,%r13 - movq %r10,%rax - adcq $0,%rdx - movq %r11,%r9 - imulq %rcx,%r11 - addq %r8,%r13 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 32(%rbx) - addq %rax,%r14 - movq %r10,%rax - adcq $0,%rdx - addq %r8,%r14 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 40(%rbx) - addq %rax,%r15 - movq %r11,%rax - adcq $0,%rdx - addq %r8,%r15 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 0(%rbx) - addq %rax,%r9 - movq %r11,%rax - adcq %rdx,%r9 - - mulq 8(%rbx) - addq %rax,%r12 - movq %r11,%rax - adcq $0,%rdx - addq %r9,%r12 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 16(%rbx) - addq %rax,%r13 - movq %r11,%rax - adcq $0,%rdx - addq %r9,%r13 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 24(%rbx) - addq %rax,%r14 - movq %r11,%rax - adcq $0,%rdx - movq %r12,%r10 - imulq %rcx,%r12 - addq %r9,%r14 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 32(%rbx) - addq %rax,%r15 - movq %r11,%rax - adcq $0,%rdx - addq %r9,%r15 - adcq $0,%rdx - movq %rdx,%r9 - 
- mulq 40(%rbx) - addq %rax,%r8 - movq %r12,%rax - adcq $0,%rdx - addq %r9,%r8 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 0(%rbx) - addq %rax,%r10 - movq %r12,%rax - adcq %rdx,%r10 - - mulq 8(%rbx) - addq %rax,%r13 - movq %r12,%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 16(%rbx) - addq %rax,%r14 - movq %r12,%rax - adcq $0,%rdx - addq %r10,%r14 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 24(%rbx) - addq %rax,%r15 - movq %r12,%rax - adcq $0,%rdx - movq %r13,%r11 - imulq %rcx,%r13 - addq %r10,%r15 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 32(%rbx) - addq %rax,%r8 - movq %r12,%rax - adcq $0,%rdx - addq %r10,%r8 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 40(%rbx) - addq %rax,%r9 - movq %r13,%rax - adcq $0,%rdx - addq %r10,%r9 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 0(%rbx) - addq %rax,%r11 - movq %r13,%rax - adcq %rdx,%r11 - - mulq 8(%rbx) - addq %rax,%r14 - movq %r13,%rax - adcq $0,%rdx - addq %r11,%r14 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 16(%rbx) - addq %rax,%r15 - movq %r13,%rax - adcq $0,%rdx - addq %r11,%r15 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 24(%rbx) - addq %rax,%r8 - movq %r13,%rax - adcq $0,%rdx - addq %r11,%r8 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 32(%rbx) - addq %rax,%r9 - movq %r13,%rax - adcq $0,%rdx - addq %r11,%r9 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 40(%rbx) - addq %rax,%r10 - movq %r14,%rax - adcq $0,%rdx - addq %r11,%r10 - adcq $0,%rdx - movq %rdx,%r11 - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__redq_tail_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - addq 48(%rsi),%r14 - movq %r14,%rax - adcq 56(%rsi),%r15 - adcq 64(%rsi),%r8 - adcq 72(%rsi),%r9 - movq %r15,%rcx - adcq 80(%rsi),%r10 - adcq 88(%rsi),%r11 - sbbq %r12,%r12 - - - - - movq %r8,%rdx - movq %r9,%rbp - - subq 0(%rbx),%r14 - sbbq 8(%rbx),%r15 - movq %r10,%r13 - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - sbbq 32(%rbx),%r10 - movq %r11,%rsi - sbbq 40(%rbx),%r11 - sbbq $0,%r12 - - cmovcq %rax,%r14 - cmovcq %rcx,%r15 - cmovcq %rdx,%r8 - movq %r14,0(%rdi) - cmovcq %rbp,%r9 - movq %r15,8(%rdi) - cmovcq %r13,%r10 - movq %r8,16(%rdi) - cmovcq %rsi,%r11 - movq %r9,24(%rdi) - movq %r10,32(%rdi) - movq %r11,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _sgn0_pty_mont_384 -.private_extern _sgn0_pty_mont_384 - -.p2align 5 -_sgn0_pty_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz sgn0_pty_mont_384$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq %rsi,%rbx - leaq 0(%rdi),%rsi - movq %rdx,%rcx - call __mulq_by_1_mont_384 - - xorq %rax,%rax - movq %r14,%r13 - addq %r14,%r14 - adcq %r15,%r15 - adcq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq $0,%rax - - subq 0(%rbx),%r14 - sbbq 8(%rbx),%r15 - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - sbbq 32(%rbx),%r10 - sbbq 40(%rbx),%r11 - sbbq $0,%rax - - notq %rax - andq $1,%r13 - andq $2,%rax - orq %r13,%rax - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 
56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _sgn0_pty_mont_384x -.private_extern _sgn0_pty_mont_384x - -.p2align 5 -_sgn0_pty_mont_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz sgn0_pty_mont_384x$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq %rsi,%rbx - leaq 48(%rdi),%rsi - movq %rdx,%rcx - call __mulq_by_1_mont_384 - - movq %r14,%r12 - orq %r15,%r14 - orq %r8,%r14 - orq %r9,%r14 - orq %r10,%r14 - orq %r11,%r14 - - leaq 0(%rdi),%rsi - xorq %rdi,%rdi - movq %r12,%r13 - addq %r12,%r12 - adcq %r15,%r15 - adcq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq $0,%rdi - - subq 0(%rbx),%r12 - sbbq 8(%rbx),%r15 - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - sbbq 32(%rbx),%r10 - sbbq 40(%rbx),%r11 - sbbq $0,%rdi - - movq %r14,0(%rsp) - notq %rdi - andq $1,%r13 - andq $2,%rdi - orq %r13,%rdi - - call __mulq_by_1_mont_384 - - movq %r14,%r12 - orq %r15,%r14 - orq %r8,%r14 - orq %r9,%r14 - orq %r10,%r14 - orq %r11,%r14 - - xorq %rax,%rax - movq %r12,%r13 - addq %r12,%r12 - adcq %r15,%r15 - adcq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq $0,%rax - - subq 0(%rbx),%r12 - sbbq 8(%rbx),%r15 - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - sbbq 32(%rbx),%r10 - sbbq 40(%rbx),%r11 - sbbq $0,%rax - - movq 0(%rsp),%r12 - - notq %rax - - testq %r14,%r14 - cmovzq %rdi,%r13 - - testq %r12,%r12 - cmovnzq %rdi,%rax - - andq $1,%r13 - andq $2,%rax - orq %r13,%rax - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _mul_mont_384 -.private_extern _mul_mont_384 - -.p2align 5 -_mul_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz mul_mont_384$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $24,%rsp -.cfi_adjust_cfa_offset 8*3 - - - movq 0(%rdx),%rax - movq 0(%rsi),%r14 - movq 8(%rsi),%r15 - movq 16(%rsi),%r12 - movq 24(%rsi),%r13 - movq %rdx,%rbx - movq %r8,0(%rsp) - movq %rdi,8(%rsp) - - call __mulq_mont_384 - - movq 24(%rsp),%r15 -.cfi_restore %r15 - movq 32(%rsp),%r14 -.cfi_restore %r14 - movq 40(%rsp),%r13 -.cfi_restore %r13 - movq 48(%rsp),%r12 -.cfi_restore %r12 - movq 56(%rsp),%rbx -.cfi_restore %rbx - movq 64(%rsp),%rbp -.cfi_restore %rbp - leaq 72(%rsp),%rsp -.cfi_adjust_cfa_offset -72 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__mulq_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq %rax,%rdi - mulq %r14 - movq %rax,%r8 - movq %rdi,%rax - movq %rdx,%r9 - - mulq %r15 - addq %rax,%r9 - movq 
%rdi,%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %r12 - addq %rax,%r10 - movq %rdi,%rax - adcq $0,%rdx - movq %rdx,%r11 - - movq %r8,%rbp - imulq 8(%rsp),%r8 - - mulq %r13 - addq %rax,%r11 - movq %rdi,%rax - adcq $0,%rdx - movq %rdx,%r12 - - mulq 32(%rsi) - addq %rax,%r12 - movq %rdi,%rax - adcq $0,%rdx - movq %rdx,%r13 - - mulq 40(%rsi) - addq %rax,%r13 - movq %r8,%rax - adcq $0,%rdx - xorq %r15,%r15 - movq %rdx,%r14 - - mulq 0(%rcx) - addq %rax,%rbp - movq %r8,%rax - adcq %rdx,%rbp - - mulq 8(%rcx) - addq %rax,%r9 - movq %r8,%rax - adcq $0,%rdx - addq %rbp,%r9 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r10 - movq %r8,%rax - adcq $0,%rdx - addq %rbp,%r10 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - addq %rbp,%r11 - adcq $0,%rdx - addq %rax,%r11 - movq %r8,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 32(%rcx) - addq %rax,%r12 - movq %r8,%rax - adcq $0,%rdx - addq %rbp,%r12 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 40(%rcx) - addq %rax,%r13 - movq 8(%rbx),%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq %rdx,%r14 - adcq $0,%r15 - - movq %rax,%rdi - mulq 0(%rsi) - addq %rax,%r9 - movq %rdi,%rax - adcq $0,%rdx - movq %rdx,%r8 - - mulq 8(%rsi) - addq %rax,%r10 - movq %rdi,%rax - adcq $0,%rdx - addq %r8,%r10 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 16(%rsi) - addq %rax,%r11 - movq %rdi,%rax - adcq $0,%rdx - addq %r8,%r11 - adcq $0,%rdx - movq %rdx,%r8 - - movq %r9,%rbp - imulq 8(%rsp),%r9 - - mulq 24(%rsi) - addq %rax,%r12 - movq %rdi,%rax - adcq $0,%rdx - addq %r8,%r12 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 32(%rsi) - addq %rax,%r13 - movq %rdi,%rax - adcq $0,%rdx - addq %r8,%r13 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 40(%rsi) - addq %r8,%r14 - adcq $0,%rdx - xorq %r8,%r8 - addq %rax,%r14 - movq %r9,%rax - adcq %rdx,%r15 - adcq $0,%r8 - - mulq 0(%rcx) - addq %rax,%rbp - movq %r9,%rax - adcq %rdx,%rbp - - mulq 8(%rcx) - addq %rax,%r10 - movq %r9,%rax - adcq $0,%rdx - addq %rbp,%r10 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r11 - movq %r9,%rax - adcq $0,%rdx - addq %rbp,%r11 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - addq %rbp,%r12 - adcq $0,%rdx - addq %rax,%r12 - movq %r9,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 32(%rcx) - addq %rax,%r13 - movq %r9,%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 40(%rcx) - addq %rax,%r14 - movq 16(%rbx),%rax - adcq $0,%rdx - addq %rbp,%r14 - adcq %rdx,%r15 - adcq $0,%r8 - - movq %rax,%rdi - mulq 0(%rsi) - addq %rax,%r10 - movq %rdi,%rax - adcq $0,%rdx - movq %rdx,%r9 - - mulq 8(%rsi) - addq %rax,%r11 - movq %rdi,%rax - adcq $0,%rdx - addq %r9,%r11 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 16(%rsi) - addq %rax,%r12 - movq %rdi,%rax - adcq $0,%rdx - addq %r9,%r12 - adcq $0,%rdx - movq %rdx,%r9 - - movq %r10,%rbp - imulq 8(%rsp),%r10 - - mulq 24(%rsi) - addq %rax,%r13 - movq %rdi,%rax - adcq $0,%rdx - addq %r9,%r13 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 32(%rsi) - addq %rax,%r14 - movq %rdi,%rax - adcq $0,%rdx - addq %r9,%r14 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 40(%rsi) - addq %r9,%r15 - adcq $0,%rdx - xorq %r9,%r9 - addq %rax,%r15 - movq %r10,%rax - adcq %rdx,%r8 - adcq $0,%r9 - - mulq 0(%rcx) - addq %rax,%rbp - movq %r10,%rax - adcq %rdx,%rbp - - mulq 8(%rcx) - addq %rax,%r11 - movq %r10,%rax - adcq $0,%rdx - addq %rbp,%r11 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r12 - movq %r10,%rax - adcq $0,%rdx - addq %rbp,%r12 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - addq %rbp,%r13 - adcq $0,%rdx - addq %rax,%r13 - movq %r10,%rax - adcq $0,%rdx - 
movq %rdx,%rbp - - mulq 32(%rcx) - addq %rax,%r14 - movq %r10,%rax - adcq $0,%rdx - addq %rbp,%r14 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 40(%rcx) - addq %rax,%r15 - movq 24(%rbx),%rax - adcq $0,%rdx - addq %rbp,%r15 - adcq %rdx,%r8 - adcq $0,%r9 - - movq %rax,%rdi - mulq 0(%rsi) - addq %rax,%r11 - movq %rdi,%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq 8(%rsi) - addq %rax,%r12 - movq %rdi,%rax - adcq $0,%rdx - addq %r10,%r12 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 16(%rsi) - addq %rax,%r13 - movq %rdi,%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %rdx,%r10 - - movq %r11,%rbp - imulq 8(%rsp),%r11 - - mulq 24(%rsi) - addq %rax,%r14 - movq %rdi,%rax - adcq $0,%rdx - addq %r10,%r14 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 32(%rsi) - addq %rax,%r15 - movq %rdi,%rax - adcq $0,%rdx - addq %r10,%r15 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 40(%rsi) - addq %r10,%r8 - adcq $0,%rdx - xorq %r10,%r10 - addq %rax,%r8 - movq %r11,%rax - adcq %rdx,%r9 - adcq $0,%r10 - - mulq 0(%rcx) - addq %rax,%rbp - movq %r11,%rax - adcq %rdx,%rbp - - mulq 8(%rcx) - addq %rax,%r12 - movq %r11,%rax - adcq $0,%rdx - addq %rbp,%r12 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r13 - movq %r11,%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - addq %rbp,%r14 - adcq $0,%rdx - addq %rax,%r14 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 32(%rcx) - addq %rax,%r15 - movq %r11,%rax - adcq $0,%rdx - addq %rbp,%r15 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 40(%rcx) - addq %rax,%r8 - movq 32(%rbx),%rax - adcq $0,%rdx - addq %rbp,%r8 - adcq %rdx,%r9 - adcq $0,%r10 - - movq %rax,%rdi - mulq 0(%rsi) - addq %rax,%r12 - movq %rdi,%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq 8(%rsi) - addq %rax,%r13 - movq %rdi,%rax - adcq $0,%rdx - addq %r11,%r13 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 16(%rsi) - addq %rax,%r14 - movq %rdi,%rax - adcq $0,%rdx - addq %r11,%r14 - adcq $0,%rdx - movq %rdx,%r11 - - movq %r12,%rbp - imulq 8(%rsp),%r12 - - mulq 24(%rsi) - addq %rax,%r15 - movq %rdi,%rax - adcq $0,%rdx - addq %r11,%r15 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 32(%rsi) - addq %rax,%r8 - movq %rdi,%rax - adcq $0,%rdx - addq %r11,%r8 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 40(%rsi) - addq %r11,%r9 - adcq $0,%rdx - xorq %r11,%r11 - addq %rax,%r9 - movq %r12,%rax - adcq %rdx,%r10 - adcq $0,%r11 - - mulq 0(%rcx) - addq %rax,%rbp - movq %r12,%rax - adcq %rdx,%rbp - - mulq 8(%rcx) - addq %rax,%r13 - movq %r12,%rax - adcq $0,%rdx - addq %rbp,%r13 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r14 - movq %r12,%rax - adcq $0,%rdx - addq %rbp,%r14 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - addq %rbp,%r15 - adcq $0,%rdx - addq %rax,%r15 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 32(%rcx) - addq %rax,%r8 - movq %r12,%rax - adcq $0,%rdx - addq %rbp,%r8 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 40(%rcx) - addq %rax,%r9 - movq 40(%rbx),%rax - adcq $0,%rdx - addq %rbp,%r9 - adcq %rdx,%r10 - adcq $0,%r11 - - movq %rax,%rdi - mulq 0(%rsi) - addq %rax,%r13 - movq %rdi,%rax - adcq $0,%rdx - movq %rdx,%r12 - - mulq 8(%rsi) - addq %rax,%r14 - movq %rdi,%rax - adcq $0,%rdx - addq %r12,%r14 - adcq $0,%rdx - movq %rdx,%r12 - - mulq 16(%rsi) - addq %rax,%r15 - movq %rdi,%rax - adcq $0,%rdx - addq %r12,%r15 - adcq $0,%rdx - movq %rdx,%r12 - - movq %r13,%rbp - imulq 8(%rsp),%r13 - - mulq 24(%rsi) - addq %rax,%r8 - movq %rdi,%rax - adcq $0,%rdx - addq %r12,%r8 - adcq $0,%rdx - movq %rdx,%r12 - - mulq 32(%rsi) - addq %rax,%r9 - movq %rdi,%rax - adcq 
$0,%rdx - addq %r12,%r9 - adcq $0,%rdx - movq %rdx,%r12 - - mulq 40(%rsi) - addq %r12,%r10 - adcq $0,%rdx - xorq %r12,%r12 - addq %rax,%r10 - movq %r13,%rax - adcq %rdx,%r11 - adcq $0,%r12 - - mulq 0(%rcx) - addq %rax,%rbp - movq %r13,%rax - adcq %rdx,%rbp - - mulq 8(%rcx) - addq %rax,%r14 - movq %r13,%rax - adcq $0,%rdx - addq %rbp,%r14 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rcx) - addq %rax,%r15 - movq %r13,%rax - adcq $0,%rdx - addq %rbp,%r15 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 24(%rcx) - addq %rbp,%r8 - adcq $0,%rdx - addq %rax,%r8 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 32(%rcx) - addq %rax,%r9 - movq %r13,%rax - adcq $0,%rdx - addq %rbp,%r9 - adcq $0,%rdx - movq %rdx,%rbp - - mulq 40(%rcx) - addq %rax,%r10 - movq %r14,%rax - adcq $0,%rdx - addq %rbp,%r10 - adcq %rdx,%r11 - adcq $0,%r12 - - - - - movq 16(%rsp),%rdi - subq 0(%rcx),%r14 - movq %r15,%rdx - sbbq 8(%rcx),%r15 - movq %r8,%rbx - sbbq 16(%rcx),%r8 - movq %r9,%rsi - sbbq 24(%rcx),%r9 - movq %r10,%rbp - sbbq 32(%rcx),%r10 - movq %r11,%r13 - sbbq 40(%rcx),%r11 - sbbq $0,%r12 - - cmovcq %rax,%r14 - cmovcq %rdx,%r15 - cmovcq %rbx,%r8 - movq %r14,0(%rdi) - cmovcq %rsi,%r9 - movq %r15,8(%rdi) - cmovcq %rbp,%r10 - movq %r8,16(%rdi) - cmovcq %r13,%r11 - movq %r9,24(%rdi) - movq %r10,32(%rdi) - movq %r11,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _sqr_n_mul_mont_384 -.private_extern _sqr_n_mul_mont_384 - -.p2align 5 -_sqr_n_mul_mont_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz sqr_n_mul_mont_384$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $136,%rsp -.cfi_adjust_cfa_offset 8*17 - - - movq %r8,0(%rsp) - movq %rdi,8(%rsp) - movq %rcx,16(%rsp) - leaq 32(%rsp),%rdi - movq %r9,24(%rsp) - movq (%r9),%xmm2 - -L$oop_sqr_384: - movd %edx,%xmm1 - - call __sqrq_384 - - leaq 0(%rdi),%rsi - movq 0(%rsp),%rcx - movq 16(%rsp),%rbx - call __mulq_by_1_mont_384 - call __redq_tail_mont_384 - - movd %xmm1,%edx - leaq 0(%rdi),%rsi - decl %edx - jnz L$oop_sqr_384 - -.byte 102,72,15,126,208 - movq %rbx,%rcx - movq 24(%rsp),%rbx - - - - - - - movq %r8,%r12 - movq %r9,%r13 - - call __mulq_mont_384 - - leaq 136(%rsp),%r8 - movq 136(%rsp),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -8*23 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _sqr_n_mul_mont_383 -.private_extern _sqr_n_mul_mont_383 - -.p2align 5 -_sqr_n_mul_mont_383: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz sqr_n_mul_mont_383$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $136,%rsp -.cfi_adjust_cfa_offset 8*17 - - - movq %r8,0(%rsp) - movq 
%rdi,8(%rsp) - movq %rcx,16(%rsp) - leaq 32(%rsp),%rdi - movq %r9,24(%rsp) - movq (%r9),%xmm2 - -L$oop_sqr_383: - movd %edx,%xmm1 - - call __sqrq_384 - - leaq 0(%rdi),%rsi - movq 0(%rsp),%rcx - movq 16(%rsp),%rbx - call __mulq_by_1_mont_384 - - movd %xmm1,%edx - addq 48(%rsi),%r14 - adcq 56(%rsi),%r15 - adcq 64(%rsi),%r8 - adcq 72(%rsi),%r9 - adcq 80(%rsi),%r10 - adcq 88(%rsi),%r11 - leaq 0(%rdi),%rsi - - movq %r14,0(%rdi) - movq %r15,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq %r10,32(%rdi) - movq %r11,40(%rdi) - - decl %edx - jnz L$oop_sqr_383 - -.byte 102,72,15,126,208 - movq %rbx,%rcx - movq 24(%rsp),%rbx - - - - - - - movq %r8,%r12 - movq %r9,%r13 - - call __mulq_mont_384 - - leaq 136(%rsp),%r8 - movq 136(%rsp),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -8*23 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__mulq_mont_383_nonred: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq %rax,%rbp - mulq %r14 - movq %rax,%r8 - movq %rbp,%rax - movq %rdx,%r9 - - mulq %r15 - addq %rax,%r9 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %r12 - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r11 - - movq %r8,%r15 - imulq 8(%rsp),%r8 - - mulq %r13 - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r12 - - mulq 32(%rsi) - addq %rax,%r12 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r13 - - mulq 40(%rsi) - addq %rax,%r13 - movq %r8,%rax - adcq $0,%rdx - movq %rdx,%r14 - - mulq 0(%rcx) - addq %rax,%r15 - movq %r8,%rax - adcq %rdx,%r15 - - mulq 8(%rcx) - addq %rax,%r9 - movq %r8,%rax - adcq $0,%rdx - addq %r15,%r9 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 16(%rcx) - addq %rax,%r10 - movq %r8,%rax - adcq $0,%rdx - addq %r15,%r10 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 24(%rcx) - addq %r15,%r11 - adcq $0,%rdx - addq %rax,%r11 - movq %r8,%rax - adcq $0,%rdx - movq %rdx,%r15 - - mulq 32(%rcx) - addq %rax,%r12 - movq %r8,%rax - adcq $0,%rdx - addq %r15,%r12 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 40(%rcx) - addq %rax,%r13 - movq 8(%rbx),%rax - adcq $0,%rdx - addq %r15,%r13 - adcq %rdx,%r14 - - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%r9 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r15 - - mulq 8(%rsi) - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - addq %r15,%r10 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 16(%rsi) - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - addq %r15,%r11 - adcq $0,%rdx - movq %rdx,%r15 - - movq %r9,%r8 - imulq 8(%rsp),%r9 - - mulq 24(%rsi) - addq %rax,%r12 - movq %rbp,%rax - adcq $0,%rdx - addq %r15,%r12 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 32(%rsi) - addq %rax,%r13 - movq %rbp,%rax - adcq $0,%rdx - addq %r15,%r13 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 40(%rsi) - addq %r15,%r14 - adcq $0,%rdx - addq %rax,%r14 - movq %r9,%rax - adcq $0,%rdx - movq %rdx,%r15 - - mulq 0(%rcx) - addq %rax,%r8 - movq %r9,%rax - adcq %rdx,%r8 - - mulq 8(%rcx) - addq %rax,%r10 - movq %r9,%rax - adcq $0,%rdx - addq %r8,%r10 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 16(%rcx) - addq %rax,%r11 - movq %r9,%rax - adcq $0,%rdx - addq %r8,%r11 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 24(%rcx) - addq %r8,%r12 - adcq $0,%rdx - addq %rax,%r12 - movq %r9,%rax - adcq $0,%rdx - movq %rdx,%r8 - - mulq 32(%rcx) - addq %rax,%r13 - movq %r9,%rax - adcq $0,%rdx - addq %r8,%r13 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 40(%rcx) - addq %rax,%r14 - movq 
16(%rbx),%rax - adcq $0,%rdx - addq %r8,%r14 - adcq %rdx,%r15 - - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r8 - - mulq 8(%rsi) - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - addq %r8,%r11 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 16(%rsi) - addq %rax,%r12 - movq %rbp,%rax - adcq $0,%rdx - addq %r8,%r12 - adcq $0,%rdx - movq %rdx,%r8 - - movq %r10,%r9 - imulq 8(%rsp),%r10 - - mulq 24(%rsi) - addq %rax,%r13 - movq %rbp,%rax - adcq $0,%rdx - addq %r8,%r13 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 32(%rsi) - addq %rax,%r14 - movq %rbp,%rax - adcq $0,%rdx - addq %r8,%r14 - adcq $0,%rdx - movq %rdx,%r8 - - mulq 40(%rsi) - addq %r8,%r15 - adcq $0,%rdx - addq %rax,%r15 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r8 - - mulq 0(%rcx) - addq %rax,%r9 - movq %r10,%rax - adcq %rdx,%r9 - - mulq 8(%rcx) - addq %rax,%r11 - movq %r10,%rax - adcq $0,%rdx - addq %r9,%r11 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 16(%rcx) - addq %rax,%r12 - movq %r10,%rax - adcq $0,%rdx - addq %r9,%r12 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 24(%rcx) - addq %r9,%r13 - adcq $0,%rdx - addq %rax,%r13 - movq %r10,%rax - adcq $0,%rdx - movq %rdx,%r9 - - mulq 32(%rcx) - addq %rax,%r14 - movq %r10,%rax - adcq $0,%rdx - addq %r9,%r14 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 40(%rcx) - addq %rax,%r15 - movq 24(%rbx),%rax - adcq $0,%rdx - addq %r9,%r15 - adcq %rdx,%r8 - - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r9 - - mulq 8(%rsi) - addq %rax,%r12 - movq %rbp,%rax - adcq $0,%rdx - addq %r9,%r12 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 16(%rsi) - addq %rax,%r13 - movq %rbp,%rax - adcq $0,%rdx - addq %r9,%r13 - adcq $0,%rdx - movq %rdx,%r9 - - movq %r11,%r10 - imulq 8(%rsp),%r11 - - mulq 24(%rsi) - addq %rax,%r14 - movq %rbp,%rax - adcq $0,%rdx - addq %r9,%r14 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 32(%rsi) - addq %rax,%r15 - movq %rbp,%rax - adcq $0,%rdx - addq %r9,%r15 - adcq $0,%rdx - movq %rdx,%r9 - - mulq 40(%rsi) - addq %r9,%r8 - adcq $0,%rdx - addq %rax,%r8 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r9 - - mulq 0(%rcx) - addq %rax,%r10 - movq %r11,%rax - adcq %rdx,%r10 - - mulq 8(%rcx) - addq %rax,%r12 - movq %r11,%rax - adcq $0,%rdx - addq %r10,%r12 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 16(%rcx) - addq %rax,%r13 - movq %r11,%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 24(%rcx) - addq %r10,%r14 - adcq $0,%rdx - addq %rax,%r14 - movq %r11,%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq 32(%rcx) - addq %rax,%r15 - movq %r11,%rax - adcq $0,%rdx - addq %r10,%r15 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 40(%rcx) - addq %rax,%r8 - movq 32(%rbx),%rax - adcq $0,%rdx - addq %r10,%r8 - adcq %rdx,%r9 - - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%r12 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq 8(%rsi) - addq %rax,%r13 - movq %rbp,%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 16(%rsi) - addq %rax,%r14 - movq %rbp,%rax - adcq $0,%rdx - addq %r10,%r14 - adcq $0,%rdx - movq %rdx,%r10 - - movq %r12,%r11 - imulq 8(%rsp),%r12 - - mulq 24(%rsi) - addq %rax,%r15 - movq %rbp,%rax - adcq $0,%rdx - addq %r10,%r15 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 32(%rsi) - addq %rax,%r8 - movq %rbp,%rax - adcq $0,%rdx - addq %r10,%r8 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 40(%rsi) - addq %r10,%r9 - adcq $0,%rdx - addq %rax,%r9 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq 0(%rcx) - addq %rax,%r11 - movq %r12,%rax - adcq %rdx,%r11 - - mulq 8(%rcx) - addq %rax,%r13 - 
movq %r12,%rax - adcq $0,%rdx - addq %r11,%r13 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 16(%rcx) - addq %rax,%r14 - movq %r12,%rax - adcq $0,%rdx - addq %r11,%r14 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 24(%rcx) - addq %r11,%r15 - adcq $0,%rdx - addq %rax,%r15 - movq %r12,%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq 32(%rcx) - addq %rax,%r8 - movq %r12,%rax - adcq $0,%rdx - addq %r11,%r8 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 40(%rcx) - addq %rax,%r9 - movq 40(%rbx),%rax - adcq $0,%rdx - addq %r11,%r9 - adcq %rdx,%r10 - - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%r13 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq 8(%rsi) - addq %rax,%r14 - movq %rbp,%rax - adcq $0,%rdx - addq %r11,%r14 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 16(%rsi) - addq %rax,%r15 - movq %rbp,%rax - adcq $0,%rdx - addq %r11,%r15 - adcq $0,%rdx - movq %rdx,%r11 - - movq %r13,%r12 - imulq 8(%rsp),%r13 - - mulq 24(%rsi) - addq %rax,%r8 - movq %rbp,%rax - adcq $0,%rdx - addq %r11,%r8 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 32(%rsi) - addq %rax,%r9 - movq %rbp,%rax - adcq $0,%rdx - addq %r11,%r9 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 40(%rsi) - addq %r11,%r10 - adcq $0,%rdx - addq %rax,%r10 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq 0(%rcx) - addq %rax,%r12 - movq %r13,%rax - adcq %rdx,%r12 - - mulq 8(%rcx) - addq %rax,%r14 - movq %r13,%rax - adcq $0,%rdx - addq %r12,%r14 - adcq $0,%rdx - movq %rdx,%r12 - - mulq 16(%rcx) - addq %rax,%r15 - movq %r13,%rax - adcq $0,%rdx - addq %r12,%r15 - adcq $0,%rdx - movq %rdx,%r12 - - mulq 24(%rcx) - addq %r12,%r8 - adcq $0,%rdx - addq %rax,%r8 - movq %r13,%rax - adcq $0,%rdx - movq %rdx,%r12 - - mulq 32(%rcx) - addq %rax,%r9 - movq %r13,%rax - adcq $0,%rdx - addq %r12,%r9 - adcq $0,%rdx - movq %rdx,%r12 - - mulq 40(%rcx) - addq %rax,%r10 - movq %r14,%rax - adcq $0,%rdx - addq %r12,%r10 - adcq %rdx,%r11 - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _sqr_mont_382x -.private_extern _sqr_mont_382x - -.p2align 5 -_sqr_mont_382x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -#ifdef __BLST_PORTABLE__ - testl $1,___blst_platform_cap(%rip) - jnz sqr_mont_382x$1 -#endif - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $136,%rsp -.cfi_adjust_cfa_offset 136 - - - movq %rcx,0(%rsp) - movq %rdx,%rcx - movq %rsi,16(%rsp) - movq %rdi,24(%rsp) - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - movq %r8,%r14 - addq 48(%rsi),%r8 - movq %r9,%r15 - adcq 56(%rsi),%r9 - movq %r10,%rax - adcq 64(%rsi),%r10 - movq %r11,%rdx - adcq 72(%rsi),%r11 - movq %r12,%rbx - adcq 80(%rsi),%r12 - movq %r13,%rbp - adcq 88(%rsi),%r13 - - subq 48(%rsi),%r14 - sbbq 56(%rsi),%r15 - sbbq 64(%rsi),%rax - sbbq 72(%rsi),%rdx - sbbq 80(%rsi),%rbx - sbbq 88(%rsi),%rbp - sbbq %rdi,%rdi - - movq %r8,32+0(%rsp) - movq %r9,32+8(%rsp) - movq %r10,32+16(%rsp) - movq %r11,32+24(%rsp) - movq %r12,32+32(%rsp) - movq %r13,32+40(%rsp) - - movq %r14,32+48(%rsp) - movq %r15,32+56(%rsp) - movq %rax,32+64(%rsp) - movq %rdx,32+72(%rsp) - movq %rbx,32+80(%rsp) - movq %rbp,32+88(%rsp) - movq %rdi,32+96(%rsp) - - - - leaq 48(%rsi),%rbx - - movq 48(%rsi),%rax - movq 0(%rsi),%r14 - movq 8(%rsi),%r15 - movq 16(%rsi),%r12 - movq 
24(%rsi),%r13 - - movq 24(%rsp),%rdi - call __mulq_mont_383_nonred - addq %r14,%r14 - adcq %r15,%r15 - adcq %r8,%r8 - adcq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - - movq %r14,48(%rdi) - movq %r15,56(%rdi) - movq %r8,64(%rdi) - movq %r9,72(%rdi) - movq %r10,80(%rdi) - movq %r11,88(%rdi) - - leaq 32(%rsp),%rsi - leaq 32+48(%rsp),%rbx - - movq 32+48(%rsp),%rax - movq 32+0(%rsp),%r14 - movq 32+8(%rsp),%r15 - movq 32+16(%rsp),%r12 - movq 32+24(%rsp),%r13 - - call __mulq_mont_383_nonred - movq 32+96(%rsp),%rsi - movq 32+0(%rsp),%r12 - movq 32+8(%rsp),%r13 - andq %rsi,%r12 - movq 32+16(%rsp),%rax - andq %rsi,%r13 - movq 32+24(%rsp),%rbx - andq %rsi,%rax - movq 32+32(%rsp),%rbp - andq %rsi,%rbx - andq %rsi,%rbp - andq 32+40(%rsp),%rsi - - subq %r12,%r14 - movq 0(%rcx),%r12 - sbbq %r13,%r15 - movq 8(%rcx),%r13 - sbbq %rax,%r8 - movq 16(%rcx),%rax - sbbq %rbx,%r9 - movq 24(%rcx),%rbx - sbbq %rbp,%r10 - movq 32(%rcx),%rbp - sbbq %rsi,%r11 - sbbq %rsi,%rsi - - andq %rsi,%r12 - andq %rsi,%r13 - andq %rsi,%rax - andq %rsi,%rbx - andq %rsi,%rbp - andq 40(%rcx),%rsi - - addq %r12,%r14 - adcq %r13,%r15 - adcq %rax,%r8 - adcq %rbx,%r9 - adcq %rbp,%r10 - adcq %rsi,%r11 - - movq %r14,0(%rdi) - movq %r15,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq %r10,32(%rdi) - movq %r11,40(%rdi) - leaq 136(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -136-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc - diff --git a/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s b/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s deleted file mode 100644 index ae9a76b739c..00000000000 --- a/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s +++ /dev/null @@ -1,623 +0,0 @@ -.text - -.globl _mulx_mont_sparse_256 -.private_extern _mulx_mont_sparse_256 - -.p2align 5 -_mulx_mont_sparse_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -mul_mont_sparse_256$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r14 - movq 8(%rsi),%r15 - movq 16(%rsi),%rbp - movq 24(%rsi),%r9 - leaq -128(%rsi),%rsi - leaq -128(%rcx),%rcx - - mulxq %r14,%rax,%r11 - call __mulx_mont_sparse_256 - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _sqrx_mont_sparse_256 -.private_extern _sqrx_mont_sparse_256 - -.p2align 5 -_sqrx_mont_sparse_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -sqr_mont_sparse_256$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 
-.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq %rsi,%rbx - movq %rcx,%r8 - movq %rdx,%rcx - movq 0(%rsi),%rdx - movq 8(%rsi),%r15 - movq 16(%rsi),%rbp - movq 24(%rsi),%r9 - leaq -128(%rbx),%rsi - leaq -128(%rcx),%rcx - - mulxq %rdx,%rax,%r11 - call __mulx_mont_sparse_256 - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__mulx_mont_sparse_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - mulxq %r15,%r15,%r12 - mulxq %rbp,%rbp,%r13 - addq %r15,%r11 - mulxq %r9,%r9,%r14 - movq 8(%rbx),%rdx - adcq %rbp,%r12 - adcq %r9,%r13 - adcq $0,%r14 - - movq %rax,%r10 - imulq %r8,%rax - - - xorq %r15,%r15 - mulxq 0+128(%rsi),%rbp,%r9 - adoxq %rbp,%r11 - adcxq %r9,%r12 - - mulxq 8+128(%rsi),%rbp,%r9 - adoxq %rbp,%r12 - adcxq %r9,%r13 - - mulxq 16+128(%rsi),%rbp,%r9 - adoxq %rbp,%r13 - adcxq %r9,%r14 - - mulxq 24+128(%rsi),%rbp,%r9 - movq %rax,%rdx - adoxq %rbp,%r14 - adcxq %r15,%r9 - adoxq %r9,%r15 - - - mulxq 0+128(%rcx),%rbp,%rax - adcxq %rbp,%r10 - adoxq %r11,%rax - - mulxq 8+128(%rcx),%rbp,%r9 - adcxq %rbp,%rax - adoxq %r9,%r12 - - mulxq 16+128(%rcx),%rbp,%r9 - adcxq %rbp,%r12 - adoxq %r9,%r13 - - mulxq 24+128(%rcx),%rbp,%r9 - movq 16(%rbx),%rdx - adcxq %rbp,%r13 - adoxq %r9,%r14 - adcxq %r10,%r14 - adoxq %r10,%r15 - adcxq %r10,%r15 - adoxq %r10,%r10 - adcq $0,%r10 - movq %rax,%r11 - imulq %r8,%rax - - - xorq %rbp,%rbp - mulxq 0+128(%rsi),%rbp,%r9 - adoxq %rbp,%r12 - adcxq %r9,%r13 - - mulxq 8+128(%rsi),%rbp,%r9 - adoxq %rbp,%r13 - adcxq %r9,%r14 - - mulxq 16+128(%rsi),%rbp,%r9 - adoxq %rbp,%r14 - adcxq %r9,%r15 - - mulxq 24+128(%rsi),%rbp,%r9 - movq %rax,%rdx - adoxq %rbp,%r15 - adcxq %r10,%r9 - adoxq %r9,%r10 - - - mulxq 0+128(%rcx),%rbp,%rax - adcxq %rbp,%r11 - adoxq %r12,%rax - - mulxq 8+128(%rcx),%rbp,%r9 - adcxq %rbp,%rax - adoxq %r9,%r13 - - mulxq 16+128(%rcx),%rbp,%r9 - adcxq %rbp,%r13 - adoxq %r9,%r14 - - mulxq 24+128(%rcx),%rbp,%r9 - movq 24(%rbx),%rdx - adcxq %rbp,%r14 - adoxq %r9,%r15 - adcxq %r11,%r15 - adoxq %r11,%r10 - adcxq %r11,%r10 - adoxq %r11,%r11 - adcq $0,%r11 - movq %rax,%r12 - imulq %r8,%rax - - - xorq %rbp,%rbp - mulxq 0+128(%rsi),%rbp,%r9 - adoxq %rbp,%r13 - adcxq %r9,%r14 - - mulxq 8+128(%rsi),%rbp,%r9 - adoxq %rbp,%r14 - adcxq %r9,%r15 - - mulxq 16+128(%rsi),%rbp,%r9 - adoxq %rbp,%r15 - adcxq %r9,%r10 - - mulxq 24+128(%rsi),%rbp,%r9 - movq %rax,%rdx - adoxq %rbp,%r10 - adcxq %r11,%r9 - adoxq %r9,%r11 - - - mulxq 0+128(%rcx),%rbp,%rax - adcxq %rbp,%r12 - adoxq %r13,%rax - - mulxq 8+128(%rcx),%rbp,%r9 - adcxq %rbp,%rax - adoxq %r9,%r14 - - mulxq 16+128(%rcx),%rbp,%r9 - adcxq %rbp,%r14 - adoxq %r9,%r15 - - mulxq 24+128(%rcx),%rbp,%r9 - movq %rax,%rdx - adcxq %rbp,%r15 - adoxq %r9,%r10 - adcxq %r12,%r10 - adoxq %r12,%r11 - adcxq %r12,%r11 - adoxq %r12,%r12 - adcq $0,%r12 - imulq %r8,%rdx - - - xorq %rbp,%rbp - mulxq 0+128(%rcx),%r13,%r9 - adcxq %rax,%r13 - adoxq %r9,%r14 - - mulxq 8+128(%rcx),%rbp,%r9 - adcxq %rbp,%r14 - adoxq %r9,%r15 - - mulxq 16+128(%rcx),%rbp,%r9 - adcxq %rbp,%r15 - adoxq %r9,%r10 - - mulxq 24+128(%rcx),%rbp,%r9 - movq %r14,%rdx - leaq 128(%rcx),%rcx - adcxq %rbp,%r10 - adoxq %r9,%r11 - movq %r15,%rax - adcxq %r13,%r11 - adoxq %r13,%r12 - adcq $0,%r12 - - - - - movq %r10,%rbp - subq 0(%rcx),%r14 - sbbq 8(%rcx),%r15 - 
sbbq 16(%rcx),%r10 - movq %r11,%r9 - sbbq 24(%rcx),%r11 - sbbq $0,%r12 - - cmovcq %rdx,%r14 - cmovcq %rax,%r15 - cmovcq %rbp,%r10 - movq %r14,0(%rdi) - cmovcq %r9,%r11 - movq %r15,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _fromx_mont_256 -.private_extern _fromx_mont_256 - -.p2align 5 -_fromx_mont_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -from_mont_256$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq %rdx,%rbx - call __mulx_by_1_mont_256 - - - - - - movq %r15,%rdx - movq %r10,%r12 - movq %r11,%r13 - - subq 0(%rbx),%r14 - sbbq 8(%rbx),%r15 - sbbq 16(%rbx),%r10 - sbbq 24(%rbx),%r11 - - cmovncq %r14,%rax - cmovncq %r15,%rdx - cmovncq %r10,%r12 - movq %rax,0(%rdi) - cmovncq %r11,%r13 - movq %rdx,8(%rdi) - movq %r12,16(%rdi) - movq %r13,24(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _redcx_mont_256 -.private_extern _redcx_mont_256 - -.p2align 5 -_redcx_mont_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -redc_mont_256$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $8,%rsp -.cfi_adjust_cfa_offset 8 - - - movq %rdx,%rbx - call __mulx_by_1_mont_256 - - addq 32(%rsi),%r14 - adcq 40(%rsi),%r15 - movq %r14,%rax - adcq 48(%rsi),%r10 - movq %r15,%rdx - adcq 56(%rsi),%r11 - sbbq %rsi,%rsi - - - - - movq %r10,%r12 - subq 0(%rbx),%r14 - sbbq 8(%rbx),%r15 - sbbq 16(%rbx),%r10 - movq %r11,%r13 - sbbq 24(%rbx),%r11 - sbbq $0,%rsi - - cmovncq %r14,%rax - cmovncq %r15,%rdx - cmovncq %r10,%r12 - movq %rax,0(%rdi) - cmovncq %r11,%r13 - movq %rdx,8(%rdi) - movq %r12,16(%rdi) - movq %r13,24(%rdi) - - movq 8(%rsp),%r15 -.cfi_restore %r15 - movq 16(%rsp),%r14 -.cfi_restore %r14 - movq 24(%rsp),%r13 -.cfi_restore %r13 - movq 32(%rsp),%r12 -.cfi_restore %r12 - movq 40(%rsp),%rbx -.cfi_restore %rbx - movq 48(%rsp),%rbp -.cfi_restore %rbp - leaq 56(%rsp),%rsp -.cfi_adjust_cfa_offset -56 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.p2align 5 -__mulx_by_1_mont_256: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%rax - movq 8(%rsi),%r11 - movq 16(%rsi),%r12 - movq 24(%rsi),%r13 - - movq %rax,%r14 - imulq %rcx,%rax - movq %rax,%r10 - - mulq 0(%rbx) - addq %rax,%r14 - movq %r10,%rax - adcq %rdx,%r14 - - mulq 8(%rbx) - addq %rax,%r11 - movq %r10,%rax - adcq $0,%rdx - addq %r14,%r11 - adcq $0,%rdx - movq %rdx,%r14 - - mulq 16(%rbx) - movq %r11,%r15 - imulq %rcx,%r11 - addq %rax,%r12 - movq %r10,%rax - adcq $0,%rdx - addq %r14,%r12 - adcq $0,%rdx - movq %rdx,%r14 - - mulq 24(%rbx) - addq %rax,%r13 - movq %r11,%rax - adcq $0,%rdx - addq %r14,%r13 - adcq $0,%rdx - movq 
%rdx,%r14 - - mulq 0(%rbx) - addq %rax,%r15 - movq %r11,%rax - adcq %rdx,%r15 - - mulq 8(%rbx) - addq %rax,%r12 - movq %r11,%rax - adcq $0,%rdx - addq %r15,%r12 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 16(%rbx) - movq %r12,%r10 - imulq %rcx,%r12 - addq %rax,%r13 - movq %r11,%rax - adcq $0,%rdx - addq %r15,%r13 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 24(%rbx) - addq %rax,%r14 - movq %r12,%rax - adcq $0,%rdx - addq %r15,%r14 - adcq $0,%rdx - movq %rdx,%r15 - - mulq 0(%rbx) - addq %rax,%r10 - movq %r12,%rax - adcq %rdx,%r10 - - mulq 8(%rbx) - addq %rax,%r13 - movq %r12,%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 16(%rbx) - movq %r13,%r11 - imulq %rcx,%r13 - addq %rax,%r14 - movq %r12,%rax - adcq $0,%rdx - addq %r10,%r14 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 24(%rbx) - addq %rax,%r15 - movq %r13,%rax - adcq $0,%rdx - addq %r10,%r15 - adcq $0,%rdx - movq %rdx,%r10 - - mulq 0(%rbx) - addq %rax,%r11 - movq %r13,%rax - adcq %rdx,%r11 - - mulq 8(%rbx) - addq %rax,%r14 - movq %r13,%rax - adcq $0,%rdx - addq %r11,%r14 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 16(%rbx) - addq %rax,%r15 - movq %r13,%rax - adcq $0,%rdx - addq %r11,%r15 - adcq $0,%rdx - movq %rdx,%r11 - - mulq 24(%rbx) - addq %rax,%r10 - movq %r14,%rax - adcq $0,%rdx - addq %r11,%r10 - adcq $0,%rdx - movq %rdx,%r11 - .byte 0xf3,0xc3 -.cfi_endproc - diff --git a/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s b/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s deleted file mode 100644 index c5afeec8a51..00000000000 --- a/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s +++ /dev/null @@ -1,2975 +0,0 @@ -.text - - - - - - - - -.p2align 5 -__subx_mod_384x384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq 48(%rsi),%r14 - - subq 0(%rdx),%r8 - movq 56(%rsi),%r15 - sbbq 8(%rdx),%r9 - movq 64(%rsi),%rax - sbbq 16(%rdx),%r10 - movq 72(%rsi),%rbx - sbbq 24(%rdx),%r11 - movq 80(%rsi),%rbp - sbbq 32(%rdx),%r12 - movq 88(%rsi),%rsi - sbbq 40(%rdx),%r13 - movq %r8,0(%rdi) - sbbq 48(%rdx),%r14 - movq 0(%rcx),%r8 - movq %r9,8(%rdi) - sbbq 56(%rdx),%r15 - movq 8(%rcx),%r9 - movq %r10,16(%rdi) - sbbq 64(%rdx),%rax - movq 16(%rcx),%r10 - movq %r11,24(%rdi) - sbbq 72(%rdx),%rbx - movq 24(%rcx),%r11 - movq %r12,32(%rdi) - sbbq 80(%rdx),%rbp - movq 32(%rcx),%r12 - movq %r13,40(%rdi) - sbbq 88(%rdx),%rsi - movq 40(%rcx),%r13 - sbbq %rdx,%rdx - - andq %rdx,%r8 - andq %rdx,%r9 - andq %rdx,%r10 - andq %rdx,%r11 - andq %rdx,%r12 - andq %rdx,%r13 - - addq %r8,%r14 - adcq %r9,%r15 - movq %r14,48(%rdi) - adcq %r10,%rax - movq %r15,56(%rdi) - adcq %r11,%rbx - movq %rax,64(%rdi) - adcq %r12,%rbp - movq %rbx,72(%rdi) - adcq %r13,%rsi - movq %rbp,80(%rdi) - movq %rsi,88(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__addx_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - adcq 16(%rdx),%r10 - movq %r8,%r14 - adcq 24(%rdx),%r11 - movq %r9,%r15 - adcq 32(%rdx),%r12 - movq %r10,%rax - adcq 40(%rdx),%r13 - movq %r11,%rbx - sbbq %rdx,%rdx - - subq 0(%rcx),%r8 - sbbq 8(%rcx),%r9 - movq %r12,%rbp - sbbq 16(%rcx),%r10 - sbbq 24(%rcx),%r11 - sbbq 32(%rcx),%r12 - movq %r13,%rsi - sbbq 40(%rcx),%r13 - sbbq $0,%rdx - - cmovcq %r14,%r8 - cmovcq %r15,%r9 - cmovcq %rax,%r10 - movq %r8,0(%rdi) - cmovcq %rbx,%r11 - movq %r9,8(%rdi) - 
cmovcq %rbp,%r12 - movq %r10,16(%rdi) - cmovcq %rsi,%r13 - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - - - -.p2align 5 -__subx_mod_384: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - -__subx_mod_384_a_is_loaded: - subq 0(%rdx),%r8 - movq 0(%rcx),%r14 - sbbq 8(%rdx),%r9 - movq 8(%rcx),%r15 - sbbq 16(%rdx),%r10 - movq 16(%rcx),%rax - sbbq 24(%rdx),%r11 - movq 24(%rcx),%rbx - sbbq 32(%rdx),%r12 - movq 32(%rcx),%rbp - sbbq 40(%rdx),%r13 - movq 40(%rcx),%rsi - sbbq %rdx,%rdx - - andq %rdx,%r14 - andq %rdx,%r15 - andq %rdx,%rax - andq %rdx,%rbx - andq %rdx,%rbp - andq %rdx,%rsi - - addq %r14,%r8 - adcq %r15,%r9 - movq %r8,0(%rdi) - adcq %rax,%r10 - movq %r9,8(%rdi) - adcq %rbx,%r11 - movq %r10,16(%rdi) - adcq %rbp,%r12 - movq %r11,24(%rdi) - adcq %rsi,%r13 - movq %r12,32(%rdi) - movq %r13,40(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _mulx_mont_384x -.private_extern _mulx_mont_384x - -.p2align 5 -_mulx_mont_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -mul_mont_384x$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $328,%rsp -.cfi_adjust_cfa_offset 328 - - - movq %rdx,%rbx - movq %rdi,32(%rsp) - movq %rsi,24(%rsp) - movq %rdx,16(%rsp) - movq %rcx,8(%rsp) - movq %r8,0(%rsp) - - - - - leaq 40(%rsp),%rdi - call __mulx_384 - - - leaq 48(%rbx),%rbx - leaq 128+48(%rsi),%rsi - leaq 96(%rdi),%rdi - call __mulx_384 - - - movq 8(%rsp),%rcx - leaq (%rbx),%rsi - leaq -48(%rbx),%rdx - leaq 40+192+48(%rsp),%rdi - call __addx_mod_384 - - movq 24(%rsp),%rsi - leaq 48(%rsi),%rdx - leaq -48(%rdi),%rdi - call __addx_mod_384 - - leaq (%rdi),%rbx - leaq 48(%rdi),%rsi - call __mulx_384 - - - leaq (%rdi),%rsi - leaq 40(%rsp),%rdx - movq 8(%rsp),%rcx - call __subx_mod_384x384 - - leaq (%rdi),%rsi - leaq -96(%rdi),%rdx - call __subx_mod_384x384 - - - leaq 40(%rsp),%rsi - leaq 40+96(%rsp),%rdx - leaq 40(%rsp),%rdi - call __subx_mod_384x384 - - leaq (%rcx),%rbx - - - leaq 40(%rsp),%rsi - movq 0(%rsp),%rcx - movq 32(%rsp),%rdi - call __mulx_by_1_mont_384 - call __redx_tail_mont_384 - - - leaq 40+192(%rsp),%rsi - movq 0(%rsp),%rcx - leaq 48(%rdi),%rdi - call __mulx_by_1_mont_384 - call __redx_tail_mont_384 - - leaq 328(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -328-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _sqrx_mont_384x -.private_extern _sqrx_mont_384x - -.p2align 5 -_sqrx_mont_384x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -sqr_mont_384x$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $136,%rsp -.cfi_adjust_cfa_offset 136 - - - movq 
%rcx,0(%rsp) - movq %rdx,%rcx - - movq %rdi,16(%rsp) - movq %rsi,24(%rsp) - - - leaq 48(%rsi),%rdx - leaq 32(%rsp),%rdi - call __addx_mod_384 - - - movq 24(%rsp),%rsi - leaq 48(%rsi),%rdx - leaq 32+48(%rsp),%rdi - call __subx_mod_384 - - - movq 24(%rsp),%rsi - leaq 48(%rsi),%rbx - - movq 48(%rsi),%rdx - movq 0(%rsi),%r14 - movq 8(%rsi),%r15 - movq 16(%rsi),%rax - movq 24(%rsi),%r12 - movq 32(%rsi),%rdi - movq 40(%rsi),%rbp - leaq -128(%rsi),%rsi - leaq -128(%rcx),%rcx - - mulxq %r14,%r8,%r9 - call __mulx_mont_384 - addq %rdx,%rdx - adcq %r15,%r15 - adcq %rax,%rax - movq %rdx,%r8 - adcq %r12,%r12 - movq %r15,%r9 - adcq %rdi,%rdi - movq %rax,%r10 - adcq %rbp,%rbp - movq %r12,%r11 - sbbq %rsi,%rsi - - subq 0(%rcx),%rdx - sbbq 8(%rcx),%r15 - movq %rdi,%r13 - sbbq 16(%rcx),%rax - sbbq 24(%rcx),%r12 - sbbq 32(%rcx),%rdi - movq %rbp,%r14 - sbbq 40(%rcx),%rbp - sbbq $0,%rsi - - cmovcq %r8,%rdx - cmovcq %r9,%r15 - cmovcq %r10,%rax - movq %rdx,48(%rbx) - cmovcq %r11,%r12 - movq %r15,56(%rbx) - cmovcq %r13,%rdi - movq %rax,64(%rbx) - cmovcq %r14,%rbp - movq %r12,72(%rbx) - movq %rdi,80(%rbx) - movq %rbp,88(%rbx) - - leaq 32(%rsp),%rsi - leaq 32+48(%rsp),%rbx - - movq 32+48(%rsp),%rdx - movq 32+0(%rsp),%r14 - movq 32+8(%rsp),%r15 - movq 32+16(%rsp),%rax - movq 32+24(%rsp),%r12 - movq 32+32(%rsp),%rdi - movq 32+40(%rsp),%rbp - leaq -128(%rsi),%rsi - leaq -128(%rcx),%rcx - - mulxq %r14,%r8,%r9 - call __mulx_mont_384 - - leaq 136(%rsp),%r8 - movq 0(%r8),%r15 -.cfi_restore %r15 - movq 8(%r8),%r14 -.cfi_restore %r14 - movq 16(%r8),%r13 -.cfi_restore %r13 - movq 24(%r8),%r12 -.cfi_restore %r12 - movq 32(%r8),%rbx -.cfi_restore %rbx - movq 40(%r8),%rbp -.cfi_restore %rbp - leaq 48(%r8),%rsp -.cfi_adjust_cfa_offset -136-8*6 - - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _mulx_382x -.private_extern _mulx_382x - -.p2align 5 -_mulx_382x: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - -mul_382x$1: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $136,%rsp -.cfi_adjust_cfa_offset 136 - - - leaq 96(%rdi),%rdi - movq %rsi,0(%rsp) - movq %rdx,8(%rsp) - movq %rdi,16(%rsp) - movq %rcx,24(%rsp) - - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - - addq 48(%rsi),%r8 - adcq 56(%rsi),%r9 - adcq 64(%rsi),%r10 - adcq 72(%rsi),%r11 - adcq 80(%rsi),%r12 - adcq 88(%rsi),%r13 - - movq %r8,32+0(%rsp) - movq %r9,32+8(%rsp) - movq %r10,32+16(%rsp) - movq %r11,32+24(%rsp) - movq %r12,32+32(%rsp) - movq %r13,32+40(%rsp) - - - movq 0(%rdx),%r8 - movq 8(%rdx),%r9 - movq 16(%rdx),%r10 - movq 24(%rdx),%r11 - movq 32(%rdx),%r12 - movq 40(%rdx),%r13 - - addq 48(%rdx),%r8 - adcq 56(%rdx),%r9 - adcq 64(%rdx),%r10 - adcq 72(%rdx),%r11 - adcq 80(%rdx),%r12 - adcq 88(%rdx),%r13 - - movq %r8,32+48(%rsp) - movq %r9,32+56(%rsp) - movq %r10,32+64(%rsp) - movq %r11,32+72(%rsp) - movq %r12,32+80(%rsp) - movq %r13,32+88(%rsp) - - - leaq 32+0(%rsp),%rsi - leaq 32+48(%rsp),%rbx - call __mulx_384 - - - movq 0(%rsp),%rsi - movq 8(%rsp),%rbx - leaq -96(%rdi),%rdi - call __mulx_384 - - - leaq 48+128(%rsi),%rsi - leaq 48(%rbx),%rbx - leaq 32(%rsp),%rdi - call __mulx_384 - - - movq 16(%rsp),%rsi - leaq 32(%rsp),%rdx - movq 24(%rsp),%rcx - movq %rsi,%rdi - call 
__subx_mod_384x384
-
-
-    leaq 0(%rdi),%rsi
-    leaq -96(%rdi),%rdx
-    call __subx_mod_384x384
-
-
-    leaq -96(%rdi),%rsi
-    leaq 32(%rsp),%rdx
-    leaq -96(%rdi),%rdi
-    call __subx_mod_384x384
-
-    leaq 136(%rsp),%r8
-    movq 0(%r8),%r15
-.cfi_restore %r15
-    movq 8(%r8),%r14
-.cfi_restore %r14
-    movq 16(%r8),%r13
-.cfi_restore %r13
-    movq 24(%r8),%r12
-.cfi_restore %r12
-    movq 32(%r8),%rbx
-.cfi_restore %rbx
-    movq 40(%r8),%rbp
-.cfi_restore %rbp
-    leaq 48(%r8),%rsp
-.cfi_adjust_cfa_offset -136-8*6
-
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-.globl _sqrx_382x
-.private_extern _sqrx_382x
-
-.p2align 5
-_sqrx_382x:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
-
-sqr_382x$1:
[... deleted body omitted: Fp2 squaring via (a0+a1), (a0-a1) and a doubled cross product, built on __subx_mod_384_a_is_loaded and __mulx_384 ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-.globl _mulx_384
-.private_extern _mulx_384
-
-.p2align 5
-_mulx_384:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
-
-mul_384$1:
[... deleted body omitted: register saves, call __mulx_384, register restores ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-
-
-.p2align 5
-__mulx_384:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
[... deleted body omitted: 6x6-limb schoolbook multiplication using mulxq with adcxq/adoxq dual carry chains ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-.globl _sqrx_384
-.private_extern _sqrx_384
-
-.p2align 5
-_sqrx_384:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
-
-sqr_384$1:
[... deleted body omitted: register saves, call __sqrx_384, register restores ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-
-.p2align 5
-__sqrx_384:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
[... deleted body omitted: 384-bit squaring, cross products doubled and diagonal squares folded in ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-
-
-
-.globl _redcx_mont_384
-.private_extern _redcx_mont_384
-
-.p2align 5
-_redcx_mont_384:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
-
-redc_mont_384$1:
[... deleted body omitted: register saves, then call __mulx_by_1_mont_384 followed by call __redx_tail_mont_384, register restores ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-
-
-
-
-.globl _fromx_mont_384
-.private_extern _fromx_mont_384
-
-.p2align 5
-_fromx_mont_384:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
-
-from_mont_384$1:
[... deleted body omitted: call __mulx_by_1_mont_384, then a final conditional subtraction of the modulus via sbbq/cmovcq ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-
-.p2align 5
-__mulx_by_1_mont_384:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
[... deleted body omitted: six word-by-word Montgomery reduction rounds, each an imulq by n0 followed by a mulxq/adcxq/adoxq pass over the modulus limbs ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
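For orientation, the routines just above (_redcx_mont_384, _fromx_mont_384 and their shared helper __mulx_by_1_mont_384) are Montgomery reduction: they divide a value of up to 768 bits by R = 2^384 modulo the 384-bit modulus, which is how results are brought back out of Montgomery form. For readers not fluent in the mulx/adcx/adox idiom, here is a word-by-word sketch of the same algorithm in Go. This is a minimal illustration with generic limb counts, not code from this repository; all identifiers and the demo modulus are made up for the example.

    package main

    import (
        "fmt"
        "math/big"
        "math/bits"
    )

    // madd2 returns a*b + c + d as a 128-bit (hi, lo) pair; the sum
    // cannot exceed 2^128 - 1, so it never overflows.
    func madd2(a, b, c, d uint64) (hi, lo uint64) {
        hi, lo = bits.Mul64(a, b)
        var carry uint64
        lo, carry = bits.Add64(lo, c, 0)
        hi += carry
        lo, carry = bits.Add64(lo, d, 0)
        hi += carry
        return
    }

    // negInvModWord returns -m^{-1} mod 2^64 for odd m (Hensel lifting);
    // this is the per-word constant the assembly multiplies by with imulq.
    func negInvModWord(m uint64) uint64 {
        inv := m // correct to 3 bits for odd m
        for i := 0; i < 5; i++ {
            inv *= 2 - m*inv // each step doubles the number of correct bits
        }
        return -inv
    }

    // montRedc returns t / R mod m with R = 2^(64*len(m)), for t < R*m.
    // Limbs are little-endian. Each round zeroes one low word of t by
    // adding q*m, the same strategy __mulx_by_1_mont_384 applies six times.
    func montRedc(tIn, m []uint64, m0inv uint64) []uint64 {
        n := len(m)
        t := make([]uint64, 2*n+1) // one spare word absorbs the last carry
        copy(t, tIn)
        for i := 0; i < n; i++ {
            q := t[i] * m0inv
            var carry uint64
            for j := 0; j < n; j++ {
                carry, t[i+j] = madd2(q, m[j], t[i+j], carry)
            }
            for k := i + n; carry != 0; k++ { // propagate into the upper half
                t[k], carry = bits.Add64(t[k], carry, 0)
            }
        }
        res, top := t[n:2*n], t[2*n]
        // final conditional subtraction (the sbbq/cmovcq dance above)
        sub := make([]uint64, n)
        var borrow uint64
        for j := 0; j < n; j++ {
            sub[j], borrow = bits.Sub64(res[j], m[j], borrow)
        }
        if top == 1 || borrow == 0 { // result >= m: keep the subtracted value
            copy(res, sub)
        }
        return append([]uint64(nil), res...)
    }

    func main() {
        toBig := func(ws []uint64) *big.Int {
            x := new(big.Int)
            for i := len(ws) - 1; i >= 0; i-- {
                x.Lsh(x, 64).Or(x, new(big.Int).SetUint64(ws[i]))
            }
            return x
        }
        m := []uint64{0xffffffffffffffc5, 0xffffffffffffffff} // 2^128 - 59, odd
        t := []uint64{42, 0x1234, 0, 0}
        got := montRedc(t, m, negInvModWord(m[0]))

        // cross-check against math/big: want = t * R^{-1} mod m, R = 2^128
        M, R := toBig(m), new(big.Int).Lsh(big.NewInt(1), 128)
        want := new(big.Int).Mul(toBig(t), new(big.Int).ModInverse(R, M))
        fmt.Println(toBig(got), "==", want.Mod(want, M))
    }

The assembly fuses the carry propagation into the multiply pass using the two independent carry flags (CF for adcxq, OF for adoxq), which is what lets it retire one limb product per cycle; the sketch above keeps the carries explicit instead.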
-
-
-
-.p2align 5
-__redx_tail_mont_384:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
[... deleted body omitted: adds the upper six words of the input, then a conditional subtraction of the modulus and stores to the result ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-
-.globl _sgn0x_pty_mont_384
-.private_extern _sgn0x_pty_mont_384
-
-.p2align 5
-_sgn0x_pty_mont_384:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
-
-sgn0_pty_mont_384$1:
[... deleted body omitted: converts out of Montgomery form via __mulx_by_1_mont_384, then derives the parity bit and the sign bit (whether 2x exceeds the modulus) ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-
-.globl _sgn0x_pty_mont_384x
-.private_extern _sgn0x_pty_mont_384x
-
-.p2align 5
-_sgn0x_pty_mont_384x:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
-
-sgn0_pty_mont_384x$1:
[... deleted body omitted: the same parity/sign computation applied to both halves of an Fp2 element, selecting bits by whether the upper half is zero ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-.globl _mulx_mont_384
-.private_extern _mulx_mont_384
-
-.p2align 5
-_mulx_mont_384:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
-
-mul_mont_384$1:
[... deleted body omitted: register saves, operand loads, mulxq %r14,%r8,%r9 then call __mulx_mont_384, register restores ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-
-.p2align 5
-__mulx_mont_384:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
[... deleted body omitted: six interleaved multiply-and-reduce rounds (mulxq/adcxq/adoxq passes with an imulq by n0 from 8(%rsp) per round), ending in a conditional subtraction of the modulus ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-.globl _sqrx_mont_384
-.private_extern _sqrx_mont_384
-
-.p2align 5
-_sqrx_mont_384:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
-
-sqr_mont_384$1:
[... deleted body omitted: register saves, operand loads, mulxq %rdx,%r8,%r9 then call __mulx_mont_384, register restores ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-
-.globl _sqrx_n_mul_mont_384
-.private_extern _sqrx_n_mul_mont_384
-
-.p2align 5
-_sqrx_n_mul_mont_384:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
-
-sqr_n_mul_mont_384$1:
[... deleted body omitted: squares the input `count` times in L$oop_sqrx_384 (each pass via __mulx_mont_384, the loop counter parked in %xmm1), then multiplies once by the second operand ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-
-.globl _sqrx_n_mul_mont_383
-.private_extern _sqrx_n_mul_mont_383
-
-.p2align 5
-_sqrx_n_mul_mont_383:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
-
-sqr_n_mul_mont_383$1:
[... deleted body omitted: same loop shape (L$oop_sqrx_383) but squaring through the lazy-reduction helper __mulx_mont_383_nonred, with a fully reduced final multiplication via __mulx_mont_384 ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-
-.p2align 5
-__mulx_mont_383_nonred:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
[... deleted body omitted: Montgomery multiplication without the final conditional subtraction, so intermediate results may exceed the modulus by a bounded amount ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
-.globl _sqrx_mont_382x
-.private_extern _sqrx_mont_382x
-
-.p2align 5
-_sqrx_mont_382x:
-.cfi_startproc
-    .byte 0xf3,0x0f,0x1e,0xfa
-
-
-sqr_mont_382x$1:
[... deleted body omitted: Fp2 squaring as real = (a0+a1)(a0-a1) and imaginary = 2*a0*a1, both through __mulx_mont_383_nonred, with a masked add-back of the modulus to correct the borrowed difference ...]
-    .byte 0xf3,0xc3
-.cfi_endproc
-
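A recurring shape in the file just removed: _sqrx_n_mul_mont_384 squares its input `count` times and then multiplies once by a second operand, all in Montgomery form. That is the building block of fixed addition-chain exponentiation (constant-time field inversion and square roots). A sketch of the operation itself, phrased with math/big for brevity; this is illustrative only, with made-up names and toy parameters.

    package main

    import (
        "fmt"
        "math/big"
    )

    // sqrNMul returns x^(2^count) * y mod m — the contract of the deleted
    // sqrx_n_mul_mont_384 routine, minus the Montgomery representation.
    // Chains of these calls implement exponentiation by a fixed exponent:
    // square across runs of zero bits, multiply in a precomputed factor
    // where the exponent needs one.
    func sqrNMul(x *big.Int, count uint, y, m *big.Int) *big.Int {
        r := new(big.Int).Set(x)
        for i := uint(0); i < count; i++ {
            r.Mul(r, r).Mod(r, m)
        }
        return r.Mul(r, y).Mod(r, m)
    }

    func main() {
        m := big.NewInt(1_000_003)
        x := big.NewInt(12345)
        // x^(2^5) * x = x^33 (mod m)
        fmt.Println(sqrNMul(x, 5, x, m))
        fmt.Println(new(big.Int).Exp(x, big.NewInt(33), m)) // cross-check
    }

Keeping the whole chain in Montgomery form is the point of the _nonred variant above: the intermediate squarings skip the final subtraction, and only the closing multiplication pays for full reduction.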
diff --git a/crypto/blst_src/build/mach-o/sha256-armv8.S b/crypto/blst_src/build/mach-o/sha256-armv8.S
deleted file mode 100644
index 3f3c1266dcd..00000000000
--- a/crypto/blst_src/build/mach-o/sha256-armv8.S
+++ /dev/null
@@ -1,1083 +0,0 @@
-//
-// Copyright Supranational LLC
-// Licensed under the Apache License, Version 2.0, see LICENSE for details.
-// SPDX-License-Identifier: Apache-2.0
-//
-// ====================================================================
-// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
-// project.
-// ====================================================================
-//
-// sha256_block procedure for ARMv8.
-//
-// This module is stripped of scalar code paths, with rationale that all
-// known processors are NEON-capable.
-//
-// See original module at CRYPTOGAMS for further details.
-
-.comm ___blst_platform_cap,4
-.text
-
-.align 6
-
-LK256:
-.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
-.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
-.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
-.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
-.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
-.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
-.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
-.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
-.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
-.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
-.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
-.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
-.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
-.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
-.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
-.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
-.long 0 //terminator
-
-.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
-.align 2
-.align 2
-.globl _blst_sha256_block_armv8
-
-.align 6
-_blst_sha256_block_armv8:
-Lv8_entry:
-    stp x29,x30,[sp,#-16]!
-    add x29,sp,#0
-
-    ld1 {v0.4s,v1.4s},[x0]
-    adr x3,LK256
-
-Loop_hw:
[... deleted body omitted: per-block message load, sixteen fused rounds of sha256su0/sha256h/sha256h2/sha256su1 (hardware SHA-256 extension instructions emitted as .long with mnemonic comments), and the final state accumulation ...]
-    cbnz x2,Loop_hw
-
-    st1 {v0.4s,v1.4s},[x0]
-
-    ldr x29,[sp],#16
-    ret
-
-.globl _blst_sha256_block_data_order
-
-.align 4
-_blst_sha256_block_data_order:
-    adrp x16,___blst_platform_cap@PAGE
-    ldr w16,[x16,___blst_platform_cap@PAGEOFF]
-    tst w16,#1
-    b.ne Lv8_entry
-
-    stp x29, x30, [sp, #-16]!
-    mov x29, sp
-    sub sp,sp,#16*4
-
-    adr x16,LK256
-    add x2,x1,x2,lsl#6 // len to point at the end of inp
-
[... deleted body omitted: the NEON path — vectorized message schedule interleaved with scalar rounds in the L_00_48 loop, then the final sixteen rounds and state accumulation ...]
-    ldr x29,[x29]
-    add sp,sp,#16*4+16
-    ret
-
-.globl _blst_sha256_emit
-.private_extern _blst_sha256_emit
-
-.align 4
-_blst_sha256_emit:
-    ldp x4,x5,[x1]
-    ldp x6,x7,[x1,#16]
-#ifndef __AARCH64EB__
-    rev x4,x4
-    rev x5,x5
-    rev x6,x6
-    rev x7,x7
-#endif
-    str w4,[x0,#4]
-    lsr x4,x4,#32
-    str w5,[x0,#12]
-    lsr x5,x5,#32
-    str w6,[x0,#20]
-    lsr x6,x6,#32
-    str w7,[x0,#28]
-    lsr x7,x7,#32
-    str w4,[x0,#0]
-    str w5,[x0,#8]
-    str w6,[x0,#16]
-    str w7,[x0,#24]
-    ret
-
-
-.globl _blst_sha256_bcopy
-.private_extern _blst_sha256_bcopy
-
-.align 4
-_blst_sha256_bcopy:
-Loop_bcopy:
-    ldrb w3,[x1],#1
-    sub x2,x2,#1
-    strb w3,[x0],#1
-    cbnz x2,Loop_bcopy
-    ret
-
-
-.globl _blst_sha256_hcopy
-.private_extern _blst_sha256_hcopy
-
-.align 4
-_blst_sha256_hcopy:
-    ldp x4,x5,[x1]
-    ldp x6,x7,[x1,#16]
-    stp x4,x5,[x0]
-    stp x6,x7,[x0,#16]
-    ret
-
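Worth noting from the file above: _blst_sha256_block_data_order tests the ___blst_platform_cap word at run time and branches to Lv8_entry (the SHA-256 crypto-extension path) when bit 0 is set, falling back to NEON otherwise. The same capability-dispatch shape looks like this in Go, using the standard library as the generic path. This is a sketch, not the package's actual wiring; sum256HW and the capability probe are placeholders.

    package main

    import (
        "crypto/sha256"
        "fmt"
    )

    // sum256 is the dispatch point: initialized to the generic
    // implementation, and (hypothetically) swapped for an accelerated
    // routine when the CPU advertises SHA-2 support, mirroring the
    // ___blst_platform_cap test in the deleted assembly.
    var sum256 func(data []byte) [32]byte = sha256.Sum256

    func init() {
        // A real port would probe CPU features here, e.g. via
        // golang.org/x/sys/cpu, and install the fast path:
        //
        //   if cpu.ARM64.HasSHA2 { sum256 = sum256HW }
        //
        // sum256HW is hypothetical; Lv8_entry above plays its role.
    }

    func main() {
        digest := sum256([]byte("abc"))
        fmt.Printf("%x\n", digest) // ba7816bf... (FIPS 180-4 test vector)
    }

Resolving the choice once, at startup, keeps the per-block hot path free of feature checks, which is exactly why the assembly keeps the capability word in a single global.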
$2,___blst_platform_cap(%rip) - jnz L$blst_sha256_block_data_order$2 -#endif - pushq %rbx -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - shlq $4,%rdx - subq $64+24,%rsp - -.cfi_def_cfa %rsp,144 - - leaq (%rsi,%rdx,4),%rdx - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - jmp L$loop - -.p2align 4 -L$loop: - movl %ebx,%edi - leaq K256(%rip),%rbp - xorl %ecx,%edi - movl 0(%rsi),%r12d - movl %r8d,%r13d - movl %eax,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,0(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl 0(%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - addl %r14d,%r11d - movl 4(%rsi),%r12d - movl %edx,%r13d - movl %r11d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,4(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl 4(%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - addl %r14d,%r10d - movl 8(%rsi),%r12d - movl %ecx,%r13d - movl %r10d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,8(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl 8(%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - addl %r14d,%r9d - movl 12(%rsi),%r12d - movl %ebx,%r13d - movl %r9d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,12(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl 12(%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - addl %r14d,%r8d - movl 16(%rsi),%r12d - movl %eax,%r13d - movl %r8d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,16(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl 16(%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - addl %r14d,%edx - movl 
20(%rsi),%r12d - movl %r11d,%r13d - movl %edx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,20(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl 20(%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - addl %r14d,%ecx - movl 24(%rsi),%r12d - movl %r10d,%r13d - movl %ecx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,24(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl 24(%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - addl %r14d,%ebx - movl 28(%rsi),%r12d - movl %r9d,%r13d - movl %ebx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,28(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl 28(%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - addl %r14d,%eax - movl 32(%rsi),%r12d - movl %r8d,%r13d - movl %eax,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,32(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl 32(%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - addl %r14d,%r11d - movl 36(%rsi),%r12d - movl %edx,%r13d - movl %r11d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,36(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl 36(%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - addl %r14d,%r10d - movl 40(%rsi),%r12d - movl %ecx,%r13d - movl %r10d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,40(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl 40(%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - addl %r14d,%r9d - movl 44(%rsi),%r12d - movl %ebx,%r13d - movl %r9d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ecx,%edi 
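Editor's note: the unrolled blocks above and below each compute one round of the SHA-256 compression function from FIPS 180-4; the ror/xor chains are the big Sigma functions, and the and/xor (x86) or and/bic/orr (ARM) pairs are Ch and Maj. A minimal Go sketch of a single round, written with the standard a..h working variables rather than the assembly's register allocation (names here are illustrative, not the package's API):

```go
package sha256sketch

import "math/bits"

// round computes one SHA-256 compression round (FIPS 180-4, sec. 6.2.2).
// It returns the new e and a values; the remaining six working variables
// simply rotate (h=g, g=f, f=e, d=c, c=b, b=a).
func round(a, b, c, d, e, f, g, h, k, w uint32) (newE, newA uint32) {
	sigma1 := bits.RotateLeft32(e, -6) ^ bits.RotateLeft32(e, -11) ^ bits.RotateLeft32(e, -25)
	ch := (e & f) ^ (^e & g) // choose f or g, gated on e
	t1 := h + sigma1 + ch + k + w

	sigma0 := bits.RotateLeft32(a, -2) ^ bits.RotateLeft32(a, -13) ^ bits.RotateLeft32(a, -22)
	maj := (a & b) ^ (a & c) ^ (b & c) // majority of a, b, c

	return d + t1, t1 + sigma0 + maj
}
```

The ARM version above defers part of this work into the following round ("h+=Sigma0(a) from the past" in its comments), which is why the assembly's dependency chains look staggered relative to this sketch.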
- - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,44(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl 44(%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - addl %r14d,%r8d - movl 48(%rsi),%r12d - movl %eax,%r13d - movl %r8d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,48(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl 48(%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - addl %r14d,%edx - movl 52(%rsi),%r12d - movl %r11d,%r13d - movl %edx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,52(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl 52(%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - addl %r14d,%ecx - movl 56(%rsi),%r12d - movl %r10d,%r13d - movl %ecx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,56(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl 56(%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - addl %r14d,%ebx - movl 60(%rsi),%r12d - movl %r9d,%r13d - movl %ebx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,60(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl 60(%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - jmp L$rounds_16_xx -.p2align 4 -L$rounds_16_xx: - movl 4(%rsp),%r13d - movl 56(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%eax - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 36(%rsp),%r12d - - addl 0(%rsp),%r12d - movl %r8d,%r13d - addl %r15d,%r12d - movl %eax,%r14d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,0(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl 64(%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl 
%r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - movl 8(%rsp),%r13d - movl 60(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r11d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 40(%rsp),%r12d - - addl 4(%rsp),%r12d - movl %edx,%r13d - addl %edi,%r12d - movl %r11d,%r14d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,4(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl 68(%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - movl 12(%rsp),%r13d - movl 0(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r10d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 44(%rsp),%r12d - - addl 8(%rsp),%r12d - movl %ecx,%r13d - addl %r15d,%r12d - movl %r10d,%r14d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,8(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl 72(%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - movl 16(%rsp),%r13d - movl 4(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r9d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 48(%rsp),%r12d - - addl 12(%rsp),%r12d - movl %ebx,%r13d - addl %edi,%r12d - movl %r9d,%r14d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,12(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl 76(%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - movl 20(%rsp),%r13d - movl 8(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r8d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 52(%rsp),%r12d - - addl 16(%rsp),%r12d - movl %eax,%r13d - addl %r15d,%r12d - movl %r8d,%r14d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,16(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl 80(%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - movl 
24(%rsp),%r13d - movl 12(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%edx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 56(%rsp),%r12d - - addl 20(%rsp),%r12d - movl %r11d,%r13d - addl %edi,%r12d - movl %edx,%r14d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,20(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl 84(%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - movl 28(%rsp),%r13d - movl 16(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ecx - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 60(%rsp),%r12d - - addl 24(%rsp),%r12d - movl %r10d,%r13d - addl %r15d,%r12d - movl %ecx,%r14d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,24(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl 88(%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - movl 32(%rsp),%r13d - movl 20(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ebx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 0(%rsp),%r12d - - addl 28(%rsp),%r12d - movl %r9d,%r13d - addl %edi,%r12d - movl %ebx,%r14d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,28(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl 92(%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - movl 36(%rsp),%r13d - movl 24(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%eax - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 4(%rsp),%r12d - - addl 32(%rsp),%r12d - movl %r8d,%r13d - addl %r15d,%r12d - movl %eax,%r14d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,32(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl 96(%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - movl 40(%rsp),%r13d - movl 28(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r11d - movl 
%edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 8(%rsp),%r12d - - addl 36(%rsp),%r12d - movl %edx,%r13d - addl %edi,%r12d - movl %r11d,%r14d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,36(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl 100(%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - movl 44(%rsp),%r13d - movl 32(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r10d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 12(%rsp),%r12d - - addl 40(%rsp),%r12d - movl %ecx,%r13d - addl %r15d,%r12d - movl %r10d,%r14d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,40(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl 104(%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - movl 48(%rsp),%r13d - movl 36(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r9d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 16(%rsp),%r12d - - addl 44(%rsp),%r12d - movl %ebx,%r13d - addl %edi,%r12d - movl %r9d,%r14d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,44(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl 108(%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - movl 52(%rsp),%r13d - movl 40(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r8d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 20(%rsp),%r12d - - addl 48(%rsp),%r12d - movl %eax,%r13d - addl %r15d,%r12d - movl %r8d,%r14d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,48(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl 112(%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - movl 56(%rsp),%r13d - movl 44(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%edx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl 
$10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 24(%rsp),%r12d - - addl 52(%rsp),%r12d - movl %r11d,%r13d - addl %edi,%r12d - movl %edx,%r14d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,52(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl 116(%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - movl 60(%rsp),%r13d - movl 48(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ecx - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 28(%rsp),%r12d - - addl 56(%rsp),%r12d - movl %r10d,%r13d - addl %r15d,%r12d - movl %ecx,%r14d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,56(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl 120(%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - movl 0(%rsp),%r13d - movl 52(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ebx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 32(%rsp),%r12d - - addl 60(%rsp),%r12d - movl %r9d,%r13d - addl %edi,%r12d - movl %ebx,%r14d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,60(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl 124(%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - leaq 64(%rbp),%rbp - cmpb $0x19,3(%rbp) - jnz L$rounds_16_xx - - movq 64+0(%rsp),%rdi - addl %r14d,%eax - leaq 64(%rsi),%rsi - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb L$loop - - leaq 64+24+48(%rsp),%r11 -.cfi_def_cfa %r11,8 - movq 64+24(%rsp),%r15 - movq -40(%r11),%r14 - movq -32(%r11),%r13 - movq -24(%r11),%r12 - movq -16(%r11),%rbx - movq -8(%r11),%rbp -.cfi_restore %r12 -.cfi_restore %r13 -.cfi_restore %r14 -.cfi_restore %r15 -.cfi_restore %rbp -.cfi_restore %rbx - leaq (%r11),%rsp - .byte 0xf3,0xc3 -.cfi_endproc - - -#ifndef __BLST_PORTABLE__ -.p2align 6 - -K256: -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 
0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 -.globl _blst_sha256_emit -.private_extern _blst_sha256_emit - -.p2align 4 -_blst_sha256_emit: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - bswapq %r8 - movq 24(%rsi),%r11 - bswapq %r9 - movl %r8d,4(%rdi) - bswapq %r10 - movl %r9d,12(%rdi) - bswapq %r11 - movl %r10d,20(%rdi) - shrq $32,%r8 - movl %r11d,28(%rdi) - shrq $32,%r9 - movl %r8d,0(%rdi) - shrq $32,%r10 - movl %r9d,8(%rdi) - shrq $32,%r11 - movl %r10d,16(%rdi) - movl %r11d,24(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _blst_sha256_bcopy -.private_extern _blst_sha256_bcopy - -.p2align 4 -_blst_sha256_bcopy: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - subq %rsi,%rdi -L$oop_bcopy: - movzbl (%rsi),%eax - leaq 1(%rsi),%rsi - movb %al,-1(%rdi,%rsi,1) - decq %rdx - jnz L$oop_bcopy - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _blst_sha256_hcopy -.private_extern _blst_sha256_hcopy - -.p2align 4 -_blst_sha256_hcopy: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc - -#endif diff --git a/crypto/blst_src/build/mach-o/sha256-x86_64.s b/crypto/blst_src/build/mach-o/sha256-x86_64.s deleted file mode 100644 index cff024eed4f..00000000000 --- a/crypto/blst_src/build/mach-o/sha256-x86_64.s +++ /dev/null @@ -1,1447 +0,0 @@ -.comm ___blst_platform_cap,4 -.text - -.p2align 6 - -K256: -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff -.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 -.globl _blst_sha256_block_data_order_shaext -.private_extern _blst_sha256_block_data_order_shaext - -.p2align 6 
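Editor's note: the K256 table above holds the 64 SHA-256 round constants. For rounds 16 through 63, both the scalar `L$rounds_16_xx` loop deleted earlier and the pshufd/psrld vector sequences deleted below compute the standard message-schedule expansion; a plain Go rendering (illustrative, not part of the package):

```go
package sha256sketch

import "math/bits"

// expandSchedule fills W[16..63] from the 16 message words, per FIPS 180-4:
//   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
func expandSchedule(w *[64]uint32) {
	for t := 16; t < 64; t++ {
		s0 := bits.RotateLeft32(w[t-15], -7) ^ bits.RotateLeft32(w[t-15], -18) ^ (w[t-15] >> 3)
		s1 := bits.RotateLeft32(w[t-2], -17) ^ bits.RotateLeft32(w[t-2], -19) ^ (w[t-2] >> 10)
		w[t] = s1 + w[t-7] + s0 + w[t-16]
	}
}
```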
-_blst_sha256_block_data_order_shaext: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - movq %rsp,%rbp -.cfi_def_cfa_register %rbp -L$blst_sha256_block_data_order$2: - - leaq K256+128(%rip),%rcx - movdqu (%rdi),%xmm1 - movdqu 16(%rdi),%xmm2 - movdqa 256-128(%rcx),%xmm7 - - pshufd $0x1b,%xmm1,%xmm0 - pshufd $0xb1,%xmm1,%xmm1 - pshufd $0x1b,%xmm2,%xmm2 - movdqa %xmm7,%xmm8 -.byte 102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - jmp L$oop_shaext - -.p2align 4 -L$oop_shaext: - movdqu (%rsi),%xmm3 - movdqu 16(%rsi),%xmm4 - movdqu 32(%rsi),%xmm5 -.byte 102,15,56,0,223 - movdqu 48(%rsi),%xmm6 - - movdqa 0-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 102,15,56,0,231 - movdqa %xmm2,%xmm10 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - nop - movdqa %xmm1,%xmm9 -.byte 15,56,203,202 - - movdqa 16-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 102,15,56,0,239 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - leaq 64(%rsi),%rsi -.byte 15,56,204,220 -.byte 15,56,203,202 - - movdqa 32-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 102,15,56,0,247 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - - movdqa 48-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 64-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 80-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 96-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 112-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 128-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 144-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 160-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 176-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 192-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 
15,56,204,243 -.byte 15,56,203,202 - movdqa 208-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 -.byte 15,56,203,202 - paddd %xmm7,%xmm6 - - movdqa 224-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 -.byte 15,56,205,245 - movdqa %xmm8,%xmm7 -.byte 15,56,203,202 - - movdqa 240-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 - nop -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - decq %rdx - nop -.byte 15,56,203,202 - - paddd %xmm10,%xmm2 - paddd %xmm9,%xmm1 - jnz L$oop_shaext - - pshufd $0xb1,%xmm2,%xmm2 - pshufd $0x1b,%xmm1,%xmm7 - pshufd $0xb1,%xmm1,%xmm1 - punpckhqdq %xmm2,%xmm1 -.byte 102,15,58,15,215,8 - - movdqu %xmm1,(%rdi) - movdqu %xmm2,16(%rdi) -.cfi_def_cfa_register %rsp - popq %rbp -.cfi_adjust_cfa_offset -8 -.cfi_restore %rbp - - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _blst_sha256_block_data_order -.private_extern _blst_sha256_block_data_order - -.p2align 6 -_blst_sha256_block_data_order: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - movq %rsp,%rbp -.cfi_def_cfa_register %rbp - testl $2,___blst_platform_cap(%rip) - jnz L$blst_sha256_block_data_order$2 - pushq %rbx -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - shlq $4,%rdx - subq $24,%rsp - - leaq (%rsi,%rdx,4),%rdx - movq %rdi,-64(%rbp) - - movq %rdx,-48(%rbp) - - - leaq -64(%rsp),%rsp - movl 0(%rdi),%eax - andq $-64,%rsp - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - - - jmp L$loop_ssse3 -.p2align 4 -L$loop_ssse3: - movdqa K256+256(%rip),%xmm7 - movq %rsi,-56(%rbp) - movdqu 0(%rsi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 -.byte 102,15,56,0,199 - movdqu 48(%rsi),%xmm3 - leaq K256(%rip),%rsi -.byte 102,15,56,0,207 - movdqa 0(%rsi),%xmm4 - movdqa 16(%rsi),%xmm5 -.byte 102,15,56,0,215 - paddd %xmm0,%xmm4 - movdqa 32(%rsi),%xmm6 -.byte 102,15,56,0,223 - movdqa 48(%rsi),%xmm7 - paddd %xmm1,%xmm5 - paddd %xmm2,%xmm6 - paddd %xmm3,%xmm7 - movdqa %xmm4,0(%rsp) - movl %eax,%r14d - movdqa %xmm5,16(%rsp) - movl %ebx,%edi - movdqa %xmm6,32(%rsp) - xorl %ecx,%edi - movdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp L$ssse3_00_47 - -.p2align 4 -L$ssse3_00_47: - subq $-64,%rsi - rorl $14,%r13d - movdqa %xmm1,%xmm4 - movl %r14d,%eax - movl %r9d,%r12d - movdqa %xmm3,%xmm7 - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d -.byte 102,15,58,15,224,4 - andl %r8d,%r12d - xorl %r8d,%r13d -.byte 102,15,58,15,250,4 - addl 0(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %ebx,%r15d - addl %r12d,%r11d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - paddd %xmm7,%xmm0 - rorl $2,%r14d - addl %r11d,%edx - psrld $7,%xmm6 - addl %edi,%r11d - movl %edx,%r13d - pshufd $250,%xmm3,%xmm7 - addl %r11d,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%r11d - movl %r8d,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %r11d,%r14d - pxor %xmm5,%xmm4 - andl %edx,%r12d - xorl %edx,%r13d - pslld $11,%xmm5 - addl 4(%rsp),%r10d - movl %r11d,%edi - pxor %xmm6,%xmm4 - xorl %r9d,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl 
%eax,%edi - addl %r12d,%r10d - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - psrld $10,%xmm7 - addl %r13d,%r10d - xorl %eax,%r15d - paddd %xmm4,%xmm0 - rorl $2,%r14d - addl %r10d,%ecx - psrlq $17,%xmm6 - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %ecx,%r13d - xorl %r8d,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - psrldq $8,%xmm7 - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - paddd %xmm7,%xmm0 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - pshufd $80,%xmm0,%xmm7 - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - movdqa %xmm7,%xmm6 - addl %edi,%r9d - movl %ebx,%r13d - psrld $10,%xmm7 - addl %r9d,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%r9d - movl %ecx,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - psrlq $2,%xmm6 - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - pxor %xmm6,%xmm7 - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %r10d,%edi - addl %r12d,%r8d - movdqa 0(%rsi),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - paddd %xmm7,%xmm0 - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - paddd %xmm0,%xmm6 - movl %eax,%r13d - addl %r8d,%r14d - movdqa %xmm6,0(%rsp) - rorl $14,%r13d - movdqa %xmm2,%xmm4 - movl %r14d,%r8d - movl %ebx,%r12d - movdqa %xmm0,%xmm7 - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d -.byte 102,15,58,15,225,4 - andl %eax,%r12d - xorl %eax,%r13d -.byte 102,15,58,15,251,4 - addl 16(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %r9d,%r15d - addl %r12d,%edx - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - paddd %xmm7,%xmm1 - rorl $2,%r14d - addl %edx,%r11d - psrld $7,%xmm6 - addl %edi,%edx - movl %r11d,%r13d - pshufd $250,%xmm0,%xmm7 - addl %edx,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%edx - movl %eax,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %edx,%r14d - pxor %xmm5,%xmm4 - andl %r11d,%r12d - xorl %r11d,%r13d - pslld $11,%xmm5 - addl 20(%rsp),%ecx - movl %edx,%edi - pxor %xmm6,%xmm4 - xorl %ebx,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %r8d,%edi - addl %r12d,%ecx - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - psrld $10,%xmm7 - addl %r13d,%ecx - xorl %r8d,%r15d - paddd %xmm4,%xmm1 - rorl $2,%r14d - addl %ecx,%r10d - psrlq $17,%xmm6 - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %r10d,%r13d - xorl %eax,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - psrldq $8,%xmm7 - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - paddd %xmm7,%xmm1 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - pshufd $80,%xmm1,%xmm7 - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - movdqa %xmm7,%xmm6 - addl %edi,%ebx - movl %r9d,%r13d - psrld 
$10,%xmm7 - addl %ebx,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%ebx - movl %r10d,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - psrlq $2,%xmm6 - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - pxor %xmm6,%xmm7 - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %ecx,%edi - addl %r12d,%eax - movdqa 16(%rsi),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - paddd %xmm7,%xmm1 - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - paddd %xmm1,%xmm6 - movl %r8d,%r13d - addl %eax,%r14d - movdqa %xmm6,16(%rsp) - rorl $14,%r13d - movdqa %xmm3,%xmm4 - movl %r14d,%eax - movl %r9d,%r12d - movdqa %xmm1,%xmm7 - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d -.byte 102,15,58,15,226,4 - andl %r8d,%r12d - xorl %r8d,%r13d -.byte 102,15,58,15,248,4 - addl 32(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %ebx,%r15d - addl %r12d,%r11d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - paddd %xmm7,%xmm2 - rorl $2,%r14d - addl %r11d,%edx - psrld $7,%xmm6 - addl %edi,%r11d - movl %edx,%r13d - pshufd $250,%xmm1,%xmm7 - addl %r11d,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%r11d - movl %r8d,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %r11d,%r14d - pxor %xmm5,%xmm4 - andl %edx,%r12d - xorl %edx,%r13d - pslld $11,%xmm5 - addl 36(%rsp),%r10d - movl %r11d,%edi - pxor %xmm6,%xmm4 - xorl %r9d,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %eax,%edi - addl %r12d,%r10d - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - psrld $10,%xmm7 - addl %r13d,%r10d - xorl %eax,%r15d - paddd %xmm4,%xmm2 - rorl $2,%r14d - addl %r10d,%ecx - psrlq $17,%xmm6 - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %ecx,%r13d - xorl %r8d,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - psrldq $8,%xmm7 - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - paddd %xmm7,%xmm2 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - pshufd $80,%xmm2,%xmm7 - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - movdqa %xmm7,%xmm6 - addl %edi,%r9d - movl %ebx,%r13d - psrld $10,%xmm7 - addl %r9d,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%r9d - movl %ecx,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - psrlq $2,%xmm6 - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - pxor %xmm6,%xmm7 - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %r10d,%edi - addl %r12d,%r8d - movdqa 32(%rsi),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - paddd %xmm7,%xmm2 - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - paddd %xmm2,%xmm6 - movl %eax,%r13d - addl %r8d,%r14d - movdqa %xmm6,32(%rsp) - rorl $14,%r13d - movdqa %xmm0,%xmm4 - movl %r14d,%r8d - movl %ebx,%r12d - movdqa %xmm2,%xmm7 - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d -.byte 
102,15,58,15,227,4 - andl %eax,%r12d - xorl %eax,%r13d -.byte 102,15,58,15,249,4 - addl 48(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %r9d,%r15d - addl %r12d,%edx - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - paddd %xmm7,%xmm3 - rorl $2,%r14d - addl %edx,%r11d - psrld $7,%xmm6 - addl %edi,%edx - movl %r11d,%r13d - pshufd $250,%xmm2,%xmm7 - addl %edx,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%edx - movl %eax,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %edx,%r14d - pxor %xmm5,%xmm4 - andl %r11d,%r12d - xorl %r11d,%r13d - pslld $11,%xmm5 - addl 52(%rsp),%ecx - movl %edx,%edi - pxor %xmm6,%xmm4 - xorl %ebx,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %r8d,%edi - addl %r12d,%ecx - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - psrld $10,%xmm7 - addl %r13d,%ecx - xorl %r8d,%r15d - paddd %xmm4,%xmm3 - rorl $2,%r14d - addl %ecx,%r10d - psrlq $17,%xmm6 - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %r10d,%r13d - xorl %eax,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - psrldq $8,%xmm7 - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - paddd %xmm7,%xmm3 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - pshufd $80,%xmm3,%xmm7 - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - movdqa %xmm7,%xmm6 - addl %edi,%ebx - movl %r9d,%r13d - psrld $10,%xmm7 - addl %ebx,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%ebx - movl %r10d,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - psrlq $2,%xmm6 - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - pxor %xmm6,%xmm7 - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %ecx,%edi - addl %r12d,%eax - movdqa 48(%rsi),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - paddd %xmm7,%xmm3 - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - paddd %xmm3,%xmm6 - movl %r8d,%r13d - addl %eax,%r14d - movdqa %xmm6,48(%rsp) - cmpb $0,67(%rsi) - jne L$ssse3_00_47 - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - rorl $6,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - rorl $2,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - rorl $11,%r14d - xorl %eax,%edi - addl %r12d,%r10d - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - rorl $2,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - rorl 
$5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - rorl $6,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - rorl $6,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - rorl $2,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - rorl $11,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - rorl $2,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - xorl %ecx,%edi - addl %r12d,%eax - rorl $6,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - rorl $6,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - rorl $2,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - rorl $11,%r14d - xorl %eax,%edi - addl %r12d,%r10d - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - rorl $2,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - 
movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - rorl $6,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - rorl $6,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - rorl $2,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - rorl $11,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - rorl $2,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - xorl %ecx,%edi - addl %r12d,%eax - rorl $6,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq -64(%rbp),%rdi - movl %r14d,%eax - movq -56(%rbp),%rsi - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - leaq 64(%rsi),%rsi - cmpq -48(%rbp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb L$loop_ssse3 - - xorps %xmm0,%xmm0 - movaps %xmm0,0(%rsp) - movaps %xmm0,16(%rsp) - movaps %xmm0,32(%rsp) - movaps %xmm0,48(%rsp) - movq -40(%rbp),%r15 - movq -32(%rbp),%r14 - movq -24(%rbp),%r13 - movq -16(%rbp),%r12 - movq -8(%rbp),%rbx - movq %rbp,%rsp -.cfi_def_cfa_register %rsp - popq %rbp 
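Editor's note: the three helpers deleted just below (`blst_sha256_emit`, `blst_sha256_bcopy`, `blst_sha256_hcopy`) are small utilities: a big-endian digest serializer, a byte copy, and a 32-byte state copy. The bswap/shr shuffle in `blst_sha256_emit` is equivalent to this Go sketch (hypothetical names, assuming the state is held as eight host-order 32-bit words):

```go
package sha256sketch

import "encoding/binary"

// emit serializes the eight 32-bit state words as the big-endian 32-byte
// digest. The assembly loads the state as four 64-bit words, byte-swaps
// each, and stores the dword halves at swapped offsets to the same effect.
func emit(md *[32]byte, h *[8]uint32) {
	for i, v := range h {
		binary.BigEndian.PutUint32(md[4*i:], v)
	}
}
```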
-.cfi_adjust_cfa_offset -8 -.cfi_restore %rbp -.cfi_restore %r12 -.cfi_restore %r13 -.cfi_restore %r14 -.cfi_restore %r15 -.cfi_restore %rbx - .byte 0xf3,0xc3 -.cfi_endproc - -.globl _blst_sha256_emit -.private_extern _blst_sha256_emit - -.p2align 4 -_blst_sha256_emit: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - bswapq %r8 - movq 24(%rsi),%r11 - bswapq %r9 - movl %r8d,4(%rdi) - bswapq %r10 - movl %r9d,12(%rdi) - bswapq %r11 - movl %r10d,20(%rdi) - shrq $32,%r8 - movl %r11d,28(%rdi) - shrq $32,%r9 - movl %r8d,0(%rdi) - shrq $32,%r10 - movl %r9d,8(%rdi) - shrq $32,%r11 - movl %r10d,16(%rdi) - movl %r11d,24(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _blst_sha256_bcopy -.private_extern _blst_sha256_bcopy - -.p2align 4 -_blst_sha256_bcopy: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - subq %rsi,%rdi -L$oop_bcopy: - movzbl (%rsi),%eax - leaq 1(%rsi),%rsi - movb %al,-1(%rdi,%rsi,1) - decq %rdx - jnz L$oop_bcopy - .byte 0xf3,0xc3 -.cfi_endproc - - -.globl _blst_sha256_hcopy -.private_extern _blst_sha256_hcopy - -.p2align 4 -_blst_sha256_hcopy: -.cfi_startproc - .byte 0xf3,0x0f,0x1e,0xfa - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc - diff --git a/crypto/blst_src/build/refresh.sh b/crypto/blst_src/build/refresh.sh deleted file mode 100755 index 56b0b279c69..00000000000 --- a/crypto/blst_src/build/refresh.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/sh - -HERE=`dirname $0` -cd "${HERE}" - -PERL=${PERL:-perl} - -for pl in ../src/asm/*-x86_64.pl; do - s=`basename $pl .pl`.asm - expr $s : '.*portable' > /dev/null || (set -x; ${PERL} $pl masm > win64/$s) - s=`basename $pl .pl`.s - (set -x; ${PERL} $pl elf > elf/$s) - (set -x; ${PERL} $pl mingw64 > coff/$s) - (set -x; ${PERL} $pl macosx > mach-o/$s) -done - -for pl in ../src/asm/*-armv8.pl; do - s=`basename $pl .pl`.asm - (set -x; ${PERL} $pl win64 > win64/$s) - s=`basename $pl .pl`.S - (set -x; ${PERL} $pl linux64 > elf/$s) - (set -x; ${PERL} $pl coff64 > coff/$s) - (set -x; ${PERL} $pl ios64 > mach-o/$s) -done - -( cd ../bindings; - echo "LIBRARY blst" - echo - echo "EXPORTS" - cc -E blst.h | \ - ${PERL} -ne '{ (/(blst_[\w]+)\s*\(/ || /(BLS12_[\w]+);/) && print "\t$1\n" }' - echo -) > win64/blst.def - -if which bindgen > /dev/null 2>&1; then - ( cd ../bindings; set -x; - bindgen --opaque-type blst_pairing \ - --opaque-type blst_uniq \ - --with-derive-default \ - --with-derive-eq \ - --rustified-enum BLST.\* \ - blst.h -- -D__BLST_RUST_BINDGEN__ \ - | ${PERL} ../build/bindings_trim.pl > rust/src/bindings.rs - ) -else - echo "Install Rust bindgen with 'cargo install bindgen-cli'" 1>&2 - exit 1 -fi diff --git a/crypto/blst_src/build/win64/add_mod_256-armv8.asm b/crypto/blst_src/build/win64/add_mod_256-armv8.asm deleted file mode 100644 index 8d6975185a6..00000000000 --- a/crypto/blst_src/build/win64/add_mod_256-armv8.asm +++ /dev/null @@ -1,380 +0,0 @@ - AREA |.text|,CODE,ALIGN=8,ARM64 - - - - EXPORT |add_mod_256|[FUNC] - ALIGN 32 -|add_mod_256| PROC - ldp x8,x9,[x1] - ldp x12,x13,[x2] - - ldp x10,x11,[x1,#16] - adds x8,x8,x12 - ldp x14,x15,[x2,#16] - adcs x9,x9,x13 - ldp x4,x5,[x3] - adcs x10,x10,x14 - ldp x6,x7,[x3,#16] - adcs x11,x11,x15 - adc x3,xzr,xzr - - subs x16,x8,x4 - sbcs x17,x9,x5 - sbcs x1,x10,x6 - sbcs x2,x11,x7 - sbcs xzr,x3,xzr - - csello x8,x8,x16 - csello x9,x9,x17 - csello x10,x10,x1 - stp x8,x9,[x0] - csello x11,x11,x2 - 
stp x10,x11,[x0,#16] - - ret - ENDP - - - - EXPORT |mul_by_3_mod_256|[FUNC] - ALIGN 32 -|mul_by_3_mod_256| PROC - ldp x12,x13,[x1] - ldp x14,x15,[x1,#16] - - adds x8,x12,x12 - ldp x4,x5,[x2] - adcs x9,x13,x13 - ldp x6,x7,[x2,#16] - adcs x10,x14,x14 - adcs x11,x15,x15 - adc x3,xzr,xzr - - subs x16,x8,x4 - sbcs x17,x9,x5 - sbcs x1,x10,x6 - sbcs x2,x11,x7 - sbcs xzr,x3,xzr - - csello x8,x8,x16 - csello x9,x9,x17 - csello x10,x10,x1 - csello x11,x11,x2 - - adds x8,x8,x12 - adcs x9,x9,x13 - adcs x10,x10,x14 - adcs x11,x11,x15 - adc x3,xzr,xzr - - subs x16,x8,x4 - sbcs x17,x9,x5 - sbcs x1,x10,x6 - sbcs x2,x11,x7 - sbcs xzr,x3,xzr - - csello x8,x8,x16 - csello x9,x9,x17 - csello x10,x10,x1 - stp x8,x9,[x0] - csello x11,x11,x2 - stp x10,x11,[x0,#16] - - ret - ENDP - - - - EXPORT |lshift_mod_256|[FUNC] - ALIGN 32 -|lshift_mod_256| PROC - ldp x8,x9,[x1] - ldp x10,x11,[x1,#16] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - -|$Loop_lshift_mod_256| - adds x8,x8,x8 - sub x2,x2,#1 - adcs x9,x9,x9 - adcs x10,x10,x10 - adcs x11,x11,x11 - adc x3,xzr,xzr - - subs x12,x8,x4 - sbcs x13,x9,x5 - sbcs x14,x10,x6 - sbcs x15,x11,x7 - sbcs xzr,x3,xzr - - csello x8,x8,x12 - csello x9,x9,x13 - csello x10,x10,x14 - csello x11,x11,x15 - - cbnz x2,|$Loop_lshift_mod_256| - - stp x8,x9,[x0] - stp x10,x11,[x0,#16] - - ret - ENDP - - - - EXPORT |rshift_mod_256|[FUNC] - ALIGN 32 -|rshift_mod_256| PROC - ldp x8,x9,[x1] - ldp x10,x11,[x1,#16] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - -|$Loop_rshift| - adds x12,x8,x4 - sub x2,x2,#1 - adcs x13,x9,x5 - adcs x14,x10,x6 - adcs x15,x11,x7 - adc x3,xzr,xzr - tst x8,#1 - - cselne x12,x12,x8 - cselne x13,x13,x9 - cselne x14,x14,x10 - cselne x15,x15,x11 - cselne x3,x3,xzr - - extr x8,x13,x12,#1 - extr x9,x14,x13,#1 - extr x10,x15,x14,#1 - extr x11,x3,x15,#1 - - cbnz x2,|$Loop_rshift| - - stp x8,x9,[x0] - stp x10,x11,[x0,#16] - - ret - ENDP - - - - EXPORT |cneg_mod_256|[FUNC] - ALIGN 32 -|cneg_mod_256| PROC - ldp x8,x9,[x1] - ldp x4,x5,[x3] - - ldp x10,x11,[x1,#16] - subs x12,x4,x8 - ldp x6,x7,[x3,#16] - orr x4,x8,x9 - sbcs x13,x5,x9 - orr x5,x10,x11 - sbcs x14,x6,x10 - orr x3,x4,x5 - sbc x15,x7,x11 - - cmp x3,#0 - csetmne x3 - ands x2,x2,x3 - - cseleq x8,x8,x12 - cseleq x9,x9,x13 - cseleq x10,x10,x14 - stp x8,x9,[x0] - cseleq x11,x11,x15 - stp x10,x11,[x0,#16] - - ret - ENDP - - - - EXPORT |sub_mod_256|[FUNC] - ALIGN 32 -|sub_mod_256| PROC - ldp x8,x9,[x1] - ldp x12,x13,[x2] - - ldp x10,x11,[x1,#16] - subs x8,x8,x12 - ldp x14,x15,[x2,#16] - sbcs x9,x9,x13 - ldp x4,x5,[x3] - sbcs x10,x10,x14 - ldp x6,x7,[x3,#16] - sbcs x11,x11,x15 - sbc x3,xzr,xzr - - and x4,x4,x3 - and x5,x5,x3 - adds x8,x8,x4 - and x6,x6,x3 - adcs x9,x9,x5 - and x7,x7,x3 - adcs x10,x10,x6 - stp x8,x9,[x0] - adc x11,x11,x7 - stp x10,x11,[x0,#16] - - ret - ENDP - - - - EXPORT |check_mod_256|[FUNC] - ALIGN 32 -|check_mod_256| PROC - ldp x8,x9,[x0] - ldp x10,x11,[x0,#16] - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x9,x9 - rev x10,x10 - rev x11,x11 -#endif - - subs xzr,x8,x4 - sbcs xzr,x9,x5 - orr x8,x8,x9 - sbcs xzr,x10,x6 - orr x8,x8,x10 - sbcs xzr,x11,x7 - orr x8,x8,x11 - sbc x1,xzr,xzr - - cmp x8,#0 - mov x0,#1 - cselne x0,x0,xzr - and x0,x0,x1 - - ret - ENDP - - - - EXPORT |add_n_check_mod_256|[FUNC] - ALIGN 32 -|add_n_check_mod_256| PROC - ldp x8,x9,[x1] - ldp x12,x13,[x2] - ldp x10,x11,[x1,#16] - ldp x14,x15,[x2,#16] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 - rev x10,x10 - rev x14,x14 - rev x11,x11 - rev x15,x15 -#endif - - adds x8,x8,x12 - ldp 
x4,x5,[x3] - adcs x9,x9,x13 - ldp x6,x7,[x3,#16] - adcs x10,x10,x14 - adcs x11,x11,x15 - adc x3,xzr,xzr - - subs x16,x8,x4 - sbcs x17,x9,x5 - sbcs x1,x10,x6 - sbcs x2,x11,x7 - sbcs xzr,x3,xzr - - csello x8,x8,x16 - csello x9,x9,x17 - csello x10,x10,x1 - csello x11,x11,x2 - - orr x16, x8, x9 - orr x17, x10, x11 - orr x16, x16, x17 - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x9,x9 - rev x10,x10 - rev x11,x11 -#endif - - stp x8,x9,[x0] - stp x10,x11,[x0,#16] - - mov x17, #1 - cmp x16, #0 - cselne x0,x17,xzr - - ret - ENDP - - - - EXPORT |sub_n_check_mod_256|[FUNC] - ALIGN 32 -|sub_n_check_mod_256| PROC - ldp x8,x9,[x1] - ldp x12,x13,[x2] - ldp x10,x11,[x1,#16] - ldp x14,x15,[x2,#16] - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x12,x12 - rev x9,x9 - rev x13,x13 - rev x10,x10 - rev x14,x14 - rev x11,x11 - rev x15,x15 -#endif - - subs x8,x8,x12 - sbcs x9,x9,x13 - ldp x4,x5,[x3] - sbcs x10,x10,x14 - ldp x6,x7,[x3,#16] - sbcs x11,x11,x15 - sbc x3,xzr,xzr - - and x4,x4,x3 - and x5,x5,x3 - adds x8,x8,x4 - and x6,x6,x3 - adcs x9,x9,x5 - and x7,x7,x3 - adcs x10,x10,x6 - adc x11,x11,x7 - - orr x16, x8, x9 - orr x17, x10, x11 - orr x16, x16, x17 - -#ifdef __AARCH64EB__ - rev x8,x8 - rev x9,x9 - rev x10,x10 - rev x11,x11 -#endif - - stp x8,x9,[x0] - stp x10,x11,[x0,#16] - - mov x17, #1 - cmp x16, #0 - cselne x0,x17,xzr - - ret - ENDP - END diff --git a/crypto/blst_src/build/win64/add_mod_256-x86_64.asm b/crypto/blst_src/build/win64/add_mod_256-x86_64.asm deleted file mode 100644 index d5308b8f809..00000000000 --- a/crypto/blst_src/build/win64/add_mod_256-x86_64.asm +++ /dev/null @@ -1,939 +0,0 @@ -OPTION DOTNAME -.text$ SEGMENT ALIGN(256) 'CODE' - -PUBLIC add_mod_256 - - -ALIGN 32 -add_mod_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_add_mod_256:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbx - - sub rsp,8 - -$L$SEH_body_add_mod_256:: - - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - -$L$oaded_a_add_mod_256:: - add r8,QWORD PTR[rdx] - adc r9,QWORD PTR[8+rdx] - mov rax,r8 - adc r10,QWORD PTR[16+rdx] - mov rsi,r9 - adc r11,QWORD PTR[24+rdx] - sbb rdx,rdx - - mov rbx,r10 - sub r8,QWORD PTR[rcx] - sbb r9,QWORD PTR[8+rcx] - sbb r10,QWORD PTR[16+rcx] - mov rbp,r11 - sbb r11,QWORD PTR[24+rcx] - sbb rdx,0 - - cmovc r8,rax - cmovc r9,rsi - mov QWORD PTR[rdi],r8 - cmovc r10,rbx - mov QWORD PTR[8+rdi],r9 - cmovc r11,rbp - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - - mov rbx,QWORD PTR[8+rsp] - - mov rbp,QWORD PTR[16+rsp] - - lea rsp,QWORD PTR[24+rsp] - -$L$SEH_epilogue_add_mod_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_add_mod_256:: -add_mod_256 ENDP - - -PUBLIC mul_by_3_mod_256 - - -ALIGN 32 -mul_by_3_mod_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_mul_by_3_mod_256:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbx - - push r12 - -$L$SEH_body_mul_by_3_mod_256:: - - - mov rcx,rdx - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov rdx,rsi - mov r11,QWORD PTR[24+rsi] - - call __lshift_mod_256 - mov r12,QWORD PTR[rsp] - - jmp $L$oaded_a_add_mod_256 - - mov rbx,QWORD PTR[8+rsp] - - mov rbp,QWORD PTR[16+rsp] - - lea rsp,QWORD PTR[24+rsp] - -$L$SEH_epilogue_mul_by_3_mod_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue 
- mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_mul_by_3_mod_256:: -mul_by_3_mod_256 ENDP - - -ALIGN 32 -__lshift_mod_256 PROC PRIVATE - DB 243,15,30,250 - - add r8,r8 - adc r9,r9 - mov rax,r8 - adc r10,r10 - mov rsi,r9 - adc r11,r11 - sbb r12,r12 - - mov rbx,r10 - sub r8,QWORD PTR[rcx] - sbb r9,QWORD PTR[8+rcx] - sbb r10,QWORD PTR[16+rcx] - mov rbp,r11 - sbb r11,QWORD PTR[24+rcx] - sbb r12,0 - - cmovc r8,rax - cmovc r9,rsi - cmovc r10,rbx - cmovc r11,rbp - - DB 0F3h,0C3h ;repret -__lshift_mod_256 ENDP - - -PUBLIC lshift_mod_256 - - -ALIGN 32 -lshift_mod_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_lshift_mod_256:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbx - - push r12 - -$L$SEH_body_lshift_mod_256:: - - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - -$L$oop_lshift_mod_256:: - call __lshift_mod_256 - dec edx - jnz $L$oop_lshift_mod_256 - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - - mov r12,QWORD PTR[rsp] - - mov rbx,QWORD PTR[8+rsp] - - mov rbp,QWORD PTR[16+rsp] - - lea rsp,QWORD PTR[24+rsp] - -$L$SEH_epilogue_lshift_mod_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_lshift_mod_256:: -lshift_mod_256 ENDP - - -PUBLIC rshift_mod_256 - - -ALIGN 32 -rshift_mod_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_rshift_mod_256:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbx - - sub rsp,8 - -$L$SEH_body_rshift_mod_256:: - - - mov rbp,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - -$L$oop_rshift_mod_256:: - mov r8,rbp - and rbp,1 - mov rax,QWORD PTR[rcx] - neg rbp - mov rsi,QWORD PTR[8+rcx] - mov rbx,QWORD PTR[16+rcx] - - and rax,rbp - and rsi,rbp - and rbx,rbp - and rbp,QWORD PTR[24+rcx] - - add r8,rax - adc r9,rsi - adc r10,rbx - adc r11,rbp - sbb rax,rax - - shr r8,1 - mov rbp,r9 - shr r9,1 - mov rbx,r10 - shr r10,1 - mov rsi,r11 - shr r11,1 - - shl rbp,63 - shl rbx,63 - or rbp,r8 - shl rsi,63 - or r9,rbx - shl rax,63 - or r10,rsi - or r11,rax - - dec edx - jnz $L$oop_rshift_mod_256 - - mov QWORD PTR[rdi],rbp - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - - mov rbx,QWORD PTR[8+rsp] - - mov rbp,QWORD PTR[16+rsp] - - lea rsp,QWORD PTR[24+rsp] - -$L$SEH_epilogue_rshift_mod_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_rshift_mod_256:: -rshift_mod_256 ENDP - - -PUBLIC cneg_mod_256 - - -ALIGN 32 -cneg_mod_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_cneg_mod_256:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbx - - push r12 - -$L$SEH_body_cneg_mod_256:: - - - mov r12,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r8,r12 - mov r11,QWORD PTR[24+rsi] - or r12,r9 - or r12,r10 - or r12,r11 - mov rbp,-1 - - mov rax,QWORD PTR[rcx] - cmovnz r12,rbp - mov rsi,QWORD PTR[8+rcx] - mov rbx,QWORD PTR[16+rcx] - and rax,r12 - mov rbp,QWORD PTR[24+rcx] - and rsi,r12 - and rbx,r12 - and rbp,r12 - - sub rax,r8 - sbb rsi,r9 - sbb rbx,r10 - sbb rbp,r11 - - or 
rdx,rdx - - cmovz rax,r8 - cmovz rsi,r9 - mov QWORD PTR[rdi],rax - cmovz rbx,r10 - mov QWORD PTR[8+rdi],rsi - cmovz rbp,r11 - mov QWORD PTR[16+rdi],rbx - mov QWORD PTR[24+rdi],rbp - - mov r12,QWORD PTR[rsp] - - mov rbx,QWORD PTR[8+rsp] - - mov rbp,QWORD PTR[16+rsp] - - lea rsp,QWORD PTR[24+rsp] - -$L$SEH_epilogue_cneg_mod_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_cneg_mod_256:: -cneg_mod_256 ENDP - - -PUBLIC sub_mod_256 - - -ALIGN 32 -sub_mod_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sub_mod_256:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbx - - sub rsp,8 - -$L$SEH_body_sub_mod_256:: - - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - - sub r8,QWORD PTR[rdx] - mov rax,QWORD PTR[rcx] - sbb r9,QWORD PTR[8+rdx] - mov rsi,QWORD PTR[8+rcx] - sbb r10,QWORD PTR[16+rdx] - mov rbx,QWORD PTR[16+rcx] - sbb r11,QWORD PTR[24+rdx] - mov rbp,QWORD PTR[24+rcx] - sbb rdx,rdx - - and rax,rdx - and rsi,rdx - and rbx,rdx - and rbp,rdx - - add r8,rax - adc r9,rsi - mov QWORD PTR[rdi],r8 - adc r10,rbx - mov QWORD PTR[8+rdi],r9 - adc r11,rbp - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - - mov rbx,QWORD PTR[8+rsp] - - mov rbp,QWORD PTR[16+rsp] - - lea rsp,QWORD PTR[24+rsp] - -$L$SEH_epilogue_sub_mod_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sub_mod_256:: -sub_mod_256 ENDP - - -PUBLIC check_mod_256 - - -ALIGN 32 -check_mod_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_check_mod_256:: - - - mov rdi,rcx - mov rsi,rdx - mov rax,QWORD PTR[rdi] - mov r9,QWORD PTR[8+rdi] - mov r10,QWORD PTR[16+rdi] - mov r11,QWORD PTR[24+rdi] - - mov r8,rax - or rax,r9 - or rax,r10 - or rax,r11 - - sub r8,QWORD PTR[rsi] - sbb r9,QWORD PTR[8+rsi] - sbb r10,QWORD PTR[16+rsi] - sbb r11,QWORD PTR[24+rsi] - sbb rsi,rsi - - mov rdx,1 - cmp rax,0 - cmovne rax,rdx - and rax,rsi -$L$SEH_epilogue_check_mod_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_check_mod_256:: -check_mod_256 ENDP - - -PUBLIC add_n_check_mod_256 - - -ALIGN 32 -add_n_check_mod_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_add_n_check_mod_256:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbx - - sub rsp,8 - -$L$SEH_body_add_n_check_mod_256:: - - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - - add r8,QWORD PTR[rdx] - adc r9,QWORD PTR[8+rdx] - mov rax,r8 - adc r10,QWORD PTR[16+rdx] - mov rsi,r9 - adc r11,QWORD PTR[24+rdx] - sbb rdx,rdx - - mov rbx,r10 - sub r8,QWORD PTR[rcx] - sbb r9,QWORD PTR[8+rcx] - sbb r10,QWORD PTR[16+rcx] - mov rbp,r11 - sbb r11,QWORD PTR[24+rcx] - sbb rdx,0 - - cmovc r8,rax - cmovc r9,rsi - mov QWORD PTR[rdi],r8 - cmovc r10,rbx - mov QWORD PTR[8+rdi],r9 - cmovc r11,rbp - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - - or r8,r9 - or r10,r11 - or r8,r10 - mov rax,1 - cmovz rax,r8 - - mov rbx,QWORD PTR[8+rsp] - - mov rbp,QWORD PTR[16+rsp] - - lea rsp,QWORD PTR[24+rsp] - -$L$SEH_epilogue_add_n_check_mod_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD 
PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_add_n_check_mod_256:: -add_n_check_mod_256 ENDP - - -PUBLIC sub_n_check_mod_256 - - -ALIGN 32 -sub_n_check_mod_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sub_n_check_mod_256:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbx - - sub rsp,8 - -$L$SEH_body_sub_n_check_mod_256:: - - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - - sub r8,QWORD PTR[rdx] - mov rax,QWORD PTR[rcx] - sbb r9,QWORD PTR[8+rdx] - mov rsi,QWORD PTR[8+rcx] - sbb r10,QWORD PTR[16+rdx] - mov rbx,QWORD PTR[16+rcx] - sbb r11,QWORD PTR[24+rdx] - mov rbp,QWORD PTR[24+rcx] - sbb rdx,rdx - - and rax,rdx - and rsi,rdx - and rbx,rdx - and rbp,rdx - - add r8,rax - adc r9,rsi - mov QWORD PTR[rdi],r8 - adc r10,rbx - mov QWORD PTR[8+rdi],r9 - adc r11,rbp - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - - or r8,r9 - or r10,r11 - or r8,r10 - mov rax,1 - cmovz rax,r8 - - mov rbx,QWORD PTR[8+rsp] - - mov rbp,QWORD PTR[16+rsp] - - lea rsp,QWORD PTR[24+rsp] - -$L$SEH_epilogue_sub_n_check_mod_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sub_n_check_mod_256:: -sub_n_check_mod_256 ENDP -.text$ ENDS -.pdata SEGMENT READONLY ALIGN(4) -ALIGN 4 - DD imagerel $L$SEH_begin_add_mod_256 - DD imagerel $L$SEH_body_add_mod_256 - DD imagerel $L$SEH_info_add_mod_256_prologue - - DD imagerel $L$SEH_body_add_mod_256 - DD imagerel $L$SEH_epilogue_add_mod_256 - DD imagerel $L$SEH_info_add_mod_256_body - - DD imagerel $L$SEH_epilogue_add_mod_256 - DD imagerel $L$SEH_end_add_mod_256 - DD imagerel $L$SEH_info_add_mod_256_epilogue - - DD imagerel $L$SEH_begin_mul_by_3_mod_256 - DD imagerel $L$SEH_body_mul_by_3_mod_256 - DD imagerel $L$SEH_info_mul_by_3_mod_256_prologue - - DD imagerel $L$SEH_body_mul_by_3_mod_256 - DD imagerel $L$SEH_epilogue_mul_by_3_mod_256 - DD imagerel $L$SEH_info_mul_by_3_mod_256_body - - DD imagerel $L$SEH_epilogue_mul_by_3_mod_256 - DD imagerel $L$SEH_end_mul_by_3_mod_256 - DD imagerel $L$SEH_info_mul_by_3_mod_256_epilogue - - DD imagerel $L$SEH_begin_lshift_mod_256 - DD imagerel $L$SEH_body_lshift_mod_256 - DD imagerel $L$SEH_info_lshift_mod_256_prologue - - DD imagerel $L$SEH_body_lshift_mod_256 - DD imagerel $L$SEH_epilogue_lshift_mod_256 - DD imagerel $L$SEH_info_lshift_mod_256_body - - DD imagerel $L$SEH_epilogue_lshift_mod_256 - DD imagerel $L$SEH_end_lshift_mod_256 - DD imagerel $L$SEH_info_lshift_mod_256_epilogue - - DD imagerel $L$SEH_begin_rshift_mod_256 - DD imagerel $L$SEH_body_rshift_mod_256 - DD imagerel $L$SEH_info_rshift_mod_256_prologue - - DD imagerel $L$SEH_body_rshift_mod_256 - DD imagerel $L$SEH_epilogue_rshift_mod_256 - DD imagerel $L$SEH_info_rshift_mod_256_body - - DD imagerel $L$SEH_epilogue_rshift_mod_256 - DD imagerel $L$SEH_end_rshift_mod_256 - DD imagerel $L$SEH_info_rshift_mod_256_epilogue - - DD imagerel $L$SEH_begin_cneg_mod_256 - DD imagerel $L$SEH_body_cneg_mod_256 - DD imagerel $L$SEH_info_cneg_mod_256_prologue - - DD imagerel $L$SEH_body_cneg_mod_256 - DD imagerel $L$SEH_epilogue_cneg_mod_256 - DD imagerel $L$SEH_info_cneg_mod_256_body - - DD imagerel $L$SEH_epilogue_cneg_mod_256 - DD imagerel $L$SEH_end_cneg_mod_256 - DD imagerel $L$SEH_info_cneg_mod_256_epilogue - - DD imagerel $L$SEH_begin_sub_mod_256 - DD imagerel $L$SEH_body_sub_mod_256 - DD imagerel $L$SEH_info_sub_mod_256_prologue - - DD 
imagerel $L$SEH_body_sub_mod_256 - DD imagerel $L$SEH_epilogue_sub_mod_256 - DD imagerel $L$SEH_info_sub_mod_256_body - - DD imagerel $L$SEH_epilogue_sub_mod_256 - DD imagerel $L$SEH_end_sub_mod_256 - DD imagerel $L$SEH_info_sub_mod_256_epilogue - - DD imagerel $L$SEH_epilogue_check_mod_256 - DD imagerel $L$SEH_end_check_mod_256 - DD imagerel $L$SEH_info_check_mod_256_epilogue - - DD imagerel $L$SEH_begin_add_n_check_mod_256 - DD imagerel $L$SEH_body_add_n_check_mod_256 - DD imagerel $L$SEH_info_add_n_check_mod_256_prologue - - DD imagerel $L$SEH_body_add_n_check_mod_256 - DD imagerel $L$SEH_epilogue_add_n_check_mod_256 - DD imagerel $L$SEH_info_add_n_check_mod_256_body - - DD imagerel $L$SEH_epilogue_add_n_check_mod_256 - DD imagerel $L$SEH_end_add_n_check_mod_256 - DD imagerel $L$SEH_info_add_n_check_mod_256_epilogue - - DD imagerel $L$SEH_begin_sub_n_check_mod_256 - DD imagerel $L$SEH_body_sub_n_check_mod_256 - DD imagerel $L$SEH_info_sub_n_check_mod_256_prologue - - DD imagerel $L$SEH_body_sub_n_check_mod_256 - DD imagerel $L$SEH_epilogue_sub_n_check_mod_256 - DD imagerel $L$SEH_info_sub_n_check_mod_256_body - - DD imagerel $L$SEH_epilogue_sub_n_check_mod_256 - DD imagerel $L$SEH_end_sub_n_check_mod_256 - DD imagerel $L$SEH_info_sub_n_check_mod_256_epilogue - -.pdata ENDS -.xdata SEGMENT READONLY ALIGN(8) -ALIGN 8 -$L$SEH_info_add_mod_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_add_mod_256_body:: -DB 1,0,9,0 -DB 000h,034h,001h,000h -DB 000h,054h,002h,000h -DB 000h,074h,004h,000h -DB 000h,064h,005h,000h -DB 000h,022h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_add_mod_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_mul_by_3_mod_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_mul_by_3_mod_256_body:: -DB 1,0,11,0 -DB 000h,0c4h,000h,000h -DB 000h,034h,001h,000h -DB 000h,054h,002h,000h -DB 000h,074h,004h,000h -DB 000h,064h,005h,000h -DB 000h,022h -DB 000h,000h,000h,000h,000h,000h -$L$SEH_info_mul_by_3_mod_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_lshift_mod_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_lshift_mod_256_body:: -DB 1,0,11,0 -DB 000h,0c4h,000h,000h -DB 000h,034h,001h,000h -DB 000h,054h,002h,000h -DB 000h,074h,004h,000h -DB 000h,064h,005h,000h -DB 000h,022h -DB 000h,000h,000h,000h,000h,000h -$L$SEH_info_lshift_mod_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_rshift_mod_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_rshift_mod_256_body:: -DB 1,0,9,0 -DB 000h,034h,001h,000h -DB 000h,054h,002h,000h -DB 000h,074h,004h,000h -DB 000h,064h,005h,000h -DB 000h,022h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_rshift_mod_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_cneg_mod_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_cneg_mod_256_body:: -DB 1,0,11,0 -DB 000h,0c4h,000h,000h -DB 000h,034h,001h,000h -DB 000h,054h,002h,000h -DB 000h,074h,004h,000h -DB 000h,064h,005h,000h -DB 000h,022h -DB 000h,000h,000h,000h,000h,000h -$L$SEH_info_cneg_mod_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 
000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sub_mod_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sub_mod_256_body:: -DB 1,0,9,0 -DB 000h,034h,001h,000h -DB 000h,054h,002h,000h -DB 000h,074h,004h,000h -DB 000h,064h,005h,000h -DB 000h,022h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sub_mod_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_check_mod_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_add_n_check_mod_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_add_n_check_mod_256_body:: -DB 1,0,9,0 -DB 000h,034h,001h,000h -DB 000h,054h,002h,000h -DB 000h,074h,004h,000h -DB 000h,064h,005h,000h -DB 000h,022h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_add_n_check_mod_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sub_n_check_mod_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sub_n_check_mod_256_body:: -DB 1,0,9,0 -DB 000h,034h,001h,000h -DB 000h,054h,002h,000h -DB 000h,074h,004h,000h -DB 000h,064h,005h,000h -DB 000h,022h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sub_n_check_mod_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - - -.xdata ENDS -END diff --git a/crypto/blst_src/build/win64/add_mod_384-armv8.asm b/crypto/blst_src/build/win64/add_mod_384-armv8.asm deleted file mode 100644 index 4bf703a6da0..00000000000 --- a/crypto/blst_src/build/win64/add_mod_384-armv8.asm +++ /dev/null @@ -1,1001 +0,0 @@ - AREA |.text|,CODE,ALIGN=8,ARM64 - - - - EXPORT |add_mod_384|[FUNC] - ALIGN 32 -|add_mod_384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - - bl __add_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - DCDU 3573752767 - ret - ENDP - - - ALIGN 32 -|__add_mod_384| PROC - ldp x10,x11,[x1] - ldp x16,x17,[x2] - ldp x12,x13,[x1,#16] - ldp x19,x20,[x2,#16] - ldp x14,x15,[x1,#32] - ldp x21,x22,[x2,#32] - -|__add_mod_384_ab_are_loaded| - adds x10,x10,x16 - adcs x11,x11,x17 - adcs x12,x12,x19 - adcs x13,x13,x20 - adcs x14,x14,x21 - adcs x15,x15,x22 - adc x3,xzr,xzr - - subs x16,x10,x4 - sbcs x17,x11,x5 - sbcs x19,x12,x6 - sbcs x20,x13,x7 - sbcs x21,x14,x8 - sbcs x22,x15,x9 - sbcs xzr,x3,xzr - - csello x10,x10,x16 - csello x11,x11,x17 - csello x12,x12,x19 - csello x13,x13,x20 - csello x14,x14,x21 - csello x15,x15,x22 - - ret - ENDP - - - - EXPORT |add_mod_384x|[FUNC] - ALIGN 32 -|add_mod_384x| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - - bl __add_mod_384 - - stp x10,x11,[x0] - add x1,x1,#48 - stp x12,x13,[x0,#16] - add x2,x2,#48 - stp x14,x15,[x0,#32] - - bl __add_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |rshift_mod_384|[FUNC] - ALIGN 32 -|rshift_mod_384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - -|$Loop_rshift_mod_384| - sub x2,x2,#1 - bl __rshift_mod_384 - cbnz x2,|$Loop_rshift_mod_384| - - ldr x30,[sp,#8] - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - DCDU 3573752767 - ret - ENDP - - - ALIGN 32 -|__rshift_mod_384| PROC - sbfx x22,x10,#0,#1 - and x16,x22,x4 - and x17,x22,x5 - adds x10,x10,x16 - and x19,x22,x6 - adcs x11,x11,x17 - and x20,x22,x7 - adcs x12,x12,x19 - and x21,x22,x8 - adcs x13,x13,x20 - and x22,x22,x9 - adcs x14,x14,x21 - extr x10,x11,x10,#1 // a[0:5] >>= 1 - adcs x15,x15,x22 - extr x11,x12,x11,#1 - adc x22,xzr,xzr - extr x12,x13,x12,#1 - extr x13,x14,x13,#1 - extr x14,x15,x14,#1 - extr x15,x22,x15,#1 - ret - ENDP - - - - EXPORT |div_by_2_mod_384|[FUNC] - ALIGN 32 -|div_by_2_mod_384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __rshift_mod_384 - - ldr x30,[sp,#8] - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |lshift_mod_384|[FUNC] - ALIGN 32 -|lshift_mod_384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - -|$Loop_lshift_mod_384| - sub x2,x2,#1 - bl __lshift_mod_384 - cbnz x2,|$Loop_lshift_mod_384| - - ldr x30,[sp,#8] - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - DCDU 3573752767 - ret - ENDP - - - ALIGN 32 -|__lshift_mod_384| PROC - adds x10,x10,x10 - adcs x11,x11,x11 - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adc x3,xzr,xzr - - subs x16,x10,x4 - sbcs x17,x11,x5 - sbcs x19,x12,x6 - sbcs x20,x13,x7 - sbcs x21,x14,x8 - sbcs x22,x15,x9 - sbcs xzr,x3,xzr - - csello x10,x10,x16 - csello x11,x11,x17 - csello x12,x12,x19 - csello x13,x13,x20 - csello x14,x14,x21 - csello x15,x15,x22 - - ret - ENDP - - - - EXPORT |mul_by_3_mod_384|[FUNC] - ALIGN 32 -|mul_by_3_mod_384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __lshift_mod_384 - - ldp x16,x17,[x1] - ldp x19,x20,[x1,#16] - ldp x21,x22,[x1,#32] - - bl __add_mod_384_ab_are_loaded - ldr x30,[sp,#8] - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |mul_by_8_mod_384|[FUNC] - ALIGN 32 -|mul_by_8_mod_384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |mul_by_3_mod_384x|[FUNC] - ALIGN 32 -|mul_by_3_mod_384x| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __lshift_mod_384 - - ldp x16,x17,[x1] - ldp x19,x20,[x1,#16] - ldp x21,x22,[x1,#32] - - bl __add_mod_384_ab_are_loaded - - stp x10,x11,[x0] - ldp x10,x11,[x1,#48] - stp x12,x13,[x0,#16] - ldp x12,x13,[x1,#64] - stp x14,x15,[x0,#32] - ldp x14,x15,[x1,#80] - - bl __lshift_mod_384 - - ldp x16,x17,[x1,#48] - ldp x19,x20,[x1,#64] - ldp x21,x22,[x1,#80] - - bl __add_mod_384_ab_are_loaded - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |mul_by_8_mod_384x|[FUNC] - ALIGN 32 -|mul_by_8_mod_384x| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - ldp x14,x15,[x1,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - - stp x10,x11,[x0] - ldp x10,x11,[x1,#48] - stp x12,x13,[x0,#16] - ldp x12,x13,[x1,#64] - stp x14,x15,[x0,#32] - ldp x14,x15,[x1,#80] - - bl __lshift_mod_384 - bl __lshift_mod_384 - bl __lshift_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |cneg_mod_384|[FUNC] - ALIGN 32 -|cneg_mod_384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x10,x11,[x1] - ldp x4,x5,[x3] - ldp x12,x13,[x1,#16] - ldp x6,x7,[x3,#16] - - subs x16,x4,x10 - ldp x14,x15,[x1,#32] - ldp x8,x9,[x3,#32] - orr x3,x10,x11 - sbcs x17,x5,x11 - orr x3,x3,x12 - sbcs x19,x6,x12 - orr x3,x3,x13 - sbcs x20,x7,x13 - orr x3,x3,x14 - sbcs x21,x8,x14 - orr x3,x3,x15 - sbc x22,x9,x15 - - cmp x3,#0 - csetmne x3 - ands x2,x2,x3 - - cseleq x10,x10,x16 - cseleq x11,x11,x17 - cseleq x12,x12,x19 - cseleq x13,x13,x20 - stp x10,x11,[x0] - cseleq x14,x14,x21 - stp x12,x13,[x0,#16] - cseleq x15,x15,x22 - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |sub_mod_384|[FUNC] - ALIGN 32 -|sub_mod_384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - - bl __sub_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - stp x14,x15,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - DCDU 3573752767 - ret - ENDP - - - ALIGN 32 -|__sub_mod_384| PROC - ldp x10,x11,[x1] - ldp x16,x17,[x2] - ldp x12,x13,[x1,#16] - ldp x19,x20,[x2,#16] - ldp x14,x15,[x1,#32] - ldp x21,x22,[x2,#32] - - subs x10,x10,x16 - sbcs x11,x11,x17 - sbcs x12,x12,x19 - sbcs x13,x13,x20 - sbcs x14,x14,x21 - sbcs x15,x15,x22 - sbc x3,xzr,xzr - - and x16,x4,x3 - and x17,x5,x3 - adds x10,x10,x16 - and x19,x6,x3 - adcs x11,x11,x17 - and x20,x7,x3 - adcs x12,x12,x19 - and x21,x8,x3 - adcs x13,x13,x20 - and x22,x9,x3 - adcs x14,x14,x21 - adc x15,x15,x22 - - ret - ENDP - - - - EXPORT |sub_mod_384x|[FUNC] - ALIGN 32 -|sub_mod_384x| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x3] - ldp x6,x7,[x3,#16] - ldp x8,x9,[x3,#32] - - bl __sub_mod_384 - - stp x10,x11,[x0] - add x1,x1,#48 - stp x12,x13,[x0,#16] - add x2,x2,#48 - stp x14,x15,[x0,#32] - - bl __sub_mod_384 - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |mul_by_1_plus_i_mod_384x|[FUNC] - ALIGN 32 -|mul_by_1_plus_i_mod_384x| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-48]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x4,x5,[x2] - ldp x6,x7,[x2,#16] - ldp x8,x9,[x2,#32] - add x2,x1,#48 - - bl __sub_mod_384 // a->re - a->im - - ldp x16,x17,[x1] - ldp x19,x20,[x1,#16] - ldp x21,x22,[x1,#32] - stp x10,x11,[x0] - ldp x10,x11,[x1,#48] - stp x12,x13,[x0,#16] - ldp x12,x13,[x1,#64] - stp x14,x15,[x0,#32] - ldp x14,x15,[x1,#80] - - bl __add_mod_384_ab_are_loaded // a->re + a->im - ldr x30,[sp,#8] - - stp x10,x11,[x0,#48] - stp x12,x13,[x0,#64] - stp x14,x15,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |sgn0_pty_mod_384|[FUNC] - ALIGN 32 -|sgn0_pty_mod_384| PROC - ldp x10,x11,[x0] - ldp x12,x13,[x0,#16] - ldp x14,x15,[x0,#32] - - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - ldp x8,x9,[x1,#32] - - and x0,x10,#1 - adds x10,x10,x10 - adcs x11,x11,x11 - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adc x3,xzr,xzr - - subs x10,x10,x4 - sbcs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbc x3,x3,xzr - - mvn x3,x3 - and x3,x3,#2 - orr x0,x0,x3 - - ret - ENDP - - - - EXPORT |sgn0_pty_mod_384x|[FUNC] - ALIGN 32 -|sgn0_pty_mod_384x| PROC - ldp x10,x11,[x0] - ldp x12,x13,[x0,#16] - ldp x14,x15,[x0,#32] - - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - ldp x8,x9,[x1,#32] - - and x2,x10,#1 - orr x3,x10,x11 - adds x10,x10,x10 - orr x3,x3,x12 - adcs x11,x11,x11 - orr x3,x3,x13 - adcs x12,x12,x12 - orr x3,x3,x14 - adcs x13,x13,x13 - orr x3,x3,x15 - adcs x14,x14,x14 - adcs x15,x15,x15 - adc x16,xzr,xzr - - subs x10,x10,x4 - sbcs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbc x16,x16,xzr - - ldp x10,x11,[x0,#48] - ldp x12,x13,[x0,#64] - ldp x14,x15,[x0,#80] - - mvn x16,x16 - and x16,x16,#2 - orr x2,x2,x16 - - and x0,x10,#1 - orr x1,x10,x11 - adds x10,x10,x10 - orr x1,x1,x12 - adcs x11,x11,x11 - orr x1,x1,x13 - adcs x12,x12,x12 - orr x1,x1,x14 - adcs x13,x13,x13 - orr x1,x1,x15 - adcs x14,x14,x14 - adcs x15,x15,x15 - adc x16,xzr,xzr - - subs x10,x10,x4 - sbcs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbc x16,x16,xzr - - mvn x16,x16 - and x16,x16,#2 - orr x0,x0,x16 - - cmp x3,#0 - cseleq x3,x0,x2 - - cmp x1,#0 - cselne x1,x0,x2 - - and x3,x3,#1 - and x1,x1,#2 - orr x0,x1,x3 // pack sign and parity - - ret - ENDP - - - EXPORT |vec_select_32|[FUNC] - ALIGN 32 -|vec_select_32| PROC - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - bit v1.16b, v4.16b, v6.16b - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0] - ret - ENDP - - - EXPORT |vec_select_48|[FUNC] - ALIGN 32 -|vec_select_48| PROC - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - bit v1.16b, v4.16b, v6.16b - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0] - ret - ENDP - - - EXPORT |vec_select_96|[FUNC] - ALIGN 32 -|vec_select_96| PROC - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - bit v17.16b, v20.16b, v6.16b - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0] - 
ret - ENDP - - - EXPORT |vec_select_192|[FUNC] - ALIGN 32 -|vec_select_192| PROC - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - bit v17.16b, v20.16b, v6.16b - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - bit v17.16b, v20.16b, v6.16b - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0] - ret - ENDP - - - EXPORT |vec_select_144|[FUNC] - ALIGN 32 -|vec_select_144| PROC - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - bit v17.16b, v20.16b, v6.16b - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 - bit v0.16b, v3.16b, v6.16b - bit v1.16b, v4.16b, v6.16b - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0] - ret - ENDP - - - EXPORT |vec_select_288|[FUNC] - ALIGN 32 -|vec_select_288| PROC - dup v6.2d, x3 - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - cmeq v6.2d, v6.2d, #0 - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - bit v17.16b, v20.16b, v6.16b - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 - bit v17.16b, v20.16b, v6.16b - ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 - bit v0.16b, v3.16b, v6.16b - ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 - bit v1.16b, v4.16b, v6.16b - ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 - bit v2.16b, v5.16b, v6.16b - st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 - bit v16.16b, v19.16b, v6.16b - bit v17.16b, v20.16b, v6.16b - bit v18.16b, v21.16b, v6.16b - st1 {v16.2d, v17.2d, v18.2d}, [x0] - ret - ENDP - - - EXPORT |vec_prefetch|[FUNC] - ALIGN 32 -|vec_prefetch| PROC - add x1, x1, x0 - sub x1, x1, #1 - mov x2, #64 - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - cselhi x0,x1,x0 - cselhi x2,xzr,x2 - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - cselhi x0,x1,x0 - cselhi x2,xzr,x2 - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - cselhi x0,x1,x0 - cselhi x2,xzr,x2 - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - cselhi x0,x1,x0 - cselhi x2,xzr,x2 - prfm 
pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - cselhi x0,x1,x0 - cselhi x2,xzr,x2 - prfm pldl1keep, [x0] - add x0, x0, x2 - cmp x0, x1 - cselhi x0,x1,x0 - prfm pldl1keep, [x0] - ret - ENDP - - - EXPORT |vec_is_zero_16x|[FUNC] - ALIGN 32 -|vec_is_zero_16x| PROC - ld1 {v0.2d}, [x0], #16 - lsr x1, x1, #4 - sub x1, x1, #1 - cbz x1, |$Loop_is_zero_done| - -|$Loop_is_zero| - ld1 {v1.2d}, [x0], #16 - orr v0.16b, v0.16b, v1.16b - sub x1, x1, #1 - cbnz x1, |$Loop_is_zero| - -|$Loop_is_zero_done| - dup v1.2d, v0.d[1] - orr v0.16b, v0.16b, v1.16b - mov x1, v0.d[0] - mov x0, #1 - cmp x1, #0 - cseleq x0,x0,xzr - ret - ENDP - - - EXPORT |vec_is_equal_16x|[FUNC] - ALIGN 32 -|vec_is_equal_16x| PROC - ld1 {v0.2d}, [x0], #16 - ld1 {v1.2d}, [x1], #16 - lsr x2, x2, #4 - eor v0.16b, v0.16b, v1.16b - -|$Loop_is_equal| - sub x2, x2, #1 - cbz x2, |$Loop_is_equal_done| - ld1 {v1.2d}, [x0], #16 - ld1 {v2.2d}, [x1], #16 - eor v1.16b, v1.16b, v2.16b - orr v0.16b, v0.16b, v1.16b - b |$Loop_is_equal| - nop - -|$Loop_is_equal_done| - dup v1.2d, v0.d[1] - orr v0.16b, v0.16b, v1.16b - mov x1, v0.d[0] - mov x0, #1 - cmp x1, #0 - cseleq x0,x0,xzr - ret - ENDP - END diff --git a/crypto/blst_src/build/win64/add_mod_384-x86_64.asm b/crypto/blst_src/build/win64/add_mod_384-x86_64.asm deleted file mode 100644 index 560e02ee105..00000000000 --- a/crypto/blst_src/build/win64/add_mod_384-x86_64.asm +++ /dev/null @@ -1,2531 +0,0 @@ -OPTION DOTNAME -.text$ SEGMENT ALIGN(256) 'CODE' - -PUBLIC add_mod_384 - - -ALIGN 32 -add_mod_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_add_mod_384:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_add_mod_384:: - - - call __add_mod_384 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_add_mod_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_add_mod_384:: -add_mod_384 ENDP - - -ALIGN 32 -__add_mod_384 PROC PRIVATE - DB 243,15,30,250 - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - -__add_mod_384_a_is_loaded:: - add r8,QWORD PTR[rdx] - adc r9,QWORD PTR[8+rdx] - adc r10,QWORD PTR[16+rdx] - mov r14,r8 - adc r11,QWORD PTR[24+rdx] - mov r15,r9 - adc r12,QWORD PTR[32+rdx] - mov rax,r10 - adc r13,QWORD PTR[40+rdx] - mov rbx,r11 - sbb rdx,rdx - - sub r8,QWORD PTR[rcx] - sbb r9,QWORD PTR[8+rcx] - mov rbp,r12 - sbb r10,QWORD PTR[16+rcx] - sbb r11,QWORD PTR[24+rcx] - sbb r12,QWORD PTR[32+rcx] - mov rsi,r13 - sbb r13,QWORD PTR[40+rcx] - sbb rdx,0 - - cmovc r8,r14 - cmovc r9,r15 - cmovc r10,rax - mov QWORD PTR[rdi],r8 - cmovc r11,rbx - mov QWORD PTR[8+rdi],r9 - cmovc r12,rbp - mov QWORD PTR[16+rdi],r10 - cmovc r13,rsi - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - - DB 0F3h,0C3h ;repret -__add_mod_384 ENDP - -PUBLIC add_mod_384x - - -ALIGN 32 -add_mod_384x PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_add_mod_384x:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbx - - push r12 - - push r13 - - push r14 - 
- push r15 - - sub rsp,24 - -$L$SEH_body_add_mod_384x:: - - - mov QWORD PTR[rsp],rsi - mov QWORD PTR[8+rsp],rdx - lea rsi,QWORD PTR[48+rsi] - lea rdx,QWORD PTR[48+rdx] - lea rdi,QWORD PTR[48+rdi] - call __add_mod_384 - - mov rsi,QWORD PTR[rsp] - mov rdx,QWORD PTR[8+rsp] - lea rdi,QWORD PTR[((-48))+rdi] - call __add_mod_384 - - mov r15,QWORD PTR[((24+0))+rsp] - - mov r14,QWORD PTR[((24+8))+rsp] - - mov r13,QWORD PTR[((24+16))+rsp] - - mov r12,QWORD PTR[((24+24))+rsp] - - mov rbx,QWORD PTR[((24+32))+rsp] - - mov rbp,QWORD PTR[((24+40))+rsp] - - lea rsp,QWORD PTR[((24+48))+rsp] - -$L$SEH_epilogue_add_mod_384x:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_add_mod_384x:: -add_mod_384x ENDP - - -PUBLIC rshift_mod_384 - - -ALIGN 32 -rshift_mod_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_rshift_mod_384:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - push rdi - -$L$SEH_body_rshift_mod_384:: - - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - -$L$oop_rshift_mod_384:: - call __rshift_mod_384 - dec edx - jnz $L$oop_rshift_mod_384 - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_rshift_mod_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_rshift_mod_384:: -rshift_mod_384 ENDP - - -ALIGN 32 -__rshift_mod_384 PROC PRIVATE - DB 243,15,30,250 - - mov rsi,1 - mov r14,QWORD PTR[rcx] - and rsi,r8 - mov r15,QWORD PTR[8+rcx] - neg rsi - mov rax,QWORD PTR[16+rcx] - and r14,rsi - mov rbx,QWORD PTR[24+rcx] - and r15,rsi - mov rbp,QWORD PTR[32+rcx] - and rax,rsi - and rbx,rsi - and rbp,rsi - and rsi,QWORD PTR[40+rcx] - - add r14,r8 - adc r15,r9 - adc rax,r10 - adc rbx,r11 - adc rbp,r12 - adc rsi,r13 - sbb r13,r13 - - shr r14,1 - mov r8,r15 - shr r15,1 - mov r9,rax - shr rax,1 - mov r10,rbx - shr rbx,1 - mov r11,rbp - shr rbp,1 - mov r12,rsi - shr rsi,1 - shl r8,63 - shl r9,63 - or r8,r14 - shl r10,63 - or r9,r15 - shl r11,63 - or r10,rax - shl r12,63 - or r11,rbx - shl r13,63 - or r12,rbp - or r13,rsi - - DB 0F3h,0C3h ;repret -__rshift_mod_384 ENDP - -PUBLIC div_by_2_mod_384 - - -ALIGN 32 -div_by_2_mod_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_div_by_2_mod_384:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - push rdi - -$L$SEH_body_div_by_2_mod_384:: - - - mov r8,QWORD PTR[rsi] - mov rcx,rdx - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - call __rshift_mod_384 - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD 
PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_div_by_2_mod_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_div_by_2_mod_384:: -div_by_2_mod_384 ENDP - - -PUBLIC lshift_mod_384 - - -ALIGN 32 -lshift_mod_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_lshift_mod_384:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - push rdi - -$L$SEH_body_lshift_mod_384:: - - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - -$L$oop_lshift_mod_384:: - add r8,r8 - adc r9,r9 - adc r10,r10 - mov r14,r8 - adc r11,r11 - mov r15,r9 - adc r12,r12 - mov rax,r10 - adc r13,r13 - mov rbx,r11 - sbb rdi,rdi - - sub r8,QWORD PTR[rcx] - sbb r9,QWORD PTR[8+rcx] - mov rbp,r12 - sbb r10,QWORD PTR[16+rcx] - sbb r11,QWORD PTR[24+rcx] - sbb r12,QWORD PTR[32+rcx] - mov rsi,r13 - sbb r13,QWORD PTR[40+rcx] - sbb rdi,0 - - mov rdi,QWORD PTR[rsp] - cmovc r8,r14 - cmovc r9,r15 - cmovc r10,rax - cmovc r11,rbx - cmovc r12,rbp - cmovc r13,rsi - - dec edx - jnz $L$oop_lshift_mod_384 - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_lshift_mod_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_lshift_mod_384:: -lshift_mod_384 ENDP - - -ALIGN 32 -__lshift_mod_384 PROC PRIVATE - DB 243,15,30,250 - - add r8,r8 - adc r9,r9 - adc r10,r10 - mov r14,r8 - adc r11,r11 - mov r15,r9 - adc r12,r12 - mov rax,r10 - adc r13,r13 - mov rbx,r11 - sbb rdx,rdx - - sub r8,QWORD PTR[rcx] - sbb r9,QWORD PTR[8+rcx] - mov rbp,r12 - sbb r10,QWORD PTR[16+rcx] - sbb r11,QWORD PTR[24+rcx] - sbb r12,QWORD PTR[32+rcx] - mov rsi,r13 - sbb r13,QWORD PTR[40+rcx] - sbb rdx,0 - - cmovc r8,r14 - cmovc r9,r15 - cmovc r10,rax - cmovc r11,rbx - cmovc r12,rbp - cmovc r13,rsi - - DB 0F3h,0C3h ;repret -__lshift_mod_384 ENDP - - -PUBLIC mul_by_3_mod_384 - - -ALIGN 32 -mul_by_3_mod_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_mul_by_3_mod_384:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - push rsi - -$L$SEH_body_mul_by_3_mod_384:: - - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - mov rcx,rdx - - call __lshift_mod_384 - - mov rdx,QWORD PTR[rsp] - call __add_mod_384_a_is_loaded - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_mul_by_3_mod_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_mul_by_3_mod_384:: 
-mul_by_3_mod_384 ENDP - -PUBLIC mul_by_8_mod_384 - - -ALIGN 32 -mul_by_8_mod_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_mul_by_8_mod_384:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_mul_by_8_mod_384:: - - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - mov rcx,rdx - - call __lshift_mod_384 - call __lshift_mod_384 - call __lshift_mod_384 - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_mul_by_8_mod_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_mul_by_8_mod_384:: -mul_by_8_mod_384 ENDP - - -PUBLIC mul_by_3_mod_384x - - -ALIGN 32 -mul_by_3_mod_384x PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_mul_by_3_mod_384x:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - push rsi - -$L$SEH_body_mul_by_3_mod_384x:: - - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - mov rcx,rdx - - call __lshift_mod_384 - - mov rdx,QWORD PTR[rsp] - call __add_mod_384_a_is_loaded - - mov rsi,QWORD PTR[rsp] - lea rdi,QWORD PTR[48+rdi] - - mov r8,QWORD PTR[48+rsi] - mov r9,QWORD PTR[56+rsi] - mov r10,QWORD PTR[64+rsi] - mov r11,QWORD PTR[72+rsi] - mov r12,QWORD PTR[80+rsi] - mov r13,QWORD PTR[88+rsi] - - call __lshift_mod_384 - - mov rdx,8*6 - add rdx,QWORD PTR[rsp] - call __add_mod_384_a_is_loaded - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_mul_by_3_mod_384x:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_mul_by_3_mod_384x:: -mul_by_3_mod_384x ENDP - -PUBLIC mul_by_8_mod_384x - - -ALIGN 32 -mul_by_8_mod_384x PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_mul_by_8_mod_384x:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - push rsi - -$L$SEH_body_mul_by_8_mod_384x:: - - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - mov rcx,rdx - - call __lshift_mod_384 - call __lshift_mod_384 - call __lshift_mod_384 - - mov rsi,QWORD PTR[rsp] - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - - mov r8,QWORD PTR[((48+0))+rsi] - mov r9,QWORD PTR[((48+8))+rsi] - mov r10,QWORD PTR[((48+16))+rsi] - mov r11,QWORD PTR[((48+24))+rsi] - 
-	mov	r12,QWORD PTR[((48+32))+rsi]
-	mov	r13,QWORD PTR[((48+40))+rsi]
-
-	call	__lshift_mod_384
-	call	__lshift_mod_384
-	call	__lshift_mod_384
-
-; [stores r8..r13 to 48..88+rdi, restores the six callee-saved
-;  registers and runs the WIN64 SEH epilogue]
-$L$SEH_end_mul_by_8_mod_384x::
-mul_by_8_mod_384x	ENDP
-
-PUBLIC	cneg_mod_384
-ALIGN	32
-cneg_mod_384	PROC PUBLIC
-; [WIN64 SEH prologue; ORs the six limbs of |a| into a zero/non-zero
-;  mask, ANDs the modulus with it, subtracts |a| through a full sbb
-;  chain, then cmovz-selects |a| back when the flag argument is zero,
-;  so the negation is branchless and "-0" stays 0; SEH epilogue]
-cneg_mod_384	ENDP
-
-PUBLIC	sub_mod_384
-ALIGN	32
-sub_mod_384	PROC PUBLIC
-; [WIN64 SEH prologue/epilogue around a single call to __sub_mod_384]
-sub_mod_384	ENDP
-
-ALIGN	32
-__sub_mod_384	PROC PRIVATE
-; [subtracts |b| from |a| with an sbb chain while loading the modulus,
-;  turns the final borrow into an all-ones mask, ANDs the modulus with
-;  it and adds it back: (a-b) mod p without branches]
-__sub_mod_384	ENDP
-
-PUBLIC	sub_mod_384x
-ALIGN	32
-sub_mod_384x	PROC PUBLIC
-; [SEH prologue; calls __sub_mod_384 once per 384-bit half of the Fp2
-;  element (offset 48); SEH epilogue]
-sub_mod_384x	ENDP
-
-PUBLIC	mul_by_1_plus_i_mod_384x
-ALIGN	32
-mul_by_1_plus_i_mod_384x	PROC PUBLIC
-; [SEH prologue; computes re+im with an adc chain and re-im with an
-;  sbb chain, reducing each conditionally against the modulus, since
-;  (re + im*i)*(1+i) = (re-im) + (re+im)*i; SEH epilogue]
-mul_by_1_plus_i_mod_384x	ENDP
-
-PUBLIC	sgn0_pty_mod_384
-ALIGN	32
-sgn0_pty_mod_384	PROC PUBLIC
-; [keeps bit 0 of the value as its parity, doubles the value and
-;  trial-subtracts the modulus to decide whether it exceeds (p-1)/2,
-;  and returns the two answers as bits 0 and 1 of rax]
-sgn0_pty_mod_384	ENDP
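The cneg_mod_384 routine summarized above is the textbook branchless conditional negation. A minimal Go sketch of the same idea, assuming six little-endian 64-bit limbs; the fe type and the cnegMod384 name are illustrative and not the repository's API:

package main

import (
	"fmt"
	"math/bits"
)

// fe is a 384-bit field element as six little-endian 64-bit limbs, < p.
type fe [6]uint64

// cnegMod384 returns p-a when flag==1 and a!=0, and a otherwise,
// using masks and a full borrow chain instead of branches.
func cnegMod384(a, p fe, flag uint64) fe {
	var or uint64
	for _, w := range a {
		or |= w
	}
	// nzMask is all-ones iff a != 0, so that "-0" stays 0 rather than p;
	// this mirrors the "or rdx,r9 ..." chain in the assembly.
	nzMask := -((or | -or) >> 63)

	var t fe
	var borrow uint64
	for i := range a {
		// (p & nzMask) - a: p-a when a != 0, 0-0 when a == 0
		t[i], borrow = bits.Sub64(p[i]&nzMask, a[i], borrow)
	}
	_ = borrow

	m := -(flag & 1) // select t when flag==1, a when flag==0
	var out fe
	for i := range a {
		out[i] = (t[i] & m) | (a[i] &^ m)
	}
	return out
}

func main() {
	p := fe{5} // toy modulus; the real code uses the BLS12-381 prime
	a := fe{2}
	fmt.Println(cnegMod384(a, p, 1)) // [3 0 0 0 0 0], i.e. p-a
	fmt.Println(cnegMod384(a, p, 0)) // [2 0 0 0 0 0], i.e. a
}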
-PUBLIC	sgn0_pty_mod_384x
-ALIGN	32
-sgn0_pty_mod_384x	PROC PUBLIC
-; [the same parity/sign extraction applied to both halves of an Fp2
-;  element, with cmov fallbacks between the halves when one of them
-;  is zero]
-sgn0_pty_mod_384x	ENDP
-
-PUBLIC	vec_select_32
-PUBLIC	vec_select_48
-PUBLIC	vec_select_96
-PUBLIC	vec_select_192
-PUBLIC	vec_select_144
-PUBLIC	vec_select_288
-; [six unrolled constant-time selectors for 32/48/96/192/144/288-byte
-;  vectors: pcmpeqd expands the selector into an all-ones or all-zeros
-;  XMM mask, then pand/por merge the two inputs 16 bytes at a time,
-;  reading and writing every byte regardless of the selector]
-
-PUBLIC	vec_prefetch
-; [prefetchnta over up to seven cache lines, clamped to the end of the
-;  buffer]
-
-PUBLIC	vec_is_zero_16x
-; [ORs the buffer together 16 bytes at a time, then reduces the
-;  accumulator to a 0/1 answer without data-dependent branches]
-
-PUBLIC	vec_is_equal_16x
-; [the same loop over the XOR of two buffers: constant-time equality]
-
-.text$	ENDS
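All three vec_* families above reduce to mask-and-merge over every byte, which is what makes them constant-time. A hedged Go sketch of the three patterns; the function names are illustrative, not the repository's API:

package main

import "fmt"

// ctSelect writes a into out when choice==1 and b when choice==0,
// reading and writing every byte either way (the vec_select pattern).
func ctSelect(out, a, b []byte, choice uint64) {
	m := byte(-(choice & 1))
	for i := range out {
		out[i] = (a[i] & m) | (b[i] &^ m)
	}
}

// ctIsZero OR-accumulates the whole buffer before one final test,
// as vec_is_zero_16x does 16 bytes at a time: 1 iff all bytes are 0.
func ctIsZero(v []byte) uint64 {
	var acc byte
	for _, w := range v {
		acc |= w
	}
	x := uint64(acc)
	return 1 - ((x | -x) >> 63)
}

// ctIsEqual runs the same accumulation over a XOR b, the
// vec_is_equal_16x pattern: 1 iff the buffers are identical.
func ctIsEqual(a, b []byte) uint64 {
	var acc byte
	for i := range a {
		acc |= a[i] ^ b[i]
	}
	x := uint64(acc)
	return 1 - ((x | -x) >> 63)
}

func main() {
	out := make([]byte, 3)
	ctSelect(out, []byte{1, 2, 3}, []byte{9, 9, 9}, 1)
	fmt.Println(out, ctIsZero(out), ctIsEqual(out, []byte{1, 2, 3})) // [1 2 3] 0 1
}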
-.pdata	SEGMENT READONLY ALIGN(4)
-ALIGN	4
-; [the SEH range table: one prologue/body/epilogue triple of imagerel
-;  entries for every public routine in this file, from add_mod_384
-;  through sgn0_pty_mod_384x, each pointing at its unwind descriptor]
-.pdata	ENDS
-.xdata	SEGMENT READONLY ALIGN(8)
-ALIGN	8
-; [the matching $L$SEH_info_*_prologue/_body/_epilogue descriptors:
-;  byte-encoded UNWIND_INFO recording where rdi/rsi and the six
-;  callee-saved registers were spilled in each frame]
-.xdata	ENDS
-END
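The next deleted file carries the double-width (768-bit) add/sub cores. The carry-chain-plus-conditional-reduction shape they share with add_mod_384 looks like this in Go, sketched on toy parameters; addMod is an illustrative name, not the repository's API:

package main

import (
	"fmt"
	"math/bits"
)

// addMod adds two reduced little-endian multi-limb values mod p:
// a full carry chain, a trial subtraction of p, then a branchless
// select on the carry/borrow pair.
func addMod(a, b, p []uint64) []uint64 {
	n := len(a)
	sum := make([]uint64, n)
	red := make([]uint64, n)
	var carry, borrow uint64
	for i := 0; i < n; i++ {
		sum[i], carry = bits.Add64(a[i], b[i], carry)
	}
	for i := 0; i < n; i++ {
		red[i], borrow = bits.Sub64(sum[i], p[i], borrow)
	}
	// Keep the raw sum only if the trial subtraction borrowed and the
	// add itself did not overflow; otherwise the reduced value is right.
	keep := -(borrow &^ carry)
	out := make([]uint64, n)
	for i := 0; i < n; i++ {
		out[i] = (sum[i] & keep) | (red[i] &^ keep)
	}
	return out
}

func main() {
	p := []uint64{7} // toy single-limb modulus
	fmt.Println(addMod([]uint64{5}, []uint64{4}, p)) // [2], i.e. 9 mod 7
}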
diff --git a/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm b/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm
deleted file mode 100644
index 59b51a910ce..00000000000
--- a/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm
+++ /dev/null
@@ -1,338 +0,0 @@
-OPTION	DOTNAME
-.text$	SEGMENT ALIGN(256) 'CODE'
-; [private __add_mod_384x384 and __sub_mod_384x384 cores: twelve-limb
-;  adc/sbb chains whose upper six limbs are reduced against the
-;  384-bit modulus (trial subtraction plus cmovc for the add, masked
-;  add-back of the modulus for the sub); public SEH-wrapped
-;  add_mod_384x384/sub_mod_384x384 entry points; and the
-;  .pdata/.xdata unwind tables for both]
-END
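__sub_mod_384x384, summarized just above, uses the complementary trick: subtract first, then add p back under a mask built from the final borrow. A Go sketch of that shape, with subMod as an illustrative name:

package main

import (
	"fmt"
	"math/bits"
)

// subMod computes (a-b) mod p over little-endian 64-bit limbs by
// subtracting with a borrow chain, then adding back p&mask where
// mask is all-ones exactly when the subtraction underflowed.
func subMod(a, b, p []uint64) []uint64 {
	n := len(a)
	out := make([]uint64, n)
	var borrow uint64
	for i := 0; i < n; i++ {
		out[i], borrow = bits.Sub64(a[i], b[i], borrow)
	}
	mask := -borrow // 0 if a >= b, all-ones if the chain wrapped
	var carry uint64
	for i := 0; i < n; i++ {
		out[i], carry = bits.Add64(out[i], p[i]&mask, carry)
	}
	_ = carry // the wrap-around cancels the earlier borrow
	return out
}

func main() {
	p := []uint64{7} // toy single-limb modulus
	fmt.Println(subMod([]uint64{2}, []uint64{5}, p)) // [4], i.e. -3 mod 7
}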
diff --git a/crypto/blst_src/build/win64/blst.def b/crypto/blst_src/build/win64/blst.def
deleted file mode 100644
index dda95336a93..00000000000
--- a/crypto/blst_src/build/win64/blst.def
+++ /dev/null
@@ -1,221 +0,0 @@
-LIBRARY blst
-
-EXPORTS
-; [221 exported symbols: blst_scalar_*/blst_fr_*/blst_fp_*/blst_fp2_*/
-;  blst_fp12_* field arithmetic, blst_p1*/blst_p2* group operations
-;  and the blst_p1s_*/blst_p2s_* multi-scalar (wbits/pippenger)
-;  variants, hash-/encode-to-curve, serialization and (de)compression,
-;  pairing and aggregate-signature entry points, keygen and EIP-2333
-;  derivation, blst_expand_message_xmd, blst_sha256, and the
-;  BLS12_381_G1/NEG_G1/G2/NEG_G2 generator constants]
diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm b/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm
deleted file mode 100644
index a4467904612..00000000000
--- a/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm
+++ /dev/null
@@ -1,786 +0,0 @@
-	AREA	|.text|,CODE,ALIGN=8,ARM64
-
-	EXPORT	|ct_inverse_mod_256|[FUNC]
-	ALIGN	32
-|ct_inverse_mod_256|	PROC
-	// [saves x19-x26, carves a 512-byte-aligned scratch area out of a
-	//  1040-byte frame, copies the input to |a| and the modulus to
-	//  |b|, then runs the first unrolled iterations: each approximates
-	//  |a| and |b|, shrinks them via __smul_256_n_shift_by_31, and
-	//  updates the Bezout accumulators |u| and |v| via __smul_256x63]
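Each of the iterations summarized above and below starts by compressing the multi-limb |a| and |b| into single approximation words. A hedged Go sketch of the idea behind __ab_approximation_31, with branches written out for readability where the assembly uses csel chains; approx31 is an illustrative name and not the exact blst routine:

package main

import (
	"fmt"
	"math/bits"
)

// approx31 folds multi-limb a and b (not both zero) into single words
// that agree with the originals in their top bits and their low 31
// bits. The divstep loop then runs on these words alone, and the
// resulting f/g factors are applied to the full numbers afterwards.
func approx31(a, b []uint64) (uint64, uint64) {
	// find the highest limb position where a or b is non-zero
	top := 0
	for i := len(a) - 1; i > 0; i-- {
		if a[i]|b[i] != 0 {
			top = i
			break
		}
	}
	s := uint(bits.LeadingZeros64(a[top] | b[top]))
	aTop, bTop := a[top]<<s, b[top]<<s
	if top > 0 && s > 0 {
		aTop |= a[top-1] >> (64 - s) // pull in bits from the limb below
		bTop |= b[top-1] >> (64 - s)
	}
	const low = 1<<31 - 1 // splice in the exact low 31 bits
	return aTop&^low | a[0]&low, bTop&^low | b[0]&low
}

func main() {
	a := []uint64{0x12345, 0, 0, 0xFFFF000000000000}
	b := []uint64{0x54321, 0, 0, 0x0000FFFF00000000}
	fa, fb := approx31(a, b)
	fmt.Printf("%#x %#x\n", fa, fb) // 0xffff000000012345 0xffff00054321
}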
-	// [further unrolled iterations of the same flip-flop pattern, the
-	//  later ones also calling __smul_512x63_tail once |u| and |v|
-	//  outgrow 256 bits]
-	////////////////////////////////////////// two[!] last iterations
-	// [|a| and |b| now fit in single limbs, so __inner_loop_62_256
-	//  runs on them exactly; the top limb of the result selects a
-	//  conditional add of mod<<256 and a conditional negation of the
-	//  modulus before the adjusted |v| is stored; the 1040-byte frame
-	//  is released, x19-x26 restored, ret]
-	ENDP
-
-	ALIGN	32
-|__smul_256x63|	PROC
-	// [multiplies the |u| and |v| columns by the signed factors f_
-	//  and g_: conditionally negates operand and factor by the
-	//  factor's sign, then mul/umulh ladders accumulate both products
-	//  into the destination]
-	ENDP
-	ALIGN	32
-|__smul_512x63_tail|	PROC
-	// [finishes the |v| product across its upper limbs with sign
-	//  extension, tying up the |u|*|f1| carry chain]
-	ENDP
-
-	ALIGN	32
-|__smul_256_n_shift_by_31|	PROC
-	// [computes (f0*a + g0*b) >> 31 over four limbs: conditional
-	//  negation by each factor's sign, mul/umulh accumulation, extr
-	//  shifts, and a final sign fix-up of f0 and g0]
-	ENDP
-
-	ALIGN	16
-|__ab_approximation_31_256|	PROC
-	// [csel chains locate the topmost non-zero limb pair of |a| and
-	//  |b|, clz/lslv align those bits to the left, and bfxil splices
-	//  in the low 31 bits before falling through to
-	//  __inner_loop_31_256]
-	ENDP
-
-	ALIGN	16
-|__inner_loop_31_256|	PROC
-	// [31 branchless divstep iterations on the packed approximation
-	//  words, with f0/g0 and f1/g1 kept 0x7FFFFFFF-biased in two
-	//  registers and unbiased on exit]
-	ENDP
-
-	ALIGN	16
-|__inner_loop_62_256|	PROC
-	// [the same loop at full 62-bit precision for the exact final
-	//  pass: while iterations remain, if |a| is odd subtract |b|
-	//  (swapping a<->b, f0<->f1 and g0<->g1 on borrow), then halve
-	//  |a| and double f1 and g1]
-	ENDP
-	END
- cbnz x2, |$Loop_62_256| - - ret - ENDP - END diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm b/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm deleted file mode 100644 index 5cd09a1d8f2..00000000000 --- a/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm +++ /dev/null @@ -1,1220 +0,0 @@ -OPTION DOTNAME -.text$ SEGMENT ALIGN(256) 'CODE' - -PUBLIC ct_inverse_mod_256 - - -ALIGN 32 -ct_inverse_mod_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_ct_inverse_mod_256:: - - - push rbp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,1072 - -$L$SEH_body_ct_inverse_mod_256:: - - - lea rax,QWORD PTR[((48+511))+rsp] - and rax,-512 - mov QWORD PTR[32+rsp],rdi - mov QWORD PTR[40+rsp],rcx - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - - mov r12,QWORD PTR[rdx] - mov r13,QWORD PTR[8+rdx] - mov r14,QWORD PTR[16+rdx] - mov r15,QWORD PTR[24+rdx] - - mov QWORD PTR[rax],r8 - mov QWORD PTR[8+rax],r9 - mov QWORD PTR[16+rax],r10 - mov QWORD PTR[24+rax],r11 - - mov QWORD PTR[32+rax],r12 - mov QWORD PTR[40+rax],r13 - mov QWORD PTR[48+rax],r14 - mov QWORD PTR[56+rax],r15 - mov rsi,rax - - - mov edx,31 - call __ab_approximation_31_256 - - - mov QWORD PTR[16+rsp],r12 - mov QWORD PTR[24+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_256_n_shift_by_31 - - - mov QWORD PTR[64+rdi],rdx - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256_n_shift_by_31 - - - mov QWORD PTR[72+rdi],rdx - - - xor rsi,256 - mov edx,31 - call __ab_approximation_31_256 - - - mov QWORD PTR[16+rsp],r12 - mov QWORD PTR[24+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_256_n_shift_by_31 - mov QWORD PTR[rsp],rdx - mov QWORD PTR[8+rsp],rcx - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256_n_shift_by_31 - - - - mov r8,QWORD PTR[64+rsi] - mov r12,QWORD PTR[104+rsi] - mov r9,r8 - imul r8,QWORD PTR[rsp] - mov r13,r12 - imul r12,QWORD PTR[8+rsp] - add r8,r12 - mov QWORD PTR[32+rdi],r8 - sar r8,63 - mov QWORD PTR[40+rdi],r8 - mov QWORD PTR[48+rdi],r8 - mov QWORD PTR[56+rdi],r8 - mov QWORD PTR[64+rdi],r8 - lea rsi,QWORD PTR[64+rsi] - - imul r9,rdx - imul r13,rcx - add r9,r13 - mov QWORD PTR[72+rdi],r9 - sar r9,63 - mov QWORD PTR[80+rdi],r9 - mov QWORD PTR[88+rdi],r9 - mov QWORD PTR[96+rdi],r9 - mov QWORD PTR[104+rdi],r9 - xor rsi,256+8*8 - mov edx,31 - call __ab_approximation_31_256 - - - mov QWORD PTR[16+rsp],r12 - mov QWORD PTR[24+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_256_n_shift_by_31 - mov QWORD PTR[rsp],rdx - mov QWORD PTR[8+rsp],rcx - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256_n_shift_by_31 - mov QWORD PTR[16+rsp],rdx - mov QWORD PTR[24+rsp],rcx - - mov rdx,QWORD PTR[rsp] - mov rcx,QWORD PTR[8+rsp] - lea rsi,QWORD PTR[64+rsi] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256x63 - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[40+rdi] - call __smulq_256x63 - xor rsi,256+8*8 - mov edx,31 - call __ab_approximation_31_256 - - - mov QWORD PTR[16+rsp],r12 - mov QWORD PTR[24+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_256_n_shift_by_31 - mov QWORD PTR[rsp],rdx - mov QWORD PTR[8+rsp],rcx - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD 
PTR[32+rdi] - call __smulq_256_n_shift_by_31 - mov QWORD PTR[16+rsp],rdx - mov QWORD PTR[24+rsp],rcx - - mov rdx,QWORD PTR[rsp] - mov rcx,QWORD PTR[8+rsp] - lea rsi,QWORD PTR[64+rsi] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256x63 - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[40+rdi] - call __smulq_256x63 - xor rsi,256+8*8 - mov edx,31 - call __ab_approximation_31_256 - - - mov QWORD PTR[16+rsp],r12 - mov QWORD PTR[24+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_256_n_shift_by_31 - mov QWORD PTR[rsp],rdx - mov QWORD PTR[8+rsp],rcx - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256_n_shift_by_31 - mov QWORD PTR[16+rsp],rdx - mov QWORD PTR[24+rsp],rcx - - mov rdx,QWORD PTR[rsp] - mov rcx,QWORD PTR[8+rsp] - lea rsi,QWORD PTR[64+rsi] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256x63 - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[40+rdi] - call __smulq_256x63 - xor rsi,256+8*8 - mov edx,31 - call __ab_approximation_31_256 - - - mov QWORD PTR[16+rsp],r12 - mov QWORD PTR[24+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_256_n_shift_by_31 - mov QWORD PTR[rsp],rdx - mov QWORD PTR[8+rsp],rcx - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256_n_shift_by_31 - mov QWORD PTR[16+rsp],rdx - mov QWORD PTR[24+rsp],rcx - - mov rdx,QWORD PTR[rsp] - mov rcx,QWORD PTR[8+rsp] - lea rsi,QWORD PTR[64+rsi] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256x63 - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[40+rdi] - call __smulq_256x63 - xor rsi,256+8*8 - mov edx,31 - call __ab_approximation_31_256 - - - mov QWORD PTR[16+rsp],r12 - mov QWORD PTR[24+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_256_n_shift_by_31 - mov QWORD PTR[rsp],rdx - mov QWORD PTR[8+rsp],rcx - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256_n_shift_by_31 - mov QWORD PTR[16+rsp],rdx - mov QWORD PTR[24+rsp],rcx - - mov rdx,QWORD PTR[rsp] - mov rcx,QWORD PTR[8+rsp] - lea rsi,QWORD PTR[64+rsi] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256x63 - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[40+rdi] - call __smulq_256x63 - xor rsi,256+8*8 - mov edx,31 - call __ab_approximation_31_256 - - - mov QWORD PTR[16+rsp],r12 - mov QWORD PTR[24+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_256_n_shift_by_31 - mov QWORD PTR[rsp],rdx - mov QWORD PTR[8+rsp],rcx - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256_n_shift_by_31 - mov QWORD PTR[16+rsp],rdx - mov QWORD PTR[24+rsp],rcx - - mov rdx,QWORD PTR[rsp] - mov rcx,QWORD PTR[8+rsp] - lea rsi,QWORD PTR[64+rsi] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256x63 - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[40+rdi] - call __smulq_256x63 - xor rsi,256+8*8 - mov edx,31 - call __ab_approximation_31_256 - - - mov QWORD PTR[16+rsp],r12 - mov QWORD PTR[24+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_256_n_shift_by_31 - mov QWORD PTR[rsp],rdx - mov QWORD PTR[8+rsp],rcx - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256_n_shift_by_31 - mov QWORD PTR[16+rsp],rdx - mov QWORD PTR[24+rsp],rcx - - mov rdx,QWORD PTR[rsp] - mov rcx,QWORD PTR[8+rsp] - lea rsi,QWORD PTR[64+rsi] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256x63 - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD 
PTR[24+rsp] - lea rdi,QWORD PTR[40+rdi] - call __smulq_256x63 - sar rbp,63 - mov QWORD PTR[40+rdi],rbp - mov QWORD PTR[48+rdi],rbp - mov QWORD PTR[56+rdi],rbp - xor rsi,256+8*8 - mov edx,31 - call __ab_approximation_31_256 - - - mov QWORD PTR[16+rsp],r12 - mov QWORD PTR[24+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_256_n_shift_by_31 - mov QWORD PTR[rsp],rdx - mov QWORD PTR[8+rsp],rcx - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256_n_shift_by_31 - mov QWORD PTR[16+rsp],rdx - mov QWORD PTR[24+rsp],rcx - - mov rdx,QWORD PTR[rsp] - mov rcx,QWORD PTR[8+rsp] - lea rsi,QWORD PTR[64+rsi] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256x63 - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[40+rdi] - call __smulq_512x63 - xor rsi,256+8*8 - mov edx,31 - call __ab_approximation_31_256 - - - mov QWORD PTR[16+rsp],r12 - mov QWORD PTR[24+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_256_n_shift_by_31 - mov QWORD PTR[rsp],rdx - mov QWORD PTR[8+rsp],rcx - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256_n_shift_by_31 - mov QWORD PTR[16+rsp],rdx - mov QWORD PTR[24+rsp],rcx - - mov rdx,QWORD PTR[rsp] - mov rcx,QWORD PTR[8+rsp] - lea rsi,QWORD PTR[64+rsi] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256x63 - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[40+rdi] - call __smulq_512x63 - xor rsi,256+8*8 - mov edx,31 - call __ab_approximation_31_256 - - - mov QWORD PTR[16+rsp],r12 - mov QWORD PTR[24+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_256_n_shift_by_31 - mov QWORD PTR[rsp],rdx - mov QWORD PTR[8+rsp],rcx - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256_n_shift_by_31 - mov QWORD PTR[16+rsp],rdx - mov QWORD PTR[24+rsp],rcx - - mov rdx,QWORD PTR[rsp] - mov rcx,QWORD PTR[8+rsp] - lea rsi,QWORD PTR[64+rsi] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256x63 - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[40+rdi] - call __smulq_512x63 - xor rsi,256+8*8 - mov edx,31 - call __ab_approximation_31_256 - - - mov QWORD PTR[16+rsp],r12 - mov QWORD PTR[24+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_256_n_shift_by_31 - mov QWORD PTR[rsp],rdx - mov QWORD PTR[8+rsp],rcx - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256_n_shift_by_31 - mov QWORD PTR[16+rsp],rdx - mov QWORD PTR[24+rsp],rcx - - mov rdx,QWORD PTR[rsp] - mov rcx,QWORD PTR[8+rsp] - lea rsi,QWORD PTR[64+rsi] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256x63 - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[40+rdi] - call __smulq_512x63 - xor rsi,256+8*8 - mov edx,31 - call __ab_approximation_31_256 - - - mov QWORD PTR[16+rsp],r12 - mov QWORD PTR[24+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_256_n_shift_by_31 - mov QWORD PTR[rsp],rdx - mov QWORD PTR[8+rsp],rcx - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256_n_shift_by_31 - mov QWORD PTR[16+rsp],rdx - mov QWORD PTR[24+rsp],rcx - - mov rdx,QWORD PTR[rsp] - mov rcx,QWORD PTR[8+rsp] - lea rsi,QWORD PTR[64+rsi] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256x63 - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[40+rdi] - call __smulq_512x63 - xor rsi,256+8*8 - mov edx,31 - call __ab_approximation_31_256 - - - mov QWORD PTR[16+rsp],r12 - mov QWORD PTR[24+rsp],r13 - 
- mov rdi,256 - xor rdi,rsi - call __smulq_256_n_shift_by_31 - mov QWORD PTR[rsp],rdx - mov QWORD PTR[8+rsp],rcx - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256_n_shift_by_31 - mov QWORD PTR[16+rsp],rdx - mov QWORD PTR[24+rsp],rcx - - mov rdx,QWORD PTR[rsp] - mov rcx,QWORD PTR[8+rsp] - lea rsi,QWORD PTR[64+rsi] - lea rdi,QWORD PTR[32+rdi] - call __smulq_256x63 - - mov rdx,QWORD PTR[16+rsp] - mov rcx,QWORD PTR[24+rsp] - lea rdi,QWORD PTR[40+rdi] - call __smulq_512x63 - - xor rsi,256+8*8 - mov edx,47 - - mov r8,QWORD PTR[rsi] - - mov r10,QWORD PTR[32+rsi] - - call __inner_loop_62_256 - - - - - - - - lea rsi,QWORD PTR[64+rsi] - - - - - - mov rdx,r12 - mov rcx,r13 - mov rdi,QWORD PTR[32+rsp] - call __smulq_512x63 - adc rdx,rbp - - mov rsi,QWORD PTR[40+rsp] - mov rax,rdx - sar rdx,63 - - mov r8,rdx - mov r9,rdx - and r8,QWORD PTR[rsi] - mov r10,rdx - and r9,QWORD PTR[8+rsi] - and r10,QWORD PTR[16+rsi] - and rdx,QWORD PTR[24+rsi] - - add r12,r8 - adc r13,r9 - adc r14,r10 - adc r15,rdx - adc rax,0 - - mov rdx,rax - neg rax - or rdx,rax - sar rax,63 - - mov r8,rdx - mov r9,rdx - and r8,QWORD PTR[rsi] - mov r10,rdx - and r9,QWORD PTR[8+rsi] - and r10,QWORD PTR[16+rsi] - and rdx,QWORD PTR[24+rsi] - - xor r8,rax - xor rcx,rcx - xor r9,rax - sub rcx,rax - xor r10,rax - xor rdx,rax - add r8,rcx - adc r9,0 - adc r10,0 - adc rdx,0 - - add r12,r8 - adc r13,r9 - adc r14,r10 - adc r15,rdx - - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - mov QWORD PTR[48+rdi],r14 - mov QWORD PTR[56+rdi],r15 - - lea r8,QWORD PTR[1072+rsp] - mov r15,QWORD PTR[r8] - - mov r14,QWORD PTR[8+r8] - - mov r13,QWORD PTR[16+r8] - - mov r12,QWORD PTR[24+r8] - - mov rbx,QWORD PTR[32+r8] - - mov rbp,QWORD PTR[40+r8] - - lea rsp,QWORD PTR[48+r8] - -$L$SEH_epilogue_ct_inverse_mod_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_ct_inverse_mod_256:: -ct_inverse_mod_256 ENDP - -ALIGN 32 -__smulq_512x63 PROC PRIVATE - DB 243,15,30,250 - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov rbp,QWORD PTR[32+rsi] - - mov rbx,rdx - sar rdx,63 - xor rax,rax - sub rax,rdx - - xor rbx,rdx - add rbx,rax - - xor r8,rdx - xor r9,rdx - xor r10,rdx - xor r11,rdx - xor rbp,rdx - add rax,r8 - adc r9,0 - adc r10,0 - adc r11,0 - adc rbp,0 - - mul rbx - mov QWORD PTR[rdi],rax - mov rax,r9 - mov r9,rdx - mul rbx - add r9,rax - mov rax,r10 - adc rdx,0 - mov QWORD PTR[8+rdi],r9 - mov r10,rdx - mul rbx - add r10,rax - mov rax,r11 - adc rdx,0 - mov QWORD PTR[16+rdi],r10 - mov r11,rdx - and rbp,rbx - neg rbp - mul rbx - add r11,rax - adc rbp,rdx - mov QWORD PTR[24+rdi],r11 - - mov r8,QWORD PTR[40+rsi] - mov r9,QWORD PTR[48+rsi] - mov r10,QWORD PTR[56+rsi] - mov r11,QWORD PTR[64+rsi] - mov r12,QWORD PTR[72+rsi] - mov r13,QWORD PTR[80+rsi] - mov r14,QWORD PTR[88+rsi] - mov r15,QWORD PTR[96+rsi] - - mov rdx,rcx - sar rdx,63 - xor rax,rax - sub rax,rdx - - xor rcx,rdx - add rcx,rax - - xor r8,rdx - xor r9,rdx - xor r10,rdx - xor r11,rdx - xor r12,rdx - xor r13,rdx - xor r14,rdx - xor r15,rdx - add rax,r8 - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc r13,0 - adc r14,0 - adc r15,0 - - mul rcx - mov r8,rax - mov rax,r9 - mov r9,rdx - mul rcx - add r9,rax - mov rax,r10 - adc rdx,0 - mov r10,rdx - mul rcx - add r10,rax - mov rax,r11 - adc rdx,0 - mov r11,rdx - mul rcx - add r11,rax - mov rax,r12 - adc rdx,0 - mov r12,rdx - mul rcx - add r12,rax - mov rax,r13 - adc rdx,0 - mov 
r13,rdx - mul rcx - add r13,rax - mov rax,r14 - adc rdx,0 - mov r14,rdx - mul rcx - add r14,rax - mov rax,r15 - adc rdx,0 - mov r15,rdx - imul rcx - add r15,rax - adc rdx,0 - - mov rbx,rbp - sar rbp,63 - - add r8,QWORD PTR[rdi] - adc r9,QWORD PTR[8+rdi] - adc r10,QWORD PTR[16+rdi] - adc r11,QWORD PTR[24+rdi] - adc r12,rbx - adc r13,rbp - adc r14,rbp - adc r15,rbp - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - mov QWORD PTR[48+rdi],r14 - mov QWORD PTR[56+rdi],r15 - - DB 0F3h,0C3h ;repret -__smulq_512x63 ENDP - - -ALIGN 32 -__smulq_256x63 PROC PRIVATE - DB 243,15,30,250 - - mov r8,QWORD PTR[((0+0))+rsi] - mov r9,QWORD PTR[((0+8))+rsi] - mov r10,QWORD PTR[((0+16))+rsi] - mov r11,QWORD PTR[((0+24))+rsi] - mov rbp,QWORD PTR[((0+32))+rsi] - - mov rbx,rdx - sar rdx,63 - xor rax,rax - sub rax,rdx - - xor rbx,rdx - add rbx,rax - - xor r8,rdx - xor r9,rdx - xor r10,rdx - xor r11,rdx - xor rbp,rdx - add rax,r8 - adc r9,0 - adc r10,0 - adc r11,0 - adc rbp,0 - - mul rbx - mov r8,rax - mov rax,r9 - mov r9,rdx - mul rbx - add r9,rax - mov rax,r10 - adc rdx,0 - mov r10,rdx - mul rbx - add r10,rax - mov rax,r11 - adc rdx,0 - mov r11,rdx - and rbp,rbx - neg rbp - mul rbx - add r11,rax - adc rbp,rdx - mov rdx,rcx - mov r12,QWORD PTR[((40+0))+rsi] - mov r13,QWORD PTR[((40+8))+rsi] - mov r14,QWORD PTR[((40+16))+rsi] - mov r15,QWORD PTR[((40+24))+rsi] - mov rcx,QWORD PTR[((40+32))+rsi] - - mov rbx,rdx - sar rdx,63 - xor rax,rax - sub rax,rdx - - xor rbx,rdx - add rbx,rax - - xor r12,rdx - xor r13,rdx - xor r14,rdx - xor r15,rdx - xor rcx,rdx - add rax,r12 - adc r13,0 - adc r14,0 - adc r15,0 - adc rcx,0 - - mul rbx - mov r12,rax - mov rax,r13 - mov r13,rdx - mul rbx - add r13,rax - mov rax,r14 - adc rdx,0 - mov r14,rdx - mul rbx - add r14,rax - mov rax,r15 - adc rdx,0 - mov r15,rdx - and rcx,rbx - neg rcx - mul rbx - add r15,rax - adc rcx,rdx - add r8,r12 - adc r9,r13 - adc r10,r14 - adc r11,r15 - adc rbp,rcx - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],rbp - - DB 0F3h,0C3h ;repret -__smulq_256x63 ENDP - -ALIGN 32 -__smulq_256_n_shift_by_31 PROC PRIVATE - DB 243,15,30,250 - - mov QWORD PTR[rdi],rdx - mov QWORD PTR[8+rdi],rcx - mov rbp,rdx - mov r8,QWORD PTR[((0+0))+rsi] - mov r9,QWORD PTR[((0+8))+rsi] - mov r10,QWORD PTR[((0+16))+rsi] - mov r11,QWORD PTR[((0+24))+rsi] - - mov rbx,rbp - sar rbp,63 - xor rax,rax - sub rax,rbp - - xor rbx,rbp - add rbx,rax - - xor r8,rbp - xor r9,rbp - xor r10,rbp - xor r11,rbp - add rax,r8 - adc r9,0 - adc r10,0 - adc r11,0 - - mul rbx - mov r8,rax - mov rax,r9 - and rbp,rbx - neg rbp - mov r9,rdx - mul rbx - add r9,rax - mov rax,r10 - adc rdx,0 - mov r10,rdx - mul rbx - add r10,rax - mov rax,r11 - adc rdx,0 - mov r11,rdx - mul rbx - add r11,rax - adc rbp,rdx - mov r12,QWORD PTR[((32+0))+rsi] - mov r13,QWORD PTR[((32+8))+rsi] - mov r14,QWORD PTR[((32+16))+rsi] - mov r15,QWORD PTR[((32+24))+rsi] - - mov rbx,rcx - sar rcx,63 - xor rax,rax - sub rax,rcx - - xor rbx,rcx - add rbx,rax - - xor r12,rcx - xor r13,rcx - xor r14,rcx - xor r15,rcx - add rax,r12 - adc r13,0 - adc r14,0 - adc r15,0 - - mul rbx - mov r12,rax - mov rax,r13 - and rcx,rbx - neg rcx - mov r13,rdx - mul rbx - add r13,rax - mov rax,r14 - adc rdx,0 - mov r14,rdx - mul rbx - add r14,rax - mov rax,r15 - adc rdx,0 - mov r15,rdx - mul rbx - add r15,rax - adc rcx,rdx - add r8,r12 - adc r9,r13 - adc r10,r14 - adc r11,r15 
- adc rbp,rcx - - mov rdx,QWORD PTR[rdi] - mov rcx,QWORD PTR[8+rdi] - - shrd r8,r9,31 - shrd r9,r10,31 - shrd r10,r11,31 - shrd r11,rbp,31 - - sar rbp,63 - xor rax,rax - sub rax,rbp - - xor r8,rbp - xor r9,rbp - xor r10,rbp - xor r11,rbp - add r8,rax - adc r9,0 - adc r10,0 - adc r11,0 - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - - xor rdx,rbp - xor rcx,rbp - add rdx,rax - add rcx,rax - - DB 0F3h,0C3h ;repret -__smulq_256_n_shift_by_31 ENDP - -ALIGN 32 -__ab_approximation_31_256 PROC PRIVATE - DB 243,15,30,250 - - mov r9,QWORD PTR[24+rsi] - mov r11,QWORD PTR[56+rsi] - mov rbx,QWORD PTR[16+rsi] - mov rbp,QWORD PTR[48+rsi] - mov r8,QWORD PTR[8+rsi] - mov r10,QWORD PTR[40+rsi] - - mov rax,r9 - or rax,r11 - cmovz r9,rbx - cmovz r11,rbp - cmovz rbx,r8 - mov r8,QWORD PTR[rsi] - cmovz rbp,r10 - mov r10,QWORD PTR[32+rsi] - - mov rax,r9 - or rax,r11 - cmovz r9,rbx - cmovz r11,rbp - cmovz rbx,r8 - cmovz rbp,r10 - - mov rax,r9 - or rax,r11 - bsr rcx,rax - lea rcx,QWORD PTR[1+rcx] - cmovz r9,r8 - cmovz r11,r10 - cmovz rcx,rax - neg rcx - - - shld r9,rbx,cl - shld r11,rbp,cl - - mov eax,07FFFFFFFh - and r8,rax - and r10,rax - not rax - and r9,rax - and r11,rax - or r8,r9 - or r10,r11 - - jmp __inner_loop_31_256 - - DB 0F3h,0C3h ;repret -__ab_approximation_31_256 ENDP - -ALIGN 32 -__inner_loop_31_256 PROC PRIVATE - DB 243,15,30,250 - - mov rcx,07FFFFFFF80000000h - mov r13,0800000007FFFFFFFh - mov r15,07FFFFFFF7FFFFFFFh - -$L$oop_31_256:: - cmp r8,r10 - mov rax,r8 - mov rbx,r10 - mov rbp,rcx - mov r14,r13 - cmovb r8,r10 - cmovb r10,rax - cmovb rcx,r13 - cmovb r13,rbp - - sub r8,r10 - sub rcx,r13 - add rcx,r15 - - test rax,1 - cmovz r8,rax - cmovz r10,rbx - cmovz rcx,rbp - cmovz r13,r14 - - shr r8,1 - add r13,r13 - sub r13,r15 - sub edx,1 - jnz $L$oop_31_256 - - shr r15,32 - mov edx,ecx - mov r12d,r13d - shr rcx,32 - shr r13,32 - sub rdx,r15 - sub rcx,r15 - sub r12,r15 - sub r13,r15 - - DB 0F3h,0C3h ;repret -__inner_loop_31_256 ENDP - - -ALIGN 32 -__inner_loop_62_256 PROC PRIVATE - DB 243,15,30,250 - - mov r15d,edx - mov rdx,1 - xor rcx,rcx - xor r12,r12 - mov r13,rdx - mov r14,rdx - -$L$oop_62_256:: - xor rax,rax - test r8,r14 - mov rbx,r10 - cmovnz rax,r10 - sub rbx,r8 - mov rbp,r8 - sub r8,rax - cmovc r8,rbx - cmovc r10,rbp - mov rax,rdx - cmovc rdx,r12 - cmovc r12,rax - mov rbx,rcx - cmovc rcx,r13 - cmovc r13,rbx - xor rax,rax - xor rbx,rbx - shr r8,1 - test rbp,r14 - cmovnz rax,r12 - cmovnz rbx,r13 - add r12,r12 - add r13,r13 - sub rdx,rax - sub rcx,rbx - sub r15d,1 - jnz $L$oop_62_256 - - DB 0F3h,0C3h ;repret -__inner_loop_62_256 ENDP -.text$ ENDS -.pdata SEGMENT READONLY ALIGN(4) -ALIGN 4 - DD imagerel $L$SEH_begin_ct_inverse_mod_256 - DD imagerel $L$SEH_body_ct_inverse_mod_256 - DD imagerel $L$SEH_info_ct_inverse_mod_256_prologue - - DD imagerel $L$SEH_body_ct_inverse_mod_256 - DD imagerel $L$SEH_epilogue_ct_inverse_mod_256 - DD imagerel $L$SEH_info_ct_inverse_mod_256_body - - DD imagerel $L$SEH_epilogue_ct_inverse_mod_256 - DD imagerel $L$SEH_end_ct_inverse_mod_256 - DD imagerel $L$SEH_info_ct_inverse_mod_256_epilogue - -.pdata ENDS -.xdata SEGMENT READONLY ALIGN(8) -ALIGN 8 -$L$SEH_info_ct_inverse_mod_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_ct_inverse_mod_256_body:: -DB 1,0,18,0 -DB 000h,0f4h,086h,000h -DB 000h,0e4h,087h,000h -DB 000h,0d4h,088h,000h -DB 000h,0c4h,089h,000h -DB 000h,034h,08ah,000h -DB 000h,054h,08bh,000h -DB 000h,074h,08dh,000h -DB 000h,064h,08eh,000h -DB 
000h,001h,08ch,000h -DB 000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_ct_inverse_mod_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - - -.xdata ENDS -END diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm b/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm deleted file mode 100644 index 311ce7638ce..00000000000 --- a/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm +++ /dev/null @@ -1,719 +0,0 @@ - AREA |.text|,CODE,ALIGN=8,ARM64 - - - - EXPORT |ct_inverse_mod_383|[FUNC] - ALIGN 32 -|ct_inverse_mod_383| PROC - DCDU 3573752639 - stp x29, x30, [sp,#-128]! - add x29, sp, #0 - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - stp x27, x28, [sp,#80] - sub sp, sp, #1040 - - ldp x22, x4, [x1,#8*0] - ldp x5, x6, [x1,#8*2] - ldp x7, x8, [x1,#8*4] - - add x1, sp, #16+511 // find closest 512-byte-aligned spot - and x1, x1, #-512 // in the frame... - stp x0, x3, [sp] - - ldp x9, x10, [x2,#8*0] - ldp x11, x12, [x2,#8*2] - ldp x13, x14, [x2,#8*4] - - stp x22, x4, [x1,#8*0] // copy input to |a| - stp x5, x6, [x1,#8*2] - stp x7, x8, [x1,#8*4] - stp x9, x10, [x1,#8*6] // copy modulus to |b| - stp x11, x12, [x1,#8*8] - stp x13, x14, [x1,#8*10] - - ////////////////////////////////////////// first iteration - mov x2, #62 - bl |$Lab_approximation_62_loaded| - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - str x15,[x0,#8*12] // initialize |u| with |f0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to dst |b| - bl __smul_383_n_shift_by_62 - str x15, [x0,#8*12] // initialize |v| with |f1| - - ////////////////////////////////////////// second iteration - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - ldr x7, [x1,#8*12] // |u| - ldr x8, [x1,#8*18] // |v| - mul x3, x20, x7 // |u|*|f0| - smulh x4, x20, x7 - mul x5, x21, x8 // |v|*|g0| - smulh x6, x21, x8 - adds x3, x3, x5 - adc x4, x4, x6 - stp x3, x4, [x0,#8*6] - asr x5, x4, #63 // sign extension - stp x5, x5, [x0,#8*8] - stp x5, x5, [x0,#8*10] - - mul x3, x15, x7 // |u|*|f1| - smulh x4, x15, x7 - mul x5, x16, x8 // |v|*|g1| - smulh x6, x16, x8 - adds x3, x3, x5 - adc x4, x4, x6 - stp x3, x4, [x0,#8*12] - asr x5, x4, #63 // sign extension - stp x5, x5, [x0,#8*14] - stp x5, x5, [x0,#8*16] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to 
destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - asr x27, x27, #63 // sign extension - stp x27, x27, [x0,#8*6] - stp x27, x27, [x0,#8*8] - stp x27, x27, [x0,#8*10] - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add 
x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - bl __ab_approximation_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - bl __smul_383_n_shift_by_62 - mov x20, x15 // corrected |f0| - mov x21, x16 // corrected |g0| - - mov x15, x17 // |f1| - mov x16, x19 // |g1| - add x0, x0, #8*6 // pointer to destination |b| - bl __smul_383_n_shift_by_62 - - add x0, x0, #8*6 // pointer to destination |u| - bl __smul_383x63 - - mov x20, x15 // corrected |f1| - mov x21, x16 // corrected |g1| - add x0, x0, #8*6 // pointer to destination |v| - bl __smul_383x63 - bl __smul_767x63_tail - ////////////////////////////////////////// iteration before last - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #62 - //bl __ab_approximation_62 // |a| and |b| are exact, - ldp x3, x8, [x1,#8*0] // just load - ldp x9, x14, [x1,#8*6] - bl __inner_loop_62 - - eor x0, x1, #256 // pointer to dst |a|b|u|v| - str x3, [x0,#8*0] - str x9, [x0,#8*6] - - mov x20, x15 // exact |f0| - mov x21, x16 // exact |g0| - mov x15, x17 - mov x16, x19 - add x0, x0, #8*12 // pointer to dst |u| - bl __smul_383x63 - - mov x20, x15 // exact |f1| - mov x21, x16 // exact |g1| - add x0, x0, #8*6 // pointer to dst |v| - bl __smul_383x63 - bl __smul_767x63_tail - - ////////////////////////////////////////// last iteration - eor x1, x1, #256 // flip-flop src |a|b|u|v| - mov x2, #22 // 766 % 62 - //bl __ab_approximation_62 // |a| and |b| are exact, - ldr x3, [x1,#8*0] // just load - eor x8, x8, x8 - ldr x9, [x1,#8*6] - eor x14, x14, x14 - bl __inner_loop_62 - - mov x20, x17 - mov x21, x19 - ldp x0, x15, [sp] // original out_ptr and n_ptr - bl __smul_383x63 - bl __smul_767x63_tail - ldr x30, [x29,#8] - - asr x22, x8, #63 // sign as mask - ldp x9, x10, [x15,#8*0] - ldp x11, x12, [x15,#8*2] - ldp x13, x14, [x15,#8*4] - - and x9, x9, x22 // add mod<<384 conditionally - and x10, x10, x22 - adds x3, x3, x9 - and x11, x11, x22 - adcs x4, x4, x10 - and x12, x12, x22 - adcs x5, x5, x11 - and x13, x13, x22 - adcs x6, x6, x12 - and x14, x14, x22 - stp x3, x4, [x0,#8*6] - adcs x7, x7, x13 - stp x5, x6, [x0,#8*8] - adc x8, x8, x14 - stp x7, x8, [x0,#8*10] - - add sp, sp, #1040 - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldp x27, x28, [x29,#80] - ldr x29, [sp],#128 - DCDU 3573752767 - ret - ENDP - -//////////////////////////////////////////////////////////////////////// -// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
- - ALIGN 32 -|__smul_383x63| PROC - ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) - asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) - ldp x5, x6, [x1,#8*2+96] - eor x20, x20, x17 // conditionally negate |f_| (or |g_|) - ldp x7, x8, [x1,#8*4+96] - - eor x3, x3, x17 // conditionally negate |u| (or |v|) - sub x20, x20, x17 - eor x4, x4, x17 - adds x3, x3, x17, lsr#63 - eor x5, x5, x17 - adcs x4, x4, xzr - eor x6, x6, x17 - adcs x5, x5, xzr - eor x7, x7, x17 - adcs x6, x6, xzr - umulh x22, x3, x20 - eor x8, x8, x17 - umulh x23, x4, x20 - adcs x7, x7, xzr - umulh x24, x5, x20 - adcs x8, x8, xzr - umulh x25, x6, x20 - umulh x26, x7, x20 - mul x3, x3, x20 - mul x4, x4, x20 - mul x5, x5, x20 - adds x4, x4, x22 - mul x6, x6, x20 - adcs x5, x5, x23 - mul x7, x7, x20 - adcs x6, x6, x24 - mul x27,x8, x20 - adcs x7, x7, x25 - adcs x27,x27,x26 - adc x2, xzr, xzr - ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) - asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) - ldp x11, x12, [x1,#8*2+144] - eor x21, x21, x17 // conditionally negate |f_| (or |g_|) - ldp x13, x14, [x1,#8*4+144] - - eor x9, x9, x17 // conditionally negate |u| (or |v|) - sub x21, x21, x17 - eor x10, x10, x17 - adds x9, x9, x17, lsr#63 - eor x11, x11, x17 - adcs x10, x10, xzr - eor x12, x12, x17 - adcs x11, x11, xzr - eor x13, x13, x17 - adcs x12, x12, xzr - umulh x22, x9, x21 - eor x14, x14, x17 - umulh x23, x10, x21 - adcs x13, x13, xzr - umulh x24, x11, x21 - adcs x14, x14, xzr - umulh x25, x12, x21 - adc x19, xzr, xzr // used in __smul_767x63_tail - umulh x26, x13, x21 - mul x9, x9, x21 - mul x10, x10, x21 - mul x11, x11, x21 - adds x10, x10, x22 - mul x12, x12, x21 - adcs x11, x11, x23 - mul x13, x13, x21 - adcs x12, x12, x24 - mul x28,x14, x21 - adcs x13, x13, x25 - adcs x28,x28,x26 - adc x2, x2, xzr - - adds x3, x3, x9 - adcs x4, x4, x10 - adcs x5, x5, x11 - adcs x6, x6, x12 - stp x3, x4, [x0,#8*0] - adcs x7, x7, x13 - stp x5, x6, [x0,#8*2] - adcs x27, x27, x28 - stp x7, x27, [x0,#8*4] - adc x28, x2, xzr // used in __smul_767x63_tail - - ret - ENDP - - - ALIGN 32 -|__smul_767x63_tail| PROC - smulh x27, x8, x20 - ldp x3, x4, [x1,#8*24] // load rest of |v| - umulh x14,x14, x21 - ldp x5, x6, [x1,#8*26] - ldp x7, x8, [x1,#8*28] - - eor x3, x3, x17 // conditionally negate rest of |v| - eor x4, x4, x17 - eor x5, x5, x17 - adds x3, x3, x19 - eor x6, x6, x17 - adcs x4, x4, xzr - eor x7, x7, x17 - adcs x5, x5, xzr - eor x8, x8, x17 - adcs x6, x6, xzr - umulh x22, x3, x21 - adcs x7, x7, xzr - umulh x23, x4, x21 - adc x8, x8, xzr - - umulh x24, x5, x21 - add x14, x14, x28 - umulh x25, x6, x21 - asr x28, x27, #63 - umulh x26, x7, x21 - mul x3, x3, x21 - mul x4, x4, x21 - mul x5, x5, x21 - adds x3, x3, x14 - mul x6, x6, x21 - adcs x4, x4, x22 - mul x7, x7, x21 - adcs x5, x5, x23 - mul x8, x8, x21 - adcs x6, x6, x24 - adcs x7, x7, x25 - adc x8, x8, x26 - - adds x3, x3, x27 - adcs x4, x4, x28 - adcs x5, x5, x28 - adcs x6, x6, x28 - stp x3, x4, [x0,#8*6] - adcs x7, x7, x28 - stp x5, x6, [x0,#8*8] - adc x8, x8, x28 - stp x7, x8, [x0,#8*10] - - ret - ENDP - - - ALIGN 32 -|__smul_383_n_shift_by_62| PROC - ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) - asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) - ldp x5, x6, [x1,#8*2+0] - eor x2, x15, x28 // conditionally negate |f0| (or |g0|) - ldp x7, x8, [x1,#8*4+0] - - eor x3, x3, x28 // conditionally negate |a| (or |b|) - sub x2, x2, x28 - eor x4, x4, x28 - adds x3, x3, x28, lsr#63 - eor x5, x5, x28 - adcs x4, x4, xzr - eor x6, x6, x28 - adcs x5, x5, xzr - eor x7, x7, x28 - umulh x22, x3, x2 - 
adcs x6, x6, xzr - umulh x23, x4, x2 - eor x8, x8, x28 - umulh x24, x5, x2 - adcs x7, x7, xzr - umulh x25, x6, x2 - adc x8, x8, xzr - - umulh x26, x7, x2 - smulh x27, x8, x2 - mul x3, x3, x2 - mul x4, x4, x2 - mul x5, x5, x2 - adds x4, x4, x22 - mul x6, x6, x2 - adcs x5, x5, x23 - mul x7, x7, x2 - adcs x6, x6, x24 - mul x8, x8, x2 - adcs x7, x7, x25 - adcs x8, x8 ,x26 - adc x27, x27, xzr - ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) - asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) - ldp x11, x12, [x1,#8*2+48] - eor x2, x16, x28 // conditionally negate |f0| (or |g0|) - ldp x13, x14, [x1,#8*4+48] - - eor x9, x9, x28 // conditionally negate |a| (or |b|) - sub x2, x2, x28 - eor x10, x10, x28 - adds x9, x9, x28, lsr#63 - eor x11, x11, x28 - adcs x10, x10, xzr - eor x12, x12, x28 - adcs x11, x11, xzr - eor x13, x13, x28 - umulh x22, x9, x2 - adcs x12, x12, xzr - umulh x23, x10, x2 - eor x14, x14, x28 - umulh x24, x11, x2 - adcs x13, x13, xzr - umulh x25, x12, x2 - adc x14, x14, xzr - - umulh x26, x13, x2 - smulh x28, x14, x2 - mul x9, x9, x2 - mul x10, x10, x2 - mul x11, x11, x2 - adds x10, x10, x22 - mul x12, x12, x2 - adcs x11, x11, x23 - mul x13, x13, x2 - adcs x12, x12, x24 - mul x14, x14, x2 - adcs x13, x13, x25 - adcs x14, x14 ,x26 - adc x28, x28, xzr - adds x3, x3, x9 - adcs x4, x4, x10 - adcs x5, x5, x11 - adcs x6, x6, x12 - adcs x7, x7, x13 - adcs x8, x8, x14 - adc x9, x27, x28 - - extr x3, x4, x3, #62 - extr x4, x5, x4, #62 - extr x5, x6, x5, #62 - asr x28, x9, #63 - extr x6, x7, x6, #62 - extr x7, x8, x7, #62 - extr x8, x9, x8, #62 - - eor x3, x3, x28 - eor x4, x4, x28 - adds x3, x3, x28, lsr#63 - eor x5, x5, x28 - adcs x4, x4, xzr - eor x6, x6, x28 - adcs x5, x5, xzr - eor x7, x7, x28 - adcs x6, x6, xzr - eor x8, x8, x28 - stp x3, x4, [x0,#8*0] - adcs x7, x7, xzr - stp x5, x6, [x0,#8*2] - adc x8, x8, xzr - stp x7, x8, [x0,#8*4] - - eor x15, x15, x28 - eor x16, x16, x28 - sub x15, x15, x28 - sub x16, x16, x28 - - ret - ENDP - - ALIGN 16 -|__ab_approximation_62| PROC - ldp x7, x8, [x1,#8*4] - ldp x13, x14, [x1,#8*10] - ldp x5, x6, [x1,#8*2] - ldp x11, x12, [x1,#8*8] - -|$Lab_approximation_62_loaded| - orr x22, x8, x14 // check top-most limbs, ... - cmp x22, #0 - cselne x8,x8,x7 - cselne x14,x14,x13 - cselne x7,x7,x6 - orr x22, x8, x14 // ... ones before top-most, ... - cselne x13,x13,x12 - - ldp x3, x4, [x1,#8*0] - ldp x9, x10, [x1,#8*6] - - cmp x22, #0 - cselne x8,x8,x7 - cselne x14,x14,x13 - cselne x7,x7,x5 - orr x22, x8, x14 // ... and ones before that ... 
- cselne x13,x13,x11 - - cmp x22, #0 - cselne x8,x8,x7 - cselne x14,x14,x13 - cselne x7,x7,x4 - orr x22, x8, x14 - cselne x13,x13,x10 - - clz x22, x22 - cmp x22, #64 - cselne x22,x22,xzr - cselne x8,x8,x7 - cselne x14,x14,x13 - neg x23, x22 - - lslv x8, x8, x22 // align high limbs to the left - lslv x14, x14, x22 - lsrv x7, x7, x23 - lsrv x13, x13, x23 - and x7, x7, x23, asr#6 - and x13, x13, x23, asr#6 - orr x8, x8, x7 - orr x14, x14, x13 - - b __inner_loop_62 - ret - ENDP - - ALIGN 16 -|__inner_loop_62| PROC - mov x15, #1 // |f0|=1 - mov x16, #0 // |g0|=0 - mov x17, #0 // |f1|=0 - mov x19, #1 // |g1|=1 - -|$Loop_62| - sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting - sub x2, x2, #1 - subs x24, x9, x3 // |b_|-|a_| - and x22, x9, x28 - sbc x25, x14, x8 - and x23, x14, x28 - subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) - mov x22, x15 - sbcs x27, x8, x23 - mov x23, x16 - cselhs x9,x9,x3 - cselhs x14,x14,x8 - cselhs x3,x26,x24 - cselhs x8,x27,x25 - cselhs x15,x15,x17 - cselhs x17,x17,x22 - cselhs x16,x16,x19 - cselhs x19,x19,x23 - extr x3, x8, x3, #1 - lsr x8, x8, #1 - and x22, x17, x28 - and x23, x19, x28 - add x17, x17, x17 // |f1|<<=1 - add x19, x19, x19 // |g1|<<=1 - sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) - cbnz x2, |$Loop_62| - - ret - ENDP - END diff --git a/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm b/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm deleted file mode 100644 index e2454897b33..00000000000 --- a/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm +++ /dev/null @@ -1,326 +0,0 @@ - AREA |.text|,CODE,ALIGN=8,ARM64 - - - - EXPORT |ct_is_square_mod_384|[FUNC] - ALIGN 32 -|ct_is_square_mod_384| PROC - DCDU 3573752639 - stp x29, x30, [sp,#-128]! - add x29, sp, #0 - stp x19, x20, [sp,#16] - stp x21, x22, [sp,#32] - stp x23, x24, [sp,#48] - stp x25, x26, [sp,#64] - stp x27, x28, [sp,#80] - sub sp, sp, #512 - - ldp x3, x4, [x0,#8*0] // load input - ldp x5, x6, [x0,#8*2] - ldp x7, x8, [x0,#8*4] - - add x0, sp, #255 // find closest 256-byte-aligned spot - and x0, x0, #-256 // in the frame... 
- - ldp x9, x10, [x1,#8*0] // load modulus - ldp x11, x12, [x1,#8*2] - ldp x13, x14, [x1,#8*4] - - stp x3, x4, [x0,#8*6] // copy input to |a| - stp x5, x6, [x0,#8*8] - stp x7, x8, [x0,#8*10] - stp x9, x10, [x0,#8*0] // copy modulus to |b| - stp x11, x12, [x0,#8*2] - stp x13, x14, [x0,#8*4] - - eor x2, x2, x2 // init the |$Legendre| symbol - mov x15, #24 // 24 is 768/30-1 - b |$Loop_is_square| - - ALIGN 16 -|$Loop_is_square| - bl __ab_approximation_30 - sub x15, x15, #1 - - eor x1, x0, #128 // pointer to dst |b| - bl __smul_384_n_shift_by_30 - - mov x19, x16 // |f0| - mov x20, x17 // |g0| - add x1, x1, #8*6 // pointer to dst |a| - bl __smul_384_n_shift_by_30 - - ldp x9, x10, [x1,#-8*6] - eor x0, x0, #128 // flip-flop src |a|b| - and x27, x27, x9 // if |a| was negative, - add x2, x2, x27, lsr#1 // adjust |L| - - cbnz x15, |$Loop_is_square| - - ////////////////////////////////////////// last iteration - //bl __ab_approximation_30 // |a| and |b| are exact, - //ldr x8, [x0,#8*6] // and loaded - //ldr x14, [x0,#8*0] - mov x15, #48 // 48 is 768%30 + 30 - bl __inner_loop_48 - ldr x30, [x29,#8] - - and x0, x2, #1 - eor x0, x0, #1 - - add sp, sp, #512 - ldp x19, x20, [x29,#16] - ldp x21, x22, [x29,#32] - ldp x23, x24, [x29,#48] - ldp x25, x26, [x29,#64] - ldp x27, x28, [x29,#80] - ldr x29, [sp],#128 - DCDU 3573752767 - ret - ENDP - - - ALIGN 32 -|__smul_384_n_shift_by_30| PROC - ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) - asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) - ldp x5, x6, [x0,#8*2+0] - eor x20, x20, x27 // conditionally negate |g1| (or |f1|) - ldp x7, x8, [x0,#8*4+0] - - eor x3, x3, x27 // conditionally negate |b| (or |a|) - sub x20, x20, x27 - eor x4, x4, x27 - adds x3, x3, x27, lsr#63 - eor x5, x5, x27 - adcs x4, x4, xzr - eor x6, x6, x27 - adcs x5, x5, xzr - eor x7, x7, x27 - umulh x21, x3, x20 - adcs x6, x6, xzr - umulh x22, x4, x20 - eor x8, x8, x27 - umulh x23, x5, x20 - adcs x7, x7, xzr - umulh x24, x6, x20 - adc x8, x8, xzr - - umulh x25, x7, x20 - and x28, x20, x27 - umulh x26, x8, x20 - neg x28, x28 - mul x3, x3, x20 - mul x4, x4, x20 - mul x5, x5, x20 - adds x4, x4, x21 - mul x6, x6, x20 - adcs x5, x5, x22 - mul x7, x7, x20 - adcs x6, x6, x23 - mul x8, x8, x20 - adcs x7, x7, x24 - adcs x8, x8 ,x25 - adc x26, x26, x28 - ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) - asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) - ldp x11, x12, [x0,#8*2+48] - eor x19, x19, x27 // conditionally negate |g1| (or |f1|) - ldp x13, x14, [x0,#8*4+48] - - eor x9, x9, x27 // conditionally negate |b| (or |a|) - sub x19, x19, x27 - eor x10, x10, x27 - adds x9, x9, x27, lsr#63 - eor x11, x11, x27 - adcs x10, x10, xzr - eor x12, x12, x27 - adcs x11, x11, xzr - eor x13, x13, x27 - umulh x21, x9, x19 - adcs x12, x12, xzr - umulh x22, x10, x19 - eor x14, x14, x27 - umulh x23, x11, x19 - adcs x13, x13, xzr - umulh x24, x12, x19 - adc x14, x14, xzr - - umulh x25, x13, x19 - and x28, x19, x27 - umulh x27, x14, x19 - neg x28, x28 - mul x9, x9, x19 - mul x10, x10, x19 - mul x11, x11, x19 - adds x10, x10, x21 - mul x12, x12, x19 - adcs x11, x11, x22 - mul x13, x13, x19 - adcs x12, x12, x23 - mul x14, x14, x19 - adcs x13, x13, x24 - adcs x14, x14 ,x25 - adc x27, x27, x28 - adds x3, x3, x9 - adcs x4, x4, x10 - adcs x5, x5, x11 - adcs x6, x6, x12 - adcs x7, x7, x13 - adcs x8, x8, x14 - adc x9, x26, x27 - - extr x3, x4, x3, #30 - extr x4, x5, x4, #30 - extr x5, x6, x5, #30 - asr x27, x9, #63 - extr x6, x7, x6, #30 - extr x7, x8, x7, #30 - extr x8, x9, x8, #30 - - eor x3, x3, x27 - eor x4, x4, x27 - adds 
x3, x3, x27, lsr#63 - eor x5, x5, x27 - adcs x4, x4, xzr - eor x6, x6, x27 - adcs x5, x5, xzr - eor x7, x7, x27 - adcs x6, x6, xzr - eor x8, x8, x27 - stp x3, x4, [x1,#8*0] - adcs x7, x7, xzr - stp x5, x6, [x1,#8*2] - adc x8, x8, xzr - stp x7, x8, [x1,#8*4] - - ret - ENDP - - ALIGN 16 -|__ab_approximation_30| PROC - ldp x13, x14, [x0,#8*4] // |a| is still in registers - ldp x11, x12, [x0,#8*2] - - orr x21, x8, x14 // check top-most limbs, ... - cmp x21, #0 - cselne x8,x8,x7 - cselne x14,x14,x13 - cselne x7,x7,x6 - orr x21, x8, x14 // ... ones before top-most, ... - cselne x13,x13,x12 - - cmp x21, #0 - cselne x8,x8,x7 - cselne x14,x14,x13 - cselne x7,x7,x5 - orr x21, x8, x14 // ... and ones before that ... - cselne x13,x13,x11 - - cmp x21, #0 - cselne x8,x8,x7 - cselne x14,x14,x13 - cselne x7,x7,x4 - orr x21, x8, x14 // and one more, ... - cselne x13,x13,x10 - - cmp x21, #0 - cselne x8,x8,x7 - cselne x14,x14,x13 - cselne x7,x7,x3 - orr x21, x8, x14 - cselne x13,x13,x9 - - clz x21, x21 - cmp x21, #64 - cselne x21,x21,xzr - cselne x8,x8,x7 - cselne x14,x14,x13 - neg x22, x21 - - lslv x8, x8, x21 // align high limbs to the left - lslv x14, x14, x21 - lsrv x7, x7, x22 - lsrv x13, x13, x22 - and x7, x7, x22, asr#6 - and x13, x13, x22, asr#6 - orr x8, x8, x7 - orr x14, x14, x13 - - bfxil x8, x3, #0, #32 - bfxil x14, x9, #0, #32 - - b __inner_loop_30 - ret - ENDP - - - ALIGN 16 -|__inner_loop_30| PROC - mov x28, #30 - mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 - mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 - mov x27,#0x7FFFFFFF7FFFFFFF - -|$Loop_30| - sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting - and x25, x8, x14 - sub x28, x28, #1 - and x21, x14, x24 - - sub x22, x14, x8 // |b_|-|a_| - subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) - add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 - mov x21, x20 - cselhs x14,x14,x8 - cselhs x8,x23,x22 - cselhs x20,x20,x17 - cselhs x17,x17,x21 - cselhs x2,x2,x25 - lsr x8, x8, #1 - and x21, x20, x24 - and x22, x27, x24 - add x23, x14, #2 - sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) - add x20, x20, x20 // |f1|<<=1 - add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 - add x17, x17, x22 - sub x20, x20, x27 - - cbnz x28, |$Loop_30| - - mov x27, #0x7FFFFFFF - ubfx x16, x17, #0, #32 - ubfx x17, x17, #32, #32 - ubfx x19, x20, #0, #32 - ubfx x20, x20, #32, #32 - sub x16, x16, x27 // remove the bias - sub x17, x17, x27 - sub x19, x19, x27 - sub x20, x20, x27 - - ret - ENDP - - ALIGN 16 -|__inner_loop_48| PROC -|$Loop_48| - sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting - and x25, x3, x9 - sub x15, x15, #1 - and x21, x9, x24 - sub x22, x9, x3 // |b_|-|a_| - subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) - add x25, x2, x25, lsr#1 - cselhs x9,x9,x3 - cselhs x3,x23,x22 - cselhs x2,x2,x25 - add x23, x9, #2 - lsr x3, x3, #1 - add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 - - cbnz x15, |$Loop_48| - - ret - ENDP - END diff --git a/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm deleted file mode 100644 index be00f479efb..00000000000 --- a/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm +++ /dev/null @@ -1,516 +0,0 @@ -OPTION DOTNAME -.text$ SEGMENT ALIGN(256) 'CODE' - -PUBLIC ct_is_square_mod_384 - - -ALIGN 32 -ct_is_square_mod_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_ct_is_square_mod_384:: - - - 
push rbp - - mov rdi,rcx - mov rsi,rdx - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,536 - -$L$SEH_body_ct_is_square_mod_384:: - - - lea rax,QWORD PTR[((24+255))+rsp] - and rax,-256 - - mov r8,QWORD PTR[rdi] - mov r9,QWORD PTR[8+rdi] - mov r10,QWORD PTR[16+rdi] - mov r11,QWORD PTR[24+rdi] - mov r12,QWORD PTR[32+rdi] - mov r13,QWORD PTR[40+rdi] - - mov r14,QWORD PTR[rsi] - mov r15,QWORD PTR[8+rsi] - mov rbx,QWORD PTR[16+rsi] - mov rcx,QWORD PTR[24+rsi] - mov rdx,QWORD PTR[32+rsi] - mov rdi,QWORD PTR[40+rsi] - mov rsi,rax - - mov QWORD PTR[rax],r8 - mov QWORD PTR[8+rax],r9 - mov QWORD PTR[16+rax],r10 - mov QWORD PTR[24+rax],r11 - mov QWORD PTR[32+rax],r12 - mov QWORD PTR[40+rax],r13 - - mov QWORD PTR[48+rax],r14 - mov QWORD PTR[56+rax],r15 - mov QWORD PTR[64+rax],rbx - mov QWORD PTR[72+rax],rcx - mov QWORD PTR[80+rax],rdx - mov QWORD PTR[88+rax],rdi - - xor rbp,rbp - mov ecx,24 - jmp $L$oop_is_square - -ALIGN 32 -$L$oop_is_square:: - mov DWORD PTR[16+rsp],ecx - - call __ab_approximation_30 - mov QWORD PTR[rsp],rax - mov QWORD PTR[8+rsp],rbx - - mov rdi,128+8*6 - xor rdi,rsi - call __smulq_384_n_shift_by_30 - - mov rdx,QWORD PTR[rsp] - mov rcx,QWORD PTR[8+rsp] - lea rdi,QWORD PTR[((-48))+rdi] - call __smulq_384_n_shift_by_30 - - mov ecx,DWORD PTR[16+rsp] - xor rsi,128 - - and r14,QWORD PTR[48+rdi] - shr r14,1 - add rbp,r14 - - sub ecx,1 - jnz $L$oop_is_square - - - - - mov r9,QWORD PTR[48+rsi] - call __inner_loop_48 - - mov rax,1 - and rax,rbp - xor rax,1 - - lea r8,QWORD PTR[536+rsp] - mov r15,QWORD PTR[r8] - - mov r14,QWORD PTR[8+r8] - - mov r13,QWORD PTR[16+r8] - - mov r12,QWORD PTR[24+r8] - - mov rbx,QWORD PTR[32+r8] - - mov rbp,QWORD PTR[40+r8] - - lea rsp,QWORD PTR[48+r8] - -$L$SEH_epilogue_ct_is_square_mod_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_ct_is_square_mod_384:: -ct_is_square_mod_384 ENDP - - -ALIGN 32 -__smulq_384_n_shift_by_30 PROC PRIVATE - DB 243,15,30,250 - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - mov rbx,rdx - sar rdx,63 - xor rax,rax - sub rax,rdx - - xor rbx,rdx - add rbx,rax - - xor r8,rdx - xor r9,rdx - xor r10,rdx - xor r11,rdx - xor r12,rdx - xor r13,rdx - add rax,r8 - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc r13,0 - - mov r14,rdx - and r14,rbx - mul rbx - mov r8,rax - mov rax,r9 - mov r9,rdx - mul rbx - add r9,rax - mov rax,r10 - adc rdx,0 - mov r10,rdx - mul rbx - add r10,rax - mov rax,r11 - adc rdx,0 - mov r11,rdx - mul rbx - add r11,rax - mov rax,r12 - adc rdx,0 - mov r12,rdx - mul rbx - add r12,rax - mov rax,r13 - adc rdx,0 - mov r13,rdx - neg r14 - mul rbx - add r13,rax - adc r14,rdx - lea rsi,QWORD PTR[48+rsi] - mov rdx,rcx - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - mov rbx,rdx - sar rdx,63 - xor rax,rax - sub rax,rdx - - xor rbx,rdx - add rbx,rax - - xor r8,rdx - xor r9,rdx - xor r10,rdx - xor r11,rdx - xor r12,rdx - xor r13,rdx - add rax,r8 - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc r13,0 - - mov r15,rdx - and r15,rbx - mul rbx - mov r8,rax - mov rax,r9 - mov r9,rdx - mul rbx - add r9,rax - mov rax,r10 - adc rdx,0 - mov r10,rdx - 
mul rbx - add r10,rax - mov rax,r11 - adc rdx,0 - mov r11,rdx - mul rbx - add r11,rax - mov rax,r12 - adc rdx,0 - mov r12,rdx - mul rbx - add r12,rax - mov rax,r13 - adc rdx,0 - mov r13,rdx - neg r15 - mul rbx - add r13,rax - adc r15,rdx - lea rsi,QWORD PTR[((-48))+rsi] - - add r8,QWORD PTR[rdi] - adc r9,QWORD PTR[8+rdi] - adc r10,QWORD PTR[16+rdi] - adc r11,QWORD PTR[24+rdi] - adc r12,QWORD PTR[32+rdi] - adc r13,QWORD PTR[40+rdi] - adc r14,r15 - - shrd r8,r9,30 - shrd r9,r10,30 - shrd r10,r11,30 - shrd r11,r12,30 - shrd r12,r13,30 - shrd r13,r14,30 - - sar r14,63 - xor rbx,rbx - sub rbx,r14 - - xor r8,r14 - xor r9,r14 - xor r10,r14 - xor r11,r14 - xor r12,r14 - xor r13,r14 - add r8,rbx - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc r13,0 - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - - DB 0F3h,0C3h ;repret -__smulq_384_n_shift_by_30 ENDP - -ALIGN 32 -__ab_approximation_30 PROC PRIVATE - DB 243,15,30,250 - - mov rbx,QWORD PTR[88+rsi] - mov r15,QWORD PTR[80+rsi] - mov r14,QWORD PTR[72+rsi] - - mov rax,r13 - or rax,rbx - cmovz r13,r12 - cmovz rbx,r15 - cmovz r12,r11 - mov r11,QWORD PTR[64+rsi] - cmovz r15,r14 - - mov rax,r13 - or rax,rbx - cmovz r13,r12 - cmovz rbx,r15 - cmovz r12,r10 - mov r10,QWORD PTR[56+rsi] - cmovz r15,r11 - - mov rax,r13 - or rax,rbx - cmovz r13,r12 - cmovz rbx,r15 - cmovz r12,r9 - mov r9,QWORD PTR[48+rsi] - cmovz r15,r10 - - mov rax,r13 - or rax,rbx - cmovz r13,r12 - cmovz rbx,r15 - cmovz r12,r8 - cmovz r15,r9 - - mov rax,r13 - or rax,rbx - bsr rcx,rax - lea rcx,QWORD PTR[1+rcx] - cmovz r13,r8 - cmovz rbx,r9 - cmovz rcx,rax - neg rcx - - - shld r13,r12,cl - shld rbx,r15,cl - - mov rax,0FFFFFFFF00000000h - mov r8d,r8d - mov r9d,r9d - and r13,rax - and rbx,rax - or r8,r13 - or r9,rbx - - jmp __inner_loop_30 - - DB 0F3h,0C3h ;repret -__ab_approximation_30 ENDP - -ALIGN 32 -__inner_loop_30 PROC PRIVATE - DB 243,15,30,250 - - mov rbx,07FFFFFFF80000000h - mov rcx,0800000007FFFFFFFh - lea r15,QWORD PTR[((-1))+rbx] - mov edi,30 - -$L$oop_30:: - mov rax,r8 - and rax,r9 - shr rax,1 - - cmp r8,r9 - mov r10,r8 - mov r11,r9 - lea rax,QWORD PTR[rbp*1+rax] - mov r12,rbx - mov r13,rcx - mov r14,rbp - cmovb r8,r9 - cmovb r9,r10 - cmovb rbx,rcx - cmovb rcx,r12 - cmovb rbp,rax - - sub r8,r9 - sub rbx,rcx - add rbx,r15 - - test r10,1 - cmovz r8,r10 - cmovz r9,r11 - cmovz rbx,r12 - cmovz rcx,r13 - cmovz rbp,r14 - - lea rax,QWORD PTR[2+r9] - shr r8,1 - shr rax,2 - add rcx,rcx - lea rbp,QWORD PTR[rbp*1+rax] - sub rcx,r15 - - sub edi,1 - jnz $L$oop_30 - - shr r15,32 - mov eax,ebx - shr rbx,32 - mov edx,ecx - shr rcx,32 - sub rax,r15 - sub rbx,r15 - sub rdx,r15 - sub rcx,r15 - - DB 0F3h,0C3h ;repret -__inner_loop_30 ENDP - - -ALIGN 32 -__inner_loop_48 PROC PRIVATE - DB 243,15,30,250 - - mov edi,48 - -$L$oop_48:: - mov rax,r8 - and rax,r9 - shr rax,1 - - cmp r8,r9 - mov r10,r8 - mov r11,r9 - lea rax,QWORD PTR[rbp*1+rax] - mov r12,rbp - cmovb r8,r9 - cmovb r9,r10 - cmovb rbp,rax - - sub r8,r9 - - test r10,1 - cmovz r8,r10 - cmovz r9,r11 - cmovz rbp,r12 - - lea rax,QWORD PTR[2+r9] - shr r8,1 - shr rax,2 - add rbp,rax - - sub edi,1 - jnz $L$oop_48 - - DB 0F3h,0C3h ;repret -__inner_loop_48 ENDP -.text$ ENDS -.pdata SEGMENT READONLY ALIGN(4) -ALIGN 4 - DD imagerel $L$SEH_begin_ct_is_square_mod_384 - DD imagerel $L$SEH_body_ct_is_square_mod_384 - DD imagerel $L$SEH_info_ct_is_square_mod_384_prologue - - DD imagerel $L$SEH_body_ct_is_square_mod_384 - DD imagerel 
$L$SEH_epilogue_ct_is_square_mod_384 - DD imagerel $L$SEH_info_ct_is_square_mod_384_body - - DD imagerel $L$SEH_epilogue_ct_is_square_mod_384 - DD imagerel $L$SEH_end_ct_is_square_mod_384 - DD imagerel $L$SEH_info_ct_is_square_mod_384_epilogue - -.pdata ENDS -.xdata SEGMENT READONLY ALIGN(8) -ALIGN 8 -$L$SEH_info_ct_is_square_mod_384_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_ct_is_square_mod_384_body:: -DB 1,0,18,0 -DB 000h,0f4h,043h,000h -DB 000h,0e4h,044h,000h -DB 000h,0d4h,045h,000h -DB 000h,0c4h,046h,000h -DB 000h,034h,047h,000h -DB 000h,054h,048h,000h -DB 000h,074h,04ah,000h -DB 000h,064h,04bh,000h -DB 000h,001h,049h,000h -DB 000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_ct_is_square_mod_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - - -.xdata ENDS -END diff --git a/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm deleted file mode 100644 index 89fbe5d0666..00000000000 --- a/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm +++ /dev/null @@ -1,1240 +0,0 @@ -OPTION DOTNAME -EXTERN ct_inverse_mod_383$1:NEAR -_DATA SEGMENT -COMM __blst_platform_cap:DWORD:1 -_DATA ENDS -.text$ SEGMENT ALIGN(256) 'CODE' - -PUBLIC ct_inverse_mod_383 - - -ALIGN 32 -ct_inverse_mod_383 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_ct_inverse_mod_383:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz ct_inverse_mod_383$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,1112 - -$L$SEH_body_ct_inverse_mod_383:: - - - lea rax,QWORD PTR[((88+511))+rsp] - and rax,-512 - mov QWORD PTR[32+rsp],rdi - mov QWORD PTR[40+rsp],rcx - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - mov r14,QWORD PTR[rdx] - mov r15,QWORD PTR[8+rdx] - mov rbx,QWORD PTR[16+rdx] - mov rbp,QWORD PTR[24+rdx] - mov rsi,QWORD PTR[32+rdx] - mov rdi,QWORD PTR[40+rdx] - - mov QWORD PTR[rax],r8 - mov QWORD PTR[8+rax],r9 - mov QWORD PTR[16+rax],r10 - mov QWORD PTR[24+rax],r11 - mov QWORD PTR[32+rax],r12 - mov QWORD PTR[40+rax],r13 - - mov QWORD PTR[48+rax],r14 - mov QWORD PTR[56+rax],r15 - mov QWORD PTR[64+rax],rbx - mov QWORD PTR[72+rax],rbp - mov QWORD PTR[80+rax],rsi - mov rsi,rax - mov QWORD PTR[88+rax],rdi - - - mov edi,62 - call __ab_approximation_62 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_383_n_shift_by_62 - - - mov QWORD PTR[96+rdi],rdx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383_n_shift_by_62 - - - mov QWORD PTR[96+rdi],rdx - - - xor rsi,256 - mov edi,62 - call __ab_approximation_62 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_383_n_shift_by_62 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383_n_shift_by_62 - - - - mov rax,QWORD PTR[96+rsi] - mov r11,QWORD PTR[144+rsi] - mov rbx,rdx - mov r10,rax - imul QWORD PTR[56+rsp] - mov r8,rax - mov rax,r11 - mov r9,rdx - imul QWORD PTR[64+rsp] - add r8,rax - adc r9,rdx - mov QWORD PTR[48+rdi],r8 
- mov QWORD PTR[56+rdi],r9 - sar r9,63 - mov QWORD PTR[64+rdi],r9 - mov QWORD PTR[72+rdi],r9 - mov QWORD PTR[80+rdi],r9 - mov QWORD PTR[88+rdi],r9 - lea rsi,QWORD PTR[96+rsi] - - mov rax,r10 - imul rbx - mov r8,rax - mov rax,r11 - mov r9,rdx - imul rcx - add r8,rax - adc r9,rdx - mov QWORD PTR[96+rdi],r8 - mov QWORD PTR[104+rdi],r9 - sar r9,63 - mov QWORD PTR[112+rdi],r9 - mov QWORD PTR[120+rdi],r9 - mov QWORD PTR[128+rdi],r9 - mov QWORD PTR[136+rdi],r9 - xor rsi,256+8*12 - mov edi,62 - call __ab_approximation_62 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_383_n_shift_by_62 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383_n_shift_by_62 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383x63 - xor rsi,256+8*12 - mov edi,62 - call __ab_approximation_62 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_383_n_shift_by_62 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383_n_shift_by_62 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383x63 - xor rsi,256+8*12 - mov edi,62 - call __ab_approximation_62 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_383_n_shift_by_62 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383_n_shift_by_62 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383x63 - xor rsi,256+8*12 - mov edi,62 - call __ab_approximation_62 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_383_n_shift_by_62 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383_n_shift_by_62 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383x63 - sar r13,63 - mov QWORD PTR[48+rdi],r13 - mov QWORD PTR[56+rdi],r13 - mov QWORD PTR[64+rdi],r13 - mov QWORD PTR[72+rdi],r13 - mov QWORD PTR[80+rdi],r13 - mov QWORD PTR[88+rdi],r13 - xor rsi,256+8*12 - mov edi,62 - call __ab_approximation_62 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_383_n_shift_by_62 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov 
rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383_n_shift_by_62 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_767x63 - xor rsi,256+8*12 - mov edi,62 - call __ab_approximation_62 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_383_n_shift_by_62 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383_n_shift_by_62 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_767x63 - xor rsi,256+8*12 - mov edi,62 - call __ab_approximation_62 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_383_n_shift_by_62 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383_n_shift_by_62 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_767x63 - xor rsi,256+8*12 - mov edi,62 - call __ab_approximation_62 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_383_n_shift_by_62 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383_n_shift_by_62 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_767x63 - xor rsi,256+8*12 - mov edi,62 - call __ab_approximation_62 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulq_383_n_shift_by_62 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383_n_shift_by_62 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulq_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_767x63 - - xor rsi,256+8*12 - mov edi,62 - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[48+rsi] - mov r11,QWORD PTR[56+rsi] - call __inner_loop_62 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - mov QWORD PTR[rdi],r8 - mov QWORD PTR[48+rdi],r10 - - - - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[96+rdi] - call __smulq_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulq_767x63 - - - xor rsi,256+8*12 - mov edi,22 
- - mov r8,QWORD PTR[rsi] - xor r9,r9 - mov r10,QWORD PTR[48+rsi] - xor r11,r11 - call __inner_loop_62 - - - - - - - - lea rsi,QWORD PTR[96+rsi] - - - - - - mov rdx,r12 - mov rcx,r13 - mov rdi,QWORD PTR[32+rsp] - call __smulq_767x63 - - mov rsi,QWORD PTR[40+rsp] - mov rdx,rax - sar rax,63 - - mov r8,rax - mov r9,rax - mov r10,rax - and r8,QWORD PTR[rsi] - and r9,QWORD PTR[8+rsi] - mov r11,rax - and r10,QWORD PTR[16+rsi] - and r11,QWORD PTR[24+rsi] - mov r12,rax - and r12,QWORD PTR[32+rsi] - and rax,QWORD PTR[40+rsi] - - add r14,r8 - adc r15,r9 - adc rbx,r10 - adc rbp,r11 - adc rcx,r12 - adc rdx,rax - - mov QWORD PTR[48+rdi],r14 - mov QWORD PTR[56+rdi],r15 - mov QWORD PTR[64+rdi],rbx - mov QWORD PTR[72+rdi],rbp - mov QWORD PTR[80+rdi],rcx - mov QWORD PTR[88+rdi],rdx - - lea r8,QWORD PTR[1112+rsp] - mov r15,QWORD PTR[r8] - - mov r14,QWORD PTR[8+r8] - - mov r13,QWORD PTR[16+r8] - - mov r12,QWORD PTR[24+r8] - - mov rbx,QWORD PTR[32+r8] - - mov rbp,QWORD PTR[40+r8] - - lea rsp,QWORD PTR[48+r8] - -$L$SEH_epilogue_ct_inverse_mod_383:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_ct_inverse_mod_383:: -ct_inverse_mod_383 ENDP - -ALIGN 32 -__smulq_767x63 PROC PRIVATE - DB 243,15,30,250 - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - mov rbp,rdx - sar rdx,63 - xor rax,rax - sub rax,rdx - - mov QWORD PTR[8+rsp],rdi - mov QWORD PTR[16+rsp],rsi - lea rsi,QWORD PTR[48+rsi] - - xor rbp,rdx - add rbp,rax - - xor r8,rdx - xor r9,rdx - xor r10,rdx - xor r11,rdx - xor r12,rdx - xor r13,rdx - add rax,r8 - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc r13,0 - - mul rbp - mov QWORD PTR[rdi],rax - mov rax,r9 - mov r9,rdx - mul rbp - add r9,rax - mov rax,r10 - adc rdx,0 - mov r10,rdx - mov QWORD PTR[8+rdi],r9 - mul rbp - add r10,rax - mov rax,r11 - adc rdx,0 - mov r11,rdx - mov QWORD PTR[16+rdi],r10 - mul rbp - add r11,rax - mov rax,r12 - adc rdx,0 - mov r12,rdx - mov QWORD PTR[24+rdi],r11 - mul rbp - add r12,rax - mov rax,r13 - adc rdx,0 - mov r13,rdx - mov QWORD PTR[32+rdi],r12 - imul rbp - add r13,rax - adc rdx,0 - - mov QWORD PTR[40+rdi],r13 - mov QWORD PTR[48+rdi],rdx - sar rdx,63 - mov QWORD PTR[56+rdi],rdx - mov rdx,rcx - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - mov r14,QWORD PTR[48+rsi] - mov r15,QWORD PTR[56+rsi] - mov rbx,QWORD PTR[64+rsi] - mov rbp,QWORD PTR[72+rsi] - mov rcx,QWORD PTR[80+rsi] - mov rdi,QWORD PTR[88+rsi] - - mov rsi,rdx - sar rdx,63 - xor rax,rax - sub rax,rdx - - xor rsi,rdx - add rsi,rax - - xor r8,rdx - xor r9,rdx - xor r10,rdx - xor r11,rdx - xor r12,rdx - xor r13,rdx - xor r14,rdx - xor r15,rdx - xor rbx,rdx - xor rbp,rdx - xor rcx,rdx - xor rdi,rdx - add rax,r8 - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc r13,0 - adc r14,0 - adc r15,0 - adc rbx,0 - adc rbp,0 - adc rcx,0 - adc rdi,0 - - mul rsi - mov r8,rax - mov rax,r9 - mov r9,rdx - mul rsi - add r9,rax - mov rax,r10 - adc rdx,0 - mov r10,rdx - mul rsi - add r10,rax - mov rax,r11 - adc rdx,0 - mov r11,rdx - mul rsi - add r11,rax - mov rax,r12 - adc rdx,0 - mov r12,rdx - mul rsi - add r12,rax - mov rax,r13 - adc rdx,0 - mov r13,rdx - mul rsi - add r13,rax - mov rax,r14 - adc rdx,0 - mov r14,rdx - mul rsi - add r14,rax - mov rax,r15 - adc rdx,0 - mov r15,rdx - mul rsi - add r15,rax - mov rax,rbx - adc rdx,0 - mov 
rbx,rdx - mul rsi - add rbx,rax - mov rax,rbp - adc rdx,0 - mov rbp,rdx - mul rsi - add rbp,rax - mov rax,rcx - adc rdx,0 - mov rcx,rdx - mul rsi - add rcx,rax - mov rax,rdi - adc rdx,0 - mov rdi,rdx - mov rdx,QWORD PTR[8+rsp] - imul rax,rsi - mov rsi,QWORD PTR[16+rsp] - add rax,rdi - - add r8,QWORD PTR[rdx] - adc r9,QWORD PTR[8+rdx] - adc r10,QWORD PTR[16+rdx] - adc r11,QWORD PTR[24+rdx] - adc r12,QWORD PTR[32+rdx] - adc r13,QWORD PTR[40+rdx] - adc r14,QWORD PTR[48+rdx] - mov rdi,QWORD PTR[56+rdx] - adc r15,rdi - adc rbx,rdi - adc rbp,rdi - adc rcx,rdi - adc rax,rdi - - mov rdi,rdx - - mov QWORD PTR[rdx],r8 - mov QWORD PTR[8+rdx],r9 - mov QWORD PTR[16+rdx],r10 - mov QWORD PTR[24+rdx],r11 - mov QWORD PTR[32+rdx],r12 - mov QWORD PTR[40+rdx],r13 - mov QWORD PTR[48+rdx],r14 - mov QWORD PTR[56+rdx],r15 - mov QWORD PTR[64+rdx],rbx - mov QWORD PTR[72+rdx],rbp - mov QWORD PTR[80+rdx],rcx - mov QWORD PTR[88+rdx],rax - - DB 0F3h,0C3h ;repret -__smulq_767x63 ENDP - -ALIGN 32 -__smulq_383x63 PROC PRIVATE - DB 243,15,30,250 - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - mov rbp,rdx - sar rdx,63 - xor rax,rax - sub rax,rdx - - xor rbp,rdx - add rbp,rax - - xor r8,rdx - xor r9,rdx - xor r10,rdx - xor r11,rdx - xor r12,rdx - xor r13,rdx - add rax,r8 - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc r13,0 - - mul rbp - mov r8,rax - mov rax,r9 - mov r9,rdx - mul rbp - add r9,rax - mov rax,r10 - adc rdx,0 - mov r10,rdx - mul rbp - add r10,rax - mov rax,r11 - adc rdx,0 - mov r11,rdx - mul rbp - add r11,rax - mov rax,r12 - adc rdx,0 - mov r12,rdx - mul rbp - add r12,rax - mov rax,r13 - adc rdx,0 - mov r13,rdx - imul rax,rbp - add r13,rax - - lea rsi,QWORD PTR[48+rsi] - mov rdx,rcx - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - mov rbp,rdx - sar rdx,63 - xor rax,rax - sub rax,rdx - - xor rbp,rdx - add rbp,rax - - xor r8,rdx - xor r9,rdx - xor r10,rdx - xor r11,rdx - xor r12,rdx - xor r13,rdx - add rax,r8 - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc r13,0 - - mul rbp - mov r8,rax - mov rax,r9 - mov r9,rdx - mul rbp - add r9,rax - mov rax,r10 - adc rdx,0 - mov r10,rdx - mul rbp - add r10,rax - mov rax,r11 - adc rdx,0 - mov r11,rdx - mul rbp - add r11,rax - mov rax,r12 - adc rdx,0 - mov r12,rdx - mul rbp - add r12,rax - mov rax,r13 - adc rdx,0 - mov r13,rdx - imul rax,rbp - add r13,rax - - lea rsi,QWORD PTR[((-48))+rsi] - - add r8,QWORD PTR[rdi] - adc r9,QWORD PTR[8+rdi] - adc r10,QWORD PTR[16+rdi] - adc r11,QWORD PTR[24+rdi] - adc r12,QWORD PTR[32+rdi] - adc r13,QWORD PTR[40+rdi] - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - - DB 0F3h,0C3h ;repret -__smulq_383x63 ENDP - -ALIGN 32 -__smulq_383_n_shift_by_62 PROC PRIVATE - DB 243,15,30,250 - - mov rbx,rdx - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - mov rbp,rdx - sar rdx,63 - xor rax,rax - sub rax,rdx - - xor rbp,rdx - add rbp,rax - - xor r8,rdx - xor r9,rdx - xor r10,rdx - xor r11,rdx - xor r12,rdx - xor r13,rdx - add 
rax,r8 - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc r13,0 - - mul rbp - mov r8,rax - mov rax,r9 - mov r9,rdx - mul rbp - add r9,rax - mov rax,r10 - adc rdx,0 - mov r10,rdx - mul rbp - add r10,rax - mov rax,r11 - adc rdx,0 - mov r11,rdx - mul rbp - add r11,rax - mov rax,r12 - adc rdx,0 - mov r12,rdx - mul rbp - add r12,rax - mov rax,r13 - adc rdx,0 - mov r13,rdx - imul rbp - add r13,rax - adc rdx,0 - - lea rsi,QWORD PTR[48+rsi] - mov r14,rdx - mov rdx,rcx - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - mov rbp,rdx - sar rdx,63 - xor rax,rax - sub rax,rdx - - xor rbp,rdx - add rbp,rax - - xor r8,rdx - xor r9,rdx - xor r10,rdx - xor r11,rdx - xor r12,rdx - xor r13,rdx - add rax,r8 - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc r13,0 - - mul rbp - mov r8,rax - mov rax,r9 - mov r9,rdx - mul rbp - add r9,rax - mov rax,r10 - adc rdx,0 - mov r10,rdx - mul rbp - add r10,rax - mov rax,r11 - adc rdx,0 - mov r11,rdx - mul rbp - add r11,rax - mov rax,r12 - adc rdx,0 - mov r12,rdx - mul rbp - add r12,rax - mov rax,r13 - adc rdx,0 - mov r13,rdx - imul rbp - add r13,rax - adc rdx,0 - - lea rsi,QWORD PTR[((-48))+rsi] - - add r8,QWORD PTR[rdi] - adc r9,QWORD PTR[8+rdi] - adc r10,QWORD PTR[16+rdi] - adc r11,QWORD PTR[24+rdi] - adc r12,QWORD PTR[32+rdi] - adc r13,QWORD PTR[40+rdi] - adc r14,rdx - mov rdx,rbx - - shrd r8,r9,62 - shrd r9,r10,62 - shrd r10,r11,62 - shrd r11,r12,62 - shrd r12,r13,62 - shrd r13,r14,62 - - sar r14,63 - xor rbp,rbp - sub rbp,r14 - - xor r8,r14 - xor r9,r14 - xor r10,r14 - xor r11,r14 - xor r12,r14 - xor r13,r14 - add r8,rbp - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc r13,0 - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - - xor rdx,r14 - xor rcx,r14 - add rdx,rbp - add rcx,rbp - - DB 0F3h,0C3h ;repret -__smulq_383_n_shift_by_62 ENDP - -ALIGN 32 -__ab_approximation_62 PROC PRIVATE - DB 243,15,30,250 - - mov r9,QWORD PTR[40+rsi] - mov r11,QWORD PTR[88+rsi] - mov rbx,QWORD PTR[32+rsi] - mov rbp,QWORD PTR[80+rsi] - mov r8,QWORD PTR[24+rsi] - mov r10,QWORD PTR[72+rsi] - - mov rax,r9 - or rax,r11 - cmovz r9,rbx - cmovz r11,rbp - cmovz rbx,r8 - cmovz rbp,r10 - mov r8,QWORD PTR[16+rsi] - mov r10,QWORD PTR[64+rsi] - - mov rax,r9 - or rax,r11 - cmovz r9,rbx - cmovz r11,rbp - cmovz rbx,r8 - cmovz rbp,r10 - mov r8,QWORD PTR[8+rsi] - mov r10,QWORD PTR[56+rsi] - - mov rax,r9 - or rax,r11 - cmovz r9,rbx - cmovz r11,rbp - cmovz rbx,r8 - cmovz rbp,r10 - mov r8,QWORD PTR[rsi] - mov r10,QWORD PTR[48+rsi] - - mov rax,r9 - or rax,r11 - bsr rcx,rax - lea rcx,QWORD PTR[1+rcx] - cmovz r9,rbx - cmovz r11,rbp - cmovz rcx,rax - neg rcx - - - shld r9,rbx,cl - shld r11,rbp,cl - - jmp __inner_loop_62 - - DB 0F3h,0C3h ;repret -__ab_approximation_62 ENDP - -ALIGN 8 - DD 0 -__inner_loop_62 PROC PRIVATE - DB 243,15,30,250 - - mov rdx,1 - xor rcx,rcx - xor r12,r12 - mov r13,1 - mov QWORD PTR[8+rsp],rsi - -$L$oop_62:: - xor rax,rax - xor rbx,rbx - test r8,1 - mov rbp,r10 - mov r14,r11 - cmovnz rax,r10 - cmovnz rbx,r11 - sub rbp,r8 - sbb r14,r9 - mov r15,r8 - mov rsi,r9 - sub r8,rax - sbb r9,rbx - cmovc r8,rbp - cmovc r9,r14 - cmovc r10,r15 - cmovc r11,rsi - mov rax,rdx - cmovc rdx,r12 - cmovc r12,rax - 
mov rbx,rcx - cmovc rcx,r13 - cmovc r13,rbx - xor rax,rax - xor rbx,rbx - shrd r8,r9,1 - shr r9,1 - test r15,1 - cmovnz rax,r12 - cmovnz rbx,r13 - add r12,r12 - add r13,r13 - sub rdx,rax - sub rcx,rbx - sub edi,1 - jnz $L$oop_62 - - mov rsi,QWORD PTR[8+rsp] - DB 0F3h,0C3h ;repret -__inner_loop_62 ENDP -.text$ ENDS -.pdata SEGMENT READONLY ALIGN(4) -ALIGN 4 - DD imagerel $L$SEH_begin_ct_inverse_mod_383 - DD imagerel $L$SEH_body_ct_inverse_mod_383 - DD imagerel $L$SEH_info_ct_inverse_mod_383_prologue - - DD imagerel $L$SEH_body_ct_inverse_mod_383 - DD imagerel $L$SEH_epilogue_ct_inverse_mod_383 - DD imagerel $L$SEH_info_ct_inverse_mod_383_body - - DD imagerel $L$SEH_epilogue_ct_inverse_mod_383 - DD imagerel $L$SEH_end_ct_inverse_mod_383 - DD imagerel $L$SEH_info_ct_inverse_mod_383_epilogue - -.pdata ENDS -.xdata SEGMENT READONLY ALIGN(8) -ALIGN 8 -$L$SEH_info_ct_inverse_mod_383_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_ct_inverse_mod_383_body:: -DB 1,0,18,0 -DB 000h,0f4h,08bh,000h -DB 000h,0e4h,08ch,000h -DB 000h,0d4h,08dh,000h -DB 000h,0c4h,08eh,000h -DB 000h,034h,08fh,000h -DB 000h,054h,090h,000h -DB 000h,074h,092h,000h -DB 000h,064h,093h,000h -DB 000h,001h,091h,000h -DB 000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_ct_inverse_mod_383_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - - -.xdata ENDS -END diff --git a/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm deleted file mode 100644 index 024da69a645..00000000000 --- a/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm +++ /dev/null @@ -1,1609 +0,0 @@ -OPTION DOTNAME -PUBLIC ct_inverse_mod_383$1 -.text$ SEGMENT ALIGN(256) 'CODE' - -PUBLIC ctx_inverse_mod_383 - - -ALIGN 32 -ctx_inverse_mod_383 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_ctx_inverse_mod_383:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -ct_inverse_mod_383$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,1112 - -$L$SEH_body_ctx_inverse_mod_383:: - - - lea rax,QWORD PTR[((88+511))+rsp] - and rax,-512 - mov QWORD PTR[32+rsp],rdi - mov QWORD PTR[40+rsp],rcx - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - mov r14,QWORD PTR[rdx] - mov r15,QWORD PTR[8+rdx] - mov rbx,QWORD PTR[16+rdx] - mov rbp,QWORD PTR[24+rdx] - mov rsi,QWORD PTR[32+rdx] - mov rdi,QWORD PTR[40+rdx] - - mov QWORD PTR[rax],r8 - mov QWORD PTR[8+rax],r9 - mov QWORD PTR[16+rax],r10 - mov QWORD PTR[24+rax],r11 - mov QWORD PTR[32+rax],r12 - mov QWORD PTR[40+rax],r13 - - mov QWORD PTR[48+rax],r14 - mov QWORD PTR[56+rax],r15 - mov QWORD PTR[64+rax],rbx - mov QWORD PTR[72+rax],rbp - mov QWORD PTR[80+rax],rsi - mov rsi,rax - mov QWORD PTR[88+rax],rdi - - - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - - - mov QWORD PTR[96+rdi],rdx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - - - mov QWORD PTR[96+rdi],rdx - - - xor rsi,256 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call 
__smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - - - - mov rax,QWORD PTR[96+rsi] - mov r11,QWORD PTR[144+rsi] - mov rbx,rdx - mov r10,rax - imul QWORD PTR[56+rsp] - mov r8,rax - mov rax,r11 - mov r9,rdx - imul QWORD PTR[64+rsp] - add r8,rax - adc r9,rdx - mov QWORD PTR[48+rdi],r8 - mov QWORD PTR[56+rdi],r9 - sar r9,63 - mov QWORD PTR[64+rdi],r9 - mov QWORD PTR[72+rdi],r9 - mov QWORD PTR[80+rdi],r9 - mov QWORD PTR[88+rdi],r9 - lea rsi,QWORD PTR[96+rsi] - - mov rax,r10 - imul rbx - mov r8,rax - mov rax,r11 - mov r9,rdx - imul rcx - add r8,rax - adc r9,rdx - mov QWORD PTR[96+rdi],r8 - mov QWORD PTR[104+rdi],r9 - sar r9,63 - mov QWORD PTR[112+rdi],r9 - mov QWORD PTR[120+rdi],r9 - mov QWORD PTR[128+rdi],r9 - mov QWORD PTR[136+rdi],r9 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - xor rsi,256+8*12 - mov 
edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov 
rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - sar r13,63 - mov QWORD PTR[48+rdi],r13 - mov QWORD PTR[56+rdi],r13 - mov QWORD PTR[64+rdi],r13 - mov QWORD PTR[72+rdi],r13 - mov QWORD PTR[80+rdi],r13 - mov QWORD PTR[88+rdi],r13 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_767x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_767x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_767x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_767x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call 
__smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_767x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_767x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_383_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_767x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_191_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_191_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_767x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_191_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_191_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_767x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_191_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_191_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov 
rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_767x63 - xor rsi,256+8*12 - mov edi,31 - call __ab_approximation_31 - - - mov QWORD PTR[72+rsp],r12 - mov QWORD PTR[80+rsp],r13 - - mov rdi,256 - xor rdi,rsi - call __smulx_191_n_shift_by_31 - mov QWORD PTR[56+rsp],rdx - mov QWORD PTR[64+rsp],rcx - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_191_n_shift_by_31 - mov QWORD PTR[72+rsp],rdx - mov QWORD PTR[80+rsp],rcx - - mov rdx,QWORD PTR[56+rsp] - mov rcx,QWORD PTR[64+rsp] - lea rsi,QWORD PTR[96+rsi] - lea rdi,QWORD PTR[48+rdi] - call __smulx_383x63 - - mov rdx,QWORD PTR[72+rsp] - mov rcx,QWORD PTR[80+rsp] - lea rdi,QWORD PTR[48+rdi] - call __smulx_767x63 - - xor rsi,256+8*12 - mov edi,53 - - mov r8,QWORD PTR[rsi] - - mov r10,QWORD PTR[48+rsi] - - call __tail_loop_53 - - - - - - - - lea rsi,QWORD PTR[96+rsi] - - - - - - mov rdx,r12 - mov rcx,r13 - mov rdi,QWORD PTR[32+rsp] - call __smulx_767x63 - - mov rsi,QWORD PTR[40+rsp] - mov rdx,rax - sar rax,63 - - mov r8,rax - mov r9,rax - mov r10,rax - and r8,QWORD PTR[rsi] - and r9,QWORD PTR[8+rsi] - mov r11,rax - and r10,QWORD PTR[16+rsi] - and r11,QWORD PTR[24+rsi] - mov r12,rax - and r12,QWORD PTR[32+rsi] - and rax,QWORD PTR[40+rsi] - - add r14,r8 - adc r15,r9 - adc rbx,r10 - adc rbp,r11 - adc rcx,r12 - adc rdx,rax - - mov QWORD PTR[48+rdi],r14 - mov QWORD PTR[56+rdi],r15 - mov QWORD PTR[64+rdi],rbx - mov QWORD PTR[72+rdi],rbp - mov QWORD PTR[80+rdi],rcx - mov QWORD PTR[88+rdi],rdx - - lea r8,QWORD PTR[1112+rsp] - mov r15,QWORD PTR[r8] - - mov r14,QWORD PTR[8+r8] - - mov r13,QWORD PTR[16+r8] - - mov r12,QWORD PTR[24+r8] - - mov rbx,QWORD PTR[32+r8] - - mov rbp,QWORD PTR[40+r8] - - lea rsp,QWORD PTR[48+r8] - -$L$SEH_epilogue_ctx_inverse_mod_383:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_ctx_inverse_mod_383:: -ctx_inverse_mod_383 ENDP - -ALIGN 32 -__smulx_767x63 PROC PRIVATE - DB 243,15,30,250 - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - mov rax,rdx - sar rax,63 - xor rbp,rbp - sub rbp,rax - - mov QWORD PTR[8+rsp],rdi - mov QWORD PTR[16+rsp],rsi - lea rsi,QWORD PTR[48+rsi] - - xor rdx,rax - add rdx,rbp - - xor r8,rax - xor r9,rax - xor r10,rax - xor r11,rax - xor r12,rax - xor rax,r13 - add r8,rbp - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc rax,0 - - mulx rbp,r8,r8 - mulx r13,r9,r9 - add r9,rbp - mulx rbp,r10,r10 - adc r10,r13 - mulx r13,r11,r11 - adc r11,rbp - mulx rbp,r12,r12 - adc r12,r13 - adc rbp,0 - imul rdx - add rax,rbp - adc rdx,0 - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],rax - mov QWORD PTR[48+rdi],rdx - sar rdx,63 - mov QWORD PTR[56+rdi],rdx - mov rdx,rcx - mov rax,rcx - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - mov r14,QWORD PTR[48+rsi] - mov r15,QWORD PTR[56+rsi] - mov rbx,QWORD PTR[64+rsi] - mov rbp,QWORD PTR[72+rsi] - mov rcx,QWORD PTR[80+rsi] - mov rdi,QWORD PTR[88+rsi] - - sar rax,63 - xor rsi,rsi - sub rsi,rax - - xor rdx,rax - add rdx,rsi - - xor r8,rax - xor r9,rax - xor r10,rax - xor r11,rax - xor r12,rax - xor r13,rax - xor r14,rax - xor r15,rax - xor rbx,rax - xor rbp,rax - xor rcx,rax - xor rdi,rax - add r8,rsi - adc r9,0 - 
adc r10,0 - adc r11,0 - adc r12,0 - adc r13,0 - adc r14,0 - adc r15,0 - adc rbx,0 - adc rbp,0 - adc rcx,0 - adc rdi,0 - - mulx rax,r8,r8 - mulx rsi,r9,r9 - add r9,rax - mulx rax,r10,r10 - adc r10,rsi - mulx rsi,r11,r11 - adc r11,rax - mulx rax,r12,r12 - adc r12,rsi - mulx rsi,r13,r13 - adc r13,rax - mulx rax,r14,r14 - adc r14,rsi - mulx rsi,r15,r15 - adc r15,rax - mulx rax,rbx,rbx - adc rbx,rsi - mulx rsi,rbp,rbp - adc rbp,rax - mulx rax,rcx,rcx - adc rcx,rsi - mulx rsi,rdi,rdi - mov rdx,QWORD PTR[8+rsp] - mov rsi,QWORD PTR[16+rsp] - adc rax,rdi - - add r8,QWORD PTR[rdx] - adc r9,QWORD PTR[8+rdx] - adc r10,QWORD PTR[16+rdx] - adc r11,QWORD PTR[24+rdx] - adc r12,QWORD PTR[32+rdx] - adc r13,QWORD PTR[40+rdx] - adc r14,QWORD PTR[48+rdx] - mov rdi,QWORD PTR[56+rdx] - adc r15,rdi - adc rbx,rdi - adc rbp,rdi - adc rcx,rdi - adc rax,rdi - - mov rdi,rdx - - mov QWORD PTR[rdx],r8 - mov QWORD PTR[8+rdx],r9 - mov QWORD PTR[16+rdx],r10 - mov QWORD PTR[24+rdx],r11 - mov QWORD PTR[32+rdx],r12 - mov QWORD PTR[40+rdx],r13 - mov QWORD PTR[48+rdx],r14 - mov QWORD PTR[56+rdx],r15 - mov QWORD PTR[64+rdx],rbx - mov QWORD PTR[72+rdx],rbp - mov QWORD PTR[80+rdx],rcx - mov QWORD PTR[88+rdx],rax - - DB 0F3h,0C3h ;repret -__smulx_767x63 ENDP - -ALIGN 32 -__smulx_383x63 PROC PRIVATE - DB 243,15,30,250 - - mov r8,QWORD PTR[((0+0))+rsi] - mov r9,QWORD PTR[((0+8))+rsi] - mov r10,QWORD PTR[((0+16))+rsi] - mov r11,QWORD PTR[((0+24))+rsi] - mov r12,QWORD PTR[((0+32))+rsi] - mov r13,QWORD PTR[((0+40))+rsi] - - mov rbp,rdx - sar rbp,63 - xor rax,rax - sub rax,rbp - - xor rdx,rbp - add rdx,rax - - xor r8,rbp - xor r9,rbp - xor r10,rbp - xor r11,rbp - xor r12,rbp - xor r13,rbp - add r8,rax - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc r13,0 - - mulx rbp,r8,r8 - mulx rax,r9,r9 - add r9,rbp - mulx rbp,r10,r10 - adc r10,rax - mulx rax,r11,r11 - adc r11,rbp - mulx rbp,r12,r12 - adc r12,rax - mulx rax,r13,r13 - mov rdx,rcx - adc r13,rbp - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - mov r8,QWORD PTR[((48+0))+rsi] - mov r9,QWORD PTR[((48+8))+rsi] - mov r10,QWORD PTR[((48+16))+rsi] - mov r11,QWORD PTR[((48+24))+rsi] - mov r12,QWORD PTR[((48+32))+rsi] - mov r13,QWORD PTR[((48+40))+rsi] - - mov rbp,rdx - sar rbp,63 - xor rax,rax - sub rax,rbp - - xor rdx,rbp - add rdx,rax - - xor r8,rbp - xor r9,rbp - xor r10,rbp - xor r11,rbp - xor r12,rbp - xor r13,rbp - add r8,rax - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc r13,0 - - mulx rbp,r8,r8 - mulx rax,r9,r9 - add r9,rbp - mulx rbp,r10,r10 - adc r10,rax - mulx rax,r11,r11 - adc r11,rbp - mulx rbp,r12,r12 - adc r12,rax - mulx rax,r13,r13 - adc r13,rbp - - add r8,QWORD PTR[rdi] - adc r9,QWORD PTR[8+rdi] - adc r10,QWORD PTR[16+rdi] - adc r11,QWORD PTR[24+rdi] - adc r12,QWORD PTR[32+rdi] - adc r13,QWORD PTR[40+rdi] - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - - DB 0F3h,0C3h ;repret -__smulx_383x63 ENDP - -ALIGN 32 -__smulx_383_n_shift_by_31 PROC PRIVATE - DB 243,15,30,250 - - mov rbx,rdx - xor r14,r14 - mov r8,QWORD PTR[((0+0))+rsi] - mov r9,QWORD PTR[((0+8))+rsi] - mov r10,QWORD PTR[((0+16))+rsi] - mov r11,QWORD PTR[((0+24))+rsi] - mov r12,QWORD PTR[((0+32))+rsi] - mov r13,QWORD PTR[((0+40))+rsi] - - mov rax,rdx - sar rax,63 - xor rbp,rbp - sub rbp,rax - - xor rdx,rax - add rdx,rbp - - xor r8,rax - xor r9,rax - xor r10,rax - xor r11,rax - xor 
r12,rax - xor rax,r13 - add r8,rbp - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc rax,0 - - mulx rbp,r8,r8 - mulx r13,r9,r9 - add r9,rbp - mulx rbp,r10,r10 - adc r10,r13 - mulx r13,r11,r11 - adc r11,rbp - mulx rbp,r12,r12 - adc r12,r13 - adc rbp,0 - imul rdx - add rax,rbp - adc r14,rdx - - mov rdx,rcx - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],rax - mov r8,QWORD PTR[((48+0))+rsi] - mov r9,QWORD PTR[((48+8))+rsi] - mov r10,QWORD PTR[((48+16))+rsi] - mov r11,QWORD PTR[((48+24))+rsi] - mov r12,QWORD PTR[((48+32))+rsi] - mov r13,QWORD PTR[((48+40))+rsi] - - mov rax,rdx - sar rax,63 - xor rbp,rbp - sub rbp,rax - - xor rdx,rax - add rdx,rbp - - xor r8,rax - xor r9,rax - xor r10,rax - xor r11,rax - xor r12,rax - xor rax,r13 - add r8,rbp - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc rax,0 - - mulx rbp,r8,r8 - mulx r13,r9,r9 - add r9,rbp - mulx rbp,r10,r10 - adc r10,r13 - mulx r13,r11,r11 - adc r11,rbp - mulx rbp,r12,r12 - adc r12,r13 - adc rbp,0 - imul rdx - add rax,rbp - adc rdx,0 - - add r8,QWORD PTR[rdi] - adc r9,QWORD PTR[8+rdi] - adc r10,QWORD PTR[16+rdi] - adc r11,QWORD PTR[24+rdi] - adc r12,QWORD PTR[32+rdi] - adc rax,QWORD PTR[40+rdi] - adc r14,rdx - mov rdx,rbx - - shrd r8,r9,31 - shrd r9,r10,31 - shrd r10,r11,31 - shrd r11,r12,31 - shrd r12,rax,31 - shrd rax,r14,31 - - sar r14,63 - xor rbp,rbp - sub rbp,r14 - - xor r8,r14 - xor r9,r14 - xor r10,r14 - xor r11,r14 - xor r12,r14 - xor rax,r14 - add r8,rbp - adc r9,0 - adc r10,0 - adc r11,0 - adc r12,0 - adc rax,0 - - mov QWORD PTR[rdi],r8 - mov QWORD PTR[8+rdi],r9 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],rax - - xor rdx,r14 - xor rcx,r14 - add rdx,rbp - add rcx,rbp - - DB 0F3h,0C3h ;repret -__smulx_383_n_shift_by_31 ENDP - -ALIGN 32 -__smulx_191_n_shift_by_31 PROC PRIVATE - DB 243,15,30,250 - - mov rbx,rdx - mov r8,QWORD PTR[((0+0))+rsi] - mov r9,QWORD PTR[((0+8))+rsi] - mov r10,QWORD PTR[((0+16))+rsi] - - mov rax,rdx - sar rax,63 - xor rbp,rbp - sub rbp,rax - - xor rdx,rax - add rdx,rbp - - xor r8,rax - xor r9,rax - xor rax,r10 - add r8,rbp - adc r9,0 - adc rax,0 - - mulx rbp,r8,r8 - mulx r10,r9,r9 - add r9,rbp - adc r10,0 - imul rdx - add r10,rax - adc rdx,0 - mov r14,rdx - mov rdx,rcx - mov r11,QWORD PTR[((48+0))+rsi] - mov r12,QWORD PTR[((48+8))+rsi] - mov r13,QWORD PTR[((48+16))+rsi] - - mov rax,rdx - sar rax,63 - xor rbp,rbp - sub rbp,rax - - xor rdx,rax - add rdx,rbp - - xor r11,rax - xor r12,rax - xor rax,r13 - add r11,rbp - adc r12,0 - adc rax,0 - - mulx rbp,r11,r11 - mulx r13,r12,r12 - add r12,rbp - adc r13,0 - imul rdx - add r13,rax - adc rdx,0 - add r11,r8 - adc r12,r9 - adc r13,r10 - adc r14,rdx - mov rdx,rbx - - shrd r11,r12,31 - shrd r12,r13,31 - shrd r13,r14,31 - - sar r14,63 - xor rbp,rbp - sub rbp,r14 - - xor r11,r14 - xor r12,r14 - xor r13,r14 - add r11,rbp - adc r12,0 - adc r13,0 - - mov QWORD PTR[rdi],r11 - mov QWORD PTR[8+rdi],r12 - mov QWORD PTR[16+rdi],r13 - - xor rdx,r14 - xor rcx,r14 - add rdx,rbp - add rcx,rbp - - DB 0F3h,0C3h ;repret -__smulx_191_n_shift_by_31 ENDP - -ALIGN 32 -__ab_approximation_31 PROC PRIVATE - DB 243,15,30,250 - - mov r9,QWORD PTR[40+rsi] - mov r11,QWORD PTR[88+rsi] - mov rbx,QWORD PTR[32+rsi] - mov rbp,QWORD PTR[80+rsi] - mov r8,QWORD PTR[24+rsi] - mov r10,QWORD PTR[72+rsi] - - mov rax,r9 - or rax,r11 - cmovz r9,rbx - cmovz r11,rbp - cmovz rbx,r8 - mov r8,QWORD PTR[16+rsi] - cmovz rbp,r10 - mov 
r10,QWORD PTR[64+rsi] - - mov rax,r9 - or rax,r11 - cmovz r9,rbx - cmovz r11,rbp - cmovz rbx,r8 - mov r8,QWORD PTR[8+rsi] - cmovz rbp,r10 - mov r10,QWORD PTR[56+rsi] - - mov rax,r9 - or rax,r11 - cmovz r9,rbx - cmovz r11,rbp - cmovz rbx,r8 - mov r8,QWORD PTR[rsi] - cmovz rbp,r10 - mov r10,QWORD PTR[48+rsi] - - mov rax,r9 - or rax,r11 - cmovz r9,rbx - cmovz r11,rbp - cmovz rbx,r8 - cmovz rbp,r10 - - mov rax,r9 - or rax,r11 - bsr rcx,rax - lea rcx,QWORD PTR[1+rcx] - cmovz r9,r8 - cmovz r11,r10 - cmovz rcx,rax - neg rcx - - - shld r9,rbx,cl - shld r11,rbp,cl - - mov eax,07FFFFFFFh - and r8,rax - and r10,rax - andn r9,rax,r9 - andn r11,rax,r11 - or r8,r9 - or r10,r11 - - jmp __inner_loop_31 - - DB 0F3h,0C3h ;repret -__ab_approximation_31 ENDP - -ALIGN 32 -__inner_loop_31 PROC PRIVATE - DB 243,15,30,250 - - mov rcx,07FFFFFFF80000000h - mov r13,0800000007FFFFFFFh - mov r15,07FFFFFFF7FFFFFFFh - -$L$oop_31:: - cmp r8,r10 - mov rax,r8 - mov rbx,r10 - mov rbp,rcx - mov r14,r13 - cmovb r8,r10 - cmovb r10,rax - cmovb rcx,r13 - cmovb r13,rbp - - sub r8,r10 - sub rcx,r13 - add rcx,r15 - - test rax,1 - cmovz r8,rax - cmovz r10,rbx - cmovz rcx,rbp - cmovz r13,r14 - - shr r8,1 - add r13,r13 - sub r13,r15 - sub edi,1 - jnz $L$oop_31 - - shr r15,32 - mov edx,ecx - mov r12d,r13d - shr rcx,32 - shr r13,32 - sub rdx,r15 - sub rcx,r15 - sub r12,r15 - sub r13,r15 - - DB 0F3h,0C3h ;repret -__inner_loop_31 ENDP - - -ALIGN 32 -__tail_loop_53 PROC PRIVATE - DB 243,15,30,250 - - mov rdx,1 - xor rcx,rcx - xor r12,r12 - mov r13,1 - -$L$oop_53:: - xor rax,rax - test r8,1 - mov rbx,r10 - cmovnz rax,r10 - sub rbx,r8 - mov rbp,r8 - sub r8,rax - cmovc r8,rbx - cmovc r10,rbp - mov rax,rdx - cmovc rdx,r12 - cmovc r12,rax - mov rbx,rcx - cmovc rcx,r13 - cmovc r13,rbx - xor rax,rax - xor rbx,rbx - shr r8,1 - test rbp,1 - cmovnz rax,r12 - cmovnz rbx,r13 - add r12,r12 - add r13,r13 - sub rdx,rax - sub rcx,rbx - sub edi,1 - jnz $L$oop_53 - - DB 0F3h,0C3h ;repret -__tail_loop_53 ENDP -.text$ ENDS -.pdata SEGMENT READONLY ALIGN(4) -ALIGN 4 - DD imagerel $L$SEH_begin_ctx_inverse_mod_383 - DD imagerel $L$SEH_body_ctx_inverse_mod_383 - DD imagerel $L$SEH_info_ctx_inverse_mod_383_prologue - - DD imagerel $L$SEH_body_ctx_inverse_mod_383 - DD imagerel $L$SEH_epilogue_ctx_inverse_mod_383 - DD imagerel $L$SEH_info_ctx_inverse_mod_383_body - - DD imagerel $L$SEH_epilogue_ctx_inverse_mod_383 - DD imagerel $L$SEH_end_ctx_inverse_mod_383 - DD imagerel $L$SEH_info_ctx_inverse_mod_383_epilogue - -.pdata ENDS -.xdata SEGMENT READONLY ALIGN(8) -ALIGN 8 -$L$SEH_info_ctx_inverse_mod_383_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_ctx_inverse_mod_383_body:: -DB 1,0,18,0 -DB 000h,0f4h,08bh,000h -DB 000h,0e4h,08ch,000h -DB 000h,0d4h,08dh,000h -DB 000h,0c4h,08eh,000h -DB 000h,034h,08fh,000h -DB 000h,054h,090h,000h -DB 000h,074h,092h,000h -DB 000h,064h,093h,000h -DB 000h,001h,091h,000h -DB 000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_ctx_inverse_mod_383_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - - -.xdata ENDS -END diff --git a/crypto/blst_src/build/win64/div3w-armv8.asm b/crypto/blst_src/build/win64/div3w-armv8.asm deleted file mode 100644 index aec90679eea..00000000000 --- a/crypto/blst_src/build/win64/div3w-armv8.asm +++ /dev/null @@ -1,89 +0,0 @@ - AREA |.text|,CODE,ALIGN=8,ARM64 - - - EXPORT |div_3_limbs|[FUNC] - ALIGN 32 -|div_3_limbs| PROC - ldp x4,x5,[x0] // load R - eor x0,x0,x0 // Q = 0 - mov x3,#64 // loop counter - nop - 
-|$Loop| - subs x6,x4,x1 // R - D - add x0,x0,x0 // Q <<= 1 - sbcs x7,x5,x2 - add x0,x0,#1 // Q + speculative bit - csello x4,x4,x6 - extr x1,x2,x1,#1 // D >>= 1 - csello x5,x5,x7 - lsr x2,x2,#1 - sbc x0,x0,xzr // subtract speculative bit - sub x3,x3,#1 - cbnz x3,|$Loop| - - asr x3,x0,#63 // top bit -> mask - add x0,x0,x0 // Q <<= 1 - subs x6,x4,x1 // R - D - add x0,x0,#1 // Q + speculative bit - sbcs x7,x5,x2 - sbc x0,x0,xzr // subtract speculative bit - - orr x0,x0,x3 // all ones if overflow - - ret - ENDP - - EXPORT |quot_rem_128|[FUNC] - ALIGN 32 -|quot_rem_128| PROC - ldp x3,x4,[x1] - - mul x5,x3,x2 // divisor[0:1} * quotient - umulh x6,x3,x2 - mul x11, x4,x2 - umulh x7,x4,x2 - - ldp x8,x9,[x0] // load 3 limbs of the dividend - ldr x10,[x0,#16] - - adds x6,x6,x11 - adc x7,x7,xzr - - subs x8,x8,x5 // dividend - divisor * quotient - sbcs x9,x9,x6 - sbcs x10,x10,x7 - sbc x5,xzr,xzr // borrow -> mask - - add x2,x2,x5 // if borrowed, adjust the quotient ... - and x3,x3,x5 - and x4,x4,x5 - adds x8,x8,x3 // ... and add divisor - adc x9,x9,x4 - - stp x8,x9,[x0] // save 2 limbs of the remainder - str x2,[x0,#16] // and one limb of the quotient - - mov x0,x2 // return adjusted quotient - - ret - ENDP - - - EXPORT |quot_rem_64|[FUNC] - ALIGN 32 -|quot_rem_64| PROC - ldr x3,[x1] - ldr x8,[x0] // load 1 limb of the dividend - - mul x5,x3,x2 // divisor * quotient - - sub x8,x8,x5 // dividend - divisor * quotient - - stp x8,x2,[x0] // save remainder and quotient - - mov x0,x2 // return quotient - - ret - ENDP - END diff --git a/crypto/blst_src/build/win64/div3w-x86_64.asm b/crypto/blst_src/build/win64/div3w-x86_64.asm deleted file mode 100644 index 805c5b1fcb0..00000000000 --- a/crypto/blst_src/build/win64/div3w-x86_64.asm +++ /dev/null @@ -1,257 +0,0 @@ -OPTION DOTNAME -.text$ SEGMENT ALIGN(256) 'CODE' - -PUBLIC div_3_limbs - - -ALIGN 32 -div_3_limbs PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_div_3_limbs:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 -$L$SEH_body_div_3_limbs:: - - mov r8,QWORD PTR[rdi] - mov r9,QWORD PTR[8+rdi] - xor rax,rax - mov ecx,64 - -$L$oop:: - mov r10,r8 - sub r8,rsi - mov r11,r9 - sbb r9,rdx - lea rax,QWORD PTR[1+rax*1+rax] - mov rdi,rdx - cmovc r8,r10 - cmovc r9,r11 - sbb rax,0 - shl rdi,63 - shr rsi,1 - shr rdx,1 - or rsi,rdi - sub ecx,1 - jnz $L$oop - - lea rcx,QWORD PTR[1+rax*1+rax] - sar rax,63 - - sub r8,rsi - sbb r9,rdx - sbb rcx,0 - - or rax,rcx - -$L$SEH_epilogue_div_3_limbs:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_div_3_limbs:: -div_3_limbs ENDP -PUBLIC quot_rem_128 - - -ALIGN 32 -quot_rem_128 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_quot_rem_128:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 -$L$SEH_body_quot_rem_128:: - - mov rax,rdx - mov rcx,rdx - - mul QWORD PTR[rsi] - mov r8,rax - mov rax,rcx - mov r9,rdx - - mul QWORD PTR[8+rsi] - add r9,rax - adc rdx,0 - - mov r10,QWORD PTR[rdi] - mov r11,QWORD PTR[8+rdi] - mov rax,QWORD PTR[16+rdi] - - sub r10,r8 - sbb r11,r9 - sbb rax,rdx - sbb r8,r8 - - add rcx,r8 - mov r9,r8 - and r8,QWORD PTR[rsi] - and r9,QWORD PTR[8+rsi] - add r10,r8 - adc r11,r9 - - mov QWORD PTR[rdi],r10 - mov QWORD PTR[8+rdi],r11 - mov QWORD PTR[16+rdi],rcx - - mov rax,rcx - -$L$SEH_epilogue_quot_rem_128:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - 
-$L$SEH_end_quot_rem_128::
-quot_rem_128 ENDP
-
-
-
-
-PUBLIC quot_rem_64
-
-
-ALIGN 32
-quot_rem_64 PROC PUBLIC
- DB 243,15,30,250
- mov QWORD PTR[8+rsp],rdi ;WIN64 prologue
- mov QWORD PTR[16+rsp],rsi
- mov r11,rsp
-$L$SEH_begin_quot_rem_64::
-
-
- mov rdi,rcx
- mov rsi,rdx
- mov rdx,r8
-$L$SEH_body_quot_rem_64::
-
- mov rax,rdx
- imul rdx,QWORD PTR[rsi]
-
- mov r10,QWORD PTR[rdi]
-
- sub r10,rdx
-
- mov QWORD PTR[rdi],r10
- mov QWORD PTR[8+rdi],rax
-
-$L$SEH_epilogue_quot_rem_64::
- mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue
- mov rsi,QWORD PTR[16+rsp]
-
- DB 0F3h,0C3h ;repret
-
-$L$SEH_end_quot_rem_64::
-quot_rem_64 ENDP
-.text$ ENDS
-.pdata SEGMENT READONLY ALIGN(4)
-ALIGN 4
- DD imagerel $L$SEH_begin_div_3_limbs
- DD imagerel $L$SEH_body_div_3_limbs
- DD imagerel $L$SEH_info_div_3_limbs_prologue
-
- DD imagerel $L$SEH_body_div_3_limbs
- DD imagerel $L$SEH_epilogue_div_3_limbs
- DD imagerel $L$SEH_info_div_3_limbs_body
-
- DD imagerel $L$SEH_epilogue_div_3_limbs
- DD imagerel $L$SEH_end_div_3_limbs
- DD imagerel $L$SEH_info_div_3_limbs_epilogue
-
- DD imagerel $L$SEH_begin_quot_rem_128
- DD imagerel $L$SEH_body_quot_rem_128
- DD imagerel $L$SEH_info_quot_rem_128_prologue
-
- DD imagerel $L$SEH_body_quot_rem_128
- DD imagerel $L$SEH_epilogue_quot_rem_128
- DD imagerel $L$SEH_info_quot_rem_128_body
-
- DD imagerel $L$SEH_epilogue_quot_rem_128
- DD imagerel $L$SEH_end_quot_rem_128
- DD imagerel $L$SEH_info_quot_rem_128_epilogue
-
- DD imagerel $L$SEH_begin_quot_rem_64
- DD imagerel $L$SEH_body_quot_rem_64
- DD imagerel $L$SEH_info_quot_rem_64_prologue
-
- DD imagerel $L$SEH_body_quot_rem_64
- DD imagerel $L$SEH_epilogue_quot_rem_64
- DD imagerel $L$SEH_info_quot_rem_64_body
-
- DD imagerel $L$SEH_epilogue_quot_rem_64
- DD imagerel $L$SEH_end_quot_rem_64
- DD imagerel $L$SEH_info_quot_rem_64_epilogue
-
-.pdata ENDS
-.xdata SEGMENT READONLY ALIGN(8)
-ALIGN 8
-$L$SEH_info_div_3_limbs_prologue::
-DB 1,0,5,00bh
-DB 0,074h,1,0
-DB 0,064h,2,0
-DB 0,0b3h
-DB 0,0
- DD 0,0
-$L$SEH_info_div_3_limbs_body::
-DB 1,0,4,0
-DB 000h,074h,001h,000h
-DB 000h,064h,002h,000h
-DB 000h,000h,000h,000h
-$L$SEH_info_div_3_limbs_epilogue::
-DB 1,0,4,0
-DB 000h,074h,001h,000h
-DB 000h,064h,002h,000h
-DB 000h,000h,000h,000h
-
-$L$SEH_info_quot_rem_128_prologue::
-DB 1,0,5,00bh
-DB 0,074h,1,0
-DB 0,064h,2,0
-DB 0,0b3h
-DB 0,0
- DD 0,0
-$L$SEH_info_quot_rem_128_body::
-DB 1,0,4,0
-DB 000h,074h,001h,000h
-DB 000h,064h,002h,000h
-DB 000h,000h,000h,000h
-$L$SEH_info_quot_rem_128_epilogue::
-DB 1,0,4,0
-DB 000h,074h,001h,000h
-DB 000h,064h,002h,000h
-DB 000h,000h,000h,000h
-
-$L$SEH_info_quot_rem_64_prologue::
-DB 1,0,5,00bh
-DB 0,074h,1,0
-DB 0,064h,2,0
-DB 0,0b3h
-DB 0,0
- DD 0,0
-$L$SEH_info_quot_rem_64_body::
-DB 1,0,4,0
-DB 000h,074h,001h,000h
-DB 000h,064h,002h,000h
-DB 000h,000h,000h,000h
-$L$SEH_info_quot_rem_64_epilogue::
-DB 1,0,4,0
-DB 000h,074h,001h,000h
-DB 000h,064h,002h,000h
-DB 000h,000h,000h,000h
-
-
-.xdata ENDS
-END
diff --git a/crypto/blst_src/build/win64/dll.c b/crypto/blst_src/build/win64/dll.c
deleted file mode 100644
index a70d0c98a23..00000000000
--- a/crypto/blst_src/build/win64/dll.c
+++ /dev/null
@@ -1,32 +0,0 @@
-#include <windows.h>
-
-#if defined(_MSC_VER)
-/*
- * Even though we don't have memcpy/memset anywhere, MSVC compiler
- * generates calls to them as it recognizes corresponding patterns.
- */ -void *memcpy(unsigned char *dst, const unsigned char *src, size_t n) -{ - void *ret = dst; - - while(n--) - *dst++ = *src++; - - return ret; -} - -void *memset(unsigned char *dst, int c, size_t n) -{ - void *ret = dst; - - while(n--) - *dst++ = (unsigned char)c; - - return ret; -} -#elif defined(__GNUC__) -# pragma GCC diagnostic ignored "-Wunused-parameter" -#endif - -BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) -{ return TRUE; } diff --git a/crypto/blst_src/build/win64/mul_mont_256-armv8.asm b/crypto/blst_src/build/win64/mul_mont_256-armv8.asm deleted file mode 100644 index bb2dfe043c7..00000000000 --- a/crypto/blst_src/build/win64/mul_mont_256-armv8.asm +++ /dev/null @@ -1,465 +0,0 @@ - AREA |.text|,CODE,ALIGN=8,ARM64 - - - - EXPORT |mul_mont_sparse_256|[FUNC] - ALIGN 32 -|mul_mont_sparse_256| PROC - stp x29,x30,[sp,#-64]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldp x10,x11,[x1] - ldr x9, [x2] - ldp x12,x13,[x1,#16] - - mul x19,x10,x9 - ldp x5,x6,[x3] - mul x20,x11,x9 - ldp x7,x8,[x3,#16] - mul x21,x12,x9 - mul x22,x13,x9 - - umulh x14,x10,x9 - umulh x15,x11,x9 - mul x3,x4,x19 - umulh x16,x12,x9 - umulh x17,x13,x9 - adds x20,x20,x14 - //mul x14,x5,x3 - adcs x21,x21,x15 - mul x15,x6,x3 - adcs x22,x22,x16 - mul x16,x7,x3 - adc x23,xzr, x17 - mul x17,x8,x3 - ldr x9,[x2,8*1] - subs xzr,x19,#1 //adds x19,x19,x14 - umulh x14,x5,x3 - adcs x20,x20,x15 - umulh x15,x6,x3 - adcs x21,x21,x16 - umulh x16,x7,x3 - adcs x22,x22,x17 - umulh x17,x8,x3 - adc x23,x23,xzr - - adds x19,x20,x14 - mul x14,x10,x9 - adcs x20,x21,x15 - mul x15,x11,x9 - adcs x21,x22,x16 - mul x16,x12,x9 - adcs x22,x23,x17 - mul x17,x13,x9 - adc x23,xzr,xzr - - adds x19,x19,x14 - umulh x14,x10,x9 - adcs x20,x20,x15 - umulh x15,x11,x9 - adcs x21,x21,x16 - mul x3,x4,x19 - umulh x16,x12,x9 - adcs x22,x22,x17 - umulh x17,x13,x9 - adc x23,x23,xzr - - adds x20,x20,x14 - //mul x14,x5,x3 - adcs x21,x21,x15 - mul x15,x6,x3 - adcs x22,x22,x16 - mul x16,x7,x3 - adc x23,x23,x17 - mul x17,x8,x3 - ldr x9,[x2,8*2] - subs xzr,x19,#1 //adds x19,x19,x14 - umulh x14,x5,x3 - adcs x20,x20,x15 - umulh x15,x6,x3 - adcs x21,x21,x16 - umulh x16,x7,x3 - adcs x22,x22,x17 - umulh x17,x8,x3 - adc x23,x23,xzr - - adds x19,x20,x14 - mul x14,x10,x9 - adcs x20,x21,x15 - mul x15,x11,x9 - adcs x21,x22,x16 - mul x16,x12,x9 - adcs x22,x23,x17 - mul x17,x13,x9 - adc x23,xzr,xzr - - adds x19,x19,x14 - umulh x14,x10,x9 - adcs x20,x20,x15 - umulh x15,x11,x9 - adcs x21,x21,x16 - mul x3,x4,x19 - umulh x16,x12,x9 - adcs x22,x22,x17 - umulh x17,x13,x9 - adc x23,x23,xzr - - adds x20,x20,x14 - //mul x14,x5,x3 - adcs x21,x21,x15 - mul x15,x6,x3 - adcs x22,x22,x16 - mul x16,x7,x3 - adc x23,x23,x17 - mul x17,x8,x3 - ldr x9,[x2,8*3] - subs xzr,x19,#1 //adds x19,x19,x14 - umulh x14,x5,x3 - adcs x20,x20,x15 - umulh x15,x6,x3 - adcs x21,x21,x16 - umulh x16,x7,x3 - adcs x22,x22,x17 - umulh x17,x8,x3 - adc x23,x23,xzr - - adds x19,x20,x14 - mul x14,x10,x9 - adcs x20,x21,x15 - mul x15,x11,x9 - adcs x21,x22,x16 - mul x16,x12,x9 - adcs x22,x23,x17 - mul x17,x13,x9 - adc x23,xzr,xzr - - adds x19,x19,x14 - umulh x14,x10,x9 - adcs x20,x20,x15 - umulh x15,x11,x9 - adcs x21,x21,x16 - mul x3,x4,x19 - umulh x16,x12,x9 - adcs x22,x22,x17 - umulh x17,x13,x9 - adc x23,x23,xzr - - adds x20,x20,x14 - //mul x14,x5,x3 - adcs x21,x21,x15 - mul x15,x6,x3 - adcs x22,x22,x16 - mul x16,x7,x3 - adc x23,x23,x17 - mul x17,x8,x3 - subs xzr,x19,#1 //adds x19,x19,x14 - umulh x14,x5,x3 - adcs x20,x20,x15 - umulh x15,x6,x3 - adcs 
x21,x21,x16 - umulh x16,x7,x3 - adcs x22,x22,x17 - umulh x17,x8,x3 - adc x23,x23,xzr - - adds x19,x20,x14 - adcs x20,x21,x15 - adcs x21,x22,x16 - adcs x22,x23,x17 - adc x23,xzr,xzr - - subs x14,x19,x5 - sbcs x15,x20,x6 - sbcs x16,x21,x7 - sbcs x17,x22,x8 - sbcs xzr, x23,xzr - - csello x19,x19,x14 - csello x20,x20,x15 - csello x21,x21,x16 - csello x22,x22,x17 - - stp x19,x20,[x0] - stp x21,x22,[x0,#16] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 - ret - ENDP - - - EXPORT |sqr_mont_sparse_256|[FUNC] - ALIGN 32 -|sqr_mont_sparse_256| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-48]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - - ldp x5,x6,[x1] - ldp x7,x8,[x1,#16] - mov x4,x3 - - //////////////////////////////////////////////////////////////// - // | | | | | |a1*a0| | - // | | | | |a2*a0| | | - // | |a3*a2|a3*a0| | | | - // | | | |a2*a1| | | | - // | | |a3*a1| | | | | - // *| | | | | | | | 2| - // +|a3*a3|a2*a2|a1*a1|a0*a0| - // |--+--+--+--+--+--+--+--| - // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 - // - // "can't overflow" below mark carrying into high part of - // multiplication result, which can't overflow, because it - // can never be all ones. - - mul x11,x6,x5 // a[1]*a[0] - umulh x15,x6,x5 - mul x12,x7,x5 // a[2]*a[0] - umulh x16,x7,x5 - mul x13,x8,x5 // a[3]*a[0] - umulh x19,x8,x5 - - adds x12,x12,x15 // accumulate high parts of multiplication - mul x14,x7,x6 // a[2]*a[1] - umulh x15,x7,x6 - adcs x13,x13,x16 - mul x16,x8,x6 // a[3]*a[1] - umulh x17,x8,x6 - adc x19,x19,xzr // can't overflow - - mul x20,x8,x7 // a[3]*a[2] - umulh x21,x8,x7 - - adds x15,x15,x16 // accumulate high parts of multiplication - mul x10,x5,x5 // a[0]*a[0] - adc x16,x17,xzr // can't overflow - - adds x13,x13,x14 // accumulate low parts of multiplication - umulh x5,x5,x5 - adcs x19,x19,x15 - mul x15,x6,x6 // a[1]*a[1] - adcs x20,x20,x16 - umulh x6,x6,x6 - adc x21,x21,xzr // can't overflow - - adds x11,x11,x11 // acc[1-6]*=2 - mul x16,x7,x7 // a[2]*a[2] - adcs x12,x12,x12 - umulh x7,x7,x7 - adcs x13,x13,x13 - mul x17,x8,x8 // a[3]*a[3] - adcs x19,x19,x19 - umulh x8,x8,x8 - adcs x20,x20,x20 - adcs x21,x21,x21 - adc x22,xzr,xzr - - adds x11,x11,x5 // +a[i]*a[i] - adcs x12,x12,x15 - adcs x13,x13,x6 - adcs x19,x19,x16 - adcs x20,x20,x7 - adcs x21,x21,x17 - adc x22,x22,x8 - - bl __mul_by_1_mont_256 - ldr x30,[x29,#8] - - adds x10,x10,x19 // accumulate upper half - adcs x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - adc x19,xzr,xzr - - subs x14,x10,x5 - sbcs x15,x11,x6 - sbcs x16,x12,x7 - sbcs x17,x13,x8 - sbcs xzr, x19,xzr - - csello x10,x10,x14 - csello x11,x11,x15 - csello x12,x12,x16 - csello x13,x13,x17 - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldr x29,[sp],#48 - DCDU 3573752767 - ret - ENDP - - - EXPORT |from_mont_256|[FUNC] - ALIGN 32 -|from_mont_256| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - mov x4,x3 - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - - bl __mul_by_1_mont_256 - ldr x30,[x29,#8] - - subs x14,x10,x5 - sbcs x15,x11,x6 - sbcs x16,x12,x7 - sbcs x17,x13,x8 - - csello x10,x10,x14 - csello x11,x11,x15 - csello x12,x12,x16 - csello x13,x13,x17 - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - - ldr x29,[sp],#16 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |redc_mont_256|[FUNC] - ALIGN 32 -|redc_mont_256| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-16]! 
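// redc_mont_256: full Montgomery reduction of a 512-bit input. __mul_by_1_mont_256
// folds away the low four limbs, the high four limbs are then added on top, and a
// final conditional subtraction of the modulus brings the result into [0, p).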
- add x29,sp,#0 - - mov x4,x3 - ldp x10,x11,[x1] - ldp x12,x13,[x1,#16] - - bl __mul_by_1_mont_256 - ldr x30,[x29,#8] - - ldp x14,x15,[x1,#32] - ldp x16,x17,[x1,#48] - - adds x10,x10,x14 - adcs x11,x11,x15 - adcs x12,x12,x16 - adcs x13,x13,x17 - adc x9,xzr,xzr - - subs x14,x10,x5 - sbcs x15,x11,x6 - sbcs x16,x12,x7 - sbcs x17,x13,x8 - sbcs xzr, x9,xzr - - csello x10,x10,x14 - csello x11,x11,x15 - csello x12,x12,x16 - csello x13,x13,x17 - - stp x10,x11,[x0] - stp x12,x13,[x0,#16] - - ldr x29,[sp],#16 - DCDU 3573752767 - ret - ENDP - - - ALIGN 32 -|__mul_by_1_mont_256| PROC - mul x3,x4,x10 - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - //mul x14,x5,x3 - mul x15,x6,x3 - mul x16,x7,x3 - mul x17,x8,x3 - subs xzr,x10,#1 //adds x10,x10,x14 - umulh x14,x5,x3 - adcs x11,x11,x15 - umulh x15,x6,x3 - adcs x12,x12,x16 - umulh x16,x7,x3 - adcs x13,x13,x17 - umulh x17,x8,x3 - adc x9,xzr,xzr - - adds x10,x11,x14 - adcs x11,x12,x15 - adcs x12,x13,x16 - mul x3,x4,x10 - adc x13,x9,x17 - //mul x14,x5,x3 - mul x15,x6,x3 - mul x16,x7,x3 - mul x17,x8,x3 - subs xzr,x10,#1 //adds x10,x10,x14 - umulh x14,x5,x3 - adcs x11,x11,x15 - umulh x15,x6,x3 - adcs x12,x12,x16 - umulh x16,x7,x3 - adcs x13,x13,x17 - umulh x17,x8,x3 - adc x9,xzr,xzr - - adds x10,x11,x14 - adcs x11,x12,x15 - adcs x12,x13,x16 - mul x3,x4,x10 - adc x13,x9,x17 - //mul x14,x5,x3 - mul x15,x6,x3 - mul x16,x7,x3 - mul x17,x8,x3 - subs xzr,x10,#1 //adds x10,x10,x14 - umulh x14,x5,x3 - adcs x11,x11,x15 - umulh x15,x6,x3 - adcs x12,x12,x16 - umulh x16,x7,x3 - adcs x13,x13,x17 - umulh x17,x8,x3 - adc x9,xzr,xzr - - adds x10,x11,x14 - adcs x11,x12,x15 - adcs x12,x13,x16 - mul x3,x4,x10 - adc x13,x9,x17 - //mul x14,x5,x3 - mul x15,x6,x3 - mul x16,x7,x3 - mul x17,x8,x3 - subs xzr,x10,#1 //adds x10,x10,x14 - umulh x14,x5,x3 - adcs x11,x11,x15 - umulh x15,x6,x3 - adcs x12,x12,x16 - umulh x16,x7,x3 - adcs x13,x13,x17 - umulh x17,x8,x3 - adc x9,xzr,xzr - - adds x10,x11,x14 - adcs x11,x12,x15 - adcs x12,x13,x16 - adc x13,x9,x17 - - ret - ENDP - END diff --git a/crypto/blst_src/build/win64/mul_mont_384-armv8.asm b/crypto/blst_src/build/win64/mul_mont_384-armv8.asm deleted file mode 100644 index a309dfa4121..00000000000 --- a/crypto/blst_src/build/win64/mul_mont_384-armv8.asm +++ /dev/null @@ -1,2373 +0,0 @@ - AREA |.text|,CODE,ALIGN=8,ARM64 - - - EXPORT |add_mod_384x384|[FUNC] - ALIGN 32 -|add_mod_384x384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-64]! 
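// add_mod_384x384: addition of two 768-bit (double-width Montgomery) operands.
// Only the upper 384-bit half is reduced: the carry out is compared against the
// modulus and a conditional select (csel on lo) keeps that half canonical.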
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - bl __add_mod_384x384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 - DCDU 3573752767 - ret - ENDP - - - ALIGN 32 -|__add_mod_384x384| PROC - ldp x11, x12, [x1] - ldp x19,x20,[x2] - ldp x13, x14, [x1,#16] - adds x11,x11,x19 - ldp x21,x22,[x2,#16] - adcs x12,x12,x20 - ldp x15, x16, [x1,#32] - adcs x13,x13,x21 - ldp x23,x24,[x2,#32] - adcs x14,x14,x22 - stp x11, x12, [x0] - adcs x15,x15,x23 - ldp x11, x12, [x1,#48] - adcs x16,x16,x24 - - ldp x19,x20,[x2,#48] - stp x13, x14, [x0,#16] - ldp x13, x14, [x1,#64] - ldp x21,x22,[x2,#64] - - adcs x11,x11,x19 - stp x15, x16, [x0,#32] - adcs x12,x12,x20 - ldp x15, x16, [x1,#80] - adcs x13,x13,x21 - ldp x23,x24,[x2,#80] - adcs x14,x14,x22 - adcs x15,x15,x23 - adcs x16,x16,x24 - adc x17,xzr,xzr - - subs x19,x11,x5 - sbcs x20,x12,x6 - sbcs x21,x13,x7 - sbcs x22,x14,x8 - sbcs x23,x15,x9 - sbcs x24,x16,x10 - sbcs xzr,x17,xzr - - csello x11,x11,x19 - csello x12,x12,x20 - csello x13,x13,x21 - csello x14,x14,x22 - stp x11,x12,[x0,#48] - csello x15,x15,x23 - stp x13,x14,[x0,#64] - csello x16,x16,x24 - stp x15,x16,[x0,#80] - - ret - ENDP - - - EXPORT |sub_mod_384x384|[FUNC] - ALIGN 32 -|sub_mod_384x384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-64]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - bl __sub_mod_384x384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 - DCDU 3573752767 - ret - ENDP - - - ALIGN 32 -|__sub_mod_384x384| PROC - ldp x11, x12, [x1] - ldp x19,x20,[x2] - ldp x13, x14, [x1,#16] - subs x11,x11,x19 - ldp x21,x22,[x2,#16] - sbcs x12,x12,x20 - ldp x15, x16, [x1,#32] - sbcs x13,x13,x21 - ldp x23,x24,[x2,#32] - sbcs x14,x14,x22 - stp x11, x12, [x0] - sbcs x15,x15,x23 - ldp x11, x12, [x1,#48] - sbcs x16,x16,x24 - - ldp x19,x20,[x2,#48] - stp x13, x14, [x0,#16] - ldp x13, x14, [x1,#64] - ldp x21,x22,[x2,#64] - - sbcs x11,x11,x19 - stp x15, x16, [x0,#32] - sbcs x12,x12,x20 - ldp x15, x16, [x1,#80] - sbcs x13,x13,x21 - ldp x23,x24,[x2,#80] - sbcs x14,x14,x22 - sbcs x15,x15,x23 - sbcs x16,x16,x24 - sbc x17,xzr,xzr - - and x19,x5,x17 - and x20,x6,x17 - adds x11,x11,x19 - and x21,x7,x17 - adcs x12,x12,x20 - and x22,x8,x17 - adcs x13,x13,x21 - and x23,x9,x17 - adcs x14,x14,x22 - and x24,x10,x17 - adcs x15,x15,x23 - stp x11,x12,[x0,#48] - adc x16,x16,x24 - stp x13,x14,[x0,#64] - stp x15,x16,[x0,#80] - - ret - ENDP - - - ALIGN 32 -|__add_mod_384| PROC - ldp x11, x12, [x1] - ldp x19,x20,[x2] - ldp x13, x14, [x1,#16] - adds x11,x11,x19 - ldp x21,x22,[x2,#16] - adcs x12,x12,x20 - ldp x15, x16, [x1,#32] - adcs x13,x13,x21 - ldp x23,x24,[x2,#32] - adcs x14,x14,x22 - adcs x15,x15,x23 - adcs x16,x16,x24 - adc x17,xzr,xzr - - subs x19,x11,x5 - sbcs x20,x12,x6 - sbcs x21,x13,x7 - sbcs x22,x14,x8 - sbcs x23,x15,x9 - sbcs x24,x16,x10 - sbcs xzr,x17,xzr - - csello x11,x11,x19 - csello x12,x12,x20 - csello x13,x13,x21 - csello x14,x14,x22 - csello x15,x15,x23 - stp x11,x12,[x0] - csello x16,x16,x24 - stp x13,x14,[x0,#16] - stp x15,x16,[x0,#32] - - ret - ENDP - - - ALIGN 32 -|__sub_mod_384| PROC - ldp x11, x12, [x1] - ldp x19,x20,[x2] - ldp x13, x14, [x1,#16] - subs x11,x11,x19 - ldp x21,x22,[x2,#16] - sbcs x12,x12,x20 - ldp x15, x16, [x1,#32] - sbcs x13,x13,x21 - ldp x23,x24,[x2,#32] - sbcs 
x14,x14,x22 - sbcs x15,x15,x23 - sbcs x16,x16,x24 - sbc x17,xzr,xzr - - and x19,x5,x17 - and x20,x6,x17 - adds x11,x11,x19 - and x21,x7,x17 - adcs x12,x12,x20 - and x22,x8,x17 - adcs x13,x13,x21 - and x23,x9,x17 - adcs x14,x14,x22 - and x24,x10,x17 - adcs x15,x15,x23 - stp x11,x12,[x0] - adc x16,x16,x24 - stp x13,x14,[x0,#16] - stp x15,x16,[x0,#32] - - ret - ENDP - - - - EXPORT |mul_mont_384x|[FUNC] - ALIGN 32 -|mul_mont_384x| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#288 // space for 3 768-bit vectors - - mov x26,x0 // save r_ptr - mov x27,x1 // save b_ptr - mov x28,x2 // save b_ptr - - sub x0,sp,#0 // mul_384(t0, a->re, b->re) - bl __mul_384 - - add x1,x1,#48 // mul_384(t1, a->im, b->im) - add x2,x2,#48 - add x0,sp,#96 - bl __mul_384 - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - sub x2,x1,#48 - add x0,sp,#240 - bl __add_mod_384 - - add x1,x28,#0 - add x2,x28,#48 - add x0,sp,#192 // t2 - bl __add_mod_384 - - add x1,x0,#0 - add x2,x0,#48 - bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - mov x1,x0 - add x2,sp,#0 - bl __sub_mod_384x384 - - add x2,sp,#96 - bl __sub_mod_384x384 // t2 = t2-t0-t1 - - add x1,sp,#0 - add x2,sp,#96 - add x0,sp,#0 - bl __sub_mod_384x384 // t0 = t0-t1 - - add x1,sp,#0 // ret->re = redc(t0) - add x0,x26,#0 - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - - add x1,sp,#192 // ret->im = redc(t2) - add x0,x0,#48 - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - ldr x30,[x29,#8] - - add sp,sp,#288 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |sqr_mont_384x|[FUNC] - ALIGN 32 -|sqr_mont_384x| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-128]! 
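// sqr_mont_384x: squaring in Fp2 via (a + b*i)^2 = (a+b)*(a-b) + 2*a*b*i —
// one modular add, one modular sub and two __mul_mont_384 calls instead of
// a full complex multiplication.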
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x3,x0,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#96 // space for 2 384-bit vectors - mov x4,x3 // adjust for missing b_ptr - - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - ldp x9,x10,[x2,#32] - - add x2,x1,#48 - add x0,sp,#0 - bl __add_mod_384 // t0 = a->re + a->im - - add x0,sp,#48 - bl __sub_mod_384 // t1 = a->re - a->im - - ldp x11,x12,[x1] - ldr x17, [x2] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) - - adds x11,x11,x11 // add with itself - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adcs x16,x16,x16 - adc x25,xzr,xzr - - subs x19,x11,x5 - sbcs x20,x12,x6 - sbcs x21,x13,x7 - sbcs x22,x14,x8 - sbcs x23,x15,x9 - sbcs x24,x16,x10 - sbcs xzr,x25,xzr - - csello x19,x11,x19 - csello x20,x12,x20 - csello x21,x13,x21 - ldp x11,x12,[sp] - csello x22,x14,x22 - ldr x17, [sp,#48] - csello x23,x15,x23 - ldp x13,x14,[sp,#16] - csello x24,x16,x24 - ldp x15,x16,[sp,#32] - - stp x19,x20,[x2,#48] - stp x21,x22,[x2,#64] - stp x23,x24,[x2,#80] - - add x2,sp,#48 - bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) - ldr x30,[x29,#8] - - stp x11,x12,[x2] - stp x13,x14,[x2,#16] - stp x15,x16,[x2,#32] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |mul_mont_384|[FUNC] - ALIGN 32 -|mul_mont_384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x4,x0,[sp,#96] // __mul_mont_384 wants them there - - ldp x11,x12,[x1] - ldr x17, [x2] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - bl __mul_mont_384 - ldr x30,[x29,#8] - - stp x11,x12,[x2] - stp x13,x14,[x2,#16] - stp x15,x16,[x2,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - DCDU 3573752767 - ret - ENDP - - - ALIGN 32 -|__mul_mont_384| PROC - mul x19,x11,x17 - mul x20,x12,x17 - mul x21,x13,x17 - mul x22,x14,x17 - mul x23,x15,x17 - mul x24,x16,x17 - mul x4,x4,x19 - - umulh x26,x11,x17 - umulh x27,x12,x17 - umulh x28,x13,x17 - umulh x0,x14,x17 - umulh x1,x15,x17 - umulh x3,x16,x17 - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,xzr, x3 - mul x3,x10,x4 - mov x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - adc x4,x17,xzr - ldr x17,[x2,8*1] - - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,x4,xzr - ldr x4,[x29,#96] - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adcs 
x25,x25,xzr - adc x17,xzr,xzr - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adcs x25,x25,x3 - mul x3,x10,x4 - adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - adc x4,x17,xzr - ldr x17,[x2,8*2] - - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,x4,xzr - ldr x4,[x29,#96] - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adcs x25,x25,xzr - adc x17,xzr,xzr - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adcs x25,x25,x3 - mul x3,x10,x4 - adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - adc x4,x17,xzr - ldr x17,[x2,8*3] - - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,x4,xzr - ldr x4,[x29,#96] - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adcs x25,x25,xzr - adc x17,xzr,xzr - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adcs x25,x25,x3 - mul x3,x10,x4 - adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - adc x4,x17,xzr - ldr x17,[x2,8*4] - - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,x4,xzr - ldr x4,[x29,#96] - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adcs x25,x25,xzr - adc x17,xzr,xzr - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adcs x25,x25,x3 - mul x3,x10,x4 - adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh 
x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - adc x4,x17,xzr - ldr x17,[x2,8*5] - - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,x4,xzr - ldr x4,[x29,#96] - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adcs x25,x25,xzr - adc x17,xzr,xzr - - adds x20,x20,x26 - // mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adcs x25,x25,x3 - mul x3,x10,x4 - adc x17,x17,xzr - subs xzr,x19,#1 // adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adcs x25,x25,xzr - ldp x4,x2,[x29,#96] // pull r_ptr - adc x17,x17,xzr - - adds x19,x20,x26 - adcs x20,x21,x27 - adcs x21,x22,x28 - adcs x22,x23,x0 - adcs x23,x24,x1 - adcs x24,x25,x3 - adc x25,x17,xzr - - subs x26,x19,x5 - sbcs x27,x20,x6 - sbcs x28,x21,x7 - sbcs x0,x22,x8 - sbcs x1,x23,x9 - sbcs x3,x24,x10 - sbcs xzr, x25,xzr - - csello x11,x19,x26 - csello x12,x20,x27 - csello x13,x21,x28 - csello x14,x22,x0 - csello x15,x23,x1 - csello x16,x24,x3 - ret - ENDP - - - - EXPORT |sqr_mont_384|[FUNC] - ALIGN 32 -|sqr_mont_384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#96 // space for 768-bit vector - mov x4,x3 // adjust for missing b_ptr - - mov x3,x0 // save r_ptr - mov x0,sp - - ldp x11,x12,[x1] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - bl __sqr_384 - - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - ldp x9,x10,[x2,#32] - - mov x1,sp - mov x0,x3 // restore r_ptr - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - ldr x30,[x29,#8] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |sqr_n_mul_mont_383|[FUNC] - ALIGN 32 -|sqr_n_mul_mont_383| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-128]! 
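// sqr_n_mul_mont_383: x2 modular squarings (Loop_sqr_383) followed by a single
// Montgomery multiplication by *b_ptr, as used in fixed exponentiation chains.
// The loop accumulates the upper half without a conditional subtraction; this
// lazy reduction stays within 384 bits because the modulus is at most 383 bits,
// hence the _383 suffix.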
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x4,x0,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#96 // space for 768-bit vector - mov x17,x5 // save b_ptr - - ldp x11,x12,[x1] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - mov x0,sp -|$Loop_sqr_383| - bl __sqr_384 - sub x2,x2,#1 // counter - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - mov x1,sp - bl __mul_by_1_mont_384 - - ldp x19,x20,[x1,#48] - ldp x21,x22,[x1,#64] - ldp x23,x24,[x1,#80] - - adds x11,x11,x19 // just accumulate upper half - adcs x12,x12,x20 - adcs x13,x13,x21 - adcs x14,x14,x22 - adcs x15,x15,x23 - adc x16,x16,x24 - - cbnz x2,|$Loop_sqr_383| - - mov x2,x17 - ldr x17,[x17] - bl __mul_mont_384 - ldr x30,[x29,#8] - - stp x11,x12,[x2] - stp x13,x14,[x2,#16] - stp x15,x16,[x2,#32] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - DCDU 3573752767 - ret - ENDP - - ALIGN 32 -|__sqr_384| PROC - mul x19,x12,x11 - mul x20,x13,x11 - mul x21,x14,x11 - mul x22,x15,x11 - mul x23,x16,x11 - - umulh x6,x12,x11 - umulh x7,x13,x11 - umulh x8,x14,x11 - umulh x9,x15,x11 - adds x20,x20,x6 - umulh x10,x16,x11 - adcs x21,x21,x7 - mul x7,x13,x12 - adcs x22,x22,x8 - mul x8,x14,x12 - adcs x23,x23,x9 - mul x9,x15,x12 - adc x24,xzr, x10 - mul x10,x16,x12 - - adds x21,x21,x7 - umulh x7,x13,x12 - adcs x22,x22,x8 - umulh x8,x14,x12 - adcs x23,x23,x9 - umulh x9,x15,x12 - adcs x24,x24,x10 - umulh x10,x16,x12 - adc x25,xzr,xzr - - mul x5,x11,x11 - adds x22,x22,x7 - umulh x11, x11,x11 - adcs x23,x23,x8 - mul x8,x14,x13 - adcs x24,x24,x9 - mul x9,x15,x13 - adc x25,x25,x10 - mul x10,x16,x13 - - adds x23,x23,x8 - umulh x8,x14,x13 - adcs x24,x24,x9 - umulh x9,x15,x13 - adcs x25,x25,x10 - umulh x10,x16,x13 - adc x26,xzr,xzr - - mul x6,x12,x12 - adds x24,x24,x8 - umulh x12, x12,x12 - adcs x25,x25,x9 - mul x9,x15,x14 - adc x26,x26,x10 - mul x10,x16,x14 - - adds x25,x25,x9 - umulh x9,x15,x14 - adcs x26,x26,x10 - umulh x10,x16,x14 - adc x27,xzr,xzr - mul x7,x13,x13 - adds x26,x26,x9 - umulh x13, x13,x13 - adc x27,x27,x10 - mul x8,x14,x14 - - mul x10,x16,x15 - umulh x14, x14,x14 - adds x27,x27,x10 - umulh x10,x16,x15 - mul x9,x15,x15 - adc x28,x10,xzr - - adds x19,x19,x19 - adcs x20,x20,x20 - adcs x21,x21,x21 - adcs x22,x22,x22 - adcs x23,x23,x23 - adcs x24,x24,x24 - adcs x25,x25,x25 - adcs x26,x26,x26 - umulh x15, x15,x15 - adcs x27,x27,x27 - mul x10,x16,x16 - adcs x28,x28,x28 - umulh x16, x16,x16 - adc x1,xzr,xzr - - adds x19,x19,x11 - adcs x20,x20,x6 - adcs x21,x21,x12 - adcs x22,x22,x7 - adcs x23,x23,x13 - adcs x24,x24,x8 - adcs x25,x25,x14 - stp x5,x19,[x0] - adcs x26,x26,x9 - stp x20,x21,[x0,#16] - adcs x27,x27,x15 - stp x22,x23,[x0,#32] - adcs x28,x28,x10 - stp x24,x25,[x0,#48] - adc x16,x16,x1 - stp x26,x27,[x0,#64] - stp x28,x16,[x0,#80] - - ret - ENDP - - - EXPORT |sqr_384|[FUNC] - ALIGN 32 -|sqr_384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-128]! 
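// sqr_384: plain (non-reducing) 384x384 -> 768-bit squaring; a thin
// save/restore wrapper around __sqr_384.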
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - ldp x11,x12,[x1] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - bl __sqr_384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |redc_mont_384|[FUNC] - ALIGN 32 -|redc_mont_384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - mov x4,x3 // adjust for missing b_ptr - - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - ldp x9,x10,[x2,#32] - - bl __mul_by_1_mont_384 - bl __redc_tail_mont_384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |from_mont_384|[FUNC] - ALIGN 32 -|from_mont_384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - mov x4,x3 // adjust for missing b_ptr - - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - ldp x9,x10,[x2,#32] - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - subs x19,x11,x5 - sbcs x20,x12,x6 - sbcs x21,x13,x7 - sbcs x22,x14,x8 - sbcs x23,x15,x9 - sbcs x24,x16,x10 - - csello x11,x11,x19 - csello x12,x12,x20 - csello x13,x13,x21 - csello x14,x14,x22 - csello x15,x15,x23 - csello x16,x16,x24 - - stp x11,x12,[x0] - stp x13,x14,[x0,#16] - stp x15,x16,[x0,#32] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - DCDU 3573752767 - ret - ENDP - - - ALIGN 32 -|__mul_by_1_mont_384| PROC - ldp x11,x12,[x1] - ldp x13,x14,[x1,#16] - mul x26,x4,x11 - ldp x15,x16,[x1,#32] - - // mul x19,x5,x26 - mul x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - mul x26,x4,x11 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - // mul x19,x5,x26 - mul x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - mul x26,x4,x11 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - // mul x19,x5,x26 - mul x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - mul x26,x4,x11 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - // mul x19,x5,x26 - mul x20,x6,x26 - mul 
x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - mul x26,x4,x11 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - // mul x19,x5,x26 - mul x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - mul x26,x4,x11 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - // mul x19,x5,x26 - mul x20,x6,x26 - mul x21,x7,x26 - mul x22,x8,x26 - mul x23,x9,x26 - mul x24,x10,x26 - subs xzr,x11,#1 // adds x19,x19,x11 - umulh x11,x5,x26 - adcs x20,x20,x12 - umulh x12,x6,x26 - adcs x21,x21,x13 - umulh x13,x7,x26 - adcs x22,x22,x14 - umulh x14,x8,x26 - adcs x23,x23,x15 - umulh x15,x9,x26 - adcs x24,x24,x16 - umulh x16,x10,x26 - adc x25,xzr,xzr - adds x11,x11,x20 - adcs x12,x12,x21 - adcs x13,x13,x22 - adcs x14,x14,x23 - adcs x15,x15,x24 - adc x16,x16,x25 - - ret - ENDP - - - ALIGN 32 -|__redc_tail_mont_384| PROC - ldp x19,x20,[x1,#48] - ldp x21,x22,[x1,#64] - ldp x23,x24,[x1,#80] - - adds x11,x11,x19 // accumulate upper half - adcs x12,x12,x20 - adcs x13,x13,x21 - adcs x14,x14,x22 - adcs x15,x15,x23 - adcs x16,x16,x24 - adc x25,xzr,xzr - - subs x19,x11,x5 - sbcs x20,x12,x6 - sbcs x21,x13,x7 - sbcs x22,x14,x8 - sbcs x23,x15,x9 - sbcs x24,x16,x10 - sbcs xzr,x25,xzr - - csello x11,x11,x19 - csello x12,x12,x20 - csello x13,x13,x21 - csello x14,x14,x22 - csello x15,x15,x23 - csello x16,x16,x24 - - stp x11,x12,[x0] - stp x13,x14,[x0,#16] - stp x15,x16,[x0,#32] - - ret - ENDP - - - - EXPORT |mul_384|[FUNC] - ALIGN 32 -|mul_384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-128]! 
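// mul_384: schoolbook 6x6-limb multiplication giving a 768-bit product.
// __mul_384 walks one word of b at a time, interleaving mul and umulh so each
// high-half result lands just as the carry chain needs it.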
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - bl __mul_384 - ldr x30,[x29,#8] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - DCDU 3573752767 - ret - ENDP - - - ALIGN 32 -|__mul_384| PROC - ldp x11,x12,[x1] - ldr x17, [x2] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - mul x19,x11,x17 - mul x20,x12,x17 - mul x21,x13,x17 - mul x22,x14,x17 - mul x23,x15,x17 - mul x24,x16,x17 - - umulh x5,x11,x17 - umulh x6,x12,x17 - umulh x7,x13,x17 - umulh x8,x14,x17 - umulh x9,x15,x17 - umulh x10,x16,x17 - ldr x17,[x2,8*1] - - str x19,[x0] - adds x19,x20,x5 - mul x5,x11,x17 - adcs x20,x21,x6 - mul x6,x12,x17 - adcs x21,x22,x7 - mul x7,x13,x17 - adcs x22,x23,x8 - mul x8,x14,x17 - adcs x23,x24,x9 - mul x9,x15,x17 - adc x24,xzr, x10 - mul x10,x16,x17 - adds x19,x19,x5 - umulh x5,x11,x17 - adcs x20,x20,x6 - umulh x6,x12,x17 - adcs x21,x21,x7 - umulh x7,x13,x17 - adcs x22,x22,x8 - umulh x8,x14,x17 - adcs x23,x23,x9 - umulh x9,x15,x17 - adcs x24,x24,x10 - umulh x10,x16,x17 - ldr x17,[x2,#8*(1+1)] - adc x25,xzr,xzr - - str x19,[x0,8*1] - adds x19,x20,x5 - mul x5,x11,x17 - adcs x20,x21,x6 - mul x6,x12,x17 - adcs x21,x22,x7 - mul x7,x13,x17 - adcs x22,x23,x8 - mul x8,x14,x17 - adcs x23,x24,x9 - mul x9,x15,x17 - adc x24,x25,x10 - mul x10,x16,x17 - adds x19,x19,x5 - umulh x5,x11,x17 - adcs x20,x20,x6 - umulh x6,x12,x17 - adcs x21,x21,x7 - umulh x7,x13,x17 - adcs x22,x22,x8 - umulh x8,x14,x17 - adcs x23,x23,x9 - umulh x9,x15,x17 - adcs x24,x24,x10 - umulh x10,x16,x17 - ldr x17,[x2,#8*(2+1)] - adc x25,xzr,xzr - - str x19,[x0,8*2] - adds x19,x20,x5 - mul x5,x11,x17 - adcs x20,x21,x6 - mul x6,x12,x17 - adcs x21,x22,x7 - mul x7,x13,x17 - adcs x22,x23,x8 - mul x8,x14,x17 - adcs x23,x24,x9 - mul x9,x15,x17 - adc x24,x25,x10 - mul x10,x16,x17 - adds x19,x19,x5 - umulh x5,x11,x17 - adcs x20,x20,x6 - umulh x6,x12,x17 - adcs x21,x21,x7 - umulh x7,x13,x17 - adcs x22,x22,x8 - umulh x8,x14,x17 - adcs x23,x23,x9 - umulh x9,x15,x17 - adcs x24,x24,x10 - umulh x10,x16,x17 - ldr x17,[x2,#8*(3+1)] - adc x25,xzr,xzr - - str x19,[x0,8*3] - adds x19,x20,x5 - mul x5,x11,x17 - adcs x20,x21,x6 - mul x6,x12,x17 - adcs x21,x22,x7 - mul x7,x13,x17 - adcs x22,x23,x8 - mul x8,x14,x17 - adcs x23,x24,x9 - mul x9,x15,x17 - adc x24,x25,x10 - mul x10,x16,x17 - adds x19,x19,x5 - umulh x5,x11,x17 - adcs x20,x20,x6 - umulh x6,x12,x17 - adcs x21,x21,x7 - umulh x7,x13,x17 - adcs x22,x22,x8 - umulh x8,x14,x17 - adcs x23,x23,x9 - umulh x9,x15,x17 - adcs x24,x24,x10 - umulh x10,x16,x17 - ldr x17,[x2,#8*(4+1)] - adc x25,xzr,xzr - - str x19,[x0,8*4] - adds x19,x20,x5 - mul x5,x11,x17 - adcs x20,x21,x6 - mul x6,x12,x17 - adcs x21,x22,x7 - mul x7,x13,x17 - adcs x22,x23,x8 - mul x8,x14,x17 - adcs x23,x24,x9 - mul x9,x15,x17 - adc x24,x25,x10 - mul x10,x16,x17 - adds x19,x19,x5 - umulh x5,x11,x17 - adcs x20,x20,x6 - umulh x6,x12,x17 - adcs x21,x21,x7 - umulh x7,x13,x17 - adcs x22,x22,x8 - umulh x8,x14,x17 - adcs x23,x23,x9 - umulh x9,x15,x17 - adcs x24,x24,x10 - umulh x10,x16,x17 - adc x25,xzr,xzr - - str x19,[x0,8*5] - adds x19,x20,x5 - adcs x20,x21,x6 - adcs x21,x22,x7 - adcs x22,x23,x8 - adcs x23,x24,x9 - adc x24,x25,x10 - - stp x19,x20,[x0,#48] - stp x21,x22,[x0,#64] - stp x23,x24,[x0,#80] - - ret - ENDP - - - - EXPORT |mul_382x|[FUNC] - ALIGN 32 -|mul_382x| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-128]! 
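// mul_382x: Karatsuba-style Fp2 multiplication — three __mul_384 calls on
// (a.re, b.re), (a.im, b.im) and (a.re+a.im, b.re+b.im), then double-width
// subtractions recover ret->re = re*re' - im*im' and
// ret->im = (re+im)*(re'+im') - re*re' - im*im'.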
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#96 // space for two 384-bit vectors - - ldp x11,x12,[x1] - mov x26,x0 // save r_ptr - ldp x19,x20,[x1,#48] - mov x27,x1 // save a_ptr - ldp x13,x14,[x1,#16] - mov x28,x2 // save b_ptr - ldp x21,x22,[x1,#64] - ldp x15,x16,[x1,#32] - adds x5,x11,x19 // t0 = a->re + a->im - ldp x23,x24,[x1,#80] - adcs x6,x12,x20 - ldp x11,x12,[x2] - adcs x7,x13,x21 - ldp x19,x20,[x2,#48] - adcs x8,x14,x22 - ldp x13,x14,[x2,#16] - adcs x9,x15,x23 - ldp x21,x22,[x2,#64] - adc x10,x16,x24 - ldp x15,x16,[x2,#32] - - stp x5,x6,[sp] - adds x5,x11,x19 // t1 = b->re + b->im - ldp x23,x24,[x2,#80] - adcs x6,x12,x20 - stp x7,x8,[sp,#16] - adcs x7,x13,x21 - adcs x8,x14,x22 - stp x9,x10,[sp,#32] - adcs x9,x15,x23 - stp x5,x6,[sp,#48] - adc x10,x16,x24 - stp x7,x8,[sp,#64] - stp x9,x10,[sp,#80] - - bl __mul_384 // mul_384(ret->re, a->re, b->re) - - add x1,sp,#0 // mul_384(ret->im, t0, t1) - add x2,sp,#48 - add x0,x26,#96 - bl __mul_384 - - add x1,x27,#48 // mul_384(tx, a->im, b->im) - add x2,x28,#48 - add x0,sp,#0 - bl __mul_384 - - ldp x5,x6,[x3] - ldp x7,x8,[x3,#16] - ldp x9,x10,[x3,#32] - - add x1,x26,#96 // ret->im -= tx - add x2,sp,#0 - add x0,x26,#96 - bl __sub_mod_384x384 - - add x2,x26,#0 // ret->im -= ret->re - bl __sub_mod_384x384 - - add x1,x26,#0 // ret->re -= tx - add x2,sp,#0 - add x0,x26,#0 - bl __sub_mod_384x384 - ldr x30,[x29,#8] - - add sp,sp,#96 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |sqr_382x|[FUNC] - ALIGN 32 -|sqr_382x| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - ldp x11,x12,[x1] - ldp x19,x20,[x1,#48] - ldp x13,x14,[x1,#16] - adds x5,x11,x19 // t0 = a->re + a->im - ldp x21,x22,[x1,#64] - adcs x6,x12,x20 - ldp x15,x16,[x1,#32] - adcs x7,x13,x21 - ldp x23,x24,[x1,#80] - adcs x8,x14,x22 - stp x5,x6,[x0] - adcs x9,x15,x23 - ldp x5,x6,[x2] - adc x10,x16,x24 - stp x7,x8,[x0,#16] - - subs x11,x11,x19 // t1 = a->re - a->im - ldp x7,x8,[x2,#16] - sbcs x12,x12,x20 - stp x9,x10,[x0,#32] - sbcs x13,x13,x21 - ldp x9,x10,[x2,#32] - sbcs x14,x14,x22 - sbcs x15,x15,x23 - sbcs x16,x16,x24 - sbc x25,xzr,xzr - - and x19,x5,x25 - and x20,x6,x25 - adds x11,x11,x19 - and x21,x7,x25 - adcs x12,x12,x20 - and x22,x8,x25 - adcs x13,x13,x21 - and x23,x9,x25 - adcs x14,x14,x22 - and x24,x10,x25 - adcs x15,x15,x23 - stp x11,x12,[x0,#48] - adc x16,x16,x24 - stp x13,x14,[x0,#64] - stp x15,x16,[x0,#80] - - mov x4,x1 // save a_ptr - add x1,x0,#0 // mul_384(ret->re, t0, t1) - add x2,x0,#48 - bl __mul_384 - - add x1,x4,#0 // mul_384(ret->im, a->re, a->im) - add x2,x4,#48 - add x0,x0,#96 - bl __mul_384 - ldr x30,[x29,#8] - - ldp x11,x12,[x0] - ldp x13,x14,[x0,#16] - adds x11,x11,x11 // add with itself - ldp x15,x16,[x0,#32] - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adcs x16,x16,x16 - adcs x19,x19,x19 - adcs x20,x20,x20 - stp x11,x12,[x0] - adcs x21,x21,x21 - stp x13,x14,[x0,#16] - adcs x22,x22,x22 - stp x15,x16,[x0,#32] - adcs x23,x23,x23 - stp x19,x20,[x0,#48] - adc x24,x24,x24 - stp x21,x22,[x0,#64] - stp x23,x24,[x0,#80] - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - DCDU 
3573752767 - ret - ENDP - - - - EXPORT |sqr_mont_382x|[FUNC] - ALIGN 32 -|sqr_mont_382x| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x3,x0,[sp,#96] // __mul_mont_384 wants them there - sub sp,sp,#112 // space for two 384-bit vectors + word - mov x4,x3 // adjust for missing b_ptr - - ldp x11,x12,[x1] - ldp x13,x14,[x1,#16] - ldp x15,x16,[x1,#32] - - ldp x17,x20,[x1,#48] - ldp x21,x22,[x1,#64] - ldp x23,x24,[x1,#80] - - adds x5,x11,x17 // t0 = a->re + a->im - adcs x6,x12,x20 - adcs x7,x13,x21 - adcs x8,x14,x22 - adcs x9,x15,x23 - adc x10,x16,x24 - - subs x19,x11,x17 // t1 = a->re - a->im - sbcs x20,x12,x20 - sbcs x21,x13,x21 - sbcs x22,x14,x22 - sbcs x23,x15,x23 - sbcs x24,x16,x24 - sbc x25,xzr,xzr // borrow flag as mask - - stp x5,x6,[sp] - stp x7,x8,[sp,#16] - stp x9,x10,[sp,#32] - stp x19,x20,[sp,#48] - stp x21,x22,[sp,#64] - stp x23,x24,[sp,#80] - str x25,[sp,#96] - - ldp x5,x6,[x2] - ldp x7,x8,[x2,#16] - ldp x9,x10,[x2,#32] - - add x2,x1,#48 - bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) - - adds x19,x11,x11 // add with itself - adcs x20,x12,x12 - adcs x21,x13,x13 - adcs x22,x14,x14 - adcs x23,x15,x15 - adc x24,x16,x16 - - stp x19,x20,[x2,#48] - stp x21,x22,[x2,#64] - stp x23,x24,[x2,#80] - - ldp x11,x12,[sp] - ldr x17,[sp,#48] - ldp x13,x14,[sp,#16] - ldp x15,x16,[sp,#32] - - add x2,sp,#48 - bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) - ldr x30,[x29,#8] - - ldr x25,[sp,#96] // account for sign from a->re - a->im - ldp x19,x20,[sp] - ldp x21,x22,[sp,#16] - ldp x23,x24,[sp,#32] - - and x19,x19,x25 - and x20,x20,x25 - and x21,x21,x25 - and x22,x22,x25 - and x23,x23,x25 - and x24,x24,x25 - - subs x11,x11,x19 - sbcs x12,x12,x20 - sbcs x13,x13,x21 - sbcs x14,x14,x22 - sbcs x15,x15,x23 - sbcs x16,x16,x24 - sbc x25,xzr,xzr - - and x19,x5,x25 - and x20,x6,x25 - and x21,x7,x25 - and x22,x8,x25 - and x23,x9,x25 - and x24,x10,x25 - - adds x11,x11,x19 - adcs x12,x12,x20 - adcs x13,x13,x21 - adcs x14,x14,x22 - adcs x15,x15,x23 - adc x16,x16,x24 - - stp x11,x12,[x2] - stp x13,x14,[x2,#16] - stp x15,x16,[x2,#32] - - add sp,sp,#112 - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - DCDU 3573752767 - ret - ENDP - - - ALIGN 32 -|__mul_mont_383_nonred| PROC - mul x19,x11,x17 - mul x20,x12,x17 - mul x21,x13,x17 - mul x22,x14,x17 - mul x23,x15,x17 - mul x24,x16,x17 - mul x4,x4,x19 - - umulh x26,x11,x17 - umulh x27,x12,x17 - umulh x28,x13,x17 - umulh x0,x14,x17 - umulh x1,x15,x17 - umulh x3,x16,x17 - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,xzr, x3 - mul x3,x10,x4 - ldr x17,[x2,8*1] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs 
x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - ldr x17,[x2,8*2] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - ldr x17,[x2,8*3] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - ldr x17,[x2,8*4] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - ldr x17,[x2,8*5] - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - - ldr x4,[x29,#96] - adds 
x19,x20,x26 - mul x26,x11,x17 - adcs x20,x21,x27 - mul x27,x12,x17 - adcs x21,x22,x28 - mul x28,x13,x17 - adcs x22,x23,x0 - mul x0,x14,x17 - adcs x23,x24,x1 - mul x1,x15,x17 - adcs x24,x25,x3 - mul x3,x16,x17 - adc x25,xzr,xzr - - adds x19,x19,x26 - umulh x26,x11,x17 - adcs x20,x20,x27 - umulh x27,x12,x17 - adcs x21,x21,x28 - mul x4,x4,x19 - umulh x28,x13,x17 - adcs x22,x22,x0 - umulh x0,x14,x17 - adcs x23,x23,x1 - umulh x1,x15,x17 - adcs x24,x24,x3 - umulh x3,x16,x17 - adc x25,x25,xzr - - adds x20,x20,x26 - mul x26,x5,x4 - adcs x21,x21,x27 - mul x27,x6,x4 - adcs x22,x22,x28 - mul x28,x7,x4 - adcs x23,x23,x0 - mul x0,x8,x4 - adcs x24,x24,x1 - mul x1,x9,x4 - adc x25,x25,x3 - mul x3,x10,x4 - adds x19,x19,x26 - umulh x26,x5,x4 - adcs x20,x20,x27 - umulh x27,x6,x4 - adcs x21,x21,x28 - umulh x28,x7,x4 - adcs x22,x22,x0 - umulh x0,x8,x4 - adcs x23,x23,x1 - umulh x1,x9,x4 - adcs x24,x24,x3 - umulh x3,x10,x4 - adc x25,x25,xzr - ldp x4,x2,[x29,#96] // pull r_ptr - - adds x11,x20,x26 - adcs x12,x21,x27 - adcs x13,x22,x28 - adcs x14,x23,x0 - adcs x15,x24,x1 - adcs x16,x25,x3 - - ret - ENDP - - - - EXPORT |sgn0_pty_mont_384|[FUNC] - ALIGN 32 -|sgn0_pty_mont_384| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - mov x4,x2 - ldp x5,x6,[x1] - ldp x7,x8,[x1,#16] - ldp x9,x10,[x1,#32] - mov x1,x0 - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - and x0,x11,#1 - adds x11,x11,x11 - adcs x12,x12,x12 - adcs x13,x13,x13 - adcs x14,x14,x14 - adcs x15,x15,x15 - adcs x16,x16,x16 - adc x17,xzr,xzr - - subs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbcs x16,x16,x10 - sbc x17,x17,xzr - - mvn x17,x17 - and x17,x17,#2 - orr x0,x0,x17 - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - DCDU 3573752767 - ret - ENDP - - - - EXPORT |sgn0_pty_mont_384x|[FUNC] - ALIGN 32 -|sgn0_pty_mont_384x| PROC - DCDU 3573752639 - stp x29,x30,[sp,#-128]! 
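// sgn0_pty_mont_384x: sign and parity of an Fp2 element for the hash-to-curve
// sgn0 predicate. Each half is converted out of Montgomery form with
// __mul_by_1_mont_384; bit 0 carries the parity and bit 1 the "sign"
// (whether 2*x rolls over the modulus), combined across the two halves.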
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - mov x4,x2 - ldp x5,x6,[x1] - ldp x7,x8,[x1,#16] - ldp x9,x10,[x1,#32] - mov x1,x0 - - bl __mul_by_1_mont_384 - add x1,x1,#48 - - and x2,x11,#1 - orr x3,x11,x12 - adds x11,x11,x11 - orr x3,x3,x13 - adcs x12,x12,x12 - orr x3,x3,x14 - adcs x13,x13,x13 - orr x3,x3,x15 - adcs x14,x14,x14 - orr x3,x3,x16 - adcs x15,x15,x15 - adcs x16,x16,x16 - adc x17,xzr,xzr - - subs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbcs x16,x16,x10 - sbc x17,x17,xzr - - mvn x17,x17 - and x17,x17,#2 - orr x2,x2,x17 - - bl __mul_by_1_mont_384 - ldr x30,[x29,#8] - - and x0,x11,#1 - orr x1,x11,x12 - adds x11,x11,x11 - orr x1,x1,x13 - adcs x12,x12,x12 - orr x1,x1,x14 - adcs x13,x13,x13 - orr x1,x1,x15 - adcs x14,x14,x14 - orr x1,x1,x16 - adcs x15,x15,x15 - adcs x16,x16,x16 - adc x17,xzr,xzr - - subs x11,x11,x5 - sbcs x12,x12,x6 - sbcs x13,x13,x7 - sbcs x14,x14,x8 - sbcs x15,x15,x9 - sbcs x16,x16,x10 - sbc x17,x17,xzr - - mvn x17,x17 - and x17,x17,#2 - orr x0,x0,x17 - - cmp x3,#0 - cseleq x3,x0,x2 - - cmp x1,#0 - cselne x1,x0,x2 - - and x3,x3,#1 - and x1,x1,#2 - orr x0,x1,x3 // pack sign and parity - - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - DCDU 3573752767 - ret - ENDP - END diff --git a/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm b/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm deleted file mode 100644 index 6aedca7cdaf..00000000000 --- a/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm +++ /dev/null @@ -1,913 +0,0 @@ -OPTION DOTNAME -EXTERN mul_mont_sparse_256$1:NEAR -EXTERN sqr_mont_sparse_256$1:NEAR -EXTERN from_mont_256$1:NEAR -EXTERN redc_mont_256$1:NEAR -_DATA SEGMENT -COMM __blst_platform_cap:DWORD:1 -_DATA ENDS -.text$ SEGMENT ALIGN(256) 'CODE' - -PUBLIC mul_mont_sparse_256 - - -ALIGN 32 -mul_mont_sparse_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_mul_mont_sparse_256:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD PTR[40+rsp] -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz mul_mont_sparse_256$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - push rdi - -$L$SEH_body_mul_mont_sparse_256:: - - - mov rax,QWORD PTR[rdx] - mov r13,QWORD PTR[rsi] - mov r14,QWORD PTR[8+rsi] - mov r12,QWORD PTR[16+rsi] - mov rbp,QWORD PTR[24+rsi] - mov rbx,rdx - - mov r15,rax - mul r13 - mov r9,rax - mov rax,r15 - mov r10,rdx - call __mulq_mont_sparse_256 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_mul_mont_sparse_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_mul_mont_sparse_256:: -mul_mont_sparse_256 ENDP - -PUBLIC sqr_mont_sparse_256 - - -ALIGN 32 -sqr_mont_sparse_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sqr_mont_sparse_256:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz sqr_mont_sparse_256$1 -endif - push rbp - - push rbx - - push r12 - - push r13 
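; Win64 entry convention: arguments arrive in rcx/rdx/r8/r9 and are moved into
; the rdi/rsi/rdx/rcx registers the body expects. Under __BLST_PORTABLE__ the
; __blst_platform_cap test branches to the $1 (ADX/MULX) variants when bit 0 is
; set; the register pushes here are described by the SEH .pdata/.xdata records
; at the end of the file.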
- - push r14 - - push r15 - - push rdi - -$L$SEH_body_sqr_mont_sparse_256:: - - - mov rax,QWORD PTR[rsi] - mov r8,rcx - mov r14,QWORD PTR[8+rsi] - mov rcx,rdx - mov r12,QWORD PTR[16+rsi] - lea rbx,QWORD PTR[rsi] - mov rbp,QWORD PTR[24+rsi] - - mov r15,rax - mul rax - mov r9,rax - mov rax,r15 - mov r10,rdx - call __mulq_mont_sparse_256 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_sqr_mont_sparse_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sqr_mont_sparse_256:: -sqr_mont_sparse_256 ENDP - -ALIGN 32 -__mulq_mont_sparse_256 PROC PRIVATE - DB 243,15,30,250 - - mul r14 - add r10,rax - mov rax,r15 - adc rdx,0 - mov r11,rdx - - mul r12 - add r11,rax - mov rax,r15 - adc rdx,0 - mov r12,rdx - - mul rbp - add r12,rax - mov rax,QWORD PTR[8+rbx] - adc rdx,0 - xor r14,r14 - mov r13,rdx - - mov rdi,r9 - imul r9,r8 - - - mov r15,rax - mul QWORD PTR[rsi] - add r10,rax - mov rax,r15 - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[8+rsi] - add r11,rax - mov rax,r15 - adc rdx,0 - add r11,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[16+rsi] - add r12,rax - mov rax,r15 - adc rdx,0 - add r12,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[24+rsi] - add r13,rax - mov rax,r9 - adc rdx,0 - add r13,rbp - adc r14,rdx - xor r15,r15 - - - mul QWORD PTR[rcx] - add rdi,rax - mov rax,r9 - adc rdi,rdx - - mul QWORD PTR[8+rcx] - add r10,rax - mov rax,r9 - adc rdx,0 - add r10,rdi - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[16+rcx] - add r11,rax - mov rax,r9 - adc rdx,0 - add r11,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[24+rcx] - add r12,rax - mov rax,QWORD PTR[16+rbx] - adc rdx,0 - add r12,rbp - adc rdx,0 - add r13,rdx - adc r14,0 - adc r15,0 - mov rdi,r10 - imul r10,r8 - - - mov r9,rax - mul QWORD PTR[rsi] - add r11,rax - mov rax,r9 - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[8+rsi] - add r12,rax - mov rax,r9 - adc rdx,0 - add r12,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[16+rsi] - add r13,rax - mov rax,r9 - adc rdx,0 - add r13,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[24+rsi] - add r14,rax - mov rax,r10 - adc rdx,0 - add r14,rbp - adc r15,rdx - xor r9,r9 - - - mul QWORD PTR[rcx] - add rdi,rax - mov rax,r10 - adc rdi,rdx - - mul QWORD PTR[8+rcx] - add r11,rax - mov rax,r10 - adc rdx,0 - add r11,rdi - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[16+rcx] - add r12,rax - mov rax,r10 - adc rdx,0 - add r12,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[24+rcx] - add r13,rax - mov rax,QWORD PTR[24+rbx] - adc rdx,0 - add r13,rbp - adc rdx,0 - add r14,rdx - adc r15,0 - adc r9,0 - mov rdi,r11 - imul r11,r8 - - - mov r10,rax - mul QWORD PTR[rsi] - add r12,rax - mov rax,r10 - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[8+rsi] - add r13,rax - mov rax,r10 - adc rdx,0 - add r13,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[16+rsi] - add r14,rax - mov rax,r10 - adc rdx,0 - add r14,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[24+rsi] - add r15,rax - mov rax,r11 - adc rdx,0 - add r15,rbp - adc r9,rdx - xor r10,r10 - - - mul QWORD PTR[rcx] - add rdi,rax - mov rax,r11 - adc rdi,rdx - - mul QWORD PTR[8+rcx] - add r12,rax - mov rax,r11 - adc rdx,0 - add r12,rdi - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[16+rcx] - add r13,rax - mov rax,r11 - adc rdx,0 - add r13,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[24+rcx] - add r14,rax - mov rax,r12 - adc rdx,0 - add r14,rbp - adc 
rdx,0 - add r15,rdx - adc r9,0 - adc r10,0 - imul rax,r8 - mov rsi,QWORD PTR[8+rsp] - - - mov r11,rax - mul QWORD PTR[rcx] - add r12,rax - mov rax,r11 - adc r12,rdx - - mul QWORD PTR[8+rcx] - add r13,rax - mov rax,r11 - adc rdx,0 - add r13,r12 - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[16+rcx] - add r14,rax - mov rax,r11 - adc rdx,0 - add r14,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[24+rcx] - mov rbx,r14 - add r15,rbp - adc rdx,0 - add r15,rax - mov rax,r13 - adc rdx,0 - add r9,rdx - adc r10,0 - - - - - mov r12,r15 - sub r13,QWORD PTR[rcx] - sbb r14,QWORD PTR[8+rcx] - sbb r15,QWORD PTR[16+rcx] - mov rbp,r9 - sbb r9,QWORD PTR[24+rcx] - sbb r10,0 - - cmovc r13,rax - cmovc r14,rbx - cmovc r15,r12 - mov QWORD PTR[rsi],r13 - cmovc r9,rbp - mov QWORD PTR[8+rsi],r14 - mov QWORD PTR[16+rsi],r15 - mov QWORD PTR[24+rsi],r9 - - DB 0F3h,0C3h ;repret - -__mulq_mont_sparse_256 ENDP -PUBLIC from_mont_256 - - -ALIGN 32 -from_mont_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_from_mont_256:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz from_mont_256$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_from_mont_256:: - - - mov rbx,rdx - call __mulq_by_1_mont_256 - - - - - - mov r10,r14 - mov r11,r15 - mov r12,r9 - - sub r13,QWORD PTR[rbx] - sbb r14,QWORD PTR[8+rbx] - sbb r15,QWORD PTR[16+rbx] - sbb r9,QWORD PTR[24+rbx] - - cmovnc rax,r13 - cmovnc r10,r14 - cmovnc r11,r15 - mov QWORD PTR[rdi],rax - cmovnc r12,r9 - mov QWORD PTR[8+rdi],r10 - mov QWORD PTR[16+rdi],r11 - mov QWORD PTR[24+rdi],r12 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_from_mont_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_from_mont_256:: -from_mont_256 ENDP - -PUBLIC redc_mont_256 - - -ALIGN 32 -redc_mont_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_redc_mont_256:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz redc_mont_256$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_redc_mont_256:: - - - mov rbx,rdx - call __mulq_by_1_mont_256 - - add r13,QWORD PTR[32+rsi] - adc r14,QWORD PTR[40+rsi] - mov rax,r13 - adc r15,QWORD PTR[48+rsi] - mov r10,r14 - adc r9,QWORD PTR[56+rsi] - sbb rsi,rsi - - - - - mov r11,r15 - sub r13,QWORD PTR[rbx] - sbb r14,QWORD PTR[8+rbx] - sbb r15,QWORD PTR[16+rbx] - mov r12,r9 - sbb r9,QWORD PTR[24+rbx] - sbb rsi,0 - - cmovnc rax,r13 - cmovnc r10,r14 - cmovnc r11,r15 - mov QWORD PTR[rdi],rax - cmovnc r12,r9 - mov QWORD PTR[8+rdi],r10 - mov QWORD PTR[16+rdi],r11 - mov QWORD PTR[24+rdi],r12 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_redc_mont_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_redc_mont_256:: -redc_mont_256 ENDP - -ALIGN 32 
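; __mulq_by_1_mont_256: four word-by-word Montgomery reduction steps, i.e. a
; Montgomery multiplication by 1. Each round multiplies the running low word by
; n0 (rcx), adds that multiple of the modulus to cancel it, and rotates the
; window down one limb; callers (from_mont_256/redc_mont_256) finish the
; reduction with the final conditional subtraction.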
-__mulq_by_1_mont_256 PROC PRIVATE - DB 243,15,30,250 - - mov rax,QWORD PTR[rsi] - mov r10,QWORD PTR[8+rsi] - mov r11,QWORD PTR[16+rsi] - mov r12,QWORD PTR[24+rsi] - - mov r13,rax - imul rax,rcx - mov r9,rax - - mul QWORD PTR[rbx] - add r13,rax - mov rax,r9 - adc r13,rdx - - mul QWORD PTR[8+rbx] - add r10,rax - mov rax,r9 - adc rdx,0 - add r10,r13 - adc rdx,0 - mov r13,rdx - - mul QWORD PTR[16+rbx] - mov r14,r10 - imul r10,rcx - add r11,rax - mov rax,r9 - adc rdx,0 - add r11,r13 - adc rdx,0 - mov r13,rdx - - mul QWORD PTR[24+rbx] - add r12,rax - mov rax,r10 - adc rdx,0 - add r12,r13 - adc rdx,0 - mov r13,rdx - - mul QWORD PTR[rbx] - add r14,rax - mov rax,r10 - adc r14,rdx - - mul QWORD PTR[8+rbx] - add r11,rax - mov rax,r10 - adc rdx,0 - add r11,r14 - adc rdx,0 - mov r14,rdx - - mul QWORD PTR[16+rbx] - mov r15,r11 - imul r11,rcx - add r12,rax - mov rax,r10 - adc rdx,0 - add r12,r14 - adc rdx,0 - mov r14,rdx - - mul QWORD PTR[24+rbx] - add r13,rax - mov rax,r11 - adc rdx,0 - add r13,r14 - adc rdx,0 - mov r14,rdx - - mul QWORD PTR[rbx] - add r15,rax - mov rax,r11 - adc r15,rdx - - mul QWORD PTR[8+rbx] - add r12,rax - mov rax,r11 - adc rdx,0 - add r12,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[16+rbx] - mov r9,r12 - imul r12,rcx - add r13,rax - mov rax,r11 - adc rdx,0 - add r13,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[24+rbx] - add r14,rax - mov rax,r12 - adc rdx,0 - add r14,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[rbx] - add r9,rax - mov rax,r12 - adc r9,rdx - - mul QWORD PTR[8+rbx] - add r13,rax - mov rax,r12 - adc rdx,0 - add r13,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[16+rbx] - add r14,rax - mov rax,r12 - adc rdx,0 - add r14,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[24+rbx] - add r15,rax - mov rax,r13 - adc rdx,0 - add r15,r9 - adc rdx,0 - mov r9,rdx - DB 0F3h,0C3h ;repret -__mulq_by_1_mont_256 ENDP -.text$ ENDS -.pdata SEGMENT READONLY ALIGN(4) -ALIGN 4 - DD imagerel $L$SEH_begin_mul_mont_sparse_256 - DD imagerel $L$SEH_body_mul_mont_sparse_256 - DD imagerel $L$SEH_info_mul_mont_sparse_256_prologue - - DD imagerel $L$SEH_body_mul_mont_sparse_256 - DD imagerel $L$SEH_epilogue_mul_mont_sparse_256 - DD imagerel $L$SEH_info_mul_mont_sparse_256_body - - DD imagerel $L$SEH_epilogue_mul_mont_sparse_256 - DD imagerel $L$SEH_end_mul_mont_sparse_256 - DD imagerel $L$SEH_info_mul_mont_sparse_256_epilogue - - DD imagerel $L$SEH_begin_sqr_mont_sparse_256 - DD imagerel $L$SEH_body_sqr_mont_sparse_256 - DD imagerel $L$SEH_info_sqr_mont_sparse_256_prologue - - DD imagerel $L$SEH_body_sqr_mont_sparse_256 - DD imagerel $L$SEH_epilogue_sqr_mont_sparse_256 - DD imagerel $L$SEH_info_sqr_mont_sparse_256_body - - DD imagerel $L$SEH_epilogue_sqr_mont_sparse_256 - DD imagerel $L$SEH_end_sqr_mont_sparse_256 - DD imagerel $L$SEH_info_sqr_mont_sparse_256_epilogue - - DD imagerel $L$SEH_begin_from_mont_256 - DD imagerel $L$SEH_body_from_mont_256 - DD imagerel $L$SEH_info_from_mont_256_prologue - - DD imagerel $L$SEH_body_from_mont_256 - DD imagerel $L$SEH_epilogue_from_mont_256 - DD imagerel $L$SEH_info_from_mont_256_body - - DD imagerel $L$SEH_epilogue_from_mont_256 - DD imagerel $L$SEH_end_from_mont_256 - DD imagerel $L$SEH_info_from_mont_256_epilogue - - DD imagerel $L$SEH_begin_redc_mont_256 - DD imagerel $L$SEH_body_redc_mont_256 - DD imagerel $L$SEH_info_redc_mont_256_prologue - - DD imagerel $L$SEH_body_redc_mont_256 - DD imagerel $L$SEH_epilogue_redc_mont_256 - DD imagerel $L$SEH_info_redc_mont_256_body - - DD imagerel $L$SEH_epilogue_redc_mont_256 - DD imagerel 
$L$SEH_end_redc_mont_256 - DD imagerel $L$SEH_info_redc_mont_256_epilogue - -.pdata ENDS -.xdata SEGMENT READONLY ALIGN(8) -ALIGN 8 -$L$SEH_info_mul_mont_sparse_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_mul_mont_sparse_256_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_mul_mont_sparse_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sqr_mont_sparse_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sqr_mont_sparse_256_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sqr_mont_sparse_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_from_mont_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_from_mont_256_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_from_mont_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_redc_mont_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_redc_mont_256_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_redc_mont_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - - -.xdata ENDS -END diff --git a/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm b/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm deleted file mode 100644 index 8563815917e..00000000000 --- a/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm +++ /dev/null @@ -1,4341 +0,0 @@ -OPTION DOTNAME -EXTERN mul_mont_384x$1:NEAR -EXTERN sqr_mont_384x$1:NEAR -EXTERN mul_382x$1:NEAR -EXTERN sqr_382x$1:NEAR -EXTERN mul_384$1:NEAR -EXTERN sqr_384$1:NEAR -EXTERN redc_mont_384$1:NEAR -EXTERN from_mont_384$1:NEAR -EXTERN sgn0_pty_mont_384$1:NEAR -EXTERN sgn0_pty_mont_384x$1:NEAR -EXTERN mul_mont_384$1:NEAR -EXTERN sqr_mont_384$1:NEAR -EXTERN sqr_n_mul_mont_384$1:NEAR -EXTERN sqr_n_mul_mont_383$1:NEAR -EXTERN sqr_mont_382x$1:NEAR -_DATA SEGMENT -COMM __blst_platform_cap:DWORD:1 -_DATA ENDS -.text$ SEGMENT ALIGN(256) 'CODE' - - - - - - - - -ALIGN 32 -__subq_mod_384x384 PROC PRIVATE - DB 243,15,30,250 - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - mov r14,QWORD PTR[48+rsi] - - sub r8,QWORD PTR[rdx] - mov r15,QWORD PTR[56+rsi] - sbb r9,QWORD PTR[8+rdx] - mov rax,QWORD 
PTR[64+rsi] - sbb r10,QWORD PTR[16+rdx] - mov rbx,QWORD PTR[72+rsi] - sbb r11,QWORD PTR[24+rdx] - mov rbp,QWORD PTR[80+rsi] - sbb r12,QWORD PTR[32+rdx] - mov rsi,QWORD PTR[88+rsi] - sbb r13,QWORD PTR[40+rdx] - mov QWORD PTR[rdi],r8 - sbb r14,QWORD PTR[48+rdx] - mov r8,QWORD PTR[rcx] - mov QWORD PTR[8+rdi],r9 - sbb r15,QWORD PTR[56+rdx] - mov r9,QWORD PTR[8+rcx] - mov QWORD PTR[16+rdi],r10 - sbb rax,QWORD PTR[64+rdx] - mov r10,QWORD PTR[16+rcx] - mov QWORD PTR[24+rdi],r11 - sbb rbx,QWORD PTR[72+rdx] - mov r11,QWORD PTR[24+rcx] - mov QWORD PTR[32+rdi],r12 - sbb rbp,QWORD PTR[80+rdx] - mov r12,QWORD PTR[32+rcx] - mov QWORD PTR[40+rdi],r13 - sbb rsi,QWORD PTR[88+rdx] - mov r13,QWORD PTR[40+rcx] - sbb rdx,rdx - - and r8,rdx - and r9,rdx - and r10,rdx - and r11,rdx - and r12,rdx - and r13,rdx - - add r14,r8 - adc r15,r9 - mov QWORD PTR[48+rdi],r14 - adc rax,r10 - mov QWORD PTR[56+rdi],r15 - adc rbx,r11 - mov QWORD PTR[64+rdi],rax - adc rbp,r12 - mov QWORD PTR[72+rdi],rbx - adc rsi,r13 - mov QWORD PTR[80+rdi],rbp - mov QWORD PTR[88+rdi],rsi - - DB 0F3h,0C3h ;repret -__subq_mod_384x384 ENDP - - -ALIGN 32 -__addq_mod_384 PROC PRIVATE - DB 243,15,30,250 - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - add r8,QWORD PTR[rdx] - adc r9,QWORD PTR[8+rdx] - adc r10,QWORD PTR[16+rdx] - mov r14,r8 - adc r11,QWORD PTR[24+rdx] - mov r15,r9 - adc r12,QWORD PTR[32+rdx] - mov rax,r10 - adc r13,QWORD PTR[40+rdx] - mov rbx,r11 - sbb rdx,rdx - - sub r8,QWORD PTR[rcx] - sbb r9,QWORD PTR[8+rcx] - mov rbp,r12 - sbb r10,QWORD PTR[16+rcx] - sbb r11,QWORD PTR[24+rcx] - sbb r12,QWORD PTR[32+rcx] - mov rsi,r13 - sbb r13,QWORD PTR[40+rcx] - sbb rdx,0 - - cmovc r8,r14 - cmovc r9,r15 - cmovc r10,rax - mov QWORD PTR[rdi],r8 - cmovc r11,rbx - mov QWORD PTR[8+rdi],r9 - cmovc r12,rbp - mov QWORD PTR[16+rdi],r10 - cmovc r13,rsi - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - - DB 0F3h,0C3h ;repret -__addq_mod_384 ENDP - - -ALIGN 32 -__subq_mod_384 PROC PRIVATE - DB 243,15,30,250 - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - -__subq_mod_384_a_is_loaded:: - sub r8,QWORD PTR[rdx] - mov r14,QWORD PTR[rcx] - sbb r9,QWORD PTR[8+rdx] - mov r15,QWORD PTR[8+rcx] - sbb r10,QWORD PTR[16+rdx] - mov rax,QWORD PTR[16+rcx] - sbb r11,QWORD PTR[24+rdx] - mov rbx,QWORD PTR[24+rcx] - sbb r12,QWORD PTR[32+rdx] - mov rbp,QWORD PTR[32+rcx] - sbb r13,QWORD PTR[40+rdx] - mov rsi,QWORD PTR[40+rcx] - sbb rdx,rdx - - and r14,rdx - and r15,rdx - and rax,rdx - and rbx,rdx - and rbp,rdx - and rsi,rdx - - add r8,r14 - adc r9,r15 - mov QWORD PTR[rdi],r8 - adc r10,rax - mov QWORD PTR[8+rdi],r9 - adc r11,rbx - mov QWORD PTR[16+rdi],r10 - adc r12,rbp - mov QWORD PTR[24+rdi],r11 - adc r13,rsi - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - - DB 0F3h,0C3h ;repret -__subq_mod_384 ENDP -PUBLIC mul_mont_384x - - -ALIGN 32 -mul_mont_384x PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_mul_mont_384x:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD PTR[40+rsp] -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz mul_mont_384x$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,328 - -$L$SEH_body_mul_mont_384x:: - - - mov 
rbx,rdx - mov QWORD PTR[32+rsp],rdi - mov QWORD PTR[24+rsp],rsi - mov QWORD PTR[16+rsp],rdx - mov QWORD PTR[8+rsp],rcx - mov QWORD PTR[rsp],r8 - - - - - lea rdi,QWORD PTR[40+rsp] - call __mulq_384 - - - lea rbx,QWORD PTR[48+rbx] - lea rsi,QWORD PTR[48+rsi] - lea rdi,QWORD PTR[((40+96))+rsp] - call __mulq_384 - - - mov rcx,QWORD PTR[8+rsp] - lea rdx,QWORD PTR[((-48))+rsi] - lea rdi,QWORD PTR[((40+192+48))+rsp] - call __addq_mod_384 - - mov rsi,QWORD PTR[16+rsp] - lea rdx,QWORD PTR[48+rsi] - lea rdi,QWORD PTR[((-48))+rdi] - call __addq_mod_384 - - lea rbx,QWORD PTR[rdi] - lea rsi,QWORD PTR[48+rdi] - call __mulq_384 - - - lea rsi,QWORD PTR[rdi] - lea rdx,QWORD PTR[40+rsp] - mov rcx,QWORD PTR[8+rsp] - call __subq_mod_384x384 - - lea rsi,QWORD PTR[rdi] - lea rdx,QWORD PTR[((-96))+rdi] - call __subq_mod_384x384 - - - lea rsi,QWORD PTR[40+rsp] - lea rdx,QWORD PTR[((40+96))+rsp] - lea rdi,QWORD PTR[40+rsp] - call __subq_mod_384x384 - - mov rbx,rcx - - - lea rsi,QWORD PTR[40+rsp] - mov rcx,QWORD PTR[rsp] - mov rdi,QWORD PTR[32+rsp] - call __mulq_by_1_mont_384 - call __redq_tail_mont_384 - - - lea rsi,QWORD PTR[((40+192))+rsp] - mov rcx,QWORD PTR[rsp] - lea rdi,QWORD PTR[48+rdi] - call __mulq_by_1_mont_384 - call __redq_tail_mont_384 - - lea r8,QWORD PTR[328+rsp] - mov r15,QWORD PTR[r8] - - mov r14,QWORD PTR[8+r8] - - mov r13,QWORD PTR[16+r8] - - mov r12,QWORD PTR[24+r8] - - mov rbx,QWORD PTR[32+r8] - - mov rbp,QWORD PTR[40+r8] - - lea rsp,QWORD PTR[48+r8] - -$L$SEH_epilogue_mul_mont_384x:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_mul_mont_384x:: -mul_mont_384x ENDP -PUBLIC sqr_mont_384x - - -ALIGN 32 -sqr_mont_384x PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sqr_mont_384x:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz sqr_mont_384x$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,136 - -$L$SEH_body_sqr_mont_384x:: - - - mov QWORD PTR[rsp],rcx - mov rcx,rdx - mov QWORD PTR[8+rsp],rdi - mov QWORD PTR[16+rsp],rsi - - - lea rdx,QWORD PTR[48+rsi] - lea rdi,QWORD PTR[32+rsp] - call __addq_mod_384 - - - mov rsi,QWORD PTR[16+rsp] - lea rdx,QWORD PTR[48+rsi] - lea rdi,QWORD PTR[((32+48))+rsp] - call __subq_mod_384 - - - mov rsi,QWORD PTR[16+rsp] - lea rbx,QWORD PTR[48+rsi] - - mov rax,QWORD PTR[48+rsi] - mov r14,QWORD PTR[rsi] - mov r15,QWORD PTR[8+rsi] - mov r12,QWORD PTR[16+rsi] - mov r13,QWORD PTR[24+rsi] - - call __mulq_mont_384 - add r14,r14 - adc r15,r15 - adc r8,r8 - mov r12,r14 - adc r9,r9 - mov r13,r15 - adc r10,r10 - mov rax,r8 - adc r11,r11 - mov rbx,r9 - sbb rdx,rdx - - sub r14,QWORD PTR[rcx] - sbb r15,QWORD PTR[8+rcx] - mov rbp,r10 - sbb r8,QWORD PTR[16+rcx] - sbb r9,QWORD PTR[24+rcx] - sbb r10,QWORD PTR[32+rcx] - mov rsi,r11 - sbb r11,QWORD PTR[40+rcx] - sbb rdx,0 - - cmovc r14,r12 - cmovc r15,r13 - cmovc r8,rax - mov QWORD PTR[48+rdi],r14 - cmovc r9,rbx - mov QWORD PTR[56+rdi],r15 - cmovc r10,rbp - mov QWORD PTR[64+rdi],r8 - cmovc r11,rsi - mov QWORD PTR[72+rdi],r9 - mov QWORD PTR[80+rdi],r10 - mov QWORD PTR[88+rdi],r11 - - lea rsi,QWORD PTR[32+rsp] - lea rbx,QWORD PTR[((32+48))+rsp] - - mov rax,QWORD PTR[((32+48))+rsp] - mov r14,QWORD PTR[((32+0))+rsp] - mov r15,QWORD PTR[((32+8))+rsp] - mov r12,QWORD PTR[((32+16))+rsp] - mov r13,QWORD PTR[((32+24))+rsp] - - call __mulq_mont_384 - - lea r8,QWORD PTR[136+rsp] - 
mov r15,QWORD PTR[r8] - - mov r14,QWORD PTR[8+r8] - - mov r13,QWORD PTR[16+r8] - - mov r12,QWORD PTR[24+r8] - - mov rbx,QWORD PTR[32+r8] - - mov rbp,QWORD PTR[40+r8] - - lea rsp,QWORD PTR[48+r8] - -$L$SEH_epilogue_sqr_mont_384x:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sqr_mont_384x:: -sqr_mont_384x ENDP - -PUBLIC mul_382x - - -ALIGN 32 -mul_382x PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_mul_382x:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz mul_382x$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,136 - -$L$SEH_body_mul_382x:: - - - lea rdi,QWORD PTR[96+rdi] - mov QWORD PTR[rsp],rsi - mov QWORD PTR[8+rsp],rdx - mov QWORD PTR[16+rsp],rdi - mov QWORD PTR[24+rsp],rcx - - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - add r8,QWORD PTR[48+rsi] - adc r9,QWORD PTR[56+rsi] - adc r10,QWORD PTR[64+rsi] - adc r11,QWORD PTR[72+rsi] - adc r12,QWORD PTR[80+rsi] - adc r13,QWORD PTR[88+rsi] - - mov QWORD PTR[((32+0))+rsp],r8 - mov QWORD PTR[((32+8))+rsp],r9 - mov QWORD PTR[((32+16))+rsp],r10 - mov QWORD PTR[((32+24))+rsp],r11 - mov QWORD PTR[((32+32))+rsp],r12 - mov QWORD PTR[((32+40))+rsp],r13 - - - mov r8,QWORD PTR[rdx] - mov r9,QWORD PTR[8+rdx] - mov r10,QWORD PTR[16+rdx] - mov r11,QWORD PTR[24+rdx] - mov r12,QWORD PTR[32+rdx] - mov r13,QWORD PTR[40+rdx] - - add r8,QWORD PTR[48+rdx] - adc r9,QWORD PTR[56+rdx] - adc r10,QWORD PTR[64+rdx] - adc r11,QWORD PTR[72+rdx] - adc r12,QWORD PTR[80+rdx] - adc r13,QWORD PTR[88+rdx] - - mov QWORD PTR[((32+48))+rsp],r8 - mov QWORD PTR[((32+56))+rsp],r9 - mov QWORD PTR[((32+64))+rsp],r10 - mov QWORD PTR[((32+72))+rsp],r11 - mov QWORD PTR[((32+80))+rsp],r12 - mov QWORD PTR[((32+88))+rsp],r13 - - - lea rsi,QWORD PTR[((32+0))+rsp] - lea rbx,QWORD PTR[((32+48))+rsp] - call __mulq_384 - - - mov rsi,QWORD PTR[rsp] - mov rbx,QWORD PTR[8+rsp] - lea rdi,QWORD PTR[((-96))+rdi] - call __mulq_384 - - - lea rsi,QWORD PTR[48+rsi] - lea rbx,QWORD PTR[48+rbx] - lea rdi,QWORD PTR[32+rsp] - call __mulq_384 - - - mov rsi,QWORD PTR[16+rsp] - lea rdx,QWORD PTR[32+rsp] - mov rcx,QWORD PTR[24+rsp] - mov rdi,rsi - call __subq_mod_384x384 - - - lea rsi,QWORD PTR[rdi] - lea rdx,QWORD PTR[((-96))+rdi] - call __subq_mod_384x384 - - - lea rsi,QWORD PTR[((-96))+rdi] - lea rdx,QWORD PTR[32+rsp] - lea rdi,QWORD PTR[((-96))+rdi] - call __subq_mod_384x384 - - lea r8,QWORD PTR[136+rsp] - mov r15,QWORD PTR[r8] - - mov r14,QWORD PTR[8+r8] - - mov r13,QWORD PTR[16+r8] - - mov r12,QWORD PTR[24+r8] - - mov rbx,QWORD PTR[32+r8] - - mov rbp,QWORD PTR[40+r8] - - lea rsp,QWORD PTR[48+r8] - -$L$SEH_epilogue_mul_382x:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_mul_382x:: -mul_382x ENDP -PUBLIC sqr_382x - - -ALIGN 32 -sqr_382x PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sqr_382x:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz sqr_382x$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - push rsi - -$L$SEH_body_sqr_382x:: - - - mov rcx,rdx - - - mov 
r14,QWORD PTR[rsi] - mov r15,QWORD PTR[8+rsi] - mov rax,QWORD PTR[16+rsi] - mov rbx,QWORD PTR[24+rsi] - mov rbp,QWORD PTR[32+rsi] - mov rdx,QWORD PTR[40+rsi] - - mov r8,r14 - add r14,QWORD PTR[48+rsi] - mov r9,r15 - adc r15,QWORD PTR[56+rsi] - mov r10,rax - adc rax,QWORD PTR[64+rsi] - mov r11,rbx - adc rbx,QWORD PTR[72+rsi] - mov r12,rbp - adc rbp,QWORD PTR[80+rsi] - mov r13,rdx - adc rdx,QWORD PTR[88+rsi] - - mov QWORD PTR[rdi],r14 - mov QWORD PTR[8+rdi],r15 - mov QWORD PTR[16+rdi],rax - mov QWORD PTR[24+rdi],rbx - mov QWORD PTR[32+rdi],rbp - mov QWORD PTR[40+rdi],rdx - - - lea rdx,QWORD PTR[48+rsi] - lea rdi,QWORD PTR[48+rdi] - call __subq_mod_384_a_is_loaded - - - lea rsi,QWORD PTR[rdi] - lea rbx,QWORD PTR[((-48))+rdi] - lea rdi,QWORD PTR[((-48))+rdi] - call __mulq_384 - - - mov rsi,QWORD PTR[rsp] - lea rbx,QWORD PTR[48+rsi] - lea rdi,QWORD PTR[96+rdi] - call __mulq_384 - - mov r8,QWORD PTR[rdi] - mov r9,QWORD PTR[8+rdi] - mov r10,QWORD PTR[16+rdi] - mov r11,QWORD PTR[24+rdi] - mov r12,QWORD PTR[32+rdi] - mov r13,QWORD PTR[40+rdi] - mov r14,QWORD PTR[48+rdi] - mov r15,QWORD PTR[56+rdi] - mov rax,QWORD PTR[64+rdi] - mov rbx,QWORD PTR[72+rdi] - mov rbp,QWORD PTR[80+rdi] - add r8,r8 - mov rdx,QWORD PTR[88+rdi] - adc r9,r9 - mov QWORD PTR[rdi],r8 - adc r10,r10 - mov QWORD PTR[8+rdi],r9 - adc r11,r11 - mov QWORD PTR[16+rdi],r10 - adc r12,r12 - mov QWORD PTR[24+rdi],r11 - adc r13,r13 - mov QWORD PTR[32+rdi],r12 - adc r14,r14 - mov QWORD PTR[40+rdi],r13 - adc r15,r15 - mov QWORD PTR[48+rdi],r14 - adc rax,rax - mov QWORD PTR[56+rdi],r15 - adc rbx,rbx - mov QWORD PTR[64+rdi],rax - adc rbp,rbp - mov QWORD PTR[72+rdi],rbx - adc rdx,rdx - mov QWORD PTR[80+rdi],rbp - mov QWORD PTR[88+rdi],rdx - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_sqr_382x:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sqr_382x:: -sqr_382x ENDP -PUBLIC mul_384 - - -ALIGN 32 -mul_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_mul_384:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz mul_384$1 -endif - push rbp - - push rbx - - push r12 - -$L$SEH_body_mul_384:: - - - mov rbx,rdx - call __mulq_384 - - mov r12,QWORD PTR[rsp] - - mov rbx,QWORD PTR[8+rsp] - - mov rbp,QWORD PTR[16+rsp] - - lea rsp,QWORD PTR[24+rsp] - -$L$SEH_epilogue_mul_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_mul_384:: -mul_384 ENDP - - -ALIGN 32 -__mulq_384 PROC PRIVATE - DB 243,15,30,250 - - mov rax,QWORD PTR[rbx] - - mov rbp,rax - mul QWORD PTR[rsi] - mov QWORD PTR[rdi],rax - mov rax,rbp - mov rcx,rdx - - mul QWORD PTR[8+rsi] - add rcx,rax - mov rax,rbp - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[16+rsi] - add r8,rax - mov rax,rbp - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[24+rsi] - add r9,rax - mov rax,rbp - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[32+rsi] - add r10,rax - mov rax,rbp - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[40+rsi] - add r11,rax - mov rax,QWORD PTR[8+rbx] - adc rdx,0 - mov r12,rdx - mov rbp,rax - mul QWORD PTR[rsi] - add rcx,rax - mov rax,rbp - adc rdx,0 - mov QWORD PTR[8+rdi],rcx - mov rcx,rdx - - mul QWORD PTR[8+rsi] - add r8,rax - mov rax,rbp - adc rdx,0 - add rcx,r8 - 
adc rdx,0 - mov r8,rdx - - mul QWORD PTR[16+rsi] - add r9,rax - mov rax,rbp - adc rdx,0 - add r8,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[24+rsi] - add r10,rax - mov rax,rbp - adc rdx,0 - add r9,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[32+rsi] - add r11,rax - mov rax,rbp - adc rdx,0 - add r10,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[40+rsi] - add r12,rax - mov rax,QWORD PTR[16+rbx] - adc rdx,0 - add r11,r12 - adc rdx,0 - mov r12,rdx - mov rbp,rax - mul QWORD PTR[rsi] - add rcx,rax - mov rax,rbp - adc rdx,0 - mov QWORD PTR[16+rdi],rcx - mov rcx,rdx - - mul QWORD PTR[8+rsi] - add r8,rax - mov rax,rbp - adc rdx,0 - add rcx,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[16+rsi] - add r9,rax - mov rax,rbp - adc rdx,0 - add r8,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[24+rsi] - add r10,rax - mov rax,rbp - adc rdx,0 - add r9,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[32+rsi] - add r11,rax - mov rax,rbp - adc rdx,0 - add r10,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[40+rsi] - add r12,rax - mov rax,QWORD PTR[24+rbx] - adc rdx,0 - add r11,r12 - adc rdx,0 - mov r12,rdx - mov rbp,rax - mul QWORD PTR[rsi] - add rcx,rax - mov rax,rbp - adc rdx,0 - mov QWORD PTR[24+rdi],rcx - mov rcx,rdx - - mul QWORD PTR[8+rsi] - add r8,rax - mov rax,rbp - adc rdx,0 - add rcx,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[16+rsi] - add r9,rax - mov rax,rbp - adc rdx,0 - add r8,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[24+rsi] - add r10,rax - mov rax,rbp - adc rdx,0 - add r9,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[32+rsi] - add r11,rax - mov rax,rbp - adc rdx,0 - add r10,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[40+rsi] - add r12,rax - mov rax,QWORD PTR[32+rbx] - adc rdx,0 - add r11,r12 - adc rdx,0 - mov r12,rdx - mov rbp,rax - mul QWORD PTR[rsi] - add rcx,rax - mov rax,rbp - adc rdx,0 - mov QWORD PTR[32+rdi],rcx - mov rcx,rdx - - mul QWORD PTR[8+rsi] - add r8,rax - mov rax,rbp - adc rdx,0 - add rcx,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[16+rsi] - add r9,rax - mov rax,rbp - adc rdx,0 - add r8,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[24+rsi] - add r10,rax - mov rax,rbp - adc rdx,0 - add r9,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[32+rsi] - add r11,rax - mov rax,rbp - adc rdx,0 - add r10,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[40+rsi] - add r12,rax - mov rax,QWORD PTR[40+rbx] - adc rdx,0 - add r11,r12 - adc rdx,0 - mov r12,rdx - mov rbp,rax - mul QWORD PTR[rsi] - add rcx,rax - mov rax,rbp - adc rdx,0 - mov QWORD PTR[40+rdi],rcx - mov rcx,rdx - - mul QWORD PTR[8+rsi] - add r8,rax - mov rax,rbp - adc rdx,0 - add rcx,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[16+rsi] - add r9,rax - mov rax,rbp - adc rdx,0 - add r8,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[24+rsi] - add r10,rax - mov rax,rbp - adc rdx,0 - add r9,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[32+rsi] - add r11,rax - mov rax,rbp - adc rdx,0 - add r10,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[40+rsi] - add r12,rax - mov rax,rax - adc rdx,0 - add r11,r12 - adc rdx,0 - mov r12,rdx - mov QWORD PTR[48+rdi],rcx - mov QWORD PTR[56+rdi],r8 - mov QWORD PTR[64+rdi],r9 - mov QWORD PTR[72+rdi],r10 - mov QWORD PTR[80+rdi],r11 - mov QWORD PTR[88+rdi],r12 - - DB 0F3h,0C3h ;repret -__mulq_384 ENDP -PUBLIC sqr_384 - - -ALIGN 32 -sqr_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sqr_384:: - - - mov rdi,rcx - mov rsi,rdx -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz sqr_384$1 
-endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_sqr_384:: - - - call __sqrq_384 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_sqr_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sqr_384:: -sqr_384 ENDP - - -ALIGN 32 -__sqrq_384 PROC PRIVATE - DB 243,15,30,250 - - mov rax,QWORD PTR[rsi] - mov r15,QWORD PTR[8+rsi] - mov rcx,QWORD PTR[16+rsi] - mov rbx,QWORD PTR[24+rsi] - - - mov r14,rax - mul r15 - mov r9,rax - mov rax,r14 - mov rbp,QWORD PTR[32+rsi] - mov r10,rdx - - mul rcx - add r10,rax - mov rax,r14 - adc rdx,0 - mov rsi,QWORD PTR[40+rsi] - mov r11,rdx - - mul rbx - add r11,rax - mov rax,r14 - adc rdx,0 - mov r12,rdx - - mul rbp - add r12,rax - mov rax,r14 - adc rdx,0 - mov r13,rdx - - mul rsi - add r13,rax - mov rax,r14 - adc rdx,0 - mov r14,rdx - - mul rax - xor r8,r8 - mov QWORD PTR[rdi],rax - mov rax,r15 - add r9,r9 - adc r8,0 - add r9,rdx - adc r8,0 - mov QWORD PTR[8+rdi],r9 - - mul rcx - add r11,rax - mov rax,r15 - adc rdx,0 - mov r9,rdx - - mul rbx - add r12,rax - mov rax,r15 - adc rdx,0 - add r12,r9 - adc rdx,0 - mov r9,rdx - - mul rbp - add r13,rax - mov rax,r15 - adc rdx,0 - add r13,r9 - adc rdx,0 - mov r9,rdx - - mul rsi - add r14,rax - mov rax,r15 - adc rdx,0 - add r14,r9 - adc rdx,0 - mov r15,rdx - - mul rax - xor r9,r9 - add r8,rax - mov rax,rcx - add r10,r10 - adc r11,r11 - adc r9,0 - add r10,r8 - adc r11,rdx - adc r9,0 - mov QWORD PTR[16+rdi],r10 - - mul rbx - add r13,rax - mov rax,rcx - adc rdx,0 - mov QWORD PTR[24+rdi],r11 - mov r8,rdx - - mul rbp - add r14,rax - mov rax,rcx - adc rdx,0 - add r14,r8 - adc rdx,0 - mov r8,rdx - - mul rsi - add r15,rax - mov rax,rcx - adc rdx,0 - add r15,r8 - adc rdx,0 - mov rcx,rdx - - mul rax - xor r11,r11 - add r9,rax - mov rax,rbx - add r12,r12 - adc r13,r13 - adc r11,0 - add r12,r9 - adc r13,rdx - adc r11,0 - mov QWORD PTR[32+rdi],r12 - - - mul rbp - add r15,rax - mov rax,rbx - adc rdx,0 - mov QWORD PTR[40+rdi],r13 - mov r8,rdx - - mul rsi - add rcx,rax - mov rax,rbx - adc rdx,0 - add rcx,r8 - adc rdx,0 - mov rbx,rdx - - mul rax - xor r12,r12 - add r11,rax - mov rax,rbp - add r14,r14 - adc r15,r15 - adc r12,0 - add r14,r11 - adc r15,rdx - mov QWORD PTR[48+rdi],r14 - adc r12,0 - mov QWORD PTR[56+rdi],r15 - - - mul rsi - add rbx,rax - mov rax,rbp - adc rdx,0 - mov rbp,rdx - - mul rax - xor r13,r13 - add r12,rax - mov rax,rsi - add rcx,rcx - adc rbx,rbx - adc r13,0 - add rcx,r12 - adc rbx,rdx - mov QWORD PTR[64+rdi],rcx - adc r13,0 - mov QWORD PTR[72+rdi],rbx - - - mul rax - add rax,r13 - add rbp,rbp - adc rdx,0 - add rax,rbp - adc rdx,0 - mov QWORD PTR[80+rdi],rax - mov QWORD PTR[88+rdi],rdx - - DB 0F3h,0C3h ;repret -__sqrq_384 ENDP - -PUBLIC sqr_mont_384 - - -ALIGN 32 -sqr_mont_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sqr_mont_384:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz sqr_mont_384$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8*15 - -$L$SEH_body_sqr_mont_384:: - - - mov QWORD PTR[96+rsp],rcx - mov QWORD PTR[104+rsp],rdx - mov QWORD PTR[112+rsp],rdi - - mov rdi,rsp - call __sqrq_384 - - lea 
rsi,QWORD PTR[rsp] - mov rcx,QWORD PTR[96+rsp] - mov rbx,QWORD PTR[104+rsp] - mov rdi,QWORD PTR[112+rsp] - call __mulq_by_1_mont_384 - call __redq_tail_mont_384 - - lea r8,QWORD PTR[120+rsp] - mov r15,QWORD PTR[120+rsp] - - mov r14,QWORD PTR[8+r8] - - mov r13,QWORD PTR[16+r8] - - mov r12,QWORD PTR[24+r8] - - mov rbx,QWORD PTR[32+r8] - - mov rbp,QWORD PTR[40+r8] - - lea rsp,QWORD PTR[48+r8] - -$L$SEH_epilogue_sqr_mont_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sqr_mont_384:: -sqr_mont_384 ENDP - - - -PUBLIC redc_mont_384 - - -ALIGN 32 -redc_mont_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_redc_mont_384:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz redc_mont_384$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_redc_mont_384:: - - - mov rbx,rdx - call __mulq_by_1_mont_384 - call __redq_tail_mont_384 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_redc_mont_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_redc_mont_384:: -redc_mont_384 ENDP - - - - -PUBLIC from_mont_384 - - -ALIGN 32 -from_mont_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_from_mont_384:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz from_mont_384$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_from_mont_384:: - - - mov rbx,rdx - call __mulq_by_1_mont_384 - - - - - - mov rcx,r15 - mov rdx,r8 - mov rbp,r9 - - sub r14,QWORD PTR[rbx] - sbb r15,QWORD PTR[8+rbx] - mov r13,r10 - sbb r8,QWORD PTR[16+rbx] - sbb r9,QWORD PTR[24+rbx] - sbb r10,QWORD PTR[32+rbx] - mov rsi,r11 - sbb r11,QWORD PTR[40+rbx] - - cmovc r14,rax - cmovc r15,rcx - cmovc r8,rdx - mov QWORD PTR[rdi],r14 - cmovc r9,rbp - mov QWORD PTR[8+rdi],r15 - cmovc r10,r13 - mov QWORD PTR[16+rdi],r8 - cmovc r11,rsi - mov QWORD PTR[24+rdi],r9 - mov QWORD PTR[32+rdi],r10 - mov QWORD PTR[40+rdi],r11 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_from_mont_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_from_mont_384:: -from_mont_384 ENDP - -ALIGN 32 -__mulq_by_1_mont_384 PROC PRIVATE - DB 243,15,30,250 - - mov rax,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - mov r14,rax - imul rax,rcx - mov r8,rax - - mul QWORD PTR[rbx] - add r14,rax - mov rax,r8 - adc r14,rdx - - mul QWORD PTR[8+rbx] - add r9,rax - mov rax,r8 - adc rdx,0 - add r9,r14 - adc rdx,0 - mov r14,rdx - - mul QWORD PTR[16+rbx] - add r10,rax - mov rax,r8 - adc rdx,0 - add r10,r14 - adc rdx,0 - mov r14,rdx - - mul QWORD PTR[24+rbx] - add r11,rax - mov rax,r8 - adc rdx,0 - mov r15,r9 - 
imul r9,rcx - add r11,r14 - adc rdx,0 - mov r14,rdx - - mul QWORD PTR[32+rbx] - add r12,rax - mov rax,r8 - adc rdx,0 - add r12,r14 - adc rdx,0 - mov r14,rdx - - mul QWORD PTR[40+rbx] - add r13,rax - mov rax,r9 - adc rdx,0 - add r13,r14 - adc rdx,0 - mov r14,rdx - - mul QWORD PTR[rbx] - add r15,rax - mov rax,r9 - adc r15,rdx - - mul QWORD PTR[8+rbx] - add r10,rax - mov rax,r9 - adc rdx,0 - add r10,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[16+rbx] - add r11,rax - mov rax,r9 - adc rdx,0 - add r11,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[24+rbx] - add r12,rax - mov rax,r9 - adc rdx,0 - mov r8,r10 - imul r10,rcx - add r12,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[32+rbx] - add r13,rax - mov rax,r9 - adc rdx,0 - add r13,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[40+rbx] - add r14,rax - mov rax,r10 - adc rdx,0 - add r14,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[rbx] - add r8,rax - mov rax,r10 - adc r8,rdx - - mul QWORD PTR[8+rbx] - add r11,rax - mov rax,r10 - adc rdx,0 - add r11,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[16+rbx] - add r12,rax - mov rax,r10 - adc rdx,0 - add r12,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[24+rbx] - add r13,rax - mov rax,r10 - adc rdx,0 - mov r9,r11 - imul r11,rcx - add r13,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[32+rbx] - add r14,rax - mov rax,r10 - adc rdx,0 - add r14,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[40+rbx] - add r15,rax - mov rax,r11 - adc rdx,0 - add r15,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[rbx] - add r9,rax - mov rax,r11 - adc r9,rdx - - mul QWORD PTR[8+rbx] - add r12,rax - mov rax,r11 - adc rdx,0 - add r12,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[16+rbx] - add r13,rax - mov rax,r11 - adc rdx,0 - add r13,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[24+rbx] - add r14,rax - mov rax,r11 - adc rdx,0 - mov r10,r12 - imul r12,rcx - add r14,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[32+rbx] - add r15,rax - mov rax,r11 - adc rdx,0 - add r15,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[40+rbx] - add r8,rax - mov rax,r12 - adc rdx,0 - add r8,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[rbx] - add r10,rax - mov rax,r12 - adc r10,rdx - - mul QWORD PTR[8+rbx] - add r13,rax - mov rax,r12 - adc rdx,0 - add r13,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[16+rbx] - add r14,rax - mov rax,r12 - adc rdx,0 - add r14,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[24+rbx] - add r15,rax - mov rax,r12 - adc rdx,0 - mov r11,r13 - imul r13,rcx - add r15,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[32+rbx] - add r8,rax - mov rax,r12 - adc rdx,0 - add r8,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[40+rbx] - add r9,rax - mov rax,r13 - adc rdx,0 - add r9,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[rbx] - add r11,rax - mov rax,r13 - adc r11,rdx - - mul QWORD PTR[8+rbx] - add r14,rax - mov rax,r13 - adc rdx,0 - add r14,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[16+rbx] - add r15,rax - mov rax,r13 - adc rdx,0 - add r15,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[24+rbx] - add r8,rax - mov rax,r13 - adc rdx,0 - add r8,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[32+rbx] - add r9,rax - mov rax,r13 - adc rdx,0 - add r9,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[40+rbx] - add r10,rax - mov rax,r14 - adc rdx,0 - add r10,r11 - adc rdx,0 - mov r11,rdx - DB 0F3h,0C3h ;repret -__mulq_by_1_mont_384 ENDP - - -ALIGN 32 -__redq_tail_mont_384 PROC PRIVATE - DB 243,15,30,250 - - add r14,QWORD PTR[48+rsi] - mov rax,r14 - adc r15,QWORD PTR[56+rsi] - adc r8,QWORD PTR[64+rsi] - adc r9,QWORD PTR[72+rsi] - 
mov rcx,r15 - adc r10,QWORD PTR[80+rsi] - adc r11,QWORD PTR[88+rsi] - sbb r12,r12 - - - - - mov rdx,r8 - mov rbp,r9 - - sub r14,QWORD PTR[rbx] - sbb r15,QWORD PTR[8+rbx] - mov r13,r10 - sbb r8,QWORD PTR[16+rbx] - sbb r9,QWORD PTR[24+rbx] - sbb r10,QWORD PTR[32+rbx] - mov rsi,r11 - sbb r11,QWORD PTR[40+rbx] - sbb r12,0 - - cmovc r14,rax - cmovc r15,rcx - cmovc r8,rdx - mov QWORD PTR[rdi],r14 - cmovc r9,rbp - mov QWORD PTR[8+rdi],r15 - cmovc r10,r13 - mov QWORD PTR[16+rdi],r8 - cmovc r11,rsi - mov QWORD PTR[24+rdi],r9 - mov QWORD PTR[32+rdi],r10 - mov QWORD PTR[40+rdi],r11 - - DB 0F3h,0C3h ;repret -__redq_tail_mont_384 ENDP - -PUBLIC sgn0_pty_mont_384 - - -ALIGN 32 -sgn0_pty_mont_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sgn0_pty_mont_384:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz sgn0_pty_mont_384$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_sgn0_pty_mont_384:: - - - mov rbx,rsi - lea rsi,QWORD PTR[rdi] - mov rcx,rdx - call __mulq_by_1_mont_384 - - xor rax,rax - mov r13,r14 - add r14,r14 - adc r15,r15 - adc r8,r8 - adc r9,r9 - adc r10,r10 - adc r11,r11 - adc rax,0 - - sub r14,QWORD PTR[rbx] - sbb r15,QWORD PTR[8+rbx] - sbb r8,QWORD PTR[16+rbx] - sbb r9,QWORD PTR[24+rbx] - sbb r10,QWORD PTR[32+rbx] - sbb r11,QWORD PTR[40+rbx] - sbb rax,0 - - not rax - and r13,1 - and rax,2 - or rax,r13 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_sgn0_pty_mont_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sgn0_pty_mont_384:: -sgn0_pty_mont_384 ENDP - -PUBLIC sgn0_pty_mont_384x - - -ALIGN 32 -sgn0_pty_mont_384x PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sgn0_pty_mont_384x:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz sgn0_pty_mont_384x$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_sgn0_pty_mont_384x:: - - - mov rbx,rsi - lea rsi,QWORD PTR[48+rdi] - mov rcx,rdx - call __mulq_by_1_mont_384 - - mov r12,r14 - or r14,r15 - or r14,r8 - or r14,r9 - or r14,r10 - or r14,r11 - - lea rsi,QWORD PTR[rdi] - xor rdi,rdi - mov r13,r12 - add r12,r12 - adc r15,r15 - adc r8,r8 - adc r9,r9 - adc r10,r10 - adc r11,r11 - adc rdi,0 - - sub r12,QWORD PTR[rbx] - sbb r15,QWORD PTR[8+rbx] - sbb r8,QWORD PTR[16+rbx] - sbb r9,QWORD PTR[24+rbx] - sbb r10,QWORD PTR[32+rbx] - sbb r11,QWORD PTR[40+rbx] - sbb rdi,0 - - mov QWORD PTR[rsp],r14 - not rdi - and r13,1 - and rdi,2 - or rdi,r13 - - call __mulq_by_1_mont_384 - - mov r12,r14 - or r14,r15 - or r14,r8 - or r14,r9 - or r14,r10 - or r14,r11 - - xor rax,rax - mov r13,r12 - add r12,r12 - adc r15,r15 - adc r8,r8 - adc r9,r9 - adc r10,r10 - adc r11,r11 - adc rax,0 - - sub r12,QWORD PTR[rbx] - sbb r15,QWORD PTR[8+rbx] - sbb r8,QWORD PTR[16+rbx] - sbb r9,QWORD PTR[24+rbx] - sbb r10,QWORD PTR[32+rbx] - sbb r11,QWORD PTR[40+rbx] - sbb rax,0 - - mov r12,QWORD PTR[rsp] - - not rax - - test r14,r14 - cmovz r13,rdi - - test r12,r12 - cmovnz rax,rdi - - and r13,1 - and rax,2 - or rax,r13 - - mov 
r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_sgn0_pty_mont_384x:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sgn0_pty_mont_384x:: -sgn0_pty_mont_384x ENDP -PUBLIC mul_mont_384 - - -ALIGN 32 -mul_mont_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_mul_mont_384:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD PTR[40+rsp] -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz mul_mont_384$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8*3 - -$L$SEH_body_mul_mont_384:: - - - mov rax,QWORD PTR[rdx] - mov r14,QWORD PTR[rsi] - mov r15,QWORD PTR[8+rsi] - mov r12,QWORD PTR[16+rsi] - mov r13,QWORD PTR[24+rsi] - mov rbx,rdx - mov QWORD PTR[rsp],r8 - mov QWORD PTR[8+rsp],rdi - - call __mulq_mont_384 - - mov r15,QWORD PTR[24+rsp] - - mov r14,QWORD PTR[32+rsp] - - mov r13,QWORD PTR[40+rsp] - - mov r12,QWORD PTR[48+rsp] - - mov rbx,QWORD PTR[56+rsp] - - mov rbp,QWORD PTR[64+rsp] - - lea rsp,QWORD PTR[72+rsp] - -$L$SEH_epilogue_mul_mont_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_mul_mont_384:: -mul_mont_384 ENDP - -ALIGN 32 -__mulq_mont_384 PROC PRIVATE - DB 243,15,30,250 - - mov rdi,rax - mul r14 - mov r8,rax - mov rax,rdi - mov r9,rdx - - mul r15 - add r9,rax - mov rax,rdi - adc rdx,0 - mov r10,rdx - - mul r12 - add r10,rax - mov rax,rdi - adc rdx,0 - mov r11,rdx - - mov rbp,r8 - imul r8,QWORD PTR[8+rsp] - - mul r13 - add r11,rax - mov rax,rdi - adc rdx,0 - mov r12,rdx - - mul QWORD PTR[32+rsi] - add r12,rax - mov rax,rdi - adc rdx,0 - mov r13,rdx - - mul QWORD PTR[40+rsi] - add r13,rax - mov rax,r8 - adc rdx,0 - xor r15,r15 - mov r14,rdx - - mul QWORD PTR[rcx] - add rbp,rax - mov rax,r8 - adc rbp,rdx - - mul QWORD PTR[8+rcx] - add r9,rax - mov rax,r8 - adc rdx,0 - add r9,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[16+rcx] - add r10,rax - mov rax,r8 - adc rdx,0 - add r10,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[24+rcx] - add r11,rbp - adc rdx,0 - add r11,rax - mov rax,r8 - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[32+rcx] - add r12,rax - mov rax,r8 - adc rdx,0 - add r12,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[40+rcx] - add r13,rax - mov rax,QWORD PTR[8+rbx] - adc rdx,0 - add r13,rbp - adc r14,rdx - adc r15,0 - - mov rdi,rax - mul QWORD PTR[rsi] - add r9,rax - mov rax,rdi - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[8+rsi] - add r10,rax - mov rax,rdi - adc rdx,0 - add r10,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[16+rsi] - add r11,rax - mov rax,rdi - adc rdx,0 - add r11,r8 - adc rdx,0 - mov r8,rdx - - mov rbp,r9 - imul r9,QWORD PTR[8+rsp] - - mul QWORD PTR[24+rsi] - add r12,rax - mov rax,rdi - adc rdx,0 - add r12,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[32+rsi] - add r13,rax - mov rax,rdi - adc rdx,0 - add r13,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[40+rsi] - add r14,r8 - adc rdx,0 - xor r8,r8 - add r14,rax - mov rax,r9 - adc r15,rdx - adc r8,0 - - mul QWORD PTR[rcx] - add rbp,rax - mov rax,r9 - adc rbp,rdx - - mul QWORD PTR[8+rcx] - add r10,rax - mov rax,r9 - adc rdx,0 - add r10,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[16+rcx] - add r11,rax - mov rax,r9 - adc rdx,0 - add r11,rbp - 
adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[24+rcx] - add r12,rbp - adc rdx,0 - add r12,rax - mov rax,r9 - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[32+rcx] - add r13,rax - mov rax,r9 - adc rdx,0 - add r13,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[40+rcx] - add r14,rax - mov rax,QWORD PTR[16+rbx] - adc rdx,0 - add r14,rbp - adc r15,rdx - adc r8,0 - - mov rdi,rax - mul QWORD PTR[rsi] - add r10,rax - mov rax,rdi - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[8+rsi] - add r11,rax - mov rax,rdi - adc rdx,0 - add r11,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[16+rsi] - add r12,rax - mov rax,rdi - adc rdx,0 - add r12,r9 - adc rdx,0 - mov r9,rdx - - mov rbp,r10 - imul r10,QWORD PTR[8+rsp] - - mul QWORD PTR[24+rsi] - add r13,rax - mov rax,rdi - adc rdx,0 - add r13,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[32+rsi] - add r14,rax - mov rax,rdi - adc rdx,0 - add r14,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[40+rsi] - add r15,r9 - adc rdx,0 - xor r9,r9 - add r15,rax - mov rax,r10 - adc r8,rdx - adc r9,0 - - mul QWORD PTR[rcx] - add rbp,rax - mov rax,r10 - adc rbp,rdx - - mul QWORD PTR[8+rcx] - add r11,rax - mov rax,r10 - adc rdx,0 - add r11,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[16+rcx] - add r12,rax - mov rax,r10 - adc rdx,0 - add r12,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[24+rcx] - add r13,rbp - adc rdx,0 - add r13,rax - mov rax,r10 - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[32+rcx] - add r14,rax - mov rax,r10 - adc rdx,0 - add r14,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[40+rcx] - add r15,rax - mov rax,QWORD PTR[24+rbx] - adc rdx,0 - add r15,rbp - adc r8,rdx - adc r9,0 - - mov rdi,rax - mul QWORD PTR[rsi] - add r11,rax - mov rax,rdi - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[8+rsi] - add r12,rax - mov rax,rdi - adc rdx,0 - add r12,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[16+rsi] - add r13,rax - mov rax,rdi - adc rdx,0 - add r13,r10 - adc rdx,0 - mov r10,rdx - - mov rbp,r11 - imul r11,QWORD PTR[8+rsp] - - mul QWORD PTR[24+rsi] - add r14,rax - mov rax,rdi - adc rdx,0 - add r14,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[32+rsi] - add r15,rax - mov rax,rdi - adc rdx,0 - add r15,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[40+rsi] - add r8,r10 - adc rdx,0 - xor r10,r10 - add r8,rax - mov rax,r11 - adc r9,rdx - adc r10,0 - - mul QWORD PTR[rcx] - add rbp,rax - mov rax,r11 - adc rbp,rdx - - mul QWORD PTR[8+rcx] - add r12,rax - mov rax,r11 - adc rdx,0 - add r12,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[16+rcx] - add r13,rax - mov rax,r11 - adc rdx,0 - add r13,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[24+rcx] - add r14,rbp - adc rdx,0 - add r14,rax - mov rax,r11 - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[32+rcx] - add r15,rax - mov rax,r11 - adc rdx,0 - add r15,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[40+rcx] - add r8,rax - mov rax,QWORD PTR[32+rbx] - adc rdx,0 - add r8,rbp - adc r9,rdx - adc r10,0 - - mov rdi,rax - mul QWORD PTR[rsi] - add r12,rax - mov rax,rdi - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[8+rsi] - add r13,rax - mov rax,rdi - adc rdx,0 - add r13,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[16+rsi] - add r14,rax - mov rax,rdi - adc rdx,0 - add r14,r11 - adc rdx,0 - mov r11,rdx - - mov rbp,r12 - imul r12,QWORD PTR[8+rsp] - - mul QWORD PTR[24+rsi] - add r15,rax - mov rax,rdi - adc rdx,0 - add r15,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[32+rsi] - add r8,rax - mov rax,rdi - adc rdx,0 - add r8,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[40+rsi] - add r9,r11 - adc rdx,0 - xor r11,r11 - add r9,rax - mov rax,r12 - 
adc r10,rdx - adc r11,0 - - mul QWORD PTR[rcx] - add rbp,rax - mov rax,r12 - adc rbp,rdx - - mul QWORD PTR[8+rcx] - add r13,rax - mov rax,r12 - adc rdx,0 - add r13,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[16+rcx] - add r14,rax - mov rax,r12 - adc rdx,0 - add r14,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[24+rcx] - add r15,rbp - adc rdx,0 - add r15,rax - mov rax,r12 - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[32+rcx] - add r8,rax - mov rax,r12 - adc rdx,0 - add r8,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[40+rcx] - add r9,rax - mov rax,QWORD PTR[40+rbx] - adc rdx,0 - add r9,rbp - adc r10,rdx - adc r11,0 - - mov rdi,rax - mul QWORD PTR[rsi] - add r13,rax - mov rax,rdi - adc rdx,0 - mov r12,rdx - - mul QWORD PTR[8+rsi] - add r14,rax - mov rax,rdi - adc rdx,0 - add r14,r12 - adc rdx,0 - mov r12,rdx - - mul QWORD PTR[16+rsi] - add r15,rax - mov rax,rdi - adc rdx,0 - add r15,r12 - adc rdx,0 - mov r12,rdx - - mov rbp,r13 - imul r13,QWORD PTR[8+rsp] - - mul QWORD PTR[24+rsi] - add r8,rax - mov rax,rdi - adc rdx,0 - add r8,r12 - adc rdx,0 - mov r12,rdx - - mul QWORD PTR[32+rsi] - add r9,rax - mov rax,rdi - adc rdx,0 - add r9,r12 - adc rdx,0 - mov r12,rdx - - mul QWORD PTR[40+rsi] - add r10,r12 - adc rdx,0 - xor r12,r12 - add r10,rax - mov rax,r13 - adc r11,rdx - adc r12,0 - - mul QWORD PTR[rcx] - add rbp,rax - mov rax,r13 - adc rbp,rdx - - mul QWORD PTR[8+rcx] - add r14,rax - mov rax,r13 - adc rdx,0 - add r14,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[16+rcx] - add r15,rax - mov rax,r13 - adc rdx,0 - add r15,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[24+rcx] - add r8,rbp - adc rdx,0 - add r8,rax - mov rax,r13 - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[32+rcx] - add r9,rax - mov rax,r13 - adc rdx,0 - add r9,rbp - adc rdx,0 - mov rbp,rdx - - mul QWORD PTR[40+rcx] - add r10,rax - mov rax,r14 - adc rdx,0 - add r10,rbp - adc r11,rdx - adc r12,0 - - - - - mov rdi,QWORD PTR[16+rsp] - sub r14,QWORD PTR[rcx] - mov rdx,r15 - sbb r15,QWORD PTR[8+rcx] - mov rbx,r8 - sbb r8,QWORD PTR[16+rcx] - mov rsi,r9 - sbb r9,QWORD PTR[24+rcx] - mov rbp,r10 - sbb r10,QWORD PTR[32+rcx] - mov r13,r11 - sbb r11,QWORD PTR[40+rcx] - sbb r12,0 - - cmovc r14,rax - cmovc r15,rdx - cmovc r8,rbx - mov QWORD PTR[rdi],r14 - cmovc r9,rsi - mov QWORD PTR[8+rdi],r15 - cmovc r10,rbp - mov QWORD PTR[16+rdi],r8 - cmovc r11,r13 - mov QWORD PTR[24+rdi],r9 - mov QWORD PTR[32+rdi],r10 - mov QWORD PTR[40+rdi],r11 - - DB 0F3h,0C3h ;repret -__mulq_mont_384 ENDP -PUBLIC sqr_n_mul_mont_384 - - -ALIGN 32 -sqr_n_mul_mont_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sqr_n_mul_mont_384:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD PTR[40+rsp] - mov r9,QWORD PTR[48+rsp] -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz sqr_n_mul_mont_384$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8*17 - -$L$SEH_body_sqr_n_mul_mont_384:: - - - mov QWORD PTR[rsp],r8 - mov QWORD PTR[8+rsp],rdi - mov QWORD PTR[16+rsp],rcx - lea rdi,QWORD PTR[32+rsp] - mov QWORD PTR[24+rsp],r9 - movq xmm2,QWORD PTR[r9] - -$L$oop_sqr_384:: - movd xmm1,edx - - call __sqrq_384 - - lea rsi,QWORD PTR[rdi] - mov rcx,QWORD PTR[rsp] - mov rbx,QWORD PTR[16+rsp] - call __mulq_by_1_mont_384 - call __redq_tail_mont_384 - - movd edx,xmm1 - lea rsi,QWORD PTR[rdi] - dec edx - jnz $L$oop_sqr_384 - -DB 102,72,15,126,208 - mov rcx,rbx - mov rbx,QWORD PTR[24+rsp] - - - - - - - mov r12,r8 - mov 
r13,r9 - - call __mulq_mont_384 - - lea r8,QWORD PTR[136+rsp] - mov r15,QWORD PTR[136+rsp] - - mov r14,QWORD PTR[8+r8] - - mov r13,QWORD PTR[16+r8] - - mov r12,QWORD PTR[24+r8] - - mov rbx,QWORD PTR[32+r8] - - mov rbp,QWORD PTR[40+r8] - - lea rsp,QWORD PTR[48+r8] - -$L$SEH_epilogue_sqr_n_mul_mont_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sqr_n_mul_mont_384:: -sqr_n_mul_mont_384 ENDP - -PUBLIC sqr_n_mul_mont_383 - - -ALIGN 32 -sqr_n_mul_mont_383 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sqr_n_mul_mont_383:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD PTR[40+rsp] - mov r9,QWORD PTR[48+rsp] -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz sqr_n_mul_mont_383$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8*17 - -$L$SEH_body_sqr_n_mul_mont_383:: - - - mov QWORD PTR[rsp],r8 - mov QWORD PTR[8+rsp],rdi - mov QWORD PTR[16+rsp],rcx - lea rdi,QWORD PTR[32+rsp] - mov QWORD PTR[24+rsp],r9 - movq xmm2,QWORD PTR[r9] - -$L$oop_sqr_383:: - movd xmm1,edx - - call __sqrq_384 - - lea rsi,QWORD PTR[rdi] - mov rcx,QWORD PTR[rsp] - mov rbx,QWORD PTR[16+rsp] - call __mulq_by_1_mont_384 - - movd edx,xmm1 - add r14,QWORD PTR[48+rsi] - adc r15,QWORD PTR[56+rsi] - adc r8,QWORD PTR[64+rsi] - adc r9,QWORD PTR[72+rsi] - adc r10,QWORD PTR[80+rsi] - adc r11,QWORD PTR[88+rsi] - lea rsi,QWORD PTR[rdi] - - mov QWORD PTR[rdi],r14 - mov QWORD PTR[8+rdi],r15 - mov QWORD PTR[16+rdi],r8 - mov QWORD PTR[24+rdi],r9 - mov QWORD PTR[32+rdi],r10 - mov QWORD PTR[40+rdi],r11 - - dec edx - jnz $L$oop_sqr_383 - -DB 102,72,15,126,208 - mov rcx,rbx - mov rbx,QWORD PTR[24+rsp] - - - - - - - mov r12,r8 - mov r13,r9 - - call __mulq_mont_384 - - lea r8,QWORD PTR[136+rsp] - mov r15,QWORD PTR[136+rsp] - - mov r14,QWORD PTR[8+r8] - - mov r13,QWORD PTR[16+r8] - - mov r12,QWORD PTR[24+r8] - - mov rbx,QWORD PTR[32+r8] - - mov rbp,QWORD PTR[40+r8] - - lea rsp,QWORD PTR[48+r8] - -$L$SEH_epilogue_sqr_n_mul_mont_383:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sqr_n_mul_mont_383:: -sqr_n_mul_mont_383 ENDP - -ALIGN 32 -__mulq_mont_383_nonred PROC PRIVATE - DB 243,15,30,250 - - mov rbp,rax - mul r14 - mov r8,rax - mov rax,rbp - mov r9,rdx - - mul r15 - add r9,rax - mov rax,rbp - adc rdx,0 - mov r10,rdx - - mul r12 - add r10,rax - mov rax,rbp - adc rdx,0 - mov r11,rdx - - mov r15,r8 - imul r8,QWORD PTR[8+rsp] - - mul r13 - add r11,rax - mov rax,rbp - adc rdx,0 - mov r12,rdx - - mul QWORD PTR[32+rsi] - add r12,rax - mov rax,rbp - adc rdx,0 - mov r13,rdx - - mul QWORD PTR[40+rsi] - add r13,rax - mov rax,r8 - adc rdx,0 - mov r14,rdx - - mul QWORD PTR[rcx] - add r15,rax - mov rax,r8 - adc r15,rdx - - mul QWORD PTR[8+rcx] - add r9,rax - mov rax,r8 - adc rdx,0 - add r9,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[16+rcx] - add r10,rax - mov rax,r8 - adc rdx,0 - add r10,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[24+rcx] - add r11,r15 - adc rdx,0 - add r11,rax - mov rax,r8 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[32+rcx] - add r12,rax - mov rax,r8 - adc rdx,0 - add r12,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[40+rcx] - add r13,rax - mov rax,QWORD PTR[8+rbx] - adc rdx,0 - add r13,r15 - adc r14,rdx - - mov rbp,rax - mul QWORD PTR[rsi] - add r9,rax - mov rax,rbp - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[8+rsi] - add r10,rax 
- mov rax,rbp - adc rdx,0 - add r10,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[16+rsi] - add r11,rax - mov rax,rbp - adc rdx,0 - add r11,r15 - adc rdx,0 - mov r15,rdx - - mov r8,r9 - imul r9,QWORD PTR[8+rsp] - - mul QWORD PTR[24+rsi] - add r12,rax - mov rax,rbp - adc rdx,0 - add r12,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[32+rsi] - add r13,rax - mov rax,rbp - adc rdx,0 - add r13,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[40+rsi] - add r14,r15 - adc rdx,0 - add r14,rax - mov rax,r9 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[rcx] - add r8,rax - mov rax,r9 - adc r8,rdx - - mul QWORD PTR[8+rcx] - add r10,rax - mov rax,r9 - adc rdx,0 - add r10,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[16+rcx] - add r11,rax - mov rax,r9 - adc rdx,0 - add r11,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[24+rcx] - add r12,r8 - adc rdx,0 - add r12,rax - mov rax,r9 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[32+rcx] - add r13,rax - mov rax,r9 - adc rdx,0 - add r13,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[40+rcx] - add r14,rax - mov rax,QWORD PTR[16+rbx] - adc rdx,0 - add r14,r8 - adc r15,rdx - - mov rbp,rax - mul QWORD PTR[rsi] - add r10,rax - mov rax,rbp - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[8+rsi] - add r11,rax - mov rax,rbp - adc rdx,0 - add r11,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[16+rsi] - add r12,rax - mov rax,rbp - adc rdx,0 - add r12,r8 - adc rdx,0 - mov r8,rdx - - mov r9,r10 - imul r10,QWORD PTR[8+rsp] - - mul QWORD PTR[24+rsi] - add r13,rax - mov rax,rbp - adc rdx,0 - add r13,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[32+rsi] - add r14,rax - mov rax,rbp - adc rdx,0 - add r14,r8 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[40+rsi] - add r15,r8 - adc rdx,0 - add r15,rax - mov rax,r10 - adc rdx,0 - mov r8,rdx - - mul QWORD PTR[rcx] - add r9,rax - mov rax,r10 - adc r9,rdx - - mul QWORD PTR[8+rcx] - add r11,rax - mov rax,r10 - adc rdx,0 - add r11,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[16+rcx] - add r12,rax - mov rax,r10 - adc rdx,0 - add r12,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[24+rcx] - add r13,r9 - adc rdx,0 - add r13,rax - mov rax,r10 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[32+rcx] - add r14,rax - mov rax,r10 - adc rdx,0 - add r14,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[40+rcx] - add r15,rax - mov rax,QWORD PTR[24+rbx] - adc rdx,0 - add r15,r9 - adc r8,rdx - - mov rbp,rax - mul QWORD PTR[rsi] - add r11,rax - mov rax,rbp - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[8+rsi] - add r12,rax - mov rax,rbp - adc rdx,0 - add r12,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[16+rsi] - add r13,rax - mov rax,rbp - adc rdx,0 - add r13,r9 - adc rdx,0 - mov r9,rdx - - mov r10,r11 - imul r11,QWORD PTR[8+rsp] - - mul QWORD PTR[24+rsi] - add r14,rax - mov rax,rbp - adc rdx,0 - add r14,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[32+rsi] - add r15,rax - mov rax,rbp - adc rdx,0 - add r15,r9 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[40+rsi] - add r8,r9 - adc rdx,0 - add r8,rax - mov rax,r11 - adc rdx,0 - mov r9,rdx - - mul QWORD PTR[rcx] - add r10,rax - mov rax,r11 - adc r10,rdx - - mul QWORD PTR[8+rcx] - add r12,rax - mov rax,r11 - adc rdx,0 - add r12,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[16+rcx] - add r13,rax - mov rax,r11 - adc rdx,0 - add r13,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[24+rcx] - add r14,r10 - adc rdx,0 - add r14,rax - mov rax,r11 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[32+rcx] - add r15,rax - mov rax,r11 - adc rdx,0 - add r15,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[40+rcx] - add r8,rax - mov rax,QWORD PTR[32+rbx] - 
adc rdx,0 - add r8,r10 - adc r9,rdx - - mov rbp,rax - mul QWORD PTR[rsi] - add r12,rax - mov rax,rbp - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[8+rsi] - add r13,rax - mov rax,rbp - adc rdx,0 - add r13,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[16+rsi] - add r14,rax - mov rax,rbp - adc rdx,0 - add r14,r10 - adc rdx,0 - mov r10,rdx - - mov r11,r12 - imul r12,QWORD PTR[8+rsp] - - mul QWORD PTR[24+rsi] - add r15,rax - mov rax,rbp - adc rdx,0 - add r15,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[32+rsi] - add r8,rax - mov rax,rbp - adc rdx,0 - add r8,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[40+rsi] - add r9,r10 - adc rdx,0 - add r9,rax - mov rax,r12 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[rcx] - add r11,rax - mov rax,r12 - adc r11,rdx - - mul QWORD PTR[8+rcx] - add r13,rax - mov rax,r12 - adc rdx,0 - add r13,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[16+rcx] - add r14,rax - mov rax,r12 - adc rdx,0 - add r14,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[24+rcx] - add r15,r11 - adc rdx,0 - add r15,rax - mov rax,r12 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[32+rcx] - add r8,rax - mov rax,r12 - adc rdx,0 - add r8,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[40+rcx] - add r9,rax - mov rax,QWORD PTR[40+rbx] - adc rdx,0 - add r9,r11 - adc r10,rdx - - mov rbp,rax - mul QWORD PTR[rsi] - add r13,rax - mov rax,rbp - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[8+rsi] - add r14,rax - mov rax,rbp - adc rdx,0 - add r14,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[16+rsi] - add r15,rax - mov rax,rbp - adc rdx,0 - add r15,r11 - adc rdx,0 - mov r11,rdx - - mov r12,r13 - imul r13,QWORD PTR[8+rsp] - - mul QWORD PTR[24+rsi] - add r8,rax - mov rax,rbp - adc rdx,0 - add r8,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[32+rsi] - add r9,rax - mov rax,rbp - adc rdx,0 - add r9,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[40+rsi] - add r10,r11 - adc rdx,0 - add r10,rax - mov rax,r13 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[rcx] - add r12,rax - mov rax,r13 - adc r12,rdx - - mul QWORD PTR[8+rcx] - add r14,rax - mov rax,r13 - adc rdx,0 - add r14,r12 - adc rdx,0 - mov r12,rdx - - mul QWORD PTR[16+rcx] - add r15,rax - mov rax,r13 - adc rdx,0 - add r15,r12 - adc rdx,0 - mov r12,rdx - - mul QWORD PTR[24+rcx] - add r8,r12 - adc rdx,0 - add r8,rax - mov rax,r13 - adc rdx,0 - mov r12,rdx - - mul QWORD PTR[32+rcx] - add r9,rax - mov rax,r13 - adc rdx,0 - add r9,r12 - adc rdx,0 - mov r12,rdx - - mul QWORD PTR[40+rcx] - add r10,rax - mov rax,r14 - adc rdx,0 - add r10,r12 - adc r11,rdx - DB 0F3h,0C3h ;repret -__mulq_mont_383_nonred ENDP -PUBLIC sqr_mont_382x - - -ALIGN 32 -sqr_mont_382x PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sqr_mont_382x:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -ifdef __BLST_PORTABLE__ - test DWORD PTR[__blst_platform_cap],1 - jnz sqr_mont_382x$1 -endif - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,136 - -$L$SEH_body_sqr_mont_382x:: - - - mov QWORD PTR[rsp],rcx - mov rcx,rdx - mov QWORD PTR[16+rsp],rsi - mov QWORD PTR[24+rsp],rdi - - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - mov r14,r8 - add r8,QWORD PTR[48+rsi] - mov r15,r9 - adc r9,QWORD PTR[56+rsi] - mov rax,r10 - adc r10,QWORD PTR[64+rsi] - mov rdx,r11 - adc r11,QWORD PTR[72+rsi] - mov rbx,r12 - adc r12,QWORD PTR[80+rsi] - mov rbp,r13 - adc r13,QWORD 
PTR[88+rsi] - - sub r14,QWORD PTR[48+rsi] - sbb r15,QWORD PTR[56+rsi] - sbb rax,QWORD PTR[64+rsi] - sbb rdx,QWORD PTR[72+rsi] - sbb rbx,QWORD PTR[80+rsi] - sbb rbp,QWORD PTR[88+rsi] - sbb rdi,rdi - - mov QWORD PTR[((32+0))+rsp],r8 - mov QWORD PTR[((32+8))+rsp],r9 - mov QWORD PTR[((32+16))+rsp],r10 - mov QWORD PTR[((32+24))+rsp],r11 - mov QWORD PTR[((32+32))+rsp],r12 - mov QWORD PTR[((32+40))+rsp],r13 - - mov QWORD PTR[((32+48))+rsp],r14 - mov QWORD PTR[((32+56))+rsp],r15 - mov QWORD PTR[((32+64))+rsp],rax - mov QWORD PTR[((32+72))+rsp],rdx - mov QWORD PTR[((32+80))+rsp],rbx - mov QWORD PTR[((32+88))+rsp],rbp - mov QWORD PTR[((32+96))+rsp],rdi - - - - lea rbx,QWORD PTR[48+rsi] - - mov rax,QWORD PTR[48+rsi] - mov r14,QWORD PTR[rsi] - mov r15,QWORD PTR[8+rsi] - mov r12,QWORD PTR[16+rsi] - mov r13,QWORD PTR[24+rsi] - - mov rdi,QWORD PTR[24+rsp] - call __mulq_mont_383_nonred - add r14,r14 - adc r15,r15 - adc r8,r8 - adc r9,r9 - adc r10,r10 - adc r11,r11 - - mov QWORD PTR[48+rdi],r14 - mov QWORD PTR[56+rdi],r15 - mov QWORD PTR[64+rdi],r8 - mov QWORD PTR[72+rdi],r9 - mov QWORD PTR[80+rdi],r10 - mov QWORD PTR[88+rdi],r11 - - lea rsi,QWORD PTR[32+rsp] - lea rbx,QWORD PTR[((32+48))+rsp] - - mov rax,QWORD PTR[((32+48))+rsp] - mov r14,QWORD PTR[((32+0))+rsp] - mov r15,QWORD PTR[((32+8))+rsp] - mov r12,QWORD PTR[((32+16))+rsp] - mov r13,QWORD PTR[((32+24))+rsp] - - call __mulq_mont_383_nonred - mov rsi,QWORD PTR[((32+96))+rsp] - mov r12,QWORD PTR[((32+0))+rsp] - mov r13,QWORD PTR[((32+8))+rsp] - and r12,rsi - mov rax,QWORD PTR[((32+16))+rsp] - and r13,rsi - mov rbx,QWORD PTR[((32+24))+rsp] - and rax,rsi - mov rbp,QWORD PTR[((32+32))+rsp] - and rbx,rsi - and rbp,rsi - and rsi,QWORD PTR[((32+40))+rsp] - - sub r14,r12 - mov r12,QWORD PTR[rcx] - sbb r15,r13 - mov r13,QWORD PTR[8+rcx] - sbb r8,rax - mov rax,QWORD PTR[16+rcx] - sbb r9,rbx - mov rbx,QWORD PTR[24+rcx] - sbb r10,rbp - mov rbp,QWORD PTR[32+rcx] - sbb r11,rsi - sbb rsi,rsi - - and r12,rsi - and r13,rsi - and rax,rsi - and rbx,rsi - and rbp,rsi - and rsi,QWORD PTR[40+rcx] - - add r14,r12 - adc r15,r13 - adc r8,rax - adc r9,rbx - adc r10,rbp - adc r11,rsi - - mov QWORD PTR[rdi],r14 - mov QWORD PTR[8+rdi],r15 - mov QWORD PTR[16+rdi],r8 - mov QWORD PTR[24+rdi],r9 - mov QWORD PTR[32+rdi],r10 - mov QWORD PTR[40+rdi],r11 - lea r8,QWORD PTR[136+rsp] - mov r15,QWORD PTR[r8] - - mov r14,QWORD PTR[8+r8] - - mov r13,QWORD PTR[16+r8] - - mov r12,QWORD PTR[24+r8] - - mov rbx,QWORD PTR[32+r8] - - mov rbp,QWORD PTR[40+r8] - - lea rsp,QWORD PTR[48+r8] - -$L$SEH_epilogue_sqr_mont_382x:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sqr_mont_382x:: -sqr_mont_382x ENDP -.text$ ENDS -.pdata SEGMENT READONLY ALIGN(4) -ALIGN 4 - DD imagerel $L$SEH_begin_mul_mont_384x - DD imagerel $L$SEH_body_mul_mont_384x - DD imagerel $L$SEH_info_mul_mont_384x_prologue - - DD imagerel $L$SEH_body_mul_mont_384x - DD imagerel $L$SEH_epilogue_mul_mont_384x - DD imagerel $L$SEH_info_mul_mont_384x_body - - DD imagerel $L$SEH_epilogue_mul_mont_384x - DD imagerel $L$SEH_end_mul_mont_384x - DD imagerel $L$SEH_info_mul_mont_384x_epilogue - - DD imagerel $L$SEH_begin_sqr_mont_384x - DD imagerel $L$SEH_body_sqr_mont_384x - DD imagerel $L$SEH_info_sqr_mont_384x_prologue - - DD imagerel $L$SEH_body_sqr_mont_384x - DD imagerel $L$SEH_epilogue_sqr_mont_384x - DD imagerel $L$SEH_info_sqr_mont_384x_body - - DD imagerel $L$SEH_epilogue_sqr_mont_384x - DD imagerel $L$SEH_end_sqr_mont_384x - DD imagerel $L$SEH_info_sqr_mont_384x_epilogue - 
- DD imagerel $L$SEH_begin_mul_382x - DD imagerel $L$SEH_body_mul_382x - DD imagerel $L$SEH_info_mul_382x_prologue - - DD imagerel $L$SEH_body_mul_382x - DD imagerel $L$SEH_epilogue_mul_382x - DD imagerel $L$SEH_info_mul_382x_body - - DD imagerel $L$SEH_epilogue_mul_382x - DD imagerel $L$SEH_end_mul_382x - DD imagerel $L$SEH_info_mul_382x_epilogue - - DD imagerel $L$SEH_begin_sqr_382x - DD imagerel $L$SEH_body_sqr_382x - DD imagerel $L$SEH_info_sqr_382x_prologue - - DD imagerel $L$SEH_body_sqr_382x - DD imagerel $L$SEH_epilogue_sqr_382x - DD imagerel $L$SEH_info_sqr_382x_body - - DD imagerel $L$SEH_epilogue_sqr_382x - DD imagerel $L$SEH_end_sqr_382x - DD imagerel $L$SEH_info_sqr_382x_epilogue - - DD imagerel $L$SEH_begin_mul_384 - DD imagerel $L$SEH_body_mul_384 - DD imagerel $L$SEH_info_mul_384_prologue - - DD imagerel $L$SEH_body_mul_384 - DD imagerel $L$SEH_epilogue_mul_384 - DD imagerel $L$SEH_info_mul_384_body - - DD imagerel $L$SEH_epilogue_mul_384 - DD imagerel $L$SEH_end_mul_384 - DD imagerel $L$SEH_info_mul_384_epilogue - - DD imagerel $L$SEH_begin_sqr_384 - DD imagerel $L$SEH_body_sqr_384 - DD imagerel $L$SEH_info_sqr_384_prologue - - DD imagerel $L$SEH_body_sqr_384 - DD imagerel $L$SEH_epilogue_sqr_384 - DD imagerel $L$SEH_info_sqr_384_body - - DD imagerel $L$SEH_epilogue_sqr_384 - DD imagerel $L$SEH_end_sqr_384 - DD imagerel $L$SEH_info_sqr_384_epilogue - - DD imagerel $L$SEH_begin_sqr_mont_384 - DD imagerel $L$SEH_body_sqr_mont_384 - DD imagerel $L$SEH_info_sqr_mont_384_prologue - - DD imagerel $L$SEH_body_sqr_mont_384 - DD imagerel $L$SEH_epilogue_sqr_mont_384 - DD imagerel $L$SEH_info_sqr_mont_384_body - - DD imagerel $L$SEH_epilogue_sqr_mont_384 - DD imagerel $L$SEH_end_sqr_mont_384 - DD imagerel $L$SEH_info_sqr_mont_384_epilogue - - DD imagerel $L$SEH_begin_redc_mont_384 - DD imagerel $L$SEH_body_redc_mont_384 - DD imagerel $L$SEH_info_redc_mont_384_prologue - - DD imagerel $L$SEH_body_redc_mont_384 - DD imagerel $L$SEH_epilogue_redc_mont_384 - DD imagerel $L$SEH_info_redc_mont_384_body - - DD imagerel $L$SEH_epilogue_redc_mont_384 - DD imagerel $L$SEH_end_redc_mont_384 - DD imagerel $L$SEH_info_redc_mont_384_epilogue - - DD imagerel $L$SEH_begin_from_mont_384 - DD imagerel $L$SEH_body_from_mont_384 - DD imagerel $L$SEH_info_from_mont_384_prologue - - DD imagerel $L$SEH_body_from_mont_384 - DD imagerel $L$SEH_epilogue_from_mont_384 - DD imagerel $L$SEH_info_from_mont_384_body - - DD imagerel $L$SEH_epilogue_from_mont_384 - DD imagerel $L$SEH_end_from_mont_384 - DD imagerel $L$SEH_info_from_mont_384_epilogue - - DD imagerel $L$SEH_begin_sgn0_pty_mont_384 - DD imagerel $L$SEH_body_sgn0_pty_mont_384 - DD imagerel $L$SEH_info_sgn0_pty_mont_384_prologue - - DD imagerel $L$SEH_body_sgn0_pty_mont_384 - DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384 - DD imagerel $L$SEH_info_sgn0_pty_mont_384_body - - DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384 - DD imagerel $L$SEH_end_sgn0_pty_mont_384 - DD imagerel $L$SEH_info_sgn0_pty_mont_384_epilogue - - DD imagerel $L$SEH_begin_sgn0_pty_mont_384x - DD imagerel $L$SEH_body_sgn0_pty_mont_384x - DD imagerel $L$SEH_info_sgn0_pty_mont_384x_prologue - - DD imagerel $L$SEH_body_sgn0_pty_mont_384x - DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384x - DD imagerel $L$SEH_info_sgn0_pty_mont_384x_body - - DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384x - DD imagerel $L$SEH_end_sgn0_pty_mont_384x - DD imagerel $L$SEH_info_sgn0_pty_mont_384x_epilogue - - DD imagerel $L$SEH_begin_mul_mont_384 - DD imagerel $L$SEH_body_mul_mont_384 - DD imagerel 
$L$SEH_info_mul_mont_384_prologue - - DD imagerel $L$SEH_body_mul_mont_384 - DD imagerel $L$SEH_epilogue_mul_mont_384 - DD imagerel $L$SEH_info_mul_mont_384_body - - DD imagerel $L$SEH_epilogue_mul_mont_384 - DD imagerel $L$SEH_end_mul_mont_384 - DD imagerel $L$SEH_info_mul_mont_384_epilogue - - DD imagerel $L$SEH_begin_sqr_n_mul_mont_384 - DD imagerel $L$SEH_body_sqr_n_mul_mont_384 - DD imagerel $L$SEH_info_sqr_n_mul_mont_384_prologue - - DD imagerel $L$SEH_body_sqr_n_mul_mont_384 - DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_384 - DD imagerel $L$SEH_info_sqr_n_mul_mont_384_body - - DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_384 - DD imagerel $L$SEH_end_sqr_n_mul_mont_384 - DD imagerel $L$SEH_info_sqr_n_mul_mont_384_epilogue - - DD imagerel $L$SEH_begin_sqr_n_mul_mont_383 - DD imagerel $L$SEH_body_sqr_n_mul_mont_383 - DD imagerel $L$SEH_info_sqr_n_mul_mont_383_prologue - - DD imagerel $L$SEH_body_sqr_n_mul_mont_383 - DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_383 - DD imagerel $L$SEH_info_sqr_n_mul_mont_383_body - - DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_383 - DD imagerel $L$SEH_end_sqr_n_mul_mont_383 - DD imagerel $L$SEH_info_sqr_n_mul_mont_383_epilogue - - DD imagerel $L$SEH_begin_sqr_mont_382x - DD imagerel $L$SEH_body_sqr_mont_382x - DD imagerel $L$SEH_info_sqr_mont_382x_prologue - - DD imagerel $L$SEH_body_sqr_mont_382x - DD imagerel $L$SEH_epilogue_sqr_mont_382x - DD imagerel $L$SEH_info_sqr_mont_382x_body - - DD imagerel $L$SEH_epilogue_sqr_mont_382x - DD imagerel $L$SEH_end_sqr_mont_382x - DD imagerel $L$SEH_info_sqr_mont_382x_epilogue - -.pdata ENDS -.xdata SEGMENT READONLY ALIGN(8) -ALIGN 8 -$L$SEH_info_mul_mont_384x_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_mul_mont_384x_body:: -DB 1,0,18,0 -DB 000h,0f4h,029h,000h -DB 000h,0e4h,02ah,000h -DB 000h,0d4h,02bh,000h -DB 000h,0c4h,02ch,000h -DB 000h,034h,02dh,000h -DB 000h,054h,02eh,000h -DB 000h,074h,030h,000h -DB 000h,064h,031h,000h -DB 000h,001h,02fh,000h -DB 000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_mul_mont_384x_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sqr_mont_384x_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sqr_mont_384x_body:: -DB 1,0,18,0 -DB 000h,0f4h,011h,000h -DB 000h,0e4h,012h,000h -DB 000h,0d4h,013h,000h -DB 000h,0c4h,014h,000h -DB 000h,034h,015h,000h -DB 000h,054h,016h,000h -DB 000h,074h,018h,000h -DB 000h,064h,019h,000h -DB 000h,001h,017h,000h -DB 000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sqr_mont_384x_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_mul_382x_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_mul_382x_body:: -DB 1,0,18,0 -DB 000h,0f4h,011h,000h -DB 000h,0e4h,012h,000h -DB 000h,0d4h,013h,000h -DB 000h,0c4h,014h,000h -DB 000h,034h,015h,000h -DB 000h,054h,016h,000h -DB 000h,074h,018h,000h -DB 000h,064h,019h,000h -DB 000h,001h,017h,000h -DB 000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_mul_382x_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sqr_382x_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sqr_382x_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 
000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sqr_382x_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_mul_384_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_mul_384_body:: -DB 1,0,11,0 -DB 000h,0c4h,000h,000h -DB 000h,034h,001h,000h -DB 000h,054h,002h,000h -DB 000h,074h,004h,000h -DB 000h,064h,005h,000h -DB 000h,022h -DB 000h,000h,000h,000h,000h,000h -$L$SEH_info_mul_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sqr_384_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sqr_384_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sqr_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sqr_mont_384_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sqr_mont_384_body:: -DB 1,0,18,0 -DB 000h,0f4h,00fh,000h -DB 000h,0e4h,010h,000h -DB 000h,0d4h,011h,000h -DB 000h,0c4h,012h,000h -DB 000h,034h,013h,000h -DB 000h,054h,014h,000h -DB 000h,074h,016h,000h -DB 000h,064h,017h,000h -DB 000h,001h,015h,000h -DB 000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sqr_mont_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_redc_mont_384_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_redc_mont_384_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_redc_mont_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_from_mont_384_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_from_mont_384_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_from_mont_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sgn0_pty_mont_384_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sgn0_pty_mont_384_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sgn0_pty_mont_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sgn0_pty_mont_384x_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sgn0_pty_mont_384x_body:: -DB 
1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sgn0_pty_mont_384x_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_mul_mont_384_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_mul_mont_384_body:: -DB 1,0,17,0 -DB 000h,0f4h,003h,000h -DB 000h,0e4h,004h,000h -DB 000h,0d4h,005h,000h -DB 000h,0c4h,006h,000h -DB 000h,034h,007h,000h -DB 000h,054h,008h,000h -DB 000h,074h,00ah,000h -DB 000h,064h,00bh,000h -DB 000h,082h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_mul_mont_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sqr_n_mul_mont_384_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sqr_n_mul_mont_384_body:: -DB 1,0,18,0 -DB 000h,0f4h,011h,000h -DB 000h,0e4h,012h,000h -DB 000h,0d4h,013h,000h -DB 000h,0c4h,014h,000h -DB 000h,034h,015h,000h -DB 000h,054h,016h,000h -DB 000h,074h,018h,000h -DB 000h,064h,019h,000h -DB 000h,001h,017h,000h -DB 000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sqr_n_mul_mont_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sqr_n_mul_mont_383_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sqr_n_mul_mont_383_body:: -DB 1,0,18,0 -DB 000h,0f4h,011h,000h -DB 000h,0e4h,012h,000h -DB 000h,0d4h,013h,000h -DB 000h,0c4h,014h,000h -DB 000h,034h,015h,000h -DB 000h,054h,016h,000h -DB 000h,074h,018h,000h -DB 000h,064h,019h,000h -DB 000h,001h,017h,000h -DB 000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sqr_n_mul_mont_383_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sqr_mont_382x_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sqr_mont_382x_body:: -DB 1,0,18,0 -DB 000h,0f4h,011h,000h -DB 000h,0e4h,012h,000h -DB 000h,0d4h,013h,000h -DB 000h,0c4h,014h,000h -DB 000h,034h,015h,000h -DB 000h,054h,016h,000h -DB 000h,074h,018h,000h -DB 000h,064h,019h,000h -DB 000h,001h,017h,000h -DB 000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sqr_mont_382x_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - - -.xdata ENDS -END diff --git a/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm b/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm deleted file mode 100644 index 21d18a8b40b..00000000000 --- a/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm +++ /dev/null @@ -1,810 +0,0 @@ -OPTION DOTNAME -PUBLIC mul_mont_sparse_256$1 -PUBLIC sqr_mont_sparse_256$1 -PUBLIC from_mont_256$1 -PUBLIC redc_mont_256$1 -.text$ SEGMENT ALIGN(256) 'CODE' - -PUBLIC mulx_mont_sparse_256 - - -ALIGN 32 -mulx_mont_sparse_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_mulx_mont_sparse_256:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD PTR[40+rsp] -mul_mont_sparse_256$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_mulx_mont_sparse_256:: - - - mov rbx,rdx - mov rdx,QWORD 
PTR[rdx] - mov r14,QWORD PTR[rsi] - mov r15,QWORD PTR[8+rsi] - mov rbp,QWORD PTR[16+rsi] - mov r9,QWORD PTR[24+rsi] - lea rsi,QWORD PTR[((-128))+rsi] - lea rcx,QWORD PTR[((-128))+rcx] - - mulx r11,rax,r14 - call __mulx_mont_sparse_256 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_mulx_mont_sparse_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_mulx_mont_sparse_256:: -mulx_mont_sparse_256 ENDP - -PUBLIC sqrx_mont_sparse_256 - - -ALIGN 32 -sqrx_mont_sparse_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sqrx_mont_sparse_256:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -sqr_mont_sparse_256$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_sqrx_mont_sparse_256:: - - - mov rbx,rsi - mov r8,rcx - mov rcx,rdx - mov rdx,QWORD PTR[rsi] - mov r15,QWORD PTR[8+rsi] - mov rbp,QWORD PTR[16+rsi] - mov r9,QWORD PTR[24+rsi] - lea rsi,QWORD PTR[((-128))+rbx] - lea rcx,QWORD PTR[((-128))+rcx] - - mulx r11,rax,rdx - call __mulx_mont_sparse_256 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_sqrx_mont_sparse_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sqrx_mont_sparse_256:: -sqrx_mont_sparse_256 ENDP - -ALIGN 32 -__mulx_mont_sparse_256 PROC PRIVATE - DB 243,15,30,250 - - mulx r12,r15,r15 - mulx r13,rbp,rbp - add r11,r15 - mulx r14,r9,r9 - mov rdx,QWORD PTR[8+rbx] - adc r12,rbp - adc r13,r9 - adc r14,0 - - mov r10,rax - imul rax,r8 - - - xor r15,r15 - mulx r9,rbp,QWORD PTR[((0+128))+rsi] - adox r11,rbp - adcx r12,r9 - - mulx r9,rbp,QWORD PTR[((8+128))+rsi] - adox r12,rbp - adcx r13,r9 - - mulx r9,rbp,QWORD PTR[((16+128))+rsi] - adox r13,rbp - adcx r14,r9 - - mulx r9,rbp,QWORD PTR[((24+128))+rsi] - mov rdx,rax - adox r14,rbp - adcx r9,r15 - adox r15,r9 - - - mulx rax,rbp,QWORD PTR[((0+128))+rcx] - adcx r10,rbp - adox rax,r11 - - mulx r9,rbp,QWORD PTR[((8+128))+rcx] - adcx rax,rbp - adox r12,r9 - - mulx r9,rbp,QWORD PTR[((16+128))+rcx] - adcx r12,rbp - adox r13,r9 - - mulx r9,rbp,QWORD PTR[((24+128))+rcx] - mov rdx,QWORD PTR[16+rbx] - adcx r13,rbp - adox r14,r9 - adcx r14,r10 - adox r15,r10 - adcx r15,r10 - adox r10,r10 - adc r10,0 - mov r11,rax - imul rax,r8 - - - xor rbp,rbp - mulx r9,rbp,QWORD PTR[((0+128))+rsi] - adox r12,rbp - adcx r13,r9 - - mulx r9,rbp,QWORD PTR[((8+128))+rsi] - adox r13,rbp - adcx r14,r9 - - mulx r9,rbp,QWORD PTR[((16+128))+rsi] - adox r14,rbp - adcx r15,r9 - - mulx r9,rbp,QWORD PTR[((24+128))+rsi] - mov rdx,rax - adox r15,rbp - adcx r9,r10 - adox r10,r9 - - - mulx rax,rbp,QWORD PTR[((0+128))+rcx] - adcx r11,rbp - adox rax,r12 - - mulx r9,rbp,QWORD PTR[((8+128))+rcx] - adcx rax,rbp - adox r13,r9 - - mulx r9,rbp,QWORD PTR[((16+128))+rcx] - adcx r13,rbp - adox r14,r9 - - mulx r9,rbp,QWORD PTR[((24+128))+rcx] - mov rdx,QWORD PTR[24+rbx] - adcx r14,rbp - adox r15,r9 - adcx r15,r11 - adox r10,r11 - adcx r10,r11 - adox r11,r11 - adc r11,0 - mov r12,rax - imul rax,r8 - - - xor rbp,rbp - mulx r9,rbp,QWORD PTR[((0+128))+rsi] - adox r13,rbp - adcx 
r14,r9 - - mulx r9,rbp,QWORD PTR[((8+128))+rsi] - adox r14,rbp - adcx r15,r9 - - mulx r9,rbp,QWORD PTR[((16+128))+rsi] - adox r15,rbp - adcx r10,r9 - - mulx r9,rbp,QWORD PTR[((24+128))+rsi] - mov rdx,rax - adox r10,rbp - adcx r9,r11 - adox r11,r9 - - - mulx rax,rbp,QWORD PTR[((0+128))+rcx] - adcx r12,rbp - adox rax,r13 - - mulx r9,rbp,QWORD PTR[((8+128))+rcx] - adcx rax,rbp - adox r14,r9 - - mulx r9,rbp,QWORD PTR[((16+128))+rcx] - adcx r14,rbp - adox r15,r9 - - mulx r9,rbp,QWORD PTR[((24+128))+rcx] - mov rdx,rax - adcx r15,rbp - adox r10,r9 - adcx r10,r12 - adox r11,r12 - adcx r11,r12 - adox r12,r12 - adc r12,0 - imul rdx,r8 - - - xor rbp,rbp - mulx r9,r13,QWORD PTR[((0+128))+rcx] - adcx r13,rax - adox r14,r9 - - mulx r9,rbp,QWORD PTR[((8+128))+rcx] - adcx r14,rbp - adox r15,r9 - - mulx r9,rbp,QWORD PTR[((16+128))+rcx] - adcx r15,rbp - adox r10,r9 - - mulx r9,rbp,QWORD PTR[((24+128))+rcx] - mov rdx,r14 - lea rcx,QWORD PTR[128+rcx] - adcx r10,rbp - adox r11,r9 - mov rax,r15 - adcx r11,r13 - adox r12,r13 - adc r12,0 - - - - - mov rbp,r10 - sub r14,QWORD PTR[rcx] - sbb r15,QWORD PTR[8+rcx] - sbb r10,QWORD PTR[16+rcx] - mov r9,r11 - sbb r11,QWORD PTR[24+rcx] - sbb r12,0 - - cmovc r14,rdx - cmovc r15,rax - cmovc r10,rbp - mov QWORD PTR[rdi],r14 - cmovc r11,r9 - mov QWORD PTR[8+rdi],r15 - mov QWORD PTR[16+rdi],r10 - mov QWORD PTR[24+rdi],r11 - - DB 0F3h,0C3h ;repret -__mulx_mont_sparse_256 ENDP -PUBLIC fromx_mont_256 - - -ALIGN 32 -fromx_mont_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_fromx_mont_256:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -from_mont_256$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_fromx_mont_256:: - - - mov rbx,rdx - call __mulx_by_1_mont_256 - - - - - - mov rdx,r15 - mov r12,r10 - mov r13,r11 - - sub r14,QWORD PTR[rbx] - sbb r15,QWORD PTR[8+rbx] - sbb r10,QWORD PTR[16+rbx] - sbb r11,QWORD PTR[24+rbx] - - cmovnc rax,r14 - cmovnc rdx,r15 - cmovnc r12,r10 - mov QWORD PTR[rdi],rax - cmovnc r13,r11 - mov QWORD PTR[8+rdi],rdx - mov QWORD PTR[16+rdi],r12 - mov QWORD PTR[24+rdi],r13 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_fromx_mont_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_fromx_mont_256:: -fromx_mont_256 ENDP - -PUBLIC redcx_mont_256 - - -ALIGN 32 -redcx_mont_256 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_redcx_mont_256:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -redc_mont_256$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_redcx_mont_256:: - - - mov rbx,rdx - call __mulx_by_1_mont_256 - - add r14,QWORD PTR[32+rsi] - adc r15,QWORD PTR[40+rsi] - mov rax,r14 - adc r10,QWORD PTR[48+rsi] - mov rdx,r15 - adc r11,QWORD PTR[56+rsi] - sbb rsi,rsi - - - - - mov r12,r10 - sub r14,QWORD PTR[rbx] - sbb r15,QWORD PTR[8+rbx] - sbb r10,QWORD PTR[16+rbx] - mov r13,r11 - sbb r11,QWORD PTR[24+rbx] - sbb rsi,0 - - cmovnc rax,r14 - cmovnc rdx,r15 - cmovnc r12,r10 - mov QWORD PTR[rdi],rax - cmovnc r13,r11 - mov QWORD PTR[8+rdi],rdx - mov QWORD PTR[16+rdi],r12 - mov QWORD PTR[24+rdi],r13 - - mov r15,QWORD 
PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_redcx_mont_256:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_redcx_mont_256:: -redcx_mont_256 ENDP - -ALIGN 32 -__mulx_by_1_mont_256 PROC PRIVATE - DB 243,15,30,250 - - mov rax,QWORD PTR[rsi] - mov r11,QWORD PTR[8+rsi] - mov r12,QWORD PTR[16+rsi] - mov r13,QWORD PTR[24+rsi] - - mov r14,rax - imul rax,rcx - mov r10,rax - - mul QWORD PTR[rbx] - add r14,rax - mov rax,r10 - adc r14,rdx - - mul QWORD PTR[8+rbx] - add r11,rax - mov rax,r10 - adc rdx,0 - add r11,r14 - adc rdx,0 - mov r14,rdx - - mul QWORD PTR[16+rbx] - mov r15,r11 - imul r11,rcx - add r12,rax - mov rax,r10 - adc rdx,0 - add r12,r14 - adc rdx,0 - mov r14,rdx - - mul QWORD PTR[24+rbx] - add r13,rax - mov rax,r11 - adc rdx,0 - add r13,r14 - adc rdx,0 - mov r14,rdx - - mul QWORD PTR[rbx] - add r15,rax - mov rax,r11 - adc r15,rdx - - mul QWORD PTR[8+rbx] - add r12,rax - mov rax,r11 - adc rdx,0 - add r12,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[16+rbx] - mov r10,r12 - imul r12,rcx - add r13,rax - mov rax,r11 - adc rdx,0 - add r13,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[24+rbx] - add r14,rax - mov rax,r12 - adc rdx,0 - add r14,r15 - adc rdx,0 - mov r15,rdx - - mul QWORD PTR[rbx] - add r10,rax - mov rax,r12 - adc r10,rdx - - mul QWORD PTR[8+rbx] - add r13,rax - mov rax,r12 - adc rdx,0 - add r13,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[16+rbx] - mov r11,r13 - imul r13,rcx - add r14,rax - mov rax,r12 - adc rdx,0 - add r14,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[24+rbx] - add r15,rax - mov rax,r13 - adc rdx,0 - add r15,r10 - adc rdx,0 - mov r10,rdx - - mul QWORD PTR[rbx] - add r11,rax - mov rax,r13 - adc r11,rdx - - mul QWORD PTR[8+rbx] - add r14,rax - mov rax,r13 - adc rdx,0 - add r14,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[16+rbx] - add r15,rax - mov rax,r13 - adc rdx,0 - add r15,r11 - adc rdx,0 - mov r11,rdx - - mul QWORD PTR[24+rbx] - add r10,rax - mov rax,r14 - adc rdx,0 - add r10,r11 - adc rdx,0 - mov r11,rdx - DB 0F3h,0C3h ;repret -__mulx_by_1_mont_256 ENDP -.text$ ENDS -.pdata SEGMENT READONLY ALIGN(4) -ALIGN 4 - DD imagerel $L$SEH_begin_mulx_mont_sparse_256 - DD imagerel $L$SEH_body_mulx_mont_sparse_256 - DD imagerel $L$SEH_info_mulx_mont_sparse_256_prologue - - DD imagerel $L$SEH_body_mulx_mont_sparse_256 - DD imagerel $L$SEH_epilogue_mulx_mont_sparse_256 - DD imagerel $L$SEH_info_mulx_mont_sparse_256_body - - DD imagerel $L$SEH_epilogue_mulx_mont_sparse_256 - DD imagerel $L$SEH_end_mulx_mont_sparse_256 - DD imagerel $L$SEH_info_mulx_mont_sparse_256_epilogue - - DD imagerel $L$SEH_begin_sqrx_mont_sparse_256 - DD imagerel $L$SEH_body_sqrx_mont_sparse_256 - DD imagerel $L$SEH_info_sqrx_mont_sparse_256_prologue - - DD imagerel $L$SEH_body_sqrx_mont_sparse_256 - DD imagerel $L$SEH_epilogue_sqrx_mont_sparse_256 - DD imagerel $L$SEH_info_sqrx_mont_sparse_256_body - - DD imagerel $L$SEH_epilogue_sqrx_mont_sparse_256 - DD imagerel $L$SEH_end_sqrx_mont_sparse_256 - DD imagerel $L$SEH_info_sqrx_mont_sparse_256_epilogue - - DD imagerel $L$SEH_begin_fromx_mont_256 - DD imagerel $L$SEH_body_fromx_mont_256 - DD imagerel $L$SEH_info_fromx_mont_256_prologue - - DD imagerel $L$SEH_body_fromx_mont_256 - DD imagerel $L$SEH_epilogue_fromx_mont_256 - DD imagerel $L$SEH_info_fromx_mont_256_body - - DD imagerel $L$SEH_epilogue_fromx_mont_256 - 
DD imagerel $L$SEH_end_fromx_mont_256 - DD imagerel $L$SEH_info_fromx_mont_256_epilogue - - DD imagerel $L$SEH_begin_redcx_mont_256 - DD imagerel $L$SEH_body_redcx_mont_256 - DD imagerel $L$SEH_info_redcx_mont_256_prologue - - DD imagerel $L$SEH_body_redcx_mont_256 - DD imagerel $L$SEH_epilogue_redcx_mont_256 - DD imagerel $L$SEH_info_redcx_mont_256_body - - DD imagerel $L$SEH_epilogue_redcx_mont_256 - DD imagerel $L$SEH_end_redcx_mont_256 - DD imagerel $L$SEH_info_redcx_mont_256_epilogue - -.pdata ENDS -.xdata SEGMENT READONLY ALIGN(8) -ALIGN 8 -$L$SEH_info_mulx_mont_sparse_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_mulx_mont_sparse_256_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_mulx_mont_sparse_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sqrx_mont_sparse_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sqrx_mont_sparse_256_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sqrx_mont_sparse_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_fromx_mont_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_fromx_mont_256_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_fromx_mont_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_redcx_mont_256_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_redcx_mont_256_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_redcx_mont_256_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - - -.xdata ENDS -END diff --git a/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm b/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm deleted file mode 100644 index 4dc41b04098..00000000000 --- a/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm +++ /dev/null @@ -1,3644 +0,0 @@ -OPTION DOTNAME -PUBLIC mul_mont_384x$1 -PUBLIC sqr_mont_384x$1 -PUBLIC mul_382x$1 -PUBLIC sqr_382x$1 -PUBLIC mul_384$1 -PUBLIC sqr_384$1 -PUBLIC redc_mont_384$1 -PUBLIC from_mont_384$1 -PUBLIC sgn0_pty_mont_384$1 -PUBLIC sgn0_pty_mont_384x$1 -PUBLIC mul_mont_384$1 -PUBLIC sqr_mont_384$1 -PUBLIC sqr_n_mul_mont_384$1 -PUBLIC sqr_n_mul_mont_383$1 -PUBLIC sqr_mont_382x$1 -.text$ SEGMENT ALIGN(256) 'CODE' - - - - - - - - -ALIGN 32 -__subx_mod_384x384 PROC PRIVATE - DB 
243,15,30,250 - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - mov r14,QWORD PTR[48+rsi] - - sub r8,QWORD PTR[rdx] - mov r15,QWORD PTR[56+rsi] - sbb r9,QWORD PTR[8+rdx] - mov rax,QWORD PTR[64+rsi] - sbb r10,QWORD PTR[16+rdx] - mov rbx,QWORD PTR[72+rsi] - sbb r11,QWORD PTR[24+rdx] - mov rbp,QWORD PTR[80+rsi] - sbb r12,QWORD PTR[32+rdx] - mov rsi,QWORD PTR[88+rsi] - sbb r13,QWORD PTR[40+rdx] - mov QWORD PTR[rdi],r8 - sbb r14,QWORD PTR[48+rdx] - mov r8,QWORD PTR[rcx] - mov QWORD PTR[8+rdi],r9 - sbb r15,QWORD PTR[56+rdx] - mov r9,QWORD PTR[8+rcx] - mov QWORD PTR[16+rdi],r10 - sbb rax,QWORD PTR[64+rdx] - mov r10,QWORD PTR[16+rcx] - mov QWORD PTR[24+rdi],r11 - sbb rbx,QWORD PTR[72+rdx] - mov r11,QWORD PTR[24+rcx] - mov QWORD PTR[32+rdi],r12 - sbb rbp,QWORD PTR[80+rdx] - mov r12,QWORD PTR[32+rcx] - mov QWORD PTR[40+rdi],r13 - sbb rsi,QWORD PTR[88+rdx] - mov r13,QWORD PTR[40+rcx] - sbb rdx,rdx - - and r8,rdx - and r9,rdx - and r10,rdx - and r11,rdx - and r12,rdx - and r13,rdx - - add r14,r8 - adc r15,r9 - mov QWORD PTR[48+rdi],r14 - adc rax,r10 - mov QWORD PTR[56+rdi],r15 - adc rbx,r11 - mov QWORD PTR[64+rdi],rax - adc rbp,r12 - mov QWORD PTR[72+rdi],rbx - adc rsi,r13 - mov QWORD PTR[80+rdi],rbp - mov QWORD PTR[88+rdi],rsi - - DB 0F3h,0C3h ;repret -__subx_mod_384x384 ENDP - - -ALIGN 32 -__addx_mod_384 PROC PRIVATE - DB 243,15,30,250 - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - add r8,QWORD PTR[rdx] - adc r9,QWORD PTR[8+rdx] - adc r10,QWORD PTR[16+rdx] - mov r14,r8 - adc r11,QWORD PTR[24+rdx] - mov r15,r9 - adc r12,QWORD PTR[32+rdx] - mov rax,r10 - adc r13,QWORD PTR[40+rdx] - mov rbx,r11 - sbb rdx,rdx - - sub r8,QWORD PTR[rcx] - sbb r9,QWORD PTR[8+rcx] - mov rbp,r12 - sbb r10,QWORD PTR[16+rcx] - sbb r11,QWORD PTR[24+rcx] - sbb r12,QWORD PTR[32+rcx] - mov rsi,r13 - sbb r13,QWORD PTR[40+rcx] - sbb rdx,0 - - cmovc r8,r14 - cmovc r9,r15 - cmovc r10,rax - mov QWORD PTR[rdi],r8 - cmovc r11,rbx - mov QWORD PTR[8+rdi],r9 - cmovc r12,rbp - mov QWORD PTR[16+rdi],r10 - cmovc r13,rsi - mov QWORD PTR[24+rdi],r11 - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - - DB 0F3h,0C3h ;repret -__addx_mod_384 ENDP - - -ALIGN 32 -__subx_mod_384 PROC PRIVATE - DB 243,15,30,250 - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - -__subx_mod_384_a_is_loaded:: - sub r8,QWORD PTR[rdx] - mov r14,QWORD PTR[rcx] - sbb r9,QWORD PTR[8+rdx] - mov r15,QWORD PTR[8+rcx] - sbb r10,QWORD PTR[16+rdx] - mov rax,QWORD PTR[16+rcx] - sbb r11,QWORD PTR[24+rdx] - mov rbx,QWORD PTR[24+rcx] - sbb r12,QWORD PTR[32+rdx] - mov rbp,QWORD PTR[32+rcx] - sbb r13,QWORD PTR[40+rdx] - mov rsi,QWORD PTR[40+rcx] - sbb rdx,rdx - - and r14,rdx - and r15,rdx - and rax,rdx - and rbx,rdx - and rbp,rdx - and rsi,rdx - - add r8,r14 - adc r9,r15 - mov QWORD PTR[rdi],r8 - adc r10,rax - mov QWORD PTR[8+rdi],r9 - adc r11,rbx - mov QWORD PTR[16+rdi],r10 - adc r12,rbp - mov QWORD PTR[24+rdi],r11 - adc r13,rsi - mov QWORD PTR[32+rdi],r12 - mov QWORD PTR[40+rdi],r13 - - DB 0F3h,0C3h ;repret -__subx_mod_384 ENDP -PUBLIC mulx_mont_384x - - -ALIGN 32 -mulx_mont_384x PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_mulx_mont_384x:: - - - mov 
rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD PTR[40+rsp] -mul_mont_384x$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,328 - -$L$SEH_body_mulx_mont_384x:: - - - mov rbx,rdx - mov QWORD PTR[32+rsp],rdi - mov QWORD PTR[24+rsp],rsi - mov QWORD PTR[16+rsp],rdx - mov QWORD PTR[8+rsp],rcx - mov QWORD PTR[rsp],r8 - - - - - lea rdi,QWORD PTR[40+rsp] - call __mulx_384 - - - lea rbx,QWORD PTR[48+rbx] - lea rsi,QWORD PTR[((128+48))+rsi] - lea rdi,QWORD PTR[96+rdi] - call __mulx_384 - - - mov rcx,QWORD PTR[8+rsp] - lea rsi,QWORD PTR[rbx] - lea rdx,QWORD PTR[((-48))+rbx] - lea rdi,QWORD PTR[((40+192+48))+rsp] - call __addx_mod_384 - - mov rsi,QWORD PTR[24+rsp] - lea rdx,QWORD PTR[48+rsi] - lea rdi,QWORD PTR[((-48))+rdi] - call __addx_mod_384 - - lea rbx,QWORD PTR[rdi] - lea rsi,QWORD PTR[48+rdi] - call __mulx_384 - - - lea rsi,QWORD PTR[rdi] - lea rdx,QWORD PTR[40+rsp] - mov rcx,QWORD PTR[8+rsp] - call __subx_mod_384x384 - - lea rsi,QWORD PTR[rdi] - lea rdx,QWORD PTR[((-96))+rdi] - call __subx_mod_384x384 - - - lea rsi,QWORD PTR[40+rsp] - lea rdx,QWORD PTR[((40+96))+rsp] - lea rdi,QWORD PTR[40+rsp] - call __subx_mod_384x384 - - lea rbx,QWORD PTR[rcx] - - - lea rsi,QWORD PTR[40+rsp] - mov rcx,QWORD PTR[rsp] - mov rdi,QWORD PTR[32+rsp] - call __mulx_by_1_mont_384 - call __redx_tail_mont_384 - - - lea rsi,QWORD PTR[((40+192))+rsp] - mov rcx,QWORD PTR[rsp] - lea rdi,QWORD PTR[48+rdi] - call __mulx_by_1_mont_384 - call __redx_tail_mont_384 - - lea r8,QWORD PTR[328+rsp] - mov r15,QWORD PTR[r8] - - mov r14,QWORD PTR[8+r8] - - mov r13,QWORD PTR[16+r8] - - mov r12,QWORD PTR[24+r8] - - mov rbx,QWORD PTR[32+r8] - - mov rbp,QWORD PTR[40+r8] - - lea rsp,QWORD PTR[48+r8] - -$L$SEH_epilogue_mulx_mont_384x:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_mulx_mont_384x:: -mulx_mont_384x ENDP -PUBLIC sqrx_mont_384x - - -ALIGN 32 -sqrx_mont_384x PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sqrx_mont_384x:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -sqr_mont_384x$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,136 - -$L$SEH_body_sqrx_mont_384x:: - - - mov QWORD PTR[rsp],rcx - mov rcx,rdx - - mov QWORD PTR[16+rsp],rdi - mov QWORD PTR[24+rsp],rsi - - - lea rdx,QWORD PTR[48+rsi] - lea rdi,QWORD PTR[32+rsp] - call __addx_mod_384 - - - mov rsi,QWORD PTR[24+rsp] - lea rdx,QWORD PTR[48+rsi] - lea rdi,QWORD PTR[((32+48))+rsp] - call __subx_mod_384 - - - mov rsi,QWORD PTR[24+rsp] - lea rbx,QWORD PTR[48+rsi] - - mov rdx,QWORD PTR[48+rsi] - mov r14,QWORD PTR[rsi] - mov r15,QWORD PTR[8+rsi] - mov rax,QWORD PTR[16+rsi] - mov r12,QWORD PTR[24+rsi] - mov rdi,QWORD PTR[32+rsi] - mov rbp,QWORD PTR[40+rsi] - lea rsi,QWORD PTR[((-128))+rsi] - lea rcx,QWORD PTR[((-128))+rcx] - - mulx r9,r8,r14 - call __mulx_mont_384 - add rdx,rdx - adc r15,r15 - adc rax,rax - mov r8,rdx - adc r12,r12 - mov r9,r15 - adc rdi,rdi - mov r10,rax - adc rbp,rbp - mov r11,r12 - sbb rsi,rsi - - sub rdx,QWORD PTR[rcx] - sbb r15,QWORD PTR[8+rcx] - mov r13,rdi - sbb rax,QWORD PTR[16+rcx] - sbb r12,QWORD PTR[24+rcx] - sbb rdi,QWORD PTR[32+rcx] - mov r14,rbp - sbb rbp,QWORD PTR[40+rcx] - sbb rsi,0 - - cmovc rdx,r8 - cmovc r15,r9 - cmovc rax,r10 - mov QWORD PTR[48+rbx],rdx - cmovc r12,r11 - mov QWORD PTR[56+rbx],r15 - cmovc rdi,r13 - mov QWORD PTR[64+rbx],rax - cmovc rbp,r14 - mov QWORD PTR[72+rbx],r12 - 
mov QWORD PTR[80+rbx],rdi - mov QWORD PTR[88+rbx],rbp - - lea rsi,QWORD PTR[32+rsp] - lea rbx,QWORD PTR[((32+48))+rsp] - - mov rdx,QWORD PTR[((32+48))+rsp] - mov r14,QWORD PTR[((32+0))+rsp] - mov r15,QWORD PTR[((32+8))+rsp] - mov rax,QWORD PTR[((32+16))+rsp] - mov r12,QWORD PTR[((32+24))+rsp] - mov rdi,QWORD PTR[((32+32))+rsp] - mov rbp,QWORD PTR[((32+40))+rsp] - lea rsi,QWORD PTR[((-128))+rsi] - lea rcx,QWORD PTR[((-128))+rcx] - - mulx r9,r8,r14 - call __mulx_mont_384 - - lea r8,QWORD PTR[136+rsp] - mov r15,QWORD PTR[r8] - - mov r14,QWORD PTR[8+r8] - - mov r13,QWORD PTR[16+r8] - - mov r12,QWORD PTR[24+r8] - - mov rbx,QWORD PTR[32+r8] - - mov rbp,QWORD PTR[40+r8] - - lea rsp,QWORD PTR[48+r8] - -$L$SEH_epilogue_sqrx_mont_384x:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sqrx_mont_384x:: -sqrx_mont_384x ENDP - -PUBLIC mulx_382x - - -ALIGN 32 -mulx_382x PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_mulx_382x:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -mul_382x$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,136 - -$L$SEH_body_mulx_382x:: - - - lea rdi,QWORD PTR[96+rdi] - mov QWORD PTR[rsp],rsi - mov QWORD PTR[8+rsp],rdx - mov QWORD PTR[16+rsp],rdi - mov QWORD PTR[24+rsp],rcx - - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - add r8,QWORD PTR[48+rsi] - adc r9,QWORD PTR[56+rsi] - adc r10,QWORD PTR[64+rsi] - adc r11,QWORD PTR[72+rsi] - adc r12,QWORD PTR[80+rsi] - adc r13,QWORD PTR[88+rsi] - - mov QWORD PTR[((32+0))+rsp],r8 - mov QWORD PTR[((32+8))+rsp],r9 - mov QWORD PTR[((32+16))+rsp],r10 - mov QWORD PTR[((32+24))+rsp],r11 - mov QWORD PTR[((32+32))+rsp],r12 - mov QWORD PTR[((32+40))+rsp],r13 - - - mov r8,QWORD PTR[rdx] - mov r9,QWORD PTR[8+rdx] - mov r10,QWORD PTR[16+rdx] - mov r11,QWORD PTR[24+rdx] - mov r12,QWORD PTR[32+rdx] - mov r13,QWORD PTR[40+rdx] - - add r8,QWORD PTR[48+rdx] - adc r9,QWORD PTR[56+rdx] - adc r10,QWORD PTR[64+rdx] - adc r11,QWORD PTR[72+rdx] - adc r12,QWORD PTR[80+rdx] - adc r13,QWORD PTR[88+rdx] - - mov QWORD PTR[((32+48))+rsp],r8 - mov QWORD PTR[((32+56))+rsp],r9 - mov QWORD PTR[((32+64))+rsp],r10 - mov QWORD PTR[((32+72))+rsp],r11 - mov QWORD PTR[((32+80))+rsp],r12 - mov QWORD PTR[((32+88))+rsp],r13 - - - lea rsi,QWORD PTR[((32+0))+rsp] - lea rbx,QWORD PTR[((32+48))+rsp] - call __mulx_384 - - - mov rsi,QWORD PTR[rsp] - mov rbx,QWORD PTR[8+rsp] - lea rdi,QWORD PTR[((-96))+rdi] - call __mulx_384 - - - lea rsi,QWORD PTR[((48+128))+rsi] - lea rbx,QWORD PTR[48+rbx] - lea rdi,QWORD PTR[32+rsp] - call __mulx_384 - - - mov rsi,QWORD PTR[16+rsp] - lea rdx,QWORD PTR[32+rsp] - mov rcx,QWORD PTR[24+rsp] - mov rdi,rsi - call __subx_mod_384x384 - - - lea rsi,QWORD PTR[rdi] - lea rdx,QWORD PTR[((-96))+rdi] - call __subx_mod_384x384 - - - lea rsi,QWORD PTR[((-96))+rdi] - lea rdx,QWORD PTR[32+rsp] - lea rdi,QWORD PTR[((-96))+rdi] - call __subx_mod_384x384 - - lea r8,QWORD PTR[136+rsp] - mov r15,QWORD PTR[r8] - - mov r14,QWORD PTR[8+r8] - - mov r13,QWORD PTR[16+r8] - - mov r12,QWORD PTR[24+r8] - - mov rbx,QWORD PTR[32+r8] - - mov rbp,QWORD PTR[40+r8] - - lea rsp,QWORD PTR[48+r8] - -$L$SEH_epilogue_mulx_382x:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_mulx_382x:: -mulx_382x ENDP -PUBLIC sqrx_382x 
- - -ALIGN 32 -sqrx_382x PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sqrx_382x:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 -sqr_382x$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - push rsi - -$L$SEH_body_sqrx_382x:: - - - mov rcx,rdx - - - mov r14,QWORD PTR[rsi] - mov r15,QWORD PTR[8+rsi] - mov rax,QWORD PTR[16+rsi] - mov rbx,QWORD PTR[24+rsi] - mov rbp,QWORD PTR[32+rsi] - mov rdx,QWORD PTR[40+rsi] - - mov r8,r14 - add r14,QWORD PTR[48+rsi] - mov r9,r15 - adc r15,QWORD PTR[56+rsi] - mov r10,rax - adc rax,QWORD PTR[64+rsi] - mov r11,rbx - adc rbx,QWORD PTR[72+rsi] - mov r12,rbp - adc rbp,QWORD PTR[80+rsi] - mov r13,rdx - adc rdx,QWORD PTR[88+rsi] - - mov QWORD PTR[rdi],r14 - mov QWORD PTR[8+rdi],r15 - mov QWORD PTR[16+rdi],rax - mov QWORD PTR[24+rdi],rbx - mov QWORD PTR[32+rdi],rbp - mov QWORD PTR[40+rdi],rdx - - - lea rdx,QWORD PTR[48+rsi] - lea rdi,QWORD PTR[48+rdi] - call __subx_mod_384_a_is_loaded - - - lea rsi,QWORD PTR[rdi] - lea rbx,QWORD PTR[((-48))+rdi] - lea rdi,QWORD PTR[((-48))+rdi] - call __mulx_384 - - - mov rsi,QWORD PTR[rsp] - lea rbx,QWORD PTR[48+rsi] - lea rdi,QWORD PTR[96+rdi] - call __mulx_384 - - mov r8,QWORD PTR[rdi] - mov r9,QWORD PTR[8+rdi] - mov r10,QWORD PTR[16+rdi] - mov r11,QWORD PTR[24+rdi] - mov r12,QWORD PTR[32+rdi] - mov r13,QWORD PTR[40+rdi] - mov r14,QWORD PTR[48+rdi] - mov r15,QWORD PTR[56+rdi] - mov rax,QWORD PTR[64+rdi] - mov rbx,QWORD PTR[72+rdi] - mov rbp,QWORD PTR[80+rdi] - add r8,r8 - mov rdx,QWORD PTR[88+rdi] - adc r9,r9 - mov QWORD PTR[rdi],r8 - adc r10,r10 - mov QWORD PTR[8+rdi],r9 - adc r11,r11 - mov QWORD PTR[16+rdi],r10 - adc r12,r12 - mov QWORD PTR[24+rdi],r11 - adc r13,r13 - mov QWORD PTR[32+rdi],r12 - adc r14,r14 - mov QWORD PTR[40+rdi],r13 - adc r15,r15 - mov QWORD PTR[48+rdi],r14 - adc rax,rax - mov QWORD PTR[56+rdi],r15 - adc rbx,rbx - mov QWORD PTR[64+rdi],rax - adc rbp,rbp - mov QWORD PTR[72+rdi],rbx - adc rdx,rdx - mov QWORD PTR[80+rdi],rbp - mov QWORD PTR[88+rdi],rdx - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_sqrx_382x:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sqrx_382x:: -sqrx_382x ENDP -PUBLIC mulx_384 - - -ALIGN 32 -mulx_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_mulx_384:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 -mul_384$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - -$L$SEH_body_mulx_384:: - - - mov rbx,rdx - call __mulx_384 - - mov r15,QWORD PTR[rsp] - - mov r14,QWORD PTR[8+rsp] - - mov r13,QWORD PTR[16+rsp] - - mov r12,QWORD PTR[24+rsp] - - mov rbx,QWORD PTR[32+rsp] - - mov rbp,QWORD PTR[40+rsp] - - lea rsp,QWORD PTR[48+rsp] - -$L$SEH_epilogue_mulx_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_mulx_384:: -mulx_384 ENDP - - -ALIGN 32 -__mulx_384 PROC PRIVATE - DB 243,15,30,250 - - mov rdx,QWORD PTR[rbx] - mov r14,QWORD PTR[rsi] - mov r15,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - lea rsi,QWORD PTR[((-128))+rsi] - - mulx rcx,r9,r14 - xor rbp,rbp - - mulx rax,r8,r15 - adcx 
r8,rcx - mov QWORD PTR[rdi],r9 - - mulx rcx,r9,r10 - adcx r9,rax - - mulx rax,r10,r11 - adcx r10,rcx - - mulx rcx,r11,r12 - adcx r11,rax - - mulx r13,r12,r13 - mov rdx,QWORD PTR[8+rbx] - adcx r12,rcx - adcx r13,rbp - mulx rcx,rax,r14 - adcx rax,r8 - adox r9,rcx - mov QWORD PTR[8+rdi],rax - - mulx rcx,r8,r15 - adcx r8,r9 - adox r10,rcx - - mulx rax,r9,QWORD PTR[((128+16))+rsi] - adcx r9,r10 - adox r11,rax - - mulx rcx,r10,QWORD PTR[((128+24))+rsi] - adcx r10,r11 - adox r12,rcx - - mulx rax,r11,QWORD PTR[((128+32))+rsi] - adcx r11,r12 - adox rax,r13 - - mulx r13,r12,QWORD PTR[((128+40))+rsi] - mov rdx,QWORD PTR[16+rbx] - adcx r12,rax - adox r13,rbp - adcx r13,rbp - mulx rcx,rax,r14 - adcx rax,r8 - adox r9,rcx - mov QWORD PTR[16+rdi],rax - - mulx rcx,r8,r15 - adcx r8,r9 - adox r10,rcx - - mulx rax,r9,QWORD PTR[((128+16))+rsi] - adcx r9,r10 - adox r11,rax - - mulx rcx,r10,QWORD PTR[((128+24))+rsi] - adcx r10,r11 - adox r12,rcx - - mulx rax,r11,QWORD PTR[((128+32))+rsi] - adcx r11,r12 - adox rax,r13 - - mulx r13,r12,QWORD PTR[((128+40))+rsi] - mov rdx,QWORD PTR[24+rbx] - adcx r12,rax - adox r13,rbp - adcx r13,rbp - mulx rcx,rax,r14 - adcx rax,r8 - adox r9,rcx - mov QWORD PTR[24+rdi],rax - - mulx rcx,r8,r15 - adcx r8,r9 - adox r10,rcx - - mulx rax,r9,QWORD PTR[((128+16))+rsi] - adcx r9,r10 - adox r11,rax - - mulx rcx,r10,QWORD PTR[((128+24))+rsi] - adcx r10,r11 - adox r12,rcx - - mulx rax,r11,QWORD PTR[((128+32))+rsi] - adcx r11,r12 - adox rax,r13 - - mulx r13,r12,QWORD PTR[((128+40))+rsi] - mov rdx,QWORD PTR[32+rbx] - adcx r12,rax - adox r13,rbp - adcx r13,rbp - mulx rcx,rax,r14 - adcx rax,r8 - adox r9,rcx - mov QWORD PTR[32+rdi],rax - - mulx rcx,r8,r15 - adcx r8,r9 - adox r10,rcx - - mulx rax,r9,QWORD PTR[((128+16))+rsi] - adcx r9,r10 - adox r11,rax - - mulx rcx,r10,QWORD PTR[((128+24))+rsi] - adcx r10,r11 - adox r12,rcx - - mulx rax,r11,QWORD PTR[((128+32))+rsi] - adcx r11,r12 - adox rax,r13 - - mulx r13,r12,QWORD PTR[((128+40))+rsi] - mov rdx,QWORD PTR[40+rbx] - adcx r12,rax - adox r13,rbp - adcx r13,rbp - mulx rcx,rax,r14 - adcx rax,r8 - adox r9,rcx - mov QWORD PTR[40+rdi],rax - - mulx rcx,r8,r15 - adcx r8,r9 - adox r10,rcx - - mulx rax,r9,QWORD PTR[((128+16))+rsi] - adcx r9,r10 - adox r11,rax - - mulx rcx,r10,QWORD PTR[((128+24))+rsi] - adcx r10,r11 - adox r12,rcx - - mulx rax,r11,QWORD PTR[((128+32))+rsi] - adcx r11,r12 - adox rax,r13 - - mulx r13,r12,QWORD PTR[((128+40))+rsi] - mov rdx,rax - adcx r12,rax - adox r13,rbp - adcx r13,rbp - mov QWORD PTR[48+rdi],r8 - mov QWORD PTR[56+rdi],r9 - mov QWORD PTR[64+rdi],r10 - mov QWORD PTR[72+rdi],r11 - mov QWORD PTR[80+rdi],r12 - mov QWORD PTR[88+rdi],r13 - - DB 0F3h,0C3h ;repret -__mulx_384 ENDP -PUBLIC sqrx_384 - - -ALIGN 32 -sqrx_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sqrx_384:: - - - mov rdi,rcx - mov rsi,rdx -sqr_384$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - push rdi - -$L$SEH_body_sqrx_384:: - - - call __sqrx_384 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_sqrx_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sqrx_384:: -sqrx_384 ENDP - -ALIGN 32 -__sqrx_384 PROC PRIVATE - DB 243,15,30,250 - - mov rdx,QWORD PTR[rsi] - mov r14,QWORD PTR[8+rsi] - mov r15,QWORD 
PTR[16+rsi] - mov rcx,QWORD PTR[24+rsi] - mov rbx,QWORD PTR[32+rsi] - - - mulx rdi,r8,r14 - mov rbp,QWORD PTR[40+rsi] - mulx rax,r9,r15 - add r9,rdi - mulx rdi,r10,rcx - adc r10,rax - mulx rax,r11,rbx - adc r11,rdi - mulx r13,r12,rbp - mov rdx,r14 - adc r12,rax - adc r13,0 - - - xor r14,r14 - mulx rax,rdi,r15 - adcx r10,rdi - adox r11,rax - - mulx rax,rdi,rcx - adcx r11,rdi - adox r12,rax - - mulx rax,rdi,rbx - adcx r12,rdi - adox r13,rax - - mulx rax,rdi,rbp - mov rdx,r15 - adcx r13,rdi - adox rax,r14 - adcx r14,rax - - - xor r15,r15 - mulx rax,rdi,rcx - adcx r12,rdi - adox r13,rax - - mulx rax,rdi,rbx - adcx r13,rdi - adox r14,rax - - mulx rax,rdi,rbp - mov rdx,rcx - adcx r14,rdi - adox rax,r15 - adcx r15,rax - - - xor rcx,rcx - mulx rax,rdi,rbx - adcx r14,rdi - adox r15,rax - - mulx rax,rdi,rbp - mov rdx,rbx - adcx r15,rdi - adox rax,rcx - adcx rcx,rax - - - mulx rbx,rdi,rbp - mov rdx,QWORD PTR[rsi] - add rcx,rdi - mov rdi,QWORD PTR[8+rsp] - adc rbx,0 - - - xor rbp,rbp - adcx r8,r8 - adcx r9,r9 - adcx r10,r10 - adcx r11,r11 - adcx r12,r12 - - - mulx rax,rdx,rdx - mov QWORD PTR[rdi],rdx - mov rdx,QWORD PTR[8+rsi] - adox r8,rax - mov QWORD PTR[8+rdi],r8 - - mulx rax,r8,rdx - mov rdx,QWORD PTR[16+rsi] - adox r9,r8 - adox r10,rax - mov QWORD PTR[16+rdi],r9 - mov QWORD PTR[24+rdi],r10 - - mulx r9,r8,rdx - mov rdx,QWORD PTR[24+rsi] - adox r11,r8 - adox r12,r9 - adcx r13,r13 - adcx r14,r14 - mov QWORD PTR[32+rdi],r11 - mov QWORD PTR[40+rdi],r12 - - mulx r9,r8,rdx - mov rdx,QWORD PTR[32+rsi] - adox r13,r8 - adox r14,r9 - adcx r15,r15 - adcx rcx,rcx - mov QWORD PTR[48+rdi],r13 - mov QWORD PTR[56+rdi],r14 - - mulx r9,r8,rdx - mov rdx,QWORD PTR[40+rsi] - adox r15,r8 - adox rcx,r9 - adcx rbx,rbx - adcx rbp,rbp - mov QWORD PTR[64+rdi],r15 - mov QWORD PTR[72+rdi],rcx - - mulx r9,r8,rdx - adox rbx,r8 - adox rbp,r9 - - mov QWORD PTR[80+rdi],rbx - mov QWORD PTR[88+rdi],rbp - - DB 0F3h,0C3h ;repret -__sqrx_384 ENDP - - - -PUBLIC redcx_mont_384 - - -ALIGN 32 -redcx_mont_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_redcx_mont_384:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -redc_mont_384$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_redcx_mont_384:: - - - mov rbx,rdx - call __mulx_by_1_mont_384 - call __redx_tail_mont_384 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_redcx_mont_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_redcx_mont_384:: -redcx_mont_384 ENDP - - - - -PUBLIC fromx_mont_384 - - -ALIGN 32 -fromx_mont_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_fromx_mont_384:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -from_mont_384$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_fromx_mont_384:: - - - mov rbx,rdx - call __mulx_by_1_mont_384 - - - - - mov rax,r14 - mov rcx,r15 - mov rdx,r8 - mov rbp,r9 - - sub r14,QWORD PTR[rbx] - sbb r15,QWORD PTR[8+rbx] - mov r13,r10 - sbb r8,QWORD PTR[16+rbx] - sbb r9,QWORD PTR[24+rbx] - sbb r10,QWORD PTR[32+rbx] - mov rsi,r11 - sbb r11,QWORD PTR[40+rbx] - - cmovc r14,rax - cmovc r15,rcx - cmovc 
r8,rdx - mov QWORD PTR[rdi],r14 - cmovc r9,rbp - mov QWORD PTR[8+rdi],r15 - cmovc r10,r13 - mov QWORD PTR[16+rdi],r8 - cmovc r11,rsi - mov QWORD PTR[24+rdi],r9 - mov QWORD PTR[32+rdi],r10 - mov QWORD PTR[40+rdi],r11 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_fromx_mont_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_fromx_mont_384:: -fromx_mont_384 ENDP - -ALIGN 32 -__mulx_by_1_mont_384 PROC PRIVATE - DB 243,15,30,250 - - mov r8,QWORD PTR[rsi] - mov rdx,rcx - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - imul rdx,r8 - - - xor r14,r14 - mulx rbp,rax,QWORD PTR[rbx] - adcx r8,rax - adox r9,rbp - - mulx rbp,rax,QWORD PTR[8+rbx] - adcx r9,rax - adox r10,rbp - - mulx rbp,rax,QWORD PTR[16+rbx] - adcx r10,rax - adox r11,rbp - - mulx rbp,rax,QWORD PTR[24+rbx] - adcx r11,rax - adox r12,rbp - - mulx rbp,rax,QWORD PTR[32+rbx] - adcx r12,rax - adox r13,rbp - - mulx rbp,rax,QWORD PTR[40+rbx] - mov rdx,rcx - adcx r13,rax - adox rbp,r14 - adcx r14,rbp - imul rdx,r9 - - - xor r15,r15 - mulx rbp,rax,QWORD PTR[rbx] - adcx r9,rax - adox r10,rbp - - mulx rbp,rax,QWORD PTR[8+rbx] - adcx r10,rax - adox r11,rbp - - mulx rbp,rax,QWORD PTR[16+rbx] - adcx r11,rax - adox r12,rbp - - mulx rbp,rax,QWORD PTR[24+rbx] - adcx r12,rax - adox r13,rbp - - mulx rbp,rax,QWORD PTR[32+rbx] - adcx r13,rax - adox r14,rbp - - mulx rbp,rax,QWORD PTR[40+rbx] - mov rdx,rcx - adcx r14,rax - adox rbp,r15 - adcx r15,rbp - imul rdx,r10 - - - xor r8,r8 - mulx rbp,rax,QWORD PTR[rbx] - adcx r10,rax - adox r11,rbp - - mulx rbp,rax,QWORD PTR[8+rbx] - adcx r11,rax - adox r12,rbp - - mulx rbp,rax,QWORD PTR[16+rbx] - adcx r12,rax - adox r13,rbp - - mulx rbp,rax,QWORD PTR[24+rbx] - adcx r13,rax - adox r14,rbp - - mulx rbp,rax,QWORD PTR[32+rbx] - adcx r14,rax - adox r15,rbp - - mulx rbp,rax,QWORD PTR[40+rbx] - mov rdx,rcx - adcx r15,rax - adox rbp,r8 - adcx r8,rbp - imul rdx,r11 - - - xor r9,r9 - mulx rbp,rax,QWORD PTR[rbx] - adcx r11,rax - adox r12,rbp - - mulx rbp,rax,QWORD PTR[8+rbx] - adcx r12,rax - adox r13,rbp - - mulx rbp,rax,QWORD PTR[16+rbx] - adcx r13,rax - adox r14,rbp - - mulx rbp,rax,QWORD PTR[24+rbx] - adcx r14,rax - adox r15,rbp - - mulx rbp,rax,QWORD PTR[32+rbx] - adcx r15,rax - adox r8,rbp - - mulx rbp,rax,QWORD PTR[40+rbx] - mov rdx,rcx - adcx r8,rax - adox rbp,r9 - adcx r9,rbp - imul rdx,r12 - - - xor r10,r10 - mulx rbp,rax,QWORD PTR[rbx] - adcx r12,rax - adox r13,rbp - - mulx rbp,rax,QWORD PTR[8+rbx] - adcx r13,rax - adox r14,rbp - - mulx rbp,rax,QWORD PTR[16+rbx] - adcx r14,rax - adox r15,rbp - - mulx rbp,rax,QWORD PTR[24+rbx] - adcx r15,rax - adox r8,rbp - - mulx rbp,rax,QWORD PTR[32+rbx] - adcx r8,rax - adox r9,rbp - - mulx rbp,rax,QWORD PTR[40+rbx] - mov rdx,rcx - adcx r9,rax - adox rbp,r10 - adcx r10,rbp - imul rdx,r13 - - - xor r11,r11 - mulx rbp,rax,QWORD PTR[rbx] - adcx r13,rax - adox r14,rbp - - mulx rbp,rax,QWORD PTR[8+rbx] - adcx r14,rax - adox r15,rbp - - mulx rbp,rax,QWORD PTR[16+rbx] - adcx r15,rax - adox r8,rbp - - mulx rbp,rax,QWORD PTR[24+rbx] - adcx r8,rax - adox r9,rbp - - mulx rbp,rax,QWORD PTR[32+rbx] - adcx r9,rax - adox r10,rbp - - mulx rbp,rax,QWORD PTR[40+rbx] - mov rdx,rcx - adcx r10,rax - adox rbp,r11 - adcx r11,rbp - DB 0F3h,0C3h ;repret -__mulx_by_1_mont_384 ENDP 
- - -ALIGN 32 -__redx_tail_mont_384 PROC PRIVATE - DB 243,15,30,250 - - add r14,QWORD PTR[48+rsi] - mov rax,r14 - adc r15,QWORD PTR[56+rsi] - adc r8,QWORD PTR[64+rsi] - adc r9,QWORD PTR[72+rsi] - mov rcx,r15 - adc r10,QWORD PTR[80+rsi] - adc r11,QWORD PTR[88+rsi] - sbb r12,r12 - - - - - mov rdx,r8 - mov rbp,r9 - - sub r14,QWORD PTR[rbx] - sbb r15,QWORD PTR[8+rbx] - mov r13,r10 - sbb r8,QWORD PTR[16+rbx] - sbb r9,QWORD PTR[24+rbx] - sbb r10,QWORD PTR[32+rbx] - mov rsi,r11 - sbb r11,QWORD PTR[40+rbx] - sbb r12,0 - - cmovc r14,rax - cmovc r15,rcx - cmovc r8,rdx - mov QWORD PTR[rdi],r14 - cmovc r9,rbp - mov QWORD PTR[8+rdi],r15 - cmovc r10,r13 - mov QWORD PTR[16+rdi],r8 - cmovc r11,rsi - mov QWORD PTR[24+rdi],r9 - mov QWORD PTR[32+rdi],r10 - mov QWORD PTR[40+rdi],r11 - - DB 0F3h,0C3h ;repret -__redx_tail_mont_384 ENDP - -PUBLIC sgn0x_pty_mont_384 - - -ALIGN 32 -sgn0x_pty_mont_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sgn0x_pty_mont_384:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 -sgn0_pty_mont_384$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_sgn0x_pty_mont_384:: - - - mov rbx,rsi - lea rsi,QWORD PTR[rdi] - mov rcx,rdx - call __mulx_by_1_mont_384 - - xor rax,rax - mov r13,r14 - add r14,r14 - adc r15,r15 - adc r8,r8 - adc r9,r9 - adc r10,r10 - adc r11,r11 - adc rax,0 - - sub r14,QWORD PTR[rbx] - sbb r15,QWORD PTR[8+rbx] - sbb r8,QWORD PTR[16+rbx] - sbb r9,QWORD PTR[24+rbx] - sbb r10,QWORD PTR[32+rbx] - sbb r11,QWORD PTR[40+rbx] - sbb rax,0 - - not rax - and r13,1 - and rax,2 - or rax,r13 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_sgn0x_pty_mont_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sgn0x_pty_mont_384:: -sgn0x_pty_mont_384 ENDP - -PUBLIC sgn0x_pty_mont_384x - - -ALIGN 32 -sgn0x_pty_mont_384x PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sgn0x_pty_mont_384x:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 -sgn0_pty_mont_384x$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,8 - -$L$SEH_body_sgn0x_pty_mont_384x:: - - - mov rbx,rsi - lea rsi,QWORD PTR[48+rdi] - mov rcx,rdx - call __mulx_by_1_mont_384 - - mov r12,r14 - or r14,r15 - or r14,r8 - or r14,r9 - or r14,r10 - or r14,r11 - - lea rsi,QWORD PTR[rdi] - xor rdi,rdi - mov r13,r12 - add r12,r12 - adc r15,r15 - adc r8,r8 - adc r9,r9 - adc r10,r10 - adc r11,r11 - adc rdi,0 - - sub r12,QWORD PTR[rbx] - sbb r15,QWORD PTR[8+rbx] - sbb r8,QWORD PTR[16+rbx] - sbb r9,QWORD PTR[24+rbx] - sbb r10,QWORD PTR[32+rbx] - sbb r11,QWORD PTR[40+rbx] - sbb rdi,0 - - mov QWORD PTR[rsp],r14 - not rdi - and r13,1 - and rdi,2 - or rdi,r13 - - call __mulx_by_1_mont_384 - - mov r12,r14 - or r14,r15 - or r14,r8 - or r14,r9 - or r14,r10 - or r14,r11 - - xor rax,rax - mov r13,r12 - add r12,r12 - adc r15,r15 - adc r8,r8 - adc r9,r9 - adc r10,r10 - adc r11,r11 - adc rax,0 - - sub r12,QWORD PTR[rbx] - sbb r15,QWORD PTR[8+rbx] - sbb r8,QWORD PTR[16+rbx] - sbb r9,QWORD PTR[24+rbx] - sbb r10,QWORD PTR[32+rbx] - sbb r11,QWORD PTR[40+rbx] - sbb rax,0 - - mov r12,QWORD PTR[rsp] - - not rax - - test r14,r14 - cmovz r13,rdi - - test r12,r12 - cmovnz 
rax,rdi - - and r13,1 - and rax,2 - or rax,r13 - - mov r15,QWORD PTR[8+rsp] - - mov r14,QWORD PTR[16+rsp] - - mov r13,QWORD PTR[24+rsp] - - mov r12,QWORD PTR[32+rsp] - - mov rbx,QWORD PTR[40+rsp] - - mov rbp,QWORD PTR[48+rsp] - - lea rsp,QWORD PTR[56+rsp] - -$L$SEH_epilogue_sgn0x_pty_mont_384x:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sgn0x_pty_mont_384x:: -sgn0x_pty_mont_384x ENDP -PUBLIC mulx_mont_384 - - -ALIGN 32 -mulx_mont_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_mulx_mont_384:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD PTR[40+rsp] -mul_mont_384$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - lea rsp,QWORD PTR[((-24))+rsp] - -$L$SEH_body_mulx_mont_384:: - - - mov rbx,rdx - mov rdx,QWORD PTR[rdx] - mov r14,QWORD PTR[rsi] - mov r15,QWORD PTR[8+rsi] - mov rax,QWORD PTR[16+rsi] - mov r12,QWORD PTR[24+rsi] - mov QWORD PTR[16+rsp],rdi - mov rdi,QWORD PTR[32+rsi] - mov rbp,QWORD PTR[40+rsi] - lea rsi,QWORD PTR[((-128))+rsi] - lea rcx,QWORD PTR[((-128))+rcx] - mov QWORD PTR[rsp],r8 - - mulx r9,r8,r14 - call __mulx_mont_384 - - mov r15,QWORD PTR[24+rsp] - - mov r14,QWORD PTR[32+rsp] - - mov r13,QWORD PTR[40+rsp] - - mov r12,QWORD PTR[48+rsp] - - mov rbx,QWORD PTR[56+rsp] - - mov rbp,QWORD PTR[64+rsp] - - lea rsp,QWORD PTR[72+rsp] - -$L$SEH_epilogue_mulx_mont_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_mulx_mont_384:: -mulx_mont_384 ENDP - -ALIGN 32 -__mulx_mont_384 PROC PRIVATE - DB 243,15,30,250 - - - mulx r10,r14,r15 - mulx r11,r15,rax - add r9,r14 - mulx r12,rax,r12 - adc r10,r15 - mulx r13,rdi,rdi - adc r11,rax - mulx r14,rbp,rbp - mov rdx,QWORD PTR[8+rbx] - adc r12,rdi - adc r13,rbp - adc r14,0 - xor r15,r15 - - mov QWORD PTR[16+rsp],r8 - imul r8,QWORD PTR[8+rsp] - - - xor rax,rax - mulx rbp,rdi,QWORD PTR[((0+128))+rsi] - adox r9,rdi - adcx r10,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rsi] - adox r10,rdi - adcx r11,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rsi] - adox r11,rdi - adcx r12,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rsi] - adox r12,rdi - adcx r13,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rsi] - adox r13,rdi - adcx r14,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rsi] - mov rdx,r8 - adox r14,rdi - adcx r15,rbp - adox r15,rax - adox rax,rax - - - xor r8,r8 - mulx rbp,rdi,QWORD PTR[((0+128))+rcx] - adcx rdi,QWORD PTR[16+rsp] - adox r9,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rcx] - adcx r9,rdi - adox r10,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rcx] - adcx r10,rdi - adox r11,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rcx] - adcx r11,rdi - adox r12,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rcx] - adcx r12,rdi - adox r13,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rcx] - mov rdx,QWORD PTR[16+rbx] - adcx r13,rdi - adox r14,rbp - adcx r14,r8 - adox r15,r8 - adcx r15,r8 - adox rax,r8 - adcx rax,r8 - mov QWORD PTR[16+rsp],r9 - imul r9,QWORD PTR[8+rsp] - - - xor r8,r8 - mulx rbp,rdi,QWORD PTR[((0+128))+rsi] - adox r10,rdi - adcx r11,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rsi] - adox r11,rdi - adcx r12,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rsi] - adox r12,rdi - adcx r13,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rsi] - adox r13,rdi - adcx r14,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rsi] - adox r14,rdi - adcx r15,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rsi] - mov rdx,r9 - adox r15,rdi - 
adcx rax,rbp - adox rax,r8 - adox r8,r8 - - - xor r9,r9 - mulx rbp,rdi,QWORD PTR[((0+128))+rcx] - adcx rdi,QWORD PTR[16+rsp] - adox r10,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rcx] - adcx r10,rdi - adox r11,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rcx] - adcx r11,rdi - adox r12,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rcx] - adcx r12,rdi - adox r13,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rcx] - adcx r13,rdi - adox r14,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rcx] - mov rdx,QWORD PTR[24+rbx] - adcx r14,rdi - adox r15,rbp - adcx r15,r9 - adox rax,r9 - adcx rax,r9 - adox r8,r9 - adcx r8,r9 - mov QWORD PTR[16+rsp],r10 - imul r10,QWORD PTR[8+rsp] - - - xor r9,r9 - mulx rbp,rdi,QWORD PTR[((0+128))+rsi] - adox r11,rdi - adcx r12,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rsi] - adox r12,rdi - adcx r13,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rsi] - adox r13,rdi - adcx r14,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rsi] - adox r14,rdi - adcx r15,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rsi] - adox r15,rdi - adcx rax,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rsi] - mov rdx,r10 - adox rax,rdi - adcx r8,rbp - adox r8,r9 - adox r9,r9 - - - xor r10,r10 - mulx rbp,rdi,QWORD PTR[((0+128))+rcx] - adcx rdi,QWORD PTR[16+rsp] - adox r11,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rcx] - adcx r11,rdi - adox r12,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rcx] - adcx r12,rdi - adox r13,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rcx] - adcx r13,rdi - adox r14,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rcx] - adcx r14,rdi - adox r15,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rcx] - mov rdx,QWORD PTR[32+rbx] - adcx r15,rdi - adox rax,rbp - adcx rax,r10 - adox r8,r10 - adcx r8,r10 - adox r9,r10 - adcx r9,r10 - mov QWORD PTR[16+rsp],r11 - imul r11,QWORD PTR[8+rsp] - - - xor r10,r10 - mulx rbp,rdi,QWORD PTR[((0+128))+rsi] - adox r12,rdi - adcx r13,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rsi] - adox r13,rdi - adcx r14,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rsi] - adox r14,rdi - adcx r15,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rsi] - adox r15,rdi - adcx rax,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rsi] - adox rax,rdi - adcx r8,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rsi] - mov rdx,r11 - adox r8,rdi - adcx r9,rbp - adox r9,r10 - adox r10,r10 - - - xor r11,r11 - mulx rbp,rdi,QWORD PTR[((0+128))+rcx] - adcx rdi,QWORD PTR[16+rsp] - adox r12,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rcx] - adcx r12,rdi - adox r13,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rcx] - adcx r13,rdi - adox r14,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rcx] - adcx r14,rdi - adox r15,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rcx] - adcx r15,rdi - adox rax,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rcx] - mov rdx,QWORD PTR[40+rbx] - adcx rax,rdi - adox r8,rbp - adcx r8,r11 - adox r9,r11 - adcx r9,r11 - adox r10,r11 - adcx r10,r11 - mov QWORD PTR[16+rsp],r12 - imul r12,QWORD PTR[8+rsp] - - - xor r11,r11 - mulx rbp,rdi,QWORD PTR[((0+128))+rsi] - adox r13,rdi - adcx r14,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rsi] - adox r14,rdi - adcx r15,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rsi] - adox r15,rdi - adcx rax,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rsi] - adox rax,rdi - adcx r8,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rsi] - adox r8,rdi - adcx r9,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rsi] - mov rdx,r12 - adox r9,rdi - adcx r10,rbp - adox r10,r11 - adox r11,r11 - - - xor r12,r12 - mulx rbp,rdi,QWORD PTR[((0+128))+rcx] - adcx rdi,QWORD PTR[16+rsp] - adox r13,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rcx] - adcx r13,rdi - adox r14,rbp - - mulx 
rbp,rdi,QWORD PTR[((16+128))+rcx] - adcx r14,rdi - adox r15,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rcx] - adcx r15,rdi - adox rax,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rcx] - adcx rax,rdi - adox r8,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rcx] - mov rdx,r13 - adcx r8,rdi - adox r9,rbp - adcx r9,r12 - adox r10,r12 - adcx r10,r12 - adox r11,r12 - adcx r11,r12 - imul rdx,QWORD PTR[8+rsp] - mov rbx,QWORD PTR[24+rsp] - - - xor r12,r12 - mulx rbp,rdi,QWORD PTR[((0+128))+rcx] - adcx r13,rdi - adox r14,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rcx] - adcx r14,rdi - adox r15,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rcx] - adcx r15,rdi - adox rax,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rcx] - adcx rax,rdi - adox r8,rbp - mov r13,r15 - - mulx rbp,rdi,QWORD PTR[((32+128))+rcx] - adcx r8,rdi - adox r9,rbp - mov rsi,rax - - mulx rbp,rdi,QWORD PTR[((40+128))+rcx] - adcx r9,rdi - adox r10,rbp - mov rdx,r14 - adcx r10,r12 - adox r11,r12 - lea rcx,QWORD PTR[128+rcx] - mov r12,r8 - adc r11,0 - - - - - sub r14,QWORD PTR[rcx] - sbb r15,QWORD PTR[8+rcx] - mov rdi,r9 - sbb rax,QWORD PTR[16+rcx] - sbb r8,QWORD PTR[24+rcx] - sbb r9,QWORD PTR[32+rcx] - mov rbp,r10 - sbb r10,QWORD PTR[40+rcx] - sbb r11,0 - - cmovnc rdx,r14 - cmovc r15,r13 - cmovc rax,rsi - cmovnc r12,r8 - mov QWORD PTR[rbx],rdx - cmovnc rdi,r9 - mov QWORD PTR[8+rbx],r15 - cmovnc rbp,r10 - mov QWORD PTR[16+rbx],rax - mov QWORD PTR[24+rbx],r12 - mov QWORD PTR[32+rbx],rdi - mov QWORD PTR[40+rbx],rbp - - DB 0F3h,0C3h ;repret - -__mulx_mont_384 ENDP -PUBLIC sqrx_mont_384 - - -ALIGN 32 -sqrx_mont_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sqrx_mont_384:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -sqr_mont_384$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - lea rsp,QWORD PTR[((-24))+rsp] - -$L$SEH_body_sqrx_mont_384:: - - - mov r8,rcx - lea rcx,QWORD PTR[((-128))+rdx] - mov rdx,QWORD PTR[rsi] - mov r15,QWORD PTR[8+rsi] - mov rax,QWORD PTR[16+rsi] - mov r12,QWORD PTR[24+rsi] - mov QWORD PTR[16+rsp],rdi - mov rdi,QWORD PTR[32+rsi] - mov rbp,QWORD PTR[40+rsi] - - lea rbx,QWORD PTR[rsi] - mov QWORD PTR[rsp],r8 - lea rsi,QWORD PTR[((-128))+rsi] - - mulx r9,r8,rdx - call __mulx_mont_384 - - mov r15,QWORD PTR[24+rsp] - - mov r14,QWORD PTR[32+rsp] - - mov r13,QWORD PTR[40+rsp] - - mov r12,QWORD PTR[48+rsp] - - mov rbx,QWORD PTR[56+rsp] - - mov rbp,QWORD PTR[64+rsp] - - lea rsp,QWORD PTR[72+rsp] - -$L$SEH_epilogue_sqrx_mont_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sqrx_mont_384:: -sqrx_mont_384 ENDP - -PUBLIC sqrx_n_mul_mont_384 - - -ALIGN 32 -sqrx_n_mul_mont_384 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sqrx_n_mul_mont_384:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD PTR[40+rsp] - mov r9,QWORD PTR[48+rsp] -sqr_n_mul_mont_384$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - lea rsp,QWORD PTR[((-40))+rsp] - -$L$SEH_body_sqrx_n_mul_mont_384:: - - - mov r10,rdx - mov rdx,QWORD PTR[rsi] - mov r15,QWORD PTR[8+rsi] - mov rax,QWORD PTR[16+rsi] - mov rbx,rsi - mov r12,QWORD PTR[24+rsi] - mov QWORD PTR[16+rsp],rdi - mov rdi,QWORD PTR[32+rsi] - mov rbp,QWORD PTR[40+rsi] - - mov QWORD PTR[rsp],r8 - mov QWORD PTR[24+rsp],r9 - movq xmm2,QWORD PTR[r9] - -$L$oop_sqrx_384:: - movd xmm1,r10d - lea rsi,QWORD 
PTR[((-128))+rbx] - lea rcx,QWORD PTR[((-128))+rcx] - - mulx r9,r8,rdx - call __mulx_mont_384 - - movd r10d,xmm1 - dec r10d - jnz $L$oop_sqrx_384 - - mov r14,rdx -DB 102,72,15,126,210 - lea rsi,QWORD PTR[((-128))+rbx] - mov rbx,QWORD PTR[24+rsp] - lea rcx,QWORD PTR[((-128))+rcx] - - mulx r9,r8,r14 - call __mulx_mont_384 - - mov r15,QWORD PTR[40+rsp] - - mov r14,QWORD PTR[48+rsp] - - mov r13,QWORD PTR[56+rsp] - - mov r12,QWORD PTR[64+rsp] - - mov rbx,QWORD PTR[72+rsp] - - mov rbp,QWORD PTR[80+rsp] - - lea rsp,QWORD PTR[88+rsp] - -$L$SEH_epilogue_sqrx_n_mul_mont_384:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sqrx_n_mul_mont_384:: -sqrx_n_mul_mont_384 ENDP - -PUBLIC sqrx_n_mul_mont_383 - - -ALIGN 32 -sqrx_n_mul_mont_383 PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sqrx_n_mul_mont_383:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD PTR[40+rsp] - mov r9,QWORD PTR[48+rsp] -sqr_n_mul_mont_383$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - lea rsp,QWORD PTR[((-40))+rsp] - -$L$SEH_body_sqrx_n_mul_mont_383:: - - - mov r10,rdx - mov rdx,QWORD PTR[rsi] - mov r15,QWORD PTR[8+rsi] - mov rax,QWORD PTR[16+rsi] - mov rbx,rsi - mov r12,QWORD PTR[24+rsi] - mov QWORD PTR[16+rsp],rdi - mov rdi,QWORD PTR[32+rsi] - mov rbp,QWORD PTR[40+rsi] - - mov QWORD PTR[rsp],r8 - mov QWORD PTR[24+rsp],r9 - movq xmm2,QWORD PTR[r9] - lea rcx,QWORD PTR[((-128))+rcx] - -$L$oop_sqrx_383:: - movd xmm1,r10d - lea rsi,QWORD PTR[((-128))+rbx] - - mulx r9,r8,rdx - call __mulx_mont_383_nonred - - movd r10d,xmm1 - dec r10d - jnz $L$oop_sqrx_383 - - mov r14,rdx -DB 102,72,15,126,210 - lea rsi,QWORD PTR[((-128))+rbx] - mov rbx,QWORD PTR[24+rsp] - - mulx r9,r8,r14 - call __mulx_mont_384 - - mov r15,QWORD PTR[40+rsp] - - mov r14,QWORD PTR[48+rsp] - - mov r13,QWORD PTR[56+rsp] - - mov r12,QWORD PTR[64+rsp] - - mov rbx,QWORD PTR[72+rsp] - - mov rbp,QWORD PTR[80+rsp] - - lea rsp,QWORD PTR[88+rsp] - -$L$SEH_epilogue_sqrx_n_mul_mont_383:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sqrx_n_mul_mont_383:: -sqrx_n_mul_mont_383 ENDP - -ALIGN 32 -__mulx_mont_383_nonred PROC PRIVATE - DB 243,15,30,250 - - - mulx r10,r14,r15 - mulx r11,r15,rax - add r9,r14 - mulx r12,rax,r12 - adc r10,r15 - mulx r13,rdi,rdi - adc r11,rax - mulx r14,rbp,rbp - mov rdx,QWORD PTR[8+rbx] - adc r12,rdi - adc r13,rbp - adc r14,0 - mov rax,r8 - imul r8,QWORD PTR[8+rsp] - - - xor r15,r15 - mulx rbp,rdi,QWORD PTR[((0+128))+rsi] - adox r9,rdi - adcx r10,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rsi] - adox r10,rdi - adcx r11,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rsi] - adox r11,rdi - adcx r12,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rsi] - adox r12,rdi - adcx r13,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rsi] - adox r13,rdi - adcx r14,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rsi] - mov rdx,r8 - adox r14,rdi - adcx rbp,r15 - adox r15,rbp - - - xor r8,r8 - mulx rbp,rdi,QWORD PTR[((0+128))+rcx] - adcx rax,rdi - adox r9,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rcx] - adcx r9,rdi - adox r10,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rcx] - adcx r10,rdi - adox r11,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rcx] - adcx r11,rdi - adox r12,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rcx] - adcx r12,rdi - adox r13,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rcx] - mov rdx,QWORD PTR[16+rbx] - adcx r13,rdi - adox 
r14,rbp - adcx r14,rax - adox r15,rax - adcx r15,rax - mov r8,r9 - imul r9,QWORD PTR[8+rsp] - - - xor rax,rax - mulx rbp,rdi,QWORD PTR[((0+128))+rsi] - adox r10,rdi - adcx r11,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rsi] - adox r11,rdi - adcx r12,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rsi] - adox r12,rdi - adcx r13,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rsi] - adox r13,rdi - adcx r14,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rsi] - adox r14,rdi - adcx r15,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rsi] - mov rdx,r9 - adox r15,rdi - adcx rbp,rax - adox rax,rbp - - - xor r9,r9 - mulx rbp,rdi,QWORD PTR[((0+128))+rcx] - adcx r8,rdi - adox r10,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rcx] - adcx r10,rdi - adox r11,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rcx] - adcx r11,rdi - adox r12,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rcx] - adcx r12,rdi - adox r13,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rcx] - adcx r13,rdi - adox r14,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rcx] - mov rdx,QWORD PTR[24+rbx] - adcx r14,rdi - adox r15,rbp - adcx r15,r8 - adox rax,r8 - adcx rax,r8 - mov r9,r10 - imul r10,QWORD PTR[8+rsp] - - - xor r8,r8 - mulx rbp,rdi,QWORD PTR[((0+128))+rsi] - adox r11,rdi - adcx r12,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rsi] - adox r12,rdi - adcx r13,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rsi] - adox r13,rdi - adcx r14,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rsi] - adox r14,rdi - adcx r15,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rsi] - adox r15,rdi - adcx rax,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rsi] - mov rdx,r10 - adox rax,rdi - adcx rbp,r8 - adox r8,rbp - - - xor r10,r10 - mulx rbp,rdi,QWORD PTR[((0+128))+rcx] - adcx r9,rdi - adox r11,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rcx] - adcx r11,rdi - adox r12,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rcx] - adcx r12,rdi - adox r13,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rcx] - adcx r13,rdi - adox r14,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rcx] - adcx r14,rdi - adox r15,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rcx] - mov rdx,QWORD PTR[32+rbx] - adcx r15,rdi - adox rax,rbp - adcx rax,r9 - adox r8,r9 - adcx r8,r9 - mov r10,r11 - imul r11,QWORD PTR[8+rsp] - - - xor r9,r9 - mulx rbp,rdi,QWORD PTR[((0+128))+rsi] - adox r12,rdi - adcx r13,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rsi] - adox r13,rdi - adcx r14,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rsi] - adox r14,rdi - adcx r15,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rsi] - adox r15,rdi - adcx rax,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rsi] - adox rax,rdi - adcx r8,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rsi] - mov rdx,r11 - adox r8,rdi - adcx rbp,r9 - adox r9,rbp - - - xor r11,r11 - mulx rbp,rdi,QWORD PTR[((0+128))+rcx] - adcx r10,rdi - adox r12,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rcx] - adcx r12,rdi - adox r13,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rcx] - adcx r13,rdi - adox r14,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rcx] - adcx r14,rdi - adox r15,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rcx] - adcx r15,rdi - adox rax,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rcx] - mov rdx,QWORD PTR[40+rbx] - adcx rax,rdi - adox r8,rbp - adcx r8,r10 - adox r9,r10 - adcx r9,r10 - mov r11,r12 - imul r12,QWORD PTR[8+rsp] - - - xor r10,r10 - mulx rbp,rdi,QWORD PTR[((0+128))+rsi] - adox r13,rdi - adcx r14,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rsi] - adox r14,rdi - adcx r15,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rsi] - adox r15,rdi - adcx rax,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rsi] - adox rax,rdi - adcx r8,rbp - - mulx rbp,rdi,QWORD 
PTR[((32+128))+rsi] - adox r8,rdi - adcx r9,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rsi] - mov rdx,r12 - adox r9,rdi - adcx rbp,r10 - adox r10,rbp - - - xor r12,r12 - mulx rbp,rdi,QWORD PTR[((0+128))+rcx] - adcx r11,rdi - adox r13,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rcx] - adcx r13,rdi - adox r14,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rcx] - adcx r14,rdi - adox r15,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rcx] - adcx r15,rdi - adox rax,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rcx] - adcx rax,rdi - adox r8,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rcx] - mov rdx,r13 - adcx r8,rdi - adox r9,rbp - adcx r9,r11 - adox r10,r11 - adcx r10,r11 - imul rdx,QWORD PTR[8+rsp] - mov rbx,QWORD PTR[24+rsp] - - - xor r12,r12 - mulx rbp,rdi,QWORD PTR[((0+128))+rcx] - adcx r13,rdi - adox r14,rbp - - mulx rbp,rdi,QWORD PTR[((8+128))+rcx] - adcx r14,rdi - adox r15,rbp - - mulx rbp,rdi,QWORD PTR[((16+128))+rcx] - adcx r15,rdi - adox rax,rbp - - mulx rbp,rdi,QWORD PTR[((24+128))+rcx] - adcx rax,rdi - adox r8,rbp - - mulx rbp,rdi,QWORD PTR[((32+128))+rcx] - adcx r8,rdi - adox r9,rbp - - mulx rbp,rdi,QWORD PTR[((40+128))+rcx] - mov rdx,r14 - adcx r9,rdi - adox r10,rbp - adc r10,0 - mov r12,r8 - - mov QWORD PTR[rbx],r14 - mov QWORD PTR[8+rbx],r15 - mov QWORD PTR[16+rbx],rax - mov rdi,r9 - mov QWORD PTR[24+rbx],r8 - mov QWORD PTR[32+rbx],r9 - mov QWORD PTR[40+rbx],r10 - mov rbp,r10 - - DB 0F3h,0C3h ;repret - -__mulx_mont_383_nonred ENDP -PUBLIC sqrx_mont_382x - - -ALIGN 32 -sqrx_mont_382x PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_sqrx_mont_382x:: - - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 -sqr_mont_382x$1:: - push rbp - - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - sub rsp,136 - -$L$SEH_body_sqrx_mont_382x:: - - - mov QWORD PTR[rsp],rcx - mov rcx,rdx - mov QWORD PTR[16+rsp],rdi - mov QWORD PTR[24+rsp],rsi - - - mov r8,QWORD PTR[rsi] - mov r9,QWORD PTR[8+rsi] - mov r10,QWORD PTR[16+rsi] - mov r11,QWORD PTR[24+rsi] - mov r12,QWORD PTR[32+rsi] - mov r13,QWORD PTR[40+rsi] - - mov r14,r8 - add r8,QWORD PTR[48+rsi] - mov r15,r9 - adc r9,QWORD PTR[56+rsi] - mov rax,r10 - adc r10,QWORD PTR[64+rsi] - mov rdx,r11 - adc r11,QWORD PTR[72+rsi] - mov rbx,r12 - adc r12,QWORD PTR[80+rsi] - mov rbp,r13 - adc r13,QWORD PTR[88+rsi] - - sub r14,QWORD PTR[48+rsi] - sbb r15,QWORD PTR[56+rsi] - sbb rax,QWORD PTR[64+rsi] - sbb rdx,QWORD PTR[72+rsi] - sbb rbx,QWORD PTR[80+rsi] - sbb rbp,QWORD PTR[88+rsi] - sbb rdi,rdi - - mov QWORD PTR[((32+0))+rsp],r8 - mov QWORD PTR[((32+8))+rsp],r9 - mov QWORD PTR[((32+16))+rsp],r10 - mov QWORD PTR[((32+24))+rsp],r11 - mov QWORD PTR[((32+32))+rsp],r12 - mov QWORD PTR[((32+40))+rsp],r13 - - mov QWORD PTR[((32+48))+rsp],r14 - mov QWORD PTR[((32+56))+rsp],r15 - mov QWORD PTR[((32+64))+rsp],rax - mov QWORD PTR[((32+72))+rsp],rdx - mov QWORD PTR[((32+80))+rsp],rbx - mov QWORD PTR[((32+88))+rsp],rbp - mov QWORD PTR[((32+96))+rsp],rdi - - - - lea rbx,QWORD PTR[48+rsi] - - mov rdx,QWORD PTR[48+rsi] - mov r14,QWORD PTR[rsi] - mov r15,QWORD PTR[8+rsi] - mov rax,QWORD PTR[16+rsi] - mov r12,QWORD PTR[24+rsi] - mov rdi,QWORD PTR[32+rsi] - mov rbp,QWORD PTR[40+rsi] - lea rsi,QWORD PTR[((-128))+rsi] - lea rcx,QWORD PTR[((-128))+rcx] - - mulx r9,r8,r14 - call __mulx_mont_383_nonred - add rdx,rdx - adc r15,r15 - adc rax,rax - adc r12,r12 - adc rdi,rdi - adc rbp,rbp - - mov QWORD PTR[48+rbx],rdx - mov QWORD PTR[56+rbx],r15 - mov QWORD PTR[64+rbx],rax - mov QWORD PTR[72+rbx],r12 - mov 
QWORD PTR[80+rbx],rdi - mov QWORD PTR[88+rbx],rbp - - lea rsi,QWORD PTR[((32-128))+rsp] - lea rbx,QWORD PTR[((32+48))+rsp] - - mov rdx,QWORD PTR[((32+48))+rsp] - mov r14,QWORD PTR[((32+0))+rsp] - mov r15,QWORD PTR[((32+8))+rsp] - mov rax,QWORD PTR[((32+16))+rsp] - mov r12,QWORD PTR[((32+24))+rsp] - mov rdi,QWORD PTR[((32+32))+rsp] - mov rbp,QWORD PTR[((32+40))+rsp] - - - - mulx r9,r8,r14 - call __mulx_mont_383_nonred - mov r14,QWORD PTR[((32+96))+rsp] - lea rcx,QWORD PTR[128+rcx] - mov r8,QWORD PTR[((32+0))+rsp] - and r8,r14 - mov r9,QWORD PTR[((32+8))+rsp] - and r9,r14 - mov r10,QWORD PTR[((32+16))+rsp] - and r10,r14 - mov r11,QWORD PTR[((32+24))+rsp] - and r11,r14 - mov r13,QWORD PTR[((32+32))+rsp] - and r13,r14 - and r14,QWORD PTR[((32+40))+rsp] - - sub rdx,r8 - mov r8,QWORD PTR[rcx] - sbb r15,r9 - mov r9,QWORD PTR[8+rcx] - sbb rax,r10 - mov r10,QWORD PTR[16+rcx] - sbb r12,r11 - mov r11,QWORD PTR[24+rcx] - sbb rdi,r13 - mov r13,QWORD PTR[32+rcx] - sbb rbp,r14 - sbb r14,r14 - - and r8,r14 - and r9,r14 - and r10,r14 - and r11,r14 - and r13,r14 - and r14,QWORD PTR[40+rcx] - - add rdx,r8 - adc r15,r9 - adc rax,r10 - adc r12,r11 - adc rdi,r13 - adc rbp,r14 - - mov QWORD PTR[rbx],rdx - mov QWORD PTR[8+rbx],r15 - mov QWORD PTR[16+rbx],rax - mov QWORD PTR[24+rbx],r12 - mov QWORD PTR[32+rbx],rdi - mov QWORD PTR[40+rbx],rbp - lea r8,QWORD PTR[136+rsp] - mov r15,QWORD PTR[r8] - - mov r14,QWORD PTR[8+r8] - - mov r13,QWORD PTR[16+r8] - - mov r12,QWORD PTR[24+r8] - - mov rbx,QWORD PTR[32+r8] - - mov rbp,QWORD PTR[40+r8] - - lea rsp,QWORD PTR[48+r8] - -$L$SEH_epilogue_sqrx_mont_382x:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_sqrx_mont_382x:: -sqrx_mont_382x ENDP -.text$ ENDS -.pdata SEGMENT READONLY ALIGN(4) -ALIGN 4 - DD imagerel $L$SEH_begin_mulx_mont_384x - DD imagerel $L$SEH_body_mulx_mont_384x - DD imagerel $L$SEH_info_mulx_mont_384x_prologue - - DD imagerel $L$SEH_body_mulx_mont_384x - DD imagerel $L$SEH_epilogue_mulx_mont_384x - DD imagerel $L$SEH_info_mulx_mont_384x_body - - DD imagerel $L$SEH_epilogue_mulx_mont_384x - DD imagerel $L$SEH_end_mulx_mont_384x - DD imagerel $L$SEH_info_mulx_mont_384x_epilogue - - DD imagerel $L$SEH_begin_sqrx_mont_384x - DD imagerel $L$SEH_body_sqrx_mont_384x - DD imagerel $L$SEH_info_sqrx_mont_384x_prologue - - DD imagerel $L$SEH_body_sqrx_mont_384x - DD imagerel $L$SEH_epilogue_sqrx_mont_384x - DD imagerel $L$SEH_info_sqrx_mont_384x_body - - DD imagerel $L$SEH_epilogue_sqrx_mont_384x - DD imagerel $L$SEH_end_sqrx_mont_384x - DD imagerel $L$SEH_info_sqrx_mont_384x_epilogue - - DD imagerel $L$SEH_begin_mulx_382x - DD imagerel $L$SEH_body_mulx_382x - DD imagerel $L$SEH_info_mulx_382x_prologue - - DD imagerel $L$SEH_body_mulx_382x - DD imagerel $L$SEH_epilogue_mulx_382x - DD imagerel $L$SEH_info_mulx_382x_body - - DD imagerel $L$SEH_epilogue_mulx_382x - DD imagerel $L$SEH_end_mulx_382x - DD imagerel $L$SEH_info_mulx_382x_epilogue - - DD imagerel $L$SEH_begin_sqrx_382x - DD imagerel $L$SEH_body_sqrx_382x - DD imagerel $L$SEH_info_sqrx_382x_prologue - - DD imagerel $L$SEH_body_sqrx_382x - DD imagerel $L$SEH_epilogue_sqrx_382x - DD imagerel $L$SEH_info_sqrx_382x_body - - DD imagerel $L$SEH_epilogue_sqrx_382x - DD imagerel $L$SEH_end_sqrx_382x - DD imagerel $L$SEH_info_sqrx_382x_epilogue - - DD imagerel $L$SEH_begin_mulx_384 - DD imagerel $L$SEH_body_mulx_384 - DD imagerel $L$SEH_info_mulx_384_prologue - - DD imagerel $L$SEH_body_mulx_384 - DD imagerel $L$SEH_epilogue_mulx_384 - DD imagerel 
$L$SEH_info_mulx_384_body - - DD imagerel $L$SEH_epilogue_mulx_384 - DD imagerel $L$SEH_end_mulx_384 - DD imagerel $L$SEH_info_mulx_384_epilogue - - DD imagerel $L$SEH_begin_sqrx_384 - DD imagerel $L$SEH_body_sqrx_384 - DD imagerel $L$SEH_info_sqrx_384_prologue - - DD imagerel $L$SEH_body_sqrx_384 - DD imagerel $L$SEH_epilogue_sqrx_384 - DD imagerel $L$SEH_info_sqrx_384_body - - DD imagerel $L$SEH_epilogue_sqrx_384 - DD imagerel $L$SEH_end_sqrx_384 - DD imagerel $L$SEH_info_sqrx_384_epilogue - - DD imagerel $L$SEH_begin_redcx_mont_384 - DD imagerel $L$SEH_body_redcx_mont_384 - DD imagerel $L$SEH_info_redcx_mont_384_prologue - - DD imagerel $L$SEH_body_redcx_mont_384 - DD imagerel $L$SEH_epilogue_redcx_mont_384 - DD imagerel $L$SEH_info_redcx_mont_384_body - - DD imagerel $L$SEH_epilogue_redcx_mont_384 - DD imagerel $L$SEH_end_redcx_mont_384 - DD imagerel $L$SEH_info_redcx_mont_384_epilogue - - DD imagerel $L$SEH_begin_fromx_mont_384 - DD imagerel $L$SEH_body_fromx_mont_384 - DD imagerel $L$SEH_info_fromx_mont_384_prologue - - DD imagerel $L$SEH_body_fromx_mont_384 - DD imagerel $L$SEH_epilogue_fromx_mont_384 - DD imagerel $L$SEH_info_fromx_mont_384_body - - DD imagerel $L$SEH_epilogue_fromx_mont_384 - DD imagerel $L$SEH_end_fromx_mont_384 - DD imagerel $L$SEH_info_fromx_mont_384_epilogue - - DD imagerel $L$SEH_begin_sgn0x_pty_mont_384 - DD imagerel $L$SEH_body_sgn0x_pty_mont_384 - DD imagerel $L$SEH_info_sgn0x_pty_mont_384_prologue - - DD imagerel $L$SEH_body_sgn0x_pty_mont_384 - DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384 - DD imagerel $L$SEH_info_sgn0x_pty_mont_384_body - - DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384 - DD imagerel $L$SEH_end_sgn0x_pty_mont_384 - DD imagerel $L$SEH_info_sgn0x_pty_mont_384_epilogue - - DD imagerel $L$SEH_begin_sgn0x_pty_mont_384x - DD imagerel $L$SEH_body_sgn0x_pty_mont_384x - DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_prologue - - DD imagerel $L$SEH_body_sgn0x_pty_mont_384x - DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384x - DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_body - - DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384x - DD imagerel $L$SEH_end_sgn0x_pty_mont_384x - DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_epilogue - - DD imagerel $L$SEH_begin_mulx_mont_384 - DD imagerel $L$SEH_body_mulx_mont_384 - DD imagerel $L$SEH_info_mulx_mont_384_prologue - - DD imagerel $L$SEH_body_mulx_mont_384 - DD imagerel $L$SEH_epilogue_mulx_mont_384 - DD imagerel $L$SEH_info_mulx_mont_384_body - - DD imagerel $L$SEH_epilogue_mulx_mont_384 - DD imagerel $L$SEH_end_mulx_mont_384 - DD imagerel $L$SEH_info_mulx_mont_384_epilogue - - DD imagerel $L$SEH_begin_sqrx_mont_384 - DD imagerel $L$SEH_body_sqrx_mont_384 - DD imagerel $L$SEH_info_sqrx_mont_384_prologue - - DD imagerel $L$SEH_body_sqrx_mont_384 - DD imagerel $L$SEH_epilogue_sqrx_mont_384 - DD imagerel $L$SEH_info_sqrx_mont_384_body - - DD imagerel $L$SEH_epilogue_sqrx_mont_384 - DD imagerel $L$SEH_end_sqrx_mont_384 - DD imagerel $L$SEH_info_sqrx_mont_384_epilogue - - DD imagerel $L$SEH_begin_sqrx_n_mul_mont_384 - DD imagerel $L$SEH_body_sqrx_n_mul_mont_384 - DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_prologue - - DD imagerel $L$SEH_body_sqrx_n_mul_mont_384 - DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_384 - DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_body - - DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_384 - DD imagerel $L$SEH_end_sqrx_n_mul_mont_384 - DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_epilogue - - DD imagerel $L$SEH_begin_sqrx_n_mul_mont_383 - DD imagerel $L$SEH_body_sqrx_n_mul_mont_383 - DD 
imagerel $L$SEH_info_sqrx_n_mul_mont_383_prologue - - DD imagerel $L$SEH_body_sqrx_n_mul_mont_383 - DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_383 - DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_body - - DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_383 - DD imagerel $L$SEH_end_sqrx_n_mul_mont_383 - DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_epilogue - - DD imagerel $L$SEH_begin_sqrx_mont_382x - DD imagerel $L$SEH_body_sqrx_mont_382x - DD imagerel $L$SEH_info_sqrx_mont_382x_prologue - - DD imagerel $L$SEH_body_sqrx_mont_382x - DD imagerel $L$SEH_epilogue_sqrx_mont_382x - DD imagerel $L$SEH_info_sqrx_mont_382x_body - - DD imagerel $L$SEH_epilogue_sqrx_mont_382x - DD imagerel $L$SEH_end_sqrx_mont_382x - DD imagerel $L$SEH_info_sqrx_mont_382x_epilogue - -.pdata ENDS -.xdata SEGMENT READONLY ALIGN(8) -ALIGN 8 -$L$SEH_info_mulx_mont_384x_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_mulx_mont_384x_body:: -DB 1,0,18,0 -DB 000h,0f4h,029h,000h -DB 000h,0e4h,02ah,000h -DB 000h,0d4h,02bh,000h -DB 000h,0c4h,02ch,000h -DB 000h,034h,02dh,000h -DB 000h,054h,02eh,000h -DB 000h,074h,030h,000h -DB 000h,064h,031h,000h -DB 000h,001h,02fh,000h -DB 000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_mulx_mont_384x_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sqrx_mont_384x_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sqrx_mont_384x_body:: -DB 1,0,18,0 -DB 000h,0f4h,011h,000h -DB 000h,0e4h,012h,000h -DB 000h,0d4h,013h,000h -DB 000h,0c4h,014h,000h -DB 000h,034h,015h,000h -DB 000h,054h,016h,000h -DB 000h,074h,018h,000h -DB 000h,064h,019h,000h -DB 000h,001h,017h,000h -DB 000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sqrx_mont_384x_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_mulx_382x_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_mulx_382x_body:: -DB 1,0,18,0 -DB 000h,0f4h,011h,000h -DB 000h,0e4h,012h,000h -DB 000h,0d4h,013h,000h -DB 000h,0c4h,014h,000h -DB 000h,034h,015h,000h -DB 000h,054h,016h,000h -DB 000h,074h,018h,000h -DB 000h,064h,019h,000h -DB 000h,001h,017h,000h -DB 000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_mulx_382x_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sqrx_382x_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sqrx_382x_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sqrx_382x_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_mulx_384_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_mulx_384_body:: -DB 1,0,17,0 -DB 000h,0f4h,000h,000h -DB 000h,0e4h,001h,000h -DB 000h,0d4h,002h,000h -DB 000h,0c4h,003h,000h -DB 000h,034h,004h,000h -DB 000h,054h,005h,000h -DB 000h,074h,007h,000h -DB 000h,064h,008h,000h -DB 000h,052h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_mulx_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sqrx_384_prologue:: -DB 
1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sqrx_384_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sqrx_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_redcx_mont_384_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_redcx_mont_384_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_redcx_mont_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_fromx_mont_384_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_fromx_mont_384_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_fromx_mont_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sgn0x_pty_mont_384_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sgn0x_pty_mont_384_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sgn0x_pty_mont_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sgn0x_pty_mont_384x_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sgn0x_pty_mont_384x_body:: -DB 1,0,17,0 -DB 000h,0f4h,001h,000h -DB 000h,0e4h,002h,000h -DB 000h,0d4h,003h,000h -DB 000h,0c4h,004h,000h -DB 000h,034h,005h,000h -DB 000h,054h,006h,000h -DB 000h,074h,008h,000h -DB 000h,064h,009h,000h -DB 000h,062h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sgn0x_pty_mont_384x_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_mulx_mont_384_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_mulx_mont_384_body:: -DB 1,0,17,0 -DB 000h,0f4h,003h,000h -DB 000h,0e4h,004h,000h -DB 000h,0d4h,005h,000h -DB 000h,0c4h,006h,000h -DB 000h,034h,007h,000h -DB 000h,054h,008h,000h -DB 000h,074h,00ah,000h -DB 000h,064h,00bh,000h -DB 000h,082h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_mulx_mont_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sqrx_mont_384_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sqrx_mont_384_body:: -DB 1,0,17,0 -DB 000h,0f4h,003h,000h -DB 000h,0e4h,004h,000h -DB 000h,0d4h,005h,000h -DB 000h,0c4h,006h,000h -DB 000h,034h,007h,000h -DB 
000h,054h,008h,000h -DB 000h,074h,00ah,000h -DB 000h,064h,00bh,000h -DB 000h,082h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sqrx_mont_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sqrx_n_mul_mont_384_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sqrx_n_mul_mont_384_body:: -DB 1,0,17,0 -DB 000h,0f4h,005h,000h -DB 000h,0e4h,006h,000h -DB 000h,0d4h,007h,000h -DB 000h,0c4h,008h,000h -DB 000h,034h,009h,000h -DB 000h,054h,00ah,000h -DB 000h,074h,00ch,000h -DB 000h,064h,00dh,000h -DB 000h,0a2h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sqrx_n_mul_mont_384_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sqrx_n_mul_mont_383_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sqrx_n_mul_mont_383_body:: -DB 1,0,17,0 -DB 000h,0f4h,005h,000h -DB 000h,0e4h,006h,000h -DB 000h,0d4h,007h,000h -DB 000h,0c4h,008h,000h -DB 000h,034h,009h,000h -DB 000h,054h,00ah,000h -DB 000h,074h,00ch,000h -DB 000h,064h,00dh,000h -DB 000h,0a2h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sqrx_n_mul_mont_383_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_sqrx_mont_382x_prologue:: -DB 1,0,5,00bh -DB 0,074h,1,0 -DB 0,064h,2,0 -DB 0,0b3h -DB 0,0 - DD 0,0 -$L$SEH_info_sqrx_mont_382x_body:: -DB 1,0,18,0 -DB 000h,0f4h,011h,000h -DB 000h,0e4h,012h,000h -DB 000h,0d4h,013h,000h -DB 000h,0c4h,014h,000h -DB 000h,034h,015h,000h -DB 000h,054h,016h,000h -DB 000h,074h,018h,000h -DB 000h,064h,019h,000h -DB 000h,001h,017h,000h -DB 000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_sqrx_mont_382x_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - - -.xdata ENDS -END diff --git a/crypto/blst_src/build/win64/sha256-armv8.asm b/crypto/blst_src/build/win64/sha256-armv8.asm deleted file mode 100644 index 31e74219c19..00000000000 --- a/crypto/blst_src/build/win64/sha256-armv8.asm +++ /dev/null @@ -1,1084 +0,0 @@ -// -// Copyright Supranational LLC -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 -// -// ==================================================================== -// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL -// project. -// ==================================================================== -// -// sha256_block procedure for ARMv8. -// -// This module is stripped of scalar code paths, with rationale that all -// known processors are NEON-capable. -// -// See original module at CRYPTOGAMS for further details. 
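
Aside, for orientation: the `mulx_mont_384-x86_64.asm` routines deleted above (`mulx_mont_384`, `sqrx_mont_384`, `sqrx_n_mul_mont_384`, …) are Montgomery multiplications over the six-limb, 384-bit BLS12-381 base field, built on `mulx`/`adcx`/`adox` carry chains. Below is a minimal single-limb Go sketch of the same reduction idea, under stated assumptions: the modulus is an illustrative 64-bit prime (not BLS12-381's), and `montMul`/`negModInv` are hypothetical helper names unrelated to blst's API.

```go
package main

import (
	"fmt"
	"math/big"
	"math/bits"
)

// negModInv returns -m^-1 mod 2^64 for odd m — the per-limb constant the
// deleted assembly multiplies by (imul ..., QWORD PTR[8+rsp]).
func negModInv(m uint64) uint64 {
	inv := m // m*m ≡ 1 (mod 8) for odd m, so inv starts correct to 3 bits
	for i := 0; i < 5; i++ {
		inv *= 2 - m*inv // each Newton–Hensel step doubles the precision
	}
	return -inv
}

// montMul computes a*b*2^-64 mod m for a, b < m: one-limb Montgomery
// multiplication; blst performs the same dance unrolled across six limbs.
func montMul(a, b, m, mInv uint64) uint64 {
	hi, lo := bits.Mul64(a, b)
	u := lo * mInv // chosen so that lo + u*m is divisible by 2^64
	mHi, mLo := bits.Mul64(u, m)
	_, c := bits.Add64(lo, mLo, 0) // low word cancels to zero; keep the carry
	res, c2 := bits.Add64(hi, mHi, c)
	if c2 != 0 || res >= m { // final reduction, like the cmovc/cmovnc chain
		res -= m
	}
	return res
}

func main() {
	const m = 0xffffffff00000001 // illustrative odd 64-bit modulus
	a, b := uint64(12345678901234567), uint64(98765432109876543)
	got := montMul(a, b, m, negModInv(m))

	// Cross-check against big-integer arithmetic: a*b*2^-64 mod m.
	M := new(big.Int).SetUint64(m)
	rInv := new(big.Int).ModInverse(new(big.Int).Lsh(big.NewInt(1), 64), M)
	want := new(big.Int).SetUint64(a)
	want.Mul(want, new(big.Int).SetUint64(b)).Mul(want, rInv).Mod(want, M)
	fmt.Println(got == want.Uint64()) // true
}
```

One difference worth noting: the sketch's final `if` is a data-dependent branch, whereas the assembly ends `__mulx_mont_384` with a branchless `cmovc`/`cmovnc` sequence that selects between the subtracted and unsubtracted limbs in constant time.
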
- - COMMON |__blst_platform_cap|,4 - AREA |.text|,CODE,ALIGN=8,ARM64 - - ALIGN 64 - -|$LK256| - DCDU 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 - DCDU 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 - DCDU 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 - DCDU 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 - DCDU 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc - DCDU 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da - DCDU 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 - DCDU 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 - DCDU 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 - DCDU 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 - DCDU 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 - DCDU 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 - DCDU 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 - DCDU 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 - DCDU 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 - DCDU 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - DCDU 0 //terminator - - DCB "SHA256 block transform for ARMv8, CRYPTOGAMS by @dot-asm",0 - ALIGN 4 - ALIGN 4 - - EXPORT |blst_sha256_block_armv8|[FUNC] - ALIGN 64 -|blst_sha256_block_armv8| PROC -|$Lv8_entry| - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ld1 {v0.4s,v1.4s},[x0] - adr x3,|$LK256| - -|$Loop_hw| - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 - sub x2,x2,#1 - ld1 {v16.4s},[x3],#16 - rev32 v4.16b,v4.16b - rev32 v5.16b,v5.16b - rev32 v6.16b,v6.16b - rev32 v7.16b,v7.16b - orr v18.16b,v0.16b,v0.16b // offload - orr v19.16b,v1.16b,v1.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s - DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b - orr v2.16b,v0.16b,v0.16b - DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s - DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b - orr v2.16b,v0.16b,v0.16b - DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v6.4s - DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b - orr v2.16b,v0.16b,v0.16b - DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v7.4s - DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b - orr v2.16b,v0.16b,v0.16b - DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - DCDU 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s - DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b - orr v2.16b,v0.16b,v0.16b - DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s - DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b - orr v2.16b,v0.16b,v0.16b - DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v6.4s - DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b - orr v2.16b,v0.16b,v0.16b - DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v7.4s - DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b - orr v2.16b,v0.16b,v0.16b - DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - DCDU 0x5e115041 
//sha256h2 v1.16b,v2.16b,v17.4s - DCDU 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s - DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b - orr v2.16b,v0.16b,v0.16b - DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s - DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b - orr v2.16b,v0.16b,v0.16b - DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v6.4s - DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b - orr v2.16b,v0.16b,v0.16b - DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v7.4s - DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b - orr v2.16b,v0.16b,v0.16b - DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - DCDU 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s - orr v2.16b,v0.16b,v0.16b - DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s - orr v2.16b,v0.16b,v0.16b - DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - - ld1 {v17.4s},[x3] - add v16.4s,v16.4s,v6.4s - sub x3,x3,#64*4-16 // rewind - orr v2.16b,v0.16b,v0.16b - DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s - DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - - add v17.4s,v17.4s,v7.4s - orr v2.16b,v0.16b,v0.16b - DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s - DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - - add v0.4s,v0.4s,v18.4s - add v1.4s,v1.4s,v19.4s - - cbnz x2,|$Loop_hw| - - st1 {v0.4s,v1.4s},[x0] - - ldr x29,[sp],#16 - ret - ENDP - - EXPORT |blst_sha256_block_data_order|[FUNC] - ALIGN 16 -|blst_sha256_block_data_order| PROC - adrp x16,__blst_platform_cap - ldr w16,[x16,__blst_platform_cap] - tst w16,#1 - bne |$Lv8_entry| - - stp x29, x30, [sp, #-16]! 
- mov x29, sp - sub sp,sp,#16*4 - - adr x16,|$LK256| - add x2,x1,x2,lsl#6 // len to point at the end of inp - - ld1 {v0.16b},[x1], #16 - ld1 {v1.16b},[x1], #16 - ld1 {v2.16b},[x1], #16 - ld1 {v3.16b},[x1], #16 - ld1 {v4.4s},[x16], #16 - ld1 {v5.4s},[x16], #16 - ld1 {v6.4s},[x16], #16 - ld1 {v7.4s},[x16], #16 - rev32 v0.16b,v0.16b // yes, even on - rev32 v1.16b,v1.16b // big-endian - rev32 v2.16b,v2.16b - rev32 v3.16b,v3.16b - mov x17,sp - add v4.4s,v4.4s,v0.4s - add v5.4s,v5.4s,v1.4s - add v6.4s,v6.4s,v2.4s - st1 {v4.4s,v5.4s},[x17], #32 - add v7.4s,v7.4s,v3.4s - st1 {v6.4s,v7.4s},[x17] - sub x17,x17,#32 - - ldp w3,w4,[x0] - ldp w5,w6,[x0,#8] - ldp w7,w8,[x0,#16] - ldp w9,w10,[x0,#24] - ldr w12,[sp,#0] - mov w13,wzr - eor w14,w4,w5 - mov w15,wzr - b |$L_00_48| - - ALIGN 16 -|$L_00_48| - ext8 v4.16b,v0.16b,v1.16b,#4 - add w10,w10,w12 - add w3,w3,w15 - and w12,w8,w7 - bic w15,w9,w7 - ext8 v7.16b,v2.16b,v3.16b,#4 - eor w11,w7,w7,ror#5 - add w3,w3,w13 - mov d19,v3.d[1] - orr w12,w12,w15 - eor w11,w11,w7,ror#19 - ushr v6.4s,v4.4s,#7 - eor w15,w3,w3,ror#11 - ushr v5.4s,v4.4s,#3 - add w10,w10,w12 - add v0.4s,v0.4s,v7.4s - ror w11,w11,#6 - sli v6.4s,v4.4s,#25 - eor w13,w3,w4 - eor w15,w15,w3,ror#20 - ushr v7.4s,v4.4s,#18 - add w10,w10,w11 - ldr w12,[sp,#4] - and w14,w14,w13 - eor v5.16b,v5.16b,v6.16b - ror w15,w15,#2 - add w6,w6,w10 - sli v7.4s,v4.4s,#14 - eor w14,w14,w4 - ushr v16.4s,v19.4s,#17 - add w9,w9,w12 - add w10,w10,w15 - and w12,w7,w6 - eor v5.16b,v5.16b,v7.16b - bic w15,w8,w6 - eor w11,w6,w6,ror#5 - sli v16.4s,v19.4s,#15 - add w10,w10,w14 - orr w12,w12,w15 - ushr v17.4s,v19.4s,#10 - eor w11,w11,w6,ror#19 - eor w15,w10,w10,ror#11 - ushr v7.4s,v19.4s,#19 - add w9,w9,w12 - ror w11,w11,#6 - add v0.4s,v0.4s,v5.4s - eor w14,w10,w3 - eor w15,w15,w10,ror#20 - sli v7.4s,v19.4s,#13 - add w9,w9,w11 - ldr w12,[sp,#8] - and w13,w13,w14 - eor v17.16b,v17.16b,v16.16b - ror w15,w15,#2 - add w5,w5,w9 - eor w13,w13,w3 - eor v17.16b,v17.16b,v7.16b - add w8,w8,w12 - add w9,w9,w15 - and w12,w6,w5 - add v0.4s,v0.4s,v17.4s - bic w15,w7,w5 - eor w11,w5,w5,ror#5 - add w9,w9,w13 - ushr v18.4s,v0.4s,#17 - orr w12,w12,w15 - ushr v19.4s,v0.4s,#10 - eor w11,w11,w5,ror#19 - eor w15,w9,w9,ror#11 - sli v18.4s,v0.4s,#15 - add w8,w8,w12 - ushr v17.4s,v0.4s,#19 - ror w11,w11,#6 - eor w13,w9,w10 - eor v19.16b,v19.16b,v18.16b - eor w15,w15,w9,ror#20 - add w8,w8,w11 - sli v17.4s,v0.4s,#13 - ldr w12,[sp,#12] - and w14,w14,w13 - ror w15,w15,#2 - ld1 {v4.4s},[x16], #16 - add w4,w4,w8 - eor v19.16b,v19.16b,v17.16b - eor w14,w14,w10 - eor v17.16b,v17.16b,v17.16b - add w7,w7,w12 - add w8,w8,w15 - and w12,w5,w4 - mov v17.d[1],v19.d[0] - bic w15,w6,w4 - eor w11,w4,w4,ror#5 - add w8,w8,w14 - add v0.4s,v0.4s,v17.4s - orr w12,w12,w15 - eor w11,w11,w4,ror#19 - eor w15,w8,w8,ror#11 - add v4.4s,v4.4s,v0.4s - add w7,w7,w12 - ror w11,w11,#6 - eor w14,w8,w9 - eor w15,w15,w8,ror#20 - add w7,w7,w11 - ldr w12,[sp,#16] - and w13,w13,w14 - ror w15,w15,#2 - add w3,w3,w7 - eor w13,w13,w9 - st1 {v4.4s},[x17], #16 - ext8 v4.16b,v1.16b,v2.16b,#4 - add w6,w6,w12 - add w7,w7,w15 - and w12,w4,w3 - bic w15,w5,w3 - ext8 v7.16b,v3.16b,v0.16b,#4 - eor w11,w3,w3,ror#5 - add w7,w7,w13 - mov d19,v0.d[1] - orr w12,w12,w15 - eor w11,w11,w3,ror#19 - ushr v6.4s,v4.4s,#7 - eor w15,w7,w7,ror#11 - ushr v5.4s,v4.4s,#3 - add w6,w6,w12 - add v1.4s,v1.4s,v7.4s - ror w11,w11,#6 - sli v6.4s,v4.4s,#25 - eor w13,w7,w8 - eor w15,w15,w7,ror#20 - ushr v7.4s,v4.4s,#18 - add w6,w6,w11 - ldr w12,[sp,#20] - and w14,w14,w13 - eor v5.16b,v5.16b,v6.16b - ror w15,w15,#2 - add w10,w10,w6 
- sli v7.4s,v4.4s,#14 - eor w14,w14,w8 - ushr v16.4s,v19.4s,#17 - add w5,w5,w12 - add w6,w6,w15 - and w12,w3,w10 - eor v5.16b,v5.16b,v7.16b - bic w15,w4,w10 - eor w11,w10,w10,ror#5 - sli v16.4s,v19.4s,#15 - add w6,w6,w14 - orr w12,w12,w15 - ushr v17.4s,v19.4s,#10 - eor w11,w11,w10,ror#19 - eor w15,w6,w6,ror#11 - ushr v7.4s,v19.4s,#19 - add w5,w5,w12 - ror w11,w11,#6 - add v1.4s,v1.4s,v5.4s - eor w14,w6,w7 - eor w15,w15,w6,ror#20 - sli v7.4s,v19.4s,#13 - add w5,w5,w11 - ldr w12,[sp,#24] - and w13,w13,w14 - eor v17.16b,v17.16b,v16.16b - ror w15,w15,#2 - add w9,w9,w5 - eor w13,w13,w7 - eor v17.16b,v17.16b,v7.16b - add w4,w4,w12 - add w5,w5,w15 - and w12,w10,w9 - add v1.4s,v1.4s,v17.4s - bic w15,w3,w9 - eor w11,w9,w9,ror#5 - add w5,w5,w13 - ushr v18.4s,v1.4s,#17 - orr w12,w12,w15 - ushr v19.4s,v1.4s,#10 - eor w11,w11,w9,ror#19 - eor w15,w5,w5,ror#11 - sli v18.4s,v1.4s,#15 - add w4,w4,w12 - ushr v17.4s,v1.4s,#19 - ror w11,w11,#6 - eor w13,w5,w6 - eor v19.16b,v19.16b,v18.16b - eor w15,w15,w5,ror#20 - add w4,w4,w11 - sli v17.4s,v1.4s,#13 - ldr w12,[sp,#28] - and w14,w14,w13 - ror w15,w15,#2 - ld1 {v4.4s},[x16], #16 - add w8,w8,w4 - eor v19.16b,v19.16b,v17.16b - eor w14,w14,w6 - eor v17.16b,v17.16b,v17.16b - add w3,w3,w12 - add w4,w4,w15 - and w12,w9,w8 - mov v17.d[1],v19.d[0] - bic w15,w10,w8 - eor w11,w8,w8,ror#5 - add w4,w4,w14 - add v1.4s,v1.4s,v17.4s - orr w12,w12,w15 - eor w11,w11,w8,ror#19 - eor w15,w4,w4,ror#11 - add v4.4s,v4.4s,v1.4s - add w3,w3,w12 - ror w11,w11,#6 - eor w14,w4,w5 - eor w15,w15,w4,ror#20 - add w3,w3,w11 - ldr w12,[sp,#32] - and w13,w13,w14 - ror w15,w15,#2 - add w7,w7,w3 - eor w13,w13,w5 - st1 {v4.4s},[x17], #16 - ext8 v4.16b,v2.16b,v3.16b,#4 - add w10,w10,w12 - add w3,w3,w15 - and w12,w8,w7 - bic w15,w9,w7 - ext8 v7.16b,v0.16b,v1.16b,#4 - eor w11,w7,w7,ror#5 - add w3,w3,w13 - mov d19,v1.d[1] - orr w12,w12,w15 - eor w11,w11,w7,ror#19 - ushr v6.4s,v4.4s,#7 - eor w15,w3,w3,ror#11 - ushr v5.4s,v4.4s,#3 - add w10,w10,w12 - add v2.4s,v2.4s,v7.4s - ror w11,w11,#6 - sli v6.4s,v4.4s,#25 - eor w13,w3,w4 - eor w15,w15,w3,ror#20 - ushr v7.4s,v4.4s,#18 - add w10,w10,w11 - ldr w12,[sp,#36] - and w14,w14,w13 - eor v5.16b,v5.16b,v6.16b - ror w15,w15,#2 - add w6,w6,w10 - sli v7.4s,v4.4s,#14 - eor w14,w14,w4 - ushr v16.4s,v19.4s,#17 - add w9,w9,w12 - add w10,w10,w15 - and w12,w7,w6 - eor v5.16b,v5.16b,v7.16b - bic w15,w8,w6 - eor w11,w6,w6,ror#5 - sli v16.4s,v19.4s,#15 - add w10,w10,w14 - orr w12,w12,w15 - ushr v17.4s,v19.4s,#10 - eor w11,w11,w6,ror#19 - eor w15,w10,w10,ror#11 - ushr v7.4s,v19.4s,#19 - add w9,w9,w12 - ror w11,w11,#6 - add v2.4s,v2.4s,v5.4s - eor w14,w10,w3 - eor w15,w15,w10,ror#20 - sli v7.4s,v19.4s,#13 - add w9,w9,w11 - ldr w12,[sp,#40] - and w13,w13,w14 - eor v17.16b,v17.16b,v16.16b - ror w15,w15,#2 - add w5,w5,w9 - eor w13,w13,w3 - eor v17.16b,v17.16b,v7.16b - add w8,w8,w12 - add w9,w9,w15 - and w12,w6,w5 - add v2.4s,v2.4s,v17.4s - bic w15,w7,w5 - eor w11,w5,w5,ror#5 - add w9,w9,w13 - ushr v18.4s,v2.4s,#17 - orr w12,w12,w15 - ushr v19.4s,v2.4s,#10 - eor w11,w11,w5,ror#19 - eor w15,w9,w9,ror#11 - sli v18.4s,v2.4s,#15 - add w8,w8,w12 - ushr v17.4s,v2.4s,#19 - ror w11,w11,#6 - eor w13,w9,w10 - eor v19.16b,v19.16b,v18.16b - eor w15,w15,w9,ror#20 - add w8,w8,w11 - sli v17.4s,v2.4s,#13 - ldr w12,[sp,#44] - and w14,w14,w13 - ror w15,w15,#2 - ld1 {v4.4s},[x16], #16 - add w4,w4,w8 - eor v19.16b,v19.16b,v17.16b - eor w14,w14,w10 - eor v17.16b,v17.16b,v17.16b - add w7,w7,w12 - add w8,w8,w15 - and w12,w5,w4 - mov v17.d[1],v19.d[0] - bic w15,w6,w4 - eor w11,w4,w4,ror#5 - add 
w8,w8,w14 - add v2.4s,v2.4s,v17.4s - orr w12,w12,w15 - eor w11,w11,w4,ror#19 - eor w15,w8,w8,ror#11 - add v4.4s,v4.4s,v2.4s - add w7,w7,w12 - ror w11,w11,#6 - eor w14,w8,w9 - eor w15,w15,w8,ror#20 - add w7,w7,w11 - ldr w12,[sp,#48] - and w13,w13,w14 - ror w15,w15,#2 - add w3,w3,w7 - eor w13,w13,w9 - st1 {v4.4s},[x17], #16 - ext8 v4.16b,v3.16b,v0.16b,#4 - add w6,w6,w12 - add w7,w7,w15 - and w12,w4,w3 - bic w15,w5,w3 - ext8 v7.16b,v1.16b,v2.16b,#4 - eor w11,w3,w3,ror#5 - add w7,w7,w13 - mov d19,v2.d[1] - orr w12,w12,w15 - eor w11,w11,w3,ror#19 - ushr v6.4s,v4.4s,#7 - eor w15,w7,w7,ror#11 - ushr v5.4s,v4.4s,#3 - add w6,w6,w12 - add v3.4s,v3.4s,v7.4s - ror w11,w11,#6 - sli v6.4s,v4.4s,#25 - eor w13,w7,w8 - eor w15,w15,w7,ror#20 - ushr v7.4s,v4.4s,#18 - add w6,w6,w11 - ldr w12,[sp,#52] - and w14,w14,w13 - eor v5.16b,v5.16b,v6.16b - ror w15,w15,#2 - add w10,w10,w6 - sli v7.4s,v4.4s,#14 - eor w14,w14,w8 - ushr v16.4s,v19.4s,#17 - add w5,w5,w12 - add w6,w6,w15 - and w12,w3,w10 - eor v5.16b,v5.16b,v7.16b - bic w15,w4,w10 - eor w11,w10,w10,ror#5 - sli v16.4s,v19.4s,#15 - add w6,w6,w14 - orr w12,w12,w15 - ushr v17.4s,v19.4s,#10 - eor w11,w11,w10,ror#19 - eor w15,w6,w6,ror#11 - ushr v7.4s,v19.4s,#19 - add w5,w5,w12 - ror w11,w11,#6 - add v3.4s,v3.4s,v5.4s - eor w14,w6,w7 - eor w15,w15,w6,ror#20 - sli v7.4s,v19.4s,#13 - add w5,w5,w11 - ldr w12,[sp,#56] - and w13,w13,w14 - eor v17.16b,v17.16b,v16.16b - ror w15,w15,#2 - add w9,w9,w5 - eor w13,w13,w7 - eor v17.16b,v17.16b,v7.16b - add w4,w4,w12 - add w5,w5,w15 - and w12,w10,w9 - add v3.4s,v3.4s,v17.4s - bic w15,w3,w9 - eor w11,w9,w9,ror#5 - add w5,w5,w13 - ushr v18.4s,v3.4s,#17 - orr w12,w12,w15 - ushr v19.4s,v3.4s,#10 - eor w11,w11,w9,ror#19 - eor w15,w5,w5,ror#11 - sli v18.4s,v3.4s,#15 - add w4,w4,w12 - ushr v17.4s,v3.4s,#19 - ror w11,w11,#6 - eor w13,w5,w6 - eor v19.16b,v19.16b,v18.16b - eor w15,w15,w5,ror#20 - add w4,w4,w11 - sli v17.4s,v3.4s,#13 - ldr w12,[sp,#60] - and w14,w14,w13 - ror w15,w15,#2 - ld1 {v4.4s},[x16], #16 - add w8,w8,w4 - eor v19.16b,v19.16b,v17.16b - eor w14,w14,w6 - eor v17.16b,v17.16b,v17.16b - add w3,w3,w12 - add w4,w4,w15 - and w12,w9,w8 - mov v17.d[1],v19.d[0] - bic w15,w10,w8 - eor w11,w8,w8,ror#5 - add w4,w4,w14 - add v3.4s,v3.4s,v17.4s - orr w12,w12,w15 - eor w11,w11,w8,ror#19 - eor w15,w4,w4,ror#11 - add v4.4s,v4.4s,v3.4s - add w3,w3,w12 - ror w11,w11,#6 - eor w14,w4,w5 - eor w15,w15,w4,ror#20 - add w3,w3,w11 - ldr w12,[x16] - and w13,w13,w14 - ror w15,w15,#2 - add w7,w7,w3 - eor w13,w13,w5 - st1 {v4.4s},[x17], #16 - cmp w12,#0 // check for K256 terminator - ldr w12,[sp,#0] - sub x17,x17,#64 - bne |$L_00_48| - - sub x16,x16,#256 // rewind x16 - cmp x1,x2 - mov x17, #64 - cseleq x17,x17,xzr - sub x1,x1,x17 // avoid SEGV - mov x17,sp - add w10,w10,w12 - add w3,w3,w15 - and w12,w8,w7 - ld1 {v0.16b},[x1],#16 - bic w15,w9,w7 - eor w11,w7,w7,ror#5 - ld1 {v4.4s},[x16],#16 - add w3,w3,w13 - orr w12,w12,w15 - eor w11,w11,w7,ror#19 - eor w15,w3,w3,ror#11 - rev32 v0.16b,v0.16b - add w10,w10,w12 - ror w11,w11,#6 - eor w13,w3,w4 - eor w15,w15,w3,ror#20 - add v4.4s,v4.4s,v0.4s - add w10,w10,w11 - ldr w12,[sp,#4] - and w14,w14,w13 - ror w15,w15,#2 - add w6,w6,w10 - eor w14,w14,w4 - add w9,w9,w12 - add w10,w10,w15 - and w12,w7,w6 - bic w15,w8,w6 - eor w11,w6,w6,ror#5 - add w10,w10,w14 - orr w12,w12,w15 - eor w11,w11,w6,ror#19 - eor w15,w10,w10,ror#11 - add w9,w9,w12 - ror w11,w11,#6 - eor w14,w10,w3 - eor w15,w15,w10,ror#20 - add w9,w9,w11 - ldr w12,[sp,#8] - and w13,w13,w14 - ror w15,w15,#2 - add w5,w5,w9 - eor w13,w13,w3 - add w8,w8,w12 
- add w9,w9,w15 - and w12,w6,w5 - bic w15,w7,w5 - eor w11,w5,w5,ror#5 - add w9,w9,w13 - orr w12,w12,w15 - eor w11,w11,w5,ror#19 - eor w15,w9,w9,ror#11 - add w8,w8,w12 - ror w11,w11,#6 - eor w13,w9,w10 - eor w15,w15,w9,ror#20 - add w8,w8,w11 - ldr w12,[sp,#12] - and w14,w14,w13 - ror w15,w15,#2 - add w4,w4,w8 - eor w14,w14,w10 - add w7,w7,w12 - add w8,w8,w15 - and w12,w5,w4 - bic w15,w6,w4 - eor w11,w4,w4,ror#5 - add w8,w8,w14 - orr w12,w12,w15 - eor w11,w11,w4,ror#19 - eor w15,w8,w8,ror#11 - add w7,w7,w12 - ror w11,w11,#6 - eor w14,w8,w9 - eor w15,w15,w8,ror#20 - add w7,w7,w11 - ldr w12,[sp,#16] - and w13,w13,w14 - ror w15,w15,#2 - add w3,w3,w7 - eor w13,w13,w9 - st1 {v4.4s},[x17], #16 - add w6,w6,w12 - add w7,w7,w15 - and w12,w4,w3 - ld1 {v1.16b},[x1],#16 - bic w15,w5,w3 - eor w11,w3,w3,ror#5 - ld1 {v4.4s},[x16],#16 - add w7,w7,w13 - orr w12,w12,w15 - eor w11,w11,w3,ror#19 - eor w15,w7,w7,ror#11 - rev32 v1.16b,v1.16b - add w6,w6,w12 - ror w11,w11,#6 - eor w13,w7,w8 - eor w15,w15,w7,ror#20 - add v4.4s,v4.4s,v1.4s - add w6,w6,w11 - ldr w12,[sp,#20] - and w14,w14,w13 - ror w15,w15,#2 - add w10,w10,w6 - eor w14,w14,w8 - add w5,w5,w12 - add w6,w6,w15 - and w12,w3,w10 - bic w15,w4,w10 - eor w11,w10,w10,ror#5 - add w6,w6,w14 - orr w12,w12,w15 - eor w11,w11,w10,ror#19 - eor w15,w6,w6,ror#11 - add w5,w5,w12 - ror w11,w11,#6 - eor w14,w6,w7 - eor w15,w15,w6,ror#20 - add w5,w5,w11 - ldr w12,[sp,#24] - and w13,w13,w14 - ror w15,w15,#2 - add w9,w9,w5 - eor w13,w13,w7 - add w4,w4,w12 - add w5,w5,w15 - and w12,w10,w9 - bic w15,w3,w9 - eor w11,w9,w9,ror#5 - add w5,w5,w13 - orr w12,w12,w15 - eor w11,w11,w9,ror#19 - eor w15,w5,w5,ror#11 - add w4,w4,w12 - ror w11,w11,#6 - eor w13,w5,w6 - eor w15,w15,w5,ror#20 - add w4,w4,w11 - ldr w12,[sp,#28] - and w14,w14,w13 - ror w15,w15,#2 - add w8,w8,w4 - eor w14,w14,w6 - add w3,w3,w12 - add w4,w4,w15 - and w12,w9,w8 - bic w15,w10,w8 - eor w11,w8,w8,ror#5 - add w4,w4,w14 - orr w12,w12,w15 - eor w11,w11,w8,ror#19 - eor w15,w4,w4,ror#11 - add w3,w3,w12 - ror w11,w11,#6 - eor w14,w4,w5 - eor w15,w15,w4,ror#20 - add w3,w3,w11 - ldr w12,[sp,#32] - and w13,w13,w14 - ror w15,w15,#2 - add w7,w7,w3 - eor w13,w13,w5 - st1 {v4.4s},[x17], #16 - add w10,w10,w12 - add w3,w3,w15 - and w12,w8,w7 - ld1 {v2.16b},[x1],#16 - bic w15,w9,w7 - eor w11,w7,w7,ror#5 - ld1 {v4.4s},[x16],#16 - add w3,w3,w13 - orr w12,w12,w15 - eor w11,w11,w7,ror#19 - eor w15,w3,w3,ror#11 - rev32 v2.16b,v2.16b - add w10,w10,w12 - ror w11,w11,#6 - eor w13,w3,w4 - eor w15,w15,w3,ror#20 - add v4.4s,v4.4s,v2.4s - add w10,w10,w11 - ldr w12,[sp,#36] - and w14,w14,w13 - ror w15,w15,#2 - add w6,w6,w10 - eor w14,w14,w4 - add w9,w9,w12 - add w10,w10,w15 - and w12,w7,w6 - bic w15,w8,w6 - eor w11,w6,w6,ror#5 - add w10,w10,w14 - orr w12,w12,w15 - eor w11,w11,w6,ror#19 - eor w15,w10,w10,ror#11 - add w9,w9,w12 - ror w11,w11,#6 - eor w14,w10,w3 - eor w15,w15,w10,ror#20 - add w9,w9,w11 - ldr w12,[sp,#40] - and w13,w13,w14 - ror w15,w15,#2 - add w5,w5,w9 - eor w13,w13,w3 - add w8,w8,w12 - add w9,w9,w15 - and w12,w6,w5 - bic w15,w7,w5 - eor w11,w5,w5,ror#5 - add w9,w9,w13 - orr w12,w12,w15 - eor w11,w11,w5,ror#19 - eor w15,w9,w9,ror#11 - add w8,w8,w12 - ror w11,w11,#6 - eor w13,w9,w10 - eor w15,w15,w9,ror#20 - add w8,w8,w11 - ldr w12,[sp,#44] - and w14,w14,w13 - ror w15,w15,#2 - add w4,w4,w8 - eor w14,w14,w10 - add w7,w7,w12 - add w8,w8,w15 - and w12,w5,w4 - bic w15,w6,w4 - eor w11,w4,w4,ror#5 - add w8,w8,w14 - orr w12,w12,w15 - eor w11,w11,w4,ror#19 - eor w15,w8,w8,ror#11 - add w7,w7,w12 - ror w11,w11,#6 - eor w14,w8,w9 - eor 
w15,w15,w8,ror#20 - add w7,w7,w11 - ldr w12,[sp,#48] - and w13,w13,w14 - ror w15,w15,#2 - add w3,w3,w7 - eor w13,w13,w9 - st1 {v4.4s},[x17], #16 - add w6,w6,w12 - add w7,w7,w15 - and w12,w4,w3 - ld1 {v3.16b},[x1],#16 - bic w15,w5,w3 - eor w11,w3,w3,ror#5 - ld1 {v4.4s},[x16],#16 - add w7,w7,w13 - orr w12,w12,w15 - eor w11,w11,w3,ror#19 - eor w15,w7,w7,ror#11 - rev32 v3.16b,v3.16b - add w6,w6,w12 - ror w11,w11,#6 - eor w13,w7,w8 - eor w15,w15,w7,ror#20 - add v4.4s,v4.4s,v3.4s - add w6,w6,w11 - ldr w12,[sp,#52] - and w14,w14,w13 - ror w15,w15,#2 - add w10,w10,w6 - eor w14,w14,w8 - add w5,w5,w12 - add w6,w6,w15 - and w12,w3,w10 - bic w15,w4,w10 - eor w11,w10,w10,ror#5 - add w6,w6,w14 - orr w12,w12,w15 - eor w11,w11,w10,ror#19 - eor w15,w6,w6,ror#11 - add w5,w5,w12 - ror w11,w11,#6 - eor w14,w6,w7 - eor w15,w15,w6,ror#20 - add w5,w5,w11 - ldr w12,[sp,#56] - and w13,w13,w14 - ror w15,w15,#2 - add w9,w9,w5 - eor w13,w13,w7 - add w4,w4,w12 - add w5,w5,w15 - and w12,w10,w9 - bic w15,w3,w9 - eor w11,w9,w9,ror#5 - add w5,w5,w13 - orr w12,w12,w15 - eor w11,w11,w9,ror#19 - eor w15,w5,w5,ror#11 - add w4,w4,w12 - ror w11,w11,#6 - eor w13,w5,w6 - eor w15,w15,w5,ror#20 - add w4,w4,w11 - ldr w12,[sp,#60] - and w14,w14,w13 - ror w15,w15,#2 - add w8,w8,w4 - eor w14,w14,w6 - add w3,w3,w12 - add w4,w4,w15 - and w12,w9,w8 - bic w15,w10,w8 - eor w11,w8,w8,ror#5 - add w4,w4,w14 - orr w12,w12,w15 - eor w11,w11,w8,ror#19 - eor w15,w4,w4,ror#11 - add w3,w3,w12 - ror w11,w11,#6 - eor w14,w4,w5 - eor w15,w15,w4,ror#20 - add w3,w3,w11 - and w13,w13,w14 - ror w15,w15,#2 - add w7,w7,w3 - eor w13,w13,w5 - st1 {v4.4s},[x17], #16 - add w3,w3,w15 // h+=Sigma0(a) from the past - ldp w11,w12,[x0,#0] - add w3,w3,w13 // h+=Maj(a,b,c) from the past - ldp w13,w14,[x0,#8] - add w3,w3,w11 // accumulate - add w4,w4,w12 - ldp w11,w12,[x0,#16] - add w5,w5,w13 - add w6,w6,w14 - ldp w13,w14,[x0,#24] - add w7,w7,w11 - add w8,w8,w12 - ldr w12,[sp,#0] - stp w3,w4,[x0,#0] - add w9,w9,w13 - mov w13,wzr - stp w5,w6,[x0,#8] - add w10,w10,w14 - stp w7,w8,[x0,#16] - eor w14,w4,w5 - stp w9,w10,[x0,#24] - mov w15,wzr - mov x17,sp - bne |$L_00_48| - - ldr x29,[x29] - add sp,sp,#16*4+16 - ret - ENDP - - - EXPORT |blst_sha256_emit|[FUNC] - ALIGN 16 -|blst_sha256_emit| PROC - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] -#ifndef __AARCH64EB__ - rev x4,x4 - rev x5,x5 - rev x6,x6 - rev x7,x7 -#endif - str w4,[x0,#4] - lsr x4,x4,#32 - str w5,[x0,#12] - lsr x5,x5,#32 - str w6,[x0,#20] - lsr x6,x6,#32 - str w7,[x0,#28] - lsr x7,x7,#32 - str w4,[x0,#0] - str w5,[x0,#8] - str w6,[x0,#16] - str w7,[x0,#24] - ret - ENDP - - - - EXPORT |blst_sha256_bcopy|[FUNC] - ALIGN 16 -|blst_sha256_bcopy| PROC -|$Loop_bcopy| - ldrb w3,[x1],#1 - sub x2,x2,#1 - strb w3,[x0],#1 - cbnz x2,|$Loop_bcopy| - ret - ENDP - - - - EXPORT |blst_sha256_hcopy|[FUNC] - ALIGN 16 -|blst_sha256_hcopy| PROC - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - stp x4,x5,[x0] - stp x6,x7,[x0,#16] - ret - ENDP - END diff --git a/crypto/blst_src/build/win64/sha256-x86_64.asm b/crypto/blst_src/build/win64/sha256-x86_64.asm deleted file mode 100644 index a502a75ecaf..00000000000 --- a/crypto/blst_src/build/win64/sha256-x86_64.asm +++ /dev/null @@ -1,1575 +0,0 @@ -OPTION DOTNAME -_DATA SEGMENT -COMM __blst_platform_cap:DWORD:1 -_DATA ENDS -.text$ SEGMENT ALIGN(256) 'CODE' - -ALIGN 64 - -K256:: - DD 0428a2f98h,071374491h,0b5c0fbcfh,0e9b5dba5h - DD 03956c25bh,059f111f1h,0923f82a4h,0ab1c5ed5h - DD 0d807aa98h,012835b01h,0243185beh,0550c7dc3h - DD 072be5d74h,080deb1feh,09bdc06a7h,0c19bf174h - DD 
0e49b69c1h,0efbe4786h,00fc19dc6h,0240ca1cch - DD 02de92c6fh,04a7484aah,05cb0a9dch,076f988dah - DD 0983e5152h,0a831c66dh,0b00327c8h,0bf597fc7h - DD 0c6e00bf3h,0d5a79147h,006ca6351h,014292967h - DD 027b70a85h,02e1b2138h,04d2c6dfch,053380d13h - DD 0650a7354h,0766a0abbh,081c2c92eh,092722c85h - DD 0a2bfe8a1h,0a81a664bh,0c24b8b70h,0c76c51a3h - DD 0d192e819h,0d6990624h,0f40e3585h,0106aa070h - DD 019a4c116h,01e376c08h,02748774ch,034b0bcb5h - DD 0391c0cb3h,04ed8aa4ah,05b9cca4fh,0682e6ff3h - DD 0748f82eeh,078a5636fh,084c87814h,08cc70208h - DD 090befffah,0a4506cebh,0bef9a3f7h,0c67178f2h - - DD 000010203h,004050607h,008090a0bh,00c0d0e0fh - DD 003020100h,00b0a0908h,0ffffffffh,0ffffffffh - DD 0ffffffffh,0ffffffffh,003020100h,00b0a0908h -DB 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 -DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 -DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 -DB 32,64,100,111,116,45,97,115,109,0 -PUBLIC blst_sha256_block_data_order_shaext - - -ALIGN 64 -blst_sha256_block_data_order_shaext PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_blst_sha256_block_data_order_shaext:: - - - push rbp - - mov rbp,rsp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 -$L$blst_sha256_block_data_order$2:: - sub rsp,050h - - movaps XMMWORD PTR[(-80)+rbp],xmm6 - movaps XMMWORD PTR[(-64)+rbp],xmm7 - movaps XMMWORD PTR[(-48)+rbp],xmm8 - movaps XMMWORD PTR[(-32)+rbp],xmm9 - movaps XMMWORD PTR[(-16)+rbp],xmm10 - -$L$SEH_body_blst_sha256_block_data_order_shaext:: - - lea rcx,QWORD PTR[((K256+128))] - movdqu xmm1,XMMWORD PTR[rdi] - movdqu xmm2,XMMWORD PTR[16+rdi] - movdqa xmm7,XMMWORD PTR[((256-128))+rcx] - - pshufd xmm0,xmm1,01bh - pshufd xmm1,xmm1,0b1h - pshufd xmm2,xmm2,01bh - movdqa xmm8,xmm7 -DB 102,15,58,15,202,8 - punpcklqdq xmm2,xmm0 - jmp $L$oop_shaext - -ALIGN 16 -$L$oop_shaext:: - movdqu xmm3,XMMWORD PTR[rsi] - movdqu xmm4,XMMWORD PTR[16+rsi] - movdqu xmm5,XMMWORD PTR[32+rsi] -DB 102,15,56,0,223 - movdqu xmm6,XMMWORD PTR[48+rsi] - - movdqa xmm0,XMMWORD PTR[((0-128))+rcx] - paddd xmm0,xmm3 -DB 102,15,56,0,231 - movdqa xmm10,xmm2 -DB 15,56,203,209 - pshufd xmm0,xmm0,00eh - nop - movdqa xmm9,xmm1 -DB 15,56,203,202 - - movdqa xmm0,XMMWORD PTR[((16-128))+rcx] - paddd xmm0,xmm4 -DB 102,15,56,0,239 -DB 15,56,203,209 - pshufd xmm0,xmm0,00eh - lea rsi,QWORD PTR[64+rsi] -DB 15,56,204,220 -DB 15,56,203,202 - - movdqa xmm0,XMMWORD PTR[((32-128))+rcx] - paddd xmm0,xmm5 -DB 102,15,56,0,247 -DB 15,56,203,209 - pshufd xmm0,xmm0,00eh - movdqa xmm7,xmm6 -DB 102,15,58,15,253,4 - nop - paddd xmm3,xmm7 -DB 15,56,204,229 -DB 15,56,203,202 - - movdqa xmm0,XMMWORD PTR[((48-128))+rcx] - paddd xmm0,xmm6 -DB 15,56,205,222 -DB 15,56,203,209 - pshufd xmm0,xmm0,00eh - movdqa xmm7,xmm3 -DB 102,15,58,15,254,4 - nop - paddd xmm4,xmm7 -DB 15,56,204,238 -DB 15,56,203,202 - movdqa xmm0,XMMWORD PTR[((64-128))+rcx] - paddd xmm0,xmm3 -DB 15,56,205,227 -DB 15,56,203,209 - pshufd xmm0,xmm0,00eh - movdqa xmm7,xmm4 -DB 102,15,58,15,251,4 - nop - paddd xmm5,xmm7 -DB 15,56,204,243 -DB 15,56,203,202 - movdqa xmm0,XMMWORD PTR[((80-128))+rcx] - paddd xmm0,xmm4 -DB 15,56,205,236 -DB 15,56,203,209 - pshufd xmm0,xmm0,00eh - movdqa xmm7,xmm5 -DB 102,15,58,15,252,4 - nop - paddd xmm6,xmm7 -DB 15,56,204,220 -DB 15,56,203,202 - movdqa xmm0,XMMWORD PTR[((96-128))+rcx] - paddd xmm0,xmm5 -DB 15,56,205,245 -DB 15,56,203,209 - pshufd xmm0,xmm0,00eh - movdqa xmm7,xmm6 -DB 102,15,58,15,253,4 - nop - paddd xmm3,xmm7 -DB 15,56,204,229 -DB 15,56,203,202 - 
movdqa xmm0,XMMWORD PTR[((112-128))+rcx] - paddd xmm0,xmm6 -DB 15,56,205,222 -DB 15,56,203,209 - pshufd xmm0,xmm0,00eh - movdqa xmm7,xmm3 -DB 102,15,58,15,254,4 - nop - paddd xmm4,xmm7 -DB 15,56,204,238 -DB 15,56,203,202 - movdqa xmm0,XMMWORD PTR[((128-128))+rcx] - paddd xmm0,xmm3 -DB 15,56,205,227 -DB 15,56,203,209 - pshufd xmm0,xmm0,00eh - movdqa xmm7,xmm4 -DB 102,15,58,15,251,4 - nop - paddd xmm5,xmm7 -DB 15,56,204,243 -DB 15,56,203,202 - movdqa xmm0,XMMWORD PTR[((144-128))+rcx] - paddd xmm0,xmm4 -DB 15,56,205,236 -DB 15,56,203,209 - pshufd xmm0,xmm0,00eh - movdqa xmm7,xmm5 -DB 102,15,58,15,252,4 - nop - paddd xmm6,xmm7 -DB 15,56,204,220 -DB 15,56,203,202 - movdqa xmm0,XMMWORD PTR[((160-128))+rcx] - paddd xmm0,xmm5 -DB 15,56,205,245 -DB 15,56,203,209 - pshufd xmm0,xmm0,00eh - movdqa xmm7,xmm6 -DB 102,15,58,15,253,4 - nop - paddd xmm3,xmm7 -DB 15,56,204,229 -DB 15,56,203,202 - movdqa xmm0,XMMWORD PTR[((176-128))+rcx] - paddd xmm0,xmm6 -DB 15,56,205,222 -DB 15,56,203,209 - pshufd xmm0,xmm0,00eh - movdqa xmm7,xmm3 -DB 102,15,58,15,254,4 - nop - paddd xmm4,xmm7 -DB 15,56,204,238 -DB 15,56,203,202 - movdqa xmm0,XMMWORD PTR[((192-128))+rcx] - paddd xmm0,xmm3 -DB 15,56,205,227 -DB 15,56,203,209 - pshufd xmm0,xmm0,00eh - movdqa xmm7,xmm4 -DB 102,15,58,15,251,4 - nop - paddd xmm5,xmm7 -DB 15,56,204,243 -DB 15,56,203,202 - movdqa xmm0,XMMWORD PTR[((208-128))+rcx] - paddd xmm0,xmm4 -DB 15,56,205,236 -DB 15,56,203,209 - pshufd xmm0,xmm0,00eh - movdqa xmm7,xmm5 -DB 102,15,58,15,252,4 -DB 15,56,203,202 - paddd xmm6,xmm7 - - movdqa xmm0,XMMWORD PTR[((224-128))+rcx] - paddd xmm0,xmm5 -DB 15,56,203,209 - pshufd xmm0,xmm0,00eh -DB 15,56,205,245 - movdqa xmm7,xmm8 -DB 15,56,203,202 - - movdqa xmm0,XMMWORD PTR[((240-128))+rcx] - paddd xmm0,xmm6 - nop -DB 15,56,203,209 - pshufd xmm0,xmm0,00eh - dec rdx - nop -DB 15,56,203,202 - - paddd xmm2,xmm10 - paddd xmm1,xmm9 - jnz $L$oop_shaext - - pshufd xmm2,xmm2,0b1h - pshufd xmm7,xmm1,01bh - pshufd xmm1,xmm1,0b1h - punpckhqdq xmm1,xmm2 -DB 102,15,58,15,215,8 - - movdqu XMMWORD PTR[rdi],xmm1 - movdqu XMMWORD PTR[16+rdi],xmm2 - movaps xmm6,XMMWORD PTR[((-80))+rbp] - movaps xmm7,XMMWORD PTR[((-64))+rbp] - movaps xmm8,XMMWORD PTR[((-48))+rbp] - movaps xmm9,XMMWORD PTR[((-32))+rbp] - movaps xmm10,XMMWORD PTR[((-16))+rbp] - mov rsp,rbp - - pop rbp - -$L$SEH_epilogue_blst_sha256_block_data_order_shaext:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_blst_sha256_block_data_order_shaext:: -blst_sha256_block_data_order_shaext ENDP -PUBLIC blst_sha256_block_data_order - - -ALIGN 64 -blst_sha256_block_data_order PROC PUBLIC - DB 243,15,30,250 - mov QWORD PTR[8+rsp],rdi ;WIN64 prologue - mov QWORD PTR[16+rsp],rsi - mov r11,rsp -$L$SEH_begin_blst_sha256_block_data_order:: - - - push rbp - - mov rbp,rsp - - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - test DWORD PTR[__blst_platform_cap],2 - jnz $L$blst_sha256_block_data_order$2 - push rbx - - push r12 - - push r13 - - push r14 - - push r15 - - shl rdx,4 - sub rsp,88 - - lea rdx,QWORD PTR[rdx*4+rsi] - mov QWORD PTR[((-64))+rbp],rdi - - mov QWORD PTR[((-48))+rbp],rdx - movaps XMMWORD PTR[(-128)+rbp],xmm6 - movaps XMMWORD PTR[(-112)+rbp],xmm7 - movaps XMMWORD PTR[(-96)+rbp],xmm8 - movaps XMMWORD PTR[(-80)+rbp],xmm9 - -$L$SEH_body_blst_sha256_block_data_order:: - - - lea rsp,QWORD PTR[((-64))+rsp] - mov eax,DWORD PTR[rdi] - and rsp,-64 - mov ebx,DWORD PTR[4+rdi] - mov ecx,DWORD PTR[8+rdi] - mov edx,DWORD PTR[12+rdi] - mov r8d,DWORD PTR[16+rdi] - mov r9d,DWORD PTR[20+rdi] - 
mov r10d,DWORD PTR[24+rdi] - mov r11d,DWORD PTR[28+rdi] - - - jmp $L$loop_ssse3 -ALIGN 16 -$L$loop_ssse3:: - movdqa xmm7,XMMWORD PTR[((K256+256))] - mov QWORD PTR[((-56))+rbp],rsi - movdqu xmm0,XMMWORD PTR[rsi] - movdqu xmm1,XMMWORD PTR[16+rsi] - movdqu xmm2,XMMWORD PTR[32+rsi] -DB 102,15,56,0,199 - movdqu xmm3,XMMWORD PTR[48+rsi] - lea rsi,QWORD PTR[K256] -DB 102,15,56,0,207 - movdqa xmm4,XMMWORD PTR[rsi] - movdqa xmm5,XMMWORD PTR[16+rsi] -DB 102,15,56,0,215 - paddd xmm4,xmm0 - movdqa xmm6,XMMWORD PTR[32+rsi] -DB 102,15,56,0,223 - movdqa xmm7,XMMWORD PTR[48+rsi] - paddd xmm5,xmm1 - paddd xmm6,xmm2 - paddd xmm7,xmm3 - movdqa XMMWORD PTR[rsp],xmm4 - mov r14d,eax - movdqa XMMWORD PTR[16+rsp],xmm5 - mov edi,ebx - movdqa XMMWORD PTR[32+rsp],xmm6 - xor edi,ecx - movdqa XMMWORD PTR[48+rsp],xmm7 - mov r13d,r8d - jmp $L$ssse3_00_47 - -ALIGN 16 -$L$ssse3_00_47:: - sub rsi,-64 - ror r13d,14 - movdqa xmm4,xmm1 - mov eax,r14d - mov r12d,r9d - movdqa xmm7,xmm3 - ror r14d,9 - xor r13d,r8d - xor r12d,r10d - ror r13d,5 - xor r14d,eax -DB 102,15,58,15,224,4 - and r12d,r8d - xor r13d,r8d -DB 102,15,58,15,250,4 - add r11d,DWORD PTR[rsp] - mov r15d,eax - xor r12d,r10d - ror r14d,11 - movdqa xmm5,xmm4 - xor r15d,ebx - add r11d,r12d - movdqa xmm6,xmm4 - ror r13d,6 - and edi,r15d - psrld xmm4,3 - xor r14d,eax - add r11d,r13d - xor edi,ebx - paddd xmm0,xmm7 - ror r14d,2 - add edx,r11d - psrld xmm6,7 - add r11d,edi - mov r13d,edx - pshufd xmm7,xmm3,250 - add r14d,r11d - ror r13d,14 - pslld xmm5,14 - mov r11d,r14d - mov r12d,r8d - pxor xmm4,xmm6 - ror r14d,9 - xor r13d,edx - xor r12d,r9d - ror r13d,5 - psrld xmm6,11 - xor r14d,r11d - pxor xmm4,xmm5 - and r12d,edx - xor r13d,edx - pslld xmm5,11 - add r10d,DWORD PTR[4+rsp] - mov edi,r11d - pxor xmm4,xmm6 - xor r12d,r9d - ror r14d,11 - movdqa xmm6,xmm7 - xor edi,eax - add r10d,r12d - pxor xmm4,xmm5 - ror r13d,6 - and r15d,edi - xor r14d,r11d - psrld xmm7,10 - add r10d,r13d - xor r15d,eax - paddd xmm0,xmm4 - ror r14d,2 - add ecx,r10d - psrlq xmm6,17 - add r10d,r15d - mov r13d,ecx - add r14d,r10d - pxor xmm7,xmm6 - ror r13d,14 - mov r10d,r14d - mov r12d,edx - ror r14d,9 - psrlq xmm6,2 - xor r13d,ecx - xor r12d,r8d - pxor xmm7,xmm6 - ror r13d,5 - xor r14d,r10d - and r12d,ecx - pshufd xmm7,xmm7,128 - xor r13d,ecx - add r9d,DWORD PTR[8+rsp] - mov r15d,r10d - psrldq xmm7,8 - xor r12d,r8d - ror r14d,11 - xor r15d,r11d - add r9d,r12d - ror r13d,6 - paddd xmm0,xmm7 - and edi,r15d - xor r14d,r10d - add r9d,r13d - pshufd xmm7,xmm0,80 - xor edi,r11d - ror r14d,2 - add ebx,r9d - movdqa xmm6,xmm7 - add r9d,edi - mov r13d,ebx - psrld xmm7,10 - add r14d,r9d - ror r13d,14 - psrlq xmm6,17 - mov r9d,r14d - mov r12d,ecx - pxor xmm7,xmm6 - ror r14d,9 - xor r13d,ebx - xor r12d,edx - ror r13d,5 - xor r14d,r9d - psrlq xmm6,2 - and r12d,ebx - xor r13d,ebx - add r8d,DWORD PTR[12+rsp] - pxor xmm7,xmm6 - mov edi,r9d - xor r12d,edx - ror r14d,11 - pshufd xmm7,xmm7,8 - xor edi,r10d - add r8d,r12d - movdqa xmm6,XMMWORD PTR[rsi] - ror r13d,6 - and r15d,edi - pslldq xmm7,8 - xor r14d,r9d - add r8d,r13d - xor r15d,r10d - paddd xmm0,xmm7 - ror r14d,2 - add eax,r8d - add r8d,r15d - paddd xmm6,xmm0 - mov r13d,eax - add r14d,r8d - movdqa XMMWORD PTR[rsp],xmm6 - ror r13d,14 - movdqa xmm4,xmm2 - mov r8d,r14d - mov r12d,ebx - movdqa xmm7,xmm0 - ror r14d,9 - xor r13d,eax - xor r12d,ecx - ror r13d,5 - xor r14d,r8d -DB 102,15,58,15,225,4 - and r12d,eax - xor r13d,eax -DB 102,15,58,15,251,4 - add edx,DWORD PTR[16+rsp] - mov r15d,r8d - xor r12d,ecx - ror r14d,11 - movdqa xmm5,xmm4 - xor r15d,r9d - add edx,r12d - 
movdqa xmm6,xmm4 - ror r13d,6 - and edi,r15d - psrld xmm4,3 - xor r14d,r8d - add edx,r13d - xor edi,r9d - paddd xmm1,xmm7 - ror r14d,2 - add r11d,edx - psrld xmm6,7 - add edx,edi - mov r13d,r11d - pshufd xmm7,xmm0,250 - add r14d,edx - ror r13d,14 - pslld xmm5,14 - mov edx,r14d - mov r12d,eax - pxor xmm4,xmm6 - ror r14d,9 - xor r13d,r11d - xor r12d,ebx - ror r13d,5 - psrld xmm6,11 - xor r14d,edx - pxor xmm4,xmm5 - and r12d,r11d - xor r13d,r11d - pslld xmm5,11 - add ecx,DWORD PTR[20+rsp] - mov edi,edx - pxor xmm4,xmm6 - xor r12d,ebx - ror r14d,11 - movdqa xmm6,xmm7 - xor edi,r8d - add ecx,r12d - pxor xmm4,xmm5 - ror r13d,6 - and r15d,edi - xor r14d,edx - psrld xmm7,10 - add ecx,r13d - xor r15d,r8d - paddd xmm1,xmm4 - ror r14d,2 - add r10d,ecx - psrlq xmm6,17 - add ecx,r15d - mov r13d,r10d - add r14d,ecx - pxor xmm7,xmm6 - ror r13d,14 - mov ecx,r14d - mov r12d,r11d - ror r14d,9 - psrlq xmm6,2 - xor r13d,r10d - xor r12d,eax - pxor xmm7,xmm6 - ror r13d,5 - xor r14d,ecx - and r12d,r10d - pshufd xmm7,xmm7,128 - xor r13d,r10d - add ebx,DWORD PTR[24+rsp] - mov r15d,ecx - psrldq xmm7,8 - xor r12d,eax - ror r14d,11 - xor r15d,edx - add ebx,r12d - ror r13d,6 - paddd xmm1,xmm7 - and edi,r15d - xor r14d,ecx - add ebx,r13d - pshufd xmm7,xmm1,80 - xor edi,edx - ror r14d,2 - add r9d,ebx - movdqa xmm6,xmm7 - add ebx,edi - mov r13d,r9d - psrld xmm7,10 - add r14d,ebx - ror r13d,14 - psrlq xmm6,17 - mov ebx,r14d - mov r12d,r10d - pxor xmm7,xmm6 - ror r14d,9 - xor r13d,r9d - xor r12d,r11d - ror r13d,5 - xor r14d,ebx - psrlq xmm6,2 - and r12d,r9d - xor r13d,r9d - add eax,DWORD PTR[28+rsp] - pxor xmm7,xmm6 - mov edi,ebx - xor r12d,r11d - ror r14d,11 - pshufd xmm7,xmm7,8 - xor edi,ecx - add eax,r12d - movdqa xmm6,XMMWORD PTR[16+rsi] - ror r13d,6 - and r15d,edi - pslldq xmm7,8 - xor r14d,ebx - add eax,r13d - xor r15d,ecx - paddd xmm1,xmm7 - ror r14d,2 - add r8d,eax - add eax,r15d - paddd xmm6,xmm1 - mov r13d,r8d - add r14d,eax - movdqa XMMWORD PTR[16+rsp],xmm6 - ror r13d,14 - movdqa xmm4,xmm3 - mov eax,r14d - mov r12d,r9d - movdqa xmm7,xmm1 - ror r14d,9 - xor r13d,r8d - xor r12d,r10d - ror r13d,5 - xor r14d,eax -DB 102,15,58,15,226,4 - and r12d,r8d - xor r13d,r8d -DB 102,15,58,15,248,4 - add r11d,DWORD PTR[32+rsp] - mov r15d,eax - xor r12d,r10d - ror r14d,11 - movdqa xmm5,xmm4 - xor r15d,ebx - add r11d,r12d - movdqa xmm6,xmm4 - ror r13d,6 - and edi,r15d - psrld xmm4,3 - xor r14d,eax - add r11d,r13d - xor edi,ebx - paddd xmm2,xmm7 - ror r14d,2 - add edx,r11d - psrld xmm6,7 - add r11d,edi - mov r13d,edx - pshufd xmm7,xmm1,250 - add r14d,r11d - ror r13d,14 - pslld xmm5,14 - mov r11d,r14d - mov r12d,r8d - pxor xmm4,xmm6 - ror r14d,9 - xor r13d,edx - xor r12d,r9d - ror r13d,5 - psrld xmm6,11 - xor r14d,r11d - pxor xmm4,xmm5 - and r12d,edx - xor r13d,edx - pslld xmm5,11 - add r10d,DWORD PTR[36+rsp] - mov edi,r11d - pxor xmm4,xmm6 - xor r12d,r9d - ror r14d,11 - movdqa xmm6,xmm7 - xor edi,eax - add r10d,r12d - pxor xmm4,xmm5 - ror r13d,6 - and r15d,edi - xor r14d,r11d - psrld xmm7,10 - add r10d,r13d - xor r15d,eax - paddd xmm2,xmm4 - ror r14d,2 - add ecx,r10d - psrlq xmm6,17 - add r10d,r15d - mov r13d,ecx - add r14d,r10d - pxor xmm7,xmm6 - ror r13d,14 - mov r10d,r14d - mov r12d,edx - ror r14d,9 - psrlq xmm6,2 - xor r13d,ecx - xor r12d,r8d - pxor xmm7,xmm6 - ror r13d,5 - xor r14d,r10d - and r12d,ecx - pshufd xmm7,xmm7,128 - xor r13d,ecx - add r9d,DWORD PTR[40+rsp] - mov r15d,r10d - psrldq xmm7,8 - xor r12d,r8d - ror r14d,11 - xor r15d,r11d - add r9d,r12d - ror r13d,6 - paddd xmm2,xmm7 - and edi,r15d - xor r14d,r10d - add 
r9d,r13d - pshufd xmm7,xmm2,80 - xor edi,r11d - ror r14d,2 - add ebx,r9d - movdqa xmm6,xmm7 - add r9d,edi - mov r13d,ebx - psrld xmm7,10 - add r14d,r9d - ror r13d,14 - psrlq xmm6,17 - mov r9d,r14d - mov r12d,ecx - pxor xmm7,xmm6 - ror r14d,9 - xor r13d,ebx - xor r12d,edx - ror r13d,5 - xor r14d,r9d - psrlq xmm6,2 - and r12d,ebx - xor r13d,ebx - add r8d,DWORD PTR[44+rsp] - pxor xmm7,xmm6 - mov edi,r9d - xor r12d,edx - ror r14d,11 - pshufd xmm7,xmm7,8 - xor edi,r10d - add r8d,r12d - movdqa xmm6,XMMWORD PTR[32+rsi] - ror r13d,6 - and r15d,edi - pslldq xmm7,8 - xor r14d,r9d - add r8d,r13d - xor r15d,r10d - paddd xmm2,xmm7 - ror r14d,2 - add eax,r8d - add r8d,r15d - paddd xmm6,xmm2 - mov r13d,eax - add r14d,r8d - movdqa XMMWORD PTR[32+rsp],xmm6 - ror r13d,14 - movdqa xmm4,xmm0 - mov r8d,r14d - mov r12d,ebx - movdqa xmm7,xmm2 - ror r14d,9 - xor r13d,eax - xor r12d,ecx - ror r13d,5 - xor r14d,r8d -DB 102,15,58,15,227,4 - and r12d,eax - xor r13d,eax -DB 102,15,58,15,249,4 - add edx,DWORD PTR[48+rsp] - mov r15d,r8d - xor r12d,ecx - ror r14d,11 - movdqa xmm5,xmm4 - xor r15d,r9d - add edx,r12d - movdqa xmm6,xmm4 - ror r13d,6 - and edi,r15d - psrld xmm4,3 - xor r14d,r8d - add edx,r13d - xor edi,r9d - paddd xmm3,xmm7 - ror r14d,2 - add r11d,edx - psrld xmm6,7 - add edx,edi - mov r13d,r11d - pshufd xmm7,xmm2,250 - add r14d,edx - ror r13d,14 - pslld xmm5,14 - mov edx,r14d - mov r12d,eax - pxor xmm4,xmm6 - ror r14d,9 - xor r13d,r11d - xor r12d,ebx - ror r13d,5 - psrld xmm6,11 - xor r14d,edx - pxor xmm4,xmm5 - and r12d,r11d - xor r13d,r11d - pslld xmm5,11 - add ecx,DWORD PTR[52+rsp] - mov edi,edx - pxor xmm4,xmm6 - xor r12d,ebx - ror r14d,11 - movdqa xmm6,xmm7 - xor edi,r8d - add ecx,r12d - pxor xmm4,xmm5 - ror r13d,6 - and r15d,edi - xor r14d,edx - psrld xmm7,10 - add ecx,r13d - xor r15d,r8d - paddd xmm3,xmm4 - ror r14d,2 - add r10d,ecx - psrlq xmm6,17 - add ecx,r15d - mov r13d,r10d - add r14d,ecx - pxor xmm7,xmm6 - ror r13d,14 - mov ecx,r14d - mov r12d,r11d - ror r14d,9 - psrlq xmm6,2 - xor r13d,r10d - xor r12d,eax - pxor xmm7,xmm6 - ror r13d,5 - xor r14d,ecx - and r12d,r10d - pshufd xmm7,xmm7,128 - xor r13d,r10d - add ebx,DWORD PTR[56+rsp] - mov r15d,ecx - psrldq xmm7,8 - xor r12d,eax - ror r14d,11 - xor r15d,edx - add ebx,r12d - ror r13d,6 - paddd xmm3,xmm7 - and edi,r15d - xor r14d,ecx - add ebx,r13d - pshufd xmm7,xmm3,80 - xor edi,edx - ror r14d,2 - add r9d,ebx - movdqa xmm6,xmm7 - add ebx,edi - mov r13d,r9d - psrld xmm7,10 - add r14d,ebx - ror r13d,14 - psrlq xmm6,17 - mov ebx,r14d - mov r12d,r10d - pxor xmm7,xmm6 - ror r14d,9 - xor r13d,r9d - xor r12d,r11d - ror r13d,5 - xor r14d,ebx - psrlq xmm6,2 - and r12d,r9d - xor r13d,r9d - add eax,DWORD PTR[60+rsp] - pxor xmm7,xmm6 - mov edi,ebx - xor r12d,r11d - ror r14d,11 - pshufd xmm7,xmm7,8 - xor edi,ecx - add eax,r12d - movdqa xmm6,XMMWORD PTR[48+rsi] - ror r13d,6 - and r15d,edi - pslldq xmm7,8 - xor r14d,ebx - add eax,r13d - xor r15d,ecx - paddd xmm3,xmm7 - ror r14d,2 - add r8d,eax - add eax,r15d - paddd xmm6,xmm3 - mov r13d,r8d - add r14d,eax - movdqa XMMWORD PTR[48+rsp],xmm6 - cmp BYTE PTR[67+rsi],0 - jne $L$ssse3_00_47 - ror r13d,14 - mov eax,r14d - mov r12d,r9d - ror r14d,9 - xor r13d,r8d - xor r12d,r10d - ror r13d,5 - xor r14d,eax - and r12d,r8d - xor r13d,r8d - add r11d,DWORD PTR[rsp] - mov r15d,eax - xor r12d,r10d - ror r14d,11 - xor r15d,ebx - add r11d,r12d - ror r13d,6 - and edi,r15d - xor r14d,eax - add r11d,r13d - xor edi,ebx - ror r14d,2 - add edx,r11d - add r11d,edi - mov r13d,edx - add r14d,r11d - ror r13d,14 - mov r11d,r14d - mov 
r12d,r8d - ror r14d,9 - xor r13d,edx - xor r12d,r9d - ror r13d,5 - xor r14d,r11d - and r12d,edx - xor r13d,edx - add r10d,DWORD PTR[4+rsp] - mov edi,r11d - xor r12d,r9d - ror r14d,11 - xor edi,eax - add r10d,r12d - ror r13d,6 - and r15d,edi - xor r14d,r11d - add r10d,r13d - xor r15d,eax - ror r14d,2 - add ecx,r10d - add r10d,r15d - mov r13d,ecx - add r14d,r10d - ror r13d,14 - mov r10d,r14d - mov r12d,edx - ror r14d,9 - xor r13d,ecx - xor r12d,r8d - ror r13d,5 - xor r14d,r10d - and r12d,ecx - xor r13d,ecx - add r9d,DWORD PTR[8+rsp] - mov r15d,r10d - xor r12d,r8d - ror r14d,11 - xor r15d,r11d - add r9d,r12d - ror r13d,6 - and edi,r15d - xor r14d,r10d - add r9d,r13d - xor edi,r11d - ror r14d,2 - add ebx,r9d - add r9d,edi - mov r13d,ebx - add r14d,r9d - ror r13d,14 - mov r9d,r14d - mov r12d,ecx - ror r14d,9 - xor r13d,ebx - xor r12d,edx - ror r13d,5 - xor r14d,r9d - and r12d,ebx - xor r13d,ebx - add r8d,DWORD PTR[12+rsp] - mov edi,r9d - xor r12d,edx - ror r14d,11 - xor edi,r10d - add r8d,r12d - ror r13d,6 - and r15d,edi - xor r14d,r9d - add r8d,r13d - xor r15d,r10d - ror r14d,2 - add eax,r8d - add r8d,r15d - mov r13d,eax - add r14d,r8d - ror r13d,14 - mov r8d,r14d - mov r12d,ebx - ror r14d,9 - xor r13d,eax - xor r12d,ecx - ror r13d,5 - xor r14d,r8d - and r12d,eax - xor r13d,eax - add edx,DWORD PTR[16+rsp] - mov r15d,r8d - xor r12d,ecx - ror r14d,11 - xor r15d,r9d - add edx,r12d - ror r13d,6 - and edi,r15d - xor r14d,r8d - add edx,r13d - xor edi,r9d - ror r14d,2 - add r11d,edx - add edx,edi - mov r13d,r11d - add r14d,edx - ror r13d,14 - mov edx,r14d - mov r12d,eax - ror r14d,9 - xor r13d,r11d - xor r12d,ebx - ror r13d,5 - xor r14d,edx - and r12d,r11d - xor r13d,r11d - add ecx,DWORD PTR[20+rsp] - mov edi,edx - xor r12d,ebx - ror r14d,11 - xor edi,r8d - add ecx,r12d - ror r13d,6 - and r15d,edi - xor r14d,edx - add ecx,r13d - xor r15d,r8d - ror r14d,2 - add r10d,ecx - add ecx,r15d - mov r13d,r10d - add r14d,ecx - ror r13d,14 - mov ecx,r14d - mov r12d,r11d - ror r14d,9 - xor r13d,r10d - xor r12d,eax - ror r13d,5 - xor r14d,ecx - and r12d,r10d - xor r13d,r10d - add ebx,DWORD PTR[24+rsp] - mov r15d,ecx - xor r12d,eax - ror r14d,11 - xor r15d,edx - add ebx,r12d - ror r13d,6 - and edi,r15d - xor r14d,ecx - add ebx,r13d - xor edi,edx - ror r14d,2 - add r9d,ebx - add ebx,edi - mov r13d,r9d - add r14d,ebx - ror r13d,14 - mov ebx,r14d - mov r12d,r10d - ror r14d,9 - xor r13d,r9d - xor r12d,r11d - ror r13d,5 - xor r14d,ebx - and r12d,r9d - xor r13d,r9d - add eax,DWORD PTR[28+rsp] - mov edi,ebx - xor r12d,r11d - ror r14d,11 - xor edi,ecx - add eax,r12d - ror r13d,6 - and r15d,edi - xor r14d,ebx - add eax,r13d - xor r15d,ecx - ror r14d,2 - add r8d,eax - add eax,r15d - mov r13d,r8d - add r14d,eax - ror r13d,14 - mov eax,r14d - mov r12d,r9d - ror r14d,9 - xor r13d,r8d - xor r12d,r10d - ror r13d,5 - xor r14d,eax - and r12d,r8d - xor r13d,r8d - add r11d,DWORD PTR[32+rsp] - mov r15d,eax - xor r12d,r10d - ror r14d,11 - xor r15d,ebx - add r11d,r12d - ror r13d,6 - and edi,r15d - xor r14d,eax - add r11d,r13d - xor edi,ebx - ror r14d,2 - add edx,r11d - add r11d,edi - mov r13d,edx - add r14d,r11d - ror r13d,14 - mov r11d,r14d - mov r12d,r8d - ror r14d,9 - xor r13d,edx - xor r12d,r9d - ror r13d,5 - xor r14d,r11d - and r12d,edx - xor r13d,edx - add r10d,DWORD PTR[36+rsp] - mov edi,r11d - xor r12d,r9d - ror r14d,11 - xor edi,eax - add r10d,r12d - ror r13d,6 - and r15d,edi - xor r14d,r11d - add r10d,r13d - xor r15d,eax - ror r14d,2 - add ecx,r10d - add r10d,r15d - mov r13d,ecx - add r14d,r10d - ror r13d,14 - mov r10d,r14d - 
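
For orientation in the unrolled rounds above: the recurring rotate/xor/and sequences are the SHA-256 round primitives of FIPS 180-4, and the DB 15,56,203 / 15,56,204 / 15,56,205 byte runs in the shaext path hand-assemble the SHA-NI instructions sha256rnds2, sha256msg1 and sha256msg2, which batch the round computation and message-schedule updates. A plain-C rendering of the scalar primitives, offered as an illustrative sketch and not part of the deleted file:

    #include <stdint.h>
    #include <stdio.h>

    /* FIPS 180-4 SHA-256 round primitives; each unrolled round above
     * evaluates these with rotate/xor/and (or and/bic) sequences */
    static inline uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }
    static inline uint32_t Ch(uint32_t e, uint32_t f, uint32_t g)  { return (e & f) ^ (~e & g); }
    static inline uint32_t Maj(uint32_t a, uint32_t b, uint32_t c) { return (a & b) ^ (a & c) ^ (b & c); }
    static inline uint32_t Sigma0(uint32_t a) { return rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22); }
    static inline uint32_t Sigma1(uint32_t e) { return rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25); }

    /* one round: T1 = h + Sigma1(e) + Ch(e,f,g) + K[t] + W[t];
     *            T2 = Sigma0(a) + Maj(a,b,c);
     * the eight working variables then shift down, with e = d + T1 and
     * a = T1 + T2; the add/ror/xor chains above interleave two such
     * rounds with the SSSE3 message-schedule updates */
    int main(void)
    {   /* spot check: Sigma0/Sigma1 of 1 xor three rotations of one bit */
        printf("%08x %08x\n", Sigma0(1), Sigma1(1));
        return 0;
    }
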
mov r12d,edx - ror r14d,9 - xor r13d,ecx - xor r12d,r8d - ror r13d,5 - xor r14d,r10d - and r12d,ecx - xor r13d,ecx - add r9d,DWORD PTR[40+rsp] - mov r15d,r10d - xor r12d,r8d - ror r14d,11 - xor r15d,r11d - add r9d,r12d - ror r13d,6 - and edi,r15d - xor r14d,r10d - add r9d,r13d - xor edi,r11d - ror r14d,2 - add ebx,r9d - add r9d,edi - mov r13d,ebx - add r14d,r9d - ror r13d,14 - mov r9d,r14d - mov r12d,ecx - ror r14d,9 - xor r13d,ebx - xor r12d,edx - ror r13d,5 - xor r14d,r9d - and r12d,ebx - xor r13d,ebx - add r8d,DWORD PTR[44+rsp] - mov edi,r9d - xor r12d,edx - ror r14d,11 - xor edi,r10d - add r8d,r12d - ror r13d,6 - and r15d,edi - xor r14d,r9d - add r8d,r13d - xor r15d,r10d - ror r14d,2 - add eax,r8d - add r8d,r15d - mov r13d,eax - add r14d,r8d - ror r13d,14 - mov r8d,r14d - mov r12d,ebx - ror r14d,9 - xor r13d,eax - xor r12d,ecx - ror r13d,5 - xor r14d,r8d - and r12d,eax - xor r13d,eax - add edx,DWORD PTR[48+rsp] - mov r15d,r8d - xor r12d,ecx - ror r14d,11 - xor r15d,r9d - add edx,r12d - ror r13d,6 - and edi,r15d - xor r14d,r8d - add edx,r13d - xor edi,r9d - ror r14d,2 - add r11d,edx - add edx,edi - mov r13d,r11d - add r14d,edx - ror r13d,14 - mov edx,r14d - mov r12d,eax - ror r14d,9 - xor r13d,r11d - xor r12d,ebx - ror r13d,5 - xor r14d,edx - and r12d,r11d - xor r13d,r11d - add ecx,DWORD PTR[52+rsp] - mov edi,edx - xor r12d,ebx - ror r14d,11 - xor edi,r8d - add ecx,r12d - ror r13d,6 - and r15d,edi - xor r14d,edx - add ecx,r13d - xor r15d,r8d - ror r14d,2 - add r10d,ecx - add ecx,r15d - mov r13d,r10d - add r14d,ecx - ror r13d,14 - mov ecx,r14d - mov r12d,r11d - ror r14d,9 - xor r13d,r10d - xor r12d,eax - ror r13d,5 - xor r14d,ecx - and r12d,r10d - xor r13d,r10d - add ebx,DWORD PTR[56+rsp] - mov r15d,ecx - xor r12d,eax - ror r14d,11 - xor r15d,edx - add ebx,r12d - ror r13d,6 - and edi,r15d - xor r14d,ecx - add ebx,r13d - xor edi,edx - ror r14d,2 - add r9d,ebx - add ebx,edi - mov r13d,r9d - add r14d,ebx - ror r13d,14 - mov ebx,r14d - mov r12d,r10d - ror r14d,9 - xor r13d,r9d - xor r12d,r11d - ror r13d,5 - xor r14d,ebx - and r12d,r9d - xor r13d,r9d - add eax,DWORD PTR[60+rsp] - mov edi,ebx - xor r12d,r11d - ror r14d,11 - xor edi,ecx - add eax,r12d - ror r13d,6 - and r15d,edi - xor r14d,ebx - add eax,r13d - xor r15d,ecx - ror r14d,2 - add r8d,eax - add eax,r15d - mov r13d,r8d - add r14d,eax - mov rdi,QWORD PTR[((-64))+rbp] - mov eax,r14d - mov rsi,QWORD PTR[((-56))+rbp] - - add eax,DWORD PTR[rdi] - add ebx,DWORD PTR[4+rdi] - add ecx,DWORD PTR[8+rdi] - add edx,DWORD PTR[12+rdi] - add r8d,DWORD PTR[16+rdi] - add r9d,DWORD PTR[20+rdi] - add r10d,DWORD PTR[24+rdi] - add r11d,DWORD PTR[28+rdi] - - lea rsi,QWORD PTR[64+rsi] - cmp rsi,QWORD PTR[((-48))+rbp] - - mov DWORD PTR[rdi],eax - mov DWORD PTR[4+rdi],ebx - mov DWORD PTR[8+rdi],ecx - mov DWORD PTR[12+rdi],edx - mov DWORD PTR[16+rdi],r8d - mov DWORD PTR[20+rdi],r9d - mov DWORD PTR[24+rdi],r10d - mov DWORD PTR[28+rdi],r11d - jb $L$loop_ssse3 - - xorps xmm0,xmm0 - movaps XMMWORD PTR[rsp],xmm0 - movaps XMMWORD PTR[16+rsp],xmm0 - movaps XMMWORD PTR[32+rsp],xmm0 - movaps XMMWORD PTR[48+rsp],xmm0 - movaps xmm6,XMMWORD PTR[((-128))+rbp] - movaps xmm7,XMMWORD PTR[((-112))+rbp] - movaps xmm8,XMMWORD PTR[((-96))+rbp] - movaps xmm9,XMMWORD PTR[((-80))+rbp] - mov r15,QWORD PTR[((-40))+rbp] - mov r14,QWORD PTR[((-32))+rbp] - mov r13,QWORD PTR[((-24))+rbp] - mov r12,QWORD PTR[((-16))+rbp] - mov rbx,QWORD PTR[((-8))+rbp] - mov rsp,rbp - - pop rbp - -$L$SEH_epilogue_blst_sha256_block_data_order:: - mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue - mov rsi,QWORD 
PTR[16+rsp] - - DB 0F3h,0C3h ;repret - -$L$SEH_end_blst_sha256_block_data_order:: -blst_sha256_block_data_order ENDP -PUBLIC blst_sha256_emit - - -ALIGN 16 -blst_sha256_emit PROC PUBLIC - DB 243,15,30,250 - - mov r8,QWORD PTR[rdx] - mov r9,QWORD PTR[8+rdx] - mov r10,QWORD PTR[16+rdx] - bswap r8 - mov r11,QWORD PTR[24+rdx] - bswap r9 - mov DWORD PTR[4+rcx],r8d - bswap r10 - mov DWORD PTR[12+rcx],r9d - bswap r11 - mov DWORD PTR[20+rcx],r10d - shr r8,32 - mov DWORD PTR[28+rcx],r11d - shr r9,32 - mov DWORD PTR[rcx],r8d - shr r10,32 - mov DWORD PTR[8+rcx],r9d - shr r11,32 - mov DWORD PTR[16+rcx],r10d - mov DWORD PTR[24+rcx],r11d - DB 0F3h,0C3h ;repret -blst_sha256_emit ENDP - -PUBLIC blst_sha256_bcopy - - -ALIGN 16 -blst_sha256_bcopy PROC PUBLIC - DB 243,15,30,250 - - sub rcx,rdx -$L$oop_bcopy:: - movzx eax,BYTE PTR[rdx] - lea rdx,QWORD PTR[1+rdx] - mov BYTE PTR[((-1))+rdx*1+rcx],al - dec r8 - jnz $L$oop_bcopy - DB 0F3h,0C3h ;repret -blst_sha256_bcopy ENDP - -PUBLIC blst_sha256_hcopy - - -ALIGN 16 -blst_sha256_hcopy PROC PUBLIC - DB 243,15,30,250 - - mov r8,QWORD PTR[rdx] - mov r9,QWORD PTR[8+rdx] - mov r10,QWORD PTR[16+rdx] - mov r11,QWORD PTR[24+rdx] - mov QWORD PTR[rcx],r8 - mov QWORD PTR[8+rcx],r9 - mov QWORD PTR[16+rcx],r10 - mov QWORD PTR[24+rcx],r11 - DB 0F3h,0C3h ;repret -blst_sha256_hcopy ENDP -.text$ ENDS -.pdata SEGMENT READONLY ALIGN(4) -ALIGN 4 - DD imagerel $L$SEH_begin_blst_sha256_block_data_order_shaext - DD imagerel $L$SEH_body_blst_sha256_block_data_order_shaext - DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_prologue - - DD imagerel $L$SEH_body_blst_sha256_block_data_order_shaext - DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order_shaext - DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_body - - DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order_shaext - DD imagerel $L$SEH_end_blst_sha256_block_data_order_shaext - DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_epilogue - - DD imagerel $L$SEH_begin_blst_sha256_block_data_order - DD imagerel $L$SEH_body_blst_sha256_block_data_order - DD imagerel $L$SEH_info_blst_sha256_block_data_order_prologue - - DD imagerel $L$SEH_body_blst_sha256_block_data_order - DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order - DD imagerel $L$SEH_info_blst_sha256_block_data_order_body - - DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order - DD imagerel $L$SEH_end_blst_sha256_block_data_order - DD imagerel $L$SEH_info_blst_sha256_block_data_order_epilogue - -.pdata ENDS -.xdata SEGMENT READONLY ALIGN(8) -ALIGN 8 -$L$SEH_info_blst_sha256_block_data_order_shaext_prologue:: -DB 1,4,6,005h -DB 4,074h,2,0 -DB 4,064h,3,0 -DB 4,053h -DB 1,050h - DD 0,0 -$L$SEH_info_blst_sha256_block_data_order_shaext_body:: -DB 1,0,17,85 -DB 000h,068h,000h,000h -DB 000h,078h,001h,000h -DB 000h,088h,002h,000h -DB 000h,098h,003h,000h -DB 000h,0a8h,004h,000h -DB 000h,074h,00ch,000h -DB 000h,064h,00dh,000h -DB 000h,053h -DB 000h,092h -DB 000h,050h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_blst_sha256_block_data_order_shaext_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - -$L$SEH_info_blst_sha256_block_data_order_prologue:: -DB 1,4,6,005h -DB 4,074h,2,0 -DB 4,064h,3,0 -DB 4,053h -DB 1,050h - DD 0,0 -$L$SEH_info_blst_sha256_block_data_order_body:: -DB 1,0,25,133 -DB 000h,068h,000h,000h -DB 000h,078h,001h,000h -DB 000h,088h,002h,000h -DB 000h,098h,003h,000h -DB 000h,0f4h,00bh,000h -DB 000h,0e4h,00ch,000h -DB 000h,0d4h,00dh,000h -DB 
000h,0c4h,00eh,000h -DB 000h,034h,00fh,000h -DB 000h,074h,012h,000h -DB 000h,064h,013h,000h -DB 000h,053h -DB 000h,0f2h -DB 000h,050h -DB 000h,000h,000h,000h,000h,000h -DB 000h,000h,000h,000h -$L$SEH_info_blst_sha256_block_data_order_epilogue:: -DB 1,0,4,0 -DB 000h,074h,001h,000h -DB 000h,064h,002h,000h -DB 000h,000h,000h,000h - - -.xdata ENDS -END diff --git a/crypto/blst_src/bulk_addition.c b/crypto/blst_src/bulk_addition.c deleted file mode 100644 index 4d36f405b64..00000000000 --- a/crypto/blst_src/bulk_addition.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "fields.h" -#include "point.h" - -/* - * This implementation uses explicit addition formula: - * - * λ = (Y₂-Y₁)/(X₂-X₁) - * X₃ = λ²-(X₁+X₂) - * Y₃ = λ⋅(X₁-X₃)-Y₁ - * - * But since we don't know if we'll have to add point to itself, we need - * to eventually resort to corresponding doubling formula: - * - * λ = 3X₁²/2Y₁ - * X₃ = λ²-2X₁ - * Y₃ = λ⋅(X₁-X₃)-Y₁ - * - * The formulae use prohibitively expensive inversion, but whenever we - * have a lot of affine points to accumulate, we can amortize the cost - * by applying Montgomery's batch inversion approach. As a result, - * asymptotic[!] per-point cost for addition is as small as 5M+1S. For - * comparison, ptype##_dadd_affine takes 8M+5S. In practice, all things - * considered, the improvement coefficient varies from 60% to 85% - * depending on platform and curve. - * - * THIS IMPLEMENTATION IS *NOT* CONSTANT-TIME. [But if there is an - * application that requires constant time-ness, speak up!] - */ - -/* - * Calculate λ's numerator and denominator. - * - * input: A x1 y1 - - * B x2 y2 - - * output: - * if A!=B: A x1 y1 (x2-x1)*mul_acc - * B x2+x1 y2-y1 (x2-x1) - * - * if A==B: A x y 2y*mul_acc - * B 2x 3*x^2 2y - * - * if A==-B: A 0 0 1*mul_acc - * B 0 3*x^2 0 - */ -#define HEAD(ptype, bits, field, one) \ -static void ptype##_head(ptype AB[2], const vec##bits mul_acc) \ -{ \ - ptype *A = AB, *B = AB+1; \ - limb_t inf = vec_is_zero(A, sizeof(ptype##_affine)) | \ - vec_is_zero(B, sizeof(ptype##_affine)); \ - static const vec##bits zero = { 0 }; \ -\ - sub_##field(B->Z, B->X, A->X); /* X2-X1 */ \ - add_##field(B->X, B->X, A->X); /* X2+X1 */ \ - add_##field(A->Z, B->Y, A->Y); /* Y2+Y1 */ \ - sub_##field(B->Y, B->Y, A->Y); /* Y2-Y1 */ \ - if (vec_is_zero(B->Z, sizeof(B->Z))) { /* X2==X1 */ \ - inf = vec_is_zero(A->Z, sizeof(A->Z)); \ - vec_select(B->X, A->Z, B->X, sizeof(B->X), inf); \ - sqr_##field(B->Y, A->X); \ - mul_by_3_##field(B->Y, B->Y); /* 3*X1^2 */ \ - vec_copy(B->Z, A->Z, sizeof(B->Z)); /* 2*Y1 */ \ - } /* B->Y is numenator */ \ - /* B->Z is denominator */ \ - vec_select(A->X, B->X, A->X, sizeof(A->X), inf); \ - vec_select(A->Y, A->Z, A->Y, sizeof(A->Y), inf); \ - vec_select(A->Z, one, B->Z, sizeof(A->Z), inf); \ - vec_select(B->Z, zero, B->Z, sizeof(B->Z), inf); \ - if (mul_acc != NULL) \ - mul_##field(A->Z, A->Z, mul_acc); /* chain multiplication */\ -} - -/* - * Calculate λ and resulting coordinates. 
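
The batch trick described above can be made concrete: Montgomery's method accumulates prefix products, performs a single field inversion of the grand total, then walks back unwinding one reciprocal per element, trading n inversions for one inversion plus roughly 3n multiplications. A self-contained C sketch over a deliberately tiny prime field (illustrative only: the toy modulus, the helper names and the fixed-size scratch are assumptions chosen for readability, not part of the deleted file):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define P 1000003ULL    /* toy prime standing in for the 381-bit modulus */

    static uint64_t mul_mod(uint64_t a, uint64_t b) { return (a * b) % P; }

    /* single Fermat inversion: a^(P-2) mod P */
    static uint64_t inv_mod(uint64_t a)
    {
        uint64_t r = 1, e = P - 2;
        for (; e; e >>= 1, a = mul_mod(a, a))
            if (e & 1) r = mul_mod(r, a);
        return r;
    }

    /* Montgomery batch inversion: one inv_mod for all n elements */
    static void batch_invert(uint64_t z[], size_t n)
    {
        uint64_t acc[16], t = 1;    /* prefix products; sketch assumes n <= 16 */
        for (size_t i = 0; i < n; i++)
            acc[i] = t = mul_mod(t, z[i]);      /* acc[i] = z[0]*...*z[i] */
        t = inv_mod(t);                         /* 1/(z[0]*...*z[n-1])    */
        for (size_t i = n; i-- > 1;) {
            uint64_t zi = z[i];
            z[i] = mul_mod(t, acc[i-1]);        /* 1/z[i]                 */
            t = mul_mod(t, zi);                 /* 1/(z[0]*...*z[i-1])    */
        }
        z[0] = t;
    }

    int main(void)
    {
        uint64_t z[4] = {2, 3, 5, 7}, orig[4];
        memcpy(orig, z, sizeof(orig));
        batch_invert(z, 4);
        for (int i = 0; i < 4; i++)
            assert(mul_mod(z[i], orig[i]) == 1);
        puts("batch inversion OK");
        return 0;
    }

In the macros that follow, this same walk is threaded through the points' spare Z coordinates: HEAD chains the running product via mul_acc, a single reciprocal_##field inverts the grand total, and TAIL unwinds it one point at a time.
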
- * - * input: A x1 y1 - - * B x2+x1 nominator - - * lambda 1/denominator - * output: D x3=(nom/den)^2-(x2+x1) y3=(nom/den)(x1-x3)-y1 - */ -#define TAIL(ptype, bits, field, one) \ -static void ptype##_tail(ptype *D, ptype AB[2], vec##bits lambda) \ -{ \ - ptype *A = AB, *B = AB+1; \ - vec##bits llambda; \ - limb_t inf = vec_is_zero(B->Z, sizeof(B->Z)); \ -\ - mul_##field(lambda, lambda, B->Y); /* λ = (Y2-Y1)/(X2-X1) */ \ - /* alt. 3*X1^2/2*Y1 */ \ - sqr_##field(llambda, lambda); \ - sub_##field(D->X, llambda, B->X); /* X3 = λ^2-X1-X2 */ \ -\ - sub_##field(D->Y, A->X, D->X); \ - mul_##field(D->Y, D->Y, lambda); \ - sub_##field(D->Y, D->Y, A->Y); /* Y3 = λ*(X1-X3)-Y1 */ \ -\ - vec_select(D->X, A->X, D->X, 2*sizeof(D->X), inf); \ - vec_select(B->Z, one, B->Z, sizeof(B->Z), inf); \ -} - -/* - * |points[]| is volatile buffer with |X|s and |Y|s initially holding - * input affine coordinates, and with |Z|s being used as additional - * temporary storage [unrelated to Jacobian coordinates]. |sum| is - * in-/output, initialize to infinity accordingly. - */ -#define ADDITION_BTREE(prefix, ptype, bits, field, one) \ -HEAD(ptype, bits, field, one) \ -TAIL(ptype, bits, field, one) \ -static void ptype##s_accumulate(ptype *sum, ptype points[], size_t n) \ -{ \ - ptype *dst; \ - void *mul_acc; \ - size_t i; \ -\ - while (n >= 16) { \ - if (n & 1) \ - ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \ - n /= 2; \ - for (mul_acc = NULL, i = n; i--; mul_acc = points->Z, points += 2) \ - ptype##_head(points, mul_acc); \ -\ - reciprocal_##field(points[-2].Z, points[-2].Z); /* 1/∏ Zi */ \ -\ - for (dst = points, i = n; --i;) { \ - dst--; points -= 2; \ - mul_##field(points[-2].Z, points[0].Z, points[-2].Z); \ - ptype##_tail(dst, points, points[-2].Z); \ - mul_##field(points[-2].Z, points[0].Z, points[1].Z); \ - } \ - dst--; points -= 2; \ - ptype##_tail(dst, points, points[0].Z); \ - points = dst; \ - } \ - while (n--) \ - ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \ -} \ -\ -void prefix##s_add(ptype *sum, const ptype##_affine *const points[], \ - size_t npoints) \ -{ \ - const size_t stride = SCRATCH_LIMIT / sizeof(ptype); \ - ptype *scratch = alloca((npoints > stride ? stride : npoints) * \ - sizeof(ptype)); \ - const ptype##_affine *point = NULL; \ -\ - vec_zero(sum, sizeof(*sum)); \ - while (npoints) { \ - size_t i, j = npoints > stride ? stride : npoints; \ - for (i=0; i> (8 * (n % sizeof(limb_t)))); - } -} - -static inline void limbs_from_le_bytes(limb_t *restrict ret, - const unsigned char *in, size_t n) -{ - limb_t limb = 0; - - while(n--) { - limb <<= 8; - limb |= in[n]; - /* - * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper - * to perform redundant stores than to pay penalty for - * mispredicted branch. Besides, some compilers unroll the - * loop and remove redundant stores to 'restrict'-ed storage... 
- */ - ret[n / sizeof(limb_t)] = limb; - } -} - -static inline void le_bytes_from_limbs(unsigned char *out, const limb_t *in, - size_t n) -{ - const union { - long one; - char little; - } is_endian = { 1 }; - limb_t limb; - size_t i, j, r; - - if ((uptr_t)out == (uptr_t)in && is_endian.little) - return; - - r = n % sizeof(limb_t); - n /= sizeof(limb_t); - - for(i = 0; i < n; i++) { - for (limb = in[i], j = 0; j < sizeof(limb_t); j++, limb >>= 8) - *out++ = (unsigned char)limb; - } - if (r) { - for (limb = in[i], j = 0; j < r; j++, limb >>= 8) - *out++ = (unsigned char)limb; - } -} - -static inline char hex_from_nibble(unsigned char nibble) -{ - int mask = (9 - (nibble &= 0xf)) >> 31; - return (char)(nibble + ((('a'-10) & mask) | ('0' & ~mask))); -} - -static unsigned char nibble_from_hex(char c) -{ - int mask, ret; - - mask = (('a'-c-1) & (c-1-'f')) >> 31; - ret = (10 + c - 'a') & mask; - mask = (('A'-c-1) & (c-1-'F')) >> 31; - ret |= (10 + c - 'A') & mask; - mask = (('0'-c-1) & (c-1-'9')) >> 31; - ret |= (c - '0') & mask; - mask = ((ret-1) & ~mask) >> 31; - ret |= 16 & mask; - - return (unsigned char)ret; -} - -static void bytes_from_hexascii(unsigned char *ret, size_t sz, const char *hex) -{ - size_t len; - unsigned char b = 0; - - if (hex[0]=='0' && (hex[1]=='x' || hex[1]=='X')) - hex += 2; - - for (len = 0; len<2*sz && nibble_from_hex(hex[len])<16; len++) ; - - bytes_zero(ret, sz); - - while(len--) { - b <<= 4; - b |= nibble_from_hex(*hex++); - if (len % 2 == 0) - ret[len / 2] = b; - } -} - -static void limbs_from_hexascii(limb_t *ret, size_t sz, const char *hex) -{ - size_t len; - limb_t limb = 0; - - if (hex[0]=='0' && (hex[1]=='x' || hex[1]=='X')) - hex += 2; - - for (len = 0; len<2*sz && nibble_from_hex(hex[len])<16; len++) ; - - vec_zero(ret, sz); - - while(len--) { - limb <<= 4; - limb |= nibble_from_hex(*hex++); - if (len % (2*sizeof(limb_t)) == 0) - ret[len / (2*sizeof(limb_t))] = limb; - } -} - -#endif diff --git a/crypto/blst_src/client_min_pk.c b/crypto/blst_src/client_min_pk.c deleted file mode 100644 index 0fcf563f502..00000000000 --- a/crypto/blst_src/client_min_pk.c +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "keygen.c" -#include "e2.c" -#include "hash_to_field.c" -#include "map_to_g2.c" -#include "e1.c" -#include "exp.c" -#include "sqrt.c" -#include "recip.c" -#include "consts.c" -#include "vect.c" -#include "exports.c" diff --git a/crypto/blst_src/client_min_sig.c b/crypto/blst_src/client_min_sig.c deleted file mode 100644 index 8e4663daede..00000000000 --- a/crypto/blst_src/client_min_sig.c +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "keygen.c" -#include "e1.c" -#include "hash_to_field.c" -#include "map_to_g1.c" -#include "e2.c" -#include "exp.c" -#include "sqrt.c" -#include "recip.c" -#include "consts.c" -#include "vect.c" -#include "exports.c" diff --git a/crypto/blst_src/consts.c b/crypto/blst_src/consts.c deleted file mode 100644 index 021c878a258..00000000000 --- a/crypto/blst_src/consts.c +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
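
A note on the branchless helpers in bytes.h above: in nibble_from_hex each (expr) >> 31 turns the sign bit into an all-ones mask exactly when expr is negative, that is, when the character lies inside the tested range, so no data-dependent branch is taken. A quick harness checking it against an obvious branching reference (illustrative only; nibble_ref is a name invented for this sketch):

    #include <assert.h>
    #include <stdio.h>

    /* branchless decoder, reproduced from the deleted bytes.h above;
     * relies on arithmetic right shift of negative ints, as blst does */
    static unsigned char nibble_from_hex(char c)
    {
        int mask, ret;

        mask = (('a'-c-1) & (c-1-'f')) >> 31;   /* all-ones iff 'a' <= c <= 'f' */
        ret  = (10 + c - 'a') & mask;
        mask = (('A'-c-1) & (c-1-'F')) >> 31;
        ret |= (10 + c - 'A') & mask;
        mask = (('0'-c-1) & (c-1-'9')) >> 31;
        ret |= (c - '0') & mask;
        mask = ((ret-1) & ~mask) >> 31;         /* no range matched: flag with 16 */
        ret |= 16 & mask;

        return (unsigned char)ret;
    }

    static unsigned char nibble_ref(char c)     /* branching reference */
    {
        if (c >= '0' && c <= '9') return (unsigned char)(c - '0');
        if (c >= 'a' && c <= 'f') return (unsigned char)(c - 'a' + 10);
        if (c >= 'A' && c <= 'F') return (unsigned char)(c - 'A' + 10);
        return 16;
    }

    int main(void)
    {
        for (int c = 0; c < 128; c++)
            assert(nibble_from_hex((char)c) == nibble_ref((char)c));
        puts("branchless nibble_from_hex matches the reference");
        return 0;
    }
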
- * SPDX-License-Identifier: Apache-2.0 - */ - -#include "consts.h" - -/* z = -0xd201000000010000 */ -const vec384 BLS12_381_P = { /* (z-1)^2 * (z^4 - z^2 + 1)/3 + z */ - TO_LIMB_T(0xb9feffffffffaaab), TO_LIMB_T(0x1eabfffeb153ffff), - TO_LIMB_T(0x6730d2a0f6b0f624), TO_LIMB_T(0x64774b84f38512bf), - TO_LIMB_T(0x4b1ba7b6434bacd7), TO_LIMB_T(0x1a0111ea397fe69a) -}; -const limb_t BLS12_381_p0 = (limb_t)0x89f3fffcfffcfffd; /* -1/P */ - -const radix384 BLS12_381_Rx = { /* (1<<384)%P, "radix", one-in-Montgomery */ - { { ONE_MONT_P }, - { 0 } } -}; - -const vec384 BLS12_381_RR = { /* (1<<768)%P, "radix"^2, to-Montgomery */ - TO_LIMB_T(0xf4df1f341c341746), TO_LIMB_T(0x0a76e6a609d104f1), - TO_LIMB_T(0x8de5476c4c95b6d5), TO_LIMB_T(0x67eb88a9939d83c0), - TO_LIMB_T(0x9a793e85b519952d), TO_LIMB_T(0x11988fe592cae3aa) -}; - -const vec256 BLS12_381_r = { /* z^4 - z^2 + 1, group order */ - TO_LIMB_T(0xffffffff00000001), TO_LIMB_T(0x53bda402fffe5bfe), - TO_LIMB_T(0x3339d80809a1d805), TO_LIMB_T(0x73eda753299d7d48) -}; - -const vec256 BLS12_381_rRR = { /* (1<<512)%r, "radix"^2, to-Montgomery */ - TO_LIMB_T(0xc999e990f3f29c6d), TO_LIMB_T(0x2b6cedcb87925c23), - TO_LIMB_T(0x05d314967254398f), TO_LIMB_T(0x0748d9d99f59ff11) -}; diff --git a/crypto/blst_src/consts.h b/crypto/blst_src/consts.h deleted file mode 100644 index cb391b817df..00000000000 --- a/crypto/blst_src/consts.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef __BLS12_381_ASM_CONST_H__ -#define __BLS12_381_ASM_CONST_H__ -#include "vect.h" - -extern const vec384 BLS12_381_P; -extern const limb_t BLS12_381_p0; -static const limb_t p0 = (limb_t)0x89f3fffcfffcfffd; /* -1/P */ -typedef union { vec384 p12[12]; vec384x p2; vec384 p; } radix384; -extern const radix384 BLS12_381_Rx; /* (1<<384)%P, "radix", one-in-Montgomery */ -extern const vec384 BLS12_381_RR; /* (1<<768)%P, "radix"^2, to-Montgomery */ - -#define ONE_MONT_P TO_LIMB_T(0x760900000002fffd), \ - TO_LIMB_T(0xebf4000bc40c0002), \ - TO_LIMB_T(0x5f48985753c758ba), \ - TO_LIMB_T(0x77ce585370525745), \ - TO_LIMB_T(0x5c071a97a256ec6d), \ - TO_LIMB_T(0x15f65ec3fa80e493) - -#define ZERO_384 (BLS12_381_Rx.p2[1]) - -extern const vec256 BLS12_381_r; /* order */ -static const limb_t r0 = (limb_t)0xfffffffeffffffff; /* -1/r */ -extern const vec256 BLS12_381_rRR; /* (1<<512)%r, "radix"^2, to-Montgomery */ - -#endif diff --git a/crypto/blst_src/cpuid.c b/crypto/blst_src/cpuid.c deleted file mode 100644 index 43b9229d341..00000000000 --- a/crypto/blst_src/cpuid.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
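
A note on the constants above: BLS12_381_p0 is the Montgomery constant -1/P mod 2^64, the word-sized factor used by Montgomery reduction. It can be re-derived from the low limb of P by Newton iteration, since each step doubles the number of correct low-order bits; an illustrative check, not part of the deleted file:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const uint64_t p = 0xb9feffffffffaaabULL; /* low limb of BLS12_381_P */
        uint64_t x = p;         /* p*p == 1 (mod 8), so 3 bits already match */

        for (int i = 0; i < 5; i++)     /* 3 -> 6 -> 12 -> 24 -> 48 -> 96 bits */
            x *= 2 - p * x;             /* Newton step for 1/p mod 2^64 */

        /* negating 1/p gives -1/P mod 2^64; this should print
         * 0x89f3fffcfffcfffd, the BLS12_381_p0 value in consts.c above */
        printf("0x%016llx\n", (unsigned long long)(0ULL - x));
        return 0;
    }
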
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#if (defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_C)) && !defined(_WIN32)
-__attribute__((visibility("hidden")))
-#endif
-int __blst_platform_cap = 0;
-
-#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64)
-
-# if defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_C)
-static void __cpuidex(int info[4], int func, int sub)
-{
-    int eax, ebx, ecx, edx;
-
-    __asm__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
-                    : "a"(func), "c"(sub));
-
-    info[0] = eax;
-    info[1] = ebx;
-    info[2] = ecx;
-    info[3] = edx;
-}
-# else
-#  include <intrin.h>
-# endif
-
-# if defined(__GNUC__) || defined(__clang__)
-__attribute__((constructor))
-# endif
-static int __blst_cpuid(void)
-{
-    int info[4], cap = 0;
-
-    __cpuidex(info, 0, 0);
-    if (info[0] > 6) {
-        __cpuidex(info, 7, 0);
-        cap |= (info[1]>>19) & 1;   /* ADX */
-        cap |= (info[1]>>28) & 2;   /* SHA */
-    }
-
-    __blst_platform_cap = cap;
-
-    return 0;
-}
-
-# if defined(_MSC_VER) && !defined(__clang__)
-# pragma section(".CRT$XCU",read)
-__declspec(allocate(".CRT$XCU")) static int (*p)(void) = __blst_cpuid;
-# elif defined(__SUNPRO_C)
-# pragma init(__blst_cpuid)
-# endif
-
-#elif defined(__aarch64__) || defined(__aarch64)
-
-# if defined(__linux__) && (defined(__GNUC__) || defined(__clang__))
-extern unsigned long getauxval(unsigned long type) __attribute__ ((weak));
-
-__attribute__((constructor))
-static int __blst_cpuid(void)
-{
-    int cap = 0;
-
-    if (getauxval) {
-        unsigned long hwcap_ce = getauxval(16);
-        cap = (hwcap_ce>>6) & 1;    /* SHA256 */
-    }
-
-    __blst_platform_cap = cap;
-
-    return 0;
-}
-# elif defined(__APPLE__) && (defined(__GNUC__) || defined(__clang__))
-__attribute__((constructor))
-static int __blst_cpuid()
-{
-    __blst_platform_cap = 1;    /* SHA256 */
-    return 0;
-}
-# endif
-
-#endif
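
For reference on the feature probe above: CPUID leaf 7 (sub-leaf 0) reports ADX in EBX bit 19 and the SHA extensions in EBX bit 29, which is why __blst_cpuid packs them as cap bits 0 and 1 via (info[1]>>19) & 1 and (info[1]>>28) & 2. With GCC or Clang the same probe can be sketched through the cpuid.h helper (illustrative only, not part of the deleted sources):

    #include <stdio.h>
    #include <cpuid.h>  /* GCC/Clang wrapper around the CPUID instruction */

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* leaf 7, sub-leaf 0: structured extended feature flags */
        if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
            printf("ADX: %u\n", (ebx >> 19) & 1);  /* cap bit 0 above */
            printf("SHA: %u\n", (ebx >> 29) & 1);  /* cap bit 1 above */
        }
        return 0;
    }

diff --git a/crypto/blst_src/e1.c b/crypto/blst_src/e1.c
deleted file mode 100644
index f8a7be7bc14..00000000000
--- a/crypto/blst_src/e1.c
+++ /dev/null
@@ -1,564 +0,0 @@
-/*
- * Copyright Supranational LLC
- * Licensed under the Apache License, Version 2.0, see LICENSE for details.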
- * SPDX-License-Identifier: Apache-2.0 - */ - -#include "point.h" -#include "fields.h" -#include "errors.h" - -/* - * y^2 = x^3 + B - */ -static const vec384 B_E1 = { /* (4 << 384) % P */ - TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), - TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), - TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) -}; - -const POINTonE1 BLS12_381_G1 = { /* generator point [in Montgomery] */ - /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905 - * a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */ - { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5), - TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747), - TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) }, - /* (0x08b3f481e3aaa0f1a09e30ed741d8ae4fcf5e095d5d00af6 - * 00db18cb2c04b3edd03cc744a2888ae40caa232946c5e7e1 << 384) % P */ - { TO_LIMB_T(0xbaac93d50ce72271), TO_LIMB_T(0x8c22631a7918fd8e), - TO_LIMB_T(0xdd595f13570725ce), TO_LIMB_T(0x51ac582950405194), - TO_LIMB_T(0x0e1c8c3fad0059c0), TO_LIMB_T(0x0bbc3efc5008a26a) }, - { ONE_MONT_P } -}; - -const POINTonE1 BLS12_381_NEG_G1 = { /* negative generator [in Montgomery] */ - /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905 - * a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */ - { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5), - TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747), - TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) }, - /* (0x114d1d6855d545a8aa7d76c8cf2e21f267816aef1db507c9 - * 6655b9d5caac42364e6f38ba0ecb751bad54dcd6b939c2ca << 384) % P */ - { TO_LIMB_T(0xff526c2af318883a), TO_LIMB_T(0x92899ce4383b0270), - TO_LIMB_T(0x89d7738d9fa9d055), TO_LIMB_T(0x12caf35ba344c12a), - TO_LIMB_T(0x3cff1b76964b5317), TO_LIMB_T(0x0e44d2ede9774430) }, - { ONE_MONT_P } -}; - -static inline void mul_by_b_onE1(vec384 out, const vec384 in) -{ lshift_fp(out, in, 2); } - -static inline void mul_by_4b_onE1(vec384 out, const vec384 in) -{ lshift_fp(out, in, 4); } - -static void POINTonE1_cneg(POINTonE1 *p, bool_t cbit) -{ cneg_fp(p->Y, p->Y, cbit); } - -void blst_p1_cneg(POINTonE1 *a, int cbit) -{ POINTonE1_cneg(a, is_zero(cbit) ^ 1); } - -static void POINTonE1_from_Jacobian(POINTonE1 *out, const POINTonE1 *in) -{ - vec384 Z, ZZ; - limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); - - reciprocal_fp(Z, in->Z); /* 1/Z */ - - sqr_fp(ZZ, Z); - mul_fp(out->X, in->X, ZZ); /* X = X/Z^2 */ - - mul_fp(ZZ, ZZ, Z); - mul_fp(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ - - vec_select(out->Z, in->Z, BLS12_381_G1.Z, - sizeof(BLS12_381_G1.Z), inf); /* Z = inf ? 
0 : 1 */ -} - -void blst_p1_from_jacobian(POINTonE1 *out, const POINTonE1 *a) -{ POINTonE1_from_Jacobian(out, a); } - -static void POINTonE1_to_affine(POINTonE1_affine *out, const POINTonE1 *in) -{ - POINTonE1 p; - - if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { - POINTonE1_from_Jacobian(&p, in); - in = &p; - } - vec_copy(out, in, sizeof(*out)); -} - -void blst_p1_to_affine(POINTonE1_affine *out, const POINTonE1 *a) -{ POINTonE1_to_affine(out, a); } - -void blst_p1_from_affine(POINTonE1 *out, const POINTonE1_affine *a) -{ - vec_copy(out, a, sizeof(*a)); - vec_select(out->Z, a->X, BLS12_381_Rx.p, sizeof(out->Z), - vec_is_zero(a, sizeof(*a))); -} - -static bool_t POINTonE1_affine_on_curve(const POINTonE1_affine *p) -{ - vec384 XXX, YY; - - sqr_fp(XXX, p->X); - mul_fp(XXX, XXX, p->X); /* X^3 */ - add_fp(XXX, XXX, B_E1); /* X^3 + B */ - - sqr_fp(YY, p->Y); /* Y^2 */ - - return vec_is_equal(XXX, YY, sizeof(XXX)); -} - -int blst_p1_affine_on_curve(const POINTonE1_affine *p) -{ return (int)(POINTonE1_affine_on_curve(p) | vec_is_zero(p, sizeof(*p))); } - -static bool_t POINTonE1_on_curve(const POINTonE1 *p) -{ - vec384 XXX, YY, BZ6; - limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); - - sqr_fp(BZ6, p->Z); - mul_fp(BZ6, BZ6, p->Z); - sqr_fp(BZ6, BZ6); /* Z^6 */ - mul_by_b_onE1(BZ6, BZ6); /* B*Z^6 */ - - sqr_fp(XXX, p->X); - mul_fp(XXX, XXX, p->X); /* X^3 */ - add_fp(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ - - sqr_fp(YY, p->Y); /* Y^2 */ - - return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; -} - -int blst_p1_on_curve(const POINTonE1 *p) -{ return (int)POINTonE1_on_curve(p); } - -static limb_t POINTonE1_affine_Serialize_BE(unsigned char out[96], - const POINTonE1_affine *in) -{ - vec384 temp; - - from_fp(temp, in->X); - be_bytes_from_limbs(out, temp, sizeof(temp)); - - from_fp(temp, in->Y); - be_bytes_from_limbs(out + 48, temp, sizeof(temp)); - - return sgn0_pty_mod_384(temp, BLS12_381_P); -} - -void blst_p1_affine_serialize(unsigned char out[96], - const POINTonE1_affine *in) -{ - if (vec_is_zero(in->X, 2*sizeof(in->X))) { - bytes_zero(out, 96); - out[0] = 0x40; /* infinity bit */ - } else { - (void)POINTonE1_affine_Serialize_BE(out, in); - } -} - -static limb_t POINTonE1_Serialize_BE(unsigned char out[96], - const POINTonE1 *in) -{ - POINTonE1 p; - - if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { - POINTonE1_from_Jacobian(&p, in); - in = &p; - } - - return POINTonE1_affine_Serialize_BE(out, (const POINTonE1_affine *)in); -} - -static void POINTonE1_Serialize(unsigned char out[96], const POINTonE1 *in) -{ - if (vec_is_zero(in->Z, sizeof(in->Z))) { - bytes_zero(out, 96); - out[0] = 0x40; /* infinity bit */ - } else { - (void)POINTonE1_Serialize_BE(out, in); - } -} - -void blst_p1_serialize(unsigned char out[96], const POINTonE1 *in) -{ POINTonE1_Serialize(out, in); } - -static limb_t POINTonE1_affine_Compress_BE(unsigned char out[48], - const POINTonE1_affine *in) -{ - vec384 temp; - - from_fp(temp, in->X); - be_bytes_from_limbs(out, temp, sizeof(temp)); - - return sgn0_pty_mont_384(in->Y, BLS12_381_P, p0); -} - -void blst_p1_affine_compress(unsigned char out[48], const POINTonE1_affine *in) -{ - if (vec_is_zero(in->X, 2*sizeof(in->X))) { - bytes_zero(out, 48); - out[0] = 0xc0; /* compressed and infinity bits */ - } else { - limb_t sign = POINTonE1_affine_Compress_BE(out, in); - out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); - } -} - -static limb_t POINTonE1_Compress_BE(unsigned char out[48], - const POINTonE1 *in) -{ - POINTonE1 p; - - if (!vec_is_equal(in->Z, BLS12_381_Rx.p, 
sizeof(in->Z))) { - POINTonE1_from_Jacobian(&p, in); - in = &p; - } - - return POINTonE1_affine_Compress_BE(out, (const POINTonE1_affine *)in); -} - -void blst_p1_compress(unsigned char out[48], const POINTonE1 *in) -{ - if (vec_is_zero(in->Z, sizeof(in->Z))) { - bytes_zero(out, 48); - out[0] = 0xc0; /* compressed and infinity bits */ - } else { - limb_t sign = POINTonE1_Compress_BE(out, in); - out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); - } -} - -static limb_t POINTonE1_Uncompress_BE(POINTonE1_affine *out, - const unsigned char in[48]) -{ - POINTonE1_affine ret; - vec384 temp; - - limbs_from_be_bytes(ret.X, in, sizeof(ret.X)); - /* clear top 3 bits in case caller was conveying some information there */ - ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; - add_fp(temp, ret.X, ZERO_384); /* less than modulus? */ - if (!vec_is_equal(temp, ret.X, sizeof(temp))) - return (limb_t)0 - BLST_BAD_ENCODING; - mul_fp(ret.X, ret.X, BLS12_381_RR); - - sqr_fp(ret.Y, ret.X); - mul_fp(ret.Y, ret.Y, ret.X); - add_fp(ret.Y, ret.Y, B_E1); /* X^3 + B */ - if (!sqrt_fp(ret.Y, ret.Y)) - return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; - - vec_copy(out, &ret, sizeof(ret)); - - return sgn0_pty_mont_384(out->Y, BLS12_381_P, p0); -} - -static BLST_ERROR POINTonE1_Uncompress_Z(POINTonE1_affine *out, - const unsigned char in[48]) -{ - unsigned char in0 = in[0]; - limb_t sgn0_pty; - - if ((in0 & 0x80) == 0) /* compressed bit */ - return BLST_BAD_ENCODING; - - if (in0 & 0x40) { /* infinity bit */ - if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 47)) { - vec_zero(out, sizeof(*out)); - return BLST_SUCCESS; - } else { - return BLST_BAD_ENCODING; - } - } - - sgn0_pty = POINTonE1_Uncompress_BE(out, in); - - if (sgn0_pty > 3) - return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ - - sgn0_pty >>= 1; /* skip over parity bit */ - sgn0_pty ^= (in0 & 0x20) >> 5; - cneg_fp(out->Y, out->Y, sgn0_pty); - - /* (0,±2) is not in group, but application might want to ignore? */ - return vec_is_zero(out->X, sizeof(out->X)) ? BLST_POINT_NOT_IN_GROUP - : BLST_SUCCESS; -} - -BLST_ERROR blst_p1_uncompress(POINTonE1_affine *out, const unsigned char in[48]) -{ return POINTonE1_Uncompress_Z(out, in); } - -static BLST_ERROR POINTonE1_Deserialize_BE(POINTonE1_affine *out, - const unsigned char in[96]) -{ - POINTonE1_affine ret; - vec384 temp; - - limbs_from_be_bytes(ret.X, in, sizeof(ret.X)); - limbs_from_be_bytes(ret.Y, in + 48, sizeof(ret.Y)); - - /* clear top 3 bits in case caller was conveying some information there */ - ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; - add_fp(temp, ret.X, ZERO_384); /* less than modulus? */ - if (!vec_is_equal(temp, ret.X, sizeof(temp))) - return BLST_BAD_ENCODING; - - add_fp(temp, ret.Y, ZERO_384); /* less than modulus? */ - if (!vec_is_equal(temp, ret.Y, sizeof(temp))) - return BLST_BAD_ENCODING; - - mul_fp(ret.X, ret.X, BLS12_381_RR); - mul_fp(ret.Y, ret.Y, BLS12_381_RR); - - if (!POINTonE1_affine_on_curve(&ret)) - return BLST_POINT_NOT_ON_CURVE; - - vec_copy(out, &ret, sizeof(ret)); - - /* (0,±2) is not in group, but application might want to ignore? */ - return vec_is_zero(out->X, sizeof(out->X)) ? 
BLST_POINT_NOT_IN_GROUP - : BLST_SUCCESS; -} - -static BLST_ERROR POINTonE1_Deserialize_Z(POINTonE1_affine *out, - const unsigned char in[96]) -{ - unsigned char in0 = in[0]; - - if ((in0 & 0xe0) == 0) - return POINTonE1_Deserialize_BE(out, in); - - if (in0 & 0x80) /* compressed bit */ - return POINTonE1_Uncompress_Z(out, in); - - if (in0 & 0x40) { /* infinity bit */ - if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) { - vec_zero(out, sizeof(*out)); - return BLST_SUCCESS; - } - } - - return BLST_BAD_ENCODING; -} - -BLST_ERROR blst_p1_deserialize(POINTonE1_affine *out, - const unsigned char in[96]) -{ return POINTonE1_Deserialize_Z(out, in); } - -#include "ec_ops.h" -POINT_DADD_IMPL(POINTonE1, 384, fp) -POINT_DADD_AFFINE_IMPL_A0(POINTonE1, 384, fp, BLS12_381_Rx.p) -POINT_ADD_IMPL(POINTonE1, 384, fp) -POINT_ADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) -POINT_DOUBLE_IMPL_A0(POINTonE1, 384, fp) -POINT_IS_EQUAL_IMPL(POINTonE1, 384, fp) - -void blst_p1_add(POINTonE1 *out, const POINTonE1 *a, const POINTonE1 *b) -{ POINTonE1_add(out, a, b); } - -void blst_p1_add_or_double(POINTonE1 *out, const POINTonE1 *a, - const POINTonE1 *b) -{ POINTonE1_dadd(out, a, b, NULL); } - -void blst_p1_add_affine(POINTonE1 *out, const POINTonE1 *a, - const POINTonE1_affine *b) -{ POINTonE1_add_affine(out, a, b); } - -void blst_p1_add_or_double_affine(POINTonE1 *out, const POINTonE1 *a, - const POINTonE1_affine *b) -{ POINTonE1_dadd_affine(out, a, b); } - -void blst_p1_double(POINTonE1 *out, const POINTonE1 *a) -{ POINTonE1_double(out, a); } - -int blst_p1_is_equal(const POINTonE1 *a, const POINTonE1 *b) -{ return (int)POINTonE1_is_equal(a, b); } - -#include "ec_mult.h" -POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 4) -POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 5) - -#ifdef __BLST_PRIVATE_TESTMODE__ -POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE1) - -DECLARE_PRIVATE_POINTXZ(POINTonE1, 384) -POINT_LADDER_PRE_IMPL(POINTonE1, 384, fp) -POINT_LADDER_STEP_IMPL_A0(POINTonE1, 384, fp, onE1) -POINT_LADDER_POST_IMPL_A0(POINTonE1, 384, fp, onE1) -POINT_MULT_SCALAR_LADDER_IMPL(POINTonE1) -#endif - -static const vec384 beta = { /* such that beta^3 - 1 = 0 */ - /* -1/2 * (1 + sqrt(-3)) = ((P-2)^(P-2)) * (1 + (P-3)^((P+1)/4)) */ - /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 - 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaac << 384) % P */ - TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), - TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), - TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) -}; - -static void sigma(POINTonE1 *out, const POINTonE1 *in) -{ - vec_copy(out->X, in->X, 2*sizeof(out->X)); - mul_fp(out->Z, in->Z, beta); -} - -/* Gallant-Lambert-Vanstone, ~45% faster than POINTonE1_mult_w5 */ -static void POINTonE1_mult_glv(POINTonE1 *out, const POINTonE1 *in, - const pow256 SK) -{ - union { vec256 l; pow256 s; } val; - - /* SK/z^2 [in constant time] */ - - limbs_from_le_bytes(val.l, SK, 32); - div_by_zz(val.l); - le_bytes_from_limbs(val.s, val.l, 32); - - { - const byte *scalars[2] = { val.s+16, val.s }; - POINTonE1 table[2][1<<(5-1)]; /* 4.5KB */ - size_t i; - - POINTonE1_precompute_w5(table[0], in); - for (i = 0; i < 1<<(5-1); i++) { - mul_fp(table[1][i].X, table[0][i].X, beta); - cneg_fp(table[1][i].Y, table[0][i].Y, 1); - vec_copy(table[1][i].Z, table[0][i].Z, sizeof(table[1][i].Z)); - } - - POINTonE1s_mult_w5(out, NULL, 2, scalars, 128, table); - POINTonE1_cneg(out, 1); - mul_fp(out->Z, out->Z, beta); - mul_fp(out->Z, out->Z, beta); - } - - vec_zero(val.l, sizeof(val)); /* 
scrub the copy of SK */
-}
-
-static void POINTonE1_sign(POINTonE1 *out, const POINTonE1 *in, const pow256 SK)
-{
-    vec384 Z, ZZ;
-    limb_t inf;
-
-    POINTonE1_mult_glv(out, in, SK);
-
-    /* convert to affine to remove possible bias in out->Z */
-    inf = vec_is_zero(out->Z, sizeof(out->Z));
-#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-    flt_reciprocal_fp(Z, out->Z);                   /* 1/Z */
-#else
-    reciprocal_fp(Z, out->Z);                       /* 1/Z */
-#endif
-
-    sqr_fp(ZZ, Z);
-    mul_fp(out->X, out->X, ZZ);                     /* X = X/Z^2 */
-
-    mul_fp(ZZ, ZZ, Z);
-    mul_fp(out->Y, out->Y, ZZ);                     /* Y = Y/Z^3 */
-
-    vec_select(out->Z, out->Z, BLS12_381_G1.Z, sizeof(BLS12_381_G1.Z),
-               inf);                                /* Z = inf ? 0 : 1 */
-}
-
-void blst_sk_to_pk_in_g1(POINTonE1 *out, const pow256 SK)
-{   POINTonE1_sign(out, &BLS12_381_G1, SK);   }
-
-void blst_sign_pk_in_g2(POINTonE1 *out, const POINTonE1 *msg, const pow256 SK)
-{   POINTonE1_sign(out, msg, SK);   }
-
-void blst_sk_to_pk2_in_g1(unsigned char out[96], POINTonE1_affine *PK,
-                          const pow256 SK)
-{
-    POINTonE1 P[1];
-
-    POINTonE1_sign(P, &BLS12_381_G1, SK);
-    if (PK != NULL)
-        vec_copy(PK, P, sizeof(*PK));
-    if (out != NULL) {
-        limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P);
-        out[0] |= (sgn0_pty & 2) << 4;  /* pre-decorate */
-        out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6;
-    }
-}
-
-void blst_sign_pk2_in_g2(unsigned char out[96], POINTonE1_affine *sig,
-                         const POINTonE1 *hash, const pow256 SK)
-{
-    POINTonE1 P[1];
-
-    POINTonE1_sign(P, hash, SK);
-    if (sig != NULL)
-        vec_copy(sig, P, sizeof(*sig));
-    if (out != NULL) {
-        limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P);
-        out[0] |= (sgn0_pty & 2) << 4;  /* pre-decorate */
-        out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6;
-    }
-}
-
-void blst_p1_mult(POINTonE1 *out, const POINTonE1 *a,
-                  const byte *scalar, size_t nbits)
-{
-    if (nbits < 176) {
-        if (nbits)
-            POINTonE1_mult_w4(out, a, scalar, nbits);
-        else
-            vec_zero(out, sizeof(*out));
-    } else if (nbits <= 256) {
-        union { vec256 l; pow256 s; } val;
-        size_t i, j, top, mask = (size_t)0 - 1;
-
-        /* this is not about constant-time-ness, but branch optimization */
-        for (top = (nbits + 7)/8, i=0, j=0; i<sizeof(val.s);) {
-            val.s[i++] = scalar[j] & mask;
-            mask = 0 - ((i - top) >> (8*sizeof(top)-1));
-            j += 1 & mask;
-        }
-
-        if (check_mod_256(val.s, BLS12_381_r))  /* z^4 is the formal limit */
-            POINTonE1_mult_glv(out, a, val.s);
-        else    /* should never be the case, added for formal completeness */
-            POINTonE1_mult_w5(out, a, scalar, nbits);
-
-        vec_zero(val.l, sizeof(val));
-    } else {    /* should never be the case, added for formal completeness */
-        POINTonE1_mult_w5(out, a, scalar, nbits);
-    }
-}
-
-void blst_p1_unchecked_mult(POINTonE1 *out, const POINTonE1 *a,
-                            const byte *scalar, size_t nbits)
-{
-    if (nbits)
-        POINTonE1_mult_w4(out, a, scalar, nbits);
-    else
-        vec_zero(out, sizeof(*out));
-}
-
-int blst_p1_affine_is_equal(const POINTonE1_affine *a,
-                            const POINTonE1_affine *b)
-{   return (int)vec_is_equal(a, b, sizeof(*a));   }
-
-int blst_p1_is_inf(const POINTonE1 *p)
-{   return (int)vec_is_zero(p->Z, sizeof(p->Z));   }
-
-const POINTonE1 *blst_p1_generator(void)
-{   return &BLS12_381_G1;   }
-
-int blst_p1_affine_is_inf(const POINTonE1_affine *p)
-{   return (int)vec_is_zero(p, sizeof(*p));   }
-
-const POINTonE1_affine *blst_p1_affine_generator(void)
-{   return (const POINTonE1_affine *)&BLS12_381_G1;   }
-
-size_t blst_p1_sizeof(void)
-{   return sizeof(POINTonE1);   }
-
-size_t blst_p1_affine_sizeof(void)
-{   return sizeof(POINTonE1_affine);   }
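
A closing note on the GLV path above: beta is a primitive cube root of unity in Fp, so (x, y) -> (beta*x, y) is an endomorphism that acts on the order-r subgroup as multiplication by an eigenvalue lambda with lambda^2 + lambda + 1 == 0 (mod r). With z = -0xd201000000010000 and r = z^4 - z^2 + 1 (see consts.c above), lambda can be taken to be z^2 - 1, a quick check of which is:

    (z^2 - 1)^2 + (z^2 - 1) + 1 = z^4 - 2z^2 + 1 + z^2 - 1 + 1
                                = z^4 - z^2 + 1 = r == 0 (mod r)

This is why POINTonE1_mult_glv divides the scalar by z^2 (div_by_zz) into two roughly 128-bit components and multiplies them simultaneously against the precomputed table and its beta-twisted copy: about half the doublings of a single 256-bit ladder, consistent with the quoted ~45% speedup over POINTonE1_mult_w5.

diff --git a/crypto/blst_src/e2.c b/crypto/blst_src/e2.c
deleted file mode 100644
index 77f8064bce2..00000000000
--- a/crypto/blst_src/e2.c
+++ /dev/null
@@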
-1,638 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "point.h" -#include "fields.h" -#include "errors.h" - -/* - * y^2 = x^3 + B - */ -static const vec384x B_E2 = { /* 4 + 4*i */ - { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), - TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), - TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) }, - { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), - TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), - TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) } -}; - -const POINTonE2 BLS12_381_G2 = { /* generator point [in Montgomery] */ -{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 - b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ - { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), - TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), - TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, - /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a - b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ - { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), - TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), - TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } -}, -{ /* (0x0ce5d527727d6e118cc9cdc6da2e351aadfd9baa8cbdd3a7 - 6d429a695160d12c923ac9cc3baca289e193548608b82801 << 384) % P */ - { TO_LIMB_T(0x4c730af860494c4a), TO_LIMB_T(0x597cfa1f5e369c5a), - TO_LIMB_T(0xe7e6856caa0a635a), TO_LIMB_T(0xbbefb5e96e0d495f), - TO_LIMB_T(0x07d3a975f0ef25a2), TO_LIMB_T(0x0083fd8e7e80dae5) }, - /* (0x0606c4a02ea734cc32acd2b02bc28b99cb3e287e85a763af - 267492ab572e99ab3f370d275cec1da1aaa9075ff05f79be << 384) % P */ - { TO_LIMB_T(0xadc0fc92df64b05d), TO_LIMB_T(0x18aa270a2b1461dc), - TO_LIMB_T(0x86adac6a3be4eba0), TO_LIMB_T(0x79495c4ec93da33a), - TO_LIMB_T(0xe7175850a43ccaed), TO_LIMB_T(0x0b2bc2a163de1bf2) }, -}, -{ { ONE_MONT_P }, { 0 } } -}; - -const POINTonE2 BLS12_381_NEG_G2 = { /* negative generator [in Montgomery] */ -{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 - b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ - { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), - TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), - TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, - /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a - b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ - { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), - TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), - TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } -}, -{ /* (0x0d1b3cc2c7027888be51d9ef691d77bcb679afda66c73f17 - f9ee3837a55024f78c71363275a75d75d86bab79f74782aa << 384) % P */ - { TO_LIMB_T(0x6d8bf5079fb65e61), TO_LIMB_T(0xc52f05df531d63a5), - TO_LIMB_T(0x7f4a4d344ca692c9), TO_LIMB_T(0xa887959b8577c95f), - TO_LIMB_T(0x4347fe40525c8734), TO_LIMB_T(0x197d145bbaff0bb5) }, - /* (0x13fa4d4a0ad8b1ce186ed5061789213d993923066dddaf10 - 40bc3ff59f825c78df74f2d75467e25e0f55f8a00fa030ed << 384) % P */ - { TO_LIMB_T(0x0c3e036d209afa4e), TO_LIMB_T(0x0601d8f4863f9e23), - TO_LIMB_T(0xe0832636bacc0a84), TO_LIMB_T(0xeb2def362a476f84), - TO_LIMB_T(0x64044f659f0ee1e9), TO_LIMB_T(0x0ed54f48d5a1caa7) } -}, -{ { ONE_MONT_P }, { 0 } } -}; - -static void mul_by_b_onE2(vec384x out, const vec384x in) -{ 
- sub_fp(out[0], in[0], in[1]); - add_fp(out[1], in[0], in[1]); - lshift_fp(out[0], out[0], 2); - lshift_fp(out[1], out[1], 2); -} - -static void mul_by_4b_onE2(vec384x out, const vec384x in) -{ - sub_fp(out[0], in[0], in[1]); - add_fp(out[1], in[0], in[1]); - lshift_fp(out[0], out[0], 4); - lshift_fp(out[1], out[1], 4); -} - -static void POINTonE2_cneg(POINTonE2 *p, bool_t cbit) -{ cneg_fp2(p->Y, p->Y, cbit); } - -void blst_p2_cneg(POINTonE2 *a, int cbit) -{ POINTonE2_cneg(a, is_zero(cbit) ^ 1); } - -static void POINTonE2_from_Jacobian(POINTonE2 *out, const POINTonE2 *in) -{ - vec384x Z, ZZ; - limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); - - reciprocal_fp2(Z, in->Z); /* 1/Z */ - - sqr_fp2(ZZ, Z); - mul_fp2(out->X, in->X, ZZ); /* X = X/Z^2 */ - - mul_fp2(ZZ, ZZ, Z); - mul_fp2(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ - - vec_select(out->Z, in->Z, BLS12_381_G2.Z, - sizeof(BLS12_381_G2.Z), inf); /* Z = inf ? 0 : 1 */ -} - -void blst_p2_from_jacobian(POINTonE2 *out, const POINTonE2 *a) -{ POINTonE2_from_Jacobian(out, a); } - -static void POINTonE2_to_affine(POINTonE2_affine *out, const POINTonE2 *in) -{ - POINTonE2 p; - - if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { - POINTonE2_from_Jacobian(&p, in); - in = &p; - } - vec_copy(out, in, sizeof(*out)); -} - -void blst_p2_to_affine(POINTonE2_affine *out, const POINTonE2 *a) -{ POINTonE2_to_affine(out, a); } - -void blst_p2_from_affine(POINTonE2 *out, const POINTonE2_affine *a) -{ - vec_copy(out, a, sizeof(*a)); - vec_select(out->Z, a->X, BLS12_381_Rx.p2, sizeof(out->Z), - vec_is_zero(a, sizeof(*a))); -} - -static bool_t POINTonE2_affine_on_curve(const POINTonE2_affine *p) -{ - vec384x XXX, YY; - - sqr_fp2(XXX, p->X); - mul_fp2(XXX, XXX, p->X); /* X^3 */ - add_fp2(XXX, XXX, B_E2); /* X^3 + B */ - - sqr_fp2(YY, p->Y); /* Y^2 */ - - return vec_is_equal(XXX, YY, sizeof(XXX)); -} - -int blst_p2_affine_on_curve(const POINTonE2_affine *p) -{ return (int)(POINTonE2_affine_on_curve(p) | vec_is_zero(p, sizeof(*p))); } - -static bool_t POINTonE2_on_curve(const POINTonE2 *p) -{ - vec384x XXX, YY, BZ6; - limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); - - sqr_fp2(BZ6, p->Z); - mul_fp2(BZ6, BZ6, p->Z); - sqr_fp2(XXX, BZ6); /* Z^6 */ - mul_by_b_onE2(BZ6, XXX); /* B*Z^6 */ - - sqr_fp2(XXX, p->X); - mul_fp2(XXX, XXX, p->X); /* X^3 */ - add_fp2(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ - - sqr_fp2(YY, p->Y); /* Y^2 */ - - return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; -} - -int blst_p2_on_curve(const POINTonE2 *p) -{ return (int)POINTonE2_on_curve(p); } - -static limb_t POINTonE2_affine_Serialize_BE(unsigned char out[192], - const POINTonE2_affine *in) -{ - vec384x temp; - - from_fp(temp[1], in->X[1]); - be_bytes_from_limbs(out, temp[1], sizeof(temp[1])); - from_fp(temp[0], in->X[0]); - be_bytes_from_limbs(out + 48, temp[0], sizeof(temp[0])); - - from_fp(temp[1], in->Y[1]); - be_bytes_from_limbs(out + 96, temp[1], sizeof(temp[1])); - from_fp(temp[0], in->Y[0]); - be_bytes_from_limbs(out + 144, temp[0], sizeof(temp[0])); - - return sgn0_pty_mod_384x(temp, BLS12_381_P); -} - -void blst_p2_affine_serialize(unsigned char out[192], - const POINTonE2_affine *in) -{ - if (vec_is_zero(in->X, 2*sizeof(in->X))) { - bytes_zero(out, 192); - out[0] = 0x40; /* infinity bit */ - } else { - (void)POINTonE2_affine_Serialize_BE(out, in); - } -} - -static limb_t POINTonE2_Serialize_BE(unsigned char out[192], - const POINTonE2 *in) -{ - POINTonE2 p; - - if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { - POINTonE2_from_Jacobian(&p, in); - in = &p; - } - - return 
POINTonE2_affine_Serialize_BE(out, (const POINTonE2_affine *)in); -} - -static void POINTonE2_Serialize(unsigned char out[192], const POINTonE2 *in) -{ - if (vec_is_zero(in->Z, sizeof(in->Z))) { - bytes_zero(out, 192); - out[0] = 0x40; /* infinity bit */ - } else { - (void)POINTonE2_Serialize_BE(out, in); - } -} - -void blst_p2_serialize(unsigned char out[192], const POINTonE2 *in) -{ POINTonE2_Serialize(out, in); } - -static limb_t POINTonE2_affine_Compress_BE(unsigned char out[96], - const POINTonE2_affine *in) -{ - vec384 temp; - - from_fp(temp, in->X[1]); - be_bytes_from_limbs(out, temp, sizeof(temp)); - from_fp(temp, in->X[0]); - be_bytes_from_limbs(out + 48, temp, sizeof(temp)); - - return sgn0_pty_mont_384x(in->Y, BLS12_381_P, p0); -} - -void blst_p2_affine_compress(unsigned char out[96], const POINTonE2_affine *in) -{ - if (vec_is_zero(in->X, 2*sizeof(in->X))) { - bytes_zero(out, 96); - out[0] = 0xc0; /* compressed and infinity bits */ - } else { - limb_t sign = POINTonE2_affine_Compress_BE(out, in); - out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); - } -} - -static limb_t POINTonE2_Compress_BE(unsigned char out[96], - const POINTonE2 *in) -{ - POINTonE2 p; - - if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { - POINTonE2_from_Jacobian(&p, in); - in = &p; - } - - return POINTonE2_affine_Compress_BE(out, (const POINTonE2_affine *)in); -} - -void blst_p2_compress(unsigned char out[96], const POINTonE2 *in) -{ - if (vec_is_zero(in->Z, sizeof(in->Z))) { - bytes_zero(out, 96); - out[0] = 0xc0; /* compressed and infinity bits */ - } else { - limb_t sign = POINTonE2_Compress_BE(out, in); - out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); - } -} - -static limb_t POINTonE2_Uncompress_BE(POINTonE2_affine *out, - const unsigned char in[96]) -{ - POINTonE2_affine ret; - vec384 temp; - - limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1])); - limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0])); - - /* clear top 3 bits in case caller was conveying some information there */ - ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; - add_fp(temp, ret.X[1], ZERO_384); /* less than modulus? */ - if (!vec_is_equal(temp, ret.X[1], sizeof(temp))) - return (limb_t)0 - BLST_BAD_ENCODING; - - add_fp(temp, ret.X[0], ZERO_384); /* less than modulus? 
*/ - if (!vec_is_equal(temp, ret.X[0], sizeof(temp))) - return (limb_t)0 - BLST_BAD_ENCODING; - - mul_fp(ret.X[0], ret.X[0], BLS12_381_RR); - mul_fp(ret.X[1], ret.X[1], BLS12_381_RR); - - sqr_fp2(ret.Y, ret.X); - mul_fp2(ret.Y, ret.Y, ret.X); - add_fp2(ret.Y, ret.Y, B_E2); /* X^3 + B */ - if (!sqrt_fp2(ret.Y, ret.Y)) - return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; - - vec_copy(out, &ret, sizeof(ret)); - - return sgn0_pty_mont_384x(out->Y, BLS12_381_P, p0); -} - -static BLST_ERROR POINTonE2_Uncompress_Z(POINTonE2_affine *out, - const unsigned char in[96]) -{ - unsigned char in0 = in[0]; - limb_t sgn0_pty; - - if ((in0 & 0x80) == 0) /* compressed bit */ - return BLST_BAD_ENCODING; - - if (in0 & 0x40) { /* infinity bit */ - if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) { - vec_zero(out, sizeof(*out)); - return BLST_SUCCESS; - } else { - return BLST_BAD_ENCODING; - } - } - - sgn0_pty = POINTonE2_Uncompress_BE(out, in); - - if (sgn0_pty > 3) - return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ - - sgn0_pty >>= 1; /* skip over parity bit */ - sgn0_pty ^= (in0 & 0x20) >> 5; - cneg_fp2(out->Y, out->Y, sgn0_pty); - - return BLST_SUCCESS; -} - -BLST_ERROR blst_p2_uncompress(POINTonE2_affine *out, const unsigned char in[96]) -{ return POINTonE2_Uncompress_Z(out, in); } - -static BLST_ERROR POINTonE2_Deserialize_BE(POINTonE2_affine *out, - const unsigned char in[192]) -{ - POINTonE2_affine ret; - vec384 temp; - - limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1])); - limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0])); - limbs_from_be_bytes(ret.Y[1], in + 96, sizeof(ret.Y[1])); - limbs_from_be_bytes(ret.Y[0], in + 144, sizeof(ret.Y[0])); - - /* clear top 3 bits in case caller was conveying some information there */ - ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; - add_fp(temp, ret.X[1], ZERO_384); /* less than modulus? */ - if (!vec_is_equal(temp, ret.X[1], sizeof(temp))) - return BLST_BAD_ENCODING; - - add_fp(temp, ret.X[0], ZERO_384); /* less than modulus? */ - if (!vec_is_equal(temp, ret.X[0], sizeof(temp))) - return BLST_BAD_ENCODING; - - add_fp(temp, ret.Y[1], ZERO_384); /* less than modulus? */ - if (!vec_is_equal(temp, ret.Y[1], sizeof(temp))) - return BLST_BAD_ENCODING; - - add_fp(temp, ret.Y[0], ZERO_384); /* less than modulus? 
*/ - if (!vec_is_equal(temp, ret.Y[0], sizeof(temp))) - return BLST_BAD_ENCODING; - - mul_fp(ret.X[0], ret.X[0], BLS12_381_RR); - mul_fp(ret.X[1], ret.X[1], BLS12_381_RR); - mul_fp(ret.Y[0], ret.Y[0], BLS12_381_RR); - mul_fp(ret.Y[1], ret.Y[1], BLS12_381_RR); - - if (!POINTonE2_affine_on_curve(&ret)) - return BLST_POINT_NOT_ON_CURVE; - - vec_copy(out, &ret, sizeof(ret)); - - return BLST_SUCCESS; -} - -static BLST_ERROR POINTonE2_Deserialize_Z(POINTonE2_affine *out, - const unsigned char in[192]) -{ - unsigned char in0 = in[0]; - - if ((in0 & 0xe0) == 0) - return POINTonE2_Deserialize_BE(out, in); - - if (in0 & 0x80) /* compressed bit */ - return POINTonE2_Uncompress_Z(out, in); - - if (in0 & 0x40) { /* infinity bit */ - if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 191)) { - vec_zero(out, sizeof(*out)); - return BLST_SUCCESS; - } - } - - return BLST_BAD_ENCODING; -} - -BLST_ERROR blst_p2_deserialize(POINTonE2_affine *out, - const unsigned char in[192]) -{ return POINTonE2_Deserialize_Z(out, in); } - -#include "ec_ops.h" -POINT_DADD_IMPL(POINTonE2, 384x, fp2) -POINT_DADD_AFFINE_IMPL_A0(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) -POINT_ADD_IMPL(POINTonE2, 384x, fp2) -POINT_ADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) -POINT_DOUBLE_IMPL_A0(POINTonE2, 384x, fp2) -POINT_IS_EQUAL_IMPL(POINTonE2, 384x, fp2) - -void blst_p2_add(POINTonE2 *out, const POINTonE2 *a, const POINTonE2 *b) -{ POINTonE2_add(out, a, b); } - -void blst_p2_add_or_double(POINTonE2 *out, const POINTonE2 *a, - const POINTonE2 *b) -{ POINTonE2_dadd(out, a, b, NULL); } - -void blst_p2_add_affine(POINTonE2 *out, const POINTonE2 *a, - const POINTonE2_affine *b) -{ POINTonE2_add_affine(out, a, b); } - -void blst_p2_add_or_double_affine(POINTonE2 *out, const POINTonE2 *a, - const POINTonE2_affine *b) -{ POINTonE2_dadd_affine(out, a, b); } - -void blst_p2_double(POINTonE2 *out, const POINTonE2 *a) -{ POINTonE2_double(out, a); } - -int blst_p2_is_equal(const POINTonE2 *a, const POINTonE2 *b) -{ return (int)POINTonE2_is_equal(a, b); } - -#include "ec_mult.h" -POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 4) -POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 5) - -#ifdef __BLST_PRIVATE_TESTMODE__ -POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE2) - -DECLARE_PRIVATE_POINTXZ(POINTonE2, 384x) -POINT_LADDER_PRE_IMPL(POINTonE2, 384x, fp2) -POINT_LADDER_STEP_IMPL_A0(POINTonE2, 384x, fp2, onE2) -POINT_LADDER_POST_IMPL_A0(POINTonE2, 384x, fp2, onE2) -POINT_MULT_SCALAR_LADDER_IMPL(POINTonE2) -#endif - -static void psi(POINTonE2 *out, const POINTonE2 *in) -{ - static const vec384x frobenius_x = { /* 1/(1 + i)^((P-1)/3) */ - { 0 }, - { /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 - 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaad << 384) % P */ - TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), - TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), - TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) } - }; - static const vec384x frobenius_y = { /* 1/(1 + i)^((P-1)/2) */ - { /* (0x135203e60180a68ee2e9c448d77a2cd91c3dedd930b1cf60 - ef396489f61eb45e304466cf3e67fa0af1ee7b04121bdea2 << 384) % P */ - TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), - TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), - TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, - { /* (0x06af0e0437ff400b6831e36d6bd17ffe48395dabc2d3435e - 77f76e17009241c5ee67992f72ec05f4c81084fbede3cc09 << 384) % P */ - TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), - TO_LIMB_T(0x2f088dd86b4ebef1), 
TO_LIMB_T(0xd1ca2087da74d4a7),
-      TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) },
-    };
-
-    vec_copy(out, in, sizeof(*out));
-    cneg_fp(out->X[1], out->X[1], 1);   mul_fp2(out->X, out->X, frobenius_x);
-    cneg_fp(out->Y[1], out->Y[1], 1);   mul_fp2(out->Y, out->Y, frobenius_y);
-    cneg_fp(out->Z[1], out->Z[1], 1);
-}
-
-/* Galbraith-Lin-Scott, ~67% faster than POINTonE2_mult_w5 */
-static void POINTonE2_mult_gls(POINTonE2 *out, const POINTonE2 *in,
-                               const pow256 SK)
-{
-    union { vec256 l; pow256 s; } val;
-
-    /* break down SK to "digits" with |z| as radix [in constant time] */
-
-    limbs_from_le_bytes(val.l, SK, 32);
-    div_by_zz(val.l);
-    div_by_z(val.l);
-    div_by_z(val.l + NLIMBS(256)/2);
-    le_bytes_from_limbs(val.s, val.l, 32);
-
-    {
-        const byte *scalars[2] = { val.s, NULL };
-        POINTonE2 table[4][1<<(5-1)];   /* 18KB */
-        size_t i;
-
-        POINTonE2_precompute_w5(table[0], in);
-        for (i = 0; i < 1<<(5-1); i++) {
-            psi(&table[1][i], &table[0][i]);
-            psi(&table[2][i], &table[1][i]);
-            psi(&table[3][i], &table[2][i]);
-            POINTonE2_cneg(&table[1][i], 1); /* account for z being negative */
-            POINTonE2_cneg(&table[3][i], 1);
-        }
-
-        POINTonE2s_mult_w5(out, NULL, 4, scalars, 64, table);
-    }
-
-    vec_zero(val.l, sizeof(val));   /* scrub the copy of SK */
-}
-
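`POINTonE2_mult_gls` above goes a step further than the G1 GLV path: `psi` is an order-4 endomorphism, so the scalar is broken into four roughly 64-bit digits in radix |z| (the BLS12-381 curve parameter; z itself is negative, which the two conditional negations in the table setup account for), and all four digit multiplications share one 64-bit window loop. A toy sketch of the radix decomposition on plain integers follows; the radix is a small stand-in, not the real z, and this is ordinary division rather than the constant-time limb arithmetic of `div_by_z`/`div_by_zz`:

```c
#include <stdio.h>
#include <stdint.h>

/* Write a scalar k in radix z, so that conceptually
 * k*P = d0*P + d1*(z*P) + d2*(z^2*P) + d3*(z^3*P),
 * where z*P is cheap via the psi endomorphism. */
int main(void)
{
    uint64_t z = 0xd201;                /* illustrative radix only */
    uint64_t k = 0x123456789abcdefULL;  /* illustrative scalar */
    uint64_t d[4], t = k;
    int i;

    for (i = 0; i < 4; i++) {           /* four base-z digits, each < z */
        d[i] = t % z;
        t /= z;
    }

    /* reconstruct: d0 + z*(d1 + z*(d2 + z*d3)) == k */
    t = ((d[3]*z + d[2])*z + d[1])*z + d[0];
    printf("digits %llu %llu %llu %llu, reconstruction %s\n",
           (unsigned long long)d[0], (unsigned long long)d[1],
           (unsigned long long)d[2], (unsigned long long)d[3],
           t == k ? "ok" : "mismatch");
    return 0;
}
```

The win is the same as for GLV, compounded: the shared window loop runs over 64-bit digits instead of one 255-bit scalar, so the doubling work is divided by four.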
-static void POINTonE2_sign(POINTonE2 *out, const POINTonE2 *in, const pow256 SK)
-{
-    vec384x Z, ZZ;
-    limb_t inf;
-
-    POINTonE2_mult_gls(out, in, SK);
-
-    /* convert to affine to remove possible bias in out->Z */
-    inf = vec_is_zero(out->Z, sizeof(out->Z));
-#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-    flt_reciprocal_fp2(Z, out->Z);                          /* 1/Z */
-#else
-    reciprocal_fp2(Z, out->Z);                              /* 1/Z */
-#endif
-
-    sqr_fp2(ZZ, Z);
-    mul_fp2(out->X, out->X, ZZ);                            /* X = X/Z^2 */
-
-    mul_fp2(ZZ, ZZ, Z);
-    mul_fp2(out->Y, out->Y, ZZ);                            /* Y = Y/Z^3 */
-
-    vec_select(out->Z, out->Z, BLS12_381_G2.Z, sizeof(BLS12_381_G2.Z),
-               inf);                                        /* Z = inf ? 0 : 1 */
-}
-
-void blst_sk_to_pk_in_g2(POINTonE2 *out, const pow256 SK)
-{   POINTonE2_sign(out, &BLS12_381_G2, SK);   }
-
-void blst_sign_pk_in_g1(POINTonE2 *out, const POINTonE2 *msg, const pow256 SK)
-{   POINTonE2_sign(out, msg, SK);   }
-
-void blst_sk_to_pk2_in_g2(unsigned char out[192], POINTonE2_affine *PK,
-                          const pow256 SK)
-{
-    POINTonE2 P[1];
-
-    POINTonE2_sign(P, &BLS12_381_G2, SK);
-    if (PK != NULL)
-        vec_copy(PK, P, sizeof(*PK));
-    if (out != NULL) {
-        limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P);
-        out[0] |= (sgn0_pty & 2) << 4;      /* pre-decorate */
-        out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6;
-    }
-}
-
-void blst_sign_pk2_in_g1(unsigned char out[192], POINTonE2_affine *sig,
-                         const POINTonE2 *hash, const pow256 SK)
-{
-    POINTonE2 P[1];
-
-    POINTonE2_sign(P, hash, SK);
-    if (sig != NULL)
-        vec_copy(sig, P, sizeof(*sig));
-    if (out != NULL) {
-        limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P);
-        out[0] |= (sgn0_pty & 2) << 4;      /* pre-decorate */
-        out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6;
-    }
-}
-
-void blst_p2_mult(POINTonE2 *out, const POINTonE2 *a,
-                  const byte *scalar, size_t nbits)
-{
-    if (nbits < 144) {
-        if (nbits)
-            POINTonE2_mult_w4(out, a, scalar, nbits);
-        else
-            vec_zero(out, sizeof(*out));
-    } else if (nbits <= 256) {
-        union { vec256 l; pow256 s; } val;
-        size_t i, j, top, mask = (size_t)0 - 1;
-
-        /* this is not about constant-time-ness, but branch optimization */
-        for (top = (nbits + 7)/8, i=0, j=0; i<sizeof(val.s);) {
-            val.s[i++] = scalar[j] & mask;
-            mask = (size_t)0 - ((i - top) >> (8*sizeof(top)-1));
-            j += 1 & mask;
-        }
-
-        if (check_mod_256(val.s, BLS12_381_r))  /* z^4 is the formal limit */
-            POINTonE2_mult_gls(out, a, val.s);
-        else    /* should never be the case, added for formal completeness */
-            POINTonE2_mult_w5(out, a, scalar, nbits);
-
-        vec_zero(val.l, sizeof(val));
-    } else {    /* should never be the case, added for formal completeness */
-        POINTonE2_mult_w5(out, a, scalar, nbits);
-    }
-}
-
-void blst_p2_unchecked_mult(POINTonE2 *out, const POINTonE2 *a,
-                            const byte *scalar, size_t nbits)
-{
-    if (nbits)
-        POINTonE2_mult_w4(out, a, scalar, nbits);
-    else
-        vec_zero(out, sizeof(*out));
-}
-
-int blst_p2_affine_is_equal(const POINTonE2_affine *a,
-                            const POINTonE2_affine *b)
-{   return (int)vec_is_equal(a, b, sizeof(*a));   }
-
-int blst_p2_is_inf(const POINTonE2 *p)
-{   return (int)vec_is_zero(p->Z, sizeof(p->Z));   }
-
-const POINTonE2 *blst_p2_generator(void)
-{   return &BLS12_381_G2;   }
-
-int blst_p2_affine_is_inf(const POINTonE2_affine *p)
-{   return (int)vec_is_zero(p, sizeof(*p));   }
-
-const POINTonE2_affine *blst_p2_affine_generator(void)
-{   return (const POINTonE2_affine *)&BLS12_381_G2;   }
-
-size_t blst_p2_sizeof(void)
-{   return sizeof(POINTonE2);   }
-
-size_t blst_p2_affine_sizeof(void)
-{   return sizeof(POINTonE2_affine);   }
diff --git a/crypto/blst_src/ec_mult.h b/crypto/blst_src/ec_mult.h
deleted file mode 100644
index 3c23489570c..00000000000
--- a/crypto/blst_src/ec_mult.h
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- * Copyright Supranational LLC
- * Licensed under the Apache License, Version 2.0, see LICENSE for details.
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef __BLS12_381_ASM_EC_MULT_H__
-#define __BLS12_381_ASM_EC_MULT_H__
-
-#include "point.h"
-
-/* Works up to 9 bits */
-static limb_t get_wval(const byte *d, size_t off, size_t bits)
-{
-    size_t top = off + bits - 1;
-    limb_t ret;
-
-    ret = ((limb_t)d[top / 8] << 8) | d[off / 8];
-
-    return ret >> (off%8);
-}
-
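`get_wval` above extracts a `(window+1)`-bit slice of the scalar that may straddle a byte boundary: it splices the byte holding the slice's top bit onto the byte holding its bottom bit and shifts the slice down. A worked instance with made-up values:

```c
#include <stdio.h>
#include <stdint.h>

typedef uint8_t  byte;
typedef uint64_t limb_t;

/* same shape as the deleted get_wval: works for windows up to 9 bits */
static limb_t get_wval(const byte *d, size_t off, size_t bits)
{
    size_t top = off + bits - 1;
    limb_t ret = ((limb_t)d[top / 8] << 8) | d[off / 8];
    return ret >> (off % 8);
}

int main(void)
{
    /* little-endian scalar 0x01b4 = 0b1_1011_0100 */
    byte scalar[2] = { 0xb4, 0x01 };

    /* a 6-bit window at bit offset 5 straddles both bytes:
     * bits 5..10 of 0x01b4 are 0b001101 = 0x0d */
    printf("window = 0x%02x\n",
           (unsigned)(get_wval(scalar, 5, 6) & 0x3f));
    return 0;
}
```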
-/* Works up to 25 bits. */
-static limb_t get_wval_limb(const byte *d, size_t off, size_t bits)
-{
-    size_t i, top = (off + bits - 1)/8;
-    limb_t ret, mask = (limb_t)0 - 1;
-
-    d += off/8;
-    top -= off/8-1;
-
-    /* this is not about constant-time-ness, but branch optimization */
-    for (ret=0, i=0; i<4;) {
-        ret |= (*d & mask) << (8*i);
-        mask = (limb_t)0 - ((++i - top) >> (8*sizeof(top)-1));
-        d += 1 & mask;
-    }
-
-    return ret >> (off%8);
-}
-
-/*
- * Window value encoding that utilizes the fact that -P is trivially
- * calculated, which allows to halve the size of pre-computed table,
- * is attributed to A. D. Booth, hence the name of the subroutines...
- */
-static limb_t booth_encode(limb_t wval, size_t sz)
-{
-    limb_t mask = 0 - (wval >> sz);     /* "sign" bit -> mask */
-    launder(mask);
-
-    wval = (wval + 1) >> 1;
-    wval = (wval ^ mask) - mask;
-
-    /* &0x1f, but <=0x10, is index in table, rest is extended "sign" bit */
-    return wval;
-}
-
-/*
- * Key feature of these constant-time subroutines is that they tolerate
- * zeros in most significant bit positions of the scalar[s], or in other
- * words, zero-padded scalar values. This means that one can and should
- * pass order's bit-length, which is customarily publicly known, instead
- * of the factual scalars' bit-lengths. This is facilitated by point
- * addition subroutines implemented to handle points at infinity, which
- * are encoded as Z==0. [Doubling algorithms handle such points at
- * infinity "naturally," since resulting Z is product of original Z.]
- */
-#define POINT_MULT_SCALAR_WX_IMPL(ptype, SZ) \
-static void ptype##_gather_booth_w##SZ(ptype *restrict p, \
-                                       const ptype table[1<<(SZ-1)], \
-                                       limb_t booth_idx) \
-{ \
-    size_t i; \
-    bool_t booth_sign = (booth_idx >> SZ) & 1; \
-\
-    booth_idx &= (1<<SZ) - 1; \
-    vec_zero(p, sizeof(ptype));     /* implicit infinity at table[-1] */\
-    for (i = 1; i <= 1<<(SZ-1); i++) \
-        ptype##_ccopy(p, table + i - 1, byte_is_zero((byte)(booth_idx ^ i))); \
-    ptype##_cneg(p, booth_sign); \
-} \
-\
-static void ptype##_precompute_w##SZ(ptype row[], const ptype *point) \
-{ \
-    size_t i, j; \
-                                        /* row[-1] is implicit infinity */\
-    vec_copy(row + 0, point, sizeof(ptype));        /* row[0]=p*1 */\
-    ptype##_double(row + 1, point);                 /* row[1]=p*2 */\
-    for (i = 2, j = 1; i < 1<<(SZ-1); i += 2, j++) \
-        ptype##_add(row + i, row + j, row + j - 1), /* row[2]=p*3 */\
-        ptype##_double(row + i + 1, row + j);       /* row[3]=p*4 */\
-} \
-\
-static void ptype##s_mult_w##SZ(ptype *ret, \
-                                const ptype *points[], size_t npoints, \
-                                const byte *scalars[], size_t bits, \
-                                ptype table[][1<<(SZ-1)]) \
-{ \
-    limb_t wmask, wval; \
-    size_t i, j, window, nbytes; \
-    const byte *scalar, **scalar_s = scalars; \
-    ptype temp[1]; \
-\
-    if (points != NULL) { \
-        const ptype *point = NULL; \
-        const ptype **point_s = points; \
-\
-        for (i = 0; i < npoints; i++) { \
-            point = *point_s ? *point_s++ : point+1; \
-            ptype##_precompute_w##SZ(table[i], point); \
-        } \
-    } \
-\
-    nbytes = (bits + 7)/8; /* convert |bits| to bytes */\
-    scalar = *scalar_s++; \
-\
-    /* top excess bits modulo target window size */\
-    window = bits % SZ; /* yes, it may be zero */\
-    wmask = ((limb_t)1 << (window + 1)) - 1; \
-\
-    bits -= window; \
-    if (bits > 0) \
-        wval = get_wval(scalar, bits - 1, window + 1) & wmask; \
-    else \
-        wval = (scalar[0] << 1) & wmask; \
-\
-    wval = booth_encode(wval, SZ); \
-    ptype##_gather_booth_w##SZ(ret, table[0], wval); \
-\
-    i = 1; \
-    while (bits > 0) { \
-        for (; i < npoints; i++) { \
-            scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \
-            wval = get_wval(scalar, bits - 1, window + 1) & wmask; \
-            wval = booth_encode(wval, SZ); \
-            ptype##_gather_booth_w##SZ(temp, table[i], wval); \
-            ptype##_dadd(ret, ret, temp, NULL); \
-        } \
-\
-        for (j = 0; j < SZ; j++) \
-            ptype##_double(ret, ret); \
-\
-        window = SZ; \
-        wmask = ((limb_t)1 << (window + 1)) - 1; \
-        bits -= window; \
-        i = 0; scalar_s = scalars; \
-    } \
-\
-    for (; i < npoints; i++) { \
-        scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \
-        wval = (scalar[0] << 1) & wmask; \
-        wval = booth_encode(wval, SZ); \
-        ptype##_gather_booth_w##SZ(temp, table[i], wval); \
-        ptype##_dadd(ret, ret, temp, NULL); \
-    } \
-} \
-\
-static void ptype##_mult_w##SZ(ptype *ret, const ptype *point, \
-                               const byte *scalar, size_t bits) \
-{ \
-    limb_t wmask, wval; \
-    size_t j, window; \
-    ptype temp[1]; \
-    ptype table[1<<(SZ-1)]; \
-\
-    ptype##_precompute_w##SZ(table, point); \
-\
-    /* top excess bits modulo target window size */ \
-    window = bits % SZ; /* yes, it may be zero */ \
-    wmask = ((limb_t)1 << (window + 1)) - 1; \
-\
-    bits -= window; \
-    wval = bits ? get_wval(scalar, bits - 1, window + 1) \
-                : (limb_t)scalar[0] << 1; \
-    wval &= wmask; \
-    wval = booth_encode(wval, SZ); \
-    ptype##_gather_booth_w##SZ(ret, table, wval); \
-\
-    while (bits > 0) { \
-        for (j = 0; j < SZ; j++) \
-            ptype##_double(ret, ret); \
-\
-        window = SZ; \
-        wmask = ((limb_t)1 << (window + 1)) - 1; \
-        bits -= window; \
-\
-        wval = bits ? get_wval(scalar, bits - 1, window + 1) \
-                    : (limb_t)scalar[0] << 1; \
-        wval &= wmask; \
-        wval = booth_encode(wval, SZ); \
-        ptype##_gather_booth_w##SZ(temp, table, wval); \
-        if (bits > 0) ptype##_add(ret, ret, temp); \
-        else          ptype##_dadd(ret, ret, temp, NULL); \
-    } \
-}
-
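The Booth recoding above is what halves the precomputed tables: a raw `(SZ+1)`-bit window value `w` becomes a signed digit `(w+1)>>1` in `[-2^(SZ-1), 2^(SZ-1)]`, so only the positive multiples `1..2^(SZ-1)` are stored and `_gather_booth_` negates on the fly. A standalone enumeration of the recoding for the w5 case used above (`launder`, a compiler-optimization barrier in the original, is omitted here):

```c
#include <stdio.h>
#include <stdint.h>

typedef uint64_t limb_t;

/* same recoding as the deleted booth_encode, minus launder() */
static limb_t booth_encode(limb_t wval, size_t sz)
{
    limb_t mask = 0 - (wval >> sz);     /* "sign" bit -> all-ones mask */

    wval = (wval + 1) >> 1;
    wval = (wval ^ mask) - mask;        /* conditional negation */
    return wval;                        /* low bits: index; bit sz: sign */
}

int main(void)
{
    size_t sz = 5;
    limb_t w;

    /* every 6-bit window value maps to a signed digit in [-16, 16]:
     * index = enc & 0x1f, sign = (enc >> sz) & 1 */
    for (w = 0; w < (limb_t)1 << (sz + 1); w++) {
        limb_t enc = booth_encode(w, sz);
        long digit = (long)(enc & 0x1f);
        if ((enc >> sz) & 1)
            digit = -digit;
        printf("w=%2llu -> digit %3ld\n", (unsigned long long)w, digit);
    }
    return 0;
}
```

Note how `w = 63` (all window bits set) recodes to digit 0: the `+1` carry is absorbed by the next, more significant window, which is exactly why consecutive windows overlap by one bit.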
-#if 0
-/* ~50%, or ~2x[!] slower than w5... */
-#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \
-static void ptype##_mult_ladder(ptype *ret, const ptype *p, \
-                                const byte *scalar, size_t bits) \
-{ \
-    ptype sum[1]; \
-    bool_t bit, pbit = 0; \
-\
-    vec_copy(sum, p, sizeof(ptype)); \
-    vec_zero(ret, sizeof(ptype));   /* infinity */ \
-\
-    while (bits--) { \
-        bit = is_bit_set(scalar, bits); \
-        bit ^= pbit; \
-        ptype##_cswap(ret, sum, bit); \
-        ptype##_add(sum, sum, ret); \
-        ptype##_double(ret, ret); \
-        pbit ^= bit; \
-    } \
-    ptype##_cswap(ret, sum, pbit); \
-}
-#else
-/* >40% better performance than above, [and ~30% slower than w5]... */
-#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \
-static void ptype##_mult_ladder(ptype *out, const ptype *p, \
-                                const byte *scalar, size_t bits) \
-{ \
-    ptype##xz sum[1]; \
-    ptype##xz pxz[1]; \
-    ptype##xz ret[1]; \
-    bool_t bit, pbit = 0; \
-\
-    ptype##xz_ladder_pre(pxz, p); \
-    vec_copy(sum, pxz, sizeof(ptype##xz)); \
-    vec_zero(ret, sizeof(ptype##xz));   /* infinity */ \
-\
-    while (bits--) { \
-        bit = is_bit_set(scalar, bits); \
-        bit ^= pbit; \
-        ptype##xz_cswap(ret, sum, bit); \
-        ptype##xz_ladder_step(ret, sum, pxz); \
-        pbit ^= bit; \
-    } \
-    ptype##xz_cswap(ret, sum, pbit); \
-    ptype##xz_ladder_post(out, ret, sum, pxz, p->Y); \
-}
-#endif
-
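Both ladder variants above share an optimization that is easy to miss: instead of a swap and swap-back around every step, they track `pbit`, the previous logical branch, and physically swap the two accumulators only when the current scalar bit differs from the last one. The control flow transplants directly onto modular exponentiation (point add corresponds to multiply, point double to square), which makes a runnable sketch possible without any field code; toy modulus, illustration only, not constant time:

```c
#include <stdio.h>
#include <stdint.h>

#define MOD 1000003ULL  /* illustrative modulus */

static int is_bit_set(const uint8_t *s, size_t n)
{   return (s[n / 8] >> (n % 8)) & 1;   }

static void cswap(uint64_t *a, uint64_t *b, int bit)
{
    uint64_t t = (*a ^ *b) & (0 - (uint64_t)bit);   /* branchless swap */
    *a ^= t;
    *b ^= t;
}

int main(void)
{
    uint8_t scalar[2] = { 0x5b, 0x01 };     /* 0x015b = 347 */
    uint64_t base = 7, ret = 1, sum = base; /* ret ~ infinity, sum ~ P */
    uint64_t chk = 1, b2 = base, e = 347;
    int bit, pbit = 0;
    size_t bits = 16;           /* zero-padded top bits are harmless */

    while (bits--) {
        bit = is_bit_set(scalar, bits);
        bit ^= pbit;                        /* swap only on bit change */
        cswap(&ret, &sum, bit);
        sum = sum * ret % MOD;              /* "add":    sum += ret */
        ret = ret * ret % MOD;              /* "double": ret *= 2   */
        pbit ^= bit;
    }
    cswap(&ret, &sum, pbit);                /* undo any residual swap */

    /* cross-check against plain square-and-multiply */
    while (e) {
        if (e & 1) chk = chk * b2 % MOD;
        b2 = b2 * b2 % MOD;
        e >>= 1;
    }
    printf("ladder %llu, reference %llu (%s)\n",
           (unsigned long long)ret, (unsigned long long)chk,
           ret == chk ? "match" : "mismatch");
    return 0;
}
```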
-/*
- * Sole reason for existence of this implementation is that addition
- * with affine point renders a share of multiplications redundant by
- * virtue of Z==1. And since pre-defined generator point can be and
- * customarily is instantiated affine, it would be hardly appropriate
- * to pass on this opportunity. Though while it's faster than the
- * generic ladder implementation, by ~25%, it's not faster than XZ one
- * above, <15% slower. Just in case, it's faster than generic ladder
- * even if one accounts for prior conversion to affine coordinates,
- * so that choice [for resource-constrained case] is actually between
- * this plus said conversion and XZ ladder...
- *
- * To summarize, if ptype##_mult_w5 executed in one unit of time, then
- * - naive ptype##_mult_ladder would execute in ~2;
- * - XZ version above - in ~1.4;
- * - ptype##_affine_mult_ladder below - in ~1.65;
- * - [small-footprint ptype##_to_affine would run in ~0.18].
- *
- * Caveat lector, |p_affine|*(order+2) produces wrong result, because
- * addition doesn't handle doubling. Indeed, P*(order+1) is P and it
- * fails to add with itself producing infinity in last addition. But
- * as long as |scalar| is reduced modulo order, as it should be, it's
- * not a problem...
- */
-#define POINT_AFFINE_MULT_SCALAR_IMPL(ptype) \
-static void ptype##_affine_mult_ladder(ptype *ret, \
-                                       const ptype##_affine *p_affine, \
-                                       const byte *scalar, size_t bits) \
-{ \
-    ptype sum[1]; \
-    bool_t bit; \
-\
-    vec_zero(ret, sizeof(ptype));   /* infinity */ \
-\
-    while (bits--) { \
-        ptype##_double(ret, ret); \
-        ptype##_add_affine(sum, ret, p_affine); \
-        bit = (scalar[bits / LIMB_T_BITS] >> (bits % LIMB_T_BITS)) & 1; \
-        ptype##_ccopy(ret, sum, bit); \
-    } \
-}
-#endif
diff --git a/crypto/blst_src/ec_ops.h b/crypto/blst_src/ec_ops.h
deleted file mode 100644
index 0d531f816e2..00000000000
--- a/crypto/blst_src/ec_ops.h
+++ /dev/null
@@ -1,787 +0,0 @@
-/*
- * Copyright Supranational LLC
- * Licensed under the Apache License, Version 2.0, see LICENSE for details.
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef __BLS12_384_ASM_EC_OPS_H__
-#define __BLS12_384_ASM_EC_OPS_H__
-/*
- * Addition that can handle doubling [as well as points at infinity,
- * which are encoded as Z==0] in constant time. It naturally comes at
- * cost, but this subroutine should be called only when independent
- * points are processed, which is considered reasonable compromise.
- * For example, ptype##s_mult_w5 calls it, but since *major* gain is
- * result of pure doublings being effectively divided by amount of
- * points, slightly slower addition can be tolerated. But what is the
- * additional cost more specifically? Best addition result is 11M+5S,
- * while this routine takes 13M+5S (+1M+1S if a4!=0), as per
- *
- * -------------+-------------
- * addition     | doubling
- * -------------+-------------
- * U1 = X1*Z2^2 | U1 = X1
- * U2 = X2*Z1^2 |
- * S1 = Y1*Z2^3 | S1 = Y1
- * S2 = Y2*Z1^3 |
- * zz = Z1*Z2   | zz = Z1
- * H = U2-U1    | H' = 2*Y1
- * R = S2-S1    | R' = 3*X1^2[+a*Z1^4]
- * sx = U1+U2   | sx = X1+X1
- * -------------+-------------
- * H!=0 || R!=0 | H==0 && R==0
- *
- * X3 = R^2-H^2*sx
- * Y3 = R*(H^2*U1-X3)-H^3*S1
- * Z3 = H*zz
- *
- * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is
- * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. 
- */ -#define POINT_DADD_IMPL(ptype, bits, field) \ -static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \ - const vec##bits a4) \ -{ \ - ptype p3; /* starts as (U1, S1, zz) from addition side */\ - struct { vec##bits H, R, sx; } add, dbl; \ - bool_t p1inf, p2inf, is_dbl; \ -\ - add_##field(dbl.sx, p1->X, p1->X); /* sx = X1+X1 */\ - sqr_##field(dbl.R, p1->X); /* X1^2 */\ - mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X1^2 */\ - add_##field(dbl.H, p1->Y, p1->Y); /* H = 2*Y1 */\ -\ - p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ - sqr_##field(p3.X, p2->Z); /* Z2^2 */\ - mul_##field(p3.Z, p1->Z, p2->Z); /* Z1*Z2 */\ - p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ - sqr_##field(add.H, p1->Z); /* Z1^2 */\ -\ - if (a4 != NULL) { \ - sqr_##field(p3.Y, add.H); /* Z1^4, [borrow p3.Y] */\ - mul_##field(p3.Y, p3.Y, a4); \ - add_##field(dbl.R, dbl.R, p3.Y);/* R = 3*X1^2+a*Z1^4 */\ - } \ -\ - mul_##field(p3.Y, p1->Y, p2->Z); \ - mul_##field(p3.Y, p3.Y, p3.X); /* S1 = Y1*Z2^3 */\ - mul_##field(add.R, p2->Y, p1->Z); \ - mul_##field(add.R, add.R, add.H); /* S2 = Y2*Z1^3 */\ - sub_##field(add.R, add.R, p3.Y); /* R = S2-S1 */\ -\ - mul_##field(p3.X, p3.X, p1->X); /* U1 = X1*Z2^2 */\ - mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ -\ - add_##field(add.sx, add.H, p3.X); /* sx = U1+U2 */\ - sub_##field(add.H, add.H, p3.X); /* H = U2-U1 */\ -\ - /* make the choice between addition and doubling */\ - is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ - vec_select(&p3, p1, &p3, sizeof(p3), is_dbl); \ - vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ - /* |p3| and |add| hold all inputs now, |p3| will hold output */\ -\ - mul_##field(p3.Z, p3.Z, add.H); /* Z3 = H*Z1*Z2 */\ -\ - sqr_##field(dbl.H, add.H); /* H^2 */\ - mul_##field(dbl.R, dbl.H, add.H); /* H^3 */\ - mul_##field(dbl.R, dbl.R, p3.Y); /* H^3*S1 */\ - mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ -\ - mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ - sqr_##field(p3.X, add.R); /* R^2 */\ - sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ -\ - sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ - mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ - sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ -\ - vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ - vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ -} - -/* - * Addition with affine point that can handle doubling [as well as - * points at infinity, with |p1| being encoded as Z==0 and |p2| as - * X,Y==0] in constant time. But at what additional cost? Best - * addition result is 7M+4S, while this routine takes 8M+5S, as per - * - * -------------+------------- - * addition | doubling - * -------------+------------- - * U1 = X1 | U1 = X2 - * U2 = X2*Z1^2 | - * S1 = Y1 | S1 = Y2 - * S2 = Y2*Z1^3 | - * H = U2-X1 | H' = 2*Y2 - * R = S2-Y1 | R' = 3*X2^2[+a] - * sx = X1+U2 | sx = X2+X2 - * zz = H*Z1 | zz = H' - * -------------+------------- - * H!=0 || R!=0 | H==0 && R==0 - * - * X3 = R^2-H^2*sx - * Y3 = R*(H^2*U1-X3)-H^3*S1 - * Z3 = zz - * - * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is - * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. 
- */ -#define POINT_DADD_AFFINE_IMPL_A0(ptype, bits, field, one) \ -static void ptype##_dadd_affine(ptype *out, const ptype *p1, \ - const ptype##_affine *p2) \ -{ \ - ptype p3; /* starts as (,, H*Z1) from addition side */\ - struct { vec##bits H, R, sx; } add, dbl; \ - bool_t p1inf, p2inf, is_dbl; \ -\ - p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ - add_##field(dbl.sx, p2->X, p2->X); /* sx = X2+X2 */\ - sqr_##field(dbl.R, p2->X); /* X2^2 */\ - mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X2^2 */\ - add_##field(dbl.H, p2->Y, p2->Y); /* H = 2*Y2 */\ -\ - p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ - sqr_##field(add.H, p1->Z); /* Z1^2 */\ - mul_##field(add.R, add.H, p1->Z); /* Z1^3 */\ - mul_##field(add.R, add.R, p2->Y); /* S2 = Y2*Z1^3 */\ - sub_##field(add.R, add.R, p1->Y); /* R = S2-Y1 */\ -\ - mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ -\ - add_##field(add.sx, add.H, p1->X); /* sx = X1+U2 */\ - sub_##field(add.H, add.H, p1->X); /* H = U2-X1 */\ -\ - mul_##field(p3.Z, add.H, p1->Z); /* Z3 = H*Z1 */\ -\ - /* make the choice between addition and doubling */ \ - is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ - vec_select(p3.X, p2, p1, 2*sizeof(p3.X), is_dbl); \ - vec_select(p3.Z, dbl.H, p3.Z, sizeof(p3.Z), is_dbl);\ - vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ - /* |p3| and |add| hold all inputs now, |p3| will hold output */\ -\ - sqr_##field(dbl.H, add.H); /* H^2 */\ - mul_##field(dbl.R, dbl.H, add.H); /* H^3 */\ - mul_##field(dbl.R, dbl.R, p3.Y); /* H^3*S1 */\ - mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ -\ - mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ - sqr_##field(p3.X, add.R); /* R^2 */\ - sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ -\ - sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ - mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ - sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ -\ - vec_select(p3.X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ - vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \ - vec_select(out, p1, &p3, sizeof(ptype), p2inf); \ -} - -/* - * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl - * with twist to handle either input at infinity, which are encoded as Z==0. 
- */ -#define POINT_ADD_IMPL(ptype, bits, field) \ -static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2) \ -{ \ - ptype p3; \ - vec##bits Z1Z1, Z2Z2, U1, S1, H, I, J; \ - bool_t p1inf, p2inf; \ -\ - p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ - sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ -\ - mul_##field(p3.Z, Z1Z1, p1->Z); /* Z1*Z1Z1 */\ - mul_##field(p3.Z, p3.Z, p2->Y); /* S2 = Y2*Z1*Z1Z1 */\ -\ - p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ - sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ -\ - mul_##field(S1, Z2Z2, p2->Z); /* Z2*Z2Z2 */\ - mul_##field(S1, S1, p1->Y); /* S1 = Y1*Z2*Z2Z2 */\ -\ - sub_##field(p3.Z, p3.Z, S1); /* S2-S1 */\ - add_##field(p3.Z, p3.Z, p3.Z); /* r = 2*(S2-S1) */\ -\ - mul_##field(U1, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ - mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ -\ - sub_##field(H, H, U1); /* H = U2-U1 */\ -\ - add_##field(I, H, H); /* 2*H */\ - sqr_##field(I, I); /* I = (2*H)^2 */\ -\ - mul_##field(J, H, I); /* J = H*I */\ - mul_##field(S1, S1, J); /* S1*J */\ -\ - mul_##field(p3.Y, U1, I); /* V = U1*I */\ -\ - sqr_##field(p3.X, p3.Z); /* r^2 */\ - sub_##field(p3.X, p3.X, J); /* r^2-J */\ - sub_##field(p3.X, p3.X, p3.Y); \ - sub_##field(p3.X, p3.X, p3.Y); /* X3 = r^2-J-2*V */\ -\ - sub_##field(p3.Y, p3.Y, p3.X); /* V-X3 */\ - mul_##field(p3.Y, p3.Y, p3.Z); /* r*(V-X3) */\ - sub_##field(p3.Y, p3.Y, S1); \ - sub_##field(p3.Y, p3.Y, S1); /* Y3 = r*(V-X3)-2*S1*J */\ -\ - add_##field(p3.Z, p1->Z, p2->Z); /* Z1+Z2 */\ - sqr_##field(p3.Z, p3.Z); /* (Z1+Z2)^2 */\ - sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+Z2)^2-Z1Z1 */\ - sub_##field(p3.Z, p3.Z, Z2Z2); /* (Z1+Z2)^2-Z1Z1-Z2Z2 */\ - mul_##field(p3.Z, p3.Z, H); /* Z3 = ((Z1+Z2)^2-Z1Z1-Z2Z2)*H */\ -\ - vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ - vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ -} - -/* - * https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl - * with twist to handle either input at infinity, with |p1| encoded as Z==0, - * and |p2| as X==Y==0. 
- */ -#define POINT_ADD_AFFINE_IMPL(ptype, bits, field, one) \ -static void ptype##_add_affine(ptype *out, const ptype *p1, \ - const ptype##_affine *p2) \ -{ \ - ptype p3; \ - vec##bits Z1Z1, H, HH, I, J; \ - bool_t p1inf, p2inf; \ -\ - p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ -\ - sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ -\ - mul_##field(p3.Z, Z1Z1, p1->Z); /* Z1*Z1Z1 */\ - mul_##field(p3.Z, p3.Z, p2->Y); /* S2 = Y2*Z1*Z1Z1 */\ -\ - p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ -\ - mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ - sub_##field(H, H, p1->X); /* H = U2-X1 */\ -\ - sqr_##field(HH, H); /* HH = H^2 */\ - add_##field(I, HH, HH); \ - add_##field(I, I, I); /* I = 4*HH */\ -\ - mul_##field(p3.Y, p1->X, I); /* V = X1*I */\ - mul_##field(J, H, I); /* J = H*I */\ - mul_##field(I, J, p1->Y); /* Y1*J */\ -\ - sub_##field(p3.Z, p3.Z, p1->Y); /* S2-Y1 */\ - add_##field(p3.Z, p3.Z, p3.Z); /* r = 2*(S2-Y1) */\ -\ - sqr_##field(p3.X, p3.Z); /* r^2 */\ - sub_##field(p3.X, p3.X, J); /* r^2-J */\ - sub_##field(p3.X, p3.X, p3.Y); \ - sub_##field(p3.X, p3.X, p3.Y); /* X3 = r^2-J-2*V */\ -\ - sub_##field(p3.Y, p3.Y, p3.X); /* V-X3 */\ - mul_##field(p3.Y, p3.Y, p3.Z); /* r*(V-X3) */\ - sub_##field(p3.Y, p3.Y, I); \ - sub_##field(p3.Y, p3.Y, I); /* Y3 = r*(V-X3)-2*Y1*J */\ -\ - add_##field(p3.Z, p1->Z, H); /* Z1+H */\ - sqr_##field(p3.Z, p3.Z); /* (Z1+H)^2 */\ - sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+H)^2-Z1Z1 */\ - sub_##field(p3.Z, p3.Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */\ -\ - vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \ - vec_select(p3.X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ - vec_select(out, p1, &p3, sizeof(ptype), p2inf); \ -} - -/* - * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l - */ -#define POINT_DOUBLE_IMPL_A0(ptype, bits, field) \ -static void ptype##_double(ptype *p3, const ptype *p1) \ -{ \ - vec##bits A, B, C; \ -\ - sqr_##field(A, p1->X); /* A = X1^2 */\ - sqr_##field(B, p1->Y); /* B = Y1^2 */\ - sqr_##field(C, B); /* C = B^2 */\ -\ - add_##field(B, B, p1->X); /* X1+B */\ - sqr_##field(B, B); /* (X1+B)^2 */\ - sub_##field(B, B, A); /* (X1+B)^2-A */\ - sub_##field(B, B, C); /* (X1+B)^2-A-C */\ - add_##field(B, B, B); /* D = 2*((X1+B)^2-A-C) */\ -\ - mul_by_3_##field(A, A); /* E = 3*A */\ -\ - sqr_##field(p3->X, A); /* F = E^2 */\ - sub_##field(p3->X, p3->X, B); \ - sub_##field(p3->X, p3->X, B); /* X3 = F-2*D */\ -\ - add_##field(p3->Z, p1->Z, p1->Z); /* 2*Z1 */\ - mul_##field(p3->Z, p3->Z, p1->Y); /* Z3 = 2*Z1*Y1 */\ -\ - mul_by_8_##field(C, C); /* 8*C */\ - sub_##field(p3->Y, B, p3->X); /* D-X3 */\ - mul_##field(p3->Y, p3->Y, A); /* E*(D-X3) */\ - sub_##field(p3->Y, p3->Y, C); /* Y3 = E*(D-X3)-8*C */\ -} - -#define POINT_LADDER_PRE_IMPL(ptype, bits, field) \ -static void ptype##xz_ladder_pre(ptype##xz *pxz, const ptype *p) \ -{ \ - mul_##field(pxz->X, p->X, p->Z); /* X2 = X1*Z1 */\ - sqr_##field(pxz->Z, p->Z); \ - mul_##field(pxz->Z, pxz->Z, p->Z); /* Z2 = Z1^3 */\ -} - -/* - * https://hyperelliptic.org/EFD/g1p/auto-shortw-xz.html#ladder-ladd-2002-it-3 - * with twist to handle either input at infinity, which are encoded as Z==0. - * Just in case, order of doubling and addition is reverse in comparison to - * hyperelliptic.org entry. This was done to minimize temporary storage. - * - * XZ1 is |p|, XZ2&XZ4 are in&out |r|, XZ3&XZ5 are in&out |s|. 
- */ -#define POINT_LADDER_STEP_IMPL_A0(ptype, bits, field, suffix4b) \ -static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \ - const ptype##xz *p) \ -{ \ - ptype##xz p5; \ - vec##bits A, B, C, D, XX, ZZ; \ - bool_t r_inf, s_inf; \ - /* s += r */\ - mul_##field(A, r->X, s->X); /* A = X2*X3 */\ - mul_##field(B, r->Z, s->Z); /* B = Z2*Z3 */\ - mul_##field(C, r->X, s->Z); /* C = X2*Z3 */\ - mul_##field(D, r->Z, s->X); /* D = X3*Z2 */\ -\ - sqr_##field(A, A); /* (A[-a*B])^2 */\ - add_##field(p5.X, C, D); /* C+D */\ - mul_##field(p5.X, p5.X, B); /* B*(C+D) */\ - mul_by_4b_##suffix4b(B, p5.X); /* b4*B*(C+D) */\ - sub_##field(p5.X, A, B); /* (A[-a*B])^2-b4*B*(C+D) */\ - mul_##field(p5.X, p5.X, p->Z); /* X5 = Z1*((A[-a*B])^2-b4*B*(C+D)) */\ -\ - sub_##field(p5.Z, C, D); /* C-D */\ - sqr_##field(p5.Z, p5.Z); /* (C-D)^2 */\ - mul_##field(p5.Z, p5.Z, p->X); /* Z5 = X1*(C-D)^2 */\ -\ - r_inf = vec_is_zero(r->Z, sizeof(r->Z)); \ - s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ -\ - vec_select(&p5, r, &p5, sizeof(ptype##xz), s_inf); \ - vec_select(s, s, &p5, sizeof(ptype##xz), r_inf); \ - /* r *= 2 */\ - sqr_##field(XX, r->X); /* XX = X2^2 */\ - sqr_##field(ZZ, r->Z); /* ZZ = Z2^2 */\ -\ - add_##field(r->Z, r->X, r->Z); /* X2+Z2 */\ - sqr_##field(r->Z, r->Z); /* (X2+Z2)^2 */\ - sub_##field(r->Z, r->Z, XX); /* (X2+Z2)^2-XX */\ - sub_##field(r->Z, r->Z, ZZ); /* E = (X2+Z2)^2-XX-ZZ */\ -\ - sqr_##field(A, XX); /* (XX[-a*ZZ])^2 */\ - mul_##field(B, r->Z, ZZ); /* E*ZZ */\ - mul_by_4b_##suffix4b(C, B); /* b4*E*ZZ */\ - sub_##field(r->X, A, C); /* X4 = (XX[-a*ZZ])^2-b4*E*ZZ */\ -\ - sqr_##field(ZZ, ZZ); /* ZZ^2 */\ - mul_by_4b_##suffix4b(B, ZZ); /* b4*ZZ^2 */\ - mul_##field(r->Z, r->Z, XX); /* E*(XX[+a*ZZ]) */\ - add_##field(r->Z, r->Z, r->Z); /* 2*E*(XX[+a*ZZ]) */\ - add_##field(r->Z, r->Z, B); /* Z4 = 2*E*(XX[+a*ZZ])+b4*ZZ^2 */\ -} - -/* - * Recover the |r|'s y-coordinate using Eq. (8) from Brier-Joye, - * "Weierstraß Elliptic Curves and Side-Channel Attacks", with XZ twist - * and conversion to Jacobian coordinates from /.../ecp_smpl.c, - * and with twist to recover from |s| at infinity [which occurs when - * multiplying by (order-1)]. - * - * X4 = 2*Y1*X2*Z3*Z1*Z2 - * Y4 = 2*b*Z3*(Z1*Z2)^2 + Z3*(a*Z1*Z2+X1*X2)*(X1*Z2+X2*Z1) - X3*(X1*Z2-X2*Z1)^2 - * Z4 = 2*Y1*Z3*Z2^2*Z1 - * - * Z3x2 = 2*Z3 - * Y1Z3x2 = Y1*Z3x2 - * Z1Z2 = Z1*Z2 - * X1Z2 = X1*Z2 - * X2Z1 = X2*Z1 - * X4 = Y1Z3x2*X2*Z1Z2 - * A = b*Z3x2*(Z1Z2)^2 - * B = Z3*(a*Z1Z2+X1*X2)*(X1Z2+X2Z1) - * C = X3*(X1Z2-X2Z1)^2 - * Y4 = A+B-C - * Z4 = Y1Z3x2*Z1Z2*Z2 - * - * XZ1 is |p|, XZ2 is |r|, XZ3 is |s|, 'a' is 0. 
- */ -#define POINT_LADDER_POST_IMPL_A0(ptype, bits, field, suffixb) \ -static void ptype##xz_ladder_post(ptype *p4, \ - const ptype##xz *r, const ptype##xz *s, \ - const ptype##xz *p, const vec##bits Y1) \ -{ \ - vec##bits Z3x2, Y1Z3x2, Z1Z2, X1Z2, X2Z1, A, B, C; \ - bool_t s_inf; \ -\ - add_##field(Z3x2, s->Z, s->Z); /* Z3x2 = 2*Z3 */\ - mul_##field(Y1Z3x2, Y1, Z3x2); /* Y1Z3x2 = Y1*Z3x2 */\ - mul_##field(Z1Z2, p->Z, r->Z); /* Z1Z2 = Z1*Z2 */\ - mul_##field(X1Z2, p->X, r->Z); /* X1Z2 = X1*Z2 */\ - mul_##field(X2Z1, r->X, p->Z); /* X2Z1 = X2*Z1 */\ -\ - mul_##field(p4->X, Y1Z3x2, r->X); /* Y1Z3x2*X2 */\ - mul_##field(p4->X, p4->X, Z1Z2); /* X4 = Y1Z3x2*X2*Z1Z2 */\ -\ - sqr_##field(A, Z1Z2); /* (Z1Z2)^2 */\ - mul_##field(B, A, Z3x2); /* Z3x2*(Z1Z2)^2 */\ - mul_by_b_##suffixb(A, B); /* A = b*Z3x2*(Z1Z2)^2 */\ -\ - mul_##field(B, p->X, r->X); /* [a*Z1Z2+]X1*X2 */\ - mul_##field(B, B, s->Z); /* Z3*([a*Z1Z2+]X1*X2) */\ - add_##field(C, X1Z2, X2Z1); /* X1Z2+X2Z1 */\ - mul_##field(B, B, C); /* B = Z3*([a*Z2Z1+]X1*X2)*(X1Z2+X2Z1) */\ -\ - sub_##field(C, X1Z2, X2Z1); /* X1Z2-X2Z1 */\ - sqr_##field(C, C); /* (X1Z2-X2Z1)^2 */\ - mul_##field(C, C, s->X); /* C = X3*(X1Z2-X2Z1)^2 */\ -\ - add_##field(A, A, B); /* A+B */\ - sub_##field(A, A, C); /* Y4 = A+B-C */\ -\ - mul_##field(p4->Z, Z1Z2, r->Z); /* Z1Z2*Z2 */\ - mul_##field(p4->Z, p4->Z, Y1Z3x2); /* Y1Z3x2*Z1Z2*Z2 */\ -\ - s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ - vec_select(p4->X, p->X, p4->X, sizeof(p4->X), s_inf); \ - vec_select(p4->Y, Y1, A, sizeof(p4->Y), s_inf); \ - vec_select(p4->Z, p->Z, p4->Z, sizeof(p4->Z), s_inf); \ - ptype##_cneg(p4, s_inf); \ - /* to Jacobian */\ - mul_##field(p4->X, p4->X, p4->Z); /* X4 = X4*Z4 */\ - sqr_##field(B, p4->Z); \ - mul_##field(p4->Y, p4->Y, B); /* Y4 = Y4*Z4^2 */\ -} - -#define POINT_IS_EQUAL_IMPL(ptype, bits, field) \ -static limb_t ptype##_is_equal(const ptype *p1, const ptype *p2) \ -{ \ - vec##bits Z1Z1, Z2Z2; \ - ptype##_affine a1, a2; \ - bool_t is_inf1 = vec_is_zero(p1->Z, sizeof(p1->Z)); \ - bool_t is_inf2 = vec_is_zero(p2->Z, sizeof(p2->Z)); \ -\ - sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ - sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ -\ - mul_##field(a1.X, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ - mul_##field(a2.X, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ -\ - mul_##field(a1.Y, p1->Y, p2->Z); /* Y1*Z2 */\ - mul_##field(a2.Y, p2->Y, p1->Z); /* Y2*Z1 */\ -\ - mul_##field(a1.Y, a1.Y, Z2Z2); /* S1 = Y1*Z2*Z2Z2 */\ - mul_##field(a2.Y, a2.Y, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */\ -\ - return vec_is_equal(&a1, &a2, sizeof(a1)) & (is_inf1 ^ is_inf2 ^ 1); \ -} - -/* - * https://eprint.iacr.org/2015/1060, algorithm 7 with a twist to handle - * |p3| pointing at either |p1| or |p2|. This is resolved by adding |t5| - * and replacing few first references to |X3| in the formula, up to step - * 21, with it. 12M[+27A], doubling and infinity are handled by the - * formula itself. Infinity is to be encoded as [0, !0, 0]. - */ -#define POINT_PROJ_DADD_IMPL_A0(ptype, bits, field, suffixb) \ -static void ptype##proj_dadd(ptype##proj *p3, const ptype##proj *p1, \ - const ptype##proj *p2) \ -{ \ - vec##bits t0, t1, t2, t3, t4, t5; \ -\ - mul_##field(t0, p1->X, p2->X); /* 1. t0 = X1*X2 */\ - mul_##field(t1, p1->Y, p2->Y); /* 2. t1 = Y1*Y2 */\ - mul_##field(t2, p1->Z, p2->Z); /* 3. t2 = Z1*Z2 */\ - add_##field(t3, p1->X, p1->Y); /* 4. t3 = X1+Y1 */\ - add_##field(t4, p2->X, p2->Y); /* 5. t4 = X2+Y2 */\ - mul_##field(t3, t3, t4); /* 6. t3 = t3*t4 */\ - add_##field(t4, t0, t1); /* 7. t4 = t0+t1 */\ - sub_##field(t3, t3, t4); /* 8. 
t3 = t3-t4 */\ - add_##field(t4, p1->Y, p1->Z); /* 9. t4 = Y1+Z1 */\ - add_##field(t5, p2->Y, p2->Z); /* 10. t5 = Y2+Z2 */\ - mul_##field(t4, t4, t5); /* 11. t4 = t4*t5 */\ - add_##field(t5, t1, t2); /* 12. t5 = t1+t2 */\ - sub_##field(t4, t4, t5); /* 13. t4 = t4-t5 */\ - add_##field(t5, p1->X, p1->Z); /* 14. t5 = X1+Z1 */\ - add_##field(p3->Y, p2->X, p2->Z); /* 15. Y3 = X2+Z2 */\ - mul_##field(t5, t5, p3->Y); /* 16. t5 = t5*Y3 */\ - add_##field(p3->Y, t0, t2); /* 17. Y3 = t0+t2 */\ - sub_##field(p3->Y, t5, p3->Y); /* 18. Y3 = t5-Y3 */\ - mul_by_3_##field(t0, t0); /* 19-20. t0 = 3*t0 */\ - mul_by_3_##field(t5, t2); /* 21. t5 = 3*t2 */\ - mul_by_b_##suffixb(t2, t5); /* 21. t2 = b*t5 */\ - add_##field(p3->Z, t1, t2); /* 22. Z3 = t1+t2 */\ - sub_##field(t1, t1, t2); /* 23. t1 = t1-t2 */\ - mul_by_3_##field(t5, p3->Y); /* 24. t5 = 3*Y3 */\ - mul_by_b_##suffixb(p3->Y, t5); /* 24. Y3 = b*t5 */\ - mul_##field(p3->X, t4, p3->Y); /* 25. X3 = t4*Y3 */\ - mul_##field(t2, t3, t1); /* 26. t2 = t3*t1 */\ - sub_##field(p3->X, t2, p3->X); /* 27. X3 = t2-X3 */\ - mul_##field(p3->Y, p3->Y, t0); /* 28. Y3 = Y3*t0 */\ - mul_##field(t1, t1, p3->Z); /* 29. t1 = t1*Z3 */\ - add_##field(p3->Y, t1, p3->Y); /* 30. Y3 = t1+Y3 */\ - mul_##field(t0, t0, t3); /* 31. t0 = t0*t3 */\ - mul_##field(p3->Z, p3->Z, t4); /* 32. Z3 = Z3*t4 */\ - add_##field(p3->Z, p3->Z, t0); /* 33. Z3 = Z3+t0 */\ -} - -/* - * https://eprint.iacr.org/2015/1060, algorithm 8 with a twist to handle - * |p2| being infinity encoded as [0, 0]. 11M[+21A]. - */ -#define POINT_PROJ_DADD_AFFINE_IMPL_A0(ptype, bits, field, suffixb) \ -static void ptype##proj_dadd_affine(ptype##proj *out, const ptype##proj *p1, \ - const ptype##_affine *p2) \ -{ \ - ptype##proj p3[1]; \ - vec##bits t0, t1, t2, t3, t4; \ - limb_t p2inf = vec_is_zero(p2, sizeof(*p2)); \ -\ - mul_##field(t0, p1->X, p2->X); /* 1. t0 = X1*X2 */\ - mul_##field(t1, p1->Y, p2->Y); /* 2. t1 = Y1*Y2 */\ - add_##field(t3, p1->X, p1->Y); /* 3. t3 = X1+Y1 */\ - add_##field(t4, p2->X, p2->Y); /* 4. t4 = X2+Y2 */\ - mul_##field(t3, t3, t4); /* 5. t3 = t3*t4 */\ - add_##field(t4, t0, t1); /* 6. t4 = t0+t1 */\ - sub_##field(t3, t3, t4); /* 7. t3 = t3-t4 */\ - mul_##field(t4, p2->Y, p1->Z); /* 8. t4 = Y2*Z1 */\ - add_##field(t4, t4, p1->Y); /* 9. t4 = t4+Y1 */\ - mul_##field(p3->Y, p2->X, p1->Z); /* 10. Y3 = X2*Z1 */\ - add_##field(p3->Y, p3->Y, p1->X); /* 11. Y3 = Y3+X1 */\ - mul_by_3_##field(t0, t0); /* 12-13. t0 = 3*t0 */\ - mul_by_b_##suffixb(t2, p1->Z); /* 14. t2 = b*Z1 */\ - mul_by_3_##field(t2, t2); /* 14. t2 = 3*t2 */\ - add_##field(p3->Z, t1, t2); /* 15. Z3 = t1+t2 */\ - sub_##field(t1, t1, t2); /* 16. t1 = t1-t2 */\ - mul_by_b_##suffixb(t2, p3->Y); /* 17. t2 = b*Y3 */\ - mul_by_3_##field(p3->Y, t2); /* 17. Y3 = 3*t2 */\ - mul_##field(p3->X, t4, p3->Y); /* 18. X3 = t4*Y3 */\ - mul_##field(t2, t3, t1); /* 19. t2 = t3*t1 */\ - sub_##field(p3->X, t2, p3->X); /* 20. X3 = t2-X3 */\ - mul_##field(p3->Y, p3->Y, t0); /* 21. Y3 = Y3*t0 */\ - mul_##field(t1, t1, p3->Z); /* 22. t1 = t1*Z3 */\ - add_##field(p3->Y, t1, p3->Y); /* 23. Y3 = t1+Y3 */\ - mul_##field(t0, t0, t3); /* 24. t0 = t0*t3 */\ - mul_##field(p3->Z, p3->Z, t4); /* 25. Z3 = Z3*t4 */\ - add_##field(p3->Z, p3->Z, t0); /* 26. Z3 = Z3+t0 */\ -\ - vec_select(out, p1, p3, sizeof(*out), p2inf); \ -} - -/* - * https://eprint.iacr.org/2015/1060, algorithm 9 with a twist to handle - * |p3| pointing at |p1|. This is resolved by adding |t3| to hold X*Y - * and reordering operations to bring references to |p1| forward. - * 6M+2S[+13A]. 
- */ -#define POINT_PROJ_DOUBLE_IMPL_A0(ptype, bits, field, suffixb) \ -static void ptype##proj_double(ptype##proj *p3, const ptype##proj *p1) \ -{ \ - vec##bits t0, t1, t2, t3; \ -\ - sqr_##field(t0, p1->Y); /* 1. t0 = Y*Y */\ - mul_##field(t1, p1->Y, p1->Z); /* 5. t1 = Y*Z */\ - sqr_##field(t2, p1->Z); /* 6. t2 = Z*Z */\ - mul_##field(t3, p1->X, p1->Y); /* 16. t3 = X*Y */\ - lshift_##field(p3->Z, t0, 3); /* 2-4. Z3 = 8*t0 */\ - mul_by_b_##suffixb(p3->X, t2); /* 7. t2 = b*t2 */\ - mul_by_3_##field(t2, p3->X); /* 7. t2 = 3*t2 */\ - mul_##field(p3->X, t2, p3->Z); /* 8. X3 = t2*Z3 */\ - add_##field(p3->Y, t0, t2); /* 9. Y3 = t0+t2 */\ - mul_##field(p3->Z, t1, p3->Z); /* 10. Z3 = t1*Z3 */\ - mul_by_3_##field(t2, t2); /* 11-12. t2 = 3*t2 */\ - sub_##field(t0, t0, t2); /* 13. t0 = t0-t2 */\ - mul_##field(p3->Y, t0, p3->Y); /* 14. Y3 = t0*Y3 */\ - add_##field(p3->Y, p3->X, p3->Y); /* 15. Y3 = X3+Y3 */\ - mul_##field(p3->X, t0, t3); /* 17. X3 = t0*t3 */\ - add_##field(p3->X, p3->X, p3->X); /* 18. X3 = X3+X3 */\ -} - -#define POINT_PROJ_TO_JACOBIAN_IMPL(ptype, bits, field) \ -static void ptype##proj_to_Jacobian(ptype *out, const ptype##proj *in) \ -{ \ - vec##bits ZZ; \ -\ - sqr_##field(ZZ, in->Z); \ - mul_##field(out->X, in->X, in->Z); \ - mul_##field(out->Y, in->Y, ZZ); \ - vec_copy(out->Z, in->Z, sizeof(out->Z)); \ -} - -#define POINT_TO_PROJECTIVE_IMPL(ptype, bits, field, one) \ -static void ptype##_to_projective(ptype##proj *out, const ptype *in) \ -{ \ - vec##bits ZZ; \ - limb_t is_inf = vec_is_zero(in->Z, sizeof(in->Z)); \ -\ - sqr_##field(ZZ, in->Z); \ - mul_##field(out->X, in->X, in->Z); \ - vec_select(out->Y, one, in->Y, sizeof(out->Y), is_inf); \ - mul_##field(out->Z, ZZ, in->Z); \ -} - -/******************* !!!!! NOT CONSTANT TIME !!!!! *******************/ - -/* - * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-add-2008-s - * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 - * with twist to handle either input at infinity. Addition costs 12M+2S, - * while conditional doubling - 4M+6M+3S. 
- */ -#define POINTXYZZ_DADD_IMPL(ptype, bits, field) \ -static void ptype##xyzz_dadd(ptype##xyzz *p3, const ptype##xyzz *p1, \ - const ptype##xyzz *p2) \ -{ \ - vec##bits U, S, P, R; \ -\ - if (vec_is_zero(p2->ZZZ, 2*sizeof(p2->ZZZ))) { \ - vec_copy(p3, p1, sizeof(*p3)); \ - return; \ - } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \ - vec_copy(p3, p2, sizeof(*p3)); \ - return; \ - } \ -\ - mul_##field(U, p1->X, p2->ZZ); /* U1 = X1*ZZ2 */\ - mul_##field(S, p1->Y, p2->ZZZ); /* S1 = Y1*ZZZ2 */\ - mul_##field(P, p2->X, p1->ZZ); /* U2 = X2*ZZ1 */\ - mul_##field(R, p2->Y, p1->ZZZ); /* S2 = Y2*ZZZ1 */\ - sub_##field(P, P, U); /* P = U2-U1 */\ - sub_##field(R, R, S); /* R = S2-S1 */\ -\ - if (!vec_is_zero(P, sizeof(P))) { /* X1!=X2 */\ - vec##bits PP, PPP, Q; /* add |p1| and |p2| */\ -\ - sqr_##field(PP, P); /* PP = P^2 */\ - mul_##field(PPP, PP, P); /* PPP = P*PP */\ - mul_##field(Q, U, PP); /* Q = U1*PP */\ - sqr_##field(p3->X, R); /* R^2 */\ - add_##field(P, Q, Q); \ - sub_##field(p3->X, p3->X, PPP); /* R^2-PPP */\ - sub_##field(p3->X, p3->X, P); /* X3 = R^2-PPP-2*Q */\ - sub_##field(Q, Q, p3->X); \ - mul_##field(Q, Q, R); /* R*(Q-X3) */\ - mul_##field(p3->Y, S, PPP); /* S1*PPP */\ - sub_##field(p3->Y, Q, p3->Y); /* Y3 = R*(Q-X3)-S1*PPP */\ - mul_##field(p3->ZZ, p1->ZZ, p2->ZZ); /* ZZ1*ZZ2 */\ - mul_##field(p3->ZZZ, p1->ZZZ, p2->ZZZ); /* ZZZ1*ZZZ2 */\ - mul_##field(p3->ZZ, p3->ZZ, PP); /* ZZ3 = ZZ1*ZZ2*PP */\ - mul_##field(p3->ZZZ, p3->ZZZ, PPP); /* ZZZ3 = ZZZ1*ZZZ2*PPP */\ - } else if (vec_is_zero(R, sizeof(R))) { /* X1==X2 && Y1==Y2 */\ - vec##bits V, W, M; /* double |p1| */\ -\ - add_##field(U, p1->Y, p1->Y); /* U = 2*Y1 */\ - sqr_##field(V, U); /* V = U^2 */\ - mul_##field(W, V, U); /* W = U*V */\ - mul_##field(S, p1->X, V); /* S = X1*V */\ - sqr_##field(M, p1->X); \ - mul_by_3_##field(M, M); /* M = 3*X1^2[+a*ZZ1^2] */\ - sqr_##field(p3->X, M); \ - add_##field(U, S, S); /* 2*S */\ - sub_##field(p3->X, p3->X, U); /* X3 = M^2-2*S */\ - mul_##field(p3->Y, W, p1->Y); /* W*Y1 */\ - sub_##field(S, S, p3->X); \ - mul_##field(S, S, M); /* M*(S-X3) */\ - sub_##field(p3->Y, S, p3->Y); /* Y3 = M*(S-X3)-W*Y1 */\ - mul_##field(p3->ZZ, p1->ZZ, V); /* ZZ3 = V*ZZ1 */\ - mul_##field(p3->ZZZ, p1->ZZZ, W); /* ZZ3 = W*ZZZ1 */\ - } else { /* X1==X2 && Y1==-Y2 */\ - vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ)); /* set |p3| to infinity */\ - } \ -} - -/* - * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-madd-2008-s - * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-mdbl-2008-s-1 - * with twists to handle even subtractions and either input at infinity. - * Addition costs 8M+2S, while conditional doubling - 2M+4M+3S. 
- */ -#define POINTXYZZ_DADD_AFFINE_IMPL(ptype, bits, field, one) \ -static void ptype##xyzz_dadd_affine(ptype##xyzz *p3, const ptype##xyzz *p1, \ - const ptype##_affine *p2, \ - bool_t subtract) \ -{ \ - vec##bits P, R; \ -\ - if (vec_is_zero(p2, sizeof(*p2))) { \ - vec_copy(p3, p1, sizeof(*p3)); \ - return; \ - } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \ - vec_copy(p3->X, p2->X, 2*sizeof(p3->X));\ - cneg_##field(p3->ZZZ, one, subtract); \ - vec_copy(p3->ZZ, one, sizeof(p3->ZZ)); \ - return; \ - } \ -\ - mul_##field(P, p2->X, p1->ZZ); /* U2 = X2*ZZ1 */\ - mul_##field(R, p2->Y, p1->ZZZ); /* S2 = Y2*ZZZ1 */\ - cneg_##field(R, R, subtract); \ - sub_##field(P, P, p1->X); /* P = U2-X1 */\ - sub_##field(R, R, p1->Y); /* R = S2-Y1 */\ -\ - if (!vec_is_zero(P, sizeof(P))) { /* X1!=X2 */\ - vec##bits PP, PPP, Q; /* add |p2| to |p1| */\ -\ - sqr_##field(PP, P); /* PP = P^2 */\ - mul_##field(PPP, PP, P); /* PPP = P*PP */\ - mul_##field(Q, p1->X, PP); /* Q = X1*PP */\ - sqr_##field(p3->X, R); /* R^2 */\ - add_##field(P, Q, Q); \ - sub_##field(p3->X, p3->X, PPP); /* R^2-PPP */\ - sub_##field(p3->X, p3->X, P); /* X3 = R^2-PPP-2*Q */\ - sub_##field(Q, Q, p3->X); \ - mul_##field(Q, Q, R); /* R*(Q-X3) */\ - mul_##field(p3->Y, p1->Y, PPP); /* Y1*PPP */\ - sub_##field(p3->Y, Q, p3->Y); /* Y3 = R*(Q-X3)-Y1*PPP */\ - mul_##field(p3->ZZ, p1->ZZ, PP); /* ZZ3 = ZZ1*PP */\ - mul_##field(p3->ZZZ, p1->ZZZ, PPP); /* ZZZ3 = ZZZ1*PPP */\ - } else if (vec_is_zero(R, sizeof(R))) { /* X1==X2 && Y1==Y2 */\ - vec##bits U, S, M; /* double |p2| */\ -\ - add_##field(U, p2->Y, p2->Y); /* U = 2*Y1 */\ - sqr_##field(p3->ZZ, U); /* [ZZ3 =] V = U^2 */\ - mul_##field(p3->ZZZ, p3->ZZ, U); /* [ZZZ3 =] W = U*V */\ - mul_##field(S, p2->X, p3->ZZ); /* S = X1*V */\ - sqr_##field(M, p2->X); \ - mul_by_3_##field(M, M); /* M = 3*X1^2[+a] */\ - sqr_##field(p3->X, M); \ - add_##field(U, S, S); /* 2*S */\ - sub_##field(p3->X, p3->X, U); /* X3 = M^2-2*S */\ - mul_##field(p3->Y, p3->ZZZ, p2->Y); /* W*Y1 */\ - sub_##field(S, S, p3->X); \ - mul_##field(S, S, M); /* M*(S-X3) */\ - sub_##field(p3->Y, S, p3->Y); /* Y3 = M*(S-X3)-W*Y1 */\ - cneg_##field(p3->ZZZ, p3->ZZZ, subtract); \ - } else { /* X1==X2 && Y1==-Y2 */\ - vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ)); /* set |p3| to infinity */\ - } \ -} - -#define POINTXYZZ_TO_JACOBIAN_IMPL(ptype, bits, field) \ -static void ptype##xyzz_to_Jacobian(ptype *out, const ptype##xyzz *in) \ -{ \ - mul_##field(out->X, in->X, in->ZZ); \ - mul_##field(out->Y, in->Y, in->ZZZ); \ - vec_copy(out->Z, in->ZZ, sizeof(out->Z)); \ -} - -#define POINT_TO_XYZZ_IMPL(ptype, bits, field) \ -static void ptype##_to_xyzz(ptype##xyzz *out, const ptype *in) \ -{ \ - vec_copy(out->X, in->X, 2*sizeof(out->X)); \ - sqr_##field(out->ZZ, in->Z); \ - mul_##field(out->ZZZ, out->ZZ, in->Z); \ -} - -#endif diff --git a/crypto/blst_src/errors.h b/crypto/blst_src/errors.h deleted file mode 100644 index 425daeb486f..00000000000 --- a/crypto/blst_src/errors.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
- * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef __BLS12_381_ASM_ERRORS_H__ -#define __BLS12_381_ASM_ERRORS_H__ - -typedef enum { - BLST_SUCCESS = 0, - BLST_BAD_ENCODING, - BLST_POINT_NOT_ON_CURVE, - BLST_POINT_NOT_IN_GROUP, - BLST_AGGR_TYPE_MISMATCH, - BLST_VERIFY_FAIL, - BLST_PK_IS_INFINITY, -} BLST_ERROR; - -#endif diff --git a/crypto/blst_src/exp.c b/crypto/blst_src/exp.c deleted file mode 100644 index 55c5c5a7875..00000000000 --- a/crypto/blst_src/exp.c +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "vect.h" - -/* - * |out| = |inp|^|pow|, small footprint, public exponent - */ -static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow, - size_t pow_bits, const vec384 p, limb_t n0) -{ -#if 1 - vec384 ret; - - vec_copy(ret, inp, sizeof(ret)); /* ret = inp^1 */ - --pow_bits; /* most significant bit is set, skip over */ - while (pow_bits--) { - sqr_mont_384(ret, ret, p, n0); - if (is_bit_set(pow, pow_bits)) - mul_mont_384(ret, ret, inp, p, n0); - } - vec_copy(out, ret, sizeof(ret)); /* out = ret */ -#else - unsigned int i; - vec384 sqr; - - vec_copy(sqr, inp, sizeof(sqr)); - for (i = 0; !is_bit_set(pow, i++);) - sqr_mont_384(sqr, sqr, sqr, p, n0); - vec_copy(out, sqr, sizeof(sqr)); - for (; i < pow_bits; i++) { - sqr_mont_384(sqr, sqr, sqr, p, n0); - if (is_bit_set(pow, i)) - mul_mont_384(out, out, sqr, p, n0); - } -#endif -} - -static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow, - size_t pow_bits, const vec384 p, limb_t n0) -{ - vec384x ret; - - vec_copy(ret, inp, sizeof(ret)); /* |ret| = |inp|^1 */ - --pow_bits; /* most significant bit is accounted for, skip over */ - while (pow_bits--) { - sqr_mont_384x(ret, ret, p, n0); - if (is_bit_set(pow, pow_bits)) - mul_mont_384x(ret, ret, inp, p, n0); - } - vec_copy(out, ret, sizeof(ret)); /* |out| = |ret| */ -} diff --git a/crypto/blst_src/exports.c b/crypto/blst_src/exports.c deleted file mode 100644 index 1ca4d4757fa..00000000000 --- a/crypto/blst_src/exports.c +++ /dev/null @@ -1,583 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ -/* - * Why this file? Overall goal is to ensure that all internal calls - * remain internal after linking application. This is to both - * - * a) minimize possibility of external name conflicts (since all - * non-blst-prefixed and [assembly subroutines] remain static); - * b) preclude possibility of unintentional internal reference - * overload in shared library context (one can achieve same - * effect with -Bsymbolic, but we don't want to rely on end-user - * to remember to use it); - */ - -#include "fields.h" -#include "bytes.h" - -/* - * BLS12-381-specific Fr shortcuts to assembly. 
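- *
- * These wrappers reduce modulo the group order r and keep values in
- * Montgomery form, so e.g. blst_fr_mul(ret, a, b) really computes
- * a*b*2^(-256) mod r. A minimal, compiled-out usage sketch follows;
- * the helper name fr_square_plus_one is hypothetical, chosen here for
- * illustration only:
- */
-#if 0   /* illustration only, not part of the build */
-static void fr_square_plus_one(vec256 ret, const vec256 a_mont)
-{
-    vec256 one;
-
-    blst_fr_sqr(ret, a_mont);       /* a^2, still in Montgomery form */
-    vec_zero(one, sizeof(one));
-    one[0] = 1;
-    blst_fr_to(one, one);           /* map 1 into Montgomery form */
-    blst_fr_add(ret, ret, one);     /* a^2 + 1 mod r */
-}
-#endif
-/*
- * The shortcuts themselves: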
- */ -void blst_fr_add(vec256 ret, const vec256 a, const vec256 b) -{ add_mod_256(ret, a, b, BLS12_381_r); } - -void blst_fr_sub(vec256 ret, const vec256 a, const vec256 b) -{ sub_mod_256(ret, a, b, BLS12_381_r); } - -void blst_fr_mul_by_3(vec256 ret, const vec256 a) -{ mul_by_3_mod_256(ret, a, BLS12_381_r); } - -void blst_fr_lshift(vec256 ret, const vec256 a, size_t count) -{ lshift_mod_256(ret, a, count, BLS12_381_r); } - -void blst_fr_rshift(vec256 ret, const vec256 a, size_t count) -{ rshift_mod_256(ret, a, count, BLS12_381_r); } - -void blst_fr_mul(vec256 ret, const vec256 a, const vec256 b) -{ mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } - -void blst_fr_ct_bfly(vec256 x0, vec256 x1, const vec256 twiddle) -{ - vec256 x2; - - mul_mont_sparse_256(x2, x1, twiddle, BLS12_381_r, r0); - sub_mod_256(x1, x0, x2, BLS12_381_r); - add_mod_256(x0, x0, x2, BLS12_381_r); -} - -void blst_fr_gs_bfly(vec256 x0, vec256 x1, const vec256 twiddle) -{ - vec256 x2; - - sub_mod_256(x2, x0, x1, BLS12_381_r); - add_mod_256(x0, x0, x1, BLS12_381_r); - mul_mont_sparse_256(x1, x2, twiddle, BLS12_381_r, r0); -} - -void blst_fr_sqr(vec256 ret, const vec256 a) -{ sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } - -void blst_fr_cneg(vec256 ret, const vec256 a, int flag) -{ cneg_mod_256(ret, a, is_zero(flag) ^ 1, BLS12_381_r); } - -void blst_fr_to(vec256 ret, const vec256 a) -{ mul_mont_sparse_256(ret, a, BLS12_381_rRR, BLS12_381_r, r0); } - -void blst_fr_from(vec256 ret, const vec256 a) -{ from_mont_256(ret, a, BLS12_381_r, r0); } - -void blst_fr_from_scalar(vec256 ret, const pow256 a) -{ - const union { - long one; - char little; - } is_endian = { 1 }; - - if ((uptr_t)ret == (uptr_t)a && is_endian.little) { - mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR, - BLS12_381_r, r0); - } else { - vec256 out; - limbs_from_le_bytes(out, a, 32); - mul_mont_sparse_256(ret, out, BLS12_381_rRR, BLS12_381_r, r0); - vec_zero(out, sizeof(out)); - } -} - -void blst_scalar_from_fr(pow256 ret, const vec256 a) -{ - const union { - long one; - char little; - } is_endian = { 1 }; - - if ((uptr_t)ret == (uptr_t)a && is_endian.little) { - from_mont_256((limb_t *)ret, a, BLS12_381_r, r0); - } else { - vec256 out; - from_mont_256(out, a, BLS12_381_r, r0); - le_bytes_from_limbs(ret, out, 32); - vec_zero(out, sizeof(out)); - } -} - -int blst_scalar_fr_check(const pow256 a) -{ return (int)(check_mod_256(a, BLS12_381_r) | - bytes_are_zero(a, sizeof(pow256))); -} - -int blst_sk_check(const pow256 a) -{ return (int)check_mod_256(a, BLS12_381_r); } - -int blst_sk_add_n_check(pow256 ret, const pow256 a, const pow256 b) -{ return (int)add_n_check_mod_256(ret, a, b, BLS12_381_r); } - -int blst_sk_sub_n_check(pow256 ret, const pow256 a, const pow256 b) -{ return (int)sub_n_check_mod_256(ret, a, b, BLS12_381_r); } - -int blst_sk_mul_n_check(pow256 ret, const pow256 a, const pow256 b) -{ - vec256 t[2]; - const union { - long one; - char little; - } is_endian = { 1 }; - bool_t is_zero; - - if (((size_t)a|(size_t)b)%sizeof(limb_t) != 0 || !is_endian.little) { - limbs_from_le_bytes(t[0], a, sizeof(pow256)); - limbs_from_le_bytes(t[1], b, sizeof(pow256)); - a = (const byte *)t[0]; - b = (const byte *)t[1]; - } - mul_mont_sparse_256(t[0], BLS12_381_rRR, (const limb_t *)a, BLS12_381_r, r0); - mul_mont_sparse_256(t[0], t[0], (const limb_t *)b, BLS12_381_r, r0); - le_bytes_from_limbs(ret, t[0], sizeof(pow256)); - is_zero = vec_is_zero(t[0], sizeof(vec256)); - vec_zero(t, sizeof(t)); - - return (int)(is_zero^1); -} - -void blst_sk_inverse(pow256 
ret, const pow256 a) -{ - const union { - long one; - char little; - } is_endian = { 1 }; - - if (((size_t)a|(size_t)ret)%sizeof(limb_t) == 0 && is_endian.little) { - limb_t *out = (limb_t *)ret; - mul_mont_sparse_256(out, (const limb_t *)a, BLS12_381_rRR, - BLS12_381_r, r0); - reciprocal_fr(out, out); - from_mont_256(out, out, BLS12_381_r, r0); - } else { - vec256 out; - limbs_from_le_bytes(out, a, 32); - mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0); - reciprocal_fr(out, out); - from_mont_256(out, out, BLS12_381_r, r0); - le_bytes_from_limbs(ret, out, 32); - vec_zero(out, sizeof(out)); - } -} - -/* - * BLS12-381-specific Fp shortcuts to assembly. - */ -void blst_fp_add(vec384 ret, const vec384 a, const vec384 b) -{ add_fp(ret, a, b); } - -void blst_fp_sub(vec384 ret, const vec384 a, const vec384 b) -{ sub_fp(ret, a, b); } - -void blst_fp_mul_by_3(vec384 ret, const vec384 a) -{ mul_by_3_fp(ret, a); } - -void blst_fp_mul_by_8(vec384 ret, const vec384 a) -{ mul_by_8_fp(ret, a); } - -void blst_fp_lshift(vec384 ret, const vec384 a, size_t count) -{ lshift_fp(ret, a, count); } - -void blst_fp_mul(vec384 ret, const vec384 a, const vec384 b) -{ mul_fp(ret, a, b); } - -void blst_fp_sqr(vec384 ret, const vec384 a) -{ sqr_fp(ret, a); } - -void blst_fp_cneg(vec384 ret, const vec384 a, int flag) -{ cneg_fp(ret, a, is_zero(flag) ^ 1); } - -void blst_fp_to(vec384 ret, const vec384 a) -{ mul_fp(ret, a, BLS12_381_RR); } - -void blst_fp_from(vec384 ret, const vec384 a) -{ from_fp(ret, a); } - -/* - * Fp serialization/deserialization. - */ -void blst_fp_from_uint32(vec384 ret, const unsigned int a[12]) -{ - if (sizeof(limb_t) == 8) { - int i; - for (i = 0; i < 6; i++) - ret[i] = a[2*i] | ((limb_t)a[2*i+1] << (32 & (8*sizeof(limb_t)-1))); - a = (const unsigned int *)ret; - } - mul_fp(ret, (const limb_t *)a, BLS12_381_RR); -} - -void blst_uint32_from_fp(unsigned int ret[12], const vec384 a) -{ - if (sizeof(limb_t) == 4) { - from_fp((limb_t *)ret, a); - } else { - vec384 out; - int i; - - from_fp(out, a); - for (i = 0; i < 6; i++) { - limb_t limb = out[i]; - ret[2*i] = (unsigned int)limb; - ret[2*i+1] = (unsigned int)(limb >> (32 & (8*sizeof(limb_t)-1))); - } - } -} - -void blst_fp_from_uint64(vec384 ret, const unsigned long long a[6]) -{ - const union { - long one; - char little; - } is_endian = { 1 }; - - if (sizeof(limb_t) == 4 && !is_endian.little) { - int i; - for (i = 0; i < 6; i++) { - unsigned long long limb = a[i]; - ret[2*i] = (limb_t)limb; - ret[2*i+1] = (limb_t)(limb >> 32); - } - a = (const unsigned long long *)ret; - } - mul_fp(ret, (const limb_t *)a, BLS12_381_RR); -} - -void blst_uint64_from_fp(unsigned long long ret[6], const vec384 a) -{ - const union { - long one; - char little; - } is_endian = { 1 }; - - if (sizeof(limb_t) == 8 || is_endian.little) { - from_fp((limb_t *)ret, a); - } else { - vec384 out; - int i; - - from_fp(out, a); - for (i = 0; i < 6; i++) - ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); - } -} - -void blst_fp_from_bendian(vec384 ret, const unsigned char a[48]) -{ - vec384 out; - - limbs_from_be_bytes(out, a, sizeof(vec384)); - mul_fp(ret, out, BLS12_381_RR); -} - -void blst_bendian_from_fp(unsigned char ret[48], const vec384 a) -{ - vec384 out; - - from_fp(out, a); - be_bytes_from_limbs(ret, out, sizeof(vec384)); -} - -void blst_fp_from_lendian(vec384 ret, const unsigned char a[48]) -{ - vec384 out; - - limbs_from_le_bytes(out, a, sizeof(vec384)); - mul_fp(ret, out, BLS12_381_RR); -} - -void blst_lendian_from_fp(unsigned char ret[48], const 
vec384 a) -{ - vec384 out; - - from_fp(out, a); - le_bytes_from_limbs(ret, out, sizeof(vec384)); -} - -/* - * BLS12-381-specific Fp2 shortcuts to assembly. - */ -void blst_fp2_add(vec384x ret, const vec384x a, const vec384x b) -{ add_fp2(ret, a, b); } - -void blst_fp2_sub(vec384x ret, const vec384x a, const vec384x b) -{ sub_fp2(ret, a, b); } - -void blst_fp2_mul_by_3(vec384x ret, const vec384x a) -{ mul_by_3_fp2(ret, a); } - -void blst_fp2_mul_by_8(vec384x ret, const vec384x a) -{ mul_by_8_fp2(ret, a); } - -void blst_fp2_lshift(vec384x ret, const vec384x a, size_t count) -{ lshift_fp2(ret, a, count); } - -void blst_fp2_mul(vec384x ret, const vec384x a, const vec384x b) -{ mul_fp2(ret, a, b); } - -void blst_fp2_sqr(vec384x ret, const vec384x a) -{ sqr_fp2(ret, a); } - -void blst_fp2_cneg(vec384x ret, const vec384x a, int flag) -{ cneg_fp2(ret, a, is_zero(flag) ^ 1); } - -/* - * Scalar serialization/deserialization. - */ -void blst_scalar_from_uint32(pow256 ret, const unsigned int a[8]) -{ - const union { - long one; - char little; - } is_endian = { 1 }; - size_t i; - - if ((uptr_t)ret==(uptr_t)a && is_endian.little) - return; - - for(i = 0; i < 8; i++) { - unsigned int w = a[i]; - *ret++ = (byte)w; - *ret++ = (byte)(w >> 8); - *ret++ = (byte)(w >> 16); - *ret++ = (byte)(w >> 24); - } -} - -void blst_uint32_from_scalar(unsigned int ret[8], const pow256 a) -{ - const union { - long one; - char little; - } is_endian = { 1 }; - size_t i; - - if ((uptr_t)ret==(uptr_t)a && is_endian.little) - return; - - for(i = 0; i < 8; i++) { - unsigned int w = (unsigned int)(*a++); - w |= (unsigned int)(*a++) << 8; - w |= (unsigned int)(*a++) << 16; - w |= (unsigned int)(*a++) << 24; - ret[i] = w; - } -} - -void blst_scalar_from_uint64(pow256 ret, const unsigned long long a[4]) -{ - const union { - long one; - char little; - } is_endian = { 1 }; - size_t i; - - if ((uptr_t)ret==(uptr_t)a && is_endian.little) - return; - - for(i = 0; i < 4; i++) { - unsigned long long w = a[i]; - *ret++ = (byte)w; - *ret++ = (byte)(w >> 8); - *ret++ = (byte)(w >> 16); - *ret++ = (byte)(w >> 24); - *ret++ = (byte)(w >> 32); - *ret++ = (byte)(w >> 40); - *ret++ = (byte)(w >> 48); - *ret++ = (byte)(w >> 56); - } -} - -void blst_uint64_from_scalar(unsigned long long ret[4], const pow256 a) -{ - const union { - long one; - char little; - } is_endian = { 1 }; - size_t i; - - if ((uptr_t)ret==(uptr_t)a && is_endian.little) - return; - - for(i = 0; i < 4; i++) { - unsigned long long w = (unsigned long long)(*a++); - w |= (unsigned long long)(*a++) << 8; - w |= (unsigned long long)(*a++) << 16; - w |= (unsigned long long)(*a++) << 24; - w |= (unsigned long long)(*a++) << 32; - w |= (unsigned long long)(*a++) << 40; - w |= (unsigned long long)(*a++) << 48; - w |= (unsigned long long)(*a++) << 56; - ret[i] = w; - } -} - -void blst_scalar_from_bendian(pow256 ret, const unsigned char a[32]) -{ - vec256 out; - limbs_from_be_bytes(out, a, sizeof(out)); - le_bytes_from_limbs(ret, out, sizeof(out)); - vec_zero(out, sizeof(out)); -} - -void blst_bendian_from_scalar(unsigned char ret[32], const pow256 a) -{ - vec256 out; - limbs_from_le_bytes(out, a, sizeof(out)); - be_bytes_from_limbs(ret, out, sizeof(out)); - vec_zero(out, sizeof(out)); -} - -void blst_scalar_from_lendian(pow256 ret, const unsigned char a[32]) -{ - size_t i; - - if ((uptr_t)ret==(uptr_t)a) - return; - - for (i = 0; i < 32; i++) - ret[i] = a[i]; -} - -void blst_lendian_from_scalar(unsigned char ret[32], const pow256 a) -{ - size_t i; - - if ((uptr_t)ret==(uptr_t)a) - return; - 
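-    /* pow256 scalars are plain little-endian byte arrays, so emitting
-     * little-endian bytes is a straight copy (and the aliasing check
-     * above can simply return) */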
- for (i = 0; i < 32; i++) - ret[i] = a[i]; -} - -void blst_fr_from_uint64(vec256 ret, const unsigned long long a[4]) -{ - const union { - long one; - char little; - } is_endian = { 1 }; - - if (sizeof(limb_t) == 4 && !is_endian.little) { - int i; - for (i = 0; i < 4; i++) { - unsigned long long limb = a[i]; - ret[2*i] = (limb_t)limb; - ret[2*i+1] = (limb_t)(limb >> 32); - } - a = (const unsigned long long *)ret; - } - mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR, BLS12_381_r, r0); -} - -void blst_uint64_from_fr(unsigned long long ret[4], const vec256 a) -{ - const union { - long one; - char little; - } is_endian = { 1 }; - - if (sizeof(limb_t) == 8 || is_endian.little) { - from_mont_256((limb_t *)ret, a, BLS12_381_r, r0); - } else { - vec256 out; - int i; - - from_mont_256(out, a, BLS12_381_r, r0); - for (i = 0; i < 4; i++) - ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); - vec_zero(out, sizeof(out)); - } -} - -int blst_scalar_from_le_bytes(pow256 out, const unsigned char *bytes, size_t n) -{ - size_t rem = (n - 1) % 32 + 1; - struct { vec256 out, digit; } t; - limb_t ret; - - vec_zero(t.out, sizeof(t.out)); - - n -= rem; - limbs_from_le_bytes(t.out, bytes += n, rem); - mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); - - while (n) { - limbs_from_le_bytes(t.digit, bytes -= 32, 32); - add_mod_256(t.out, t.out, t.digit, BLS12_381_r); - mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); - n -= 32; - } - - from_mont_256(t.out, t.out, BLS12_381_r, r0); - - ret = vec_is_zero(t.out, sizeof(t.out)); - le_bytes_from_limbs(out, t.out, 32); - vec_zero(&t, sizeof(t)); - - return (int)(ret^1); -} - -int blst_scalar_from_be_bytes(pow256 out, const unsigned char *bytes, size_t n) -{ - size_t rem = (n - 1) % 32 + 1; - struct { vec256 out, digit; } t; - limb_t ret; - - vec_zero(t.out, sizeof(t.out)); - - limbs_from_be_bytes(t.out, bytes, rem); - mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); - - while (n -= rem) { - limbs_from_be_bytes(t.digit, bytes += rem, 32); - add_mod_256(t.out, t.out, t.digit, BLS12_381_r); - mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); - rem = 32; - } - - from_mont_256(t.out, t.out, BLS12_381_r, r0); - - ret = vec_is_zero(t.out, sizeof(t.out)); - le_bytes_from_limbs(out, t.out, 32); - vec_zero(&t, sizeof(t)); - - return (int)(ret^1); -} - -/* - * Single-short SHA-256 hash function. - */ -#include "sha256.h" - -void blst_sha256(unsigned char md[32], const void *msg, size_t len) -{ - SHA256_CTX ctx; - - sha256_init(&ctx); - sha256_update(&ctx, msg, len); - sha256_final(md, &ctx); -} - -/* - * Test facilitator. - */ -void blst_scalar_from_hexascii(pow256 ret, const char *hex) -{ bytes_from_hexascii(ret, sizeof(pow256), hex); } - -void blst_fr_from_hexascii(vec256 ret, const char *hex) -{ - limbs_from_hexascii(ret, sizeof(vec256), hex); - mul_mont_sparse_256(ret, ret, BLS12_381_rRR, BLS12_381_r, r0); -} - -void blst_fp_from_hexascii(vec384 ret, const char *hex) -{ - limbs_from_hexascii(ret, sizeof(vec384), hex); - mul_fp(ret, ret, BLS12_381_RR); -} diff --git a/crypto/blst_src/fields.h b/crypto/blst_src/fields.h deleted file mode 100644 index 4b2323d2cce..00000000000 --- a/crypto/blst_src/fields.h +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
- * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef __BLS12_381_ASM_FIELDS_H__ -#define __BLS12_381_ASM_FIELDS_H__ - -#include "vect.h" -#include "consts.h" - -/* - * BLS12-381-specific Fp shortcuts to assembly. - */ -static inline void add_fp(vec384 ret, const vec384 a, const vec384 b) -{ add_mod_384(ret, a, b, BLS12_381_P); } - -static inline void sub_fp(vec384 ret, const vec384 a, const vec384 b) -{ sub_mod_384(ret, a, b, BLS12_381_P); } - -static inline void mul_by_3_fp(vec384 ret, const vec384 a) -{ mul_by_3_mod_384(ret, a, BLS12_381_P); } - -static inline void mul_by_8_fp(vec384 ret, const vec384 a) -{ mul_by_8_mod_384(ret, a, BLS12_381_P); } - -static inline void lshift_fp(vec384 ret, const vec384 a, size_t count) -{ lshift_mod_384(ret, a, count, BLS12_381_P); } - -static inline void rshift_fp(vec384 ret, const vec384 a, size_t count) -{ rshift_mod_384(ret, a, count, BLS12_381_P); } - -static inline void div_by_2_fp(vec384 ret, const vec384 a) -{ div_by_2_mod_384(ret, a, BLS12_381_P); } - -static inline void mul_fp(vec384 ret, const vec384 a, const vec384 b) -{ mul_mont_384(ret, a, b, BLS12_381_P, p0); } - -static inline void sqr_fp(vec384 ret, const vec384 a) -{ sqr_mont_384(ret, a, BLS12_381_P, p0); } - -static inline void cneg_fp(vec384 ret, const vec384 a, bool_t flag) -{ cneg_mod_384(ret, a, flag, BLS12_381_P); } - -static inline void from_fp(vec384 ret, const vec384 a) -{ from_mont_384(ret, a, BLS12_381_P, p0); } - -static inline void redc_fp(vec384 ret, const vec768 a) -{ redc_mont_384(ret, a, BLS12_381_P, p0); } - -/* - * BLS12-381-specific Fp2 shortcuts to assembly. - */ -static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b) -{ add_mod_384x(ret, a, b, BLS12_381_P); } - -static inline void sub_fp2(vec384x ret, const vec384x a, const vec384x b) -{ sub_mod_384x(ret, a, b, BLS12_381_P); } - -static inline void mul_by_3_fp2(vec384x ret, const vec384x a) -{ mul_by_3_mod_384x(ret, a, BLS12_381_P); } - -static inline void mul_by_8_fp2(vec384x ret, const vec384x a) -{ mul_by_8_mod_384x(ret, a, BLS12_381_P); } - -static inline void lshift_fp2(vec384x ret, const vec384x a, size_t count) -{ - lshift_mod_384(ret[0], a[0], count, BLS12_381_P); - lshift_mod_384(ret[1], a[1], count, BLS12_381_P); -} - -static inline void mul_fp2(vec384x ret, const vec384x a, const vec384x b) -{ mul_mont_384x(ret, a, b, BLS12_381_P, p0); } - -static inline void sqr_fp2(vec384x ret, const vec384x a) -{ sqr_mont_384x(ret, a, BLS12_381_P, p0); } - -static inline void cneg_fp2(vec384x ret, const vec384x a, bool_t flag) -{ - cneg_mod_384(ret[0], a[0], flag, BLS12_381_P); - cneg_mod_384(ret[1], a[1], flag, BLS12_381_P); -} - -#define vec_load_global vec_copy - -static void reciprocal_fp(vec384 out, const vec384 inp); -static void flt_reciprocal_fp(vec384 out, const vec384 inp); -static bool_t recip_sqrt_fp(vec384 out, const vec384 inp); -static bool_t sqrt_fp(vec384 out, const vec384 inp); - -static void reciprocal_fp2(vec384x out, const vec384x inp); -static void flt_reciprocal_fp2(vec384x out, const vec384x inp); -static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp, - const vec384x recip_ZZZ, const vec384x magic_ZZZ); -static bool_t sqrt_fp2(vec384x out, const vec384x inp); -static bool_t sqrt_align_fp2(vec384x out, const vec384x ret, - const vec384x sqrt, const vec384x inp); - -typedef vec384x vec384fp2; -typedef vec384fp2 vec384fp6[3]; -typedef vec384fp6 vec384fp12[2]; - -static void sqr_fp12(vec384fp12 ret, const vec384fp12 a); -static void cyclotomic_sqr_fp12(vec384fp12 ret, 
const vec384fp12 a); -static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b); -static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, - const vec384fp6 xy00z0); -static void conjugate_fp12(vec384fp12 a); -static void inverse_fp12(vec384fp12 ret, const vec384fp12 a); -/* caveat lector! |n| has to be non-zero and not more than 3! */ -static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n); - -#define neg_fp(r,a) cneg_fp((r),(a),1) -#define neg_fp2(r,a) cneg_fp2((r),(a),1) - -#endif /* __BLS12_381_ASM_FIELDS_H__ */ diff --git a/crypto/blst_src/fp12_tower.c b/crypto/blst_src/fp12_tower.c deleted file mode 100644 index d6c0b124eb6..00000000000 --- a/crypto/blst_src/fp12_tower.c +++ /dev/null @@ -1,789 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "fields.h" - -/* - * Fp2 = Fp[u] / (u^2 + 1) - * Fp6 = Fp2[v] / (v^3 - u - 1) - * Fp12 = Fp6[w] / (w^2 - v) - */ - -static inline void mul_by_u_plus_1_fp2(vec384x ret, const vec384x a) -{ mul_by_1_plus_i_mod_384x(ret, a, BLS12_381_P); } - -#if 1 && !defined(__BLST_NO_ASM__) -#define __FP2x2__ -/* - * Fp2x2 is a "widened" version of Fp2, which allows to consolidate - * reductions from several multiplications. In other words instead of - * "mul_redc-mul_redc-add" we get "mul-mul-add-redc," where latter - * addition is double-width... To be more specific this gives ~7-10% - * faster pairing depending on platform... - */ -typedef vec768 vec768x[2]; - -static inline void add_fp2x2(vec768x ret, const vec768x a, const vec768x b) -{ - add_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); - add_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); -} - -static inline void sub_fp2x2(vec768x ret, const vec768x a, const vec768x b) -{ - sub_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); - sub_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); -} - -static inline void mul_by_u_plus_1_fp2x2(vec768x ret, const vec768x a) -{ - /* caveat lector! 
|ret| may not be same as |a| */ - sub_mod_384x384(ret[0], a[0], a[1], BLS12_381_P); - add_mod_384x384(ret[1], a[0], a[1], BLS12_381_P); -} - -static inline void redc_fp2x2(vec384x ret, const vec768x a) -{ - redc_mont_384(ret[0], a[0], BLS12_381_P, p0); - redc_mont_384(ret[1], a[1], BLS12_381_P, p0); -} - -static void mul_fp2x2(vec768x ret, const vec384x a, const vec384x b) -{ -#if 1 - mul_382x(ret, a, b, BLS12_381_P); /* +~6% in Miller loop */ -#else - union { vec384 x[2]; vec768 x2; } t; - - add_mod_384(t.x[0], a[0], a[1], BLS12_381_P); - add_mod_384(t.x[1], b[0], b[1], BLS12_381_P); - mul_384(ret[1], t.x[0], t.x[1]); - - mul_384(ret[0], a[0], b[0]); - mul_384(t.x2, a[1], b[1]); - - sub_mod_384x384(ret[1], ret[1], ret[0], BLS12_381_P); - sub_mod_384x384(ret[1], ret[1], t.x2, BLS12_381_P); - - sub_mod_384x384(ret[0], ret[0], t.x2, BLS12_381_P); -#endif -} - -static void sqr_fp2x2(vec768x ret, const vec384x a) -{ -#if 1 - sqr_382x(ret, a, BLS12_381_P); /* +~5% in final exponentiation */ -#else - vec384 t0, t1; - - add_mod_384(t0, a[0], a[1], BLS12_381_P); - sub_mod_384(t1, a[0], a[1], BLS12_381_P); - - mul_384(ret[1], a[0], a[1]); - add_mod_384x384(ret[1], ret[1], ret[1], BLS12_381_P); - - mul_384(ret[0], t0, t1); -#endif -} -#endif /* __FP2x2__ */ - -/* - * Fp6 extension - */ -#if defined(__FP2x2__) /* ~10-13% improvement for mul_fp12 and sqr_fp12 */ -typedef vec768x vec768fp6[3]; - -static inline void sub_fp6x2(vec768fp6 ret, const vec768fp6 a, - const vec768fp6 b) -{ - sub_fp2x2(ret[0], a[0], b[0]); - sub_fp2x2(ret[1], a[1], b[1]); - sub_fp2x2(ret[2], a[2], b[2]); -} - -static void mul_fp6x2(vec768fp6 ret, const vec384fp6 a, const vec384fp6 b) -{ - vec768x t0, t1, t2; - vec384x aa, bb; - - mul_fp2x2(t0, a[0], b[0]); - mul_fp2x2(t1, a[1], b[1]); - mul_fp2x2(t2, a[2], b[2]); - - /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 - = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ - add_fp2(aa, a[1], a[2]); - add_fp2(bb, b[1], b[2]); - mul_fp2x2(ret[0], aa, bb); - sub_fp2x2(ret[0], ret[0], t1); - sub_fp2x2(ret[0], ret[0], t2); - mul_by_u_plus_1_fp2x2(ret[1], ret[0]); /* borrow ret[1] for a moment */ - add_fp2x2(ret[0], ret[1], t0); - - /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) - = a0*b1 + a1*b0 + a2*b2*(u+1) */ - add_fp2(aa, a[0], a[1]); - add_fp2(bb, b[0], b[1]); - mul_fp2x2(ret[1], aa, bb); - sub_fp2x2(ret[1], ret[1], t0); - sub_fp2x2(ret[1], ret[1], t1); - mul_by_u_plus_1_fp2x2(ret[2], t2); /* borrow ret[2] for a moment */ - add_fp2x2(ret[1], ret[1], ret[2]); - - /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 - = a0*b2 + a2*b0 + a1*b1 */ - add_fp2(aa, a[0], a[2]); - add_fp2(bb, b[0], b[2]); - mul_fp2x2(ret[2], aa, bb); - sub_fp2x2(ret[2], ret[2], t0); - sub_fp2x2(ret[2], ret[2], t2); - add_fp2x2(ret[2], ret[2], t1); -} - -static inline void redc_fp6x2(vec384fp6 ret, const vec768fp6 a) -{ - redc_fp2x2(ret[0], a[0]); - redc_fp2x2(ret[1], a[1]); - redc_fp2x2(ret[2], a[2]); -} - -static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) -{ - vec768fp6 r; - - mul_fp6x2(r, a, b); - redc_fp6x2(ret, r); /* narrow to normal width */ -} - -static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) -{ - vec768x s0, m01, m12, s2, rx; - - sqr_fp2x2(s0, a[0]); - - mul_fp2x2(m01, a[0], a[1]); - add_fp2x2(m01, m01, m01); - - mul_fp2x2(m12, a[1], a[2]); - add_fp2x2(m12, m12, m12); - - sqr_fp2x2(s2, a[2]); - - /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) - = a1^2 + 2*(a0*a2) */ - add_fp2(ret[2], a[2], a[1]); - add_fp2(ret[2], ret[2], a[0]); - 
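-    /* ret[2] now holds a0+a1+a2; square it once in double width, then
-     * peel off the s0, s2, m01 and m12 terms already computed above */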
sqr_fp2x2(rx, ret[2]); - sub_fp2x2(rx, rx, s0); - sub_fp2x2(rx, rx, s2); - sub_fp2x2(rx, rx, m01); - sub_fp2x2(rx, rx, m12); - redc_fp2x2(ret[2], rx); - - /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ - mul_by_u_plus_1_fp2x2(rx, m12); - add_fp2x2(rx, rx, s0); - redc_fp2x2(ret[0], rx); - - /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ - mul_by_u_plus_1_fp2x2(rx, s2); - add_fp2x2(rx, rx, m01); - redc_fp2x2(ret[1], rx); -} -#else -static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) -{ - vec384x t0, t1, t2, t3, t4, t5; - - mul_fp2(t0, a[0], b[0]); - mul_fp2(t1, a[1], b[1]); - mul_fp2(t2, a[2], b[2]); - - /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 - = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ - add_fp2(t4, a[1], a[2]); - add_fp2(t5, b[1], b[2]); - mul_fp2(t3, t4, t5); - sub_fp2(t3, t3, t1); - sub_fp2(t3, t3, t2); - mul_by_u_plus_1_fp2(t3, t3); - /* add_fp2(ret[0], t3, t0); considering possible aliasing... */ - - /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) - = a0*b1 + a1*b0 + a2*b2*(u+1) */ - add_fp2(t4, a[0], a[1]); - add_fp2(t5, b[0], b[1]); - mul_fp2(ret[1], t4, t5); - sub_fp2(ret[1], ret[1], t0); - sub_fp2(ret[1], ret[1], t1); - mul_by_u_plus_1_fp2(t4, t2); - add_fp2(ret[1], ret[1], t4); - - /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 - = a0*b2 + a2*b0 + a1*b1 */ - add_fp2(t4, a[0], a[2]); - add_fp2(t5, b[0], b[2]); - mul_fp2(ret[2], t4, t5); - sub_fp2(ret[2], ret[2], t0); - sub_fp2(ret[2], ret[2], t2); - add_fp2(ret[2], ret[2], t1); - - add_fp2(ret[0], t3, t0); /* ... moved from above */ -} - -static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) -{ - vec384x s0, m01, m12, s2; - - sqr_fp2(s0, a[0]); - - mul_fp2(m01, a[0], a[1]); - add_fp2(m01, m01, m01); - - mul_fp2(m12, a[1], a[2]); - add_fp2(m12, m12, m12); - - sqr_fp2(s2, a[2]); - - /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) - = a1^2 + 2*(a0*a2) */ - add_fp2(ret[2], a[2], a[1]); - add_fp2(ret[2], ret[2], a[0]); - sqr_fp2(ret[2], ret[2]); - sub_fp2(ret[2], ret[2], s0); - sub_fp2(ret[2], ret[2], s2); - sub_fp2(ret[2], ret[2], m01); - sub_fp2(ret[2], ret[2], m12); - - /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ - mul_by_u_plus_1_fp2(ret[0], m12); - add_fp2(ret[0], ret[0], s0); - - /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ - mul_by_u_plus_1_fp2(ret[1], s2); - add_fp2(ret[1], ret[1], m01); -} -#endif - -static void add_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) -{ - add_fp2(ret[0], a[0], b[0]); - add_fp2(ret[1], a[1], b[1]); - add_fp2(ret[2], a[2], b[2]); -} - -static void sub_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) -{ - sub_fp2(ret[0], a[0], b[0]); - sub_fp2(ret[1], a[1], b[1]); - sub_fp2(ret[2], a[2], b[2]); -} - -static void neg_fp6(vec384fp6 ret, const vec384fp6 a) -{ - neg_fp2(ret[0], a[0]); - neg_fp2(ret[1], a[1]); - neg_fp2(ret[2], a[2]); -} - -#if 0 -#define mul_by_v_fp6 mul_by_v_fp6 -static void mul_by_v_fp6(vec384fp6 ret, const vec384fp6 a) -{ - vec384x t; - - mul_by_u_plus_1_fp2(t, a[2]); - vec_copy(ret[2], a[1], sizeof(a[1])); - vec_copy(ret[1], a[0], sizeof(a[0])); - vec_copy(ret[0], t, sizeof(t)); -} -#endif - -/* - * Fp12 extension - */ -#if defined(__FP2x2__) -static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) -{ - vec768fp6 t0, t1, rx; - vec384fp6 t2; - - mul_fp6x2(t0, a[0], b[0]); - mul_fp6x2(t1, a[1], b[1]); - - /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 - = a0*b1 + a1*b0 */ - add_fp6(t2, a[0], a[1]); - add_fp6(ret[1], b[0], b[1]); - mul_fp6x2(rx, ret[1], t2); - sub_fp6x2(rx, rx, t0); - sub_fp6x2(rx, 
rx, t1); - redc_fp6x2(ret[1], rx); - - /* ret[0] = a0*b0 + a1*b1*v */ - mul_by_u_plus_1_fp2x2(rx[0], t1[2]); - add_fp2x2(rx[0], t0[0], rx[0]); - add_fp2x2(rx[1], t0[1], t1[0]); - add_fp2x2(rx[2], t0[2], t1[1]); - redc_fp6x2(ret[0], rx); -} - -static inline void mul_by_0y0_fp6x2(vec768fp6 ret, const vec384fp6 a, - const vec384fp2 b) -{ - mul_fp2x2(ret[1], a[2], b); /* borrow ret[1] for a moment */ - mul_by_u_plus_1_fp2x2(ret[0], ret[1]); - mul_fp2x2(ret[1], a[0], b); - mul_fp2x2(ret[2], a[1], b); -} - -static void mul_by_xy0_fp6x2(vec768fp6 ret, const vec384fp6 a, - const vec384fp6 b) -{ - vec768x t0, t1; - vec384x aa, bb; - - mul_fp2x2(t0, a[0], b[0]); - mul_fp2x2(t1, a[1], b[1]); - - /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 - = (a1*0 + a2*b1)*(u+1) + a0*b0 */ - mul_fp2x2(ret[1], a[2], b[1]); /* borrow ret[1] for a moment */ - mul_by_u_plus_1_fp2x2(ret[0], ret[1]); - add_fp2x2(ret[0], ret[0], t0); - - /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) - = a0*b1 + a1*b0 + a2*0*(u+1) */ - add_fp2(aa, a[0], a[1]); - add_fp2(bb, b[0], b[1]); - mul_fp2x2(ret[1], aa, bb); - sub_fp2x2(ret[1], ret[1], t0); - sub_fp2x2(ret[1], ret[1], t1); - - /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 - = a0*0 + a2*b0 + a1*b1 */ - mul_fp2x2(ret[2], a[2], b[0]); - add_fp2x2(ret[2], ret[2], t1); -} - -static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, - const vec384fp6 xy00z0) -{ - vec768fp6 t0, t1, rr; - vec384fp6 t2; - - mul_by_xy0_fp6x2(t0, a[0], xy00z0); - mul_by_0y0_fp6x2(t1, a[1], xy00z0[2]); - - /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 - = a0*b1 + a1*b0 */ - vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); - add_fp2(t2[1], xy00z0[1], xy00z0[2]); - add_fp6(ret[1], a[0], a[1]); - mul_by_xy0_fp6x2(rr, ret[1], t2); - sub_fp6x2(rr, rr, t0); - sub_fp6x2(rr, rr, t1); - redc_fp6x2(ret[1], rr); - - /* ret[0] = a0*b0 + a1*b1*v */ - mul_by_u_plus_1_fp2x2(rr[0], t1[2]); - add_fp2x2(rr[0], t0[0], rr[0]); - add_fp2x2(rr[1], t0[1], t1[0]); - add_fp2x2(rr[2], t0[2], t1[1]); - redc_fp6x2(ret[0], rr); -} -#else -static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) -{ - vec384fp6 t0, t1, t2; - - mul_fp6(t0, a[0], b[0]); - mul_fp6(t1, a[1], b[1]); - - /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 - = a0*b1 + a1*b0 */ - add_fp6(t2, a[0], a[1]); - add_fp6(ret[1], b[0], b[1]); - mul_fp6(ret[1], ret[1], t2); - sub_fp6(ret[1], ret[1], t0); - sub_fp6(ret[1], ret[1], t1); - - /* ret[0] = a0*b0 + a1*b1*v */ -#ifdef mul_by_v_fp6 - mul_by_v_fp6(t1, t1); - add_fp6(ret[0], t0, t1); -#else - mul_by_u_plus_1_fp2(t1[2], t1[2]); - add_fp2(ret[0][0], t0[0], t1[2]); - add_fp2(ret[0][1], t0[1], t1[0]); - add_fp2(ret[0][2], t0[2], t1[1]); -#endif -} - -static inline void mul_by_0y0_fp6(vec384fp6 ret, const vec384fp6 a, - const vec384fp2 b) -{ - vec384x t; - - mul_fp2(t, a[2], b); - mul_fp2(ret[2], a[1], b); - mul_fp2(ret[1], a[0], b); - mul_by_u_plus_1_fp2(ret[0], t); -} - -static void mul_by_xy0_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) -{ - vec384x t0, t1, /*t2,*/ t3, t4, t5; - - mul_fp2(t0, a[0], b[0]); - mul_fp2(t1, a[1], b[1]); - - /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 - = (a1*0 + a2*b1)*(u+1) + a0*b0 */ - mul_fp2(t3, a[2], b[1]); - mul_by_u_plus_1_fp2(t3, t3); - /* add_fp2(ret[0], t3, t0); considering possible aliasing... 
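-     * (if |ret| aliases |a| or |b|, storing ret[0] this early would
-     * clobber inputs that the ret[1] and ret[2] computations below
-     * still read, hence the store is postponed)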
*/ - - /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) - = a0*b1 + a1*b0 + a2*0*(u+1) */ - add_fp2(t4, a[0], a[1]); - add_fp2(t5, b[0], b[1]); - mul_fp2(ret[1], t4, t5); - sub_fp2(ret[1], ret[1], t0); - sub_fp2(ret[1], ret[1], t1); - - /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 - = a0*0 + a2*b0 + a1*b1 */ - mul_fp2(ret[2], a[2], b[0]); - add_fp2(ret[2], ret[2], t1); - - add_fp2(ret[0], t3, t0); /* ... moved from above */ -} - -static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, - const vec384fp6 xy00z0) -{ - vec384fp6 t0, t1, t2; - - mul_by_xy0_fp6(t0, a[0], xy00z0); - mul_by_0y0_fp6(t1, a[1], xy00z0[2]); - - /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 - = a0*b1 + a1*b0 */ - vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); - add_fp2(t2[1], xy00z0[1], xy00z0[2]); - add_fp6(ret[1], a[0], a[1]); - mul_by_xy0_fp6(ret[1], ret[1], t2); - sub_fp6(ret[1], ret[1], t0); - sub_fp6(ret[1], ret[1], t1); - - /* ret[0] = a0*b0 + a1*b1*v */ -#ifdef mul_by_v_fp6 - mul_by_v_fp6(t1, t1); - add_fp6(ret[0], t0, t1); -#else - mul_by_u_plus_1_fp2(t1[2], t1[2]); - add_fp2(ret[0][0], t0[0], t1[2]); - add_fp2(ret[0][1], t0[1], t1[0]); - add_fp2(ret[0][2], t0[2], t1[1]); -#endif -} -#endif - -static void sqr_fp12(vec384fp12 ret, const vec384fp12 a) -{ - vec384fp6 t0, t1; - - add_fp6(t0, a[0], a[1]); -#ifdef mul_by_v_fp6 - mul_by_v_fp6(t1, a[1]); - add_fp6(t1, a[0], t1); -#else - mul_by_u_plus_1_fp2(t1[2], a[1][2]); - add_fp2(t1[0], a[0][0], t1[2]); - add_fp2(t1[1], a[0][1], a[1][0]); - add_fp2(t1[2], a[0][2], a[1][1]); -#endif - mul_fp6(t0, t0, t1); - mul_fp6(t1, a[0], a[1]); - - /* ret[1] = 2*(a0*a1) */ - add_fp6(ret[1], t1, t1); - - /* ret[0] = (a0 + a1)*(a0 + a1*v) - a0*a1 - a0*a1*v - = a0^2 + a1^2*v */ - sub_fp6(ret[0], t0, t1); -#ifdef mul_by_v_fp6 - mul_by_v_fp6(t1, t1); - sub_fp6(ret[0], ret[0], t1); -#else - mul_by_u_plus_1_fp2(t1[2], t1[2]); - sub_fp2(ret[0][0], ret[0][0], t1[2]); - sub_fp2(ret[0][1], ret[0][1], t1[0]); - sub_fp2(ret[0][2], ret[0][2], t1[1]); -#endif -} - -static void conjugate_fp12(vec384fp12 a) -{ neg_fp6(a[1], a[1]); } - -static void inverse_fp6(vec384fp6 ret, const vec384fp6 a) -{ - vec384x c0, c1, c2, t0, t1; - - /* c0 = a0^2 - (a1*a2)*(u+1) */ - sqr_fp2(c0, a[0]); - mul_fp2(t0, a[1], a[2]); - mul_by_u_plus_1_fp2(t0, t0); - sub_fp2(c0, c0, t0); - - /* c1 = a2^2*(u+1) - (a0*a1) */ - sqr_fp2(c1, a[2]); - mul_by_u_plus_1_fp2(c1, c1); - mul_fp2(t0, a[0], a[1]); - sub_fp2(c1, c1, t0); - - /* c2 = a1^2 - a0*a2 */ - sqr_fp2(c2, a[1]); - mul_fp2(t0, a[0], a[2]); - sub_fp2(c2, c2, t0); - - /* (a2*c1 + a1*c2)*(u+1) + a0*c0 */ - mul_fp2(t0, c1, a[2]); - mul_fp2(t1, c2, a[1]); - add_fp2(t0, t0, t1); - mul_by_u_plus_1_fp2(t0, t0); - mul_fp2(t1, c0, a[0]); - add_fp2(t0, t0, t1); - - reciprocal_fp2(t1, t0); - - mul_fp2(ret[0], c0, t1); - mul_fp2(ret[1], c1, t1); - mul_fp2(ret[2], c2, t1); -} - -static void inverse_fp12(vec384fp12 ret, const vec384fp12 a) -{ - vec384fp6 t0, t1; - - sqr_fp6(t0, a[0]); - sqr_fp6(t1, a[1]); -#ifdef mul_by_v_fp6 - mul_by_v_fp6(t1, t1); - sub_fp6(t0, t0, t1); -#else - mul_by_u_plus_1_fp2(t1[2], t1[2]); - sub_fp2(t0[0], t0[0], t1[2]); - sub_fp2(t0[1], t0[1], t1[0]); - sub_fp2(t0[2], t0[2], t1[1]); -#endif - - inverse_fp6(t1, t0); - - mul_fp6(ret[0], a[0], t1); - mul_fp6(ret[1], a[1], t1); - neg_fp6(ret[1], ret[1]); -} - -typedef vec384x vec384fp4[2]; - -#if defined(__FP2x2__) -static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) -{ - vec768x t0, t1, t2; - - sqr_fp2x2(t0, a0); - sqr_fp2x2(t1, a1); - add_fp2(ret[1], a0, 
a1); - - mul_by_u_plus_1_fp2x2(t2, t1); - add_fp2x2(t2, t2, t0); - redc_fp2x2(ret[0], t2); - - sqr_fp2x2(t2, ret[1]); - sub_fp2x2(t2, t2, t0); - sub_fp2x2(t2, t2, t1); - redc_fp2x2(ret[1], t2); -} -#else -static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) -{ - vec384x t0, t1; - - sqr_fp2(t0, a0); - sqr_fp2(t1, a1); - add_fp2(ret[1], a0, a1); - - mul_by_u_plus_1_fp2(ret[0], t1); - add_fp2(ret[0], ret[0], t0); - - sqr_fp2(ret[1], ret[1]); - sub_fp2(ret[1], ret[1], t0); - sub_fp2(ret[1], ret[1], t1); -} -#endif - -static void cyclotomic_sqr_fp12(vec384fp12 ret, const vec384fp12 a) -{ - vec384fp4 t0, t1, t2; - - sqr_fp4(t0, a[0][0], a[1][1]); - sqr_fp4(t1, a[1][0], a[0][2]); - sqr_fp4(t2, a[0][1], a[1][2]); - - sub_fp2(ret[0][0], t0[0], a[0][0]); - add_fp2(ret[0][0], ret[0][0], ret[0][0]); - add_fp2(ret[0][0], ret[0][0], t0[0]); - - sub_fp2(ret[0][1], t1[0], a[0][1]); - add_fp2(ret[0][1], ret[0][1], ret[0][1]); - add_fp2(ret[0][1], ret[0][1], t1[0]); - - sub_fp2(ret[0][2], t2[0], a[0][2]); - add_fp2(ret[0][2], ret[0][2], ret[0][2]); - add_fp2(ret[0][2], ret[0][2], t2[0]); - - mul_by_u_plus_1_fp2(t2[1], t2[1]); - add_fp2(ret[1][0], t2[1], a[1][0]); - add_fp2(ret[1][0], ret[1][0], ret[1][0]); - add_fp2(ret[1][0], ret[1][0], t2[1]); - - add_fp2(ret[1][1], t0[1], a[1][1]); - add_fp2(ret[1][1], ret[1][1], ret[1][1]); - add_fp2(ret[1][1], ret[1][1], t0[1]); - - add_fp2(ret[1][2], t1[1], a[1][2]); - add_fp2(ret[1][2], ret[1][2], ret[1][2]); - add_fp2(ret[1][2], ret[1][2], t1[1]); -} - -/* - * caveat lector! |n| has to be non-zero and not more than 3! - */ -static inline void frobenius_map_fp2(vec384x ret, const vec384x a, size_t n) -{ - vec_copy(ret[0], a[0], sizeof(ret[0])); - cneg_fp(ret[1], a[1], n & 1); -} - -static void frobenius_map_fp6(vec384fp6 ret, const vec384fp6 a, size_t n) -{ - static const vec384x coeffs1[] = { /* (u + 1)^((P^n - 1) / 3) */ - { { 0 }, - { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), - TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), - TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) } }, - { { TO_LIMB_T(0x30f1361b798a64e8), TO_LIMB_T(0xf3b8ddab7ece5a2a), - TO_LIMB_T(0x16a8ca3ac61577f7), TO_LIMB_T(0xc26a2ff874fd029b), - TO_LIMB_T(0x3636b76660701c6e), TO_LIMB_T(0x051ba4ab241b6160) } }, - { { 0 }, { ONE_MONT_P } } - }; - static const vec384 coeffs2[] = { /* (u + 1)^((2P^n - 2) / 3) */ - { TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), - TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), - TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) }, - { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), - TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), - TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) }, - { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), - TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), - TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } - }; - - frobenius_map_fp2(ret[0], a[0], n); - frobenius_map_fp2(ret[1], a[1], n); - frobenius_map_fp2(ret[2], a[2], n); - --n; /* implied ONE_MONT_P at index 0 */ - mul_fp2(ret[1], ret[1], coeffs1[n]); - mul_fp(ret[2][0], ret[2][0], coeffs2[n]); - mul_fp(ret[2][1], ret[2][1], coeffs2[n]); -} - -static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n) -{ - static const vec384x coeffs[] = { /* (u + 1)^((P^n - 1) / 6) */ - { { TO_LIMB_T(0x07089552b319d465), TO_LIMB_T(0xc6695f92b50a8313), - TO_LIMB_T(0x97e83cccd117228f), 
TO_LIMB_T(0xa35baecab2dc29ee), - TO_LIMB_T(0x1ce393ea5daace4d), TO_LIMB_T(0x08f2220fb0fb66eb) }, - { TO_LIMB_T(0xb2f66aad4ce5d646), TO_LIMB_T(0x5842a06bfc497cec), - TO_LIMB_T(0xcf4895d42599d394), TO_LIMB_T(0xc11b9cba40a8e8d0), - TO_LIMB_T(0x2e3813cbe5a0de89), TO_LIMB_T(0x110eefda88847faf) } }, - { { TO_LIMB_T(0xecfb361b798dba3a), TO_LIMB_T(0xc100ddb891865a2c), - TO_LIMB_T(0x0ec08ff1232bda8e), TO_LIMB_T(0xd5c13cc6f1ca4721), - TO_LIMB_T(0x47222a47bf7b5c04), TO_LIMB_T(0x0110f184e51c5f59) } }, - { { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), - TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), - TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, - { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), - TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), - TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } }, - }; - - frobenius_map_fp6(ret[0], a[0], n); - frobenius_map_fp6(ret[1], a[1], n); - --n; /* implied ONE_MONT_P at index 0 */ - mul_fp2(ret[1][0], ret[1][0], coeffs[n]); - mul_fp2(ret[1][1], ret[1][1], coeffs[n]); - mul_fp2(ret[1][2], ret[1][2], coeffs[n]); -} - - -/* - * BLS12-381-specific Fp12 shortcuts. - */ -void blst_fp12_sqr(vec384fp12 ret, const vec384fp12 a) -{ sqr_fp12(ret, a); } - -void blst_fp12_cyclotomic_sqr(vec384fp12 ret, const vec384fp12 a) -{ cyclotomic_sqr_fp12(ret, a); } - -void blst_fp12_mul(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) -{ mul_fp12(ret, a, b); } - -void blst_fp12_mul_by_xy00z0(vec384fp12 ret, const vec384fp12 a, - const vec384fp6 xy00z0) -{ mul_by_xy00z0_fp12(ret, a, xy00z0); } - -void blst_fp12_conjugate(vec384fp12 a) -{ conjugate_fp12(a); } - -void blst_fp12_inverse(vec384fp12 ret, const vec384fp12 a) -{ inverse_fp12(ret, a); } - -/* caveat lector! |n| has to be non-zero and not more than 3! */ -void blst_fp12_frobenius_map(vec384fp12 ret, const vec384fp12 a, size_t n) -{ frobenius_map_fp12(ret, a, n); } - -int blst_fp12_is_equal(const vec384fp12 a, const vec384fp12 b) -{ return (int)vec_is_equal(a, b, sizeof(vec384fp12)); } - -int blst_fp12_is_one(const vec384fp12 a) -{ - return (int)(vec_is_equal(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])) & - vec_is_zero(a[0][1], sizeof(vec384fp12) - sizeof(a[0][0]))); -} - -const vec384fp12 *blst_fp12_one(void) -{ return (const vec384fp12 *)BLS12_381_Rx.p12; } - -void blst_bendian_from_fp12(unsigned char ret[48*12], const vec384fp12 a) -{ - size_t i, j; - vec384 out; - - for (i = 0; i < 3; i++) { - for (j = 0; j < 2; j++) { - from_fp(out, a[j][i][0]); - be_bytes_from_limbs(ret, out, sizeof(vec384)); ret += 48; - from_fp(out, a[j][i][1]); - be_bytes_from_limbs(ret, out, sizeof(vec384)); ret += 48; - } - } -} - -size_t blst_fp12_sizeof(void) -{ return sizeof(vec384fp12); } diff --git a/crypto/blst_src/hash_to_field.c b/crypto/blst_src/hash_to_field.c deleted file mode 100644 index 6816ea8b922..00000000000 --- a/crypto/blst_src/hash_to_field.c +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -#include "consts.h" -#include "sha256.h" - -static const vec384 BLS12_381_RRRR = { /* RR^2 */ - TO_LIMB_T(0xed48ac6bd94ca1e0), TO_LIMB_T(0x315f831e03a7adf8), - TO_LIMB_T(0x9a53352a615e29dd), TO_LIMB_T(0x34c04e5e921e1761), - TO_LIMB_T(0x2512d43565724728), TO_LIMB_T(0x0aa6346091755d4d) -}; - -#ifdef expand_message_xmd -void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, - const unsigned char *aug, size_t aug_len, - const unsigned char *msg, size_t msg_len, - const unsigned char *DST, size_t DST_len); -#else -static void sha256_init_Zpad(SHA256_CTX *ctx) -{ - ctx->h[0] = 0xda5698beU; - ctx->h[1] = 0x17b9b469U; - ctx->h[2] = 0x62335799U; - ctx->h[3] = 0x779fbecaU; - ctx->h[4] = 0x8ce5d491U; - ctx->h[5] = 0xc0d26243U; - ctx->h[6] = 0xbafef9eaU; - ctx->h[7] = 0x1837a9d8U; - ctx->N = 64; - vec_zero(ctx->buf, sizeof(ctx->buf)); - ctx->off = 0; -} - -static void vec_xor(void *restrict ret, const void *restrict a, - const void *restrict b, size_t num) -{ - limb_t *rp = (limb_t *)ret; - const limb_t *ap = (const limb_t *)a; - const limb_t *bp = (const limb_t *)b; - size_t i; - - num /= sizeof(limb_t); - - for (i = 0; i < num; i++) - rp[i] = ap[i] ^ bp[i]; -} - -static void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, - const unsigned char *aug, size_t aug_len, - const unsigned char *msg, size_t msg_len, - const unsigned char *DST, size_t DST_len) -{ - union { limb_t align; unsigned char c[32]; } b_0; - union { limb_t align; unsigned char c[33+256+31]; } b_i; - unsigned char *p; - size_t i, b_i_bits, b_i_blocks; - SHA256_CTX ctx; - - /* - * compose template for 'strxor(b_0, b_(i-1)) || I2OSP(i, 1) || DST_prime' - */ - if (DST_len > 255) { - sha256_init(&ctx); - sha256_update(&ctx, "H2C-OVERSIZE-DST-", 17); - sha256_update(&ctx, DST, DST_len); - sha256_final(b_0.c, &ctx); - DST = b_0.c, DST_len = 32; - } - b_i_blocks = ((33 + DST_len + 1 + 9) + 63) & -64; - vec_zero(b_i.c + b_i_blocks - 64, 64); - - p = b_i.c + 33; - for (i = 0; i < DST_len; i++) - p[i] = DST[i]; - p[i++] = (unsigned char)DST_len; - p[i++] = 0x80; - p[i+6] = p[i+5] = p[i+4] = p[i+3] = p[i+2] = p[i+1] = p[i+0] = 0; - b_i_bits = (33 + DST_len + 1) * 8; - p = b_i.c + b_i_blocks; - p[-2] = (unsigned char)(b_i_bits >> 8); - p[-1] = (unsigned char)(b_i_bits); - - sha256_init_Zpad(&ctx); /* Z_pad | */ - sha256_update(&ctx, aug, aug_len); /* | aug | */ - sha256_update(&ctx, msg, msg_len); /* | msg | */ - /* | I2OSP(len_in_bytes, 2) || I2OSP(0, 1) || DST_prime */ - b_i.c[30] = (unsigned char)(len_in_bytes >> 8); - b_i.c[31] = (unsigned char)(len_in_bytes); - b_i.c[32] = 0; - sha256_update(&ctx, b_i.c + 30, 3 + DST_len + 1); - sha256_final(b_0.c, &ctx); - - sha256_init_h(ctx.h); - vec_copy(b_i.c, b_0.c, 32); - ++b_i.c[32]; - sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); - sha256_emit(bytes, ctx.h); - - len_in_bytes += 31; /* ell = ceil(len_in_bytes / b_in_bytes), with */ - len_in_bytes /= 32; /* caller being responsible for accordingly large - * buffer. hash_to_field passes one with length - * divisible by 64, remember? which works... 
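- * because the template keeps the single-byte block counter at
- * b_i.c[32]: each round XORs b_0 into the previous 32-byte output,
- * increments that counter and reruns the compression over the padded
- * template, i.e. b_i = H(strxor(b_0, b_(i-1)) || I2OSP(i, 1) || DST_prime)
- * as in the hash-to-curve draft's expand_message_xmd.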
*/ - while (--len_in_bytes) { - sha256_init_h(ctx.h); - vec_xor(b_i.c, b_0.c, bytes, 32); - bytes += 32; - ++b_i.c[32]; - sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); - sha256_emit(bytes, ctx.h); - } -} -#endif - -/* - * |nelems| is 'count * m' from spec - */ -static void hash_to_field(vec384 elems[], size_t nelems, - const unsigned char *aug, size_t aug_len, - const unsigned char *msg, size_t msg_len, - const unsigned char *DST, size_t DST_len) -{ - size_t L = sizeof(vec384) + 128/8; /* ceil((ceil(log2(p)) + k) / 8) */ - size_t len_in_bytes = L * nelems; /* divisible by 64, hurray! */ -#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ - || defined(__STDC_NO_VLA__) - limb_t *pseudo_random = alloca(len_in_bytes); -#else - limb_t pseudo_random[len_in_bytes/sizeof(limb_t)]; -#endif - unsigned char *bytes; - vec768 elem; - - aug_len = aug!=NULL ? aug_len : 0; - DST_len = DST!=NULL ? DST_len : 0; - - expand_message_xmd((unsigned char *)pseudo_random, len_in_bytes, - aug, aug_len, msg, msg_len, DST, DST_len); - - vec_zero(elem, sizeof(elem)); - bytes = (unsigned char *)pseudo_random; - while (nelems--) { - limbs_from_be_bytes(elem, bytes, L); - bytes += L; - /* - * L-bytes block % P, output is in Montgomery domain... - */ - redc_mont_384(elems[0], elem, BLS12_381_P, p0); - mul_mont_384(elems[0], elems[0], BLS12_381_RRRR, BLS12_381_P, p0); - elems++; - } -} - -void blst_expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, - const unsigned char *msg, size_t msg_len, - const unsigned char *DST, size_t DST_len) -{ - size_t buf_len = (len_in_bytes+31) & ((size_t)0-32); - unsigned char *buf_ptr = bytes; - - if (buf_len > 255*32) - return; - - if (buf_len != len_in_bytes) - buf_ptr = alloca(buf_len); - - expand_message_xmd(buf_ptr, len_in_bytes, NULL, 0, msg, msg_len, - DST, DST_len); - if (buf_ptr != bytes) { - unsigned char *ptr = buf_ptr; - while (len_in_bytes--) - *bytes++ = *ptr++; - vec_zero(buf_ptr, buf_len); - } -} diff --git a/crypto/blst_src/keygen.c b/crypto/blst_src/keygen.c deleted file mode 100644 index 9b62f16b534..00000000000 --- a/crypto/blst_src/keygen.c +++ /dev/null @@ -1,319 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -#include "consts.h" -#include "bytes.h" -#include "sha256.h" - -typedef struct { - SHA256_CTX ctx; - unsigned int h_ipad[8]; - unsigned int h_opad[8]; - union { limb_t l[64/sizeof(limb_t)]; unsigned char c[64]; } tail; -} HMAC_SHA256_CTX; - -static void HMAC_init(HMAC_SHA256_CTX *ctx, const void *K, size_t K_len) -{ - size_t i; - - if (K == NULL) { /* reuse h_ipad and h_opad */ - sha256_hcopy(ctx->ctx.h, ctx->h_ipad); - ctx->ctx.N = 64; - vec_zero(ctx->ctx.buf, sizeof(ctx->ctx.buf)); - ctx->ctx.off = 0; - - return; - } - - vec_zero(ctx->tail.c, sizeof(ctx->tail)); - if (K_len > 64) { - sha256_init(&ctx->ctx); - sha256_update(&ctx->ctx, K, K_len); - sha256_final(ctx->tail.c, &ctx->ctx); - } else { - sha256_bcopy(ctx->tail.c, K, K_len); - } - - for (i = 0; i < 64/sizeof(limb_t); i++) - ctx->tail.l[i] ^= (limb_t)0x3636363636363636; - - sha256_init(&ctx->ctx); - sha256_update(&ctx->ctx, ctx->tail.c, 64); - sha256_hcopy(ctx->h_ipad, ctx->ctx.h); - - for (i = 0; i < 64/sizeof(limb_t); i++) - ctx->tail.l[i] ^= (limb_t)(0x3636363636363636 ^ 0x5c5c5c5c5c5c5c5c); - - sha256_init_h(ctx->h_opad); - sha256_block_data_order(ctx->h_opad, ctx->tail.c, 1); - - vec_zero(ctx->tail.c, sizeof(ctx->tail)); - ctx->tail.c[32] = 0x80; - ctx->tail.c[62] = 3; /* (64+32)*8 in big endian */ - ctx->tail.c[63] = 0; -} - -static void HMAC_update(HMAC_SHA256_CTX *ctx, const unsigned char *inp, - size_t len) -{ sha256_update(&ctx->ctx, inp, len); } - -static void HMAC_final(unsigned char md[32], HMAC_SHA256_CTX *ctx) -{ - sha256_final(ctx->tail.c, &ctx->ctx); - sha256_hcopy(ctx->ctx.h, ctx->h_opad); - sha256_block_data_order(ctx->ctx.h, ctx->tail.c, 1); - sha256_emit(md, ctx->ctx.h); -} - -static void HKDF_Extract(unsigned char PRK[32], - const void *salt, size_t salt_len, - const void *IKM, size_t IKM_len, -#ifndef __BLST_HKDF_TESTMODE__ - int IKM_fixup, -#endif - HMAC_SHA256_CTX *ctx) -{ - unsigned char zero[1] = { 0 }; - - HMAC_init(ctx, salt != NULL ? 
salt : zero, salt_len); - HMAC_update(ctx, IKM, IKM_len); -#ifndef __BLST_HKDF_TESTMODE__ - if (IKM_fixup) { - /* Section 2.3 KeyGen in BLS-signature draft */ - HMAC_update(ctx, zero, 1); - } -#endif - HMAC_final(PRK, ctx); -} - -static void HKDF_Expand(unsigned char *OKM, size_t L, - const unsigned char PRK[32], - const void *info, size_t info_len, -#ifndef __BLST_HKDF_TESTMODE__ - int info_fixup, -#endif - HMAC_SHA256_CTX *ctx) -{ -#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ - || defined(__STDC_NO_VLA__) - unsigned char *info_prime = alloca(info_len + 2 + 1); -#else - unsigned char info_prime[info_len + 2 + 1]; -#endif - - HMAC_init(ctx, PRK, 32); - - if (info_len != 0) - sha256_bcopy(info_prime, info, info_len); -#ifndef __BLST_HKDF_TESTMODE__ - if (info_fixup) { - /* Section 2.3 KeyGen in BLS-signature draft */ - info_prime[info_len + 0] = (unsigned char)(L >> 8); - info_prime[info_len + 1] = (unsigned char)(L); - info_len += 2; - } -#endif - info_prime[info_len] = 1; /* counter */ - HMAC_update(ctx, info_prime, info_len + 1); - HMAC_final(ctx->tail.c, ctx); - while (L > 32) { - sha256_hcopy((unsigned int *)OKM, (const unsigned int *)ctx->tail.c); - OKM += 32; L -= 32; - ++info_prime[info_len]; /* counter */ - HMAC_init(ctx, NULL, 0); - HMAC_update(ctx, ctx->tail.c, 32); - HMAC_update(ctx, info_prime, info_len + 1); - HMAC_final(ctx->tail.c, ctx); - } - sha256_bcopy(OKM, ctx->tail.c, L); -} - -#ifndef __BLST_HKDF_TESTMODE__ -static void keygen(pow256 SK, const void *IKM, size_t IKM_len, - const void *salt, size_t salt_len, - const void *info, size_t info_len, - int version) -{ - struct { - HMAC_SHA256_CTX ctx; - unsigned char PRK[32], OKM[48]; - vec512 key; - } scratch; - unsigned char salt_prime[32] = "BLS-SIG-KEYGEN-SALT-"; - - if (IKM_len < 32 || (version > 4 && salt == NULL)) { - vec_zero(SK, sizeof(pow256)); - return; - } - - /* - * Vet |info| since some callers were caught to be sloppy, e.g. - * SWIG-4.0-generated Python wrapper... - */ - info_len = info==NULL ? 0 : info_len; - - if (salt == NULL) { - salt = salt_prime; - salt_len = 20; - } - - if (version == 4) { - /* salt = H(salt) */ - sha256_init(&scratch.ctx.ctx); - sha256_update(&scratch.ctx.ctx, salt, salt_len); - sha256_final(salt_prime, &scratch.ctx.ctx); - salt = salt_prime; - salt_len = sizeof(salt_prime); - } - - while (1) { - /* PRK = HKDF-Extract(salt, IKM || I2OSP(0, 1)) */ - HKDF_Extract(scratch.PRK, salt, salt_len, - IKM, IKM_len, 1, &scratch.ctx); - - /* OKM = HKDF-Expand(PRK, key_info || I2OSP(L, 2), L) */ - HKDF_Expand(scratch.OKM, sizeof(scratch.OKM), scratch.PRK, - info, info_len, 1, &scratch.ctx); - - /* SK = OS2IP(OKM) mod r */ - vec_zero(scratch.key, sizeof(scratch.key)); - limbs_from_be_bytes(scratch.key, scratch.OKM, sizeof(scratch.OKM)); - redc_mont_256(scratch.key, scratch.key, BLS12_381_r, r0); - /* - * Given that mul_mont_sparse_256 has special boundary conditions - * it's appropriate to mention that redc_mont_256 output is fully - * reduced at this point. Because we started with 384-bit input, - * one with most significant half smaller than the modulus. 
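- * Concretely: the 48-byte OKM sits in a 512-bit buffer, redc_mont_256
- * multiplies by 2^(-256) mod r, and the multiplication by
- * rRR = 2^512 mod r below carries another implicit 2^(-256) factor,
- * so the net result is OKM * 2^(-256) * 2^512 * 2^(-256) = OKM mod r,
- * which is exactly the spec's SK = OS2IP(OKM) mod r.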
- */ - mul_mont_sparse_256(scratch.key, scratch.key, BLS12_381_rRR, - BLS12_381_r, r0); - - if (version < 4 || !vec_is_zero(scratch.key, sizeof(vec256))) - break; - - /* salt = H(salt) */ - sha256_init(&scratch.ctx.ctx); - sha256_update(&scratch.ctx.ctx, salt, salt_len); - sha256_final(salt_prime, &scratch.ctx.ctx); - salt = salt_prime; - salt_len = sizeof(salt_prime); - } - - le_bytes_from_limbs(SK, scratch.key, sizeof(pow256)); - - /* - * scrub the stack just in case next callee inadvertently flashes - * a fragment across application boundary... - */ - vec_zero(&scratch, sizeof(scratch)); -} - -void blst_keygen(pow256 SK, const void *IKM, size_t IKM_len, - const void *info, size_t info_len) -{ keygen(SK, IKM, IKM_len, NULL, 0, info, info_len, 4); } - -void blst_keygen_v3(pow256 SK, const void *IKM, size_t IKM_len, - const void *info, size_t info_len) -{ keygen(SK, IKM, IKM_len, NULL, 0, info, info_len, 3); } - -void blst_keygen_v4_5(pow256 SK, const void *IKM, size_t IKM_len, - const void *salt, size_t salt_len, - const void *info, size_t info_len) -{ keygen(SK, IKM, IKM_len, salt, salt_len, info, info_len, 4); } - -void blst_keygen_v5(pow256 SK, const void *IKM, size_t IKM_len, - const void *salt, size_t salt_len, - const void *info, size_t info_len) -{ keygen(SK, IKM, IKM_len, salt, salt_len, info, info_len, 5); } - -/* - * https://eips.ethereum.org/EIPS/eip-2333 - */ -void blst_derive_master_eip2333(pow256 SK, const void *seed, size_t seed_len) -{ keygen(SK, seed, seed_len, NULL, 0, NULL, 0, 4); } - -static void parent_SK_to_lamport_PK(pow256 PK, const pow256 parent_SK, - unsigned int index) -{ - size_t i; - struct { - HMAC_SHA256_CTX ctx; - SHA256_CTX ret; - unsigned char PRK[32], IKM[32]; - unsigned char lamport[255][32]; - } scratch; - - /* salt = I2OSP(index, 4) */ - unsigned char salt[4] = { (unsigned char)(index>>24), - (unsigned char)(index>>16), - (unsigned char)(index>>8), - (unsigned char)(index) }; - - /* IKM = I2OSP(parent_SK, 32) */ - for (i = 0; i < 32; i++) - scratch.IKM[i] = parent_SK[31-i]; - - /* lamport_0 = IKM_to_lamport_SK(IKM, salt) */ - HKDF_Extract(scratch.PRK, salt, sizeof(salt), scratch.IKM, 32, 0, - &scratch.ctx); - HKDF_Expand(scratch.lamport[0], sizeof(scratch.lamport), - scratch.PRK, NULL, 0, 0, &scratch.ctx); - - vec_zero(scratch.ctx.ctx.buf, sizeof(scratch.ctx.ctx.buf)); - scratch.ctx.ctx.buf[32] = 0x80; - scratch.ctx.ctx.buf[62] = 1; /* 32*8 in big endian */ - scratch.ctx.ctx.buf[63] = 0; - for (i = 0; i < 255; i++) { - /* lamport_PK = lamport_PK | SHA256(lamport_0[i]) */ - sha256_init_h(scratch.ctx.ctx.h); - sha256_bcopy(scratch.ctx.ctx.buf, scratch.lamport[i], 32); - sha256_block_data_order(scratch.ctx.ctx.h, scratch.ctx.ctx.buf, 1); - sha256_emit(scratch.lamport[i], scratch.ctx.ctx.h); - } - - /* compressed_lamport_PK = SHA256(lamport_PK) */ - sha256_init(&scratch.ret); - sha256_update(&scratch.ret, scratch.lamport, sizeof(scratch.lamport)); - - /* not_IKM = flip_bits(IKM) */ - for (i = 0; i< 32; i++) - scratch.IKM[i] = ~scratch.IKM[i]; - - /* lamport_1 = IKM_to_lamport_SK(not_IKM, salt) */ - HKDF_Extract(scratch.PRK, salt, sizeof(salt), scratch.IKM, 32, 0, - &scratch.ctx); - HKDF_Expand(scratch.lamport[0], sizeof(scratch.lamport), - scratch.PRK, NULL, 0, 0, &scratch.ctx); - - vec_zero(scratch.ctx.ctx.buf, sizeof(scratch.ctx.ctx.buf)); - scratch.ctx.ctx.buf[32] = 0x80; - scratch.ctx.ctx.buf[62] = 1; - for (i = 0; i < 255; i++) { - /* lamport_PK = lamport_PK | SHA256(lamport_1[i]) */ - sha256_init_h(scratch.ctx.ctx.h); - 
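-        /* buf was pre-padded before the loop (0x80 marker at buf[32],
-         * 256-bit message length at the tail), so each 32-byte chunk
-         * costs one copy plus a single compression round */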
sha256_bcopy(scratch.ctx.ctx.buf, scratch.lamport[i], 32); - sha256_block_data_order(scratch.ctx.ctx.h, scratch.ctx.ctx.buf, 1); - sha256_emit(scratch.lamport[i], scratch.ctx.ctx.h); - } - - /* compressed_lamport_PK = SHA256(lamport_PK) */ - sha256_update(&scratch.ret, scratch.lamport, sizeof(scratch.lamport)); - sha256_final(PK, &scratch.ret); - - /* - * scrub the stack just in case next callee inadvertently flashes - * a fragment across application boundary... - */ - vec_zero(&scratch, sizeof(scratch)); -} - -void blst_derive_child_eip2333(pow256 SK, const pow256 parent_SK, - unsigned int child_index) -{ - parent_SK_to_lamport_PK(SK, parent_SK, child_index); - keygen(SK, SK, sizeof(pow256), NULL, 0, NULL, 0, 4); -} -#endif diff --git a/crypto/blst_src/map_to_g1.c b/crypto/blst_src/map_to_g1.c deleted file mode 100644 index 6613d68bb29..00000000000 --- a/crypto/blst_src/map_to_g1.c +++ /dev/null @@ -1,559 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "point.h" -#include "fields.h" - -/* - * y^2 = x^3 + A'*x + B', isogenous one - */ -static const vec384 Aprime_E1 = { - /* (0x00144698a3b8e9433d693a02c96d4982b0ea985383ee66a8 - d8e8981aefd881ac98936f8da0e0f97f5cf428082d584c1d << 384) % P */ - TO_LIMB_T(0x2f65aa0e9af5aa51), TO_LIMB_T(0x86464c2d1e8416c3), - TO_LIMB_T(0xb85ce591b7bd31e2), TO_LIMB_T(0x27e11c91b5f24e7c), - TO_LIMB_T(0x28376eda6bfc1835), TO_LIMB_T(0x155455c3e5071d85) -}; -static const vec384 Bprime_E1 = { - /* (0x12e2908d11688030018b12e8753eee3b2016c1f0f24f4070 - a0b9c14fcef35ef55a23215a316ceaa5d1cc48e98e172be0 << 384) % P */ - TO_LIMB_T(0xfb996971fe22a1e0), TO_LIMB_T(0x9aa93eb35b742d6f), - TO_LIMB_T(0x8c476013de99c5c4), TO_LIMB_T(0x873e27c3a221e571), - TO_LIMB_T(0xca72b5e45a52d888), TO_LIMB_T(0x06824061418a386b) -}; - -static void map_fp_times_Zz(vec384 map[], const vec384 isogeny_map[], - const vec384 Zz_powers[], size_t n) -{ - while (n--) - mul_fp(map[n], isogeny_map[n], Zz_powers[n]); -} - -static void map_fp(vec384 acc, const vec384 x, const vec384 map[], size_t n) -{ - while (n--) { - mul_fp(acc, acc, x); - add_fp(acc, acc, map[n]); - } -} - -static void isogeny_map_to_E1(POINTonE1 *out, const POINTonE1 *p) -{ - /* - * x = x_num / x_den, where - * x_num = k_(1,11) * x'^11 + k_(1,10) * x'^10 + k_(1,9) * x'^9 + - * ... + k_(1,0) - * ... 
- */ - static const vec384 isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ - { TO_LIMB_T(0x4d18b6f3af00131c), TO_LIMB_T(0x19fa219793fee28c), - TO_LIMB_T(0x3f2885f1467f19ae), TO_LIMB_T(0x23dcea34f2ffb304), - TO_LIMB_T(0xd15b58d2ffc00054), TO_LIMB_T(0x0913be200a20bef4) }, - { TO_LIMB_T(0x898985385cdbbd8b), TO_LIMB_T(0x3c79e43cc7d966aa), - TO_LIMB_T(0x1597e193f4cd233a), TO_LIMB_T(0x8637ef1e4d6623ad), - TO_LIMB_T(0x11b22deed20d827b), TO_LIMB_T(0x07097bc5998784ad) }, - { TO_LIMB_T(0xa542583a480b664b), TO_LIMB_T(0xfc7169c026e568c6), - TO_LIMB_T(0x5ba2ef314ed8b5a6), TO_LIMB_T(0x5b5491c05102f0e7), - TO_LIMB_T(0xdf6e99707d2a0079), TO_LIMB_T(0x0784151ed7605524) }, - { TO_LIMB_T(0x494e212870f72741), TO_LIMB_T(0xab9be52fbda43021), - TO_LIMB_T(0x26f5577994e34c3d), TO_LIMB_T(0x049dfee82aefbd60), - TO_LIMB_T(0x65dadd7828505289), TO_LIMB_T(0x0e93d431ea011aeb) }, - { TO_LIMB_T(0x90ee774bd6a74d45), TO_LIMB_T(0x7ada1c8a41bfb185), - TO_LIMB_T(0x0f1a8953b325f464), TO_LIMB_T(0x104c24211be4805c), - TO_LIMB_T(0x169139d319ea7a8f), TO_LIMB_T(0x09f20ead8e532bf6) }, - { TO_LIMB_T(0x6ddd93e2f43626b7), TO_LIMB_T(0xa5482c9aa1ccd7bd), - TO_LIMB_T(0x143245631883f4bd), TO_LIMB_T(0x2e0a94ccf77ec0db), - TO_LIMB_T(0xb0282d480e56489f), TO_LIMB_T(0x18f4bfcbb4368929) }, - { TO_LIMB_T(0x23c5f0c953402dfd), TO_LIMB_T(0x7a43ff6958ce4fe9), - TO_LIMB_T(0x2c390d3d2da5df63), TO_LIMB_T(0xd0df5c98e1f9d70f), - TO_LIMB_T(0xffd89869a572b297), TO_LIMB_T(0x1277ffc72f25e8fe) }, - { TO_LIMB_T(0x79f4f0490f06a8a6), TO_LIMB_T(0x85f894a88030fd81), - TO_LIMB_T(0x12da3054b18b6410), TO_LIMB_T(0xe2a57f6505880d65), - TO_LIMB_T(0xbba074f260e400f1), TO_LIMB_T(0x08b76279f621d028) }, - { TO_LIMB_T(0xe67245ba78d5b00b), TO_LIMB_T(0x8456ba9a1f186475), - TO_LIMB_T(0x7888bff6e6b33bb4), TO_LIMB_T(0xe21585b9a30f86cb), - TO_LIMB_T(0x05a69cdcef55feee), TO_LIMB_T(0x09e699dd9adfa5ac) }, - { TO_LIMB_T(0x0de5c357bff57107), TO_LIMB_T(0x0a0db4ae6b1a10b2), - TO_LIMB_T(0xe256bb67b3b3cd8d), TO_LIMB_T(0x8ad456574e9db24f), - TO_LIMB_T(0x0443915f50fd4179), TO_LIMB_T(0x098c4bf7de8b6375) }, - { TO_LIMB_T(0xe6b0617e7dd929c7), TO_LIMB_T(0xfe6e37d442537375), - TO_LIMB_T(0x1dafdeda137a489e), TO_LIMB_T(0xe4efd1ad3f767ceb), - TO_LIMB_T(0x4a51d8667f0fe1cf), TO_LIMB_T(0x054fdf4bbf1d821c) }, - { TO_LIMB_T(0x72db2a50658d767b), TO_LIMB_T(0x8abf91faa257b3d5), - TO_LIMB_T(0xe969d6833764ab47), TO_LIMB_T(0x464170142a1009eb), - TO_LIMB_T(0xb14f01aadb30be2f), TO_LIMB_T(0x18ae6a856f40715d) } - }; - /* ... - * x_den = x'^10 + k_(2,9) * x'^9 + k_(2,8) * x'^8 + ... 
+ k_(2,0) - */ - static const vec384 isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ - { TO_LIMB_T(0xb962a077fdb0f945), TO_LIMB_T(0xa6a9740fefda13a0), - TO_LIMB_T(0xc14d568c3ed6c544), TO_LIMB_T(0xb43fc37b908b133e), - TO_LIMB_T(0x9c0b3ac929599016), TO_LIMB_T(0x0165aa6c93ad115f) }, - { TO_LIMB_T(0x23279a3ba506c1d9), TO_LIMB_T(0x92cfca0a9465176a), - TO_LIMB_T(0x3b294ab13755f0ff), TO_LIMB_T(0x116dda1c5070ae93), - TO_LIMB_T(0xed4530924cec2045), TO_LIMB_T(0x083383d6ed81f1ce) }, - { TO_LIMB_T(0x9885c2a6449fecfc), TO_LIMB_T(0x4a2b54ccd37733f0), - TO_LIMB_T(0x17da9ffd8738c142), TO_LIMB_T(0xa0fba72732b3fafd), - TO_LIMB_T(0xff364f36e54b6812), TO_LIMB_T(0x0f29c13c660523e2) }, - { TO_LIMB_T(0xe349cc118278f041), TO_LIMB_T(0xd487228f2f3204fb), - TO_LIMB_T(0xc9d325849ade5150), TO_LIMB_T(0x43a92bd69c15c2df), - TO_LIMB_T(0x1c2c7844bc417be4), TO_LIMB_T(0x12025184f407440c) }, - { TO_LIMB_T(0x587f65ae6acb057b), TO_LIMB_T(0x1444ef325140201f), - TO_LIMB_T(0xfbf995e71270da49), TO_LIMB_T(0xccda066072436a42), - TO_LIMB_T(0x7408904f0f186bb2), TO_LIMB_T(0x13b93c63edf6c015) }, - { TO_LIMB_T(0xfb918622cd141920), TO_LIMB_T(0x4a4c64423ecaddb4), - TO_LIMB_T(0x0beb232927f7fb26), TO_LIMB_T(0x30f94df6f83a3dc2), - TO_LIMB_T(0xaeedd424d780f388), TO_LIMB_T(0x06cc402dd594bbeb) }, - { TO_LIMB_T(0xd41f761151b23f8f), TO_LIMB_T(0x32a92465435719b3), - TO_LIMB_T(0x64f436e888c62cb9), TO_LIMB_T(0xdf70a9a1f757c6e4), - TO_LIMB_T(0x6933a38d5b594c81), TO_LIMB_T(0x0c6f7f7237b46606) }, - { TO_LIMB_T(0x693c08747876c8f7), TO_LIMB_T(0x22c9850bf9cf80f0), - TO_LIMB_T(0x8e9071dab950c124), TO_LIMB_T(0x89bc62d61c7baf23), - TO_LIMB_T(0xbc6be2d8dad57c23), TO_LIMB_T(0x17916987aa14a122) }, - { TO_LIMB_T(0x1be3ff439c1316fd), TO_LIMB_T(0x9965243a7571dfa7), - TO_LIMB_T(0xc7f7f62962f5cd81), TO_LIMB_T(0x32c6aa9af394361c), - TO_LIMB_T(0xbbc2ee18e1c227f4), TO_LIMB_T(0x0c102cbac531bb34) }, - { TO_LIMB_T(0x997614c97bacbf07), TO_LIMB_T(0x61f86372b99192c0), - TO_LIMB_T(0x5b8c95fc14353fc3), TO_LIMB_T(0xca2b066c2a87492f), - TO_LIMB_T(0x16178f5bbf698711), TO_LIMB_T(0x12a6dcd7f0f4e0e8) } - }; - /* - * y = y' * y_num / y_den, where - * y_num = k_(3,15) * x'^15 + k_(3,14) * x'^14 + k_(3,13) * x'^13 + - * ... + k_(3,0) - * ... 
- */ - static const vec384 isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ - { TO_LIMB_T(0x2b567ff3e2837267), TO_LIMB_T(0x1d4d9e57b958a767), - TO_LIMB_T(0xce028fea04bd7373), TO_LIMB_T(0xcc31a30a0b6cd3df), - TO_LIMB_T(0x7d7b18a682692693), TO_LIMB_T(0x0d300744d42a0310) }, - { TO_LIMB_T(0x99c2555fa542493f), TO_LIMB_T(0xfe7f53cc4874f878), - TO_LIMB_T(0x5df0608b8f97608a), TO_LIMB_T(0x14e03832052b49c8), - TO_LIMB_T(0x706326a6957dd5a4), TO_LIMB_T(0x0a8dadd9c2414555) }, - { TO_LIMB_T(0x13d942922a5cf63a), TO_LIMB_T(0x357e33e36e261e7d), - TO_LIMB_T(0xcf05a27c8456088d), TO_LIMB_T(0x0000bd1de7ba50f0), - TO_LIMB_T(0x83d0c7532f8c1fde), TO_LIMB_T(0x13f70bf38bbf2905) }, - { TO_LIMB_T(0x5c57fd95bfafbdbb), TO_LIMB_T(0x28a359a65e541707), - TO_LIMB_T(0x3983ceb4f6360b6d), TO_LIMB_T(0xafe19ff6f97e6d53), - TO_LIMB_T(0xb3468f4550192bf7), TO_LIMB_T(0x0bb6cde49d8ba257) }, - { TO_LIMB_T(0x590b62c7ff8a513f), TO_LIMB_T(0x314b4ce372cacefd), - TO_LIMB_T(0x6bef32ce94b8a800), TO_LIMB_T(0x6ddf84a095713d5f), - TO_LIMB_T(0x64eace4cb0982191), TO_LIMB_T(0x0386213c651b888d) }, - { TO_LIMB_T(0xa5310a31111bbcdd), TO_LIMB_T(0xa14ac0f5da148982), - TO_LIMB_T(0xf9ad9cc95423d2e9), TO_LIMB_T(0xaa6ec095283ee4a7), - TO_LIMB_T(0xcf5b1f022e1c9107), TO_LIMB_T(0x01fddf5aed881793) }, - { TO_LIMB_T(0x65a572b0d7a7d950), TO_LIMB_T(0xe25c2d8183473a19), - TO_LIMB_T(0xc2fcebe7cb877dbd), TO_LIMB_T(0x05b2d36c769a89b0), - TO_LIMB_T(0xba12961be86e9efb), TO_LIMB_T(0x07eb1b29c1dfde1f) }, - { TO_LIMB_T(0x93e09572f7c4cd24), TO_LIMB_T(0x364e929076795091), - TO_LIMB_T(0x8569467e68af51b5), TO_LIMB_T(0xa47da89439f5340f), - TO_LIMB_T(0xf4fa918082e44d64), TO_LIMB_T(0x0ad52ba3e6695a79) }, - { TO_LIMB_T(0x911429844e0d5f54), TO_LIMB_T(0xd03f51a3516bb233), - TO_LIMB_T(0x3d587e5640536e66), TO_LIMB_T(0xfa86d2a3a9a73482), - TO_LIMB_T(0xa90ed5adf1ed5537), TO_LIMB_T(0x149c9c326a5e7393) }, - { TO_LIMB_T(0x462bbeb03c12921a), TO_LIMB_T(0xdc9af5fa0a274a17), - TO_LIMB_T(0x9a558ebde836ebed), TO_LIMB_T(0x649ef8f11a4fae46), - TO_LIMB_T(0x8100e1652b3cdc62), TO_LIMB_T(0x1862bd62c291dacb) }, - { TO_LIMB_T(0x05c9b8ca89f12c26), TO_LIMB_T(0x0194160fa9b9ac4f), - TO_LIMB_T(0x6a643d5a6879fa2c), TO_LIMB_T(0x14665bdd8846e19d), - TO_LIMB_T(0xbb1d0d53af3ff6bf), TO_LIMB_T(0x12c7e1c3b28962e5) }, - { TO_LIMB_T(0xb55ebf900b8a3e17), TO_LIMB_T(0xfedc77ec1a9201c4), - TO_LIMB_T(0x1f07db10ea1a4df4), TO_LIMB_T(0x0dfbd15dc41a594d), - TO_LIMB_T(0x389547f2334a5391), TO_LIMB_T(0x02419f98165871a4) }, - { TO_LIMB_T(0xb416af000745fc20), TO_LIMB_T(0x8e563e9d1ea6d0f5), - TO_LIMB_T(0x7c763e17763a0652), TO_LIMB_T(0x01458ef0159ebbef), - TO_LIMB_T(0x8346fe421f96bb13), TO_LIMB_T(0x0d2d7b829ce324d2) }, - { TO_LIMB_T(0x93096bb538d64615), TO_LIMB_T(0x6f2a2619951d823a), - TO_LIMB_T(0x8f66b3ea59514fa4), TO_LIMB_T(0xf563e63704f7092f), - TO_LIMB_T(0x724b136c4cf2d9fa), TO_LIMB_T(0x046959cfcfd0bf49) }, - { TO_LIMB_T(0xea748d4b6e405346), TO_LIMB_T(0x91e9079c2c02d58f), - TO_LIMB_T(0x41064965946d9b59), TO_LIMB_T(0xa06731f1d2bbe1ee), - TO_LIMB_T(0x07f897e267a33f1b), TO_LIMB_T(0x1017290919210e5f) }, - { TO_LIMB_T(0x872aa6c17d985097), TO_LIMB_T(0xeecc53161264562a), - TO_LIMB_T(0x07afe37afff55002), TO_LIMB_T(0x54759078e5be6838), - TO_LIMB_T(0xc4b92d15db8acca8), TO_LIMB_T(0x106d87d1b51d13b9) } - }; - /* ... - * y_den = x'^15 + k_(4,14) * x'^14 + k_(4,13) * x'^13 + ... 
+ k_(4,0) - */ - static const vec384 isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ - { TO_LIMB_T(0xeb6c359d47e52b1c), TO_LIMB_T(0x18ef5f8a10634d60), - TO_LIMB_T(0xddfa71a0889d5b7e), TO_LIMB_T(0x723e71dcc5fc1323), - TO_LIMB_T(0x52f45700b70d5c69), TO_LIMB_T(0x0a8b981ee47691f1) }, - { TO_LIMB_T(0x616a3c4f5535b9fb), TO_LIMB_T(0x6f5f037395dbd911), - TO_LIMB_T(0xf25f4cc5e35c65da), TO_LIMB_T(0x3e50dffea3c62658), - TO_LIMB_T(0x6a33dca523560776), TO_LIMB_T(0x0fadeff77b6bfe3e) }, - { TO_LIMB_T(0x2be9b66df470059c), TO_LIMB_T(0x24a2c159a3d36742), - TO_LIMB_T(0x115dbe7ad10c2a37), TO_LIMB_T(0xb6634a652ee5884d), - TO_LIMB_T(0x04fe8bb2b8d81af4), TO_LIMB_T(0x01c2a7a256fe9c41) }, - { TO_LIMB_T(0xf27bf8ef3b75a386), TO_LIMB_T(0x898b367476c9073f), - TO_LIMB_T(0x24482e6b8c2f4e5f), TO_LIMB_T(0xc8e0bbd6fe110806), - TO_LIMB_T(0x59b0c17f7631448a), TO_LIMB_T(0x11037cd58b3dbfbd) }, - { TO_LIMB_T(0x31c7912ea267eec6), TO_LIMB_T(0x1dbf6f1c5fcdb700), - TO_LIMB_T(0xd30d4fe3ba86fdb1), TO_LIMB_T(0x3cae528fbee9a2a4), - TO_LIMB_T(0xb1cce69b6aa9ad9a), TO_LIMB_T(0x044393bb632d94fb) }, - { TO_LIMB_T(0xc66ef6efeeb5c7e8), TO_LIMB_T(0x9824c289dd72bb55), - TO_LIMB_T(0x71b1a4d2f119981d), TO_LIMB_T(0x104fc1aafb0919cc), - TO_LIMB_T(0x0e49df01d942a628), TO_LIMB_T(0x096c3a09773272d4) }, - { TO_LIMB_T(0x9abc11eb5fadeff4), TO_LIMB_T(0x32dca50a885728f0), - TO_LIMB_T(0xfb1fa3721569734c), TO_LIMB_T(0xc4b76271ea6506b3), - TO_LIMB_T(0xd466a75599ce728e), TO_LIMB_T(0x0c81d4645f4cb6ed) }, - { TO_LIMB_T(0x4199f10e5b8be45b), TO_LIMB_T(0xda64e495b1e87930), - TO_LIMB_T(0xcb353efe9b33e4ff), TO_LIMB_T(0x9e9efb24aa6424c6), - TO_LIMB_T(0xf08d33680a237465), TO_LIMB_T(0x0d3378023e4c7406) }, - { TO_LIMB_T(0x7eb4ae92ec74d3a5), TO_LIMB_T(0xc341b4aa9fac3497), - TO_LIMB_T(0x5be603899e907687), TO_LIMB_T(0x03bfd9cca75cbdeb), - TO_LIMB_T(0x564c2935a96bfa93), TO_LIMB_T(0x0ef3c33371e2fdb5) }, - { TO_LIMB_T(0x7ee91fd449f6ac2e), TO_LIMB_T(0xe5d5bd5cb9357a30), - TO_LIMB_T(0x773a8ca5196b1380), TO_LIMB_T(0xd0fda172174ed023), - TO_LIMB_T(0x6cb95e0fa776aead), TO_LIMB_T(0x0d22d5a40cec7cff) }, - { TO_LIMB_T(0xf727e09285fd8519), TO_LIMB_T(0xdc9d55a83017897b), - TO_LIMB_T(0x7549d8bd057894ae), TO_LIMB_T(0x178419613d90d8f8), - TO_LIMB_T(0xfce95ebdeb5b490a), TO_LIMB_T(0x0467ffaef23fc49e) }, - { TO_LIMB_T(0xc1769e6a7c385f1b), TO_LIMB_T(0x79bc930deac01c03), - TO_LIMB_T(0x5461c75a23ede3b5), TO_LIMB_T(0x6e20829e5c230c45), - TO_LIMB_T(0x828e0f1e772a53cd), TO_LIMB_T(0x116aefa749127bff) }, - { TO_LIMB_T(0x101c10bf2744c10a), TO_LIMB_T(0xbbf18d053a6a3154), - TO_LIMB_T(0xa0ecf39ef026f602), TO_LIMB_T(0xfc009d4996dc5153), - TO_LIMB_T(0xb9000209d5bd08d3), TO_LIMB_T(0x189e5fe4470cd73c) }, - { TO_LIMB_T(0x7ebd546ca1575ed2), TO_LIMB_T(0xe47d5a981d081b55), - TO_LIMB_T(0x57b2b625b6d4ca21), TO_LIMB_T(0xb0a1ba04228520cc), - TO_LIMB_T(0x98738983c2107ff3), TO_LIMB_T(0x13dddbc4799d81d6) }, - { TO_LIMB_T(0x09319f2e39834935), TO_LIMB_T(0x039e952cbdb05c21), - TO_LIMB_T(0x55ba77a9a2f76493), TO_LIMB_T(0xfd04e3dfc6086467), - TO_LIMB_T(0xfb95832e7d78742e), TO_LIMB_T(0x0ef9c24eccaf5e0e) } - }; - vec384 Zz_powers[15], map[15], xn, xd, yn, yd; - - /* lay down Z^2 powers in descending order */ - sqr_fp(Zz_powers[14], p->Z); /* ZZ^1 */ -#ifdef __OPTIMIZE_SIZE__ - for (size_t i = 14; i > 0; i--) - mul_fp(Zz_powers[i-1], Zz_powers[i], Zz_powers[14]); -#else - sqr_fp(Zz_powers[13], Zz_powers[14]); /* ZZ^2 1+1 */ - mul_fp(Zz_powers[12], Zz_powers[14], Zz_powers[13]);/* ZZ^3 2+1 */ - sqr_fp(Zz_powers[11], Zz_powers[13]); /* ZZ^4 2+2 */ - mul_fp(Zz_powers[10], Zz_powers[13], Zz_powers[12]);/* ZZ^5 2+3 */ - 
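
These squarings and multiplications form an addition chain producing ZZ^1 through ZZ^15 for the Jacobian input (x', y') = (X/Z^2, Y/Z^3): map_fp_times_Zz scales each coefficient k_i of the degree-11..15 isogeny polynomials by ZZ^(n-i), after which map_fp is a plain Horner loop in X, yielding ZZ^n * poly(X/ZZ) with no field inversion. A toy big.Int model of that pattern (evalScaledPoly is an illustrative name; the real code works on fixed-width vec384 limbs):

```go
package main

import (
	"fmt"
	"math/big"
)

// evalScaledPoly returns ZZ^n * (k[n]*x^n + ... + k[0]) evaluated at
// x = X/ZZ, i.e. sum_i k[i] * X^i * ZZ^(n-i) mod p -- all without ever
// dividing by ZZ. Mirrors map_fp_times_Zz followed by map_fp.
func evalScaledPoly(k []*big.Int, X, ZZ, p *big.Int) *big.Int {
	n := len(k) - 1

	// scaled[i] = k[i] * ZZ^(n-i)   (the map_fp_times_Zz step)
	scaled := make([]*big.Int, len(k))
	zz := big.NewInt(1)
	for i := n; i >= 0; i-- {
		scaled[i] = new(big.Int).Mod(new(big.Int).Mul(k[i], zz), p)
		zz.Mod(zz.Mul(zz, ZZ), p)
	}

	// Horner evaluation in X          (the map_fp step)
	acc := new(big.Int).Set(scaled[n])
	for i := n - 1; i >= 0; i-- {
		acc.Mul(acc, X).Add(acc, scaled[i]).Mod(acc, p)
	}
	return acc
}

func main() {
	p := big.NewInt(101) // toy field instead of the 381-bit modulus
	k := []*big.Int{big.NewInt(7), big.NewInt(3), big.NewInt(1)} // x^2+3x+7
	X, ZZ := big.NewInt(10), big.NewInt(4)
	// expect X^2 + 3*X*ZZ + 7*ZZ^2 = (100 + 120 + 112) mod 101 = 29
	fmt.Println(evalScaledPoly(k, X, ZZ, p)) // 29
}
```

This is why isogeny_map_to_E1 never leaves Jacobian coordinates: x' = X/Z^2 is never formed explicitly; the scaled numerators and denominators are recombined into the output X, Y and Z at the end of the function.
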
sqr_fp(Zz_powers[9], Zz_powers[12]); /* ZZ^6 3+3 */ - mul_fp(Zz_powers[8], Zz_powers[12], Zz_powers[11]);/* ZZ^7 3+4 */ - sqr_fp(Zz_powers[7], Zz_powers[11]); /* ZZ^8 4+4 */ - mul_fp(Zz_powers[6], Zz_powers[11], Zz_powers[10]);/* ZZ^9 4+5 */ - sqr_fp(Zz_powers[5], Zz_powers[10]); /* ZZ^10 5+5 */ - mul_fp(Zz_powers[4], Zz_powers[10], Zz_powers[9]); /* ZZ^11 5+6 */ - sqr_fp(Zz_powers[3], Zz_powers[9]); /* ZZ^12 6+6 */ - mul_fp(Zz_powers[2], Zz_powers[9], Zz_powers[8]); /* ZZ^13 6+7 */ - sqr_fp(Zz_powers[1], Zz_powers[8]); /* ZZ^14 7+7 */ - mul_fp(Zz_powers[0], Zz_powers[8], Zz_powers[7]); /* ZZ^15 7+8 */ -#endif - - map_fp_times_Zz(map, isogeny_map_x_num, Zz_powers + 4, 11); - mul_fp(xn, p->X, isogeny_map_x_num[11]); - add_fp(xn, xn, map[10]); - map_fp(xn, p->X, map, 10); - - map_fp_times_Zz(map, isogeny_map_x_den, Zz_powers + 5, 10); - add_fp(xd, p->X, map[9]); - map_fp(xd, p->X, map, 9); - mul_fp(xd, xd, Zz_powers[14]); /* xd *= Z^2 */ - - map_fp_times_Zz(map, isogeny_map_y_num, Zz_powers, 15); - mul_fp(yn, p->X, isogeny_map_y_num[15]); - add_fp(yn, yn, map[14]); - map_fp(yn, p->X, map, 14); - mul_fp(yn, yn, p->Y); /* yn *= Y */ - - map_fp_times_Zz(map, isogeny_map_y_den, Zz_powers, 15); - add_fp(yd, p->X, map[14]); - map_fp(yd, p->X, map, 14); - mul_fp(Zz_powers[14], Zz_powers[14], p->Z); - mul_fp(yd, yd, Zz_powers[14]); /* yd *= Z^3 */ - - /* convert (xn, xd, yn, yd) to Jacobian coordinates */ - mul_fp(out->Z, xd, yd); /* Z = xd * yd */ - mul_fp(out->X, xn, yd); - mul_fp(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ - sqr_fp(out->Y, out->Z); - mul_fp(out->Y, out->Y, xd); - mul_fp(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ -} - -static void map_to_isogenous_E1(POINTonE1 *p, const vec384 u) -{ - static const vec384 minus_A = { /* P - A */ - TO_LIMB_T(0x8a9955f1650a005a), TO_LIMB_T(0x9865b3d192cfe93c), - TO_LIMB_T(0xaed3ed0f3ef3c441), TO_LIMB_T(0x3c962ef33d92c442), - TO_LIMB_T(0x22e438dbd74f94a2), TO_LIMB_T(0x04acbc265478c915) - }; - static const vec384 Z = { /* (11<<384) % P */ - TO_LIMB_T(0x886c00000023ffdc), TO_LIMB_T(0x0f70008d3090001d), - TO_LIMB_T(0x77672417ed5828c3), TO_LIMB_T(0x9dac23e943dc1740), - TO_LIMB_T(0x50553f1b9c131521), TO_LIMB_T(0x078c712fbe0ab6e8) - }; - static const vec384 sqrt_minus_ZZZ = { - TO_LIMB_T(0x43b571cad3215f1f), TO_LIMB_T(0xccb460ef1c702dc2), - TO_LIMB_T(0x742d884f4f97100b), TO_LIMB_T(0xdb2c3e3238a3382b), - TO_LIMB_T(0xe40f3fa13fce8f88), TO_LIMB_T(0x0073a2af9892a2ff) - }; - static const vec384 ZxA = { - TO_LIMB_T(0x7f674ea0a8915178), TO_LIMB_T(0xb0f945fc13b8fa65), - TO_LIMB_T(0x4b46759a38e87d76), TO_LIMB_T(0x2e7a929641bbb6a1), - TO_LIMB_T(0x1668ddfa462bf6b6), TO_LIMB_T(0x00960e2ed1cf294c) - }; - vec384 uu, tv2, x2n, gx1, gxd, y2; -#if 0 - vec384 xn, x1n, xd, y, y1, Zuu, tv4; -#else -# define xn p->X -# define y p->Y -# define xd p->Z -# define x1n xn -# define y1 y -# define Zuu x2n -# define tv4 y1 -#endif -#define sgn0_fp(a) (sgn0_pty_mont_384((a), BLS12_381_P, p0) & 1) - bool_t e1, e2; - - /* - * as per map_to_curve() from poc/sswu_opt.sage at - * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve - */ - /* x numerator variants */ - sqr_fp(uu, u); /* uu = u^2 */ - mul_fp(Zuu, Z, uu); /* Zuu = Z * uu */ - sqr_fp(tv2, Zuu); /* tv2 = Zuu^2 */ - add_fp(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ - add_fp(x1n, tv2, BLS12_381_Rx.p); /* x1n = tv2 + 1 */ - mul_fp(x1n, x1n, Bprime_E1); /* x1n = x1n * B */ - mul_fp(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ - - /* x denumenator */ - mul_fp(xd, minus_A, tv2); /* xd = -A * tv2 */ - e1 = vec_is_zero(xd, sizeof(xd)); 
/* e1 = xd == 0 */ - vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ - - /* y numerators variants */ - sqr_fp(tv2, xd); /* tv2 = xd^2 */ - mul_fp(gxd, xd, tv2); /* gxd = xd^3 */ - mul_fp(tv2, Aprime_E1, tv2); /* tv2 = A * tv2 */ - sqr_fp(gx1, x1n); /* gx1 = x1n^2 */ - add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ - mul_fp(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ - mul_fp(tv2, Bprime_E1, gxd); /* tv2 = B * gxd */ - add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ - sqr_fp(tv4, gxd); /* tv4 = gxd^2 */ - mul_fp(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ - mul_fp(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ - e2 = recip_sqrt_fp(y1, tv4); /* y1 = tv4^c1 # (gx1*gxd^3)^((p-3)/4) */ - mul_fp(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ - mul_fp(y2, y1, sqrt_minus_ZZZ); /* y2 = y1 * c2 # y2 = y1*sqrt(-Z^3) */ - mul_fp(y2, y2, uu); /* y2 = y2 * uu */ - mul_fp(y2, y2, u); /* y2 = y2 * u */ - - /* choose numerators */ - vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ - vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? y1 : y2 */ - - e1 = sgn0_fp(u); - e2 = sgn0_fp(y); - cneg_fp(y, y, e1^e2); /* fix sign of y */ - /* return (xn, xd, y, 1) */ - - /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ - mul_fp(p->X, xn, xd); /* X = xn * xd */ - mul_fp(p->Y, y, gxd); /* Y = y * xd^3 */ -#ifndef xd - vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ -#else -# undef xn -# undef y -# undef xd -# undef x1n -# undef y1 -# undef Zuu -# undef tv4 -#endif -#undef sgn0_fp -} - -static void POINTonE1_add_n_dbl(POINTonE1 *out, const POINTonE1 *p, size_t n) -{ - POINTonE1_dadd(out, out, p, NULL); - while(n--) - POINTonE1_double(out, out); -} - -static void POINTonE1_times_minus_z(POINTonE1 *out, const POINTonE1 *in) -{ - POINTonE1_double(out, in); /* 1: 0x2 */ - POINTonE1_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ - POINTonE1_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ - POINTonE1_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ - POINTonE1_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ - POINTonE1_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ -} - -/* - * |u|, |v| are expected to be in Montgomery representation - */ -static void map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) -{ - POINTonE1 p; - - map_to_isogenous_E1(&p, u); - - if (v != NULL) { - map_to_isogenous_E1(out, v); /* borrow |out| */ - POINTonE1_dadd(&p, &p, out, Aprime_E1); - } - - isogeny_map_to_E1(&p, &p); /* sprinkle isogenous powder */ - - /* clear the cofactor by multiplying |p| by 1-z, 0xd201000000010001 */ - POINTonE1_times_minus_z(out, &p); - POINTonE1_dadd(out, out, &p, NULL); -} - -void blst_map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) -{ map_to_g1(out, u, v); } - -static void Encode_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, - const unsigned char *DST, size_t DST_len, - const unsigned char *aug, size_t aug_len) -{ - vec384 u[1]; - - hash_to_field(u, 1, aug, aug_len, msg, msg_len, DST, DST_len); - map_to_g1(p, u[0], NULL); -} - -void blst_encode_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, - const unsigned char *DST, size_t DST_len, - const unsigned char *aug, size_t aug_len) -{ Encode_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } - -static void Hash_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, - const unsigned char *DST, size_t DST_len, - const unsigned char *aug, size_t aug_len) -{ - vec384 u[2]; - - hash_to_field(u, 2, aug, 
aug_len, msg, msg_len, DST, DST_len); - map_to_g1(p, u[0], u[1]); -} - -void blst_hash_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, - const unsigned char *DST, size_t DST_len, - const unsigned char *aug, size_t aug_len) -{ Hash_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } - -static void sigma(POINTonE1 *out, const POINTonE1 *in); - -#if 0 -#ifdef __OPTIMIZE_SIZE__ -static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, - const POINTonE1 *in) -{ - static const byte zz_minus_1_div_by_3[] = { - TO_BYTES(0x0000000055555555ULL), TO_BYTES(0x396c8c005555e156) - }; - size_t n = 126-1; - const POINTonE1 *dblin = in; - - while(n--) { - POINTonE1_double(out, dblin); dblin = out; - if (is_bit_set(zz_minus_1_div_by_3, n)) - POINTonE1_dadd(out, out, in, NULL); - } -} -#else -static void POINTonE1_dbl_n_add(POINTonE1 *out, size_t n, const POINTonE1 *p) -{ - while(n--) - POINTonE1_double(out, out); - POINTonE1_dadd(out, out, p, NULL); -} - -static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, - const POINTonE1 *in) -{ - POINTonE1 t3, t5, t7, t11, t85; - - POINTonE1_double(&t7, in); /* 2P */ - POINTonE1_dadd(&t3, &t7, in, NULL); /* 3P */ - POINTonE1_dadd(&t5, &t3, &t7, NULL); /* 5P */ - POINTonE1_dadd(&t7, &t5, &t7, NULL); /* 7P */ - POINTonE1_double(&t85, &t5); /* 10P */ - POINTonE1_dadd(&t11, &t85, in, NULL); /* 11P */ - POINTonE1_dbl_n_add(&t85, 3, &t5); /* 0x55P */ - /* (-0xd201000000010000^2 - 1) / 3 */ - POINTonE1_double(out, &t7); /* 0xe */ - POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb */ - POINTonE1_dbl_n_add(out, 3, &t3); /* 0xe5b */ - POINTonE1_dbl_n_add(out, 3, in); /* 0x72d9 */ - POINTonE1_dbl_n_add(out, 5, &t3); /* 0xe5b23 */ - POINTonE1_dbl_n_add(out, 18, &t85); /* 0x396c8c0055 */ - POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555 */ - POINTonE1_dbl_n_add(out, 3, &t7); /* 0x1cb646002aaaf */ - POINTonE1_dbl_n_add(out, 7, &t5); /* 0xe5b23001555785 */ - POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb646002aaaf0ab */ - POINTonE1_dbl_n_add(out, 41, &t85); /* 0x396c8c005555e1560000000055 */ - POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e156000000005555 */ - POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e15600000000555555 */ - POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e1560000000055555555 */ -} -#endif - -static bool_t POINTonE1_in_G1(const POINTonE1 *P) -{ - POINTonE1 t0, t1, t2; - - /* Bowe, S., "Faster subgroup checks for BLS12-381" */ - sigma(&t0, P); /* σ(P) */ - sigma(&t1, &t0); /* σ²(P) */ - - POINTonE1_double(&t0, &t0); /* 2σ(P) */ - POINTonE1_dadd(&t2, &t1, P, NULL); /* P + σ²(P) */ - POINTonE1_cneg(&t2, 1); /* - P - σ²(P) */ - POINTonE1_dadd(&t2, &t2, &t0, NULL); /* 2σ(P) - P - σ²(P) */ - POINTonE1_times_zz_minus_1_div_by_3( &t0, &t2); - POINTonE1_cneg(&t1, 1); - POINTonE1_dadd(&t0, &t0, &t1, NULL); /* [(z²-1)/3](2σ(P) - P - σ²(P)) */ - /* - σ²(P) */ - return vec_is_zero(t0.Z, sizeof(t0.Z)); -} -#else -static bool_t POINTonE1_in_G1(const POINTonE1 *P) -{ - POINTonE1 t0, t1; - - /* Scott, M., https://eprint.iacr.org/2021/1130 */ - POINTonE1_times_minus_z(&t0, P); - POINTonE1_times_minus_z(&t1, &t0); - POINTonE1_cneg(&t1, 1); /* [-z²]P */ - - sigma(&t0, P); /* σ(P) */ - sigma(&t0, &t0); /* σ²(P) */ - - return POINTonE1_is_equal(&t0, &t1); -} -#endif - -int blst_p1_in_g1(const POINTonE1 *p) -{ return (int)POINTonE1_in_G1(p); } - -int blst_p1_affine_in_g1(const POINTonE1_affine *p) -{ - POINTonE1 P; - - vec_copy(P.X, p->X, 2*sizeof(P.X)); - vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z), - vec_is_zero(p, sizeof(*p))); - - 
return (int)POINTonE1_in_G1(&P); -} diff --git a/crypto/blst_src/map_to_g2.c b/crypto/blst_src/map_to_g2.c deleted file mode 100644 index 90fd86e9d31..00000000000 --- a/crypto/blst_src/map_to_g2.c +++ /dev/null @@ -1,444 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "point.h" -#include "fields.h" - -/* - * y^2 = x^3 + A'*x + B', isogenous one - */ -static const vec384x Aprime_E2 = { /* 240*i */ - { 0 }, - { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), - TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), - TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) } -}; -static const vec384x Bprime_E2 = { /* 1012 + 1012*i */ - { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), - TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), - TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) }, - { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), - TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), - TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) } -}; - -static void map_fp2_times_Zz(vec384x map[], const vec384x isogeny_map[], - const vec384x Zz_powers[], size_t n) -{ - while (n--) - mul_fp2(map[n], isogeny_map[n], Zz_powers[n]); -} - -static void map_fp2(vec384x acc, const vec384x x, const vec384x map[], size_t n) -{ - while (n--) { - mul_fp2(acc, acc, x); - add_fp2(acc, acc, map[n]); - } -} - -static void isogeny_map_to_E2(POINTonE2 *out, const POINTonE2 *p) -{ - /* - * x = x_num / x_den, where - * x_num = k_(1,3) * x'^3 + k_(1,2) * x'^2 + k_(1,1) * x' + k_(1,0) - * ... - */ - static const vec384x isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ - {{ TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), - TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), - TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }, - { TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), - TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), - TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }}, - {{ 0 }, - { TO_LIMB_T(0x5fe55555554c71d0), TO_LIMB_T(0x873fffdd236aaaa3), - TO_LIMB_T(0x6a6b4619b26ef918), TO_LIMB_T(0x21c2888408874945), - TO_LIMB_T(0x2836cda7028cabc5), TO_LIMB_T(0x0ac73310a7fd5abd) }}, - {{ TO_LIMB_T(0x0a0c5555555971c3), TO_LIMB_T(0xdb0c00101f9eaaae), - TO_LIMB_T(0xb1fb2f941d797997), TO_LIMB_T(0xd3960742ef416e1c), - TO_LIMB_T(0xb70040e2c20556f4), TO_LIMB_T(0x149d7861e581393b) }, - { TO_LIMB_T(0xaff2aaaaaaa638e8), TO_LIMB_T(0x439fffee91b55551), - TO_LIMB_T(0xb535a30cd9377c8c), TO_LIMB_T(0x90e144420443a4a2), - TO_LIMB_T(0x941b66d3814655e2), TO_LIMB_T(0x0563998853fead5e) }}, - {{ TO_LIMB_T(0x40aac71c71c725ed), TO_LIMB_T(0x190955557a84e38e), - TO_LIMB_T(0xd817050a8f41abc3), TO_LIMB_T(0xd86485d4c87f6fb1), - TO_LIMB_T(0x696eb479f885d059), TO_LIMB_T(0x198e1a74328002d2) }, - { 0 }} - }; - /* ... 
- * x_den = x'^2 + k_(2,1) * x' + k_(2,0) - */ - static const vec384x isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ - {{ 0 }, - { TO_LIMB_T(0x1f3affffff13ab97), TO_LIMB_T(0xf25bfc611da3ff3e), - TO_LIMB_T(0xca3757cb3819b208), TO_LIMB_T(0x3e6427366f8cec18), - TO_LIMB_T(0x03977bc86095b089), TO_LIMB_T(0x04f69db13f39a952) }}, - {{ TO_LIMB_T(0x447600000027552e), TO_LIMB_T(0xdcb8009a43480020), - TO_LIMB_T(0x6f7ee9ce4a6e8b59), TO_LIMB_T(0xb10330b7c0a95bc6), - TO_LIMB_T(0x6140b1fcfb1e54b7), TO_LIMB_T(0x0381be097f0bb4e1) }, - { TO_LIMB_T(0x7588ffffffd8557d), TO_LIMB_T(0x41f3ff646e0bffdf), - TO_LIMB_T(0xf7b1e8d2ac426aca), TO_LIMB_T(0xb3741acd32dbb6f8), - TO_LIMB_T(0xe9daf5b9482d581f), TO_LIMB_T(0x167f53e0ba7431b8) }} - }; - /* - * y = y' * y_num / y_den, where - * y_num = k_(3,3) * x'^3 + k_(3,2) * x'^2 + k_(3,1) * x' + k_(3,0) - * ... - */ - static const vec384x isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ - {{ TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), - TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), - TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }, - { TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), - TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), - TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }}, - {{ 0 }, - { TO_LIMB_T(0xbf0a71c71c91b406), TO_LIMB_T(0x4d6d55d28b7638fd), - TO_LIMB_T(0x9d82f98e5f205aee), TO_LIMB_T(0xa27aa27b1d1a18d5), - TO_LIMB_T(0x02c3b2b2d2938e86), TO_LIMB_T(0x0c7d13420b09807f) }}, - {{ TO_LIMB_T(0xd7f9555555531c74), TO_LIMB_T(0x21cffff748daaaa8), - TO_LIMB_T(0x5a9ad1866c9bbe46), TO_LIMB_T(0x4870a2210221d251), - TO_LIMB_T(0x4a0db369c0a32af1), TO_LIMB_T(0x02b1ccc429ff56af) }, - { TO_LIMB_T(0xe205aaaaaaac8e37), TO_LIMB_T(0xfcdc000768795556), - TO_LIMB_T(0x0c96011a8a1537dd), TO_LIMB_T(0x1c06a963f163406e), - TO_LIMB_T(0x010df44c82a881e6), TO_LIMB_T(0x174f45260f808feb) }}, - {{ TO_LIMB_T(0xa470bda12f67f35c), TO_LIMB_T(0xc0fe38e23327b425), - TO_LIMB_T(0xc9d3d0f2c6f0678d), TO_LIMB_T(0x1c55c9935b5a982e), - TO_LIMB_T(0x27f6c0e2f0746764), TO_LIMB_T(0x117c5e6e28aa9054) }, - { 0 }} - }; - /* ... 
- * y_den = x'^3 + k_(4,2) * x'^2 + k_(4,1) * x' + k_(4,0) - */ - static const vec384x isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ - {{ TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), - TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), - TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }, - { TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), - TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), - TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }}, - {{ 0 }, - { TO_LIMB_T(0x5db0fffffd3b02c5), TO_LIMB_T(0xd713f52358ebfdba), - TO_LIMB_T(0x5ea60761a84d161a), TO_LIMB_T(0xbb2c75a34ea6c44a), - TO_LIMB_T(0x0ac6735921c1119b), TO_LIMB_T(0x0ee3d913bdacfbf6) }}, - {{ TO_LIMB_T(0x66b10000003affc5), TO_LIMB_T(0xcb1400e764ec0030), - TO_LIMB_T(0xa73e5eb56fa5d106), TO_LIMB_T(0x8984c913a0fe09a9), - TO_LIMB_T(0x11e10afb78ad7f13), TO_LIMB_T(0x05429d0e3e918f52) }, - { TO_LIMB_T(0x534dffffffc4aae6), TO_LIMB_T(0x5397ff174c67ffcf), - TO_LIMB_T(0xbff273eb870b251d), TO_LIMB_T(0xdaf2827152870915), - TO_LIMB_T(0x393a9cbaca9e2dc3), TO_LIMB_T(0x14be74dbfaee5748) }} - }; - vec384x Zz_powers[3], map[3], xn, xd, yn, yd; - - /* lay down Z^2 powers in descending order */ - sqr_fp2(Zz_powers[2], p->Z); /* ZZ^1 */ - sqr_fp2(Zz_powers[1], Zz_powers[2]); /* ZZ^2 1+1 */ - mul_fp2(Zz_powers[0], Zz_powers[2], Zz_powers[1]); /* ZZ^3 2+1 */ - - map_fp2_times_Zz(map, isogeny_map_x_num, Zz_powers, 3); - mul_fp2(xn, p->X, isogeny_map_x_num[3]); - add_fp2(xn, xn, map[2]); - map_fp2(xn, p->X, map, 2); - - map_fp2_times_Zz(map, isogeny_map_x_den, Zz_powers + 1, 2); - add_fp2(xd, p->X, map[1]); - map_fp2(xd, p->X, map, 1); - mul_fp2(xd, xd, Zz_powers[2]); /* xd *= Z^2 */ - - map_fp2_times_Zz(map, isogeny_map_y_num, Zz_powers, 3); - mul_fp2(yn, p->X, isogeny_map_y_num[3]); - add_fp2(yn, yn, map[2]); - map_fp2(yn, p->X, map, 2); - mul_fp2(yn, yn, p->Y); /* yn *= Y */ - - map_fp2_times_Zz(map, isogeny_map_y_den, Zz_powers, 3); - add_fp2(yd, p->X, map[2]); - map_fp2(yd, p->X, map, 2); - mul_fp2(Zz_powers[2], Zz_powers[2], p->Z); - mul_fp2(yd, yd, Zz_powers[2]); /* yd *= Z^3 */ - - /* convert (xn, xd, yn, yd) to Jacobian coordinates */ - mul_fp2(out->Z, xd, yd); /* Z = xd * yd */ - mul_fp2(out->X, xn, yd); - mul_fp2(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ - sqr_fp2(out->Y, out->Z); - mul_fp2(out->Y, out->Y, xd); - mul_fp2(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ -} - -static void map_to_isogenous_E2(POINTonE2 *p, const vec384x u) -{ - static const vec384x minus_A = { - { 0 }, - { TO_LIMB_T(0xd4c4fffffcec5869), TO_LIMB_T(0x1da3f3eed25bfd79), - TO_LIMB_T(0x7fa833c5136fff67), TO_LIMB_T(0x59261433cd540cbd), - TO_LIMB_T(0x48450f5f2b84682c), TO_LIMB_T(0x07e05d00bf959233) } - }; - static const vec384x Z = { /* -2 - i */ - { TO_LIMB_T(0x87ebfffffff9555c), TO_LIMB_T(0x656fffe5da8ffffa), - TO_LIMB_T(0x0fd0749345d33ad2), TO_LIMB_T(0xd951e663066576f4), - TO_LIMB_T(0xde291a3d41e980d3), TO_LIMB_T(0x0815664c7dfe040d) }, - { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), - TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), - TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } - }; - static const vec384x recip_ZZZ = { /* 1/(Z^3) */ - { TO_LIMB_T(0x65018f5c28f598eb), TO_LIMB_T(0xe6020417f022d916), - TO_LIMB_T(0xd6327313288369c7), TO_LIMB_T(0x622ded8eb447156f), - TO_LIMB_T(0xe52a2aee72c2a01f), TO_LIMB_T(0x089812fb8481ffe4) }, - { TO_LIMB_T(0x2574eb851eb8619f), TO_LIMB_T(0xdba2e97912925604), - TO_LIMB_T(0x67e495a909e7a18e), 
TO_LIMB_T(0xdf2da23b8145b8f7), - TO_LIMB_T(0xcf5d3728310ebf6d), TO_LIMB_T(0x11be446236f4c116) } - }; - static const vec384x magic_ZZZ = { /* 1/Z^3 = a + b*i */ - /* a^2 + b^2 */ - { TO_LIMB_T(0xaa7eb851eb8508e0), TO_LIMB_T(0x1c54fdf360989374), - TO_LIMB_T(0xc87f2fc6e716c62e), TO_LIMB_T(0x0124aefb1f9efea7), - TO_LIMB_T(0xb2f8be63e844865c), TO_LIMB_T(0x08b47f775a7ef35a) }, - /* (a^2 + b^2)^((P-3)/4) */ - { TO_LIMB_T(0xe4132bbd838cf70a), TO_LIMB_T(0x01d769ac83772c19), - TO_LIMB_T(0xa83dd6e974c22e45), TO_LIMB_T(0xbc8ec3e777b08dff), - TO_LIMB_T(0xc035c2042ecf5da3), TO_LIMB_T(0x073929e97f0850bf) } - }; - static const vec384x ZxA = { /* 240 - 480*i */ - { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), - TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), - TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) }, - { TO_LIMB_T(0xa989fffff9d8b0d2), TO_LIMB_T(0x3b47e7dda4b7faf3), - TO_LIMB_T(0xff50678a26dffece), TO_LIMB_T(0xb24c28679aa8197a), - TO_LIMB_T(0x908a1ebe5708d058), TO_LIMB_T(0x0fc0ba017f2b2466) } - }; - vec384x uu, tv2, tv4, x2n, gx1, gxd, y2; -#if 0 - vec384x xn, x1n, xd, y, y1, Zuu; -#else -# define xn p->X -# define y p->Y -# define xd p->Z -# define x1n xn -# define y1 y -# define Zuu x2n -#endif -#define sgn0_fp2(a) (sgn0_pty_mont_384x((a), BLS12_381_P, p0) & 1) - bool_t e1, e2; - - /* - * as per map_to_curve() from poc/sswu_opt.sage at - * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve - * with 9mod16 twists... - */ - /* x numerator variants */ - sqr_fp2(uu, u); /* uu = u^2 */ - mul_fp2(Zuu, Z, uu); /* Zuu = Z * uu */ - sqr_fp2(tv2, Zuu); /* tv2 = Zuu^2 */ - add_fp2(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ - add_fp2(x1n, tv2, BLS12_381_Rx.p2); /* x1n = tv2 + 1 */ - mul_fp2(x1n, x1n, Bprime_E2); /* x1n = x1n * B */ - mul_fp2(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ - - /* x denumenator */ - mul_fp2(xd, minus_A, tv2); /* xd = -A * tv2 */ - e1 = vec_is_zero(xd, sizeof(xd)); /* e1 = xd == 0 */ - vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ - - /* y numerators variants */ - sqr_fp2(tv2, xd); /* tv2 = xd^2 */ - mul_fp2(gxd, xd, tv2); /* gxd = xd^3 */ - mul_fp2(tv2, Aprime_E2, tv2); /* tv2 = A * tv2 */ - sqr_fp2(gx1, x1n); /* gx1 = x1n^2 */ - add_fp2(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ - mul_fp2(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ - mul_fp2(tv2, Bprime_E2, gxd); /* tv2 = B * gxd */ - add_fp2(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ - sqr_fp2(tv4, gxd); /* tv4 = gxd^2 */ - mul_fp2(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ - mul_fp2(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ - e2 = recip_sqrt_fp2(y1, tv4, /* y1 = tv4^c1 # (gx1*gxd^3)^((p^2-9)/16) */ - recip_ZZZ, magic_ZZZ); - mul_fp2(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ - mul_fp2(y2, y1, uu); /* y2 = y1 * uu */ - mul_fp2(y2, y2, u); /* y2 = y2 * u */ - - /* choose numerators */ - vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ - vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? 
y1 : y2 */ - - e1 = sgn0_fp2(u); - e2 = sgn0_fp2(y); - cneg_fp2(y, y, e1^e2); /* fix sign of y */ - /* return (xn, xd, y, 1) */ - - /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ - mul_fp2(p->X, xn, xd); /* X = xn * xd */ - mul_fp2(p->Y, y, gxd); /* Y = y * xd^3 */ -#ifndef xd - vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ -#else -# undef xn -# undef y -# undef xd -# undef x1n -# undef y1 -# undef Zuu -# undef tv4 -#endif -#undef sgn0_fp2 -} - -#if 0 -static const byte h_eff[] = { - TO_BYTES(0xe8020005aaa95551), TO_BYTES(0x59894c0adebbf6b4), - TO_BYTES(0xe954cbc06689f6a3), TO_BYTES(0x2ec0ec69d7477c1a), - TO_BYTES(0x6d82bf015d1212b0), TO_BYTES(0x329c2f178731db95), - TO_BYTES(0x9986ff031508ffe1), TO_BYTES(0x88e2a8e9145ad768), - TO_BYTES(0x584c6a0ea91b3528), TO_BYTES(0x0bc69f08f2ee75b3) -}; - -static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) -{ POINTonE2_mult_w5(out, p, h_eff, 636); } -#else -/* - * As per suggestions in "7. Clearing the cofactor" at - * https://tools.ietf.org/html/draft-irtf-cfrg-hash-to-curve-06 - */ -static void POINTonE2_add_n_dbl(POINTonE2 *out, const POINTonE2 *p, size_t n) -{ - POINTonE2_dadd(out, out, p, NULL); - while(n--) - POINTonE2_double(out, out); -} - -static void POINTonE2_times_minus_z(POINTonE2 *out, const POINTonE2 *in) -{ - POINTonE2_double(out, in); /* 1: 0x2 */ - POINTonE2_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ - POINTonE2_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ - POINTonE2_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ - POINTonE2_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ - POINTonE2_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ -} - -static void psi(POINTonE2 *out, const POINTonE2 *in); - -static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) -{ - POINTonE2 t0, t1; - - /* A.Budroni, F.Pintore, "Efficient hash maps to G2 on BLS curves" */ - POINTonE2_double(out, p); /* out = 2P */ - psi(out, out); /* out = Ψ(2P) */ - psi(out, out); /* out = Ψ²(2P) */ - - vec_copy(&t0, p, sizeof(t0)); - POINTonE2_cneg(&t0, 1); /* t0 = -P */ - psi(&t1, &t0); /* t1 = -Ψ(P) */ - POINTonE2_dadd(out, out, &t0, NULL);/* out = Ψ²(2P) - P */ - POINTonE2_dadd(out, out, &t1, NULL);/* out = Ψ²(2P) - P - Ψ(P) */ - - POINTonE2_times_minus_z(&t0, p); /* t0 = [-z]P */ - POINTonE2_dadd(&t0, &t0, p, NULL); /* t0 = [-z + 1]P */ - POINTonE2_dadd(&t0, &t0, &t1, NULL);/* t0 = [-z + 1]P - Ψ(P) */ - POINTonE2_times_minus_z(&t1, &t0); /* t1 = [z² - z]P + [z]Ψ(P) */ - POINTonE2_dadd(out, out, &t1, NULL);/* out = [z² - z - 1]P */ - /* + [z - 1]Ψ(P) */ - /* + Ψ²(2P) */ -} -#endif - -/* - * |u|, |v| are expected to be in Montgomery representation - */ -static void map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) -{ - POINTonE2 p; - - map_to_isogenous_E2(&p, u); - - if (v != NULL) { - map_to_isogenous_E2(out, v); /* borrow |out| */ - POINTonE2_dadd(&p, &p, out, Aprime_E2); - } - - isogeny_map_to_E2(&p, &p); /* sprinkle isogenous powder */ - clear_cofactor(out, &p); -} - -void blst_map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) -{ map_to_g2(out, u, v); } - -static void Encode_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, - const unsigned char *DST, size_t DST_len, - const unsigned char *aug, size_t aug_len) -{ - vec384x u[1]; - - hash_to_field(u[0], 2, aug, aug_len, msg, msg_len, DST, DST_len); - map_to_g2(p, u[0], NULL); -} - -void blst_encode_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, - const unsigned char *DST, size_t DST_len, - const unsigned char *aug, size_t 
aug_len)
-{ Encode_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); }
-
-static void Hash_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len,
- const unsigned char *DST, size_t DST_len,
- const unsigned char *aug, size_t aug_len)
-{
- vec384x u[2];
-
- hash_to_field(u[0], 4, aug, aug_len, msg, msg_len, DST, DST_len);
- map_to_g2(p, u[0], u[1]);
-}
-
-void blst_hash_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len,
- const unsigned char *DST, size_t DST_len,
- const unsigned char *aug, size_t aug_len)
-{ Hash_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); }
-
-static bool_t POINTonE2_in_G2(const POINTonE2 *P)
-{
-#if 0
- POINTonE2 t0, t1, t2;
-
- /* Bowe, S., "Faster subgroup checks for BLS12-381" */
- psi(&t0, P); /* Ψ(P) */
- psi(&t0, &t0); /* Ψ²(P) */
- psi(&t1, &t0); /* Ψ³(P) */
-
- POINTonE2_times_minus_z(&t2, &t1);
- POINTonE2_dadd(&t0, &t0, &t2, NULL);
- POINTonE2_cneg(&t0, 1);
- POINTonE2_dadd(&t0, &t0, P, NULL); /* [z]Ψ³(P) - Ψ²(P) + P */
-
- return vec_is_zero(t0.Z, sizeof(t0.Z));
-#else
- POINTonE2 t0, t1;
-
- /* Scott, M., https://eprint.iacr.org/2021/1130 */
- psi(&t0, P); /* Ψ(P) */
-
- POINTonE2_times_minus_z(&t1, P);
- POINTonE2_cneg(&t1, 1); /* [z]P */
-
- return POINTonE2_is_equal(&t0, &t1);
-#endif
-}
-
-int blst_p2_in_g2(const POINTonE2 *p)
-{ return (int)POINTonE2_in_G2(p); }
-
-int blst_p2_affine_in_g2(const POINTonE2_affine *p)
-{
- POINTonE2 P;
-
- vec_copy(P.X, p->X, 2*sizeof(P.X));
- vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z),
- vec_is_zero(p, sizeof(*p)));
-
- return (int)POINTonE2_in_G2(&P);
-}
diff --git a/crypto/blst_src/multi_scalar.c b/crypto/blst_src/multi_scalar.c
deleted file mode 100644
index 55ab8227718..00000000000
--- a/crypto/blst_src/multi_scalar.c
+++ /dev/null
@@ -1,427 +0,0 @@
-/*
- * Copyright Supranational LLC
- * Licensed under the Apache License, Version 2.0, see LICENSE for details.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#include "fields.h"
-#include "point.h"
-
-/*
- * Infinite point among inputs would be devastating. Shall we change it?
- */
-#define POINTS_TO_AFFINE_IMPL(prefix, ptype, bits, field) \
-static void ptype##s_to_affine(ptype##_affine dst[], \
- const ptype *const points[], size_t npoints) \
-{ \
- size_t i; \
- vec##bits *acc, ZZ, ZZZ; \
- const ptype *point = NULL; \
- const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 1536 : 768; \
-\
- while (npoints) { \
- const ptype *p, *const *walkback; \
- size_t delta = stride<npoints ? stride : npoints; \
-\
- point = *points ? *points++ : point+1; \
- acc = (vec##bits *)dst; \
- vec_copy(acc++, point->Z, sizeof(vec##bits)); \
- for (i = 1; i < delta; i++, acc++) \
- point = *points ? *points++ : point+1, \
- mul_##field(acc[0], acc[-1], point->Z); \
-\
- --acc; reciprocal_##field(acc[0], acc[0]); \
-\
- walkback = points-1, p = point, --delta, dst += delta; \
- for (i = 0; i < delta; i++, acc--, dst--) { \
- mul_##field(acc[-1], acc[-1], acc[0]); /* 1/Z */\
- sqr_##field(ZZ, acc[-1]); /* 1/Z^2 */\
- mul_##field(ZZZ, ZZ, acc[-1]); /* 1/Z^3 */\
- mul_##field(acc[-1], p->Z, acc[0]); \
- mul_##field(dst->X, p->X, ZZ); /* X = X'/Z^2 */\
- mul_##field(dst->Y, p->Y, ZZZ); /* Y = Y'/Z^3 */\
- p = (p == *walkback) ? 
*--walkback : p-1; \ - } \ - sqr_##field(ZZ, acc[0]); /* 1/Z^2 */\ - mul_##field(ZZZ, ZZ, acc[0]); /* 1/Z^3 */\ - mul_##field(dst->X, p->X, ZZ); /* X = X'/Z^2 */\ - mul_##field(dst->Y, p->Y, ZZZ); /* Y = Y'/Z^3 */\ - ++delta, dst += delta, npoints -= delta; \ - } \ -} \ -\ -void prefix##s_to_affine(ptype##_affine dst[], const ptype *const points[], \ - size_t npoints) \ -{ ptype##s_to_affine(dst, points, npoints); } - -POINTS_TO_AFFINE_IMPL(blst_p1, POINTonE1, 384, fp) -POINTS_TO_AFFINE_IMPL(blst_p2, POINTonE2, 384x, fp2) - -/* - * This is two-step multi-scalar multiplication procedure. First, given - * a set of points you pre-compute a table for chosen windowing factor - * [expressed in bits with value between 2 and 14], and then you pass - * this table to the actual multiplication procedure along with scalars. - * Idea is that the pre-computed table will be reused multiple times. In - * which case multiplication runs faster than below Pippenger algorithm - * implementation for up to ~16K points for wbits=8, naturally at the - * expense of multi-megabyte table. One can trade even more memory for - * performance, but each wbits increment doubles the memory requirement, - * so at some point it gets prohibively large... For reference, without - * reusing the table it's faster than Pippenger algorithm for up ~32 - * points [with wbits=5]... - */ - -#define SCRATCH_SZ(ptype) (sizeof(ptype)==sizeof(POINTonE1) ? 8192 : 4096) - -#define PRECOMPUTE_WBITS_IMPL(prefix, ptype, bits, field, one) \ -static void ptype##_precompute_row_wbits(ptype row[], size_t wbits, \ - const ptype##_affine *point) \ -{ \ - size_t i, j, n = (size_t)1 << (wbits-1); \ - /* row[-1] is implicit infinity */\ - vec_copy(&row[0], point, sizeof(*point)); /* row[0]=p*1 */\ - vec_copy(&row[0].Z, one, sizeof(row[0].Z)); \ - ptype##_double(&row[1], &row[0]); /* row[1]=p*(1+1) */\ - for (i = 2, j = 1; i < n; i += 2, j++) \ - ptype##_add_affine(&row[i], &row[i-1], point), /* row[2]=p*(2+1) */\ - ptype##_double(&row[i+1], &row[j]); /* row[3]=p*(2+2) */\ -} /* row[4] ... */\ -\ -static void ptype##s_to_affine_row_wbits(ptype##_affine dst[], ptype src[], \ - size_t wbits, size_t npoints) \ -{ \ - size_t total = npoints << (wbits-1); \ - size_t nwin = (size_t)1 << (wbits-1); \ - size_t i, j; \ - vec##bits *acc, ZZ, ZZZ; \ -\ - src += total; \ - acc = (vec##bits *)src; \ - vec_copy(acc++, one, sizeof(vec##bits)); \ - for (i = 0; i < npoints; i++) \ - for (j = nwin; --src, --j; acc++) \ - mul_##field(acc[0], acc[-1], src->Z); \ -\ - --acc; reciprocal_##field(acc[0], acc[0]); \ -\ - for (i = 0; i < npoints; i++) { \ - vec_copy(dst++, src++, sizeof(ptype##_affine)); \ - for (j = 1; j < nwin; j++, acc--, src++, dst++) { \ - mul_##field(acc[-1], acc[-1], acc[0]); /* 1/Z */\ - sqr_##field(ZZ, acc[-1]); /* 1/Z^2 */\ - mul_##field(ZZZ, ZZ, acc[-1]); /* 1/Z^3 */\ - mul_##field(acc[-1], src->Z, acc[0]); \ - mul_##field(dst->X, src->X, ZZ); /* X = X'/Z^2 */\ - mul_##field(dst->Y, src->Y, ZZZ); /* Y = Y'/Z^3 */\ - } \ - } \ -} \ -\ -/* flat |points[n]| can be placed at the end of |table[n<<(wbits-1)]| */\ -static void ptype##s_precompute_wbits(ptype##_affine table[], size_t wbits, \ - const ptype##_affine *const points[], \ - size_t npoints) \ -{ \ - size_t total = npoints << (wbits-1); \ - size_t nwin = (size_t)1 << (wbits-1); \ - size_t nmin = wbits>9 ? 
(size_t)1: (size_t)1 << (9-wbits); \ - size_t i, top = 0; \ - ptype *rows, *row; \ - const ptype##_affine *point = NULL; \ - size_t stride = ((512*1024)/sizeof(ptype##_affine)) >> wbits; \ - if (stride == 0) stride = 1; \ -\ - while (npoints >= nmin) { \ - size_t limit = total - npoints; \ -\ - if (top + (stride << wbits) > limit) { \ - stride = (limit - top) >> wbits; \ - if (stride == 0) break; \ - } \ - rows = row = (ptype *)(&table[top]); \ - for (i = 0; i < stride; i++, row += nwin) \ - point = *points ? *points++ : point+1, \ - ptype##_precompute_row_wbits(row, wbits, point); \ - ptype##s_to_affine_row_wbits(&table[top], rows, wbits, stride); \ - top += stride << (wbits-1); \ - npoints -= stride; \ - } \ - rows = row = alloca(2*sizeof(ptype##_affine) * npoints * nwin); \ - for (i = 0; i < npoints; i++, row += nwin) \ - point = *points ? *points++ : point+1, \ - ptype##_precompute_row_wbits(row, wbits, point); \ - ptype##s_to_affine_row_wbits(&table[top], rows, wbits, npoints); \ -} \ -\ -size_t prefix##s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints) \ -{ return (sizeof(ptype##_affine)*npoints) << (wbits-1); } \ -void prefix##s_mult_wbits_precompute(ptype##_affine table[], size_t wbits, \ - const ptype##_affine *const points[], \ - size_t npoints) \ -{ ptype##s_precompute_wbits(table, wbits, points, npoints); } - -#define POINTS_MULT_WBITS_IMPL(prefix, ptype, bits, field, one) \ -static void ptype##_gather_booth_wbits(ptype *p, const ptype##_affine row[], \ - size_t wbits, limb_t booth_idx) \ -{ \ - bool_t booth_sign = (booth_idx >> wbits) & 1; \ - bool_t idx_is_zero; \ - static const ptype##_affine infinity = { 0 }; \ -\ - booth_idx &= ((limb_t)1 << wbits) - 1; \ - idx_is_zero = is_zero(booth_idx); \ - booth_idx -= 1 ^ idx_is_zero; \ - vec_select(p, &infinity, &row[booth_idx], sizeof(row[0]), idx_is_zero); \ - ptype##_cneg(p, booth_sign); \ -} \ -\ -static void ptype##s_mult_wbits(ptype *ret, const ptype##_affine table[], \ - size_t wbits, size_t npoints, \ - const byte *const scalars[], size_t nbits, \ - ptype scratch[]) \ -{ \ - limb_t wmask, wval; \ - size_t i, j, z, nbytes, window, nwin = (size_t)1 << (wbits-1); \ - const byte *scalar, *const *scalar_s = scalars; \ - const ptype##_affine *row = table; \ -\ - size_t scratch_sz = SCRATCH_SZ(ptype); \ - if (scratch == NULL) { \ - scratch_sz /= 4; /* limit to 288K */ \ - scratch_sz = scratch_sz < npoints ? scratch_sz : npoints; \ - scratch = alloca(sizeof(ptype) * scratch_sz); \ - } \ -\ - nbytes = (nbits + 7)/8; /* convert |nbits| to bytes */ \ - scalar = *scalar_s++; \ -\ - /* top excess bits modulo target window size */ \ - window = nbits % wbits; /* yes, it may be zero */ \ - wmask = ((limb_t)1 << (window + 1)) - 1; \ -\ - nbits -= window; \ - z = is_zero(nbits); \ - wval = (get_wval_limb(scalar, nbits - (z^1), wbits + (z^1)) << z) & wmask; \ - wval = booth_encode(wval, wbits); \ - ptype##_gather_booth_wbits(&scratch[0], row, wbits, wval); \ - row += nwin; \ -\ - i = 1; vec_zero(ret, sizeof(*ret)); \ - while (nbits > 0) { \ - for (j = i; i < npoints; i++, j++, row += nwin) { \ - if (j == scratch_sz) \ - ptype##s_accumulate(ret, scratch, j), j = 0; \ - scalar = *scalar_s ? 
*scalar_s++ : scalar+nbytes; \ - wval = get_wval_limb(scalar, nbits - 1, window + 1) & wmask; \ - wval = booth_encode(wval, wbits); \ - ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \ - } \ - ptype##s_accumulate(ret, scratch, j); \ -\ - for (j = 0; j < wbits; j++) \ - ptype##_double(ret, ret); \ -\ - window = wbits; \ - wmask = ((limb_t)1 << (window + 1)) - 1; \ - nbits -= window; \ - i = 0; row = table; scalar_s = scalars; \ - } \ -\ - for (j = i; i < npoints; i++, j++, row += nwin) { \ - if (j == scratch_sz) \ - ptype##s_accumulate(ret, scratch, j), j = 0; \ - scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ - wval = (get_wval_limb(scalar, 0, wbits) << 1) & wmask; \ - wval = booth_encode(wval, wbits); \ - ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \ - } \ - ptype##s_accumulate(ret, scratch, j); \ -} \ -\ -size_t prefix##s_mult_wbits_scratch_sizeof(size_t npoints) \ -{ \ - const size_t scratch_sz = SCRATCH_SZ(ptype); \ - return sizeof(ptype) * (npoints < scratch_sz ? npoints : scratch_sz); \ -} \ -void prefix##s_mult_wbits(ptype *ret, const ptype##_affine table[], \ - size_t wbits, size_t npoints, \ - const byte *const scalars[], size_t nbits, \ - ptype scratch[]) \ -{ ptype##s_mult_wbits(ret, table, wbits, npoints, scalars, nbits, scratch); } - -PRECOMPUTE_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p) -POINTS_MULT_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p) - -PRECOMPUTE_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) -POINTS_MULT_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) - -/* - * Pippenger algorithm implementation, fastest option for larger amount - * of points... - */ - -static size_t pippenger_window_size(size_t npoints) -{ - size_t wbits; - - for (wbits=0; npoints>>=1; wbits++) ; - - return wbits>12 ? wbits-3 : (wbits>4 ? wbits-2 : (wbits ? 2 : 1)); -} - -#define DECLARE_PRIVATE_POINTXYZZ(ptype, bits) \ -typedef struct { vec##bits X,Y,ZZZ,ZZ; } ptype##xyzz; - -#define POINTS_MULT_PIPPENGER_IMPL(prefix, ptype) \ -static void ptype##_integrate_buckets(ptype *out, ptype##xyzz buckets[], \ - size_t wbits) \ -{ \ - ptype##xyzz ret[1], acc[1]; \ - size_t n = (size_t)1 << wbits; \ -\ - /* Calculate sum of x[i-1]*i for i=1 through 1<<|wbits|. 
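
That running sum is the heart of the bucket method: rather than multiplying each bucket by its index, two passes of additions suffice, because sum_i i*b[i] equals the sum of all suffix sums of b. A toy integer model of the loop that follows (integrateBuckets is an illustrative name; in the deleted code "+" is an elliptic-curve xyzz point addition and bucket i-1 holds the sum of points whose current Booth digit is i):

```go
package main

import "fmt"

// integrateBuckets computes sum over i of i * buckets[i-1] using only
// additions: acc walks a running suffix sum while ret accumulates every
// suffix sum, and the sum of all suffix sums equals the weighted sum.
func integrateBuckets(buckets []int) int {
	n := len(buckets) - 1
	acc := buckets[n] // suffix sum, currently b[top]
	ret := buckets[n]
	for n > 0 {
		n--
		acc += buckets[n] // acc = b[n] + b[n+1] + ... + b[top]
		ret += acc
	}
	return ret
}

func main() {
	b := []int{3, 0, 5, 2}
	// naive weighted sum: 1*3 + 2*0 + 3*5 + 4*2 = 26
	fmt.Println(integrateBuckets(b)) // 26
}
```

The payoff is that each window costs roughly two point additions per bucket regardless of how many points landed in it, which is what makes the Pippenger path win for large npoints.
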
*/\ - vec_copy(acc, &buckets[--n], sizeof(acc)); \ - vec_copy(ret, &buckets[n], sizeof(ret)); \ - vec_zero(&buckets[n], sizeof(buckets[n])); \ - while (n--) { \ - ptype##xyzz_dadd(acc, acc, &buckets[n]); \ - ptype##xyzz_dadd(ret, ret, acc); \ - vec_zero(&buckets[n], sizeof(buckets[n])); \ - } \ - ptype##xyzz_to_Jacobian(out, ret); \ -} \ -\ -static void ptype##_bucket(ptype##xyzz buckets[], limb_t booth_idx, \ - size_t wbits, const ptype##_affine *p) \ -{ \ - bool_t booth_sign = (booth_idx >> wbits) & 1; \ -\ - booth_idx &= (1< nbits) wbits = nbits - bit0, cbits = wbits + 1; \ - else wbits = cbits = window; \ - ptype##s_tile_pippenger(ret, points, npoints, scalars, nbits, scratch, \ - bit0, wbits, cbits); \ -} \ -void prefix##s_mult_pippenger(ptype *ret, \ - const ptype##_affine *const points[], \ - size_t npoints, \ - const byte *const scalars[], size_t nbits, \ - ptype##xyzz scratch[]) \ -{ \ - if (npoints == 1) { \ - prefix##_from_affine(ret, points[0]); \ - prefix##_mult(ret, ret, scalars[0], nbits); \ - return; \ - } \ - if ((npoints * sizeof(ptype##_affine) * 8 * 3) <= SCRATCH_LIMIT) { \ - ptype##_affine *table = alloca(npoints * sizeof(ptype##_affine) * 8); \ - ptype##s_precompute_wbits(table, 4, points, npoints); \ - ptype##s_mult_wbits(ret, table, 4, npoints, scalars, nbits, NULL); \ - return; \ - } \ - ptype##s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch, 0); \ -} - -DECLARE_PRIVATE_POINTXYZZ(POINTonE1, 384) -POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE1, 384, fp) -POINTXYZZ_DADD_IMPL(POINTonE1, 384, fp) -POINTXYZZ_DADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) -POINTS_MULT_PIPPENGER_IMPL(blst_p1, POINTonE1) - -DECLARE_PRIVATE_POINTXYZZ(POINTonE2, 384x) -POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE2, 384x, fp2) -POINTXYZZ_DADD_IMPL(POINTonE2, 384x, fp2) -POINTXYZZ_DADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) -POINTS_MULT_PIPPENGER_IMPL(blst_p2, POINTonE2) diff --git a/crypto/blst_src/no_asm.h b/crypto/blst_src/no_asm.h deleted file mode 100644 index be7bf47e197..00000000000 --- a/crypto/blst_src/no_asm.h +++ /dev/null @@ -1,1345 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -#if LIMB_T_BITS==32 -typedef unsigned long long llimb_t; -#endif - -#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 || defined(__STDC_NO_VLA__) -# error "unsupported compiler" -#endif - -#if defined(__clang__) -# pragma GCC diagnostic ignored "-Wstatic-in-inline" -#endif - -#if !defined(__clang__) && !defined(__builtin_assume) -# if defined(__GNUC__) && __GNUC__>=5 -# define __builtin_assume(condition) if (!(condition)) __builtin_unreachable() -# elif defined(_MSC_VER) -# define __builtin_assume(condition) __assume(condition) -# else -# define __builtin_assume(condition) (void)(condition) -# endif -#endif - -static void mul_mont_n(limb_t ret[], const limb_t a[], const limb_t b[], - const limb_t p[], limb_t n0, size_t n) -{ - __builtin_assume(n != 0 && n%2 == 0); - llimb_t limbx; - limb_t mask, borrow, mx, hi, tmp[n+1], carry; - size_t i, j; - - for (mx=b[0], hi=0, i=0; i> LIMB_T_BITS); - } - mx = n0*tmp[0]; - tmp[i] = hi; - - for (carry=0, j=0; ; ) { - limbx = (mx * (llimb_t)p[0]) + tmp[0]; - hi = (limb_t)(limbx >> LIMB_T_BITS); - for (i=1; i> LIMB_T_BITS); - } - limbx = tmp[i] + (hi + (llimb_t)carry); - tmp[i-1] = (limb_t)limbx; - carry = (limb_t)(limbx >> LIMB_T_BITS); - - if (++j==n) - break; - - for (mx=b[j], hi=0, i=0; i> LIMB_T_BITS); - } - mx = n0*tmp[0]; - limbx = hi + (llimb_t)carry; - tmp[i] = (limb_t)limbx; - carry = (limb_t)(limbx >> LIMB_T_BITS); - } - - for (borrow=0, i=0; i> LIMB_T_BITS) & 1; - } - - mask = carry - borrow; - launder(mask); - - for(i=0; i> LIMB_T_BITS); - } - - for (borrow=0, i=0; i> LIMB_T_BITS) & 1; - } - - mask = carry - borrow; - launder(mask); - - for(i=0; i> LIMB_T_BITS) & 1; - } - - mask = 0 - borrow; - launder(mask); - - for (carry=0, i=0; i> LIMB_T_BITS); - } -} - -#define SUB_MOD_IMPL(bits) \ -inline void sub_mod_##bits(vec##bits ret, const vec##bits a, \ - const vec##bits b, const vec##bits p) \ -{ sub_mod_n(ret, a, b, p, NLIMBS(bits)); } - -SUB_MOD_IMPL(256) -SUB_MOD_IMPL(384) - -static void mul_by_3_mod_n(limb_t ret[], const limb_t a[], const limb_t p[], - size_t n) -{ - __builtin_assume(n != 0); - llimb_t limbx; - limb_t mask, carry, borrow, tmp[n], two_a[n]; - size_t i; - - for (carry=0, i=0; i>(LIMB_T_BITS-1); - } - - for (borrow=0, i=0; i> LIMB_T_BITS) & 1; - } - - mask = carry - borrow; - launder(mask); - - for(i=0; i> LIMB_T_BITS); - } - - for (borrow=0, i=0; i> LIMB_T_BITS) & 1; - } - - mask = carry - borrow; - launder(mask); - - for(i=0; i>(LIMB_T_BITS-1); - } - - for (borrow=0, i=0; i> LIMB_T_BITS) & 1; - } - - mask = carry - borrow; - launder(mask); - - for(i=0; i> LIMB_T_BITS) & 1; - } - - flag &= vec_is_zero(a, sizeof(tmp)) ^ 1; - mask = (limb_t)0 - flag; - - for(i=0; i> LIMB_T_BITS) & 1; - } - - return borrow & (is_zero(acc) ^ 1); -} - -#define CHECK_MOD_IMPL(bits) \ -inline limb_t check_mod_##bits(const pow##bits a, const vec##bits p) \ -{ return check_mod_n(a, p, NLIMBS(bits)); } - -CHECK_MOD_IMPL(256) - -static limb_t add_n_check_mod_n(byte ret[], const byte a[], const byte b[], - const limb_t p[], size_t n) -{ - __builtin_assume(n != 0); - limb_t ret_[n], a_[n], b_[n], zero; - - limbs_from_le_bytes(a_, a, sizeof(a_)); - limbs_from_le_bytes(b_, b, sizeof(b_)); - - add_mod_n(ret_, a_, b_, p, n); - zero = vec_is_zero(ret_, sizeof(ret_)); - - le_bytes_from_limbs(ret, ret_, sizeof(ret_)); - - return zero^1; -} - -#define ADD_N_CHECK_MOD_IMPL(bits) \ -inline limb_t add_n_check_mod_##bits(pow##bits ret, const pow##bits a, \ - const pow##bits b, const vec##bits p) \ -{ return 
add_n_check_mod_n(ret, a, b, p, NLIMBS(bits)); } - -ADD_N_CHECK_MOD_IMPL(256) - -static limb_t sub_n_check_mod_n(byte ret[], const byte a[], const byte b[], - const limb_t p[], size_t n) -{ - __builtin_assume(n != 0); - limb_t ret_[n], a_[n], b_[n], zero; - - limbs_from_le_bytes(a_, a, sizeof(a_)); - limbs_from_le_bytes(b_, b, sizeof(b_)); - - sub_mod_n(ret_, a_, b_, p, n); - zero = vec_is_zero(ret_, sizeof(ret_)); - - le_bytes_from_limbs(ret, ret_, sizeof(ret_)); - - return zero^1; -} - -#define SUB_N_CHECK_MOD_IMPL(bits) \ -inline limb_t sub_n_check_mod_##bits(pow##bits ret, const pow##bits a, \ - const pow##bits b, const vec##bits p) \ -{ return sub_n_check_mod_n(ret, a, b, p, NLIMBS(bits)); } - -SUB_N_CHECK_MOD_IMPL(256) - -static void from_mont_n(limb_t ret[], const limb_t a[], - const limb_t p[], limb_t n0, size_t n) -{ - __builtin_assume(n != 0 && n%2 == 0); - llimb_t limbx; - limb_t mask, borrow, mx, hi, tmp[n]; - size_t i, j; - - for (j=0; j> LIMB_T_BITS); - for (i=1; i> LIMB_T_BITS); - } - tmp[i-1] = hi; - a = tmp; - } - - /* this is needed only if input can be non-fully-reduced */ - for (borrow=0, i=0; i> LIMB_T_BITS) & 1; - } - - mask = 0 - borrow; - launder(mask); - - for(i=0; i> LIMB_T_BITS); - for (i=1; i> LIMB_T_BITS); - } - tmp[i-1] = hi; - b = tmp; - } - - for (carry=0, i=0; i> LIMB_T_BITS); - } - - for (borrow=0, i=0; i> LIMB_T_BITS) & 1; - } - - mask = carry - borrow; - launder(mask); - - for(i=0; i> LIMB_T_BITS); - } - - for (next=ret[0], i=0; i> 1; - next = ret[i+1]; - ret[i] = limb | next << (LIMB_T_BITS-1); - } - ret[i] = next >> 1 | carry << (LIMB_T_BITS-1); - - a = ret; - } -} - -#define RSHIFT_MOD_IMPL(bits) \ -inline void rshift_mod_##bits(vec##bits ret, const vec##bits a, size_t count, \ - const vec##bits p) \ -{ rshift_mod_n(ret, a, count, p, NLIMBS(bits)); } - -RSHIFT_MOD_IMPL(256) -RSHIFT_MOD_IMPL(384) - -#define DIV_BY_2_MOD_IMPL(bits) \ -inline void div_by_2_mod_##bits(vec##bits ret, const vec##bits a, \ - const vec##bits p) \ -{ rshift_mod_n(ret, a, 1, p, NLIMBS(bits)); } - -DIV_BY_2_MOD_IMPL(384) - -static limb_t sgn0_pty_mod_n(const limb_t a[], const limb_t p[], size_t n) -{ - __builtin_assume(n != 0); - llimb_t limbx; - limb_t carry, borrow, ret, tmp[n]; - size_t i; - - ret = a[0] & 1; /* parity */ - - for (carry=0, i=0; i>(LIMB_T_BITS-1); - } - - for (borrow=0, i=0; i> LIMB_T_BITS) & 1; - } - - ret |= ((carry - borrow) & 2) ^ 2; - - return ret; -} - -inline limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p) -{ return sgn0_pty_mod_n(a, p, NLIMBS(384)); } - -inline limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0) -{ - vec384 tmp; - - from_mont_n(tmp, a, p, n0, NLIMBS(384)); - - return sgn0_pty_mod_n(tmp, p, NLIMBS(384)); -} - -inline limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p) -{ - limb_t re, im, sign, prty; - - re = sgn0_pty_mod_n(a[0], p, NLIMBS(384)); - im = sgn0_pty_mod_n(a[1], p, NLIMBS(384)); - - /* a->im!=0 ? sgn0(a->im) : sgn0(a->re) */ - sign = (limb_t)0 - vec_is_zero(a[1], sizeof(vec384)); - sign = (re & sign) | (im & ~sign); - - /* a->re==0 ? 
prty(a->im) : prty(a->re) */ - prty = (limb_t)0 - vec_is_zero(a[0], sizeof(vec384)); - prty = (im & prty) | (re & ~prty); - - return (sign & 2) | (prty & 1); -} - -inline limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0) -{ - vec384x tmp; - - from_mont_n(tmp[0], a[0], p, n0, NLIMBS(384)); - from_mont_n(tmp[1], a[1], p, n0, NLIMBS(384)); - - return sgn0_pty_mod_384x(tmp, p); -} - -void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, - const vec384 p, limb_t n0) -{ - vec384 aa, bb, cc; - - add_mod_n(aa, a[0], a[1], p, NLIMBS(384)); - add_mod_n(bb, b[0], b[1], p, NLIMBS(384)); - mul_mont_n(bb, bb, aa, p, n0, NLIMBS(384)); - mul_mont_n(aa, a[0], b[0], p, n0, NLIMBS(384)); - mul_mont_n(cc, a[1], b[1], p, n0, NLIMBS(384)); - sub_mod_n(ret[0], aa, cc, p, NLIMBS(384)); - sub_mod_n(ret[1], bb, aa, p, NLIMBS(384)); - sub_mod_n(ret[1], ret[1], cc, p, NLIMBS(384)); -} - -/* - * mul_mont_n without final conditional subtraction, which implies - * that modulus is one bit short, which in turn means that there are - * no carries to handle between iterations... - */ -static void mul_mont_nonred_n(limb_t ret[], const limb_t a[], const limb_t b[], - const limb_t p[], limb_t n0, size_t n) -{ - __builtin_assume(n != 0 && n%2 == 0); - llimb_t limbx; - limb_t mx, hi, tmp[n+1]; - size_t i, j; - - for (mx=b[0], hi=0, i=0; i> LIMB_T_BITS); - } - mx = n0*tmp[0]; - tmp[i] = hi; - - for (j=0; ; ) { - limbx = (mx * (llimb_t)p[0]) + tmp[0]; - hi = (limb_t)(limbx >> LIMB_T_BITS); - for (i=1; i> LIMB_T_BITS); - } - tmp[i-1] = tmp[i] + hi; - - if (++j==n) - break; - - for (mx=b[j], hi=0, i=0; i> LIMB_T_BITS); - } - mx = n0*tmp[0]; - tmp[i] = hi; - } - - vec_copy(ret, tmp, sizeof(tmp)-sizeof(limb_t)); -} - -void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count, - const vec384 p, limb_t n0, const vec384 b) -{ - __builtin_assume(count != 0); - while(count--) { - mul_mont_nonred_n(ret, a, a, p, n0, NLIMBS(384)); - a = ret; - } - mul_mont_n(ret, ret, b, p, n0, NLIMBS(384)); -} - -void sqr_mont_382x(vec384x ret, const vec384x a, - const vec384 p, limb_t n0) -{ - llimb_t limbx; - limb_t mask, carry, borrow; - size_t i; - vec384 t0, t1; - - /* "add_mod_n(t0, a[0], a[1], p, NLIMBS(384));" */ - for (carry=0, i=0; i> LIMB_T_BITS); - } - - /* "sub_mod_n(t1, a[0], a[1], p, NLIMBS(384));" */ - for (borrow=0, i=0; i> LIMB_T_BITS) & 1; - } - mask = 0 - borrow; - launder(mask); - - /* "mul_mont_n(ret[1], a[0], a[1], p, n0, NLIMBS(384));" */ - mul_mont_nonred_n(ret[1], a[0], a[1], p, n0, NLIMBS(384)); - - /* "add_mod_n(ret[1], ret[1], ret[1], p, NLIMBS(384));" */ - for (carry=0, i=0; i>(LIMB_T_BITS-1); - } - - /* "mul_mont_n(ret[0], t0, t1, p, n0, NLIMBS(384));" */ - mul_mont_nonred_n(ret[0], t0, t1, p, n0, NLIMBS(384)); - - /* account for t1's sign... 
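(when the subtraction above borrowed, the stored t1 is off by 2^384, which equals the Montgomery factor R, so mont(t0*t1) is off by exactly t0*R*R^-1 = t0; the masked loops below subtract t0 and re-add p on underflow, all in constant time)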
*/ - for (borrow=0, i=0; i> LIMB_T_BITS) & 1; - } - mask = 0 - borrow; - launder(mask); - for (carry=0, i=0; i> LIMB_T_BITS); - } -} - -#if defined(__GNUC__) || defined(__clang__) -# define MSB(x) ({ limb_t ret = (x) >> (LIMB_T_BITS-1); launder(ret); ret; }) -#else -# define MSB(x) ((x) >> (LIMB_T_BITS-1)) -#endif - -static size_t num_bits(limb_t l) -{ - limb_t x, mask; - size_t bits = is_zero(l) ^ 1; - - if (sizeof(limb_t) == 8) { - x = l >> (32 & (8*sizeof(limb_t)-1)); - mask = 0 - MSB(0 - x); - bits += 32 & mask; - l ^= (x ^ l) & mask; - } - - x = l >> 16; - mask = 0 - MSB(0 - x); - bits += 16 & mask; - l ^= (x ^ l) & mask; - - x = l >> 8; - mask = 0 - MSB(0 - x); - bits += 8 & mask; - l ^= (x ^ l) & mask; - - x = l >> 4; - mask = 0 - MSB(0 - x); - bits += 4 & mask; - l ^= (x ^ l) & mask; - - x = l >> 2; - mask = 0 - MSB(0 - x); - bits += 2 & mask; - l ^= (x ^ l) & mask; - - bits += l >> 1; - - return bits; -} - -#if defined(__clang_major__) && __clang_major__>7 -__attribute__((optnone)) -#endif -static limb_t lshift_2(limb_t hi, limb_t lo, size_t l) -{ - size_t r = LIMB_T_BITS - l; - limb_t mask = 0 - (is_zero(l)^1); - return (hi << (l&(LIMB_T_BITS-1))) | ((lo & mask) >> (r&(LIMB_T_BITS-1))); -} - -/* - * https://eprint.iacr.org/2020/972 with 'k' being LIMB_T_BITS-1. - */ -static void ab_approximation_n(limb_t a_[2], const limb_t a[], - limb_t b_[2], const limb_t b[], size_t n) -{ - __builtin_assume(n != 0 && n%2 == 0); - limb_t a_hi, a_lo, b_hi, b_lo, mask; - size_t i; - - i = n-1; - a_hi = a[i], a_lo = a[i-1]; - b_hi = b[i], b_lo = b[i-1]; - for (i--; --i;) { - mask = 0 - is_zero(a_hi | b_hi); - a_hi = ((a_lo ^ a_hi) & mask) ^ a_hi; - b_hi = ((b_lo ^ b_hi) & mask) ^ b_hi; - a_lo = ((a[i] ^ a_lo) & mask) ^ a_lo; - b_lo = ((b[i] ^ b_lo) & mask) ^ b_lo; - } - i = LIMB_T_BITS - num_bits(a_hi | b_hi); - /* |i| can be LIMB_T_BITS if all a[2..]|b[2..] 
were zeros */ - - a_[0] = a[0], a_[1] = lshift_2(a_hi, a_lo, i); - b_[0] = b[0], b_[1] = lshift_2(b_hi, b_lo, i); -} - -typedef struct { limb_t f0, g0, f1, g1; } factors; - -static void inner_loop_n(factors *fg, const limb_t a_[2], const limb_t b_[2], - size_t n) -{ - __builtin_assume(n != 0); - llimb_t limbx; - limb_t f0 = 1, g0 = 0, f1 = 0, g1 = 1; - limb_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm; - - a_lo = a_[0], a_hi = a_[1]; - b_lo = b_[0], b_hi = b_[1]; - - while(n--) { - odd = 0 - (a_lo&1); - - /* a_ -= b_ if a_ is odd */ - t_lo = a_lo, t_hi = a_hi; - limbx = a_lo - (llimb_t)(b_lo & odd); - a_lo = (limb_t)limbx; - borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; - limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow); - a_hi = (limb_t)limbx; - borrow = (limb_t)(limbx >> LIMB_T_BITS); - - /* negate a_-b_ if it borrowed */ - a_lo ^= borrow; - a_hi ^= borrow; - limbx = a_lo + (llimb_t)(borrow & 1); - a_lo = (limb_t)limbx; - a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1; - - /* b_=a_ if a_-b_ borrowed */ - b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; - b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; - - /* exchange f0 and f1 if a_-b_ borrowed */ - xorm = (f0 ^ f1) & borrow; - f0 ^= xorm; - f1 ^= xorm; - - /* exchange g0 and g1 if a_-b_ borrowed */ - xorm = (g0 ^ g1) & borrow; - g0 ^= xorm; - g1 ^= xorm; - - /* subtract if a_ was odd */ - f0 -= f1 & odd; - g0 -= g1 & odd; - - f1 <<= 1; - g1 <<= 1; - a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1); - a_hi >>= 1; - } - - fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1= g1; -} - -static limb_t cneg_n(limb_t ret[], const limb_t a[], limb_t neg, size_t n) -{ - __builtin_assume(n != 0); - llimb_t limbx = 0; - limb_t carry; - size_t i; - - for (carry=neg&1, i=0; i> LIMB_T_BITS); - } - - return 0 - MSB((limb_t)limbx); -} - -static limb_t add_n(limb_t ret[], const limb_t a[], limb_t b[], size_t n) -{ - __builtin_assume(n != 0); - llimb_t limbx; - limb_t carry; - size_t i; - - for (carry=0, i=0; i> LIMB_T_BITS); - } - - return carry; -} - -static limb_t umul_n(limb_t ret[], const limb_t a[], limb_t b, size_t n) -{ - __builtin_assume(n != 0); - llimb_t limbx; - limb_t hi; - size_t i; - - for (hi=0, i=0; i> LIMB_T_BITS); - } - - return hi; -} - -static limb_t smul_n_shift_n(limb_t ret[], const limb_t a[], limb_t *f_, - const limb_t b[], limb_t *g_, - size_t n) -{ - __builtin_assume(n != 0); - limb_t a_[n+1], b_[n+1], f, g, neg, carry, hi; - size_t i; - - /* |a|*|f_| */ - f = *f_; - neg = 0 - MSB(f); - f = (f ^ neg) - neg; /* ensure |f| is positive */ - (void)cneg_n(a_, a, neg, n); - hi = umul_n(a_, a_, f, n); - a_[n] = hi - (f & neg); - - /* |b|*|g_| */ - g = *g_; - neg = 0 - MSB(g); - g = (g ^ neg) - neg; /* ensure |g| is positive */ - (void)cneg_n(b_, b, neg, n); - hi = umul_n(b_, b_, g, n); - b_[n] = hi - (g & neg); - - /* |a|*|f_| + |b|*|g_| */ - (void)add_n(a_, a_, b_, n+1); - - /* (|a|*|f_| + |b|*|g_|) >> k */ - for (carry=a_[0], i=0; i> (LIMB_T_BITS-2); - carry = a_[i+1]; - ret[i] = hi | (carry << 2); - } - - /* ensure result is non-negative, fix up |f_| and |g_| accordingly */ - neg = 0 - MSB(carry); - *f_ = (*f_ ^ neg) - neg; - *g_ = (*g_ ^ neg) - neg; - (void)cneg_n(ret, ret, neg, n); - - return neg; -} - -static limb_t smul_2n(limb_t ret[], const limb_t u[], limb_t f, - const limb_t v[], limb_t g, size_t n) -{ - __builtin_assume(n != 0); - limb_t u_[n], v_[n], neg, hi; - - /* |u|*|f_| */ - neg = 0 - MSB(f); - f = (f ^ neg) - neg; /* ensure |f| is positive */ - neg = cneg_n(u_, u, neg, n); - hi = umul_n(u_, u_, f, n) - (f&neg); - - /* |v|*|g_| */ - neg = 0 
- MSB(g); - g = (g ^ neg) - neg; /* ensure |g| is positive */ - neg = cneg_n(v_, v, neg, n); - hi += umul_n(v_, v_, g, n) - (g&neg); - - /* |u|*|f_| + |v|*|g_| */ - hi += add_n(ret, u_, v_, n); - - return hi; -} - -static void ct_inverse_mod_n(limb_t ret[], const limb_t inp[], - const limb_t mod[], const limb_t modx[], size_t n) -{ - __builtin_assume(n != 0 && n%2 == 0); - llimb_t limbx; - limb_t a[n], b[n], u[2*n], v[2*n], t[2*n]; - limb_t a_[2], b_[2], sign, carry, top; - factors fg; - size_t i; - - vec_copy(a, inp, sizeof(a)); - vec_copy(b, mod, sizeof(b)); - vec_zero(u, sizeof(u)); u[0] = 1; - vec_zero(v, sizeof(v)); - - for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) { - ab_approximation_n(a_, a, b_, b, n); - inner_loop_n(&fg, a_, b_, LIMB_T_BITS-2); - (void)smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n); - (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n); - vec_copy(a, t, sizeof(a)); - smul_2n(t, u, fg.f0, v, fg.g0, 2*n); - smul_2n(v, u, fg.f1, v, fg.g1, 2*n); - vec_copy(u, t, sizeof(u)); - } - - inner_loop_n(&fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2)); - top = smul_2n(ret, u, fg.f1, v, fg.g1, 2*n); - - sign = 0 - MSB(top); /* top is 1, 0 or -1 */ - for (carry=0, i=0; i> LIMB_T_BITS); - } - top += carry; - sign = 0 - top; /* top is 1, 0 or -1 */ - top |= sign; - for (i=0; i> LIMB_T_BITS) & 1; - limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow); - a_hi = (limb_t)limbx; - borrow = (limb_t)(limbx >> LIMB_T_BITS); - - L += ((t_lo & b_lo) >> 1) & borrow; - - /* negate a_-b_ if it borrowed */ - a_lo ^= borrow; - a_hi ^= borrow; - limbx = a_lo + (llimb_t)(borrow & 1); - a_lo = (limb_t)limbx; - a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1; - - /* b_=a_ if a_-b_ borrowed */ - b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; - b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; - - /* exchange f0 and f1 if a_-b_ borrowed */ - xorm = (f0 ^ f1) & borrow; - f0 ^= xorm; - f1 ^= xorm; - - /* exchange g0 and g1 if a_-b_ borrowed */ - xorm = (g0 ^ g1) & borrow; - g0 ^= xorm; - g1 ^= xorm; - - /* subtract if a_ was odd */ - f0 -= f1 & odd; - g0 -= g1 & odd; - - f1 <<= 1; - g1 <<= 1; - a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1); - a_hi >>= 1; - - L += (b_lo + 2) >> 2; - } - - fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1 = g1; - - return L; -} - -static bool_t ct_is_sqr_mod_n(const limb_t inp[], const limb_t mod[], size_t n) -{ - __builtin_assume(n != 0 && n%2 == 0); - limb_t a[n], b[n], t[n]; - limb_t a_[2], b_[2], neg, L = 0; - factors fg; - size_t i; - - vec_copy(a, inp, sizeof(a)); - vec_copy(b, mod, sizeof(b)); - - for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) { - ab_approximation_n(a_, a, b_, b, n); - L = legendre_loop_n(L, &fg, a_, b_, LIMB_T_BITS-2); - neg = smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n); - (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n); - vec_copy(a, t, sizeof(a)); - L += (b[0] >> 1) & neg; - } - - L = legendre_loop_n(L, &fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2)); - - return (L & 1) ^ 1; -} - -#define CT_IS_SQR_MOD_IMPL(bits) \ -inline bool_t ct_is_square_mod_##bits(const vec##bits inp, \ - const vec##bits mod) \ -{ return ct_is_sqr_mod_n(inp, mod, NLIMBS(bits)); } - -CT_IS_SQR_MOD_IMPL(384) - -/* - * |div_top| points at two most significant limbs of the dividend, |d_hi| - * and |d_lo| are two most significant limbs of the divisor. If divisor - * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|. - * The divisor is required to be "bitwise left-aligned," and dividend's - * top limbs to be not larger than the divisor's. 
The latter limitation - * can be problematic in the first iteration of multi-precision division, - * where in most general case the condition would have to be "smaller." - * The subroutine considers four limbs, two of which are "overlapping," - * hence the name... Another way to look at it is to think of the pair - * of the dividend's limbs being suffixed with a zero: - * +-------+-------+-------+ - * R | | | 0 | - * +-------+-------+-------+ - * +-------+-------+ - * D | | | - * +-------+-------+ - */ -limb_t div_3_limbs(const limb_t div_top[2], limb_t d_lo, limb_t d_hi) -{ - llimb_t Rx; - limb_t r_lo = div_top[0], r_hi = div_top[1]; - limb_t Q = 0, mask, borrow, rx; - size_t i; - - for (i = 0; i < LIMB_T_BITS; i++) { - /* "borrow, Rx = R - D" */ - Rx = (llimb_t)r_lo - d_lo; - rx = (limb_t)Rx; - borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; - Rx = r_hi - (d_hi + (llimb_t)borrow); - borrow = (limb_t)(Rx >> LIMB_T_BITS); - - /* "if (R >= D) R -= D" */ - r_lo = ((r_lo ^ rx) & borrow) ^ rx; - rx = (limb_t)Rx; - r_hi = ((r_hi ^ rx) & borrow) ^ rx; - - Q <<= 1; - Q |= ~borrow & 1; - - /* "D >>= 1" */ - d_lo >>= 1; d_lo |= d_hi << (LIMB_T_BITS - 1); - d_hi >>= 1; - } - - mask = 0 - MSB(Q); /* does it overflow? */ - - /* "borrow, Rx = R - D" */ - Rx = (llimb_t)r_lo - d_lo; - rx = (limb_t)Rx; - borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; - Rx = r_hi - (d_hi + (llimb_t)borrow); - borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; - - Q <<= 1; - Q |= borrow ^ 1; - - return (Q | mask); -} - -static limb_t quot_rem_n(limb_t *div_rem, const limb_t *divisor, - limb_t quotient, size_t n) -{ - __builtin_assume(n != 0 && n%2 == 0); - llimb_t limbx; - limb_t tmp[n+1], carry, mask, borrow; - size_t i; - - /* divisor*quotient */ - for (carry=0, i=0; i> LIMB_T_BITS); - } - tmp[i] = carry; - - /* remainder = dividend - divisor*quotient */ - for (borrow=0, i=0; i<=n; i++) { - limbx = div_rem[i] - (tmp[i] + (llimb_t)borrow); - tmp[i] = (limb_t)limbx; - borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; - } - - mask = 0 - borrow; - launder(mask); - - /* if quotient was off by one, add divisor to the remainder */ - for (carry=0, i=0; i> LIMB_T_BITS) & 1; - } - - return (div_rem[i] = quotient + mask); -} - -inline limb_t quot_rem_128(limb_t *div_rem, const limb_t *divisor, - limb_t quotient) -{ return quot_rem_n(div_rem, divisor, quotient, NLIMBS(128)); } - -inline limb_t quot_rem_64(limb_t *div_rem, const limb_t *divisor, - limb_t quotient) -{ return quot_rem_n(div_rem, divisor, quotient, NLIMBS(64)); } - -/* - * Unlock reference implementations in vect.c - */ -#define mul_by_8_mod_384 mul_by_8_mod_384 -#define mul_by_8_mod_384x mul_by_8_mod_384x -#define mul_by_3_mod_384x mul_by_3_mod_384x -#define mul_by_1_plus_i_mod_384x mul_by_1_plus_i_mod_384x -#define add_mod_384x add_mod_384x -#define sub_mod_384x sub_mod_384x -#define lshift_mod_384x lshift_mod_384x -#define sqr_mont_384x sqr_mont_384x - -inline void vec_prefetch(const void *ptr, size_t len) -{ (void)ptr; (void)len; } - -/* - * SHA-256 - */ -#define ROTR(x,n) ((x)>>n | (x)<<(32-n)) -#define Sigma0(x) (ROTR((x),2) ^ ROTR((x),13) ^ ROTR((x),22)) -#define Sigma1(x) (ROTR((x),6) ^ ROTR((x),11) ^ ROTR((x),25)) -#define sigma0(x) (ROTR((x),7) ^ ROTR((x),18) ^ ((x)>>3)) -#define sigma1(x) (ROTR((x),17) ^ ROTR((x),19) ^ ((x)>>10)) -#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) -#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) - -void blst_sha256_block_data_order(unsigned int *v, const void *inp, - size_t blocks) -{ - static const unsigned int K256[64] = { - 0x428a2f98, 
0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 - }; - unsigned int X[16], l, a, b, c, d, e, f, g, h, s0, s1, T1, T2; - const unsigned char *data = inp; - size_t round; - - a = v[0]; - b = v[1]; - c = v[2]; - d = v[3]; - e = v[4]; - f = v[5]; - g = v[6]; - h = v[7]; - - while (blocks--) { - for (round = 0; round < 16; round++) { - l = (unsigned int)data[0] << 24; - l |= (unsigned int)data[1] << 16; - l |= (unsigned int)data[2] << 8; - l |= (unsigned int)data[3]; - data += 4; - T1 = X[round] = l; - T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round]; - T2 = Sigma0(a) + Maj(a, b, c); - h = g; - g = f; - f = e; - e = d + T1; - d = c; - c = b; - b = a; - a = T1 + T2; - } - - for (; round < 64; round++) { - s0 = X[(round + 1) & 0x0f]; - s0 = sigma0(s0); - s1 = X[(round + 14) & 0x0f]; - s1 = sigma1(s1); - - T1 = X[round & 0xf] += s0 + s1 + X[(round + 9) & 0xf]; - T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round]; - T2 = Sigma0(a) + Maj(a, b, c); - h = g; - g = f; - f = e; - e = d + T1; - d = c; - c = b; - b = a; - a = T1 + T2; - } - - a += v[0]; v[0] = a; - b += v[1]; v[1] = b; - c += v[2]; v[2] = c; - d += v[3]; v[3] = d; - e += v[4]; v[4] = e; - f += v[5]; v[5] = f; - g += v[6]; v[6] = g; - h += v[7]; v[7] = h; - } -} -#undef ROTR -#undef Sigma0 -#undef Sigma1 -#undef sigma0 -#undef sigma1 -#undef Ch -#undef Maj - -void blst_sha256_hcopy(unsigned int dst[8], const unsigned int src[8]) -{ - size_t i; - - for (i=0; i<8; i++) - dst[i] = src[i]; -} - -void blst_sha256_emit(unsigned char md[32], const unsigned int h[8]) -{ - size_t i; - - for (i=0; i<8; i++, md+=4) { - unsigned int h_i = h[i]; - md[0] = (unsigned char)(h_i >> 24); - md[1] = (unsigned char)(h_i >> 16); - md[2] = (unsigned char)(h_i >> 8); - md[3] = (unsigned char)h_i; - } -} - -void blst_sha256_bcopy(void *dst_, const void *src_, size_t len) -{ - unsigned char *dst = dst_; - const unsigned char *src = src_; - size_t i; - - for (i=0; iZ); /* Z1Z1 = Z1^2 */ - mul_fp2(U2, Q->X, Z1Z1); /* U2 = X2*Z1Z1 */ - - mul_fp2(S2, Q->Y, R->Z); - mul_fp2(S2, S2, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */ - - sub_fp2(H, U2, R->X); /* H = U2-X1 */ - - sqr_fp2(HH, H); /* HH = H^2 */ - add_fp2(I, HH, HH); - add_fp2(I, I, I); /* I = 4*HH */ - - mul_fp2(J, H, I); /* J = H*I */ - - sub_fp2(r, S2, R->Y); - add_fp2(r, r, r); /* r = 2*(S2-Y1) */ - - mul_fp2(V, R->X, I); /* V = X1*I */ - - sqr_fp2(T->X, r); - sub_fp2(T->X, T->X, J); - sub_fp2(T->X, T->X, V); - sub_fp2(T->X, T->X, V); /* X3 = r^2-J-2*V */ - - mul_fp2(J, J, R->Y); - sub_fp2(T->Y, V, T->X); - mul_fp2(T->Y, T->Y, r); - sub_fp2(T->Y, T->Y, J); - sub_fp2(T->Y, T->Y, J); /* Y3 = r*(V-X3)-2*Y1*J */ - - add_fp2(T->Z, R->Z, H); - sqr_fp2(T->Z, T->Z); - sub_fp2(T->Z, T->Z, Z1Z1); - sub_fp2(T->Z, T->Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */ - - /* - * line evaluation - */ - mul_fp2(I, r, Q->X); - mul_fp2(J, 
Q->Y, T->Z); - sub_fp2(I, I, J); - add_fp2(line[0], I, I); /* 2*(r*X2 - Y2*Z3) */ -#ifdef r -# undef r -#else - vec_copy(line[1], r, sizeof(r)); -#endif - vec_copy(line[2], T->Z, sizeof(T->Z)); -} - -static void line_dbl(vec384fp6 line, POINTonE2 *T, const POINTonE2 *Q) -{ - vec384x ZZ, A, B, C, D, E, F; - - /* - * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-alnr - */ - sqr_fp2(A, Q->X); /* A = X1^2 */ - sqr_fp2(B, Q->Y); /* B = Y1^2 */ - sqr_fp2(ZZ, Q->Z); /* ZZ = Z1^2 */ - sqr_fp2(C, B); /* C = B^2 */ - - add_fp2(D, Q->X, B); /* X1+B */ - sqr_fp2(D, D); /* (X1+B)^2 */ - sub_fp2(D, D, A); /* (X1+B)^2-A */ - sub_fp2(D, D, C); /* (X1+B)^2-A-C */ - add_fp2(D, D, D); /* D = 2*((X1+B)^2-A-C) */ - - mul_by_3_fp2(E, A); /* E = 3*A */ - sqr_fp2(F, E); /* F = E^2 */ - - add_fp2(line[0], E, Q->X); /* 3*A+X1 for line evaluation */ - - sub_fp2(T->X, F, D); - sub_fp2(T->X, T->X, D); /* X3 = F-2*D */ - - add_fp2(T->Z, Q->Y, Q->Z); - sqr_fp2(T->Z, T->Z); - sub_fp2(T->Z, T->Z, B); - sub_fp2(T->Z, T->Z, ZZ); /* Z3 = (Y1+Z1)^2-B-ZZ */ - - mul_by_8_fp2(C, C); /* 8*C */ - sub_fp2(T->Y, D, T->X); /* D-X3 */ - mul_fp2(T->Y, T->Y, E); /* E*(D-X3) */ - sub_fp2(T->Y, T->Y, C); /* Y3 = E*(D-X3)-8*C */ - - /* - * line evaluation - */ - sqr_fp2(line[0], line[0]); - sub_fp2(line[0], line[0], A); - sub_fp2(line[0], line[0], F); /* (3*A+X1)^2 - X1^2 - 9*A^2 */ - lshift_fp2(B, B, 2); - sub_fp2(line[0], line[0], B); /* 6*X1^3 - 4*Y1^2 */ - - mul_fp2(line[1], E, ZZ); /* 3*X1^2 * Z1^2 */ - - mul_fp2(line[2], T->Z, ZZ); /* Z3 * Z1^2 */ -} - -static void line_by_Px2(vec384fp6 line, const POINTonE1_affine *Px2) -{ - mul_fp(line[1][0], line[1][0], Px2->X); /* "b01" *= -2*P->X */ - mul_fp(line[1][1], line[1][1], Px2->X); - - mul_fp(line[2][0], line[2][0], Px2->Y); /* "b11" *= 2*P->Y */ - mul_fp(line[2][1], line[2][1], Px2->Y); -} - -#if 0 -static void add_n_dbl(vec384fp12 ret, POINTonE2 *T, const POINTonE2_affine *Q, - const POINTonE1_affine *Px2, vec384fp6 line, size_t n) -{ - line_add(line, T, T, Q); line_by_Px2(line, Px2); - mul_by_xy00z0_fp12(ret, ret, line); - while (n--) { - sqr_fp12(ret, ret); - line_dbl(line, T, T); line_by_Px2(line, Px2); - mul_by_xy00z0_fp12(ret, ret, line); - } -} - -static void miller_loop(vec384fp12 ret, const POINTonE2 *Q, const POINTonE1 *P) -{ -#define Q ((const POINTonE2_affine *)Q) - POINTonE2 T[1]; - POINTonE1_affine Px2[1]; - vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ - - /* Move common expression from line evaluation to line_by_Px2. 
*/ - add_fp(Px2->X, P->X, P->X); - neg_fp(Px2->X, Px2->X); - add_fp(Px2->Y, P->Y, P->Y); - - vec_copy(T->X, Q->X, 2*sizeof(T->X)); - vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z)); - - /* first step is ret = 1^2*line, which is replaced with ret = line */ - line_dbl(line, T, T); /* 0x2 */ - line_by_Px2(line, Px2); - vec_zero(ret, sizeof(vec384fp12)); - vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); - vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); - add_n_dbl(ret, T, Q, Px2, line, 2); /* ..0xc */ - add_n_dbl(ret, T, Q, Px2, line, 3); /* ..0x68 */ - add_n_dbl(ret, T, Q, Px2, line, 9); /* ..0xd200 */ - add_n_dbl(ret, T, Q, Px2, line, 32); /* ..0xd20100000000 */ - add_n_dbl(ret, T, Q, Px2, line, 16); /* ..0xd201000000010000 */ - conjugate_fp12(ret); /* account for z being negative */ -#undef Q -} -#endif - -static void start_dbl_n(vec384fp12 ret, POINTonE2 T[], - const POINTonE1_affine Px2[], size_t n) -{ - size_t i; - vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ - - /* first step is ret = 1^2*line, which is replaced with ret = line */ - line_dbl(line, T+0, T+0); line_by_Px2(line, Px2+0); - vec_zero(ret, sizeof(vec384fp12)); - vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); - vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); - - for (i = 1; i < n; i++) { - line_dbl(line, T+i, T+i); line_by_Px2(line, Px2+i); - mul_by_xy00z0_fp12(ret, ret, line); - } -} - -static void add_n_dbl_n(vec384fp12 ret, POINTonE2 T[], - const POINTonE2_affine Q[], - const POINTonE1_affine Px2[], - size_t n, size_t k) -{ - size_t i; - vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ - - for (i = 0; i < n; i++) { - line_add(line, T+i, T+i, Q+i); line_by_Px2(line, Px2+i); - mul_by_xy00z0_fp12(ret, ret, line); - } - while (k--) { - sqr_fp12(ret, ret); - for (i = 0; i < n; i++) { - line_dbl(line, T+i, T+i); line_by_Px2(line, Px2+i); - mul_by_xy00z0_fp12(ret, ret, line); - } - } -} - -static void miller_loop_n(vec384fp12 ret, const POINTonE2_affine Q[], - const POINTonE1_affine P[], size_t n) -{ -#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ - || defined(__STDC_NO_VLA__) - POINTonE2 *T = alloca(n*sizeof(POINTonE2)); - POINTonE1_affine *Px2 = alloca(n*sizeof(POINTonE1_affine)); -#else - POINTonE2 T[n]; - POINTonE1_affine Px2[n]; -#endif - size_t i; - - if ((n == 1) && (vec_is_zero(&Q[0], sizeof(Q[0])) | - vec_is_zero(&P[0], sizeof(P[0]))) ) { - /* - * Special case of infinite aggregated signature, pair the additive - * group's identity with the multiplicative group's identity. - */ - vec_copy(ret, BLS12_381_Rx.p12, sizeof(vec384fp12)); - return; - } - - for (i = 0; i < n; i++) { - /* Move common expression from line evaluation to line_by_Px2. 
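(Px2[i] caches -2*P[i].x and 2*P[i].y once per pair, so line_by_Px2 folds the affine point into each evaluated line with plain base-field multiplications by precomputed constants.)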
*/ - add_fp(Px2[i].X, P[i].X, P[i].X); - neg_fp(Px2[i].X, Px2[i].X); - add_fp(Px2[i].Y, P[i].Y, P[i].Y); - - vec_copy(T[i].X, Q[i].X, 2*sizeof(T[i].X)); - vec_copy(T[i].Z, BLS12_381_Rx.p2, sizeof(T[i].Z)); - } - - /* first step is ret = 1^2*line, which is replaced with ret = line */ - start_dbl_n(ret, T, Px2, n); /* 0x2 */ - add_n_dbl_n(ret, T, Q, Px2, n, 2); /* ..0xc */ - add_n_dbl_n(ret, T, Q, Px2, n, 3); /* ..0x68 */ - add_n_dbl_n(ret, T, Q, Px2, n, 9); /* ..0xd200 */ - add_n_dbl_n(ret, T, Q, Px2, n, 32); /* ..0xd20100000000 */ - add_n_dbl_n(ret, T, Q, Px2, n, 16); /* ..0xd201000000010000 */ - conjugate_fp12(ret); /* account for z being negative */ -} - -static void pre_add_n_dbl(vec384fp6 lines[], POINTonE2 *T, - const POINTonE2_affine *Q, - size_t n) -{ - line_add(lines++[0], T, T, Q); - while (n--) - line_dbl(lines++[0], T, T); -} - -static void precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q) -{ - POINTonE2 T[1]; - - vec_copy(T->X, Q->X, 2*sizeof(T->X)); - vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z)); - - line_dbl(Qlines[0], T, T); /* 0x2 */ - pre_add_n_dbl(&Qlines[1], T, Q, 2); /* ..0xc */ - pre_add_n_dbl(&Qlines[4], T, Q, 3); /* ..0x68 */ - pre_add_n_dbl(&Qlines[8], T, Q, 9); /* ..0xd200 */ - pre_add_n_dbl(&Qlines[18], T, Q, 32); /* ..0xd20100000000 */ - pre_add_n_dbl(&Qlines[51], T, Q, 16); /* ..0xd201000000010000 */ -} - -static void post_line_by_Px2(vec384fp6 out, const vec384fp6 in, - const POINTonE1_affine *Px2) -{ - vec_copy(out[0], in[0], sizeof(out[0])); - - mul_fp(out[1][0], in[1][0], Px2->X); /* "b01" *= -2*P->X */ - mul_fp(out[1][1], in[1][1], Px2->X); - - mul_fp(out[2][0], in[2][0], Px2->Y); /* "b11" *= 2*P->Y */ - mul_fp(out[2][1], in[2][1], Px2->Y); -} - -static void post_add_n_dbl(vec384fp12 ret, const vec384fp6 lines[], - const POINTonE1_affine *Px2, size_t n) -{ - vec384fp6 line; - - post_line_by_Px2(line, lines++[0], Px2); - mul_by_xy00z0_fp12(ret, ret, line); - while (n--) { - sqr_fp12(ret, ret); - post_line_by_Px2(line, lines++[0], Px2); - mul_by_xy00z0_fp12(ret, ret, line); - } -} - -static void miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68], - const POINTonE1_affine *P) -{ - POINTonE1_affine Px2[1]; - vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ - - /* Move common expression from line evaluation to line_by_Px2. 
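(The 68 cached lines cover the complete loop: 1+3+4+10+33+17 entries, one for each of the 63 doubling and 5 addition steps that trace |z| = 0xd201000000010000.)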
*/ - add_fp(Px2->X, P->X, P->X); - neg_fp(Px2->X, Px2->X); - add_fp(Px2->Y, P->Y, P->Y); - - /* first step is ret = 1^2*line, which is replaced with ret = line */ - post_line_by_Px2(line, Qlines[0], Px2); /* 0x2 */ - vec_zero(ret, sizeof(vec384fp12)); - vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); - vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); - post_add_n_dbl(ret, &Qlines[1], Px2, 2); /* ..0xc */ - post_add_n_dbl(ret, &Qlines[4], Px2, 3); /* ..0x68 */ - post_add_n_dbl(ret, &Qlines[8], Px2, 9); /* ..0xd200 */ - post_add_n_dbl(ret, &Qlines[18], Px2, 32); /* ..0xd20100000000 */ - post_add_n_dbl(ret, &Qlines[51], Px2, 16); /* ..0xd201000000010000 */ - conjugate_fp12(ret); /* account for z being negative */ -} - -#ifdef INTERNAL_TESTMODE -static void miller_loop_alt(vec384fp12 ret, const POINTonE2_affine *Q, - const POINTonE1_affine *P) -{ - vec384fp6 lines[68]; - - precompute_lines(lines, Q); - miller_loop_lines(ret, lines, P); -} -#endif - -static void mul_n_sqr(vec384fp12 ret, const vec384fp12 a, size_t n) -{ - mul_fp12(ret, ret, a); - while (n--) - cyclotomic_sqr_fp12(ret, ret); -} - -static void raise_to_z_div_by_2(vec384fp12 ret, const vec384fp12 a) -{ - cyclotomic_sqr_fp12(ret, a); /* 0x2 */ - mul_n_sqr(ret, a, 2); /* ..0xc */ - mul_n_sqr(ret, a, 3); /* ..0x68 */ - mul_n_sqr(ret, a, 9); /* ..0xd200 */ - mul_n_sqr(ret, a, 32); /* ..0xd20100000000 */ - mul_n_sqr(ret, a, 16-1); /* ..0x6900800000008000 */ - conjugate_fp12(ret); /* account for z being negative */ -} - -#define raise_to_z(a, b) (raise_to_z_div_by_2(a, b), cyclotomic_sqr_fp12(a, a)) - -/* - * Adaptation from /pairing/src/bls12_381/mod.rs - */ -static void final_exp(vec384fp12 ret, const vec384fp12 f) -{ - vec384fp12 y0, y1, y2, y3; - - vec_copy(y1, f, sizeof(y1)); - conjugate_fp12(y1); - inverse_fp12(y2, f); - mul_fp12(ret, y1, y2); - frobenius_map_fp12(y2, ret, 2); - mul_fp12(ret, ret, y2); - - cyclotomic_sqr_fp12(y0, ret); - raise_to_z(y1, y0); - raise_to_z_div_by_2(y2, y1); - vec_copy(y3, ret, sizeof(y3)); - conjugate_fp12(y3); - mul_fp12(y1, y1, y3); - conjugate_fp12(y1); - mul_fp12(y1, y1, y2); - raise_to_z(y2, y1); - raise_to_z(y3, y2); - conjugate_fp12(y1); - mul_fp12(y3, y3, y1); - conjugate_fp12(y1); - frobenius_map_fp12(y1, y1, 3); - frobenius_map_fp12(y2, y2, 2); - mul_fp12(y1, y1, y2); - raise_to_z(y2, y3); - mul_fp12(y2, y2, y0); - mul_fp12(y2, y2, ret); - mul_fp12(y1, y1, y2); - frobenius_map_fp12(y2, y3, 1); - mul_fp12(ret, y1, y2); -} - -void blst_miller_loop(vec384fp12 ret, const POINTonE2_affine *Q, - const POINTonE1_affine *P) -{ miller_loop_n(ret, Q ? Q : (const POINTonE2_affine *)&BLS12_381_G2, - P ? P : (const POINTonE1_affine *)&BLS12_381_G1, 1); -} - -#ifndef MILLER_LOOP_N_MAX -# define MILLER_LOOP_N_MAX 16 -#endif - -void blst_miller_loop_n(vec384fp12 out, const POINTonE2_affine *const Qs[], - const POINTonE1_affine *const Ps[], - size_t n) -{ /* ~10KB of stack storage */ - POINTonE2 T[MILLER_LOOP_N_MAX]; - POINTonE2_affine Q[MILLER_LOOP_N_MAX]; - POINTonE1_affine Px2[MILLER_LOOP_N_MAX]; - const POINTonE2_affine *Qptr = NULL; - const POINTonE1_affine *Pptr = NULL; - size_t i, j; - - for (i = 0, j = 0; j < n; j++) { - Qptr = *Qs ? *Qs++ : Qptr+1; - Pptr = *Ps ? *Ps++ : Pptr+1; - - /* Move common expression from line evaluation to line_by_Px2. 
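(Note the pointer-walking convention above: a NULL entry in Qs[] or Ps[] stands for the element contiguous with the previously used one, so a caller may pass a single pointer to a packed array of points.)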
*/ - add_fp(Px2[i].X, Pptr->X, Pptr->X); - neg_fp(Px2[i].X, Px2[i].X); - add_fp(Px2[i].Y, Pptr->Y, Pptr->Y); - - vec_copy(Q[i].X, Qptr->X, 2*sizeof(Q[i].X)); - vec_copy(T[i].X, Qptr->X, 2*sizeof(T[i].X)); - vec_copy(T[i].Z, BLS12_381_Rx.p2, sizeof(T[i].Z)); - - if (++i == MILLER_LOOP_N_MAX || j == n-1) { - vec384fp12 tmp; - vec384fp6 *ret = j < MILLER_LOOP_N_MAX ? out : tmp; - - /* first step is ret = 1^2*line, which is just ret = line */ - start_dbl_n(ret, T, Px2, i); /* 0x2 */ - add_n_dbl_n(ret, T, Q, Px2, i, 2); /* ..0xc */ - add_n_dbl_n(ret, T, Q, Px2, i, 3); /* ..0x68 */ - add_n_dbl_n(ret, T, Q, Px2, i, 9); /* ..0xd200 */ - add_n_dbl_n(ret, T, Q, Px2, i, 32); /* ..0xd20100000000 */ - add_n_dbl_n(ret, T, Q, Px2, i, 16); /* ..0xd201000000010000 */ - conjugate_fp12(ret); /* account for z being negative */ - - if (j >= MILLER_LOOP_N_MAX) - mul_fp12(out, out, ret); - - i = 0; - } - } -} - -void blst_final_exp(vec384fp12 ret, const vec384fp12 f) -{ final_exp(ret, f); } - -void blst_precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q) -{ precompute_lines(Qlines, Q); } - -void blst_miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68], - const POINTonE1_affine *P) -{ miller_loop_lines(ret, Qlines, P); } - -static bool_t is_cyclotomic(const vec384fp12 f) -{ - vec384fp12 a, b; - - frobenius_map_fp12(a, f, 2); - frobenius_map_fp12(b, a, 2); - mul_fp12(b, b, f); - - return vec_is_equal(a, b, sizeof(a)); -} - -int blst_fp12_in_group(const vec384fp12 f) -{ - vec384fp12 a, b; - - if (vec_is_zero(f, sizeof(vec384fp12)) || !is_cyclotomic(f)) - return 0; - - frobenius_map_fp12(a, f, 1); - raise_to_z(b, f); - - return (int)vec_is_equal(a, b, sizeof(a)); -} diff --git a/crypto/blst_src/pentaroot-addchain.h b/crypto/blst_src/pentaroot-addchain.h deleted file mode 100644 index 5bdd9ddf7f7..00000000000 --- a/crypto/blst_src/pentaroot-addchain.h +++ /dev/null @@ -1,333 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ -/* - * The "magic" number is 1/5 modulo BLS12_381_r-1. Exponentiation to which - * yields 5th root of the base. 
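 - * Writing k for that constant, 5*k = 1 + m*(r-1) for some integer m, so by
 - * Fermat's little theorem (x^k)^5 = x * (x^(r-1))^m = x for every nonzero
 - * x in F_r, and trivially for x = 0.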
- * - * Generated with 'addchain 20974350070050476191779096203274386335076221000211055129041463479975432473805' - * https://github.com/kwantam/addchain - * # Bos-Coster (win=4) : 307 (15) - * # Bos-Coster (win=10) : 307 (18) - * # Yacobi : 319 (16) - * # Bos-Coster (win=2) : 319 ( 5) - * # Bos-Coster (win=5) : 306 (19) <<< - * # Bos-Coster (win=7) : 311 (22) - * # Bos-Coster (win=9) : 313 (20) - * # Bos-Coster (win=3) : 314 ( 9) - * # Bos-Coster (win=6) : 309 (21) - * # Bos-Coster (win=8) : 309 (23) - * # Bergeron-Berstel-Brlek-Duboc : 334 ( 5) - */ - -#define PENTAROOT_MOD_BLS12_381_r(out, inp, ptype) do { \ -ptype t[19]; \ -vec_copy(t[1], inp, sizeof(ptype)); /* 0: 1 */\ -sqr(t[7], t[1]); /* 1: 2 */\ -sqr(t[0], t[7]); /* 2: 4 */\ -sqr(t[2], t[0]); /* 3: 8 */\ -mul(t[10], t[2], t[1]); /* 4: 9 */\ -mul(t[3], t[10], t[7]); /* 5: b */\ -mul(t[1], t[10], t[0]); /* 6: d */\ -mul(t[5], t[3], t[0]); /* 7: f */\ -mul(t[9], t[10], t[2]); /* 8: 11 */\ -mul(t[4], t[3], t[2]); /* 9: 13 */\ -mul(t[15], t[5], t[2]); /* 10: 17 */\ -mul(t[8], t[15], t[2]); /* 11: 1f */\ -mul(t[13], t[8], t[7]); /* 12: 21 */\ -mul(t[14], t[8], t[0]); /* 13: 23 */\ -mul(t[12], t[13], t[0]); /* 14: 25 */\ -mul(t[6], t[8], t[2]); /* 15: 27 */\ -mul(t[11], t[14], t[2]); /* 16: 2b */\ -sqr(t[0], t[15]); /* 17: 2e */\ -mul(t[18], t[6], t[2]); /* 18: 2f */\ -mul(t[2], t[11], t[2]); /* 19: 33 */\ -mul(t[16], t[2], t[7]); /* 20: 35 */\ -mul(t[7], t[0], t[3]); /* 21: 39 */\ -mul(t[17], t[0], t[5]); /* 22: 3d */\ -/* sqr(t[0], t[0]); */ /* 23: 5c */\ -/* sqr(t[0], t[0]); */ /* 24: b8 */\ -/* sqr(t[0], t[0]); */ /* 25: 170 */\ -/* sqr(t[0], t[0]); */ /* 26: 2e0 */\ -/* sqr(t[0], t[0]); */ /* 27: 5c0 */\ -/* sqr(t[0], t[0]); */ /* 28: b80 */\ -/* sqr(t[0], t[0]); */ /* 29: 1700 */\ -sqr_n_mul(t[0], t[0], 7, t[18]); /* 30: 172f */\ -/* sqr(t[0], t[0]); */ /* 31: 2e5e */\ -/* sqr(t[0], t[0]); */ /* 32: 5cbc */\ -/* sqr(t[0], t[0]); */ /* 33: b978 */\ -/* sqr(t[0], t[0]); */ /* 34: 172f0 */\ -/* sqr(t[0], t[0]); */ /* 35: 2e5e0 */\ -/* sqr(t[0], t[0]); */ /* 36: 5cbc0 */\ -sqr_n_mul(t[0], t[0], 6, t[13]); /* 37: 5cbe1 */\ -/* sqr(t[0], t[0]); */ /* 38: b97c2 */\ -/* sqr(t[0], t[0]); */ /* 39: 172f84 */\ -/* sqr(t[0], t[0]); */ /* 40: 2e5f08 */\ -/* sqr(t[0], t[0]); */ /* 41: 5cbe10 */\ -/* sqr(t[0], t[0]); */ /* 42: b97c20 */\ -/* sqr(t[0], t[0]); */ /* 43: 172f840 */\ -sqr_n_mul(t[0], t[0], 6, t[17]); /* 44: 172f87d */\ -/* sqr(t[0], t[0]); */ /* 45: 2e5f0fa */\ -/* sqr(t[0], t[0]); */ /* 46: 5cbe1f4 */\ -/* sqr(t[0], t[0]); */ /* 47: b97c3e8 */\ -/* sqr(t[0], t[0]); */ /* 48: 172f87d0 */\ -/* sqr(t[0], t[0]); */ /* 49: 2e5f0fa0 */\ -/* sqr(t[0], t[0]); */ /* 50: 5cbe1f40 */\ -sqr_n_mul(t[0], t[0], 6, t[16]); /* 51: 5cbe1f75 */\ -/* sqr(t[0], t[0]); */ /* 52: b97c3eea */\ -/* sqr(t[0], t[0]); */ /* 53: 172f87dd4 */\ -/* sqr(t[0], t[0]); */ /* 54: 2e5f0fba8 */\ -/* sqr(t[0], t[0]); */ /* 55: 5cbe1f750 */\ -/* sqr(t[0], t[0]); */ /* 56: b97c3eea0 */\ -sqr_n_mul(t[0], t[0], 5, t[15]); /* 57: b97c3eeb7 */\ -/* sqr(t[0], t[0]); */ /* 58: 172f87dd6e */\ -/* sqr(t[0], t[0]); */ /* 59: 2e5f0fbadc */\ -/* sqr(t[0], t[0]); */ /* 60: 5cbe1f75b8 */\ -/* sqr(t[0], t[0]); */ /* 61: b97c3eeb70 */\ -/* sqr(t[0], t[0]); */ /* 62: 172f87dd6e0 */\ -/* sqr(t[0], t[0]); */ /* 63: 2e5f0fbadc0 */\ -sqr_n_mul(t[0], t[0], 6, t[15]); /* 64: 2e5f0fbadd7 */\ -/* sqr(t[0], t[0]); */ /* 65: 5cbe1f75bae */\ -/* sqr(t[0], t[0]); */ /* 66: b97c3eeb75c */\ -/* sqr(t[0], t[0]); */ /* 67: 172f87dd6eb8 */\ -/* sqr(t[0], t[0]); */ /* 68: 2e5f0fbadd70 */\ -/* sqr(t[0], 
t[0]); */ /* 69: 5cbe1f75bae0 */\ -/* sqr(t[0], t[0]); */ /* 70: b97c3eeb75c0 */\ -/* sqr(t[0], t[0]); */ /* 71: 172f87dd6eb80 */\ -/* sqr(t[0], t[0]); */ /* 72: 2e5f0fbadd700 */\ -sqr_n_mul(t[0], t[0], 8, t[14]); /* 73: 2e5f0fbadd723 */\ -/* sqr(t[0], t[0]); */ /* 74: 5cbe1f75bae46 */\ -/* sqr(t[0], t[0]); */ /* 75: b97c3eeb75c8c */\ -/* sqr(t[0], t[0]); */ /* 76: 172f87dd6eb918 */\ -/* sqr(t[0], t[0]); */ /* 77: 2e5f0fbadd7230 */\ -/* sqr(t[0], t[0]); */ /* 78: 5cbe1f75bae460 */\ -/* sqr(t[0], t[0]); */ /* 79: b97c3eeb75c8c0 */\ -/* sqr(t[0], t[0]); */ /* 80: 172f87dd6eb9180 */\ -/* sqr(t[0], t[0]); */ /* 81: 2e5f0fbadd72300 */\ -sqr_n_mul(t[0], t[0], 8, t[13]); /* 82: 2e5f0fbadd72321 */\ -/* sqr(t[0], t[0]); */ /* 83: 5cbe1f75bae4642 */\ -/* sqr(t[0], t[0]); */ /* 84: b97c3eeb75c8c84 */\ -/* sqr(t[0], t[0]); */ /* 85: 172f87dd6eb91908 */\ -/* sqr(t[0], t[0]); */ /* 86: 2e5f0fbadd723210 */\ -/* sqr(t[0], t[0]); */ /* 87: 5cbe1f75bae46420 */\ -/* sqr(t[0], t[0]); */ /* 88: b97c3eeb75c8c840 */\ -sqr_n_mul(t[0], t[0], 6, t[2]); /* 89: b97c3eeb75c8c873 */\ -/* sqr(t[0], t[0]); */ /* 90: 172f87dd6eb9190e6 */\ -/* sqr(t[0], t[0]); */ /* 91: 2e5f0fbadd72321cc */\ -/* sqr(t[0], t[0]); */ /* 92: 5cbe1f75bae464398 */\ -/* sqr(t[0], t[0]); */ /* 93: b97c3eeb75c8c8730 */\ -/* sqr(t[0], t[0]); */ /* 94: 172f87dd6eb9190e60 */\ -/* sqr(t[0], t[0]); */ /* 95: 2e5f0fbadd72321cc0 */\ -sqr_n_mul(t[0], t[0], 6, t[13]); /* 96: 2e5f0fbadd72321ce1 */\ -/* sqr(t[0], t[0]); */ /* 97: 5cbe1f75bae46439c2 */\ -/* sqr(t[0], t[0]); */ /* 98: b97c3eeb75c8c87384 */\ -/* sqr(t[0], t[0]); */ /* 99: 172f87dd6eb9190e708 */\ -/* sqr(t[0], t[0]); */ /* 100: 2e5f0fbadd72321ce10 */\ -/* sqr(t[0], t[0]); */ /* 101: 5cbe1f75bae46439c20 */\ -/* sqr(t[0], t[0]); */ /* 102: b97c3eeb75c8c873840 */\ -/* sqr(t[0], t[0]); */ /* 103: 172f87dd6eb9190e7080 */\ -sqr_n_mul(t[0], t[0], 7, t[12]); /* 104: 172f87dd6eb9190e70a5 */\ -/* sqr(t[0], t[0]); */ /* 105: 2e5f0fbadd72321ce14a */\ -/* sqr(t[0], t[0]); */ /* 106: 5cbe1f75bae46439c294 */\ -/* sqr(t[0], t[0]); */ /* 107: b97c3eeb75c8c8738528 */\ -/* sqr(t[0], t[0]); */ /* 108: 172f87dd6eb9190e70a50 */\ -/* sqr(t[0], t[0]); */ /* 109: 2e5f0fbadd72321ce14a0 */\ -/* sqr(t[0], t[0]); */ /* 110: 5cbe1f75bae46439c2940 */\ -/* sqr(t[0], t[0]); */ /* 111: b97c3eeb75c8c87385280 */\ -/* sqr(t[0], t[0]); */ /* 112: 172f87dd6eb9190e70a500 */\ -sqr_n_mul(t[0], t[0], 8, t[11]); /* 113: 172f87dd6eb9190e70a52b */\ -/* sqr(t[0], t[0]); */ /* 114: 2e5f0fbadd72321ce14a56 */\ -/* sqr(t[0], t[0]); */ /* 115: 5cbe1f75bae46439c294ac */\ -/* sqr(t[0], t[0]); */ /* 116: b97c3eeb75c8c873852958 */\ -/* sqr(t[0], t[0]); */ /* 117: 172f87dd6eb9190e70a52b0 */\ -/* sqr(t[0], t[0]); */ /* 118: 2e5f0fbadd72321ce14a560 */\ -/* sqr(t[0], t[0]); */ /* 119: 5cbe1f75bae46439c294ac0 */\ -sqr_n_mul(t[0], t[0], 6, t[1]); /* 120: 5cbe1f75bae46439c294acd */\ -/* sqr(t[0], t[0]); */ /* 121: b97c3eeb75c8c873852959a */\ -/* sqr(t[0], t[0]); */ /* 122: 172f87dd6eb9190e70a52b34 */\ -/* sqr(t[0], t[0]); */ /* 123: 2e5f0fbadd72321ce14a5668 */\ -/* sqr(t[0], t[0]); */ /* 124: 5cbe1f75bae46439c294acd0 */\ -/* sqr(t[0], t[0]); */ /* 125: b97c3eeb75c8c873852959a0 */\ -/* sqr(t[0], t[0]); */ /* 126: 172f87dd6eb9190e70a52b340 */\ -/* sqr(t[0], t[0]); */ /* 127: 2e5f0fbadd72321ce14a56680 */\ -/* sqr(t[0], t[0]); */ /* 128: 5cbe1f75bae46439c294acd00 */\ -sqr_n_mul(t[0], t[0], 8, t[2]); /* 129: 5cbe1f75bae46439c294acd33 */\ -/* sqr(t[0], t[0]); */ /* 130: b97c3eeb75c8c873852959a66 */\ -/* sqr(t[0], t[0]); */ /* 131: 172f87dd6eb9190e70a52b34cc */\ 
-/* sqr(t[0], t[0]); */ /* 132: 2e5f0fbadd72321ce14a566998 */\ -/* sqr(t[0], t[0]); */ /* 133: 5cbe1f75bae46439c294acd330 */\ -/* sqr(t[0], t[0]); */ /* 134: b97c3eeb75c8c873852959a660 */\ -/* sqr(t[0], t[0]); */ /* 135: 172f87dd6eb9190e70a52b34cc0 */\ -sqr_n_mul(t[0], t[0], 6, t[11]); /* 136: 172f87dd6eb9190e70a52b34ceb */\ -/* sqr(t[0], t[0]); */ /* 137: 2e5f0fbadd72321ce14a56699d6 */\ -/* sqr(t[0], t[0]); */ /* 138: 5cbe1f75bae46439c294acd33ac */\ -/* sqr(t[0], t[0]); */ /* 139: b97c3eeb75c8c873852959a6758 */\ -/* sqr(t[0], t[0]); */ /* 140: 172f87dd6eb9190e70a52b34ceb0 */\ -sqr_n_mul(t[0], t[0], 4, t[10]); /* 141: 172f87dd6eb9190e70a52b34ceb9 */\ -/* sqr(t[0], t[0]); */ /* 142: 2e5f0fbadd72321ce14a56699d72 */\ -/* sqr(t[0], t[0]); */ /* 143: 5cbe1f75bae46439c294acd33ae4 */\ -/* sqr(t[0], t[0]); */ /* 144: b97c3eeb75c8c873852959a675c8 */\ -/* sqr(t[0], t[0]); */ /* 145: 172f87dd6eb9190e70a52b34ceb90 */\ -/* sqr(t[0], t[0]); */ /* 146: 2e5f0fbadd72321ce14a56699d720 */\ -sqr_n_mul(t[0], t[0], 5, t[8]); /* 147: 2e5f0fbadd72321ce14a56699d73f */\ -/* sqr(t[0], t[0]); */ /* 148: 5cbe1f75bae46439c294acd33ae7e */\ -/* sqr(t[0], t[0]); */ /* 149: b97c3eeb75c8c873852959a675cfc */\ -/* sqr(t[0], t[0]); */ /* 150: 172f87dd6eb9190e70a52b34ceb9f8 */\ -/* sqr(t[0], t[0]); */ /* 151: 2e5f0fbadd72321ce14a56699d73f0 */\ -/* sqr(t[0], t[0]); */ /* 152: 5cbe1f75bae46439c294acd33ae7e0 */\ -/* sqr(t[0], t[0]); */ /* 153: b97c3eeb75c8c873852959a675cfc0 */\ -/* sqr(t[0], t[0]); */ /* 154: 172f87dd6eb9190e70a52b34ceb9f80 */\ -/* sqr(t[0], t[0]); */ /* 155: 2e5f0fbadd72321ce14a56699d73f00 */\ -/* sqr(t[0], t[0]); */ /* 156: 5cbe1f75bae46439c294acd33ae7e00 */\ -/* sqr(t[0], t[0]); */ /* 157: b97c3eeb75c8c873852959a675cfc00 */\ -/* sqr(t[0], t[0]); */ /* 158: 172f87dd6eb9190e70a52b34ceb9f800 */\ -/* sqr(t[0], t[0]); */ /* 159: 2e5f0fbadd72321ce14a56699d73f000 */\ -/* sqr(t[0], t[0]); */ /* 160: 5cbe1f75bae46439c294acd33ae7e000 */\ -/* sqr(t[0], t[0]); */ /* 161: b97c3eeb75c8c873852959a675cfc000 */\ -/* sqr(t[0], t[0]); */ /* 162: 172f87dd6eb9190e70a52b34ceb9f8000 */\ -sqr_n_mul(t[0], t[0], 15, t[9]); /* 163: 172f87dd6eb9190e70a52b34ceb9f8011 */\ -/* sqr(t[0], t[0]); */ /* 164: 2e5f0fbadd72321ce14a56699d73f0022 */\ -/* sqr(t[0], t[0]); */ /* 165: 5cbe1f75bae46439c294acd33ae7e0044 */\ -/* sqr(t[0], t[0]); */ /* 166: b97c3eeb75c8c873852959a675cfc0088 */\ -/* sqr(t[0], t[0]); */ /* 167: 172f87dd6eb9190e70a52b34ceb9f80110 */\ -/* sqr(t[0], t[0]); */ /* 168: 2e5f0fbadd72321ce14a56699d73f00220 */\ -/* sqr(t[0], t[0]); */ /* 169: 5cbe1f75bae46439c294acd33ae7e00440 */\ -/* sqr(t[0], t[0]); */ /* 170: b97c3eeb75c8c873852959a675cfc00880 */\ -/* sqr(t[0], t[0]); */ /* 171: 172f87dd6eb9190e70a52b34ceb9f801100 */\ -sqr_n_mul(t[0], t[0], 8, t[3]); /* 172: 172f87dd6eb9190e70a52b34ceb9f80110b */\ -/* sqr(t[0], t[0]); */ /* 173: 2e5f0fbadd72321ce14a56699d73f002216 */\ -/* sqr(t[0], t[0]); */ /* 174: 5cbe1f75bae46439c294acd33ae7e00442c */\ -/* sqr(t[0], t[0]); */ /* 175: b97c3eeb75c8c873852959a675cfc008858 */\ -/* sqr(t[0], t[0]); */ /* 176: 172f87dd6eb9190e70a52b34ceb9f80110b0 */\ -/* sqr(t[0], t[0]); */ /* 177: 2e5f0fbadd72321ce14a56699d73f0022160 */\ -sqr_n_mul(t[0], t[0], 5, t[8]); /* 178: 2e5f0fbadd72321ce14a56699d73f002217f */\ -/* sqr(t[0], t[0]); */ /* 179: 5cbe1f75bae46439c294acd33ae7e00442fe */\ -/* sqr(t[0], t[0]); */ /* 180: b97c3eeb75c8c873852959a675cfc00885fc */\ -/* sqr(t[0], t[0]); */ /* 181: 172f87dd6eb9190e70a52b34ceb9f80110bf8 */\ -/* sqr(t[0], t[0]); */ /* 182: 2e5f0fbadd72321ce14a56699d73f002217f0 */\ -/* 
sqr(t[0], t[0]); */ /* 183: 5cbe1f75bae46439c294acd33ae7e00442fe0 */\ -/* sqr(t[0], t[0]); */ /* 184: b97c3eeb75c8c873852959a675cfc00885fc0 */\ -/* sqr(t[0], t[0]); */ /* 185: 172f87dd6eb9190e70a52b34ceb9f80110bf80 */\ -/* sqr(t[0], t[0]); */ /* 186: 2e5f0fbadd72321ce14a56699d73f002217f00 */\ -/* sqr(t[0], t[0]); */ /* 187: 5cbe1f75bae46439c294acd33ae7e00442fe00 */\ -/* sqr(t[0], t[0]); */ /* 188: b97c3eeb75c8c873852959a675cfc00885fc00 */\ -sqr_n_mul(t[0], t[0], 10, t[7]); /* 189: b97c3eeb75c8c873852959a675cfc00885fc39 */\ -/* sqr(t[0], t[0]); */ /* 190: 172f87dd6eb9190e70a52b34ceb9f80110bf872 */\ -/* sqr(t[0], t[0]); */ /* 191: 2e5f0fbadd72321ce14a56699d73f002217f0e4 */\ -/* sqr(t[0], t[0]); */ /* 192: 5cbe1f75bae46439c294acd33ae7e00442fe1c8 */\ -/* sqr(t[0], t[0]); */ /* 193: b97c3eeb75c8c873852959a675cfc00885fc390 */\ -/* sqr(t[0], t[0]); */ /* 194: 172f87dd6eb9190e70a52b34ceb9f80110bf8720 */\ -/* sqr(t[0], t[0]); */ /* 195: 2e5f0fbadd72321ce14a56699d73f002217f0e40 */\ -sqr_n_mul(t[0], t[0], 6, t[6]); /* 196: 2e5f0fbadd72321ce14a56699d73f002217f0e67 */\ -/* sqr(t[0], t[0]); */ /* 197: 5cbe1f75bae46439c294acd33ae7e00442fe1cce */\ -/* sqr(t[0], t[0]); */ /* 198: b97c3eeb75c8c873852959a675cfc00885fc399c */\ -/* sqr(t[0], t[0]); */ /* 199: 172f87dd6eb9190e70a52b34ceb9f80110bf87338 */\ -/* sqr(t[0], t[0]); */ /* 200: 2e5f0fbadd72321ce14a56699d73f002217f0e670 */\ -/* sqr(t[0], t[0]); */ /* 201: 5cbe1f75bae46439c294acd33ae7e00442fe1cce0 */\ -sqr_n_mul(t[0], t[0], 5, t[4]); /* 202: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3 */\ -/* sqr(t[0], t[0]); */ /* 203: b97c3eeb75c8c873852959a675cfc00885fc399e6 */\ -/* sqr(t[0], t[0]); */ /* 204: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cc */\ -/* sqr(t[0], t[0]); */ /* 205: 2e5f0fbadd72321ce14a56699d73f002217f0e6798 */\ -/* sqr(t[0], t[0]); */ /* 206: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf30 */\ -/* sqr(t[0], t[0]); */ /* 207: b97c3eeb75c8c873852959a675cfc00885fc399e60 */\ -/* sqr(t[0], t[0]); */ /* 208: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cc0 */\ -/* sqr(t[0], t[0]); */ /* 209: 2e5f0fbadd72321ce14a56699d73f002217f0e67980 */\ -/* sqr(t[0], t[0]); */ /* 210: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf300 */\ -sqr_n_mul(t[0], t[0], 8, t[2]); /* 211: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf333 */\ -/* sqr(t[0], t[0]); */ /* 212: b97c3eeb75c8c873852959a675cfc00885fc399e666 */\ -/* sqr(t[0], t[0]); */ /* 213: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc */\ -/* sqr(t[0], t[0]); */ /* 214: 2e5f0fbadd72321ce14a56699d73f002217f0e679998 */\ -/* sqr(t[0], t[0]); */ /* 215: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3330 */\ -/* sqr(t[0], t[0]); */ /* 216: b97c3eeb75c8c873852959a675cfc00885fc399e6660 */\ -/* sqr(t[0], t[0]); */ /* 217: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc0 */\ -/* sqr(t[0], t[0]); */ /* 218: 2e5f0fbadd72321ce14a56699d73f002217f0e6799980 */\ -sqr_n_mul(t[0], t[0], 7, t[5]); /* 219: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f */\ -/* sqr(t[0], t[0]); */ /* 220: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e */\ -/* sqr(t[0], t[0]); */ /* 221: b97c3eeb75c8c873852959a675cfc00885fc399e6663c */\ -/* sqr(t[0], t[0]); */ /* 222: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78 */\ -/* sqr(t[0], t[0]); */ /* 223: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f0 */\ -/* sqr(t[0], t[0]); */ /* 224: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e0 */\ -/* sqr(t[0], t[0]); */ /* 225: b97c3eeb75c8c873852959a675cfc00885fc399e6663c0 */\ -/* sqr(t[0], t[0]); */ /* 226: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc780 */\ -/* sqr(t[0], t[0]); */ /* 227: 
2e5f0fbadd72321ce14a56699d73f002217f0e679998f00 */\ -/* sqr(t[0], t[0]); */ /* 228: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e00 */\ -sqr_n_mul(t[0], t[0], 9, t[2]); /* 229: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33 */\ -/* sqr(t[0], t[0]); */ /* 230: b97c3eeb75c8c873852959a675cfc00885fc399e6663c66 */\ -/* sqr(t[0], t[0]); */ /* 231: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc */\ -/* sqr(t[0], t[0]); */ /* 232: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f198 */\ -/* sqr(t[0], t[0]); */ /* 233: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e330 */\ -/* sqr(t[0], t[0]); */ /* 234: b97c3eeb75c8c873852959a675cfc00885fc399e6663c660 */\ -/* sqr(t[0], t[0]); */ /* 235: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc0 */\ -/* sqr(t[0], t[0]); */ /* 236: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1980 */\ -sqr_n_mul(t[0], t[0], 7, t[4]); /* 237: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993 */\ -/* sqr(t[0], t[0]); */ /* 238: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326 */\ -/* sqr(t[0], t[0]); */ /* 239: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664c */\ -/* sqr(t[0], t[0]); */ /* 240: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc98 */\ -/* sqr(t[0], t[0]); */ /* 241: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19930 */\ -/* sqr(t[0], t[0]); */ /* 242: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33260 */\ -/* sqr(t[0], t[0]); */ /* 243: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664c0 */\ -/* sqr(t[0], t[0]); */ /* 244: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc980 */\ -/* sqr(t[0], t[0]); */ /* 245: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199300 */\ -sqr_n_mul(t[0], t[0], 8, t[2]); /* 246: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199333 */\ -/* sqr(t[0], t[0]); */ /* 247: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666 */\ -/* sqr(t[0], t[0]); */ /* 248: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccc */\ -/* sqr(t[0], t[0]); */ /* 249: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9998 */\ -/* sqr(t[0], t[0]); */ /* 250: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993330 */\ -/* sqr(t[0], t[0]); */ /* 251: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326660 */\ -/* sqr(t[0], t[0]); */ /* 252: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccc0 */\ -/* sqr(t[0], t[0]); */ /* 253: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99980 */\ -/* sqr(t[0], t[0]); */ /* 254: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933300 */\ -sqr_n_mul(t[0], t[0], 8, t[2]); /* 255: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333 */\ -/* sqr(t[0], t[0]); */ /* 256: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666 */\ -/* sqr(t[0], t[0]); */ /* 257: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccc */\ -/* sqr(t[0], t[0]); */ /* 258: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999998 */\ -/* sqr(t[0], t[0]); */ /* 259: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199333330 */\ -/* sqr(t[0], t[0]); */ /* 260: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666660 */\ -/* sqr(t[0], t[0]); */ /* 261: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccc0 */\ -/* sqr(t[0], t[0]); */ /* 262: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999980 */\ -/* sqr(t[0], t[0]); */ /* 263: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993333300 */\ -sqr_n_mul(t[0], t[0], 8, t[2]); /* 264: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993333333 */\ -/* sqr(t[0], t[0]); */ /* 265: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666 */\ -/* sqr(t[0], t[0]); */ /* 266: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccccc */\ -/* sqr(t[0], 
t[0]); */ /* 267: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999998 */\ -/* sqr(t[0], t[0]); */ /* 268: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333330 */\ -/* sqr(t[0], t[0]); */ /* 269: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666660 */\ -/* sqr(t[0], t[0]); */ /* 270: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccccc0 */\ -sqr_n_mul(t[0], t[0], 6, t[3]); /* 271: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb */\ -/* sqr(t[0], t[0]); */ /* 272: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996 */\ -/* sqr(t[0], t[0]); */ /* 273: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332c */\ -/* sqr(t[0], t[0]); */ /* 274: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666658 */\ -/* sqr(t[0], t[0]); */ /* 275: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb0 */\ -/* sqr(t[0], t[0]); */ /* 276: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999960 */\ -/* sqr(t[0], t[0]); */ /* 277: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332c0 */\ -/* sqr(t[0], t[0]); */ /* 278: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666580 */\ -/* sqr(t[0], t[0]); */ /* 279: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb00 */\ -sqr_n_mul(t[0], t[0], 8, t[2]); /* 280: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb33 */\ -/* sqr(t[0], t[0]); */ /* 281: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999999666 */\ -/* sqr(t[0], t[0]); */ /* 282: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccc */\ -/* sqr(t[0], t[0]); */ /* 283: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666665998 */\ -/* sqr(t[0], t[0]); */ /* 284: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb330 */\ -/* sqr(t[0], t[0]); */ /* 285: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996660 */\ -/* sqr(t[0], t[0]); */ /* 286: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccc0 */\ -/* sqr(t[0], t[0]); */ /* 287: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666659980 */\ -/* sqr(t[0], t[0]); */ /* 288: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3300 */\ -sqr_n_mul(t[0], t[0], 8, t[2]); /* 289: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3333 */\ -/* sqr(t[0], t[0]); */ /* 290: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999966666 */\ -/* sqr(t[0], t[0]); */ /* 291: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccc */\ -/* sqr(t[0], t[0]); */ /* 292: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666599998 */\ -/* sqr(t[0], t[0]); */ /* 293: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb33330 */\ -/* sqr(t[0], t[0]); */ /* 294: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999999666660 */\ -/* sqr(t[0], t[0]); */ /* 295: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccc0 */\ -/* sqr(t[0], t[0]); */ /* 296: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666665999980 */\ -/* sqr(t[0], t[0]); */ /* 297: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb333300 */\ -sqr_n_mul(t[0], t[0], 8, t[2]); /* 298: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb333333 */\ -/* sqr(t[0], t[0]); */ /* 299: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996666666 */\ -/* sqr(t[0], t[0]); */ /* 300: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccccc */\ -/* sqr(t[0], t[0]); */ /* 301: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666659999998 */\ -/* sqr(t[0], t[0]); */ /* 302: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3333330 */\ -/* sqr(t[0], t[0]); */ /* 303: 
172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999966666660 */\ -/* sqr(t[0], t[0]); */ /* 304: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccccc0 */\ -sqr_n_mul(out, t[0], 6, t[1]); /* 305: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332cccccccd */\ -} while(0) diff --git a/crypto/blst_src/pentaroot.c b/crypto/blst_src/pentaroot.c deleted file mode 100644 index 71f334df50a..00000000000 --- a/crypto/blst_src/pentaroot.c +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "fields.h" - -static inline void mul_fr(vec256 ret, const vec256 a, const vec256 b) -{ mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } - -static inline void sqr_fr(vec256 ret, const vec256 a) -{ sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } - -#ifdef __OPTIMIZE_SIZE__ -void blst_fr_pentaroot(vec256 out, const vec256 inp) -{ - static const byte pow[] = { - TO_BYTES(0x33333332cccccccd), TO_BYTES(0x217f0e679998f199), - TO_BYTES(0xe14a56699d73f002), TO_BYTES(0x2e5f0fbadd72321c) - }; - size_t pow_bits = 254; - vec256 ret; - - vec_copy(ret, inp, sizeof(ret)); /* ret = inp^1 */ - --pow_bits; /* most significant bit is set, skip over */ - while (pow_bits--) { - sqr_fr(ret, ret); - if (is_bit_set(pow, pow_bits)) - mul_fr(ret, ret, inp); - } - vec_copy(out, ret, sizeof(ret)); /* out = ret */ -} -#else -# if 0 -/* - * "255"-bit variant omits full reductions at the ends of squarings, - * not implemented yet[?]. - */ -static inline void sqr_n_mul_fr(vec256 out, const vec256 a, size_t count, - const vec256 b) -{ sqr_n_mul_mont_255(out, a, count, BLS12_381_r, r0, b); } -# else -static void sqr_n_mul_fr(vec256 out, const vec256 a, size_t count, - const vec256 b) -{ - do { - sqr_fr(out, a); - a = out; - } while (--count); - mul_fr(out, out, b); -} -# endif - -# define sqr(ret,a) sqr_fr(ret,a) -# define mul(ret,a,b) mul_fr(ret,a,b) -# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fr(ret,a,n,b) - -# include "pentaroot-addchain.h" -void blst_fr_pentaroot(vec256 out, const vec256 inp) -{ PENTAROOT_MOD_BLS12_381_r(out, inp, vec256); } -# undef PENTAROOT_MOD_BLS12_381_r - -# undef sqr_n_mul -# undef sqr -# undef mul -#endif - -void blst_fr_pentapow(vec256 out, const vec256 inp) -{ - vec256 tmp; - - sqr_fr(tmp, inp); - sqr_fr(tmp, tmp); - mul_fr(out, tmp, inp); -} diff --git a/crypto/blst_src/point.h b/crypto/blst_src/point.h deleted file mode 100644 index 0aa7379671f..00000000000 --- a/crypto/blst_src/point.h +++ /dev/null @@ -1,62 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
- * SPDX-License-Identifier: Apache-2.0
- */
-#ifndef __BLS12_381_ASM_POINT_H__
-#define __BLS12_381_ASM_POINT_H__
-
-#include "vect.h"
-#include "bytes.h"
-
-#define DECLARE_POINT(ptype, bits) \
-typedef struct { vec##bits X,Y,Z; } ptype; \
-typedef struct { vec##bits X,Y; } ptype##_affine; \
-\
-static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \
-                         const vec##bits a4); \
-static void ptype##_dadd_affine(ptype *out, const ptype *p1, \
-                                const ptype##_affine *p2); \
-static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2); \
-static void ptype##_add_affine(ptype *out, const ptype *p1, \
-                               const ptype##_affine *p2); \
-static void ptype##_double(ptype *out, const ptype *p1); \
-static void ptype##_mult_w5(ptype *out, const ptype *point, \
-                            const byte *scalar, size_t nbits); \
-static void ptype##_cneg(ptype *p, limb_t cbit); \
-static void ptype##_to_affine(ptype##_affine *out, const ptype *in); \
-static void ptype##_from_Jacobian(ptype *out, const ptype *in); \
-\
-static inline void ptype##_cswap(ptype *restrict a, \
-                                 ptype *restrict b, bool_t cbit) { \
-    vec_cswap(a, b, sizeof(ptype), cbit); \
-} \
-static inline void ptype##_ccopy(ptype *restrict a, \
-                                 const ptype *restrict b, bool_t cbit) {\
-    vec_select(a, b, a, sizeof(ptype), cbit); \
-}
-
-#define DECLARE_PRIVATE_POINTXZ(ptype, bits) \
-typedef struct { vec##bits X,Z; } ptype##xz; \
-\
-static void ptype##xz_ladder_pre(ptype##xz *out, const ptype *in); \
-static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \
-                                  const ptype##xz *p); \
-static void ptype##xz_ladder_post(ptype *ret, \
-                                  const ptype##xz *r, const ptype##xz *s, \
-                                  const ptype##xz *p, const vec##bits Y1);\
-\
-static inline void ptype##xz_cswap(ptype##xz *restrict a, \
-                                   ptype##xz *restrict b, bool_t cbit) {\
-    vec_cswap(a, b, sizeof(ptype##xz), cbit); \
-}
-
-DECLARE_POINT(POINTonE1, 384)
-
-DECLARE_POINT(POINTonE2, 384x)
-
-#ifdef __GNUC__
-# pragma GCC diagnostic ignored "-Wunused-function"
-#endif
-
-#endif
diff --git a/crypto/blst_src/rb_tree.c b/crypto/blst_src/rb_tree.c
deleted file mode 100644
index 207becdad18..00000000000
--- a/crypto/blst_src/rb_tree.c
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright Supranational LLC
- * Licensed under the Apache License, Version 2.0, see LICENSE for details.
- * SPDX-License-Identifier: Apache-2.0
- */
-
-#include <stddef.h>
-
-/*
- * Red-black tree tailored for uniqueness test. Amount of messages to be
- * checked is known prior context initialization, implementation is
- * insert-only, failure is returned if message is already in the tree.
- */
-
-struct node {
-    struct node *leafs[2];
-    const void *data;
-    size_t len_n_colour;    /* len<<1 | colour */
-};
-
-struct rb_tree {
-    struct node *root;
-    size_t n_nodes;
-    struct node nodes[1];
-};
-
-static long bytes_compare(const unsigned char *ptr0, size_t len0,
-                          const unsigned char *ptr1, size_t len1)
-{
-    size_t i, len = len0<len1 ? len0 : len1;
-    long a, b;
-
-    for (i=0; i<len; i++) {
-        if ((a = ptr0[i]) != (b = ptr1[i]))
-            return a - b;
-    }
-
-    return (long)len0 - (long)len1;
-}
-
-#define PAINT_BLACK(p) ((p)->len_n_colour &= ~(size_t)1)
-#define PAINT_RED(p) ((p)->len_n_colour |= 1)
-#define IS_RED(p) ((p)->len_n_colour & 1)
-
-static int rb_tree_insert(struct rb_tree *tree, const void *data, size_t len)
-{
-    struct node *nodes[8*sizeof(void *)];   /* visited nodes */
-    unsigned char dirs[8*sizeof(void *)];   /* taken directions */
-    size_t k = 0;                           /* walked distance */
-    struct node *p, *y, *z;
-
-    for (p = tree->root; p != NULL; k++) {
-        long cmp = bytes_compare(data, len, p->data, p->len_n_colour>>1);
-
-        if (cmp == 0)
-            return 0;   /* already in tree, no insertion */
-
-        /* record the step */
-        nodes[k] = p;
-        p = p->leafs[(dirs[k] = cmp>0)];
-    }
-
-    /* allocate new node */
-    z = &tree->nodes[tree->n_nodes++];
-    z->leafs[0] = z->leafs[1] = NULL;
-    z->data = data;
-    z->len_n_colour = len<<1;
-    PAINT_RED(z);
-
-    /* graft |z| */
-    if (k > 0)
-        nodes[k-1]->leafs[dirs[k-1]] = z;
-    else
-        tree->root = z;
-
-    /* re-balance |tree| */
-    while (k >= 2 && IS_RED(y = nodes[k-1])) {
-        size_t ydir = dirs[k-2];
-        struct node *x = nodes[k-2],        /* |z|'s grandparent */
-                    *s = x->leafs[ydir^1];  /* |z|'s uncle */
-
-        if (s != NULL && IS_RED(s)) {
-            PAINT_RED(x);
-            PAINT_BLACK(y);
-            PAINT_BLACK(s);
-            k -= 2;
-        } else {
-            if (dirs[k-1] != ydir) {
-                /*  |           |
-                 *  x           x
-                 * / \           \
-                 *  y   s   ->    z   s
-                 *   \           /
-                 *    z         y
-                 *             / \
-                 *            ?   ?
-                 */
-                struct node *t = y;
-                y = y->leafs[ydir^1];
-                t->leafs[ydir^1] = y->leafs[ydir];
-                y->leafs[ydir] = t;
-            }
-
-            /*  |           |
-             *  x           y
-             *   \         / \
-             *    y   s -> z   x
-             *   / \          / \
-             *  z   ?        ?   s
-             */
-            x->leafs[ydir] = y->leafs[ydir^1];
-            y->leafs[ydir^1] = x;
-
-            PAINT_RED(x);
-            PAINT_BLACK(y);
-
-            if (k > 2)
-                nodes[k-3]->leafs[dirs[k-3]] = y;
-            else
-                tree->root = y;
-
-            break;
-        }
-    }
-
-    PAINT_BLACK(tree->root);
-
-    return 1;
-}
-
-#undef IS_RED
-#undef PAINT_RED
-#undef PAINT_BLACK
-
-size_t blst_uniq_sizeof(size_t n_nodes)
-{   return sizeof(struct rb_tree) + sizeof(struct node)*(n_nodes-1);   }
-
-void blst_uniq_init(struct rb_tree *tree)
-{
-    tree->root = NULL;
-    tree->n_nodes = 0;
-}
-
-int blst_uniq_test(struct rb_tree *tree, const void *data, size_t len)
-{   return (int)rb_tree_insert(tree, data, len);   }
diff --git a/crypto/blst_src/recip-addchain.h b/crypto/blst_src/recip-addchain.h
deleted file mode 100644
index e4e436a3f09..00000000000
--- a/crypto/blst_src/recip-addchain.h
+++ /dev/null
@@ -1,489 +0,0 @@
-/*
- * Copyright Supranational LLC
- * Licensed under the Apache License, Version 2.0, see LICENSE for details.
- * SPDX-License-Identifier: Apache-2.0
- */
-/*
- * The "magic" number is BLS12_381_P-2. Exponentiation to which yields
- * reciprocal to input base.
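The identity this header hard-codes is Fermat's little theorem: for prime p and non-zero a, a^(p-1) = 1 (mod p), hence a^(p-2) = a^(-1) (mod p). A minimal stand-alone sketch of the same square-and-multiply exponentiation, over a toy 64-bit prime rather than the 381-bit field prime (my own illustration, not part of the deleted sources; __uint128_t assumes GCC/Clang):

    #include <stdint.h>
    #include <stdio.h>

    /* (a * b) mod p without overflow, via a 128-bit intermediate */
    static uint64_t mulmod(uint64_t a, uint64_t b, uint64_t p)
    {   return (uint64_t)((__uint128_t)a * b % p);   }

    /* a^(p-2) mod p, right-to-left binary square-and-multiply */
    static uint64_t fermat_inverse(uint64_t a, uint64_t p)
    {
        uint64_t e = p - 2, ret = 1;
        for (a %= p; e != 0; e >>= 1, a = mulmod(a, a, p))
            if (e & 1)
                ret = mulmod(ret, a, p);
        return ret;
    }

    int main(void)
    {
        const uint64_t p = 0xffffffff00000001ULL; /* 2^64 - 2^32 + 1, prime */
        uint64_t a = 12345, inv = fermat_inverse(a, p);
        printf("a * a^-1 mod p = %llu\n",
               (unsigned long long)mulmod(a, inv, p)); /* prints 1 */
        return 0;
    }

The header below computes exactly this power, but with a fixed, precomputed sequence of squarings and multiplications (the sqr_n_mul calls) instead of scanning exponent bits at run time, which is both cheaper and free of data-dependent branches.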
- * - * Generated with 'addchain 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559785' - * https://github.com/kwantam/addchain - * - * # Bos-Coster (win=4) : 461 (16) <<< - * # Bos-Coster (win=3) : 464 ( 9) - * # Bos-Coster (win=8) : 469 (35) - * # Bos-Coster (win=5) : 463 (28) - * # Bos-Coster (win=9) : 467 (32) - * # Bos-Coster (win=7) : 462 (27) - * # Yacobi : 481 (31) - * # Bos-Coster (win=10) : 475 (30) - * # Bos-Coster (win=6) : 463 (32) - * # Bos-Coster (win=2) : 489 ( 5) - * # Bergeron-Berstel-Brlek-Duboc : 498 ( 5) - */ - -#define RECIPROCAL_MOD_BLS12_381_P(out, inp, ptype) do { \ -ptype t[16]; \ -vec_copy(t[1], inp, sizeof(ptype)); /* 0: 1 */\ -sqr(t[0], t[1]); /* 1: 2 */\ -mul(t[9], t[0], t[1]); /* 2: 3 */\ -sqr(t[5], t[0]); /* 3: 4 */\ -mul(t[2], t[9], t[0]); /* 4: 5 */\ -mul(t[7], t[5], t[9]); /* 5: 7 */\ -mul(t[10], t[2], t[5]); /* 6: 9 */\ -mul(t[13], t[7], t[5]); /* 7: b */\ -mul(t[4], t[10], t[5]); /* 8: d */\ -mul(t[8], t[13], t[5]); /* 9: f */\ -mul(t[15], t[4], t[5]); /* 10: 11 */\ -mul(t[11], t[8], t[5]); /* 11: 13 */\ -mul(t[3], t[15], t[5]); /* 12: 15 */\ -mul(t[12], t[11], t[5]); /* 13: 17 */\ -sqr(t[0], t[4]); /* 14: 1a */\ -mul(t[14], t[12], t[5]); /* 15: 1b */\ -mul(t[6], t[0], t[9]); /* 16: 1d */\ -mul(t[5], t[0], t[2]); /* 17: 1f */\ -/* sqr(t[0], t[0]); */ /* 18: 34 */\ -/* sqr(t[0], t[0]); */ /* 19: 68 */\ -/* sqr(t[0], t[0]); */ /* 20: d0 */\ -/* sqr(t[0], t[0]); */ /* 21: 1a0 */\ -/* sqr(t[0], t[0]); */ /* 22: 340 */\ -/* sqr(t[0], t[0]); */ /* 23: 680 */\ -/* sqr(t[0], t[0]); */ /* 24: d00 */\ -/* sqr(t[0], t[0]); */ /* 25: 1a00 */\ -/* sqr(t[0], t[0]); */ /* 26: 3400 */\ -/* sqr(t[0], t[0]); */ /* 27: 6800 */\ -/* sqr(t[0], t[0]); */ /* 28: d000 */\ -/* sqr(t[0], t[0]); */ /* 29: 1a000 */\ -sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ -/* sqr(t[0], t[0]); */ /* 31: 34022 */\ -/* sqr(t[0], t[0]); */ /* 32: 68044 */\ -/* sqr(t[0], t[0]); */ /* 33: d0088 */\ -/* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ -/* sqr(t[0], t[0]); */ /* 35: 340220 */\ -/* sqr(t[0], t[0]); */ /* 36: 680440 */\ -/* sqr(t[0], t[0]); */ /* 37: d00880 */\ -sqr_n_mul(t[0], t[0], 7, t[8]); /* 38: d0088f */\ -/* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ -/* sqr(t[0], t[0]); */ /* 40: 340223c */\ -/* sqr(t[0], t[0]); */ /* 41: 6804478 */\ -/* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ -sqr_n_mul(t[0], t[0], 4, t[2]); /* 43: d0088f5 */\ -/* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ -/* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ -/* sqr(t[0], t[0]); */ /* 46: 680447a8 */\ -/* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ -/* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ -/* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ -sqr_n_mul(t[0], t[0], 6, t[7]); /* 50: 340223d47 */\ -/* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ -/* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ -/* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ -/* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ -/* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ -/* sqr(t[0], t[0]); */ /* 56: d0088f51c0 */\ -/* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ -sqr_n_mul(t[0], t[0], 7, t[12]); /* 58: 1a0111ea397 */\ -/* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ -/* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ -/* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ -/* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ -/* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ -sqr_n_mul(t[0], t[0], 5, t[5]); /* 64: 340223d472ff */\ -/* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ -/* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ -sqr_n_mul(t[0], t[0], 2, 
t[9]); /* 67: d0088f51cbff */\ -/* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ -/* sqr(t[0], t[0]); */ /* 69: 340223d472ffc */\ -/* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ -/* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ -/* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ -/* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ -sqr_n_mul(t[0], t[0], 6, t[4]); /* 74: 340223d472ffcd */\ -/* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ -/* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ -/* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ -/* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ -/* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ -/* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ -sqr_n_mul(t[0], t[0], 6, t[4]); /* 81: d0088f51cbff34d */\ -/* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ -/* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ -/* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ -/* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ -/* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ -/* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ -sqr_n_mul(t[0], t[0], 6, t[10]); /* 88: 340223d472ffcd349 */\ -/* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ -/* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ -/* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ -sqr_n_mul(t[0], t[0], 3, t[9]); /* 92: 1a0111ea397fe69a4b */\ -/* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ -/* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ -/* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ -/* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ -/* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ -/* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ -/* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ -sqr_n_mul(t[0], t[0], 7, t[4]); /* 100: d0088f51cbff34d258d */\ -/* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ -/* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ -/* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ -/* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ -sqr_n_mul(t[0], t[0], 4, t[4]); /* 105: d0088f51cbff34d258dd */\ -/* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ -/* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ -/* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ -/* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ -/* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ -/* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ -sqr_n_mul(t[0], t[0], 6, t[8]); /* 112: 340223d472ffcd3496374f */\ -/* sqr(t[0], t[0]); */ /* 113: 680447a8e5ff9a692c6e9e */\ -/* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ -/* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ -/* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ -/* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ -/* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ -sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ -/* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ -/* sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ -/* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ -sqr_n_mul(t[0], t[0], 3, t[1]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ -/* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ -/* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ -/* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ -/* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ -/* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ -/* sqr(t[0], t[0]); */ /* 129: 
1a0111ea397fe69a4b1ba7b640 */\ -/* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ -/* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ -sqr_n_mul(t[0], t[0], 8, t[4]); /* 132: 680447a8e5ff9a692c6e9ed90d */\ -/* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ -/* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ -/* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ -/* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ -/* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ -/* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ -/* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ -sqr_n_mul(t[0], t[0], 7, t[12]); /* 140: 340223d472ffcd3496374f6c8697 */\ -/* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ -/* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ -/* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ -/* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ -/* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ -sqr_n_mul(t[0], t[0], 5, t[13]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ -/* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ -/* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ -/* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ -/* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ -/* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ -/* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ -sqr_n_mul(t[0], t[0], 6, t[4]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ -/* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ -/* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ -/* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ -/* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ -/* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ -/* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ -sqr_n_mul(t[0], t[0], 6, t[6]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ -/* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ -/* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ -/* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ -/* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ -sqr_n_mul(t[0], t[0], 4, t[10]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ -/* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ -/* sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ -/* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ -/* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ -/* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ -/* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ -/* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ -/* sqr(t[0], t[0]); */ /* 173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\ -sqr_n_mul(t[0], t[0], 8, t[6]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ -/* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ -/* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ -/* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ -/* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ -sqr_n_mul(t[0], t[0], 4, t[4]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ -/* sqr(t[0], t[0]); */ /* 180: 
d0088f51cbff34d258dd3db21a5d66bb23ba */\ -/* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ -/* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ -/* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ -/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ -/* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ -/* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ -sqr_n_mul(t[0], t[0], 7, t[12]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ -/* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ -/* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ -/* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ -/* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ -/* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ -/* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ -/* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ -/* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ -/* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ -sqr_n_mul(t[0], t[0], 9, t[11]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ -/* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ -/* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ -sqr_n_mul(t[0], t[0], 2, t[9]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ -/* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ -/* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ -/* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ -/* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ -/* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ -sqr_n_mul(t[0], t[0], 5, t[7]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ -/* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ -/* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ -/* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ -/* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ -/* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ -/* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ -/* sqr(t[0], t[0]); */ /* 213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ -sqr_n_mul(t[0], t[0], 7, t[2]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ -/* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ -/* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ -/* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ -/* sqr(t[0], t[0]); */ /* 218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ -/* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ -/* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ -/* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ -sqr_n_mul(t[0], t[0], 7, t[10]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ -/* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ -/* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ -/* 
sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ -/* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ -/* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ -/* sqr(t[0], t[0]); */ /* 228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ -sqr_n_mul(t[0], t[0], 6, t[12]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ -/* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ -/* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ -/* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ -/* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ -/* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ -sqr_n_mul(t[0], t[0], 5, t[6]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ -/* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ -/* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ -/* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ -/* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ -/* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ -sqr_n_mul(t[0], t[0], 5, t[11]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ -/* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ -/* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ -/* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ -/* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ -/* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ -sqr_n_mul(t[0], t[0], 5, t[11]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ -/* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ -/* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ -/* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ -/* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ -/* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ -/* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ -/* sqr(t[0], t[0]); */ /* 254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ -/* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ -sqr_n_mul(t[0], t[0], 8, t[4]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ -/* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ -/* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ -/* sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ -/* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ -/* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ -/* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ -/* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ -sqr_n_mul(t[0], t[0], 7, t[3]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ -/* 
sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ -/* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ -/* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ -/* sqr(t[0], t[0]); */ /* 268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ -/* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ -/* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ -/* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ -/* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ -/* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ -sqr_n_mul(t[0], t[0], 9, t[8]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ -/* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ -/* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ -/* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ -/* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ -/* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ -sqr_n_mul(t[0], t[0], 5, t[4]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ -/* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ -/* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ -/* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ -sqr_n_mul(t[0], t[0], 3, t[9]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ -/* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ -/* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ -/* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ -/* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ -/* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ -/* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ -/* sqr(t[0], t[0]); */ /* 291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ -/* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ -sqr_n_mul(t[0], t[0], 8, t[8]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ -/* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ -/* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ -/* sqr(t[0], t[0]); */ /* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ -sqr_n_mul(t[0], t[0], 3, t[9]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ -/* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ -/* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ -/* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ -/* sqr(t[0], t[0]); */ /* 301: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ -/* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ -/* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ -/* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ -sqr_n_mul(t[0], t[0], 7, t[10]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ -/* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ -/* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ -/* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ -/* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ -/* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ -/* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ -/* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ -/* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ -/* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ -sqr_n_mul(t[0], t[0], 9, t[8]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ -/* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ -/* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ -/* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ -/* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ -/* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ -/* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ -sqr_n_mul(t[0], t[0], 6, t[3]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ -/* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ -/* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ -/* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ -/* sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ -/* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ -/* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ -sqr_n_mul(t[0], t[0], 6, t[5]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ -/* sqr(t[0], t[0]); */ /* 330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ -/* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ -/* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ -/* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ -/* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ -sqr_n_mul(t[0], t[0], 5, t[5]); /* 335: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ -/* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ -/* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ -/* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ -/* sqr(t[0], t[0]); */ /* 339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ -/* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ -sqr_n_mul(t[0], t[0], 5, t[5]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ -/* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ -/* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ -/* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ -/* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ -sqr_n_mul(t[0], t[0], 4, t[4]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ -/* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ -/* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ -/* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ -sqr_n_mul(t[0], t[0], 3, t[9]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ -/* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ -/* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ -/* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ -/* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ -/* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ -/* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ -/* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ -/* sqr(t[0], t[0]); */ /* 358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ -sqr_n_mul(t[0], t[0], 8, t[3]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ -/* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ -/* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ -/* sqr(t[0], t[0]); */ /* 362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ -/* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ -/* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ -/* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ -/* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ -sqr_n_mul(t[0], t[0], 7, 
t[5]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ -/* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ -/* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ -/* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ -/* sqr(t[0], t[0]); */ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ -/* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ -sqr_n_mul(t[0], t[0], 5, t[5]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ -/* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ -/* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ -/* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ -/* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ -/* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ -sqr_n_mul(t[0], t[0], 5, t[5]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ -/* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ -/* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ -/* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ -/* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ -sqr_n_mul(t[0], t[0], 4, t[8]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ -/* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ -/* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ -/* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ -/* sqr(t[0], t[0]); */ /* 388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ -sqr_n_mul(t[0], t[0], 4, t[7]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ -/* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ -/* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ -/* sqr(t[0], t[0]); */ /* 392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ -/* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ -/* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ -/* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ -/* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ -sqr_n_mul(t[0], t[0], 7, t[5]); /* 397: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ -/* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ -/* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ -/* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ -/* sqr(t[0], t[0]); */ /* 401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ -/* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ -sqr_n_mul(t[0], t[0], 5, t[6]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ -/* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ -/* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ -/* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ -/* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ -/* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ -sqr_n_mul(t[0], t[0], 5, t[5]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ -/* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ -/* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ -/* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ -/* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ -/* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ -sqr_n_mul(t[0], t[0], 5, t[5]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ -/* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ -/* sqr(t[0], t[0]); */ /* 417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ -/* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ -/* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ -/* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ -sqr_n_mul(t[0], t[0], 5, t[5]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ -/* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ -/* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ -/* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ -/* sqr(t[0], t[0]); */ /* 425: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ -/* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ -sqr_n_mul(t[0], t[0], 5, t[5]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ -/* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ -/* sqr(t[0], t[0]); */ /* 429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ -/* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ -/* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ -/* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ -sqr_n_mul(t[0], t[0], 5, t[5]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ -/* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ -/* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ -/* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ -/* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ -/* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ -sqr_n_mul(t[0], t[0], 5, t[5]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ -/* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ -/* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ -/* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ -/* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ -sqr_n_mul(t[0], t[0], 4, t[4]); /* 444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ -/* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ -/* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ -/* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ -/* sqr(t[0], t[0]); */ /* 448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ -/* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ -/* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ -sqr_n_mul(t[0], t[0], 6, t[3]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ -/* sqr(t[0], t[0]); */ /* 452: 
680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ -/* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ -/* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ -/* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ -sqr_n_mul(t[0], t[0], 4, t[2]); /* 456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\ -/* sqr(t[0], t[0]); */ /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\ -/* sqr(t[0], t[0]); */ /* 458: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd554 */\ -/* sqr(t[0], t[0]); */ /* 459: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa8 */\ -sqr_n_mul(out, t[0], 3, t[1]); /* 460: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9 */\ -} while(0) diff --git a/crypto/blst_src/recip.c b/crypto/blst_src/recip.c deleted file mode 100644 index e0c700635ed..00000000000 --- a/crypto/blst_src/recip.c +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "fields.h" - -#ifdef __OPTIMIZE_SIZE__ -/* - * 608 multiplications for scalar inversion modulo BLS12-381 prime, 32% - * more than corresponding optimal addition-chain, plus mispredicted - * branch penalties on top of that... The addition chain below was - * measured to be >50% faster. 
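The 608 figure is just the generic square-and-multiply cost: BLS12_381_P-2 is a 381-bit exponent with 229 set bits, i.e. 380 squarings plus 228 conditional multiplications. A quick way to reproduce the count (my own sketch, not part of the patch; __builtin_popcountll assumes GCC/Clang), using the limbs of BLS12_381_P_minus_2 from flt_reciprocal_fp just below:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* BLS12_381_P - 2, least-significant limb first */
        const uint64_t e[6] = {
            0xb9feffffffffaaa9, 0x1eabfffeb153ffff, 0x6730d2a0f6b0f624,
            0x64774b84f38512bf, 0x4b1ba7b6434bacd7, 0x1a0111ea397fe69a
        };
        int bits = 381, ones = 0;
        for (int i = 0; i < 6; i++)
            ones += __builtin_popcountll(e[i]);
        /* square-and-multiply: (bits-1) squarings + (ones-1) multiplications */
        printf("%d + %d = %d\n", bits - 1, ones - 1,
               bits + ones - 2); /* prints 380 + 228 = 608 */
        return 0;
    }

The addition chain reaches the same power in 461 operations (the "Bos-Coster (win=4) : 461" line in the recip-addchain.h header above), with no exponent-dependent branching.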
- */ -static void flt_reciprocal_fp(vec384 out, const vec384 inp) -{ - static const byte BLS12_381_P_minus_2[] = { - TO_BYTES(0xb9feffffffffaaa9), TO_BYTES(0x1eabfffeb153ffff), - TO_BYTES(0x6730d2a0f6b0f624), TO_BYTES(0x64774b84f38512bf), - TO_BYTES(0x4b1ba7b6434bacd7), TO_BYTES(0x1a0111ea397fe69a) - }; - - exp_mont_384(out, inp, BLS12_381_P_minus_2, 381, BLS12_381_P, p0); -} -#else -# define sqr(ret,a) sqr_fp(ret,a) -# define mul(ret,a,b) mul_fp(ret,a,b) -# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fp(ret,a,n,b) - -# include "recip-addchain.h" -static void flt_reciprocal_fp(vec384 out, const vec384 inp) -{ - RECIPROCAL_MOD_BLS12_381_P(out, inp, vec384); -} -# undef RECIPROCAL_MOD_BLS12_381_P -# undef sqr_n_mul -# undef mul -# undef sqr -#endif - -static void flt_reciprocal_fp2(vec384x out, const vec384x inp) -{ - vec384 t0, t1; - - /* - * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i - */ - sqr_fp(t0, inp[0]); - sqr_fp(t1, inp[1]); - add_fp(t0, t0, t1); - flt_reciprocal_fp(t1, t0); - mul_fp(out[0], inp[0], t1); - mul_fp(out[1], inp[1], t1); - neg_fp(out[1], out[1]); -} - -static void reciprocal_fp(vec384 out, const vec384 inp) -{ - static const vec384 Px8 = { /* left-aligned value of the modulus */ - TO_LIMB_T(0xcff7fffffffd5558), TO_LIMB_T(0xf55ffff58a9ffffd), - TO_LIMB_T(0x39869507b587b120), TO_LIMB_T(0x23ba5c279c2895fb), - TO_LIMB_T(0x58dd3db21a5d66bb), TO_LIMB_T(0xd0088f51cbff34d2) - }; -#ifdef __BLST_NO_ASM__ -# define RRx4 BLS12_381_RR -#else - static const vec384 RRx4 = { /* (4<<768)%P */ - TO_LIMB_T(0x5f7e7cd070d107c2), TO_LIMB_T(0xec839a9ac49c13c8), - TO_LIMB_T(0x6933786f44f4ef0b), TO_LIMB_T(0xd6bf8b9c676be983), - TO_LIMB_T(0xd3adaaaa4dcefb06), TO_LIMB_T(0x12601bc1d82bc175) - }; -#endif - union { vec768 x; vec384 r[2]; } temp; - - ct_inverse_mod_383(temp.x, inp, BLS12_381_P, Px8); - redc_mont_384(temp.r[0], temp.x, BLS12_381_P, p0); - mul_mont_384(temp.r[0], temp.r[0], RRx4, BLS12_381_P, p0); - -#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION - /* sign goes straight to flt_reciprocal */ - mul_mont_384(temp.r[1], temp.r[0], inp, BLS12_381_P, p0); - if (vec_is_equal(temp.r[1], BLS12_381_Rx.p, sizeof(vec384)) | - vec_is_zero(temp.r[1], sizeof(vec384))) - vec_copy(out, temp.r[0], sizeof(vec384)); - else - flt_reciprocal_fp(out, inp); -#else - vec_copy(out, temp.r[0], sizeof(vec384)); -#endif -#undef RRx4 -} - -void blst_fp_inverse(vec384 out, const vec384 inp) -{ reciprocal_fp(out, inp); } - -void blst_fp_eucl_inverse(vec384 ret, const vec384 a) -{ reciprocal_fp(ret, a); } - -static void reciprocal_fp2(vec384x out, const vec384x inp) -{ - vec384 t0, t1; - - /* - * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i - */ - sqr_fp(t0, inp[0]); - sqr_fp(t1, inp[1]); - add_fp(t0, t0, t1); - reciprocal_fp(t1, t0); - mul_fp(out[0], inp[0], t1); - mul_fp(out[1], inp[1], t1); - neg_fp(out[1], out[1]); -} - -void blst_fp2_inverse(vec384x out, const vec384x inp) -{ reciprocal_fp2(out, inp); } - -void blst_fp2_eucl_inverse(vec384x out, const vec384x inp) -{ reciprocal_fp2(out, inp); } - -static void reciprocal_fr(vec256 out, const vec256 inp) -{ - static const vec256 rx2 = { /* left-aligned value of the modulus */ - TO_LIMB_T(0xfffffffe00000002), TO_LIMB_T(0xa77b4805fffcb7fd), - TO_LIMB_T(0x6673b0101343b00a), TO_LIMB_T(0xe7db4ea6533afa90), - }; - vec512 temp; - - ct_inverse_mod_256(temp, inp, BLS12_381_r, rx2); - redc_mont_256(out, temp, BLS12_381_r, r0); - mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0); -} - -void blst_fr_inverse(vec256 out, const vec256 inp) -{ 
reciprocal_fr(out, inp); } - -void blst_fr_eucl_inverse(vec256 out, const vec256 inp) -{ reciprocal_fr(out, inp); } diff --git a/crypto/blst_src/sha256.h b/crypto/blst_src/sha256.h deleted file mode 100644 index 77ddb6dc848..00000000000 --- a/crypto/blst_src/sha256.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef __BLS12_381_ASM_SHA256_H__ -#define __BLS12_381_ASM_SHA256_H__ - -#include "vect.h" - -#if (defined(__x86_64__) || defined(__x86_64) || defined(_M_X64)) && \ - defined(__SHA__) /* -msha */ && !defined(__BLST_PORTABLE__) -# define sha256_block_data_order blst_sha256_block_data_order_shaext -#elif defined(__aarch64__) && \ - defined(__ARM_FEATURE_CRYPTO) && !defined(__BLST_PORTABLE__) -# define sha256_block_data_order blst_sha256_block_armv8 -#else -# define sha256_block_data_order blst_sha256_block_data_order -#endif -#define sha256_hcopy blst_sha256_hcopy -#define sha256_bcopy blst_sha256_bcopy -#define sha256_emit blst_sha256_emit - -void sha256_block_data_order(unsigned int *h, const void *inp, size_t blocks); -void sha256_hcopy(unsigned int dst[8], const unsigned int src[8]); -void sha256_bcopy(void *dst, const void *src, size_t len); - -/* - * If SHA256_CTX conflicts with something, just redefine it to alternative - * custom name prior including this header. - */ -typedef struct { - unsigned int h[8]; - unsigned long long N; - unsigned char buf[64]; - size_t off; -} SHA256_CTX; - - -static void sha256_init_h(unsigned int h[8]) -{ - h[0] = 0x6a09e667U; - h[1] = 0xbb67ae85U; - h[2] = 0x3c6ef372U; - h[3] = 0xa54ff53aU; - h[4] = 0x510e527fU; - h[5] = 0x9b05688cU; - h[6] = 0x1f83d9abU; - h[7] = 0x5be0cd19U; -} - -static void sha256_init(SHA256_CTX *ctx) -{ - sha256_init_h(ctx->h); - ctx->N = 0; - vec_zero(ctx->buf, sizeof(ctx->buf)); - ctx->off = 0; -} - -static void sha256_update(SHA256_CTX *ctx, const void *_inp, size_t len) -{ - size_t n; - const unsigned char *inp = _inp; - - ctx->N += len; - - if ((len != 0) & ((n = ctx->off) != 0)) { - size_t rem = sizeof(ctx->buf) - n; - - if (rem > len) { - sha256_bcopy(ctx->buf + n, inp, len); - ctx->off += len; - return; - } else { - sha256_bcopy(ctx->buf + n, inp, rem); - inp += rem; - len -= rem; - sha256_block_data_order(ctx->h, ctx->buf, 1); - vec_zero(ctx->buf, sizeof(ctx->buf)); - ctx->off = 0; - } - } - - n = len / sizeof(ctx->buf); - if (n > 0) { - sha256_block_data_order(ctx->h, inp, n); - n *= sizeof(ctx->buf); - inp += n; - len -= n; - } - - if (len) - sha256_bcopy(ctx->buf, inp, ctx->off = len); -} - -#define __TOBE32(ptr, val) ((ptr)[0] = (unsigned char)((val)>>24), \ - (ptr)[1] = (unsigned char)((val)>>16), \ - (ptr)[2] = (unsigned char)((val)>>8), \ - (ptr)[3] = (unsigned char)(val)) - -#if 1 -void sha256_emit(unsigned char md[32], const unsigned int h[8]); -#else -static void sha256_emit(unsigned char md[32], const unsigned int h[8]) -{ - unsigned int h_i; - - h_i = h[0]; __TOBE32(md + 0, h_i); - h_i = h[1]; __TOBE32(md + 4, h_i); - h_i = h[2]; __TOBE32(md + 8, h_i); - h_i = h[3]; __TOBE32(md + 12, h_i); - h_i = h[4]; __TOBE32(md + 16, h_i); - h_i = h[5]; __TOBE32(md + 20, h_i); - h_i = h[6]; __TOBE32(md + 24, h_i); - h_i = h[7]; __TOBE32(md + 28, h_i); -} -#endif - -static void sha256_final(unsigned char md[32], SHA256_CTX *ctx) -{ - unsigned long long bits = ctx->N * 8; - size_t n = ctx->off; - unsigned char *tail; - - ctx->buf[n++] = 0x80; - - if (n > (sizeof(ctx->buf) - 
8)) { - sha256_block_data_order(ctx->h, ctx->buf, 1); - vec_zero(ctx->buf, sizeof(ctx->buf)); - } - - tail = ctx->buf + sizeof(ctx->buf) - 8; - __TOBE32(tail, (unsigned int)(bits >> 32)); - __TOBE32(tail + 4, (unsigned int)bits); - sha256_block_data_order(ctx->h, ctx->buf, 1); - sha256_emit(md, ctx->h); -} - -#undef __TOBE32 -#endif diff --git a/crypto/blst_src/sqrt-addchain.h b/crypto/blst_src/sqrt-addchain.h deleted file mode 100644 index 4e7f0beb6b1..00000000000 --- a/crypto/blst_src/sqrt-addchain.h +++ /dev/null @@ -1,489 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ -/* - * The "magic" number is (BLS12_381_P-3)/4. Exponentiation to which - * yields reciprocal of sqrt(x), which is used in simplified Shallue- - * van de Woestijne-Ulas map-to-curve method, but it's trivial to adapt - * it for more "traditional" sqrt(x) as 'x*ret' (or for is_square(x) - * as 'x*ret^2==1'). - * - * Generated with 'addchain 1000602388805416848354447456433976039139220704984751971333014534031007912622709466110671907282253916009473568139946' - * https://github.com/kwantam/addchain - * - * # Bos-Coster (win=4) : 458 (16) <<< - * # Bos-Coster (win=5) : 460 (28) - * # Bos-Coster (win=6) : 461 (33) - * # Bos-Coster (win=7) : 460 (28) - * # Bos-Coster (win=3) : 462 ( 9) - * # Bos-Coster (win=8) : 466 (34) - * # Bos-Coster (win=9) : 464 (31) - * # Yacobi : 478 (31) - * # Bos-Coster (win=10) : 473 (30) - * # Bos-Coster (win=2) : 486 ( 5) - * # Bergeron-Berstel-Brlek-Duboc : 489 ( 5) - */ - -#define RECIP_SQRT_MOD_BLS12_381_P(out, inp, ptype) do { \ -ptype t[16]; \ -vec_copy(t[13], inp, sizeof(ptype));/* 0: 1 */\ -sqr(t[0], t[13]); /* 1: 2 */\ -mul(t[8], t[0], t[13]); /* 2: 3 */\ -sqr(t[4], t[0]); /* 3: 4 */\ -mul(t[1], t[8], t[0]); /* 4: 5 */\ -mul(t[6], t[4], t[8]); /* 5: 7 */\ -mul(t[9], t[1], t[4]); /* 6: 9 */\ -mul(t[12], t[6], t[4]); /* 7: b */\ -mul(t[3], t[9], t[4]); /* 8: d */\ -mul(t[7], t[12], t[4]); /* 9: f */\ -mul(t[15], t[3], t[4]); /* 10: 11 */\ -mul(t[10], t[7], t[4]); /* 11: 13 */\ -mul(t[2], t[15], t[4]); /* 12: 15 */\ -mul(t[11], t[10], t[4]); /* 13: 17 */\ -sqr(t[0], t[3]); /* 14: 1a */\ -mul(t[14], t[11], t[4]); /* 15: 1b */\ -mul(t[5], t[0], t[8]); /* 16: 1d */\ -mul(t[4], t[0], t[1]); /* 17: 1f */\ -/* sqr(t[0], t[0]); */ /* 18: 34 */\ -/* sqr(t[0], t[0]); */ /* 19: 68 */\ -/* sqr(t[0], t[0]); */ /* 20: d0 */\ -/* sqr(t[0], t[0]); */ /* 21: 1a0 */\ -/* sqr(t[0], t[0]); */ /* 22: 340 */\ -/* sqr(t[0], t[0]); */ /* 23: 680 */\ -/* sqr(t[0], t[0]); */ /* 24: d00 */\ -/* sqr(t[0], t[0]); */ /* 25: 1a00 */\ -/* sqr(t[0], t[0]); */ /* 26: 3400 */\ -/* sqr(t[0], t[0]); */ /* 27: 6800 */\ -/* sqr(t[0], t[0]); */ /* 28: d000 */\ -/* sqr(t[0], t[0]); */ /* 29: 1a000 */\ -sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ -/* sqr(t[0], t[0]); */ /* 31: 34022 */\ -/* sqr(t[0], t[0]); */ /* 32: 68044 */\ -/* sqr(t[0], t[0]); */ /* 33: d0088 */\ -/* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ -/* sqr(t[0], t[0]); */ /* 35: 340220 */\ -/* sqr(t[0], t[0]); */ /* 36: 680440 */\ -/* sqr(t[0], t[0]); */ /* 37: d00880 */\ -sqr_n_mul(t[0], t[0], 7, t[7]); /* 38: d0088f */\ -/* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ -/* sqr(t[0], t[0]); */ /* 40: 340223c */\ -/* sqr(t[0], t[0]); */ /* 41: 6804478 */\ -/* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ -sqr_n_mul(t[0], t[0], 4, t[1]); /* 43: d0088f5 */\ -/* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ -/* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ -/* sqr(t[0], t[0]); */ 
/* 46: 680447a8 */\ -/* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ -/* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ -/* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ -sqr_n_mul(t[0], t[0], 6, t[6]); /* 50: 340223d47 */\ -/* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ -/* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ -/* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ -/* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ -/* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ -/* sqr(t[0], t[0]); */ /* 56: d0088f51c0 */\ -/* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ -sqr_n_mul(t[0], t[0], 7, t[11]); /* 58: 1a0111ea397 */\ -/* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ -/* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ -/* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ -/* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ -/* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ -sqr_n_mul(t[0], t[0], 5, t[4]); /* 64: 340223d472ff */\ -/* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ -/* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ -sqr_n_mul(t[0], t[0], 2, t[8]); /* 67: d0088f51cbff */\ -/* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ -/* sqr(t[0], t[0]); */ /* 69: 340223d472ffc */\ -/* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ -/* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ -/* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ -/* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ -sqr_n_mul(t[0], t[0], 6, t[3]); /* 74: 340223d472ffcd */\ -/* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ -/* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ -/* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ -/* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ -/* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ -/* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ -sqr_n_mul(t[0], t[0], 6, t[3]); /* 81: d0088f51cbff34d */\ -/* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ -/* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ -/* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ -/* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ -/* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ -/* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ -sqr_n_mul(t[0], t[0], 6, t[9]); /* 88: 340223d472ffcd349 */\ -/* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ -/* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ -/* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ -sqr_n_mul(t[0], t[0], 3, t[8]); /* 92: 1a0111ea397fe69a4b */\ -/* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ -/* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ -/* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ -/* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ -/* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ -/* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ -/* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ -sqr_n_mul(t[0], t[0], 7, t[3]); /* 100: d0088f51cbff34d258d */\ -/* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ -/* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ -/* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ -/* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ -sqr_n_mul(t[0], t[0], 4, t[3]); /* 105: d0088f51cbff34d258dd */\ -/* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ -/* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ -/* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ -/* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ -/* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ -/* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ -sqr_n_mul(t[0], t[0], 6, t[7]); /* 112: 340223d472ffcd3496374f */\ -/* sqr(t[0], t[0]); */ /* 113: 
680447a8e5ff9a692c6e9e */\ -/* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ -/* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ -/* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ -/* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ -/* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ -sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ -/* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ -/* sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ -/* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ -sqr_n_mul(t[0], t[0], 3, t[13]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ -/* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ -/* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ -/* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ -/* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ -/* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ -/* sqr(t[0], t[0]); */ /* 129: 1a0111ea397fe69a4b1ba7b640 */\ -/* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ -/* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ -sqr_n_mul(t[0], t[0], 8, t[3]); /* 132: 680447a8e5ff9a692c6e9ed90d */\ -/* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ -/* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ -/* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ -/* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ -/* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ -/* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ -/* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ -sqr_n_mul(t[0], t[0], 7, t[11]); /* 140: 340223d472ffcd3496374f6c8697 */\ -/* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ -/* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ -/* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ -/* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ -/* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ -sqr_n_mul(t[0], t[0], 5, t[12]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ -/* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ -/* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ -/* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ -/* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ -/* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ -/* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ -sqr_n_mul(t[0], t[0], 6, t[3]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ -/* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ -/* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ -/* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ -/* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ -/* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ -/* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ -sqr_n_mul(t[0], t[0], 6, t[5]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ -/* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ -/* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ -/* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ -/* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ -sqr_n_mul(t[0], t[0], 4, t[9]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ -/* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ -/* 
sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ -/* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ -/* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ -/* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ -/* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ -/* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ -/* sqr(t[0], t[0]); */ /* 173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\ -sqr_n_mul(t[0], t[0], 8, t[5]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ -/* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ -/* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ -/* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ -/* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ -sqr_n_mul(t[0], t[0], 4, t[3]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ -/* sqr(t[0], t[0]); */ /* 180: d0088f51cbff34d258dd3db21a5d66bb23ba */\ -/* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ -/* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ -/* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ -/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ -/* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ -/* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ -sqr_n_mul(t[0], t[0], 7, t[11]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ -/* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ -/* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ -/* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ -/* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ -/* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ -/* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ -/* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ -/* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ -/* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ -sqr_n_mul(t[0], t[0], 9, t[10]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ -/* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ -/* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ -sqr_n_mul(t[0], t[0], 2, t[8]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ -/* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ -/* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ -/* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ -/* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ -/* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ -sqr_n_mul(t[0], t[0], 5, t[6]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ -/* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ -/* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ -/* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ -/* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ -/* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ -/* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ -/* sqr(t[0], t[0]); */ /* 
213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ -sqr_n_mul(t[0], t[0], 7, t[1]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ -/* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ -/* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ -/* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ -/* sqr(t[0], t[0]); */ /* 218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ -/* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ -/* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ -/* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ -sqr_n_mul(t[0], t[0], 7, t[9]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ -/* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ -/* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ -/* sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ -/* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ -/* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ -/* sqr(t[0], t[0]); */ /* 228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ -sqr_n_mul(t[0], t[0], 6, t[11]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ -/* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ -/* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ -/* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ -/* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ -/* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ -sqr_n_mul(t[0], t[0], 5, t[5]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ -/* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ -/* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ -/* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ -/* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ -/* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ -sqr_n_mul(t[0], t[0], 5, t[10]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ -/* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ -/* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ -/* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ -/* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ -/* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ -sqr_n_mul(t[0], t[0], 5, t[10]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ -/* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ -/* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ -/* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ -/* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ -/* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ -/* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ -/* sqr(t[0], t[0]); */ /* 254: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ -/* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ -sqr_n_mul(t[0], t[0], 8, t[3]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ -/* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ -/* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ -/* sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ -/* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ -/* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ -/* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ -/* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ -sqr_n_mul(t[0], t[0], 7, t[2]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ -/* sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ -/* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ -/* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ -/* sqr(t[0], t[0]); */ /* 268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ -/* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ -/* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ -/* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ -/* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ -/* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ -sqr_n_mul(t[0], t[0], 9, t[7]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ -/* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ -/* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ -/* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ -/* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ -/* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ -sqr_n_mul(t[0], t[0], 5, t[3]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ -/* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ -/* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ -/* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ -sqr_n_mul(t[0], t[0], 3, t[8]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ -/* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ -/* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ -/* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ -/* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ -/* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ -/* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ -/* sqr(t[0], t[0]); */ /* 291: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ -/* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ -sqr_n_mul(t[0], t[0], 8, t[7]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ -/* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ -/* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ -/* sqr(t[0], t[0]); */ /* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ -sqr_n_mul(t[0], t[0], 3, t[8]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ -/* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ -/* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ -/* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ -/* sqr(t[0], t[0]); */ /* 301: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ -/* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ -/* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ -/* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ -sqr_n_mul(t[0], t[0], 7, t[9]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ -/* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ -/* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ -/* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ -/* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ -/* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ -/* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ -/* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ -/* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ -/* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ -sqr_n_mul(t[0], t[0], 9, t[7]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ -/* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ -/* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ -/* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ -/* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ -/* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ -/* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ -sqr_n_mul(t[0], t[0], 6, t[2]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ -/* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ -/* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ -/* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ -/* 
sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ -/* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ -/* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ -sqr_n_mul(t[0], t[0], 6, t[4]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ -/* sqr(t[0], t[0]); */ /* 330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ -/* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ -/* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ -/* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ -/* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ -sqr_n_mul(t[0], t[0], 5, t[4]); /* 335: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ -/* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ -/* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ -/* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ -/* sqr(t[0], t[0]); */ /* 339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ -/* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ -sqr_n_mul(t[0], t[0], 5, t[4]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ -/* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ -/* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ -/* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ -/* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ -sqr_n_mul(t[0], t[0], 4, t[3]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ -/* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ -/* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ -/* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ -sqr_n_mul(t[0], t[0], 3, t[8]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ -/* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ -/* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ -/* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ -/* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ -/* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ -/* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ -/* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ -/* sqr(t[0], t[0]); */ /* 358: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ -sqr_n_mul(t[0], t[0], 8, t[2]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ -/* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ -/* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ -/* sqr(t[0], t[0]); */ /* 362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ -/* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ -/* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ -/* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ -/* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ -sqr_n_mul(t[0], t[0], 7, t[4]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ -/* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ -/* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ -/* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ -/* sqr(t[0], t[0]); */ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ -/* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ -sqr_n_mul(t[0], t[0], 5, t[4]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ -/* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ -/* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ -/* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ -/* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ -/* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ -sqr_n_mul(t[0], t[0], 5, t[4]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ -/* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ -/* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ -/* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ -/* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ -sqr_n_mul(t[0], t[0], 4, t[7]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ -/* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ -/* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ -/* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ -/* sqr(t[0], t[0]); */ /* 388: 
340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ -sqr_n_mul(t[0], t[0], 4, t[6]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ -/* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ -/* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ -/* sqr(t[0], t[0]); */ /* 392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ -/* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ -/* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ -/* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ -/* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ -sqr_n_mul(t[0], t[0], 7, t[4]); /* 397: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ -/* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ -/* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ -/* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ -/* sqr(t[0], t[0]); */ /* 401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ -/* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ -sqr_n_mul(t[0], t[0], 5, t[5]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ -/* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ -/* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ -/* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ -/* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ -/* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ -sqr_n_mul(t[0], t[0], 5, t[4]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ -/* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ -/* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ -/* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ -/* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ -/* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ -sqr_n_mul(t[0], t[0], 5, t[4]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ -/* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ -/* sqr(t[0], t[0]); */ /* 417: 
340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ -/* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ -/* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ -/* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ -sqr_n_mul(t[0], t[0], 5, t[4]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ -/* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ -/* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ -/* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ -/* sqr(t[0], t[0]); */ /* 425: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ -/* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ -sqr_n_mul(t[0], t[0], 5, t[4]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ -/* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ -/* sqr(t[0], t[0]); */ /* 429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ -/* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ -/* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ -/* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ -sqr_n_mul(t[0], t[0], 5, t[4]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ -/* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ -/* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ -/* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ -/* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ -/* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ -sqr_n_mul(t[0], t[0], 5, t[4]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ -/* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ -/* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ -/* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ -/* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ -sqr_n_mul(t[0], t[0], 4, t[3]); /* 444: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ -/* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ -/* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ -/* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ -/* sqr(t[0], t[0]); */ /* 448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ -/* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ -/* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ -sqr_n_mul(t[0], t[0], 6, t[2]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ -/* sqr(t[0], t[0]); */ /* 452: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ -/* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ -/* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ -/* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ -sqr_n_mul(t[0], t[0], 4, t[1]); /* 456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\ -sqr(out, t[0]); /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\ -} while(0) diff --git a/crypto/blst_src/sqrt.c b/crypto/blst_src/sqrt.c deleted file mode 100644 index cf149fd1124..00000000000 --- a/crypto/blst_src/sqrt.c +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "fields.h" - -#ifdef __OPTIMIZE_SIZE__ -static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp) -{ - static const byte BLS_12_381_P_minus_3_div_4[] = { - TO_BYTES(0xee7fbfffffffeaaa), TO_BYTES(0x07aaffffac54ffff), - TO_BYTES(0xd9cc34a83dac3d89), TO_BYTES(0xd91dd2e13ce144af), - TO_BYTES(0x92c6e9ed90d2eb35), TO_BYTES(0x0680447a8e5ff9a6) - }; - - exp_mont_384(out, inp, BLS_12_381_P_minus_3_div_4, 379, BLS12_381_P, p0); -} -#else -# if 1 -/* - * "383"-bit variant omits full reductions at the ends of squarings, - * which results in up to ~15% improvement. [One can improve further - * by omitting full reductions even after multiplications and - * performing final reduction at the very end of the chain.] 
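For reference, the identity the deleted addition chain hard-codes: with p the BLS12-381 base-field prime and p = 3 (mod 4), raising x to (p-3)/4 yields 1/sqrt(x) for every square x, so sqrt(x) falls out as x*ret and is_square(x) as x*ret^2 == 1, exactly as the header comment of sqrt-addchain.h states. Below is a minimal, self-contained sketch of the same arithmetic in Go's math/big; the addchain merely evaluates this power with fewer multiplications and no general-purpose exponentiation.

package main

import (
	"fmt"
	"math/big"
)

func main() {
	// BLS12-381 base-field prime p; note p % 4 == 3.
	p, _ := new(big.Int).SetString(
		"1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf"+
			"6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaab", 16)

	// exp = (p-3)/4, the exponent the deleted addition chain evaluates.
	exp := new(big.Int).Sub(p, big.NewInt(3))
	exp.Rsh(exp, 2)

	x := big.NewInt(9) // 3^2, a known quadratic residue

	ret := new(big.Int).Exp(x, exp, p) // ret = x^((p-3)/4) = 1/sqrt(x)

	// is_square(x): x * ret^2 == 1 (mod p)
	chk := new(big.Int).Mul(ret, ret)
	chk.Mul(chk, x).Mod(chk, p)
	fmt.Println("square:", chk.Cmp(big.NewInt(1)) == 0) // true

	// sqrt(x) = x * ret (mod p)
	s := new(big.Int).Mul(x, ret)
	s.Mod(s, p)
	sq := new(big.Int).Mul(s, s)
	sq.Mod(sq, p)
	fmt.Println("sqrt ok:", sq.Cmp(x) == 0) // true
}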
- */ -static inline void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count, - const vec384 b) -{ sqr_n_mul_mont_383(out, a, count, BLS12_381_P, p0, b); } -# else -static void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count, - const vec384 b) -{ - while(count--) { - sqr_fp(out, a); - a = out; - } - mul_fp(out, out, b); -} -# endif - -# define sqr(ret,a) sqr_fp(ret,a) -# define mul(ret,a,b) mul_fp(ret,a,b) -# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fp(ret,a,n,b) - -# include "sqrt-addchain.h" -static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp) -{ - RECIP_SQRT_MOD_BLS12_381_P(out, inp, vec384); -} -# undef RECIP_SQRT_MOD_BLS12_381_P - -# undef sqr_n_mul -# undef sqr -# undef mul -#endif - -static bool_t recip_sqrt_fp(vec384 out, const vec384 inp) -{ - vec384 t0, t1; - bool_t ret; - - recip_sqrt_fp_3mod4(t0, inp); - - mul_fp(t1, t0, inp); - sqr_fp(t1, t1); - ret = vec_is_equal(t1, inp, sizeof(t1)); - vec_copy(out, t0, sizeof(t0)); - - return ret; -} - -static bool_t sqrt_fp(vec384 out, const vec384 inp) -{ - vec384 t0, t1; - bool_t ret; - - recip_sqrt_fp_3mod4(t0, inp); - - mul_fp(t0, t0, inp); - sqr_fp(t1, t0); - ret = vec_is_equal(t1, inp, sizeof(t1)); - vec_copy(out, t0, sizeof(t0)); - - return ret; -} - -int blst_fp_sqrt(vec384 out, const vec384 inp) -{ return (int)sqrt_fp(out, inp); } - -int blst_fp_is_square(const vec384 inp) -{ - return (int)ct_is_square_mod_384(inp, BLS12_381_P); -} - -static bool_t sqrt_align_fp2(vec384x out, const vec384x ret, - const vec384x sqrt, const vec384x inp) -{ - static const vec384x sqrt_minus_1 = { { 0 }, { ONE_MONT_P } }; - static const vec384x sqrt_sqrt_minus_1 = { - /* - * "magic" number is ±2^((p-3)/4)%p, which is "1/sqrt(2)", - * in quotes because 2*"1/sqrt(2)"^2 == -1 mod p, not 1, - * but it pivots into "complex" plane nevertheless... - */ - { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), - TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), - TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, - { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), - TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), - TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } - }; - static const vec384x sqrt_minus_sqrt_minus_1 = { - { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), - TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), - TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }, - { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), - TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), - TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } - }; - vec384x coeff, t0, t1; - bool_t is_sqrt, flag; - - /* - * Instead of multiple trial squarings we can perform just one - * and see if the result is "rotated by multiple of 90°" in - * relation to |inp|, and "rotate" |ret| accordingly. - */ - sqr_fp2(t0, sqrt); - /* "sqrt(|inp|)"^2 = (a + b*i)^2 = (a^2-b^2) + 2ab*i */ - - /* (a^2-b^2) + 2ab*i == |inp| ? |ret| is spot on */ - sub_fp2(t1, t0, inp); - is_sqrt = vec_is_zero(t1, sizeof(t1)); - vec_copy(coeff, BLS12_381_Rx.p2, sizeof(coeff)); - - /* -(a^2-b^2) - 2ab*i == |inp| ? "rotate |ret| by 90°" */ - add_fp2(t1, t0, inp); - vec_select(coeff, sqrt_minus_1, coeff, sizeof(coeff), - flag = vec_is_zero(t1, sizeof(t1))); - is_sqrt |= flag; - - /* 2ab - (a^2-b^2)*i == |inp| ? 
"rotate |ret| by 135°" */ - sub_fp(t1[0], t0[0], inp[1]); - add_fp(t1[1], t0[1], inp[0]); - vec_select(coeff, sqrt_sqrt_minus_1, coeff, sizeof(coeff), - flag = vec_is_zero(t1, sizeof(t1))); - is_sqrt |= flag; - - /* -2ab + (a^2-b^2)*i == |inp| ? "rotate |ret| by 45°" */ - add_fp(t1[0], t0[0], inp[1]); - sub_fp(t1[1], t0[1], inp[0]); - vec_select(coeff, sqrt_minus_sqrt_minus_1, coeff, sizeof(coeff), - flag = vec_is_zero(t1, sizeof(t1))); - is_sqrt |= flag; - - /* actual "rotation" */ - mul_fp2(out, ret, coeff); - - return is_sqrt; -} - -/* - * |inp| = a + b*i - */ -static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp, - const vec384x recip_ZZZ, - const vec384x magic_ZZZ) -{ - vec384 aa, bb, cc; - vec384x inp_; - bool_t is_sqrt; - - sqr_fp(aa, inp[0]); - sqr_fp(bb, inp[1]); - add_fp(aa, aa, bb); - - is_sqrt = recip_sqrt_fp(cc, aa); /* 1/sqrt(a²+b²) */ - - /* if |inp| doesn't have quadratic residue, multiply by "1/Z³" ... */ - mul_fp2(inp_, inp, recip_ZZZ); - /* ... and adjust |aa| and |cc| accordingly */ - { - vec384 za, zc; - - mul_fp(za, aa, magic_ZZZ[0]); /* aa*(za² + zb²) */ - mul_fp(zc, cc, magic_ZZZ[1]); /* cc*(za² + zb²)^((p-3)/4) */ - vec_select(aa, aa, za, sizeof(aa), is_sqrt); - vec_select(cc, cc, zc, sizeof(cc), is_sqrt); - } - vec_select(inp_, inp, inp_, sizeof(inp_), is_sqrt); - - mul_fp(aa, aa, cc); /* sqrt(a²+b²) */ - - sub_fp(bb, inp_[0], aa); - add_fp(aa, inp_[0], aa); - vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa))); - div_by_2_fp(aa, aa); /* (a ± sqrt(a²+b²))/2 */ - - /* if it says "no sqrt," final "align" will find right one... */ - (void)recip_sqrt_fp(out[0], aa); /* 1/sqrt((a ± sqrt(a²+b²))/2) */ - - div_by_2_fp(out[1], inp_[1]); - mul_fp(out[1], out[1], out[0]); /* b/(2*sqrt((a ± sqrt(a²+b²))/2)) */ - mul_fp(out[0], out[0], aa); /* sqrt((a ± sqrt(a²+b²))/2) */ - - /* bound to succeed */ - (void)sqrt_align_fp2(out, out, out, inp_); - - mul_fp(out[0], out[0], cc); /* inverse the result */ - mul_fp(out[1], out[1], cc); - neg_fp(out[1], out[1]); - - return is_sqrt; -} - -static bool_t sqrt_fp2(vec384x out, const vec384x inp) -{ - vec384x ret; - vec384 aa, bb; - - sqr_fp(aa, inp[0]); - sqr_fp(bb, inp[1]); - add_fp(aa, aa, bb); - - /* don't pay attention to return value, final "align" will tell... */ - (void)sqrt_fp(aa, aa); /* sqrt(a²+b²) */ - - sub_fp(bb, inp[0], aa); - add_fp(aa, inp[0], aa); - vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa))); - div_by_2_fp(aa, aa); /* (a ± sqrt(a²+b²))/2 */ - - /* if it says "no sqrt," final "align" will find right one... */ - (void)recip_sqrt_fp(ret[0], aa); /* 1/sqrt((a ± sqrt(a²+b²))/2) */ - - div_by_2_fp(ret[1], inp[1]); - mul_fp(ret[1], ret[1], ret[0]); /* b/(2*sqrt((a ± sqrt(a²+b²))/2)) */ - mul_fp(ret[0], ret[0], aa); /* sqrt((a ± sqrt(a²+b²))/2) */ - - /* - * Now see if |ret| is or can be made sqrt(|inp|)... - */ - - return sqrt_align_fp2(out, ret, ret, inp); -} - -int blst_fp2_sqrt(vec384x out, const vec384x inp) -{ return (int)sqrt_fp2(out, inp); } - -int blst_fp2_is_square(const vec384x inp) -{ - vec384 aa, bb; - - sqr_fp(aa, inp[0]); - sqr_fp(bb, inp[1]); - add_fp(aa, aa, bb); - - return (int)ct_is_square_mod_384(aa, BLS12_381_P); -} diff --git a/crypto/blst_src/vect.c b/crypto/blst_src/vect.c deleted file mode 100644 index 1834a48fadd..00000000000 --- a/crypto/blst_src/vect.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
- * SPDX-License-Identifier: Apache-2.0 - */ - -#include "vect.h" - -#ifdef __BLST_NO_ASM__ -# include "no_asm.h" -#endif - -/* - * Following are some reference C implementations to assist new - * assembly modules development, as starting-point stand-ins and for - * cross-checking. In order to "polyfil" specific subroutine redefine - * it on compiler command line, e.g. -Dmul_mont_384x=_mul_mont_384x. - */ - -#ifdef lshift_mod_384 -inline void lshift_mod_384(vec384 ret, const vec384 a, size_t n, - const vec384 mod) -{ - while(n--) - add_mod_384(ret, a, a, mod), a = ret; -} -#endif - -#ifdef mul_by_8_mod_384 -inline void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 mod) -{ lshift_mod_384(ret, a, 3, mod); } -#endif - -#ifdef mul_by_3_mod_384 -inline void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 mod) -{ - vec384 t; - - add_mod_384(t, a, a, mod); - add_mod_384(ret, t, a, mod); -} -#endif - -#ifdef mul_by_3_mod_384x -inline void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 mod) -{ - mul_by_3_mod_384(ret[0], a[0], mod); - mul_by_3_mod_384(ret[1], a[1], mod); -} -#endif - -#ifdef mul_by_8_mod_384x -inline void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 mod) -{ - mul_by_8_mod_384(ret[0], a[0], mod); - mul_by_8_mod_384(ret[1], a[1], mod); -} -#endif - -#ifdef mul_by_1_plus_i_mod_384x -inline void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, - const vec384 mod) -{ - vec384 t; - - add_mod_384(t, a[0], a[1], mod); - sub_mod_384(ret[0], a[0], a[1], mod); - vec_copy(ret[1], t, sizeof(t)); -} -#endif - -#ifdef add_mod_384x -inline void add_mod_384x(vec384x ret, const vec384x a, const vec384x b, - const vec384 mod) -{ - add_mod_384(ret[0], a[0], b[0], mod); - add_mod_384(ret[1], a[1], b[1], mod); -} -#endif - -#ifdef sub_mod_384x -inline void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b, - const vec384 mod) -{ - sub_mod_384(ret[0], a[0], b[0], mod); - sub_mod_384(ret[1], a[1], b[1], mod); -} -#endif - -#ifdef lshift_mod_384x -inline void lshift_mod_384x(vec384x ret, const vec384x a, size_t n, - const vec384 mod) -{ - lshift_mod_384(ret[0], a[0], n, mod); - lshift_mod_384(ret[1], a[1], n, mod); -} -#endif - -#if defined(mul_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__)) -void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, - const vec384 mod, limb_t n0) -{ - vec768 t0, t1, t2; - vec384 aa, bb; - - mul_384(t0, a[0], b[0]); - mul_384(t1, a[1], b[1]); - - add_mod_384(aa, a[0], a[1], mod); - add_mod_384(bb, b[0], b[1], mod); - mul_384(t2, aa, bb); - sub_mod_384x384(t2, t2, t0, mod); - sub_mod_384x384(t2, t2, t1, mod); - - sub_mod_384x384(t0, t0, t1, mod); - - redc_mont_384(ret[0], t0, mod, n0); - redc_mont_384(ret[1], t2, mod, n0); -} -#endif - -#if defined(sqr_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__)) -void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 mod, limb_t n0) -{ - vec384 t0, t1; - - add_mod_384(t0, a[0], a[1], mod); - sub_mod_384(t1, a[0], a[1], mod); - - mul_mont_384(ret[1], a[0], a[1], mod, n0); - add_mod_384(ret[1], ret[1], ret[1], mod); - - mul_mont_384(ret[0], t0, t1, mod, n0); -} -#endif - -limb_t div_3_limbs(const limb_t dividend_top[2], limb_t d_lo, limb_t d_hi); -limb_t quot_rem_128(limb_t *quot_rem, const limb_t *divisor, limb_t quotient); -limb_t quot_rem_64(limb_t *quot_rem, const limb_t *divisor, limb_t quotient); - -/* - * Divide 255-bit |val| by z^2 yielding 128-bit quotient and remainder in place. 
- */ -static void div_by_zz(limb_t val[]) -{ - static const limb_t zz[] = { TO_LIMB_T(0x0000000100000000), - TO_LIMB_T(0xac45a4010001a402) }; - size_t loop, zz_len = sizeof(zz)/sizeof(zz[0]); - limb_t d_lo, d_hi; - - d_lo = zz[zz_len - 2]; - d_hi = zz[zz_len - 1]; - for (loop = zz_len, zz_len--; loop--;) { - limb_t q = div_3_limbs(val + loop + zz_len, d_lo, d_hi); - (void)quot_rem_128(val + loop, zz, q); - } - /* remainder is in low half of val[], quotient is in high */ -} - -/* - * Divide 128-bit |val| by z yielding 64-bit quotient and remainder in place. - */ -static void div_by_z(limb_t val[]) -{ - static const limb_t z[] = { TO_LIMB_T(0xd201000000010000) }; - size_t loop, z_len = sizeof(z)/sizeof(z[0]); - limb_t d_lo, d_hi; - - d_lo = (sizeof(z) == sizeof(limb_t)) ? 0 : z[z_len - 2]; - d_hi = z[z_len - 1]; - for (loop = z_len, z_len--; loop--;) { - limb_t q = div_3_limbs(val + loop + z_len, d_lo, d_hi); - (void)quot_rem_64(val + loop, z, q); - } - /* remainder is in low half of val[], quotient is in high */ -} diff --git a/crypto/blst_src/vect.h b/crypto/blst_src/vect.h deleted file mode 100644 index 554dd5daefc..00000000000 --- a/crypto/blst_src/vect.h +++ /dev/null @@ -1,418 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef __BLS12_381_ASM_VECT_H__ -#define __BLS12_381_ASM_VECT_H__ - -#include <stddef.h> - -#if defined(__x86_64__) || defined(__aarch64__) -/* These are available even in ILP32 flavours, but even then they are - * capable of performing 64-bit operations as efficiently as in *P64. */ -typedef unsigned long long limb_t; -# define LIMB_T_BITS 64 - -#elif defined(_WIN64) /* Win64 is P64 */ -typedef unsigned __int64 limb_t; -# define LIMB_T_BITS 64 - -#elif defined(__BLST_NO_ASM__) || defined(__wasm64__) -typedef unsigned int limb_t; -# define LIMB_T_BITS 32 -# ifndef __BLST_NO_ASM__ -# define __BLST_NO_ASM__ -# endif - -#else /* 32 bits on 32-bit platforms, 64 - on 64-bit */ -typedef unsigned long limb_t; -# ifdef _LP64 -# define LIMB_T_BITS 64 -# else -# define LIMB_T_BITS 32 -# define __BLST_NO_ASM__ -# endif -#endif - -/* - * Why isn't LIMB_T_BITS defined as 8*sizeof(limb_t)? Because pre-processor - * knows nothing about sizeof(anything)... - */ -#if LIMB_T_BITS == 64 -# define TO_LIMB_T(limb64) limb64 -#else -# define TO_LIMB_T(limb64) (limb_t)limb64,(limb_t)(limb64>>32) -#endif - -#define NLIMBS(bits) (bits/LIMB_T_BITS) - -typedef limb_t vec256[NLIMBS(256)]; -typedef limb_t vec512[NLIMBS(512)]; -typedef limb_t vec384[NLIMBS(384)]; -typedef limb_t vec768[NLIMBS(768)]; -typedef vec384 vec384x[2]; /* 0 is "real" part, 1 is "imaginary" */ - -typedef unsigned char byte; -#define TO_BYTES(limb64) (byte)limb64,(byte)(limb64>>8),\ - (byte)(limb64>>16),(byte)(limb64>>24),\ - (byte)(limb64>>32),(byte)(limb64>>40),\ - (byte)(limb64>>48),(byte)(limb64>>56) -typedef byte pow256[256/8]; - -/* - * Internal Boolean type, Boolean by value, hence safe to cast to or - * reinterpret as 'bool'. - */ -typedef limb_t bool_t; - -/* - * Assembly subroutines... - */ -#if defined(__ADX__) /* e.g.
-march=broadwell */ && !defined(__BLST_PORTABLE__)\ - && !defined(__BLST_NO_ASM__) -# define mul_mont_sparse_256 mulx_mont_sparse_256 -# define sqr_mont_sparse_256 sqrx_mont_sparse_256 -# define from_mont_256 fromx_mont_256 -# define redc_mont_256 redcx_mont_256 -# define mul_mont_384 mulx_mont_384 -# define sqr_mont_384 sqrx_mont_384 -# define sqr_n_mul_mont_384 sqrx_n_mul_mont_384 -# define sqr_n_mul_mont_383 sqrx_n_mul_mont_383 -# define mul_384 mulx_384 -# define sqr_384 sqrx_384 -# define redc_mont_384 redcx_mont_384 -# define from_mont_384 fromx_mont_384 -# define sgn0_pty_mont_384 sgn0x_pty_mont_384 -# define sgn0_pty_mont_384x sgn0x_pty_mont_384x -# define ct_inverse_mod_383 ctx_inverse_mod_383 -#elif defined(__BLST_NO_ASM__) -# define ct_inverse_mod_383 ct_inverse_mod_384 -#endif - -void mul_mont_sparse_256(vec256 ret, const vec256 a, const vec256 b, - const vec256 p, limb_t n0); -void sqr_mont_sparse_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0); -void redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t n0); -void from_mont_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0); - -void add_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p); -void sub_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p); -void mul_by_3_mod_256(vec256 ret, const vec256 a, const vec256 p); -void cneg_mod_256(vec256 ret, const vec256 a, bool_t flag, const vec256 p); -void lshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p); -void rshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p); -bool_t eucl_inverse_mod_256(vec256 ret, const vec256 a, const vec256 p, - const vec256 one); -limb_t check_mod_256(const pow256 a, const vec256 p); -limb_t add_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b, - const vec256 p); -limb_t sub_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b, - const vec256 p); - -void vec_prefetch(const void *ptr, size_t len); - -void mul_mont_384(vec384 ret, const vec384 a, const vec384 b, - const vec384 p, limb_t n0); -void sqr_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0); -void sqr_n_mul_mont_384(vec384 ret, const vec384 a, size_t count, - const vec384 p, limb_t n0, const vec384 b); -void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count, - const vec384 p, limb_t n0, const vec384 b); - -void mul_384(vec768 ret, const vec384 a, const vec384 b); -void sqr_384(vec768 ret, const vec384 a); -void redc_mont_384(vec384 ret, const vec768 a, const vec384 p, limb_t n0); -void from_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0); -limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0); -limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0); -limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p); -limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p); - -void add_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p); -void sub_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p); -void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 p); -void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 p); -void cneg_mod_384(vec384 ret, const vec384 a, bool_t flag, const vec384 p); -void lshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p); -void rshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p); -void div_by_2_mod_384(vec384 ret, const vec384 a, const vec384 p); -void ct_inverse_mod_383(vec768 ret, const vec384 inp, 
const vec384 mod, - const vec384 modx); -void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, - const vec256 modx); -bool_t ct_is_square_mod_384(const vec384 inp, const vec384 mod); - -#if defined(__ADX__) /* e.g. -march=broadwell */ && !defined(__BLST_PORTABLE__) -# define mul_mont_384x mulx_mont_384x -# define sqr_mont_384x sqrx_mont_384x -# define sqr_mont_382x sqrx_mont_382x -# define mul_382x mulx_382x -# define sqr_382x sqrx_382x -#endif - -void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, - const vec384 p, limb_t n0); -void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); -void sqr_mont_382x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); -void mul_382x(vec768 ret[2], const vec384x a, const vec384x b, const vec384 p); -void sqr_382x(vec768 ret[2], const vec384x a, const vec384 p); - -void add_mod_384x(vec384x ret, const vec384x a, const vec384x b, - const vec384 p); -void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b, - const vec384 p); -void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 p); -void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 p); -void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, const vec384 p); -void add_mod_384x384(vec768 ret, const vec768 a, const vec768 b, - const vec384 p); -void sub_mod_384x384(vec768 ret, const vec768 a, const vec768 b, - const vec384 p); - -/* - * C subroutines - */ -static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow, - size_t pow_bits, const vec384 p, limb_t n0); -static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow, - size_t pow_bits, const vec384 p, limb_t n0); -static void div_by_zz(limb_t val[]); -static void div_by_z(limb_t val[]); - -#ifdef __UINTPTR_TYPE__ -typedef __UINTPTR_TYPE__ uptr_t; -#else -typedef const void *uptr_t; -#endif - -#if !defined(restrict) -# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 -# if defined(__GNUC__) && __GNUC__>=2 -# define restrict __restrict__ -# elif defined(_MSC_VER) -# define restrict __restrict -# else -# define restrict -# endif -# endif -#endif - -#if !defined(inline) && !defined(__cplusplus) -# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 -# if defined(__GNUC__) && __GNUC__>=2 -# define inline __inline__ -# elif defined(_MSC_VER) -# define inline __inline -# else -# define inline -# endif -# endif -#endif - -#if defined(__GNUC__) || defined(__clang__) -# define launder(var) __asm__ __volatile__("" : "+r"(var)) -#else -# define launder(var) -#endif - -static inline bool_t is_bit_set(const byte *v, size_t i) -{ - bool_t ret = (v[i/8] >> (i%8)) & 1; - launder(ret); - return ret; -} - -static inline bool_t byte_is_zero(unsigned char c) -{ - limb_t ret = ((limb_t)(c) - 1) >> (LIMB_T_BITS - 1); - launder(ret); - return ret; -} - -static inline bool_t bytes_are_zero(const unsigned char *a, size_t num) -{ - unsigned char acc; - size_t i; - - for (acc = 0, i = 0; i < num; i++) - acc |= a[i]; - - return byte_is_zero(acc); -} - -static inline void vec_cswap(void *restrict a, void *restrict b, size_t num, - bool_t cbit) -{ - limb_t ai, *ap = (limb_t *)a; - limb_t bi, *bp = (limb_t *)b; - limb_t xorm, mask; - size_t i; - - launder(cbit); - mask = (limb_t)0 - cbit; - - num /= sizeof(limb_t); - - for (i = 0; i < num; i++) { - xorm = ((ai = ap[i]) ^ (bi = bp[i])) & mask; - ap[i] = ai ^ xorm; - bp[i] = bi ^ xorm; - } -} - -/* ret = bit ? 
a : b */ -void vec_select_32(void *ret, const void *a, const void *b, bool_t sel_a); -void vec_select_48(void *ret, const void *a, const void *b, bool_t sel_a); -void vec_select_96(void *ret, const void *a, const void *b, bool_t sel_a); -void vec_select_144(void *ret, const void *a, const void *b, bool_t sel_a); -void vec_select_192(void *ret, const void *a, const void *b, bool_t sel_a); -void vec_select_288(void *ret, const void *a, const void *b, bool_t sel_a); -static inline void vec_select(void *ret, const void *a, const void *b, - size_t num, bool_t sel_a) -{ - launder(sel_a); -#ifndef __BLST_NO_ASM__ - if (num == 32) vec_select_32(ret, a, b, sel_a); - else if (num == 48) vec_select_48(ret, a, b, sel_a); - else if (num == 96) vec_select_96(ret, a, b, sel_a); - else if (num == 144) vec_select_144(ret, a, b, sel_a); - else if (num == 192) vec_select_192(ret, a, b, sel_a); - else if (num == 288) vec_select_288(ret, a, b, sel_a); -#else - if (0) ; -#endif - else { - limb_t bi; - volatile limb_t *rp = (limb_t *)ret; - const limb_t *ap = (const limb_t *)a; - const limb_t *bp = (const limb_t *)b; - limb_t xorm, mask = (limb_t)0 - sel_a; - size_t i; - - num /= sizeof(limb_t); - - for (i = 0; i < num; i++) { - xorm = (ap[i] ^ (bi = bp[i])) & mask; - rp[i] = bi ^ xorm; - } - } -} - -static inline bool_t is_zero(limb_t l) -{ - limb_t ret = (~l & (l - 1)) >> (LIMB_T_BITS - 1); - launder(ret); - return ret; -} - -static inline bool_t vec_is_zero(const void *a, size_t num) -{ - const limb_t *ap = (const limb_t *)a; - limb_t acc; - size_t i; - -#ifndef __BLST_NO_ASM__ - bool_t vec_is_zero_16x(const void *a, size_t num); - if ((num & 15) == 0) - return vec_is_zero_16x(a, num); -#endif - - num /= sizeof(limb_t); - - for (acc = 0, i = 0; i < num; i++) - acc |= ap[i]; - - return is_zero(acc); -} - -static inline bool_t vec_is_equal(const void *a, const void *b, size_t num) -{ - const limb_t *ap = (const limb_t *)a; - const limb_t *bp = (const limb_t *)b; - limb_t acc; - size_t i; - -#ifndef __BLST_NO_ASM__ - bool_t vec_is_equal_16x(const void *a, const void *b, size_t num); - if ((num & 15) == 0) - return vec_is_equal_16x(a, b, num); -#endif - - num /= sizeof(limb_t); - - for (acc = 0, i = 0; i < num; i++) - acc |= ap[i] ^ bp[i]; - - return is_zero(acc); -} - -static inline void cneg_mod_384x(vec384x ret, const vec384x a, bool_t flag, - const vec384 p) -{ - cneg_mod_384(ret[0], a[0], flag, p); - cneg_mod_384(ret[1], a[1], flag, p); -} - -static inline void vec_copy(void *restrict ret, const void *a, size_t num) -{ - limb_t *rp = (limb_t *)ret; - const limb_t *ap = (const limb_t *)a; - size_t i; - - num /= sizeof(limb_t); - - for (i = 0; i < num; i++) - rp[i] = ap[i]; -} - -static inline void vec_zero(void *ret, size_t num) -{ - volatile limb_t *rp = (volatile limb_t *)ret; - size_t i; - - num /= sizeof(limb_t); - - for (i = 0; i < num; i++) - rp[i] = 0; - -#if defined(__GNUC__) || defined(__clang__) - __asm__ __volatile__("" : : "r"(ret) : "memory"); -#endif -} - -/* - * Some compilers get arguably overzealous(*) when passing pointer to - * multi-dimensional array [such as vec384x] as 'const' argument. - * General direction seems to be to legitimize such constification, - * so it's argued that suppressing the warning is appropriate. 
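All of the selection helpers deleted above (vec_cswap, vec_select, and friends) follow one constant-time pattern: the secret bit is stretched into an all-zeros or all-ones mask via mask = 0 - bit, and the choice is computed with XOR/AND arithmetic so that no branch or memory access depends on the secret. A small Go sketch of the same mask trick follows; crypto/subtle in the standard library ships the equivalent primitive.

package main

import (
	"crypto/subtle"
	"fmt"
)

// ctSelect writes a into ret when bit == 1 and b when bit == 0,
// without branching on bit: mask = 0 - bit expands the bit into
// 0xff or 0x00, the same per-limb trick as the deleted vec_select.
func ctSelect(ret, a, b []byte, bit int) {
	mask := byte(0 - bit)
	for i := range ret {
		ret[i] = b[i] ^ ((a[i] ^ b[i]) & mask)
	}
}

func main() {
	a := []byte{1, 2, 3}
	b := []byte{9, 9, 9}
	ret := make([]byte, 3)

	ctSelect(ret, a, b, 1)
	fmt.Println(ret) // [1 2 3]

	// the standard-library equivalent of the same selection:
	subtle.ConstantTimeCopy(1, ret, b)
	fmt.Println(ret) // [9 9 9]
}

This is why the deleted code funnels every secret-dependent choice through vec_select and friends rather than an if statement: the compiled code's timing is independent of the selected value.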
- * - * (*) http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1923.htm - */ -#if defined(__INTEL_COMPILER) -# pragma warning(disable:167) -# pragma warning(disable:556) -#elif defined(__GNUC__) && !defined(__clang__) -# pragma GCC diagnostic ignored "-Wpedantic" -#elif defined(_MSC_VER) -# pragma warning(disable: 4127 4189) -#endif - -#if !defined(__wasm__) && __STDC_HOSTED__-0 != 0 -# include -#endif - -#if defined(__GNUC__) -# ifndef alloca -# define alloca(s) __builtin_alloca(s) -# endif -#elif defined(__sun) -# include -#elif defined(_WIN32) -# include -# ifndef alloca -# define alloca(s) _alloca(s) -# endif -#endif - -#endif /* __BLS12_381_ASM_VECT_H__ */ diff --git a/crypto/common.go b/crypto/common.go deleted file mode 100644 index b9e072c9930..00000000000 --- a/crypto/common.go +++ /dev/null @@ -1,94 +0,0 @@ -package crypto - -import ( - "crypto/rand" - "errors" - "fmt" -) - -//revive:disable:var-naming - -const ( - // Minimum targeted bits of security. - // This is used as a reference but it doesn't mean all implemented primitives provide this minimum. - securityBits = 128 - - // keygen seed length conditions - // enforce seed to be at least double the security bits and have enough entropy. - // it is still recommened that seed is generated using a secure RNG. - KeyGenSeedMinLen = 2 * (securityBits / 8) - KeyGenSeedMaxLen = 256 -) - -// TODO: update this code to make sure -// the function isn't removed by the compiler -// https://github.com/golang/go/issues/21865 -func overwrite(data []byte) { - _, err := rand.Read(data) // checking err is enough - if err != nil { - // zero the buffer if randomizing failed - for i := 0; i < len(data); i++ { - data[i] = 0 - } - } -} - -// invalidInputsError is an error returned when a crypto API receives invalid inputs. -// It allows a function caller differentiate unexpected program errors from errors caused by -// invalid inputs. -type invalidInputsError struct { - error -} - -func (e invalidInputsError) Unwrap() error { - return e.error -} - -// invalidInputsErrorf constructs a new invalidInputsError -func invalidInputsErrorf(msg string, args ...interface{}) error { - return &invalidInputsError{ - error: fmt.Errorf(msg, args...), - } -} - -// IsInvalidInputsError checks if the input error is of an invalidInputsError type -// invalidInputsError is returned when the API is provided invalid inputs. -// Some specific errors are assigned specific sentinel errors for a simpler error check -// while the remaining input errors trigger an invalidInputsError. -func IsInvalidInputsError(err error) bool { - var target *invalidInputsError - return errors.As(err, &target) -} - -var nilHasherError = errors.New("hasher cannot be nil") - -// IsNilHasherError checks if the input error wraps a nilHasherError. -// nilHasherError is returned when a nil hasher is used. -func IsNilHasherError(err error) bool { - return errors.Is(err, nilHasherError) -} - -// invalidHasherSizeError is an error returned when a crypto API is called with a hasher -// with an output size not suited with the cryptographic operation. -type invalidHasherSizeError struct { - error -} - -func (e invalidHasherSizeError) Unwrap() error { - return e.error -} - -// invalidHasherSizeErrorf constructs a new invalidHasherSizeError -func invalidHasherSizeErrorf(msg string, args ...interface{}) error { - return &invalidHasherSizeError{ - error: fmt.Errorf(msg, args...), - } -} - -// IsInvalidHasherSizeError checks if the input error is of an invalidHasherSizeError type. 
-// invalidHasherSizeError is an error returned when a crypto API is called with a hasher -// with an output size not suited with the cryptographic operation. -func IsInvalidHasherSizeError(err error) bool { - var target *invalidHasherSizeError - return errors.As(err, &target) -} diff --git a/crypto/dkg.go b/crypto/dkg.go deleted file mode 100644 index 03305d016c7..00000000000 --- a/crypto/dkg.go +++ /dev/null @@ -1,234 +0,0 @@ -package crypto - -import ( - "errors" - "fmt" -) - -// DKG stands for distributed key generation. In this library, DKG -// refers to discrete-log based protocols. -// The protocols implemented in the package for now generate keys for a BLS-based -// threshold signature scheme. -// BLS is used with the BLS12-381 curve. -// -// These protocols mainly generate a BLS key pair and share the secret key -// among (n) participants in a way that any (t+1) key shares allow reconstructing -// the initial key (and also reconstructing a BLS threshold signature under the initial key). -// Up to (t) shares don't reveal any information about the initial key (or a signature generated -// by that key). -// -// We refer to the initial key pair by group private and group public key. -// (t) is the threshold parameter. -// Flow uses DKG with the value t = floor((n-1)/2) to optimize for unforgeability and robustness -// of the threshold signature scheme using the output keys. -// -// Private keys are scalar in Fr, where r is the group order of G1/G2. -// Public keys are in G2. - -const ( - // DKG and Threshold Signatures - - // MinimumThreshold is the minimum value of the threshold parameter in all threshold-based protocols. - MinimumThreshold = 1 - // DKGMinSize is the minimum size of a group participating in a DKG protocol - DKGMinSize int = MinimumThreshold + 1 - // DKGMaxSize is the maximum size of a group participating in a DKG protocol - DKGMaxSize int = 254 -) - -type DKGState interface { - // Size returns the size of the DKG group n - Size() int - // Threshold returns the threshold value t - Threshold() int - // Start starts running a DKG in the current participant - Start(seed []byte) error - // HandleBroadcastMsg processes a new broadcasted message received by the current participant. - // orig is the message origin index - HandleBroadcastMsg(orig int, msg []byte) error - // HandlePrivateMsg processes a new private message received by the current participant. - // orig is the message origin index - HandlePrivateMsg(orig int, msg []byte) error - // End ends a DKG protocol in the current participant. - // It returns the finalized public data and participant private key share. - // - the group public key corresponding to the group secret key - // - all the public key shares corresponding to the participants private - // key shares - // - the finalized private key which is the current participant's own private key share - End() (PrivateKey, PublicKey, []PublicKey, error) - // NextTimeout set the next timeout of the protocol if any timeout applies. - // Some protocols could require more than one timeout - NextTimeout() error - // Running returns the running state of the DKG protocol - Running() bool - // ForceDisqualify forces a participant to get disqualified - // for a reason outside of the DKG protocol. - // The caller should make sure all honest participants call this function, - // otherwise, the protocol can be broken. 
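The wrapped-error pattern used throughout this file (an unexported struct embedding `error`, an `...Errorf` constructor, and an exported `Is...Error` predicate built on `errors.As`) is worth seeing in isolation. The sketch below restates the `invalidInputsError` code shown above as a self-contained program; it adds nothing beyond what the source already defines.

```go
package main

import (
	"errors"
	"fmt"
)

type invalidInputsError struct{ error }

func (e invalidInputsError) Unwrap() error { return e.error }

func invalidInputsErrorf(msg string, args ...interface{}) error {
	return &invalidInputsError{error: fmt.Errorf(msg, args...)}
}

// IsInvalidInputsError lets callers distinguish caller mistakes from
// unexpected internal failures without matching on error strings.
func IsInvalidInputsError(err error) bool {
	var target *invalidInputsError
	return errors.As(err, &target)
}

func main() {
	err := fmt.Errorf("keygen failed: %w", invalidInputsErrorf("seed too short: %d", 3))
	fmt.Println(IsInvalidInputsError(err)) // true: errors.As walks the wrap chain
}
```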
- ForceDisqualify(participant int) error -} - -// dkgFailureError is an error returned when a participant -// detects a failure in the protocol and is not able to compute output keys. -// Such a failure can be local and only depends on the participant's view of what -// happened in the protocol. The error can only be returned using the End() function. -type dkgFailureError struct { - error -} - -// dkgFailureErrorf constructs a new dkgFailureError -func dkgFailureErrorf(msg string, args ...interface{}) error { - return &dkgFailureError{ - error: fmt.Errorf(msg, args...), - } -} - -// IsDKGFailureError checks if the input error is of a dkgFailureError type. -// dkgFailureError is an error returned when a participant -// detects a failure in the protocol and is not able to compute output keys. -func IsDKGFailureError(err error) bool { - var target *dkgFailureError - return errors.As(err, &target) -} - -type dkgInvalidStateTransitionError struct { - error -} - -func (e dkgInvalidStateTransitionError) Unwrap() error { - return e.error -} - -// dkgInvalidStateTransitionErrorf constructs a new dkgInvalidStateTransitionError -func dkgInvalidStateTransitionErrorf(msg string, args ...interface{}) error { - return &dkgInvalidStateTransitionError{ - error: fmt.Errorf(msg, args...), - } -} - -// IsDkgInvalidStateTransitionError checks if the input error is of a dkgInvalidStateTransition type. -// invalidStateTransition is returned when a caller -// triggers an invalid state transition in the local DKG instance. -// Such a failure can only happen if the API is misued by not respecting -// the state machine conditions. -func IsDKGInvalidStateTransitionError(err error) bool { - var target *dkgInvalidStateTransitionError - return errors.As(err, &target) -} - -// index is the node index type used as participants ID -type index byte - -// newDKGCommon initializes the common structure of DKG protocols -func newDKGCommon(size int, threshold int, myIndex int, - processor DKGProcessor, dealerIndex int) (*dkgCommon, error) { - if size < DKGMinSize || size > DKGMaxSize { - return nil, invalidInputsErrorf( - "size should be between %d and %d", - DKGMinSize, - DKGMaxSize) - } - - if myIndex >= size || dealerIndex >= size || myIndex < 0 || dealerIndex < 0 { - return nil, invalidInputsErrorf( - "indices of current and dealer nodes must be between 0 and %d, got %d", - size-1, - myIndex) - } - - if threshold >= size || threshold < MinimumThreshold { - return nil, invalidInputsErrorf( - "The threshold must be between %d and %d, got %d", - MinimumThreshold, - size-1, - threshold) - } - - return &dkgCommon{ - size: size, - threshold: threshold, - myIndex: index(myIndex), - processor: processor, - }, nil -} - -// dkgCommon holds the common data of all DKG protocols -type dkgCommon struct { - size int - threshold int - myIndex index - // running is true when the DKG protocol is running, is false otherwise - running bool - // processes the action of the DKG interface outputs - processor DKGProcessor -} - -// Running returns the running state of the DKG protocol. -// The state is equal to true when the DKG protocol is running, and is equal to false otherwise. -func (s *dkgCommon) Running() bool { - return s.running -} - -// Size returns the size of the DKG group n -func (s *dkgCommon) Size() int { - return s.size -} - -// Threshold returns the threshold value t -func (s *dkgCommon) Threshold() int { - return s.threshold -} - -// NextTimeout sets the next protocol timeout if there is any. 
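Putting the size, threshold, and index constraints enforced by `newDKGCommon` next to Flow's choice of t = floor((n-1)/2) makes the arithmetic concrete. The sketch below restates those checks in standalone form; `validateDKGParams` and `optimalThreshold` are illustrative names (the latter echoes the test helper defined later in this patch).

```go
package main

import "fmt"

const (
	MinimumThreshold = 1
	DKGMinSize       = MinimumThreshold + 1
	DKGMaxSize       = 254
)

func optimalThreshold(n int) int { return (n - 1) / 2 }

func validateDKGParams(size, threshold, myIndex, dealerIndex int) error {
	if size < DKGMinSize || size > DKGMaxSize {
		return fmt.Errorf("size should be between %d and %d", DKGMinSize, DKGMaxSize)
	}
	if myIndex < 0 || myIndex >= size || dealerIndex < 0 || dealerIndex >= size {
		return fmt.Errorf("indices must be between 0 and %d", size-1)
	}
	if threshold < MinimumThreshold || threshold >= size {
		return fmt.Errorf("threshold must be between %d and %d", MinimumThreshold, size-1)
	}
	return nil
}

func main() {
	n := 10
	t := optimalThreshold(n) // t = 4: tolerates up to 4 corrupt participants,
	// while any t+1 = 5 shares can reconstruct the group key.
	fmt.Println(t, validateDKGParams(n, t, 0, 0)) // 4 <nil>
}
```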
-// This function should be overwritten by any protocol that uses timeouts. -func (s *dkgCommon) NextTimeout() error { - return nil -} - -// dkgMsgTag is the type used to encode message tags -type dkgMsgTag byte - -const ( - feldmanVSSShare dkgMsgTag = iota - feldmanVSSVerifVec - feldmanVSSComplaint - feldmanVSSComplaintAnswer -) - -// DKGProcessor is an interface that implements the DKG output actions. -// -// An instance of a DKGProcessor is needed for each participant in order to -// particpate in a DKG protocol -type DKGProcessor interface { - // PrivateSend sends a message to a destination over - // a private channel. The channel must preserve the - // confidentiality of the message and should authenticate - // the sender. - // It is recommended that the private channel is unique per - // protocol instance. This can be achieved by prepending all - // messages by a unique instance ID. - PrivateSend(dest int, data []byte) - // Broadcast broadcasts a message to all participants. - // This function assumes all participants have received the same message, - // failing to do so, the protocol can be broken. - // The broadcasted message is public and not confidential. - // The broadcasting channel should authenticate the sender. - // It is recommended that the broadcasting channel is unique per - // protocol instance. This can be achieved by prepending all - // messages by a unique instance ID. - Broadcast(data []byte) - // Disqualify flags that a participant is misbehaving and that it got - // disqualified from the protocol. Such behavior deserves - // disqualifying as it is flagged to all honest participants in - // the protocol. - // log describes the disqualification reason. - Disqualify(participant int, log string) - // FlagMisbehavior warns that a participant is misbehaving. - // Such behavior is not necessarily flagged to all participants and therefore - // the participant is not disqualified from the protocol. Other mechanisms - // outside DKG could be implemented to synchronize slashing the misbehaving - // participant by all participating participants, using the api `ForceDisqualify`. Failing to - // do so, the protocol can be broken. - // log describes the misbehavior. - FlagMisbehavior(participant int, log string) -} diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c deleted file mode 100644 index c8fee6917f6..00000000000 --- a/crypto/dkg_core.c +++ /dev/null @@ -1,109 +0,0 @@ -#include "dkg_include.h" - -// computes P(x) = a_0 + a_1*x + .. + a_n x^n in F_r -// where `x` is a small integer (byte) and `degree` is P's degree n. -// P(x) is written in `out` and P(x).g2 is written in `y` if `y` is non NULL. -void Fr_polynomial_image_write(byte *out, E2 *y, const Fr *a, const int degree, - const byte x) { - Fr image; - Fr_polynomial_image(&image, y, a, degree, x); - // exports the result - Fr_write_bytes(out, &image); -} - -// computes P(x) = a_0 + a_1 * x + .. + a_n * x^n where P is in Fr[X]. -// a_i are all in Fr, `degree` is P's degree, x is a small integer less than -// `MAX_IND` (currently 255). -// The function writes P(x) in `image` and P(x).g2 in `y` if `y` is non NULL. 
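`Fr_polynomial_image` evaluates P(x) with Horner's rule: start from the top coefficient, then repeatedly multiply by x and add the next coefficient, so a degree-n polynomial costs n multiplications. A toy Go version over `math/big`, with a small prime standing in for the BLS12-381 scalar field, is sketched below.

```go
package main

import (
	"fmt"
	"math/big"
)

// polyImage returns P(x) mod r for P = a[0] + a[1]X + ... + a[n]X^n,
// using the same Horner loop shape as the C code.
func polyImage(a []*big.Int, x int64, r *big.Int) *big.Int {
	image := big.NewInt(0)
	xv := big.NewInt(x)
	for i := len(a) - 1; i >= 0; i-- {
		image.Mul(image, xv)
		image.Add(image, a[i])
		image.Mod(image, r)
	}
	return image
}

func main() {
	r := big.NewInt(101) // toy prime; the real code uses the group order of G1/G2
	// P(X) = 7 + 3X + 2X^2, so P(5) = 7 + 15 + 50 = 72
	a := []*big.Int{big.NewInt(7), big.NewInt(3), big.NewInt(2)}
	fmt.Println(polyImage(a, 5, r)) // 72
}
```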
-void Fr_polynomial_image(Fr *image, E2 *y, const Fr *a, const int degree, - const byte x) { - Fr_set_zero(image); - // convert `x` to Montgomery form - Fr xR; - Fr_set_limb(&xR, (limb_t)x); - Fr_to_montg(&xR, &xR); - - for (int i = degree; i >= 0; i--) { - Fr_mul_montg(image, image, &xR); - Fr_add(image, image, &a[i]); // image is in normal form - } - // compute y = P(x).g2 - if (y) { - G2_mult_gen(y, image); - } -} - -// computes Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 -// and stores the point in y. -// - A_i being G2 points -// - x being a small scalar (less than `MAX_IND`) -static void E2_polynomial_image(E2 *y, const E2 *A, const int degree, - const byte x) { - E2_set_infty(y); - for (int i = degree; i >= 0; i--) { - E2_mult_small_expo(y, y, x); - E2_add(y, y, &A[i]); - } -} - -// computes y[i] = Q(i+1) for all participants i ( 0 <= i < len_y) -// where Q(x) = A_0 + A_1*x + ... + A_n*x^n -// - A_i being G2 points -// - x being a small scalar (less than `MAX_IND`) -void E2_polynomial_images(E2 *y, const int len_y, const E2 *A, - const int degree) { - for (byte i = 0; i < len_y; i++) { - // y[i] = Q(i+1) - E2_polynomial_image(y + i, A, degree, i + 1); - } -} - -// export an array of E2 into an array of bytes by concatenating -// all serializations of E2 points in order. -// the array must be of length (A_len * G2_SER_BYTES). -void E2_vector_write_bytes(byte *out, const E2 *A, const int A_len) { - byte *p = out; - for (int i = 0; i < A_len; i++) { - E2_write_bytes(p, &A[i]); - p += G2_SER_BYTES; - } -} - -// The function imports an array of `A_len` E2 points from a concatenated array -// of bytes. The bytes array is supposed to be of size (A_len * G2_SER_BYTES). -// -// If return is `VALID`, output vector is guaranteed to be in G2. -// It returns other errors if at least one input isn't a serialization of a E2 -// point, or an input E2 point isn't in G2. -// returns: -// - BAD_ENCODING if the serialization header bits of at least one input are -// invalid. -// - BAD_VALUE if Fp^2 coordinates of at least one input couldn't -// deserialize. -// - POINT_NOT_ON_CURVE if at least one input deserialized point isn't on -// E2. -// - POINT_NOT_IN_GROUP if at least one E2 point isn't in G2. -// - VALID if deserialization of all points to G2 is valid. -ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int A_len) { - byte *p = (byte *)src; - for (int i = 0; i < A_len; i++) { - int read_ret = E2_read_bytes(&A[i], p, G2_SER_BYTES); - if (read_ret != VALID) { - return read_ret; - } - if (!E2_in_G2(&A[i])) { - return POINT_NOT_IN_GROUP; - } - p += G2_SER_BYTES; - } - return VALID; -} - -// checks the discrete log relationship in G2. -// - returns 1 if g2^x = y, where g2 is the generator of G2 -// - returns 0 otherwise. -bool G2_check_log(const Fr *x, const E2 *y) { - E2 tmp; - G2_mult_gen(&tmp, x); - return E2_is_equal(&tmp, y); -} diff --git a/crypto/dkg_feldmanvss.go b/crypto/dkg_feldmanvss.go deleted file mode 100644 index dbe7771b6c4..00000000000 --- a/crypto/dkg_feldmanvss.go +++ /dev/null @@ -1,489 +0,0 @@ -package crypto - -// #include "dkg_include.h" -import "C" - -import ( - "fmt" - - "github.com/onflow/flow-go/crypto/hash" - "github.com/onflow/flow-go/crypto/random" -) - -// Implements Feldman Verifiable Secret Sharing using -// the BLS set up on the BLS12-381 curve. - -// The secret is a BLS private key generated by a single dealer. -// (and hence this is a centralized generation). 
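The building blocks above (`E2_polynomial_image` for evaluating the public vector and `G2_check_log` for the discrete-log check) combine into the standard Feldman verification: participant i accepts its share s_i only if g^{s_i} = Π_j A_j^{(i+1)^j}, where A_j = g^{a_j} are the dealer's commitments. The sketch below replays that equation in a toy multiplicative group mod p; the real code performs the same check in G2 over BLS12-381.

```go
package main

import (
	"fmt"
	"math/big"
)

func main() {
	p := big.NewInt(467) // toy modulus (real code: BLS12-381 G2)
	q := big.NewInt(233) // order of the subgroup generated by g
	g := big.NewInt(4)

	// Dealer polynomial P(X) = a0 + a1*X over Z_q; commitments A_j = g^{a_j}.
	a := []*big.Int{big.NewInt(57), big.NewInt(89)}
	A := make([]*big.Int, len(a))
	for j, aj := range a {
		A[j] = new(big.Int).Exp(g, aj, p)
	}

	// Share for participant i is s_i = P(i+1) mod q.
	i := int64(2)
	x := big.NewInt(i + 1)
	s := new(big.Int).Mod(new(big.Int).Add(a[0], new(big.Int).Mul(a[1], x)), q)

	// Verification: g^{s} == Π_j A_j^{x^j}.
	lhs := new(big.Int).Exp(g, s, p)
	rhs := big.NewInt(1)
	xj := big.NewInt(1) // x^0
	for _, Aj := range A {
		rhs.Mul(rhs, new(big.Int).Exp(Aj, xj, p)).Mod(rhs, p)
		xj.Mul(xj, x)
	}
	fmt.Println(lhs.Cmp(rhs) == 0) // true
}
```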
-// The generates key shares for a BLS-based -// threshold signature scheme and distributes the shares over the (n) -// partcipants including itself. The particpants validate their shares -// using a public verifiaction vector shared by the . - -// Private keys are scalar in Fr, where r is the group order of G1/G2 -// Public keys are in G2. - -// feldman VSS protocol, implements DKGState -type feldmanVSSstate struct { - // common DKG state - *dkgCommon - // participant index - dealerIndex index - // Polynomial P = a_0 + a_1*x + .. + a_t*x^t in Fr[X], the vector size is (t+1) - // a_0 is the group private key - a []scalar - // Public vector of the group, the vector size is (t+1) - // A_0 is the group public key - vA []pointE2 - vAReceived bool - // Private share of the current participant - x scalar - xReceived bool - // Public keys of the group participants, the vector size is (n) - y []pointE2 - // true if the private share is valid - validKey bool -} - -// NewFeldmanVSS creates a new instance of Feldman VSS protocol. -// -// An instance is run by a single participant and is usable for only one protocol. -// In order to run the protocol again, a new instance needs to be created -// -// The function returns: -// - (nil, InvalidInputsError) if: -// - size if not in [DKGMinSize, DKGMaxSize] -// - threshold is not in [MinimumThreshold, size-1] -// - myIndex is not in [0, size-1] -// - dealerIndex is not in [0, size-1] -// -// - (dkgInstance, nil) otherwise -func NewFeldmanVSS(size int, threshold int, myIndex int, - processor DKGProcessor, dealerIndex int) (DKGState, error) { - - common, err := newDKGCommon(size, threshold, myIndex, processor, dealerIndex) - if err != nil { - return nil, err - } - - fvss := &feldmanVSSstate{ - dkgCommon: common, - dealerIndex: index(dealerIndex), - } - fvss.init() - return fvss, nil -} - -func (s *feldmanVSSstate) init() { - // set the bls context - - s.running = false - s.y = nil - s.xReceived = false - s.vAReceived = false -} - -// Start triggers the protocol start for the current participant. -// If the current participant is the dealer, then the seed is used -// to generate the secret polynomial (including the group private key). -// If the current participant is not the dealer, the seed is ignored. -// -// The function returns: -// - invalidInputError if seed is too short -// - dkgInvalidStateTransitionError if the DKG instance is already running. -// - error if an unexpected exception occurs -// - nil otherwise -func (s *feldmanVSSstate) Start(seed []byte) error { - if s.running { - return dkgInvalidStateTransitionErrorf("dkg is already running") - } - - s.running = true - // Generate shares if necessary - if s.dealerIndex == s.myIndex { - return s.generateShares(seed) - } - return nil -} - -// End finalizes the protocol in the current node. -// It returns the finalized public data and participants private key share. -// - the group public key corresponding to the group secret key -// - all the public key shares corresponding to the participants private -// key shares. -// - the finalized private key which is the current participant's own private key share -// -// The returned erorr is : -// - dkgInvalidStateTransitionError if the DKG instance was not running. -// - dkgFailureError if the private key and vector are inconsistent. -// - dkgFailureError if the public key share or group public key is identity. -// - nil otherwise. 
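Based only on the constructor and interfaces shown above, a caller wires a `DKGProcessor` to its network layer and drives the state machine with `Start`, the `Handle*Msg` methods, and `End`. The hedged sketch below stubs the processor with logging; it omits the other n-1 participants and all message delivery, so a real run must deliver peers' messages before `End` can produce valid keys. The import path is the pre-relocation one from this repository.

```go
package main

import (
	"crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto"
)

type loggingProcessor struct{ me int }

func (p *loggingProcessor) PrivateSend(dest int, data []byte) {
	fmt.Printf("%d -> %d: private share (%d bytes)\n", p.me, dest, len(data))
}
func (p *loggingProcessor) Broadcast(data []byte) {
	fmt.Printf("%d broadcasts verification vector (%d bytes)\n", p.me, len(data))
}
func (p *loggingProcessor) Disqualify(participant int, log string) {
	fmt.Println("disqualify", participant, log)
}
func (p *loggingProcessor) FlagMisbehavior(participant int, log string) {
	fmt.Println("flag", participant, log)
}

func main() {
	n, t, me, dealer := 4, 1, 0, 0
	dkg, err := crypto.NewFeldmanVSS(n, t, me, &loggingProcessor{me: me}, dealer)
	if err != nil {
		panic(err)
	}
	seed := make([]byte, crypto.KeyGenSeedMinLen)
	if _, err := rand.Read(seed); err != nil {
		panic(err)
	}
	if err := dkg.Start(seed); err != nil { // the dealer generates and sends shares
		panic(err)
	}
	// ...deliver HandleBroadcastMsg / HandlePrivateMsg from peers here...
	sk, groupPK, pks, err := dkg.End()
	fmt.Println(sk, groupPK, len(pks), err)
}
```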
-func (s *feldmanVSSstate) End() (PrivateKey, PublicKey, []PublicKey, error) { - if !s.running { - return nil, nil, nil, dkgInvalidStateTransitionErrorf("dkg is not running") - } - s.running = false - if !s.validKey { - return nil, nil, nil, dkgFailureErrorf("received private key is invalid") - } - // private key of the current participant - x := newPrKeyBLSBLS12381(&s.x) - - // Group public key - Y := newPubKeyBLSBLS12381(&s.vA[0]) - - // The participants public keys - y := make([]PublicKey, s.size) - for i, p := range s.y { - y[i] = newPubKeyBLSBLS12381(&p) - } - - // check if current public key share or group public key is identity. - // In that case all signatures generated by the key are invalid (as stated by the BLS IETF draft) - // to avoid equivocation issues. - if (&s.x).isZero() { - return nil, nil, nil, dkgFailureErrorf("received private key is identity and is therefore invalid") - } - if Y.isIdentity { - return nil, nil, nil, dkgFailureErrorf("group private key is identity and is therefore invalid") - } - return x, Y, y, nil -} - -var ( - shareSize = frBytesLen - // the actual verifVectorSize depends on the state and is: - // g2BytesLen*(t+1) - verifVectorSize = g2BytesLen -) - -// HandleBroadcastMsg processes a new broadcasted message received by the current participant. -// `orig` is the message origin index. -// -// The function returns: -// - dkgInvalidStateTransitionError if the instance is not running -// - invalidInputsError if `orig` is not valid (in [0, size-1]) -// - nil otherwise -func (s *feldmanVSSstate) HandleBroadcastMsg(orig int, msg []byte) error { - if !s.running { - return dkgInvalidStateTransitionErrorf("dkg is not running") - } - if orig >= s.Size() || orig < 0 { - return invalidInputsErrorf( - "wrong origin input, should be less than %d, got %d", - s.Size(), - orig) - } - - // In case a message is received by the origin participant, - // the message is just ignored - if s.myIndex == index(orig) { - return nil - } - - if len(msg) == 0 { - s.processor.Disqualify(orig, "the received broadcast is empty") - return nil - } - - // msg = |tag| Data | - if dkgMsgTag(msg[0]) == feldmanVSSVerifVec { - s.receiveVerifVector(index(orig), msg[1:]) - } else { - s.processor.Disqualify(orig, - fmt.Sprintf("the broadcast header is invalid, got %d", - dkgMsgTag(msg[0]))) - } - return nil -} - -// HandlePrivateMsg processes a new private message received by the current participant. -// `orig` is the message origin index. -// -// The function returns: -// - dkgInvalidStateTransitionError if the instance is not running -// - invalidInputsError if `orig` is not valid (in [0, size-1]) -// - nil otherwise -func (s *feldmanVSSstate) HandlePrivateMsg(orig int, msg []byte) error { - if !s.running { - return dkgInvalidStateTransitionErrorf("dkg is not running") - } - - if orig >= s.Size() || orig < 0 { - return invalidInputsErrorf( - "wrong origin, should be positive less than %d, got %d", - s.Size(), - orig) - } - - // In case a private message is received by the origin participant, - // the message is just ignored - if s.myIndex == index(orig) { - return nil - } - - // forward received message to receiveShare because private messages - // can only be private shares - // msg = |tag| Data | - s.receiveShare(index(orig), msg) - - return nil -} - -// ForceDisqualify forces a participant to get disqualified -// for a reason outside of the DKG protocol -// The caller should make sure all honest participants call this function, -// otherwise, the protocol can be broken. 
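`HandleBroadcastMsg` above dispatches on a one-byte header: every message is framed as |tag|data|, and an empty message or unknown tag disqualifies the sender. The standalone sketch below replays that framing and dispatch with the same tag constants; the helper names are illustrative.

```go
package main

import "fmt"

type dkgMsgTag byte

const (
	feldmanVSSShare dkgMsgTag = iota
	feldmanVSSVerifVec
	feldmanVSSComplaint
	feldmanVSSComplaintAnswer
)

// frame prepends the one-byte tag to the payload: msg = |tag|data|.
func frame(tag dkgMsgTag, payload []byte) []byte {
	return append([]byte{byte(tag)}, payload...)
}

func dispatch(msg []byte) string {
	if len(msg) == 0 {
		return "empty broadcast: disqualify sender"
	}
	switch dkgMsgTag(msg[0]) {
	case feldmanVSSVerifVec:
		return fmt.Sprintf("verification vector, %d bytes", len(msg)-1)
	case feldmanVSSComplaint:
		if len(msg) < 2 {
			return "malformed complaint"
		}
		return fmt.Sprintf("complaint against participant %d", msg[1])
	case feldmanVSSComplaintAnswer:
		return fmt.Sprintf("complaint answer, %d bytes", len(msg)-1)
	default:
		return "invalid header: disqualify sender"
	}
}

func main() {
	fmt.Println(dispatch(frame(feldmanVSSComplaint, []byte{2})))
	fmt.Println(dispatch([]byte{42}))
}
```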
-// -// The function returns: -// - dkgInvalidStateTransitionError if the instance is not running -// - invalidInputsError if `orig` is not valid (in [0, size-1]) -// - nil otherwise -func (s *feldmanVSSstate) ForceDisqualify(participant int) error { - if !s.running { - return dkgInvalidStateTransitionErrorf("dkg is not running") - } - if participant >= s.Size() || participant < 0 { - return invalidInputsErrorf( - "wrong origin input, should be less than %d, got %d", - s.Size(), - participant) - } - if index(participant) == s.dealerIndex { - s.validKey = false - } - return nil -} - -// generate a pseudo-random polynomial P = a_0 + a_1*x + .. + a_n x^n in Fr[X] -// where `n` is the input `degree` (higher degree monomial in non-zero). -// `a_0` is also non-zero (for single dealer BLS-DKGs, this insures -// protocol public key output is not identity). -// `seed` is used as the entropy source and must be at least `KeyGenSeedMinLen` -// random bytes with at least 128 bits entropy. -func generateFrPolynomial(seed []byte, degree int) ([]scalar, error) { - if len(seed) < KeyGenSeedMinLen { - return nil, invalidInputsErrorf( - "seed should be at least %d bytes, got %d", KeyGenSeedMinLen, len(seed)) - } - - // build a PRG out of the seed - // In this case, SHA3 is used to smoothen the seed and Chacha20 is used as a PRG - var prgSeed [random.Chacha20SeedLen]byte - hash.ComputeSHA3_256(&prgSeed, seed) - prg, err := random.NewChacha20PRG(prgSeed[:], []byte("gen_poly")) - if err != nil { - return nil, fmt.Errorf("instanciating the PRG failed: %w", err) - } - - // P's coefficients - a := make([]scalar, degree+1) - - // generate a_0 in F_r* - randFrStar(&a[0], prg) - if degree > 0 { - // genarate a_i on F_r, for 0= s.Size() || orig < 0 { - return invalidInputsErrorf( - "wrong origin input, should be less than %d, got %d", - s.Size(), - orig) - } - - // In case a message is received by the origin participant, - // the message is just ignored - if s.myIndex == index(orig) { - return nil - } - - // if dealer is already disqualified, ignore the message - if s.disqualified { - return nil - } - - if len(msg) == 0 { - if index(orig) == s.dealerIndex { - s.disqualified = true - } - s.processor.Disqualify(orig, "received broadcast is empty") - return nil - } - - switch dkgMsgTag(msg[0]) { - case feldmanVSSVerifVec: - s.receiveVerifVector(index(orig), msg[1:]) - case feldmanVSSComplaint: - s.receiveComplaint(index(orig), msg[1:]) - case feldmanVSSComplaintAnswer: - s.receiveComplaintAnswer(index(orig), msg[1:]) - default: - if index(orig) == s.dealerIndex { - s.disqualified = true - } - s.processor.Disqualify(orig, - fmt.Sprintf("invalid broadcast header, got %d", - dkgMsgTag(msg[0]))) - } - return nil -} - -// HandlePrivateMsg processes a new private message received by the current participant. -// orig is the message origin index. 
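`generateFrPolynomial` above first smooths the caller's seed with SHA3-256 and then expands it through a ChaCha20-based PRG to draw the polynomial coefficients. The sketch below reproduces that seed-expansion shape with `golang.org/x/crypto` primitives directly; the real code goes through the package's own `random.NewChacha20PRG` wrapper, and the 64-bit draws here are stand-ins for Fr elements.

```go
package main

import (
	"encoding/binary"
	"fmt"

	"golang.org/x/crypto/chacha20"
	"golang.org/x/crypto/sha3"
)

func main() {
	seed := []byte("at least 32 bytes of real entropy...")
	key := sha3.Sum256(seed) // smooth the seed into a fixed-size PRG key

	// Domain-separate the stream, playing the role of the "gen_poly" tag.
	nonce := make([]byte, chacha20.NonceSize)
	copy(nonce, "gen_poly")
	prg, err := chacha20.NewUnauthenticatedCipher(key[:], nonce)
	if err != nil {
		panic(err)
	}

	// Draw 3 pseudo-random 64-bit coefficients.
	buf := make([]byte, 8)
	for i := 0; i < 3; i++ {
		zero := make([]byte, 8)
		prg.XORKeyStream(buf, zero) // keystream XOR zeros = raw PRG output
		fmt.Printf("a_%d = %d\n", i, binary.LittleEndian.Uint64(buf))
	}
}
```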
-// -// The function returns: -// - dkgInvalidStateTransitionError if the instance is not running -// - invalidInputsError if `orig` is not valid (in [0, size-1]) -// - nil otherwise -func (s *feldmanVSSQualState) HandlePrivateMsg(orig int, msg []byte) error { - if !s.running { - return dkgInvalidStateTransitionErrorf("dkg is not running") - } - if orig >= s.Size() || orig < 0 { - return invalidInputsErrorf( - "invalid origin, should be positive less than %d, got %d", - s.Size(), - orig) - } - - // In case a private message is received by the origin participant, - // the message is just ignored - if s.myIndex == index(orig) { - return nil - } - - // if dealer is already disqualified, ignore the message - if s.disqualified { - return nil - } - - // forward all the message to receiveShare because any private message - // has to be a private share - s.receiveShare(index(orig), msg) - - return nil -} - -// ForceDisqualify forces a participant to get disqualified -// for a reason outside of the DKG protocol -// The caller should make sure all honest participants call this function, -// otherwise, the protocol can be broken -// -// The function returns: -// - dkgInvalidStateTransitionError if the instance is not running -// - invalidInputsError if `orig` is not valid (in [0, size-1]) -// - nil otherwise -func (s *feldmanVSSQualState) ForceDisqualify(participant int) error { - if !s.running { - return dkgInvalidStateTransitionErrorf("dkg is not running") - } - if participant >= s.Size() || participant < 0 { - return invalidInputsErrorf( - "invalid origin input, should be less than %d, got %d", - s.Size(), participant) - } - if index(participant) == s.dealerIndex { - s.disqualified = true - } - return nil -} - -// The function does not check the call respects the machine -// state transition of feldmanVSSQual. The calling function must make sure this call -// is valid. -func (s *feldmanVSSQualState) setSharesTimeout() { - s.sharesTimeout = true - // if verif vector is not received, disqualify the dealer - if !s.vAReceived { - s.disqualified = true - s.processor.Disqualify(int(s.dealerIndex), - "verification vector was not received") - return - } - // if share is not received, make a complaint - if !s.xReceived { - s.buildAndBroadcastComplaint() - } -} - -// The function does not check the call respects the machine -// state transition of feldmanVSSQual. The calling function must make sure this call -// is valid. -func (s *feldmanVSSQualState) setComplaintsTimeout() { - s.complaintsTimeout = true - // if more than t complaints are received, the dealer is disqualified - // regardless of the answers. - // (at this point, all answered complaints should have been already received) - // (i.e there is no complaint with (!c.received && c.answerReceived) - if len(s.complaints) > s.threshold { - s.disqualified = true - s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("there are %d complaints, they exceeded the threshold %d", - len(s.complaints), s.threshold)) - } -} - -func (s *feldmanVSSQualState) receiveShare(origin index, data []byte) { - // only accept private shares from the dealer. 
- if origin != s.dealerIndex { - return - } - - // check the share timeout - if s.sharesTimeout { - s.processor.FlagMisbehavior(int(origin), - "private share is received after the shares timeout") - return - } - - if s.xReceived { - s.processor.FlagMisbehavior(int(origin), - "private share was already received") - return - } - - // at this point, tag private share is received - s.xReceived = true - - // private message general check - if len(data) == 0 || dkgMsgTag(data[0]) != feldmanVSSShare { - s.buildAndBroadcastComplaint() - s.processor.FlagMisbehavior(int(origin), - fmt.Sprintf("private share should be non-empty and first byte should be %d, received %#x", - feldmanVSSShare, data)) - return - } - - // consider the remaining data from message - data = data[1:] - - if (len(data)) != shareSize { - s.buildAndBroadcastComplaint() - s.processor.FlagMisbehavior(int(origin), - fmt.Sprintf("invalid share size, expects %d, got %d", - shareSize, len(data))) - return - } - // read the participant private share - err := readScalarFrStar(&s.x, data) - if err != nil { - s.buildAndBroadcastComplaint() - s.processor.FlagMisbehavior(int(origin), - fmt.Sprintf("invalid share value %x: %s", data, err)) - return - } - - if s.vAReceived { - if !s.verifyShare() { - // build a complaint - s.buildAndBroadcastComplaint() - } - } -} - -func (s *feldmanVSSQualState) receiveVerifVector(origin index, data []byte) { - // only accept the verification vector from the dealer. - if origin != s.dealerIndex { - return - } - - // check the share timeout - if s.sharesTimeout { - s.processor.FlagMisbehavior(int(origin), - "verification vector received after the shares timeout") - return - } - - if s.vAReceived { - s.processor.FlagMisbehavior(int(origin), - "verification received was already received") - return - } - s.vAReceived = true - - if len(data) != verifVectorSize*(s.threshold+1) { - s.disqualified = true - s.processor.Disqualify(int(origin), - fmt.Sprintf("invalid verification vector size, expects %d, got %d", - verifVectorSize*(s.threshold+1), len(data))) - return - } - // read the verification vector - s.vA = make([]pointE2, s.threshold+1) - err := readVerifVector(s.vA, data) - if err != nil { - s.disqualified = true - s.processor.Disqualify(int(origin), - fmt.Sprintf("reading the verification vector failed:%s", err)) - return - } - - s.y = make([]pointE2, s.size) - // compute all public keys - s.computePublicKeys() - - // check the (already) registered complaints - for complainer, c := range s.complaints { - if c.received && c.answerReceived { - if s.checkComplaint(complainer, c) { - s.disqualified = true - s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("verification vector received: a complaint answer to (%d) is invalid, answer is %s, computed key is %s", - complainer, &c.answer, &s.y[complainer])) - return - } - } - } - // check the private share - if s.xReceived { - if !s.verifyShare() { - s.buildAndBroadcastComplaint() - } - } -} - -// build a complaint against the dealer, add it to the local -// complaint map and broadcast it -func (s *feldmanVSSQualState) buildAndBroadcastComplaint() { - var logMsg string - if s.vAReceived && s.xReceived { - logMsg = fmt.Sprintf("building a complaint, share is %s, computed public key is %s", - &s.x, &s.y[s.myIndex]) - } else { - logMsg = "building a complaint" - } - s.processor.FlagMisbehavior(int(s.dealerIndex), logMsg) - s.complaints[s.myIndex] = &complaint{ - received: true, - answerReceived: false, - } - data := []byte{byte(feldmanVSSComplaint), 
byte(s.dealerIndex)} - s.processor.Broadcast(data) -} - -// build a complaint answer, add it to the local -// complaint map and broadcast it -func (s *feldmanVSSQualState) buildAndBroadcastComplaintAnswer(complainee index) { - data := make([]byte, complaintAnswerSize+1) - data[0] = byte(feldmanVSSComplaintAnswer) - data[1] = byte(complainee) - frPolynomialImage(data[2:], s.a, complainee+1, nil) - s.complaints[complainee].answerReceived = true - s.processor.Broadcast(data) -} - -// assuming a complaint and its answer were both received, this function returns: -// - false if the complaint answer is correct -// - true if the complaint answer is not correct -func (s *feldmanVSSQualState) checkComplaint(complainer index, c *complaint) bool { - // check y[complainer] == share.G2 - isLog := C.G2_check_log( - (*C.Fr)(&c.answer), - (*C.E2)(&s.y[complainer])) - return !bool(isLog) -} - -// data = |complainee| -func (s *feldmanVSSQualState) receiveComplaint(origin index, data []byte) { - // check the complaint timeout - if s.complaintsTimeout { - s.processor.FlagMisbehavior(int(origin), - "complaint received after the complaint timeout") - return - } - - if len(data) != complaintSize { - // only the dealer of the instance gets disqualified - if origin == s.dealerIndex { - s.disqualified = true - s.processor.Disqualify(int(origin), - fmt.Sprintf("invalid complaint size, expects %d, got %d", - complaintSize, len(data))) - } - return - } - - // the byte encodes the complainee - complainee := index(data[0]) - - // validate the complainee value - if int(complainee) >= s.size { - // only the dealer of the instance gets disqualified - if origin == s.dealerIndex { - s.disqualified = true - s.processor.Disqualify(int(origin), - fmt.Sprintf("invalid complainee, should be less than %d, got %d", - s.size, complainee)) - } - return - } - - // if the complaint is coming from the dealer, ignore it - if origin == s.dealerIndex { - return - } - - // if the complainee is not the dealer, ignore the complaint - if complainee != s.dealerIndex { - return - } - - c, ok := s.complaints[origin] - // if the complaint is new, add it - if !ok { - s.complaints[origin] = &complaint{ - received: true, - answerReceived: false, - } - // if the complainee is the current participant, prepare an answer - if s.myIndex == s.dealerIndex { - s.buildAndBroadcastComplaintAnswer(origin) - } - return - } - // complaint is not new in the map - // check if the complaint has been already received - if c.received { - s.processor.FlagMisbehavior(int(origin), - "complaint was already received") - return - } - c.received = true - // answerReceived flag check is a sanity check - if s.vAReceived && c.answerReceived && s.myIndex != s.dealerIndex { - s.disqualified = s.checkComplaint(origin, c) - if s.disqualified { - s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("complaint received: answer to (%d) is invalid, answer is %s, computed public key is %s", - origin, &c.answer, &s.y[origin])) - } - return - } -} - -// answer = |complainer| private share | -func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) { - // check for invalid answers - if origin != s.dealerIndex { - return - } - - // check the answer format - if len(data) != complaintAnswerSize { - s.disqualified = true - s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("the complaint answer has an invalid length, expects %d, got %d", - complaintAnswerSize, len(data))) - return - } - - // first byte encodes the complainee - complainer := index(data[0]) - if 
int(complainer) >= s.size { - s.disqualified = true - s.processor.Disqualify(int(origin), - fmt.Sprintf("complainer value is invalid, should be less that %d, got %d", - s.size, int(complainer))) - return - } - - c, ok := s.complaints[complainer] - // if the complaint is new, add it - if !ok { - s.complaints[complainer] = &complaint{ - received: false, - answerReceived: true, - } - - // read the complainer private share - err := readScalarFrStar(&s.complaints[complainer].answer, data[1:]) - if err != nil { - s.disqualified = true - s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("invalid complaint answer value %x: %s", data, err)) - return - } - return - } - // complaint is not new in the map - // check if the answer has been already received - if c.answerReceived { - s.processor.FlagMisbehavior(int(origin), - "complaint answer was already received") - return - } - c.answerReceived = true - - // flag check is a sanity check - if c.received { - // read the complainer private share - err := readScalarFrStar(&c.answer, data[1:]) - if err != nil { - s.disqualified = true - s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("invalid complaint answer value %x: %s", data, err)) - return - } - if s.vAReceived { - s.disqualified = s.checkComplaint(complainer, c) - if s.disqualified { - s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("complaint answer received: answer to (%d) is invalid, answer is %s, computed key is %s", - complainer, &c.answer, &s.y[complainer])) - } - } - - // fix the share of the current participant if the complaint is invalid - if !s.disqualified && complainer == s.myIndex { - s.x = c.answer - } - } -} diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h deleted file mode 100644 index 02fb9a363f4..00000000000 --- a/crypto/dkg_include.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _DKG_INCLUDE_H -#define _DKG_INCLUDE_H - -#include "bls12381_utils.h" - -void Fr_polynomial_image_write(byte *out, E2 *y, const Fr *a, const int deg, - const byte x); -void Fr_polynomial_image(Fr *out, E2 *y, const Fr *a, const int deg, - const byte x); -void E2_polynomial_images(E2 *y, const int len_y, const E2 *A, const int deg); -void E2_vector_write_bytes(byte *out, const E2 *A, const int len); -ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int len); -bool G2_check_log(const Fr *x, const E2 *y); - -#endif diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go deleted file mode 100644 index 115730e33d9..00000000000 --- a/crypto/dkg_jointfeldman.go +++ /dev/null @@ -1,335 +0,0 @@ -package crypto - -// #include "dkg_include.h" -import "C" - -import ( - "fmt" -) - -// Implements Joint Feldman (Pedersen) protocol using -// the BLS set up on the BLS12-381 curve. -// The protocol runs (n) parallel instances of Feldman vss with -// the complaints mechanism, each participant being a dealer -// once. - -// This is a fully distributed generation. The secret is a BLS -// private key generated jointly by all the participants. - -// (t) is the threshold parameter. Although the API allows using arbitrary values of (t), -// the DKG protocol is secure in the presence of up to (t) malicious participants -// when (t < n/2). -// Joint-Feldman is the protocol implemented in Flow, (t) being set to the maximum value -// t = floor((n-1)/2) to optimize for unforgeability and robustness of the threshold -// signature scheme using the output keys. - -// In each feldman VSS instance, the dealer generates a chunk of the -// the private key of a BLS threshold signature scheme. 
-// Using the complaints mechanism, each dealer is qualified or disqualified -// from the protocol, and the overall key is taking into account -// all chunks from qualified dealers. - -// Private keys are scalar in Fr, where r is the group order of G1/G2 -// Public keys are in G2. - -// Joint Feldman protocol, with complaint mechanism, implements DKGState -type JointFeldmanState struct { - *dkgCommon - // jointRunning is true if and only if all parallel Feldman vss protocols are running - jointRunning bool - // feldmanVSSQualState parallel states - fvss []feldmanVSSQualState - // is the group public key - jointPublicKey pointE2 - // Private share of the current participant - jointx scalar - // Public keys of the group participants, the vector size is (n) - jointy []pointE2 -} - -// NewJointFeldman creates a new instance of a Joint Feldman protocol. -// -// size if the total number of participants (n). -// threshold is the threshold parameter (t). the DKG protocol is secure in the -// presence of up to (t) malicious participants when (t < n/2). -// myIndex is the index of the participant creating the new DKG instance. -// processor is the DKGProcessor instance required to connect the participant to the -// communication channels. -// -// An instance is run by a single participant and is usable for only one protocol. -// In order to run the protocol again, a new instance needs to be created. -// -// The function returns: -// - (nil, InvalidInputsError) if: -// - size if not in [DKGMinSize, DKGMaxSize] -// - threshold is not in [MinimumThreshold, size-1] -// - myIndex is not in [0, size-1] -// - dealerIndex is not in [0, size-1] -// -// - (dkgInstance, nil) otherwise -func NewJointFeldman(size int, threshold int, myIndex int, - processor DKGProcessor) (DKGState, error) { - - common, err := newDKGCommon(size, threshold, myIndex, processor, 0) - if err != nil { - return nil, err - } - - jf := &JointFeldmanState{ - dkgCommon: common, - } - jf.init() - return jf, nil -} - -func (s *JointFeldmanState) init() { - s.fvss = make([]feldmanVSSQualState, s.size) - for i := 0; i < s.size; i++ { - fvss := &feldmanVSSstate{ - dkgCommon: s.dkgCommon, - dealerIndex: index(i), - } - s.fvss[i] = feldmanVSSQualState{ - feldmanVSSstate: fvss, - disqualified: false, - } - s.fvss[i].init() - } -} - -// Start triggers Joint Feldman protocol start for the current participant. -// The seed is used to generate the FVSS secret polynomial -// (including the instance group private key) when the current -// participant is the dealer. -// -// The returned error is : -// - dkgInvalidStateTransitionError if the DKG instance is already running. -// - error if an unexpected exception occurs -// - nil otherwise. -func (s *JointFeldmanState) Start(seed []byte) error { - if s.jointRunning { - return dkgInvalidStateTransitionErrorf("dkg is already running") - } - - for i := index(0); int(i) < s.size; i++ { - s.fvss[i].running = false - err := s.fvss[i].Start(seed) - if err != nil { - return fmt.Errorf("error when starting dkg: %w", err) - } - } - s.jointRunning = true - return nil -} - -// NextTimeout sets the next timeout of the protocol if any timeout applies. -// -// The returned error is : -// - dkgInvalidStateTransitionError if the DKG instance was not running. -// - dkgInvalidStateTransitionError if the DKG instance already called the 2 required timeouts. -// - nil otherwise. 
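Structurally, Joint Feldman is n copies of Feldman VSS running side by side: participant k is the dealer of instance k, and every incoming message is forwarded to all n instances, as the `JointFeldmanState` handlers do over `s.fvss`. A stripped-down sketch of that fan-out, with a stand-in for `feldmanVSSQualState`:

```go
package main

import "fmt"

type fvssInstance struct{ dealer int }

func (f *fvssInstance) handleBroadcast(orig int, msg []byte) error {
	fmt.Printf("instance with dealer %d sees broadcast from %d\n", f.dealer, orig)
	return nil
}

type jointFeldman struct{ fvss []*fvssInstance }

// handleBroadcast mirrors the fan-out in HandleBroadcastMsg:
// one message, n parallel state updates.
func (j *jointFeldman) handleBroadcast(orig int, msg []byte) error {
	for _, f := range j.fvss {
		if err := f.handleBroadcast(orig, msg); err != nil {
			return fmt.Errorf("handle broadcast message failed: %w", err)
		}
	}
	return nil
}

func main() {
	n := 3
	j := &jointFeldman{}
	for k := 0; k < n; k++ {
		j.fvss = append(j.fvss, &fvssInstance{dealer: k})
	}
	_ = j.handleBroadcast(1, []byte{0x01})
}
```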
-func (s *JointFeldmanState) NextTimeout() error { - if !s.jointRunning { - return dkgInvalidStateTransitionErrorf("dkg protocol %d is not running", s.myIndex) - } - - for i := index(0); int(i) < s.size; i++ { - err := s.fvss[i].NextTimeout() - if err != nil { - return fmt.Errorf("next timeout failed: %w", err) - } - } - return nil -} - -// End ends the protocol in the current participant -// It returns the finalized public data and participant private key share. -// - the group public key corresponding to the group secret key -// - all the public key shares corresponding to the participants private -// key shares. -// - the finalized private key which is the current participant's own private key share -// -// The returned error is: -// - dkgFailureError if the disqualified dealers exceeded the threshold -// - dkgFailureError if the public key share or group public key is identity. -// - dkgInvalidStateTransitionError Start() was not called, or NextTimeout() was not called twice -// - nil otherwise. -func (s *JointFeldmanState) End() (PrivateKey, PublicKey, []PublicKey, error) { - if !s.jointRunning { - return nil, nil, nil, dkgInvalidStateTransitionErrorf("dkg protocol %d is not running", s.myIndex) - } - - disqualifiedTotal := 0 - for i := 0; i < s.size; i++ { - // check previous timeouts were called - if !s.fvss[i].sharesTimeout || !s.fvss[i].complaintsTimeout { - return nil, nil, nil, - dkgInvalidStateTransitionErrorf("%d: two timeouts should be set before ending dkg", s.myIndex) - } - - // check if a complaint has remained without an answer - // a dealer is disqualified if a complaint was never answered - if !s.fvss[i].disqualified { - for complainer, c := range s.fvss[i].complaints { - if c.received && !c.answerReceived { - s.fvss[i].disqualified = true - s.processor.Disqualify(i, - fmt.Sprintf("complaint from %d was not answered", complainer)) - disqualifiedTotal++ - break - } - } - } else { - disqualifiedTotal++ - } - } - s.jointRunning = false - - // check failing dkg - if disqualifiedTotal > s.threshold || s.size-disqualifiedTotal <= s.threshold { - return nil, nil, nil, - dkgFailureErrorf( - "Joint-Feldman failed because the disqualified participants number is high: %d disqualified, threshold is %d, size is %d", - disqualifiedTotal, s.threshold, s.size) - } - - // wrap up the keys from qualified dealers - jointx, jointPublicKey, jointy := s.sumUpQualifiedKeys(s.size - disqualifiedTotal) - - // private key of the current participant - x := newPrKeyBLSBLS12381(jointx) - - // Group public key - Y := newPubKeyBLSBLS12381(jointPublicKey) - - // The participants public keys - y := make([]PublicKey, s.size) - for i, p := range jointy { - y[i] = newPubKeyBLSBLS12381(&p) - } - - // check if current public key share or group public key is identity. - // In that case all signatures generated by the current private key share or - // the group private key are invalid (as stated by the BLS IETF draft) - // to avoid equivocation issues. - // - // Assuming both private keys have entropy from at least one honest dealer, each private - // key is initially uniformly distributed over the 2^255 possible values. We can argue that - // the known uniformity-bias caused by malicious dealers in Joint-Feldman does not weaken - // the likelihood of generating an identity key to practical probabilities. 
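`End` above declares the protocol failed when `disqualifiedTotal > threshold` or `size - disqualifiedTotal <= threshold`, i.e. when fewer than t+1 dealers remain qualified. The sketch below tabulates that condition for n = 5 with Flow's optimal threshold t = 2.

```go
package main

import "fmt"

// dkgSucceeds restates End's failure check: succeed only when at most t
// dealers were disqualified and more than t stayed qualified.
func dkgSucceeds(n, t, disqualified int) bool {
	return disqualified <= t && n-disqualified > t
}

func main() {
	n, t := 5, 2 // t = (n-1)/2
	for dq := 0; dq <= n; dq++ {
		fmt.Printf("disqualified=%d -> success=%v\n", dq, dkgSucceeds(n, t, dq))
	}
	// Succeeds for dq = 0, 1, 2 and fails for dq = 3, 4, 5.
}
```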
- if (jointx).isZero() { - return nil, nil, nil, dkgFailureErrorf("private key share is identity and is therefore invalid") - } - if Y.isIdentity { - return nil, nil, nil, dkgFailureErrorf("group private key is identity and is therefore invalid") - } - return x, Y, y, nil -} - -// HandleBroadcastMsg processes a new broadcasted message received by the current participant -// orig is the message origin index -// -// The function returns: -// - dkgInvalidStateTransitionError if the instance is not running -// - invalidInputsError if `orig` is not valid (in [0, size-1]) -// - nil otherwise -func (s *JointFeldmanState) HandleBroadcastMsg(orig int, msg []byte) error { - if !s.jointRunning { - return dkgInvalidStateTransitionErrorf("dkg protocol %d is not running", s.myIndex) - } - for i := index(0); int(i) < s.size; i++ { - err := s.fvss[i].HandleBroadcastMsg(orig, msg) - if err != nil { - return fmt.Errorf("handle broadcast message failed: %w", err) - } - } - return nil -} - -// HandlePrivateMsg processes a new private message received by the current participant -// orig is the message origin index -// -// The function returns: -// - dkgInvalidStateTransitionError if the instance is not running -// - invalidInputsError if `orig` is not valid (in [0, size-1]) -// - nil otherwise -func (s *JointFeldmanState) HandlePrivateMsg(orig int, msg []byte) error { - if !s.jointRunning { - return dkgInvalidStateTransitionErrorf("dkg protocol %d is not running", s.myIndex) - } - for i := index(0); int(i) < s.size; i++ { - err := s.fvss[i].HandlePrivateMsg(orig, msg) - if err != nil { - return fmt.Errorf("handle private message failed: %w", err) - } - } - return nil -} - -// Running returns the running state of Joint Feldman protocol -func (s *JointFeldmanState) Running() bool { - return s.jointRunning -} - -// ForceDisqualify forces a participant to get disqualified -// for a reason outside of the DKG protocol -// The caller should make sure all honest participants call this function, -// otherwise, the protocol can be broken -// -// The function returns: -// - dkgInvalidStateTransitionError if the instance is not running -// - invalidInputsError if `orig` is not valid (in [0, size-1]) -// - nil otherwise -func (s *JointFeldmanState) ForceDisqualify(participant int) error { - if !s.jointRunning { - return dkgInvalidStateTransitionErrorf("dkg is not running") - } - // disqualify the participant in the fvss instance where they are a dealer - err := s.fvss[participant].ForceDisqualify(participant) - if err != nil { - return fmt.Errorf("force disqualify failed: %w", err) - } - return nil -} - -// sum up the 3 type of keys from all qualified dealers to end the protocol -func (s *JointFeldmanState) sumUpQualifiedKeys(qualified int) (*scalar, *pointE2, []pointE2) { - qualifiedx, qualifiedPubKey, qualifiedy := s.getQualifiedKeys(qualified) - - // sum up x - var jointx scalar - C.Fr_sum_vector((*C.Fr)(&jointx), (*C.Fr)(&qualifiedx[0]), - (C.int)(qualified)) - // sum up Y - var jointPublicKey pointE2 - C.E2_sum_vector_to_affine((*C.E2)(&jointPublicKey), - (*C.E2)(&qualifiedPubKey[0]), (C.int)(qualified)) - // sum up []y - jointy := make([]pointE2, s.size) - for i := 0; i < s.size; i++ { - C.E2_sum_vector_to_affine((*C.E2)(&jointy[i]), - (*C.E2)(&qualifiedy[i][0]), (C.int)(qualified)) - } - return &jointx, &jointPublicKey, jointy -} - -// get the 3 type of keys from all qualified dealers -func (s *JointFeldmanState) getQualifiedKeys(qualified int) ([]scalar, []pointE2, [][]pointE2) { - qualifiedx := make([]scalar, 0, 
qualified) - qualifiedPubKey := make([]pointE2, 0, qualified) - qualifiedy := make([][]pointE2, s.size) - for i := 0; i < s.size; i++ { - qualifiedy[i] = make([]pointE2, 0, qualified) - } - - for i := 0; i < s.size; i++ { - if !s.fvss[i].disqualified { - qualifiedx = append(qualifiedx, s.fvss[i].x) - qualifiedPubKey = append(qualifiedPubKey, s.fvss[i].vA[0]) - for j := 0; j < s.size; j++ { - qualifiedy[j] = append(qualifiedy[j], s.fvss[i].y[j]) - } - } - } - return qualifiedx, qualifiedPubKey, qualifiedy -} diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go deleted file mode 100644 index 2bd4dc51fa0..00000000000 --- a/crypto/dkg_test.go +++ /dev/null @@ -1,829 +0,0 @@ -package crypto - -import ( - crand "crypto/rand" - "fmt" - mrand "math/rand" - "sync" - "testing" - "time" - - log "github.com/sirupsen/logrus" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -var gt *testing.T - -func TestDKG(t *testing.T) { - t.Run("FeldmanVSSSimple", testFeldmanVSSSimple) - t.Run("FeldmanVSSQual", testFeldmanVSSQual) - t.Run("JointFeldman", testJointFeldman) -} - -// optimal threshold (t) to allow the largest number of malicious participants (m) -// assuming the protocol requires: -// -// m<=t for unforgeability -// n-m>=t+1 for robustness -func optimalThreshold(size int) int { - return (size - 1) / 2 -} - -// Testing the happy path of Feldman VSS by simulating a network of n participants -func testFeldmanVSSSimple(t *testing.T) { - log.SetLevel(log.ErrorLevel) - - n := 4 - for threshold := MinimumThreshold; threshold < n; threshold++ { - t.Run(fmt.Sprintf("FeldmanVSS (n,t)=(%d,%d)", n, threshold), func(t *testing.T) { - dkgCommonTest(t, feldmanVSS, n, threshold, happyPath) - }) - } -} - -type testCase int - -const ( - happyPath testCase = iota - invalidShares - invalidVector - invalidComplaint - invalidComplaintAnswer - duplicatedMessages -) - -type behavior int - -const ( - honest behavior = iota - manyInvalidShares - fewInvalidShares - invalidVectorBroadcast - invalidComplaintBroadcast - timeoutedComplaintBroadcast - invalidSharesComplainTrigger - invalidComplaintAnswerBroadcast - duplicatedSendAndBroadcast -) - -// Testing Feldman VSS with the qualification system by simulating a network of n participants -func testFeldmanVSSQual(t *testing.T) { - log.SetLevel(log.ErrorLevel) - - n := 4 - // happy path, test multiple values of thresold - for threshold := MinimumThreshold; threshold < n; threshold++ { - t.Run(fmt.Sprintf("FeldmanVSSQual_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { - dkgCommonTest(t, feldmanVSSQual, n, threshold, happyPath) - }) - } - - // unhappy path, with focus on the optimal threshold value - n = 5 - threshold := optimalThreshold(n) - // unhappy path, with invalid shares - t.Run(fmt.Sprintf("FeldmanVSSQual_InvalidShares_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { - dkgCommonTest(t, feldmanVSSQual, n, threshold, invalidShares) - }) - // unhappy path, with invalid vector - t.Run(fmt.Sprintf("FeldmanVSSQual_InvalidVector_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { - dkgCommonTest(t, feldmanVSSQual, n, threshold, invalidVector) - }) - // unhappy paths with invalid complaints and complaint answers - // are only tested within joint feldman. 
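`sumUpQualifiedKeys` above adds the per-dealer contributions: private values are summed in Fr (`Fr_sum_vector`) while the public parts are summed in G2 (`E2_sum_vector_to_affine`). This is sound because exponentiation turns sums of scalars into products of group elements. The toy sketch below checks that homomorphism in a small group mod p standing in for G2.

```go
package main

import (
	"fmt"
	"math/big"
)

func main() {
	p := big.NewInt(467) // toy group modulus; the real code works in G2
	q := big.NewInt(233) // order of the subgroup generated by g
	g := big.NewInt(4)

	// Per-dealer contributions c_k (secrets on the public-key side;
	// received shares are summed the same way on the private side).
	contribs := []*big.Int{big.NewInt(11), big.NewInt(42), big.NewInt(7)}

	sum := big.NewInt(0)  // scalar side: sum of contributions mod q
	prod := big.NewInt(1) // group side: product of g^{c_k} mod p
	for _, c := range contribs {
		sum.Add(sum, c)
		sum.Mod(sum, q)
		prod.Mul(prod, new(big.Int).Exp(g, c, p))
		prod.Mod(prod, p)
	}
	// g^{c1+c2+...} == g^{c1} * g^{c2} * ... is what makes adding
	// per-dealer keys sound.
	fmt.Println(new(big.Int).Exp(g, sum, p).Cmp(prod) == 0) // true
}
```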
-} - -// Testing JointFeldman by simulating a network of n participants -func testJointFeldman(t *testing.T) { - log.SetLevel(log.ErrorLevel) - - n := 4 - var threshold int - // happy path, test multiple values of thresold - for threshold = MinimumThreshold; threshold < n; threshold++ { - t.Run(fmt.Sprintf("JointFeldman_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { - dkgCommonTest(t, jointFeldman, n, threshold, happyPath) - }) - } - - // unhappy path, with focus on the optimal threshold value - n = 5 - threshold = optimalThreshold(n) - // unhappy path, with invalid shares - t.Run(fmt.Sprintf("JointFeldman_InvalidShares_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { - dkgCommonTest(t, jointFeldman, n, threshold, invalidShares) - }) - // unhappy path, with invalid vector - t.Run(fmt.Sprintf("JointFeldman_InvalidVector_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { - dkgCommonTest(t, jointFeldman, n, threshold, invalidVector) - }) - // unhappy path, with invalid complaints - t.Run(fmt.Sprintf("JointFeldman_InvalidComplaints_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { - dkgCommonTest(t, jointFeldman, n, threshold, invalidComplaint) - }) - // unhappy path, with invalid complaint answers - t.Run(fmt.Sprintf("JointFeldman_InvalidComplaintAnswers_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { - dkgCommonTest(t, jointFeldman, n, threshold, invalidComplaintAnswer) - }) - // unhappy path, with duplicated messages (all types) - t.Run(fmt.Sprintf("JointFeldman_DuplicatedMessages_(n,t)=(%d,%d)", n, threshold), func(t *testing.T) { - dkgCommonTest(t, jointFeldman, n, threshold, duplicatedMessages) - }) -} - -// Supported Key Generation protocols -const ( - feldmanVSS = iota - feldmanVSSQual - jointFeldman -) - -func newDKG(dkg int, size int, threshold int, myIndex int, - processor DKGProcessor, dealerIndex int) (DKGState, error) { - switch dkg { - case feldmanVSS: - return NewFeldmanVSS(size, threshold, myIndex, processor, dealerIndex) - case feldmanVSSQual: - return NewFeldmanVSSQual(size, threshold, myIndex, processor, dealerIndex) - case jointFeldman: - return NewJointFeldman(size, threshold, myIndex, processor) - default: - return nil, fmt.Errorf("non supported protocol") - } -} - -func dkgCommonTest(t *testing.T, dkg int, n int, threshold int, test testCase) { - gt = t - log.Info("DKG protocol set up") - - // create the participant channels - chans := make([]chan *message, n) - lateChansTimeout1 := make([]chan *message, n) - lateChansTimeout2 := make([]chan *message, n) - for i := 0; i < n; i++ { - chans[i] = make(chan *message, 5*n) - lateChansTimeout1[i] = make(chan *message, 5*n) - lateChansTimeout2[i] = make(chan *message, 5*n) - } - - // number of dealers in the protocol - var dealers int - if dkg == jointFeldman { - dealers = n - } else { - dealers = 1 - } - - // create n processors for all participants - processors := make([]testDKGProcessor, 0, n) - for current := 0; current < n; current++ { - list := make([]bool, dealers) - processors = append(processors, testDKGProcessor{ - current: current, - chans: chans, - lateChansTimeout1: lateChansTimeout1, - lateChansTimeout2: lateChansTimeout2, - protocol: dkgType, - malicious: honest, - disqualified: list, - }) - } - - // Update processors depending on the test - // - // r1 and r2 is the number of malicious participants, each group with a slight diffrent behavior. - // - r1 participants of indices 0 to r1-1 behave maliciously and will get disqualified by honest participants. 
-	//  - r2 participants of indices r1 to r1+r2-1 will behave maliciously at first but will recover and won't be
-	//    disqualified by honest participants. The r2 participants may or may not obtain correct protocol results.
-	var r1, r2 int
-	// h is the index of the first honest participant. All participants with indices greater than or equal to h are honest.
-	// Checking the final protocol results is done for honest participants only.
-	// Whether the r2 participants belong to the honest participants or not depends on the malicious behavior (detailed below).
-	var h int
-
-	switch test {
-	case happyPath:
-		// r1 = r2 = 0
-
-	case invalidShares:
-		r1 = mrand.Intn(dealers + 1)      // dealers with invalid shares and will get disqualified
-		r2 = mrand.Intn(dealers - r1 + 1) // dealers with invalid shares but will recover
-		h = r1
-
-		var i int
-		for i = 0; i < r1; i++ {
-			processors[i].malicious = manyInvalidShares
-		}
-		for ; i < r1+r2; i++ {
-			processors[i].malicious = fewInvalidShares
-		}
-		t.Logf("%d participants will be disqualified, %d other participants will recover\n", r1, r2)
-
-	case invalidVector:
-		r1 = 1 + mrand.Intn(dealers) // dealers with invalid vector and will get disqualified
-		h = r1
-
-		// in this case r2 = 0
-		for i := 0; i < r1; i++ {
-			processors[i].malicious = invalidVectorBroadcast
-		}
-		t.Logf("%d participants will be disqualified\n", r1)
-
-	case invalidComplaint:
-		r1 = 1 + mrand.Intn(dealers-1) // participants with invalid complaints and will get disqualified.
-		// r1 >= 1 to have at least one malicious dealer, and r1 < dealers to keep at least one honest dealer.
-	if (dkg == jointFeldman && (r1 > threshold || (n-r1) <= threshold)) ||
-		(dkg == feldmanVSSQual && r1 == 1) { // case of a single dealer
-		t.Logf("dkg failed, there are %d disqualified participants\n", r1)
-		// DKG failed, check for final errors
-		for i := r1; i < n; i++ {
-			err := processors[i].finalError
-			assert.Error(t, err)
-			assert.True(t, IsDKGFailureError(err))
-		}
-	} else {
-		t.Logf("dkg succeeded, there are %d disqualified participants\n", r1)
-		// DKG has succeeded, check for final errors
-		for i := h; i < n; i++ {
-			assert.NoError(t, processors[i].finalError)
-		}
-		// DKG has succeeded, check the final keys
-		for i := h; i < n; i++ {
-			assert.True(t, processors[h].pk.Equals(processors[i].pk),
-				"2 group public keys are mismatching")
-		}
-	}
-}
-
-// time after which a silent channel causes switching to the next dkg phase
-const phaseSwitchTimeout = 200 * time.Millisecond
-
-// This is a testing function
-// It simulates processing incoming messages by a participant
-// it assumes proc.dkg is already running
-func dkgRunChan(proc *testDKGProcessor,
-	sync *sync.WaitGroup, t *testing.T, phase int) {
-	for {
-		select {
-		// if a message is received, handle it
-		case newMsg := <-proc.chans[proc.current]:
-			proc.startSync.Wait() // avoids reading a message when the receiving dkg instance
-			// hasn't started yet.
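`dkgRunChan` above advances the protocol with a select over the message channel and `time.After`: the timer is re-armed on every loop iteration, so the timeout only fires once the channel has been silent for `phaseSwitchTimeout`. A minimal standalone sketch of that pattern (illustrative only; the names here are made up):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	msgs := make(chan int, 8)
	go func() {
		for i := 0; i < 3; i++ {
			msgs <- i
			time.Sleep(50 * time.Millisecond)
		}
	}()

	const quietPeriod = 200 * time.Millisecond
	for {
		select {
		case m := <-msgs:
			fmt.Println("handled message", m)
		case <-time.After(quietPeriod): // re-armed each iteration, fires only after silence
			fmt.Println("channel quiet, switching to next phase")
			return
		}
	}
}
```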
-			if newMsg.channel == private {
-				err := proc.dkg.HandlePrivateMsg(newMsg.orig, newMsg.data)
-				require.Nil(t, err)
-			} else {
-				err := proc.dkg.HandleBroadcastMsg(newMsg.orig, newMsg.data)
-				require.Nil(t, err)
-			}
-		// if no message is received by the channel, call the DKG timeout
-		case <-time.After(phaseSwitchTimeout):
-			proc.startSync.Wait() // avoids racing when starting isn't over yet
-			switch phase {
-			case 0:
-				log.Infof("%d shares phase ended\n", proc.current)
-				err := proc.dkg.NextTimeout()
-				require.Nil(t, err)
-			case 1:
-				log.Infof("%d complaints phase ended\n", proc.current)
-				err := proc.dkg.NextTimeout()
-				require.Nil(t, err)
-			case 2:
-				log.Infof("%d dkg ended\n", proc.current)
-				_, pk, _, err := proc.dkg.End()
-				proc.finalError = err
-				proc.pk = pk
-			}
-			sync.Done()
-			return
-		}
-	}
-}
-
-// post processing required for some edge case tests
-func timeoutPostProcess(processors []testDKGProcessor, t *testing.T, phase int) {
-	switch phase {
-	case 1:
-		for i := 0; i < len(processors); i++ {
-			go func(i int) {
-				for len(processors[0].lateChansTimeout1[i]) != 0 {
-					// to test timed-out messages, late messages are copied to the main channels
-					msg := <-processors[0].lateChansTimeout1[i]
-					processors[0].chans[i] <- msg
-				}
-			}(i)
-		}
-	case 2:
-		for i := 0; i < len(processors); i++ {
-			go func(i int) {
-				for len(processors[0].lateChansTimeout2[i]) != 0 {
-					// to test timed-out messages, late messages are copied to the main channels
-					msg := <-processors[0].lateChansTimeout2[i]
-					processors[0].chans[i] <- msg
-				}
-			}(i)
-		}
-	}
-}
-
-// implements DKGProcessor interface
-type testDKGProcessor struct {
-	// instance of DKG
-	dkg DKGState
-	// index of the current participant in the protocol
-	current int
-	// group public key, output of DKG
-	pk PublicKey
-	// final disqualified list
-	disqualified []bool
-	// final output error of the DKG
-	finalError error
-	// type of malicious behavior
-	malicious behavior
-	// start DKG syncer
-	startSync sync.WaitGroup
-
-	// main message channels
-	chans []chan *message
-	// extra channels for late messages with regard to the first and second timeouts
-	lateChansTimeout1 []chan *message
-	lateChansTimeout2 []chan *message
-	// type of the protocol
-	protocol int
-
-	// only used when testing the threshold signature stateful api
-	ts   *blsThresholdSignatureParticipant
-	keys *statelessKeys
-}
-
-const (
-	dkgType int = iota
-	tsType
-)
-
-const (
-	broadcast int = iota
-	private
-)
-
-type message struct {
-	orig     int
-	protocol int
-	channel  int
-	data     []byte
-}
-
-func (proc *testDKGProcessor) Disqualify(participant int, logInfo string) {
-	gt.Logf("%d disqualifies %d, %s\n", proc.current, participant, logInfo)
-	proc.disqualified[participant] = true
-}
-
-func (proc *testDKGProcessor) FlagMisbehavior(participant int, logInfo string) {
-	gt.Logf("%d flags a misbehavior from %d: %s", proc.current, participant, logInfo)
-}
-
-// This is a testing function
-// it simulates sending a message from one participant to another
-func (proc *testDKGProcessor) PrivateSend(dest int, data []byte) {
-	go func() {
-		log.Infof("%d sending to %d", proc.current, dest)
-		if proc.malicious == fewInvalidShares || proc.malicious == manyInvalidShares ||
-			proc.malicious == invalidSharesComplainTrigger || proc.malicious == invalidComplaintAnswerBroadcast ||
-			proc.malicious == duplicatedSendAndBroadcast {
-			proc.invalidShareSend(dest, data)
-			return
-		}
-		proc.honestSend(dest, data)
-	}()
-}
-
-// This is a testing function
-// it simulates sending an
honest message from one participant to another -func (proc *testDKGProcessor) honestSend(dest int, data []byte) { - gt.Logf("%d honestly sending to %d:\n%x\n", proc.current, dest, data) - newMsg := &message{proc.current, proc.protocol, private, data} - proc.chans[dest] <- newMsg -} - -// This is a testing function -// it simulates sending a malicious message from one participant to another -// This function simulates the behavior of a malicious participant. -func (proc *testDKGProcessor) invalidShareSend(dest int, data []byte) { - - // check the behavior - var recipients int // number of recipients to send invalid shares to - switch proc.malicious { - case manyInvalidShares: - recipients = proc.dkg.Threshold() + 1 // t < recipients <= n - case fewInvalidShares: - recipients = proc.dkg.Threshold() // 0 <= recipients <= t - case invalidSharesComplainTrigger: - recipients = proc.current // equal to r1+r2, which causes all r1+r2 to complain - case invalidComplaintAnswerBroadcast: - recipients = 0 // treat this case separately as the complaint trigger is the participant n-1 - case duplicatedSendAndBroadcast: - proc.honestSend(dest, data) - proc.honestSend(dest, data) - return - default: - panic("invalid share send not supported") - } - - // copy of data - newData := make([]byte, len(data)) - copy(newData, data) - - newMsg := &message{proc.current, proc.protocol, private, newData} - originalMsg := &message{proc.current, proc.protocol, private, data} - - // check destination - if (dest < recipients) || (proc.current < recipients && dest < recipients+1) || - (proc.malicious == invalidComplaintAnswerBroadcast && dest == proc.dkg.Size()-1) { - // choose a random reason for an invalid share - coin := mrand.Intn(7) - gt.Logf("%d maliciously sending to %d, coin is %d\n", proc.current, dest, coin) - switch coin { - case 0: - // value doesn't match the verification vector - newMsg.data[8]++ - proc.chans[dest] <- newMsg - case 1: - // empty message - newMsg.data = newMsg.data[:0] - proc.chans[dest] <- newMsg - case 2: - // valid message length but invalid share length - newMsg.data = newMsg.data[:1] - proc.chans[dest] <- newMsg - case 3: - // invalid value - for i := 0; i < len(newMsg.data); i++ { - newMsg.data[i] = 0xFF - } - proc.chans[dest] <- newMsg - case 4: - // do not send the share at all - return - case 5: - // wrong header: will cause a complaint - newMsg.data[0] = byte(feldmanVSSVerifVec) - proc.chans[dest] <- newMsg - case 6: - // message will be sent after the shares timeout and will be considered late - // by the receiver. All late messages go into a separate channel and will be sent to - // the main channel after the shares timeout. 
-			proc.lateChansTimeout1[dest] <- newMsg
-			return
-		}
-
-	} else {
-		gt.Logf("%d to %d: turns out to be an honest send\n%x\n", proc.current, dest, data)
-	}
-	// honest send case: this is the only message sent
-	// malicious send case: this is a second correct send, to test the second message gets ignored
-	// by the receiver (sender has been tagged malicious after the first send)
-	proc.chans[dest] <- originalMsg
-}
-
-// This is a testing function
-// it simulates broadcasting a message from one participant to all participants
-func (proc *testDKGProcessor) Broadcast(data []byte) {
-	go func() {
-		log.Infof("%d Broadcasting:", proc.current)
-
-		if data[0] == byte(feldmanVSSVerifVec) && proc.malicious == invalidVectorBroadcast {
-			proc.invalidVectorBroadcast(data)
-		} else if data[0] == byte(feldmanVSSComplaint) &&
-			(proc.malicious == invalidComplaintBroadcast || proc.malicious == timeoutedComplaintBroadcast) {
-			proc.invalidComplaintBroadcast(data)
-		} else if data[0] == byte(feldmanVSSComplaintAnswer) && proc.malicious == invalidComplaintAnswerBroadcast {
-			proc.invalidComplaintAnswerBroadcast(data)
-		} else if proc.malicious == duplicatedSendAndBroadcast ||
-			(data[0] == byte(feldmanVSSComplaintAnswer) && proc.malicious == invalidSharesComplainTrigger) {
-			// the complaint trigger also sends duplicated complaint answers
-			proc.honestBroadcast(data)
-			proc.honestBroadcast(data)
-		} else {
-			proc.honestBroadcast(data)
-		}
-	}()
-}
-
-func (proc *testDKGProcessor) honestBroadcast(data []byte) {
-	gt.Logf("%d honestly broadcasting:\n%x\n", proc.current, data)
-	newMsg := &message{proc.current, proc.protocol, broadcast, data}
-	for i := 0; i < len(proc.chans); i++ {
-		if i != proc.current {
-			proc.chans[i] <- newMsg
-		}
-	}
-}
-
-func (proc *testDKGProcessor) invalidVectorBroadcast(data []byte) {
-	newMsg := &message{proc.current, proc.protocol, broadcast, data}
-
-	// choose a random reason for an invalid vector
-	coin := mrand.Intn(5)
-	gt.Logf("%d malicious vector broadcast, coin is %d\n", proc.current, coin)
-	switch coin {
-	case 0:
-		// invalid point serialization
-		newMsg.data[1] = 0xFF
-	case 1:
-		// invalid length
-		newMsg.data = newMsg.data[:5]
-	case 2:
-		// do not send the vector at all
-		return
-	case 3:
-		// wrong header
-		newMsg.data[0] = byte(feldmanVSSShare)
-	case 4:
-		// send the vector after the first timeout, equivalent to not sending at all
-		// as the vector should be ignored.
-		for i := 0; i < proc.dkg.Size(); i++ {
-			if i != proc.current {
-				proc.lateChansTimeout1[i] <- newMsg
-			}
-		}
-		return
-	}
-	gt.Logf("%x\n", newMsg.data)
-	for i := 0; i < proc.dkg.Size(); i++ {
-		if i != proc.current {
-			proc.chans[i] <- newMsg
-		}
-	}
-}
-
-func (proc *testDKGProcessor) invalidComplaintBroadcast(data []byte) {
-	newMsg := &message{proc.current, proc.protocol, broadcast, data}
-
-	if proc.malicious == invalidComplaintBroadcast {
-
-		// choose a random reason for an invalid complaint
-		coin := mrand.Intn(2)
-		gt.Logf("%d malicious complaint broadcast, coin is %d\n", proc.current, coin)
-		switch coin {
-		case 0:
-			// invalid complainee
-			newMsg.data[1] = byte(proc.dkg.Size() + 1)
-		case 1:
-			// invalid length
-			newMsg.data = make([]byte, complaintSize+5)
-			copy(newMsg.data, data)
-		}
-		gt.Logf("%x\n", newMsg.data)
-		for i := 0; i < len(proc.chans); i++ {
-			if i != proc.current {
-				proc.chans[i] <- newMsg
-			}
-		}
-	} else if proc.malicious == timeoutedComplaintBroadcast {
-		gt.Logf("%d timeouted complaint broadcast\n", proc.current)
-		// send the complaint after the second timeout, equivalent to not sending at all
-		// as the complaint should be ignored.
-		for i := 0; i < len(proc.chans); i++ {
-			if i != proc.current {
-				proc.lateChansTimeout2[i] <- newMsg
-			}
-		}
-		return
-	}
-}
-
-func (proc *testDKGProcessor) invalidComplaintAnswerBroadcast(data []byte) {
-	newMsg := &message{proc.current, proc.protocol, broadcast, data}
-
-	// choose a random reason for an invalid complaint answer
-	coin := mrand.Intn(3)
-	gt.Logf("%d malicious complaint answer broadcast, coin is %d\n", proc.current, coin)
-	switch coin {
-	case 0:
-		// invalid complainee
-		newMsg.data[1] = byte(proc.dkg.Size() + 1)
-	case 1:
-		// invalid length
-		newMsg.data = make([]byte, complaintAnswerSize+5)
-		copy(newMsg.data, data)
-	case 2:
-		// no answer at all
-		return
-	}
-	//gt.Logf("%x\n", newMsg.data)
-	for i := 0; i < len(proc.chans); i++ {
-		if i != proc.current {
-			proc.chans[i] <- newMsg
-		}
-	}
-}
-
-// implements a dummy DKGProcessor
-type dummyTestDKGProcessor struct {
-}
-
-func (proc dummyTestDKGProcessor) PrivateSend(int, []byte)     {}
-func (proc dummyTestDKGProcessor) Broadcast([]byte)            {}
-func (proc dummyTestDKGProcessor) Disqualify(int, string)      {}
-func (proc dummyTestDKGProcessor) FlagMisbehavior(int, string) {}
-
-func TestDKGErrorTypes(t *testing.T) {
-	t.Run("dkgFailureError sanity", func(t *testing.T) {
-		failureError := dkgFailureErrorf("some error")
-		invInpError := invalidInputsErrorf("")
-		otherError := fmt.Errorf("some error")
-		assert.True(t, IsDKGFailureError(failureError))
-		assert.False(t, IsDKGFailureError(otherError))
-		assert.False(t, IsDKGFailureError(invInpError))
-		assert.False(t, IsDKGFailureError(nil))
-		assert.False(t, IsInvalidInputsError(failureError))
-	})
-
-	t.Run("dkgInvalidStateTransitionError sanity", func(t *testing.T) {
-		failureError := dkgInvalidStateTransitionErrorf("some error")
-		invInpError := invalidInputsErrorf("")
-		otherError := fmt.Errorf("some error")
-		assert.True(t, IsDKGInvalidStateTransitionError(failureError))
-		assert.False(t, IsInvalidInputsError(failureError))
-		assert.False(t, IsDKGInvalidStateTransitionError(invInpError))
-		assert.False(t, IsDKGInvalidStateTransitionError(otherError))
-		assert.False(t, IsDKGInvalidStateTransitionError(nil))
-	})
-}
-
-func TestDKGTransitionErrors(t *testing.T) {
-	n := 5
-	threshold := 3
-	myIndex := 0
-	dealer := 1
-	seed := make([]byte, KeyGenSeedMinLen)
-
-	t.Run("feldman VSS", func(t *testing.T) {
-		state, err
:= NewFeldmanVSS(n, threshold, myIndex, dummyTestDKGProcessor{}, dealer) - require.NoError(t, err) - // calls before start - err = state.ForceDisqualify(1) - assert.True(t, IsDKGInvalidStateTransitionError(err)) - err = state.HandlePrivateMsg(1, []byte{}) - assert.True(t, IsDKGInvalidStateTransitionError(err)) - err = state.HandleBroadcastMsg(1, []byte{}) - assert.True(t, IsDKGInvalidStateTransitionError(err)) - _, _, _, err = state.End() - assert.True(t, IsDKGInvalidStateTransitionError(err)) - }) - - t.Run("Feldman VSS Qualif and joint-Feldman ", func(t *testing.T) { - stateFVSSQ, err := NewFeldmanVSSQual(n, threshold, myIndex, dummyTestDKGProcessor{}, dealer) - require.NoError(t, err) - stateJF, err := NewJointFeldman(n, threshold, myIndex, dummyTestDKGProcessor{}) - require.NoError(t, err) - - for _, state := range []DKGState{stateFVSSQ, stateJF} { - // calls before start - err = state.ForceDisqualify(1) - assert.True(t, IsDKGInvalidStateTransitionError(err)) - err = state.HandlePrivateMsg(1, []byte{}) - assert.True(t, IsDKGInvalidStateTransitionError(err)) - err = state.HandleBroadcastMsg(1, []byte{}) - assert.True(t, IsDKGInvalidStateTransitionError(err)) - _, _, _, err = state.End() - assert.True(t, IsDKGInvalidStateTransitionError(err)) - err = state.NextTimeout() - assert.True(t, IsDKGInvalidStateTransitionError(err)) - // after start - err = state.Start(seed) - require.NoError(t, err) - _, _, _, err = state.End() - assert.True(t, IsDKGInvalidStateTransitionError(err)) - // after first timeout - err = state.NextTimeout() - require.NoError(t, err) - err = state.Start(seed) - assert.True(t, IsDKGInvalidStateTransitionError(err)) - _, _, _, err = state.End() - assert.True(t, IsDKGInvalidStateTransitionError(err)) - // after second timeout - err = state.NextTimeout() - require.NoError(t, err) - err = state.Start(seed) - assert.True(t, IsDKGInvalidStateTransitionError(err)) - err = state.NextTimeout() - assert.True(t, IsDKGInvalidStateTransitionError(err)) - // after end - _, _, _, err = state.End() - require.True(t, IsDKGFailureError(err)) - err = state.NextTimeout() - assert.True(t, IsDKGInvalidStateTransitionError(err)) - } - }) -} diff --git a/crypto/ecdsa.go b/crypto/ecdsa.go deleted file mode 100644 index b09d3d5922f..00000000000 --- a/crypto/ecdsa.go +++ /dev/null @@ -1,461 +0,0 @@ -package crypto - -// Elliptic Curve Digital Signature Algorithm is implemented as -// defined in FIPS 186-4 (although the hash functions implemented in this package are SHA2 and SHA3). - -// Most of the implementation is Go based and is not optimized for performance. - -// This implementation does not include any security against side-channel attacks. 
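For readers tracking the migration, the signing API deleted here is unchanged in the replacement module that the go.mod deprecation notice below points to. A minimal sign/verify sketch mirroring the deleted tests (the `github.com/onflow/crypto` import path is the successor module; adjust if pinning an older flow-go/crypto version):

```go
package main

import (
	crand "crypto/rand"
	"fmt"

	"github.com/onflow/crypto" // successor of github.com/onflow/flow-go/crypto
	"github.com/onflow/crypto/hash"
)

func main() {
	// the seed length must be in [KeyGenSeedMinLen, KeyGenSeedMaxLen]
	seed := make([]byte, crypto.KeyGenSeedMinLen)
	if _, err := crand.Read(seed); err != nil {
		panic(err)
	}

	sk, err := crypto.GeneratePrivateKey(crypto.ECDSAP256, seed)
	if err != nil {
		panic(err)
	}

	hasher := hash.NewSHA3_256()
	msg := []byte("some message")
	sig, err := sk.Sign(msg, hasher)
	if err != nil {
		panic(err)
	}

	ok, err := sk.PublicKey().Verify(sig, msg, hasher)
	fmt.Println(ok, err) // true <nil>
}
```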
-
-import (
-	"crypto/ecdsa"
-	"crypto/elliptic"
-	"crypto/rand"
-	"crypto/sha256"
-	"fmt"
-	"math/big"
-
-	"github.com/btcsuite/btcd/btcec/v2"
-	"golang.org/x/crypto/hkdf"
-
-	"github.com/onflow/flow-go/crypto/hash"
-)
-
-const (
-	// NIST P256
-	SignatureLenECDSAP256 = 64
-	PrKeyLenECDSAP256     = 32
-	// PubKeyLenECDSAP256 is the size of uncompressed points on P256
-	PubKeyLenECDSAP256 = 64
-
-	// SECG secp256k1
-	SignatureLenECDSASecp256k1 = 64
-	PrKeyLenECDSASecp256k1     = 32
-	// PubKeyLenECDSASecp256k1 is the size of uncompressed points on secp256k1
-	PubKeyLenECDSASecp256k1 = 64
-)
-
-// ecdsaAlgo embeds SignAlgo
-type ecdsaAlgo struct {
-	// elliptic curve
-	curve elliptic.Curve
-	// the signing algo and parameters
-	algo SigningAlgorithm
-}
-
-// ECDSA contexts for each supported curve
-//
-// NIST P-256 curve
-var p256Instance *ecdsaAlgo
-
-// SECG secp256k1 curve https://www.secg.org/sec2-v2.pdf
-var secp256k1Instance *ecdsaAlgo
-
-func bitsToBytes(bits int) int {
-	return (bits + 7) >> 3
-}
-
-// signHash returns the signature of the hash using the private key
-// the signature is the concatenation bytes(r)||bytes(s)
-// where r and s are padded to the curve order size
-func (sk *prKeyECDSA) signHash(h hash.Hash) (Signature, error) {
-	r, s, err := ecdsa.Sign(rand.Reader, sk.goPrKey, h)
-	if err != nil {
-		return nil, fmt.Errorf("ECDSA Sign failed: %w", err)
-	}
-	rBytes := r.Bytes()
-	sBytes := s.Bytes()
-	Nlen := bitsToBytes((sk.alg.curve.Params().N).BitLen())
-	signature := make([]byte, 2*Nlen)
-	// pad the signature with zeroes
-	copy(signature[Nlen-len(rBytes):], rBytes)
-	copy(signature[2*Nlen-len(sBytes):], sBytes)
-	return signature, nil
-}
-
-// Sign signs an array of bytes
-//
-// The resulting signature is the concatenation bytes(r)||bytes(s),
-// where r and s are padded to the curve order size.
-// The private key is read only while sha2 and sha3 hashers are
-// modified temporarily.
-//
-// The function returns:
-//   - (nil, nilHasherError) if a hasher is nil
-//   - (nil, invalidHasherSizeError) when the hasher's output size is less than the curve order (currently 32 bytes).
-//   - (nil, error) if an unexpected error occurs
-//   - (signature, nil) otherwise
-func (sk *prKeyECDSA) Sign(data []byte, alg hash.Hasher) (Signature, error) {
-	if alg == nil {
-		return nil, nilHasherError
-	}
-	// check hasher's size is at least the curve order in bytes
-	Nlen := bitsToBytes((sk.alg.curve.Params().N).BitLen())
-	if alg.Size() < Nlen {
-		return nil, invalidHasherSizeErrorf(
-			"hasher's size should be at least %d, got %d", Nlen, alg.Size())
-	}
-
-	h := alg.ComputeHash(data)
-	return sk.signHash(h)
-}
-
-// verifyHash implements ECDSA signature verification
-func (pk *pubKeyECDSA) verifyHash(sig Signature, h hash.Hash) (bool, error) {
-	Nlen := bitsToBytes((pk.alg.curve.Params().N).BitLen())
-
-	if len(sig) != 2*Nlen {
-		return false, nil
-	}
-
-	var r big.Int
-	var s big.Int
-	r.SetBytes(sig[:Nlen])
-	s.SetBytes(sig[Nlen:])
-	return ecdsa.Verify(pk.goPubKey, h, &r, &s), nil
-}
-
-// Verify verifies a signature of an input data under the public key.
-//
-// If the input signature slice has an invalid length or fails to deserialize into valid
-// scalars, the function returns false without an error.
-//
-// Public keys are read only, sha2 and sha3 hashers are
-// modified temporarily.
-//
-// The function returns:
-//   - (false, nilHasherError) if a hasher is nil
-//   - (false, invalidHasherSizeError) when the hasher's output size is less than the curve order (currently 32 bytes).
-// - (false, error) if an unexpected error occurs -// - (validity, nil) otherwise -func (pk *pubKeyECDSA) Verify(sig Signature, data []byte, alg hash.Hasher) (bool, error) { - if alg == nil { - return false, nilHasherError - } - - // check hasher's size is at least the curve order in bytes - Nlen := bitsToBytes((pk.alg.curve.Params().N).BitLen()) - if alg.Size() < Nlen { - return false, invalidHasherSizeErrorf( - "hasher's size should be at least %d, got %d", Nlen, alg.Size()) - } - - h := alg.ComputeHash(data) - return pk.verifyHash(sig, h) -} - -// signatureFormatCheck verifies the format of a serialized signature, -// regardless of messages or public keys. -// If FormatCheck returns false then the input is not a valid ECDSA -// signature and will fail a verification against any message and public key. -func (a *ecdsaAlgo) signatureFormatCheck(sig Signature) bool { - N := a.curve.Params().N - Nlen := bitsToBytes(N.BitLen()) - - if len(sig) != 2*Nlen { - return false - } - - var r big.Int - var s big.Int - r.SetBytes(sig[:Nlen]) - s.SetBytes(sig[Nlen:]) - - if r.Sign() == 0 || s.Sign() == 0 { - return false - } - - if r.Cmp(N) >= 0 || s.Cmp(N) >= 0 { - return false - } - - // We could also check whether r and r+N are quadratic residues modulo (p) - // using Euler's criterion. - return true -} - -var one = new(big.Int).SetInt64(1) - -// goecdsaGenerateKey generates a public and private key pair -// for the crypto/ecdsa library using the input seed -func goecdsaGenerateKey(c elliptic.Curve, seed []byte) *ecdsa.PrivateKey { - k := new(big.Int).SetBytes(seed) - n := new(big.Int).Sub(c.Params().N, one) - k.Mod(k, n) - k.Add(k, one) - - priv := new(ecdsa.PrivateKey) - priv.PublicKey.Curve = c - priv.D = k - // public key is not computed - return priv -} - -// generatePrivateKey generates a private key for ECDSA -// deterministically using the input seed. -// -// It is recommended to use a secure crypto RNG to generate the seed. -// The seed must have enough entropy. 
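`generatePrivateKey` below expands the seed through HKDF into Nlen + 16 bytes before reducing it modulo the group order; the extra 128 bits make the reduction bias negligible. A compact standalone sketch of the same derivation idea (the P-256 order is hard-coded here for illustration; the real code takes it from the curve parameters):

```go
package main

import (
	"crypto/sha256"
	"fmt"
	"io"
	"math/big"

	"golang.org/x/crypto/hkdf"
)

func main() {
	seed := []byte("an example seed with enough entropy, >= 32 bytes")

	// P-256 group order n (hard-coded for this sketch)
	n, _ := new(big.Int).SetString(
		"ffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc632551", 16)

	// expand the seed into order-size + 16 extra bytes (128 extra bits)
	okm := make([]byte, 32+16)
	r := hkdf.New(sha256.New, seed, nil, nil) // empty salt and info, as in the deleted code
	if _, err := io.ReadFull(r, okm); err != nil {
		panic(err)
	}

	// reduce into [1, n-1], mirroring goecdsaGenerateKey
	k := new(big.Int).SetBytes(okm)
	k.Mod(k, new(big.Int).Sub(n, big.NewInt(1)))
	k.Add(k, big.NewInt(1))
	fmt.Printf("derived scalar: %064x\n", k)
}
```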
-func (a *ecdsaAlgo) generatePrivateKey(seed []byte) (PrivateKey, error) { - if len(seed) < KeyGenSeedMinLen || len(seed) > KeyGenSeedMaxLen { - return nil, invalidInputsErrorf("seed byte length should be between %d and %d", - KeyGenSeedMinLen, KeyGenSeedMaxLen) - } - - // use HKDF to extract the seed entropy and expand it into key bytes - - // use SHA2-256 as the building block H in HKDF - hashFunction := sha256.New - salt := []byte("") // HKDF salt - info := []byte("") // HKDF info - // use extra 128 bits to reduce the modular reduction bias - Nlen := bitsToBytes((a.curve.Params().N).BitLen()) - okmLength := Nlen + (securityBits / 8) - - // instantiate HKDF and extract okm - reader := hkdf.New(hashFunction, seed, salt, info) - okm := make([]byte, okmLength) - n, err := reader.Read(okm) - if err != nil || n != okmLength { - return nil, fmt.Errorf("key generation failed because of the HKDF reader, %d bytes were read: %w", - n, err) - } - defer overwrite(okm) // overwrite okm - - sk := goecdsaGenerateKey(a.curve, okm) - return &prKeyECDSA{ - alg: a, - goPrKey: sk, - pubKey: nil, // public key is not computed - }, nil -} - -func (a *ecdsaAlgo) rawDecodePrivateKey(der []byte) (PrivateKey, error) { - n := a.curve.Params().N - nlen := bitsToBytes(n.BitLen()) - if len(der) != nlen { - return nil, invalidInputsErrorf("input has incorrect %s key size", a.algo) - } - var d big.Int - d.SetBytes(der) - - if d.Cmp(n) >= 0 { - return nil, invalidInputsErrorf("input is not a valid %s key", a.algo) - } - - priv := ecdsa.PrivateKey{ - D: &d, - } - priv.PublicKey.Curve = a.curve - - return &prKeyECDSA{ - alg: a, - goPrKey: &priv, - pubKey: nil, // public key is not computed - }, nil -} - -func (a *ecdsaAlgo) decodePrivateKey(der []byte) (PrivateKey, error) { - return a.rawDecodePrivateKey(der) -} - -func (a *ecdsaAlgo) rawDecodePublicKey(der []byte) (PublicKey, error) { - p := (a.curve.Params().P) - plen := bitsToBytes(p.BitLen()) - if len(der) != 2*plen { - return nil, invalidInputsErrorf("input has incorrect %s key size, got %d, expects %d", - a.algo, len(der), 2*plen) - } - var x, y big.Int - x.SetBytes(der[:plen]) - y.SetBytes(der[plen:]) - - // all the curves supported for now have a cofactor equal to 1, - // so that IsOnCurve guarantees the point is on the right subgroup. - if x.Cmp(p) >= 0 || y.Cmp(p) >= 0 || !a.curve.IsOnCurve(&x, &y) { - return nil, invalidInputsErrorf("input %x is not a valid %s key", der, a.algo) - } - - pk := ecdsa.PublicKey{ - Curve: a.curve, - X: &x, - Y: &y, - } - - return &pubKeyECDSA{a, &pk}, nil -} - -func (a *ecdsaAlgo) decodePublicKey(der []byte) (PublicKey, error) { - return a.rawDecodePublicKey(der) -} - -// decodePublicKeyCompressed returns a public key given the bytes of a compressed public key according to X9.62 section 4.3.6. 
this compressed representation uses an extra byte to disambiguate the sign.
-func (a *ecdsaAlgo) decodePublicKeyCompressed(pkBytes []byte) (PublicKey, error) {
-	expectedLen := bitsToBytes(a.curve.Params().BitSize) + 1
-	if len(pkBytes) != expectedLen {
-		return nil, invalidInputsErrorf(fmt.Sprintf("input length incompatible, expected %d, got %d", expectedLen, len(pkBytes)))
-	}
-	var goPubKey *ecdsa.PublicKey
-
-	if a.curve == elliptic.P256() {
-		x, y := elliptic.UnmarshalCompressed(a.curve, pkBytes)
-		if x == nil {
-			return nil, invalidInputsErrorf("Key %x can't be interpreted as %v", pkBytes, a.algo.String())
-		}
-		goPubKey = new(ecdsa.PublicKey)
-		goPubKey.Curve = a.curve
-		goPubKey.X = x
-		goPubKey.Y = y
-
-	} else if a.curve == btcec.S256() {
-		pk, err := btcec.ParsePubKey(pkBytes)
-		if err != nil {
-			return nil, invalidInputsErrorf("Key %x can't be interpreted as %v", pkBytes, a.algo.String())
-		}
-		// convert to a crypto/ecdsa key
-		goPubKey = pk.ToECDSA()
-	} else {
-		return nil, invalidInputsErrorf("the input curve is not supported")
-	}
-	return &pubKeyECDSA{a, goPubKey}, nil
-}
-
-// prKeyECDSA is the private key of ECDSA, it implements the interface PrivateKey
-type prKeyECDSA struct {
-	// the signature algo
-	alg *ecdsaAlgo
-	// ecdsa private key
-	goPrKey *ecdsa.PrivateKey
-	// public key
-	pubKey *pubKeyECDSA
-}
-
-var _ PrivateKey = (*prKeyECDSA)(nil)
-
-// Algorithm returns the algo related to the private key
-func (sk *prKeyECDSA) Algorithm() SigningAlgorithm {
-	return sk.alg.algo
-}
-
-// Size returns the length of the private key in bytes
-func (sk *prKeyECDSA) Size() int {
-	return bitsToBytes((sk.alg.curve.Params().N).BitLen())
-}
-
-// PublicKey returns the public key associated to the private key
-func (sk *prKeyECDSA) PublicKey() PublicKey {
-	// compute the public key once
-	if sk.pubKey == nil {
-		priv := sk.goPrKey
-		priv.PublicKey.X, priv.PublicKey.Y = priv.Curve.ScalarBaseMult(priv.D.Bytes())
-	}
-	sk.pubKey = &pubKeyECDSA{
-		alg:      sk.alg,
-		goPubKey: &sk.goPrKey.PublicKey,
-	}
-	return sk.pubKey
-}
-
-// given a private key (d), returns a raw encoding bytes(d) in big endian
-// padded to the private key length
-func (sk *prKeyECDSA) rawEncode() []byte {
-	skBytes := sk.goPrKey.D.Bytes()
-	Nlen := bitsToBytes((sk.alg.curve.Params().N).BitLen())
-	skEncoded := make([]byte, Nlen)
-	// pad sk with zeroes
-	copy(skEncoded[Nlen-len(skBytes):], skBytes)
-	return skEncoded
-}
-
-// Encode returns a byte representation of a private key.
-// a simple raw byte encoding in big endian is used for all curves
-func (sk *prKeyECDSA) Encode() []byte {
-	return sk.rawEncode()
-}
-
-// Equals tests the equality of two private keys
-func (sk *prKeyECDSA) Equals(other PrivateKey) bool {
-	// check the key type
-	otherECDSA, ok := other.(*prKeyECDSA)
-	if !ok {
-		return false
-	}
-	// check the curve
-	if sk.alg.curve != otherECDSA.alg.curve {
-		return false
-	}
-	return sk.goPrKey.D.Cmp(otherECDSA.goPrKey.D) == 0
-}
-
-// String returns the hex string representation of the key.
-func (sk *prKeyECDSA) String() string {
-	return fmt.Sprintf("%#x", sk.Encode())
-}
-
-// pubKeyECDSA is the public key of ECDSA, it implements PublicKey
-type pubKeyECDSA struct {
-	// the signature algo
-	alg *ecdsaAlgo
-	// public key data
-	goPubKey *ecdsa.PublicKey
-}
-
-var _ PublicKey = (*pubKeyECDSA)(nil)
-
-// Algorithm returns the algo related to the public key
-func (pk *pubKeyECDSA) Algorithm() SigningAlgorithm {
-	return pk.alg.algo
-}
-
-// Size returns the length of the public key in bytes
-func (pk *pubKeyECDSA) Size() int {
-	return 2 * bitsToBytes((pk.goPubKey.Params().P).BitLen())
-}
-
-// EncodeCompressed returns a compressed encoding according to X9.62 section 4.3.6.
-// This compressed representation uses an extra byte to disambiguate parity.
-// The expected input is a public key (x,y).
-func (pk *pubKeyECDSA) EncodeCompressed() []byte {
-	return elliptic.MarshalCompressed(pk.goPubKey.Curve, pk.goPubKey.X, pk.goPubKey.Y)
-}
-
-// given a public key (x,y), returns a raw uncompressed encoding bytes(x)||bytes(y)
-// x and y are padded to the field size
-func (pk *pubKeyECDSA) rawEncode() []byte {
-	xBytes := pk.goPubKey.X.Bytes()
-	yBytes := pk.goPubKey.Y.Bytes()
-	Plen := bitsToBytes((pk.alg.curve.Params().P).BitLen())
-	pkEncoded := make([]byte, 2*Plen)
-	// pad the public key coordinates with zeroes
-	copy(pkEncoded[Plen-len(xBytes):], xBytes)
-	copy(pkEncoded[2*Plen-len(yBytes):], yBytes)
-	return pkEncoded
-}
-
-// Encode returns a byte representation of a public key.
-// a simple uncompressed raw encoding X||Y is used for all curves
-// X and Y are the big endian byte encoding of the x and y coordinates of the public key
-func (pk *pubKeyECDSA) Encode() []byte {
-	return pk.rawEncode()
-}
-
-// Equals tests the equality of two public keys
-func (pk *pubKeyECDSA) Equals(other PublicKey) bool {
-	// check the key type
-	otherECDSA, ok := other.(*pubKeyECDSA)
-	if !ok {
-		return false
-	}
-	// check the curve
-	if pk.alg.curve != otherECDSA.alg.curve {
-		return false
-	}
-	return (pk.goPubKey.X.Cmp(otherECDSA.goPubKey.X) == 0) &&
-		(pk.goPubKey.Y.Cmp(otherECDSA.goPubKey.Y) == 0)
-}
-
-// String returns the hex string representation of the key.
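The two public-key encodings above differ only in size and form: `Encode` returns the raw 64-byte X||Y concatenation (no 0x04 prefix, unlike the common SEC1 uncompressed form), while `EncodeCompressed` returns 33 bytes, a parity byte followed by X. A round-trip sketch against the successor module (import path assumed, per the go.mod notice below):

```go
package main

import (
	crand "crypto/rand"
	"fmt"

	"github.com/onflow/crypto" // assumed successor of the deleted package
)

func main() {
	seed := make([]byte, crypto.KeyGenSeedMinLen)
	if _, err := crand.Read(seed); err != nil {
		panic(err)
	}
	sk, err := crypto.GeneratePrivateKey(crypto.ECDSAP256, seed)
	if err != nil {
		panic(err)
	}
	pk := sk.PublicKey()

	fmt.Println(len(pk.Encode()))           // 64: raw X||Y, each coordinate padded to 32 bytes
	fmt.Println(len(pk.EncodeCompressed())) // 33: parity byte followed by X

	// decompressing the compressed form recovers the same key
	decoded, err := crypto.DecodePublicKeyCompressed(crypto.ECDSAP256, pk.EncodeCompressed())
	if err != nil {
		panic(err)
	}
	fmt.Println(decoded.Equals(pk)) // true
}
```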
-func (pk *pubKeyECDSA) String() string { - return fmt.Sprintf("%#x", pk.Encode()) -} diff --git a/crypto/ecdsa_test.go b/crypto/ecdsa_test.go deleted file mode 100644 index ed005a11e07..00000000000 --- a/crypto/ecdsa_test.go +++ /dev/null @@ -1,378 +0,0 @@ -package crypto - -import ( - "encoding/hex" - "testing" - - "crypto/elliptic" - crand "crypto/rand" - "math/big" - - "github.com/btcsuite/btcd/btcec/v2" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/onflow/flow-go/crypto/hash" -) - -var ecdsaCurves = []SigningAlgorithm{ - ECDSAP256, - ECDSASecp256k1, -} -var ecdsaPrKeyLen = map[SigningAlgorithm]int{ - ECDSAP256: PrKeyLenECDSAP256, - ECDSASecp256k1: PrKeyLenECDSASecp256k1, -} -var ecdsaPubKeyLen = map[SigningAlgorithm]int{ - ECDSAP256: PubKeyLenECDSAP256, - ECDSASecp256k1: PubKeyLenECDSASecp256k1, -} -var ecdsaSigLen = map[SigningAlgorithm]int{ - ECDSAP256: SignatureLenECDSAP256, - ECDSASecp256k1: SignatureLenECDSASecp256k1, -} - -// ECDSA tests -func TestECDSA(t *testing.T) { - - for _, curve := range ecdsaCurves { - t.Logf("Testing ECDSA for curve %s", curve) - // test key generation seed limits - testKeyGenSeed(t, curve, KeyGenSeedMinLen, KeyGenSeedMaxLen) - // test consistency - halg := hash.NewSHA3_256() - testGenSignVerify(t, curve, halg) - } -} - -type dummyHasher struct{ size int } - -func newDummyHasher(size int) hash.Hasher { return &dummyHasher{size} } -func (d *dummyHasher) Algorithm() hash.HashingAlgorithm { return hash.UnknownHashingAlgorithm } -func (d *dummyHasher) Size() int { return d.size } -func (d *dummyHasher) ComputeHash([]byte) hash.Hash { return make([]byte, d.size) } -func (d *dummyHasher) Write([]byte) (int, error) { return 0, nil } -func (d *dummyHasher) SumHash() hash.Hash { return make([]byte, d.size) } -func (d *dummyHasher) Reset() {} - -func TestECDSAHasher(t *testing.T) { - - for _, curve := range ecdsaCurves { - - // generate a key pair - seed := make([]byte, KeyGenSeedMinLen) - n, err := crand.Read(seed) - require.Equal(t, n, KeyGenSeedMinLen) - require.NoError(t, err) - sk, err := GeneratePrivateKey(curve, seed) - require.NoError(t, err) - sig := make([]byte, ecdsaSigLen[curve]) - - // empty hasher - t.Run("Empty hasher", func(t *testing.T) { - _, err := sk.Sign(seed, nil) - assert.Error(t, err) - assert.True(t, IsNilHasherError(err)) - _, err = sk.PublicKey().Verify(sig, seed, nil) - assert.Error(t, err) - assert.True(t, IsNilHasherError(err)) - }) - - // hasher with large output size - t.Run("large size hasher is accepted", func(t *testing.T) { - dummy := newDummyHasher(500) - _, err := sk.Sign(seed, dummy) - assert.NoError(t, err) - _, err = sk.PublicKey().Verify(sig, seed, dummy) - assert.NoError(t, err) - }) - - // hasher with small output size - t.Run("small size hasher is rejected", func(t *testing.T) { - dummy := newDummyHasher(31) // 31 is one byte less than the supported curves' order - _, err := sk.Sign(seed, dummy) - assert.Error(t, err) - assert.True(t, IsInvalidHasherSizeError(err)) - _, err = sk.PublicKey().Verify(sig, seed, dummy) - assert.Error(t, err) - assert.True(t, IsInvalidHasherSizeError(err)) - }) - } -} - -// Signing bench -func BenchmarkECDSAP256Sign(b *testing.B) { - halg := hash.NewSHA3_256() - benchSign(b, ECDSAP256, halg) -} - -// Verifying bench -func BenchmarkECDSAP256Verify(b *testing.B) { - halg := hash.NewSHA3_256() - benchVerify(b, ECDSAP256, halg) -} - -// Signing bench -func BenchmarkECDSASecp256k1Sign(b *testing.B) { - halg := hash.NewSHA3_256() - benchSign(b, 
ECDSASecp256k1, halg) -} - -// Verifying bench -func BenchmarkECDSASecp256k1Verify(b *testing.B) { - halg := hash.NewSHA3_256() - benchVerify(b, ECDSASecp256k1, halg) -} - -// TestECDSAEncodeDecode tests encoding and decoding of ECDSA keys -func TestECDSAEncodeDecode(t *testing.T) { - for _, curve := range ecdsaCurves { - testEncodeDecode(t, curve) - } -} - -// TestECDSAEquals tests equal for ECDSA keys -func TestECDSAEquals(t *testing.T) { - for i, curve := range ecdsaCurves { - testEquals(t, curve, ecdsaCurves[i]^1) - } -} - -// TestECDSAUtils tests some utility functions -func TestECDSAUtils(t *testing.T) { - - for _, curve := range ecdsaCurves { - // generate a key pair - seed := make([]byte, KeyGenSeedMinLen) - n, err := crand.Read(seed) - require.Equal(t, n, KeyGenSeedMinLen) - require.NoError(t, err) - sk, err := GeneratePrivateKey(curve, seed) - require.NoError(t, err) - testKeysAlgorithm(t, sk, curve) - testKeySize(t, sk, ecdsaPrKeyLen[curve], ecdsaPubKeyLen[curve]) - } -} - -// TestScalarMult is a unit test of the scalar multiplication -// This is only a sanity check meant to make sure the curve implemented -// is checked against an independent test vector -func TestScalarMultP256_secp256k1(t *testing.T) { - secp256k1 := secp256k1Instance.curve - p256 := p256Instance.curve - genericMultTests := []struct { - curve elliptic.Curve - Px string - Py string - k string - Qx string - Qy string - }{ - { - secp256k1, - "858a2ea2498449acf531128892f8ee5eb6d10cfb2f7ebfa851def0e0d8428742", - "015c59492d794a4f6a3ab3046eecfc85e223d1ce8571aa99b98af6838018286e", - "6e37a39c31a05181bf77919ace790efd0bdbcaf42b5a52871fc112fceb918c95", - "fea24b9a6acdd97521f850e782ef4a24f3ef672b5cd51f824499d708bb0c744d", - "5f0b6db1a2c851cb2959fab5ed36ad377e8b53f1f43b7923f1be21b316df1ea1", - }, - { - p256, - "fa1a85f1ae436e9aa05baabe60eb83b2d7ff52e5766504fda4e18d2d25887481", - "f7cc347e1ac53f6720ffc511bfb23c2f04c764620be0baf8c44313e92d5404de", - "6e37a39c31a05181bf77919ace790efd0bdbcaf42b5a52871fc112fceb918c95", - "28a27fc352f315d5cc562cb0d97e5882b6393fd6571f7d394cc583e65b5c7ffe", - "4086d17a2d0d9dc365388c91ba2176de7acc5c152c1a8d04e14edc6edaebd772", - }, - } - - baseMultTests := []struct { - curve elliptic.Curve - k string - Qx string - Qy string - }{ - { - secp256k1, - "6e37a39c31a05181bf77919ace790efd0bdbcaf42b5a52871fc112fceb918c95", - "36f292f6c287b6e72ca8128465647c7f88730f84ab27a1e934dbd2da753930fa", - "39a09ddcf3d28fb30cc683de3fc725e095ec865c3d41aef6065044cb12b1ff61", - }, - { - p256, - "6e37a39c31a05181bf77919ace790efd0bdbcaf42b5a52871fc112fceb918c95", - "78a80dfe190a6068be8ddf05644c32d2540402ffc682442f6a9eeb96125d8681", - "3789f92cf4afabf719aaba79ecec54b27e33a188f83158f6dd15ecb231b49808", - }, - } - - t.Run("scalar mult check", func(t *testing.T) { - for _, test := range genericMultTests { - Px, _ := new(big.Int).SetString(test.Px, 16) - Py, _ := new(big.Int).SetString(test.Py, 16) - k, _ := new(big.Int).SetString(test.k, 16) - Qx, _ := new(big.Int).SetString(test.Qx, 16) - Qy, _ := new(big.Int).SetString(test.Qy, 16) - Rx, Ry := test.curve.ScalarMult(Px, Py, k.Bytes()) - assert.Equal(t, Rx.Cmp(Qx), 0) - assert.Equal(t, Ry.Cmp(Qy), 0) - } - }) - - t.Run("base scalar mult check", func(t *testing.T) { - for _, test := range baseMultTests { - k, _ := new(big.Int).SetString(test.k, 16) - Qx, _ := new(big.Int).SetString(test.Qx, 16) - Qy, _ := new(big.Int).SetString(test.Qy, 16) - // base mult - Rx, Ry := test.curve.ScalarBaseMult(k.Bytes()) - assert.Equal(t, Rx.Cmp(Qx), 0) - assert.Equal(t, Ry.Cmp(Qy), 0) - // 
generic mult with base point
-			Px := new(big.Int).Set(test.curve.Params().Gx)
-			Py := new(big.Int).Set(test.curve.Params().Gy)
-			Rx, Ry = test.curve.ScalarMult(Px, Py, k.Bytes())
-			assert.Equal(t, Rx.Cmp(Qx), 0)
-			assert.Equal(t, Ry.Cmp(Qy), 0)
-		}
-	})
-}
-
-func TestSignatureFormatCheck(t *testing.T) {
-
-	for _, curve := range ecdsaCurves {
-		t.Run("valid signature", func(t *testing.T) {
-			len := ecdsaSigLen[curve]
-			sig := Signature(make([]byte, len))
-			_, err := crand.Read(sig)
-			require.NoError(t, err)
-			sig[len/2] = 0    // force s to be less than the curve order
-			sig[len-1] |= 1   // force s to be non zero
-			sig[0] = 0        // force r to be less than the curve order
-			sig[len/2-1] |= 1 // force r to be non zero
-			valid, err := SignatureFormatCheck(curve, sig)
-			assert.Nil(t, err)
-			assert.True(t, valid)
-		})
-
-		t.Run("invalid length", func(t *testing.T) {
-			len := ecdsaSigLen[curve]
-			shortSig := Signature(make([]byte, len/2))
-			valid, err := SignatureFormatCheck(curve, shortSig)
-			assert.Nil(t, err)
-			assert.False(t, valid)
-
-			longSig := Signature(make([]byte, len*2))
-			valid, err = SignatureFormatCheck(curve, longSig)
-			assert.Nil(t, err)
-			assert.False(t, valid)
-		})
-
-		t.Run("zero values", func(t *testing.T) {
-			// signature with a zero s
-			len := ecdsaSigLen[curve]
-			sig0s := Signature(make([]byte, len))
-			_, err := crand.Read(sig0s[:len/2])
-			require.NoError(t, err)
-
-			valid, err := SignatureFormatCheck(curve, sig0s)
-			assert.Nil(t, err)
-			assert.False(t, valid)
-
-			// signature with a zero r
-			sig0r := Signature(make([]byte, len))
-			_, err = crand.Read(sig0r[len/2:])
-			require.NoError(t, err)
-
-			valid, err = SignatureFormatCheck(curve, sig0r)
-			assert.Nil(t, err)
-			assert.False(t, valid)
-		})
-
-		t.Run("large values", func(t *testing.T) {
-			len := ecdsaSigLen[curve]
-			sigLargeS := Signature(make([]byte, len))
-			_, err := crand.Read(sigLargeS[:len/2])
-			require.NoError(t, err)
-			// make sure s is larger than the curve order
-			for i := len / 2; i < len; i++ {
-				sigLargeS[i] = 0xFF
-			}
-
-			valid, err := SignatureFormatCheck(curve, sigLargeS)
-			assert.Nil(t, err)
-			assert.False(t, valid)
-
-			sigLargeR := Signature(make([]byte, len))
-			_, err = crand.Read(sigLargeR[len/2:])
-			require.NoError(t, err)
-			// make sure r is larger than the curve order
-			for i := 0; i < len/2; i++ {
-				sigLargeR[i] = 0xFF
-			}
-
-			valid, err = SignatureFormatCheck(curve, sigLargeR)
-			assert.Nil(t, err)
-			assert.False(t, valid)
-		})
-	}
-}
-
-func TestEllipticUnmarshalSecp256k1(t *testing.T) {
-
-	testVectors := []string{
-		"028b10bf56476bf7da39a3286e29df389177a2fa0fca2d73348ff78887515d8da1", // IsOnCurve for elliptic returns false
-		"03d39427f07f680d202fe8504306eb29041aceaf4b628c2c69b0ec248155443166", // odd, IsOnCurve for elliptic returns false
-		"0267d1942a6cbe4daec242ea7e01c6cdb82dadb6e7077092deb55c845bf851433e", // arith of sqrt in elliptic doesn't match secp256k1
-		"0345d45eda6d087918b041453a96303b78c478dce89a4ae9b3c933a018888c5e06", // odd, arith of sqrt in elliptic doesn't match secp256k1
-	}
-
-	for _, testVector := range testVectors {
-
-		// get the compressed bytes
-		publicBytes, err := hex.DecodeString(testVector)
-		require.NoError(t, err)
-
-		// decompress, check that those are perfectly valid Secp256k1 public keys
-		retrieved, err := DecodePublicKeyCompressed(ECDSASecp256k1, publicBytes)
-		require.NoError(t, err)
-
-		// check the compression is canonical by re-compressing to the same bytes
-		require.Equal(t, retrieved.EncodeCompressed(), publicBytes)
-
-		// check that elliptic fails at
decompressing them - x, y := elliptic.UnmarshalCompressed(btcec.S256(), publicBytes) - require.Nil(t, x) - require.Nil(t, y) - } -} - -func BenchmarkECDSADecode(b *testing.B) { - // random message - seed := make([]byte, 50) - _, _ = crand.Read(seed) - - for _, curve := range []SigningAlgorithm{ECDSASecp256k1, ECDSAP256} { - sk, _ := GeneratePrivateKey(curve, seed) - comp := sk.PublicKey().EncodeCompressed() - uncomp := sk.PublicKey().Encode() - - b.Run("compressed point on "+curve.String(), func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := DecodePublicKeyCompressed(curve, comp) - require.NoError(b, err) - } - b.StopTimer() - }) - - b.Run("uncompressed point on "+curve.String(), func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - _, err := DecodePublicKey(curve, uncomp) - require.NoError(b, err) - } - b.StopTimer() - }) - } -} diff --git a/crypto/empty.go b/crypto/empty.go new file mode 100644 index 00000000000..5871506ee7e --- /dev/null +++ b/crypto/empty.go @@ -0,0 +1 @@ +package crypto diff --git a/crypto/go.mod b/crypto/go.mod index d31f36cf023..6b81958273c 100644 --- a/crypto/go.mod +++ b/crypto/go.mod @@ -1,23 +1,4 @@ +// Deprecated: The latest supported version is v0.25.0. The module then migrated to github.com/onflow/crypto. Use the new module github.com/onflow/crypto instead. module github.com/onflow/flow-go/crypto go 1.20 - -require ( - github.com/btcsuite/btcd/btcec/v2 v2.2.1 - github.com/sirupsen/logrus v1.4.2 - github.com/stretchr/testify v1.8.0 - golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d - gonum.org/v1/gonum v0.6.1 - pgregory.net/rapid v0.4.7 -) - -require ( - github.com/davecgh/go-spew v1.1.1 // indirect - github.com/decred/dcrd/dcrec/secp256k1/v4 v4.0.1 // indirect - github.com/konsorten/go-windows-terminal-sequences v1.0.1 // indirect - github.com/kr/pretty v0.1.0 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect - golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1 // indirect - gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect -) diff --git a/crypto/go.sum b/crypto/go.sum index 820bb87a41c..e69de29bb2d 100644 --- a/crypto/go.sum +++ b/crypto/go.sum @@ -1,57 +0,0 @@ -github.com/ajstarks/svgo v0.0.0-20180226025133-644b8db467af/go.mod h1:K08gAheRH3/J6wwsYMMT4xOr94bZjxIelGM0+d/wbFw= -github.com/btcsuite/btcd/btcec/v2 v2.2.1 h1:xP60mv8fvp+0khmrN0zTdPC3cNm24rfeE6lh2R/Yv3E= -github.com/btcsuite/btcd/btcec/v2 v2.2.1/go.mod h1:9/CSmJxmuvqzX9Wh2fXMWToLOHhPd11lSPuIupwTkI8= -github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/decred/dcrd/crypto/blake256 v1.0.0/go.mod h1:sQl2p6Y26YV+ZOcSTP6thNdn47hh8kt6rqSlvmrXFAc= -github.com/decred/dcrd/dcrec/secp256k1/v4 v4.0.1 h1:YLtO71vCjJRCBcrPMtQ9nqBsqpA1m5sE92cU+pd5Mcc= -github.com/decred/dcrd/dcrec/secp256k1/v4 v4.0.1/go.mod h1:hyedUtir6IdtD/7lIxGeCxkaw7y45JueMRL4DIyJDKs= -github.com/fogleman/gg v1.2.1-0.20190220221249-0403632d5b90/go.mod h1:R/bRT+9gY/C5z7JzPU0zXsXHKM4/ayA+zqcVNZzPa1k= -github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGwJL78qG/PmXZO1EjYhfJinVAhrmmHX6Z8B9k= -github.com/jung-kurt/gofpdf v1.0.3-0.20190309125859-24315acbbda5/go.mod h1:7Id9E/uU8ce6rXgefFLlgrJj/GYY22cpxn+r32jIOes= -github.com/konsorten/go-windows-terminal-sequences v1.0.1 
h1:mweAR1A6xJ3oS2pRaGiHgQ4OO8tzTaLawm8vnODuwDk= -github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= -github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= -github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/sirupsen/logrus v1.4.2 h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4= -github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= -github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= -github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= -github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= -github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= -github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d h1:sK3txAijHtOK88l68nt020reeT1ZdKLIYetKl95FzVY= -golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20180807140117-3d87b88a115f/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2 h1:y102fOLFqhV41b+4GPiJoa0k/x+pJcEi2/HB1Y5T6fU= -golang.org/x/exp v0.0.0-20190125153040-c74c464bbbf2/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= -golang.org/x/image v0.0.0-20180708004352-c73c2afc3b81/go.mod h1:ux5Hcp/YLpHSI86hEcLt0YII63i6oz57MZXIpbrjZUs= -golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1 h1:SrN+KX8Art/Sf4HNj6Zcz06G7VEz+7w9tdXTPOZ7+l4= -golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/tools v0.0.0-20180525024113-a5b4c53f6e8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -golang.org/x/tools v0.0.0-20190206041539-40960b6deb8e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= -gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJDxZx5Dpnq+c3wtXuadVZAcxbbBo= -gonum.org/v1/gonum v0.6.1 h1:/LSrTrgZtpbXyAR6+0e152SROCkJJSh7goYWVmdPFGc= -gonum.org/v1/gonum v0.6.1/go.mod h1:9mxDZsDKxgMAuccQkewq682L+0eCu4dCN2yonUJTCLU= -gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0 h1:OE9mWmgKkjJyEmDAAtGMPjXu+YNeGvK9VTSHY6+Qihc= -gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= -gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 
v1.0.0-20190902080502-41f04d3bba15 h1:YR8cESwS4TdDjEe65xsg0ogRM/Nc3DYOhEAlW+xobZo= -gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= -gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= -pgregory.net/rapid v0.4.7 h1:MTNRktPuv5FNqOO151TM9mDTa+XHcX6ypYeISDVD14g= -pgregory.net/rapid v0.4.7/go.mod h1:UYpPVyjFHzYBGHIxLFoupi8vwk6rXNzRY9OMvVxFIOU= -rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/crypto/hash/hash.go b/crypto/hash/hash.go deleted file mode 100644 index 31f9fd08c7a..00000000000 --- a/crypto/hash/hash.go +++ /dev/null @@ -1,45 +0,0 @@ -package hash - -import ( - "bytes" - "fmt" - "io" -) - -// Hash is the hash algorithms output types -type Hash []byte - -// Equal checks if a hash is equal to a given hash -func (h Hash) Equal(input Hash) bool { - return bytes.Equal(h, input) -} - -// Hex returns the hex string representation of the hash. -func (h Hash) Hex() string { - return fmt.Sprintf("%#x", []byte(h)) -} - -// String returns the hex string representation of the hash. -func (h Hash) String() string { - return h.Hex() -} - -// Hasher interface -type Hasher interface { - // Algorithm returns the hashing algorithm of the hasher. - Algorithm() HashingAlgorithm - // Size returns the hash output length in bytes. - Size() int - // ComputeHash returns the hash output regardless of the existing hash state. - // It may update the state or not depending on the implementation. Thread safety - // also depends on the implementation. - ComputeHash([]byte) Hash - // Write([]bytes) (using the io.Writer interface) adds more bytes to the - // current hash state. - io.Writer - // SumHash returns the hash output. - // It may update the state or not depending on the implementation. - SumHash() Hash - // Reset resets the hash state. 
- Reset() -} diff --git a/crypto/hash/hash_test.go b/crypto/hash/hash_test.go deleted file mode 100644 index 21c14134fde..00000000000 --- a/crypto/hash/hash_test.go +++ /dev/null @@ -1,335 +0,0 @@ -package hash - -import ( - "crypto/rand" - "crypto/sha256" - "crypto/sha512" - "encoding/hex" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "golang.org/x/crypto/sha3" -) - -// Sanity check of SHA3_256 -func TestSanitySHA3_256(t *testing.T) { - input := []byte("test") - expected, _ := hex.DecodeString("36f028580bb02cc8272a9a020f4200e346e276ae664e45ee80745574e2f5ab80") - - alg := NewSHA3_256() - hash := alg.ComputeHash(input) - assert.Equal(t, Hash(expected), hash) -} - -// Sanity check of SHA3_384 -func TestSanitySHA3_384(t *testing.T) { - input := []byte("test") - expected, _ := hex.DecodeString("e516dabb23b6e30026863543282780a3ae0dccf05551cf0295178d7ff0f1b41eecb9db3ff219007c4e097260d58621bd") - - alg := NewSHA3_384() - hash := alg.ComputeHash(input) - assert.Equal(t, Hash(expected), hash) -} - -// Sanity check of SHA2_256 -func TestSanitySHA2_256(t *testing.T) { - input := []byte("test") - expected, _ := hex.DecodeString("9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08") - - alg := NewSHA2_256() - hash := alg.ComputeHash(input) - assert.Equal(t, Hash(expected), hash) -} - -// Sanity check of SHA2_384 -func TestSanitySHA2_384(t *testing.T) { - input := []byte("test") - expected, _ := hex.DecodeString("768412320f7b0aa5812fce428dc4706b3cae50e02a64caa16a782249bfe8efc4b7ef1ccb126255d196047dfedf17a0a9") - - alg := NewSHA2_384() - hash := alg.ComputeHash(input) - assert.Equal(t, Hash(expected), hash) -} - -// Sanity check of Keccak_256 -func TestSanityKeccak_256(t *testing.T) { - input := []byte("test") - expected, _ := hex.DecodeString("9c22ff5f21f0b81b113e63f7db6da94fedef11b2119b4088b89664fb9a3cb658") - - alg := NewKeccak_256() - hash := alg.ComputeHash(input) - assert.Equal(t, Hash(expected), hash) -} - -// Sanity checks of KMAC128 -// the test vector is taken from the NIST document -// https://csrc.nist.gov/CSRC/media/Projects/Cryptographic-Standards-and-Guidelines/documents/examples/Kmac_samples.pdf -func TestSanityKmac128(t *testing.T) { - - input := []byte{0x00, 0x01, 0x02, 0x03} - expected := []Hash{ - {0xE5, 0x78, 0x0B, 0x0D, 0x3E, 0xA6, 0xF7, 0xD3, 0xA4, 0x29, 0xC5, 0x70, 0x6A, 0xA4, 0x3A, 0x00, - 0xFA, 0xDB, 0xD7, 0xD4, 0x96, 0x28, 0x83, 0x9E, 0x31, 0x87, 0x24, 0x3F, 0x45, 0x6E, 0xE1, 0x4E}, - {0x3B, 0x1F, 0xBA, 0x96, 0x3C, 0xD8, 0xB0, 0xB5, 0x9E, 0x8C, 0x1A, 0x6D, 0x71, 0x88, 0x8B, 0x71, - 0x43, 0x65, 0x1A, 0xF8, 0xBA, 0x0A, 0x70, 0x70, 0xC0, 0x97, 0x9E, 0x28, 0x11, 0x32, 0x4A, 0xA5}, - } - key := []byte{0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, - 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F} - customizers := [][]byte{ - []byte(""), - []byte("My Tagged Application"), - } - outputSize := 32 - - alg, err := NewKMAC_128(key, customizers[0], outputSize) - require.Nil(t, err) - _, _ = alg.Write(input[0:2]) - _, _ = alg.Write(input[2:]) - hash := alg.SumHash() - assert.Equal(t, expected[0], hash) - - for i := 0; i < len(customizers); i++ { - alg, err = NewKMAC_128(key, customizers[i], outputSize) - require.Nil(t, err) - hash = alg.ComputeHash(input) - assert.Equal(t, expected[i], hash) - } - - // test short key length - _, err = NewKMAC_128(key[:15], customizers[0], outputSize) - assert.Error(t, err) -} - -// TestHashersAPI 
tests the expected definition of the hashers APIs -func TestHashersAPI(t *testing.T) { - - newKmac128 := func() Hasher { - kmac, err := NewKMAC_128([]byte("test_key________"), []byte("test_custommizer"), 32) - if err != nil { - panic("new kmac hasher failed") - } - return kmac - } - - newHasherFunctions := [](func() Hasher){ - NewSHA2_256, - NewSHA2_384, - NewSHA3_256, - NewSHA3_384, - newKmac128, - NewKeccak_256, - } - - data := make([]byte, 1801) - _, err := rand.Read(data) - require.NoError(t, err) - - for _, newFunction := range newHasherFunctions { - // Reset should empty the state - h := newFunction() - expectedEmptyHash := h.SumHash() - _, _ = h.Write(data) - h.Reset() - emptyHash := h.SumHash() - assert.Equal(t, expectedEmptyHash, emptyHash) - - // SumHash on an empty state is equal to compute hash with empty data - emptyHash = h.ComputeHash(nil) - assert.Equal(t, expectedEmptyHash, emptyHash) - - // successive writes of data are equivalent to compute hash - // of the concatenated data - h = newFunction() - hash1 := h.ComputeHash(data) - - h.Reset() - _, _ = h.Write(data[:355]) - _, _ = h.Write(data[355:902]) - _, _ = h.Write(data[902:]) - hash2 := h.SumHash() - assert.Equal(t, hash1, hash2) - - // ComputeHash output does not depend on the hasher state - h = newFunction() - - _, _ = h.Write([]byte("dummy data")) - hash1 = h.ComputeHash(data) - assert.Equal(t, hash1, hash2) - } -} - -// TestSHA2 is a specific test of SHA2-256 and SHA2-384. -// It compares the hashes of random data of different lengths to -// the output of standard Go sha2. -func TestSHA2(t *testing.T) { - - t.Run("SHA2_256", func(t *testing.T) { - for i := 0; i < 5000; i++ { - value := make([]byte, i) - _, err := rand.Read(value) - require.NoError(t, err) - expected := sha256.Sum256(value) - - // test hash computation using the hasher - hasher := NewSHA2_256() - h := hasher.ComputeHash(value) - assert.Equal(t, expected[:], []byte(h)) - - // test hash computation using the light api - var res [HashLenSHA2_256]byte - ComputeSHA2_256(&res, value) - assert.Equal(t, expected[:], res[:]) - } - }) - - t.Run("SHA2_384", func(t *testing.T) { - for i := 0; i < 5000; i++ { - value := make([]byte, i) - _, err := rand.Read(value) - require.NoError(t, err) - expected := sha512.Sum384(value) - - hasher := NewSHA2_384() - h := hasher.ComputeHash(value) - assert.Equal(t, expected[:], []byte(h)) - } - }) -} - -// TestSHA3 is a specific test of SHA3-256 and SHA3-384. -// It compares the hashes of random data of different lengths to -// the output of standard Go sha3. -func TestSHA3(t *testing.T) { - t.Run("SHA3_256", func(t *testing.T) { - for i := 0; i < 5000; i++ { - value := make([]byte, i) - _, err := rand.Read(value) - require.NoError(t, err) - expected := sha3.Sum256(value) - - // test hash computation using the hasher - hasher := NewSHA3_256() - h := hasher.ComputeHash(value) - assert.Equal(t, expected[:], []byte(h)) - - // test hash computation using the light api - var res [HashLenSHA3_256]byte - ComputeSHA3_256(&res, value) - assert.Equal(t, expected[:], res[:]) - } - }) - - t.Run("SHA3_384", func(t *testing.T) { - for i := 0; i < 5000; i++ { - value := make([]byte, i) - _, err := rand.Read(value) - require.NoError(t, err) - expected := sha3.Sum384(value) - - hasher := NewSHA3_384() - h := hasher.ComputeHash(value) - assert.Equal(t, expected[:], []byte(h)) - } - }) -} - -// TestKeccak is a specific test of Keccak-256. -// It compares the hashes of random data of different lengths to -// the output of Go LegacyKeccak. 
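`TestHashersAPI` above pins down the `Hasher` contract: successive `Write` calls followed by `SumHash` must equal a one-shot `ComputeHash`, and `Reset` restores the empty state. A minimal sketch of that contract against the successor module's hash package (import path assumed):

```go
package main

import (
	"fmt"

	"github.com/onflow/crypto/hash" // assumed successor of the deleted hash package
)

func main() {
	data := []byte("some input to hash")

	h := hash.NewSHA3_256()
	oneShot := h.ComputeHash(data)

	h.Reset()
	_, _ = h.Write(data[:5]) // Write never errors, per the Hasher interface
	_, _ = h.Write(data[5:])
	streamed := h.SumHash()

	fmt.Println(oneShot.Equal(streamed)) // true: streaming matches one-shot
	fmt.Println(oneShot.Hex())
}
```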
-func TestKeccak(t *testing.T) { - for i := 0; i < 5000; i++ { - value := make([]byte, i) - _, err := rand.Read(value) - require.NoError(t, err) - k := sha3.NewLegacyKeccak256() - k.Write(value) - expected := k.Sum(nil) - - // test hash computation using the hasher - hasher := NewKeccak_256() - h := hasher.ComputeHash(value) - assert.Equal(t, expected[:], []byte(h)) - } -} - -// Benchmark of all hashers' ComputeHash function -func BenchmarkComputeHash(b *testing.B) { - - m := make([]byte, 32) - _, err := rand.Read(m) - require.NoError(b, err) - - b.Run("SHA2_256", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - alg := NewSHA2_256() - _ = alg.ComputeHash(m) - } - b.StopTimer() - }) - - b.Run("SHA2_256_light", func(b *testing.B) { - var h [HashLenSHA2_256]byte - b.ResetTimer() - for i := 0; i < b.N; i++ { - ComputeSHA2_256(&h, m) - } - b.StopTimer() - }) - - b.Run("SHA2_384", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - alg := NewSHA2_384() - _ = alg.ComputeHash(m) - } - b.StopTimer() - }) - - b.Run("SHA3_256", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - alg := NewSHA3_256() - alg.ComputeHash(m) - } - b.StopTimer() - }) - - b.Run("SHA3_256_light", func(b *testing.B) { - var h [HashLenSHA3_256]byte - b.ResetTimer() - for i := 0; i < b.N; i++ { - ComputeSHA3_256(&h, m) - } - b.StopTimer() - }) - - b.Run("SHA3_384", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - alg := NewSHA3_384() - _ = alg.ComputeHash(m) - } - b.StopTimer() - }) - - b.Run("Keccak_256", func(b *testing.B) { - b.ResetTimer() - for i := 0; i < b.N; i++ { - alg := NewKeccak_256() - alg.ComputeHash(m) - } - b.StopTimer() - }) - - // KMAC128 with 128 bytes output - b.Run("KMAC128_128", func(b *testing.B) { - alg, _ := NewKMAC_128([]byte("bench_key________"), []byte("bench_custommizer"), 128) - b.ResetTimer() - for i := 0; i < b.N; i++ { - _ = alg.ComputeHash(m) - } - b.StopTimer() - }) -} diff --git a/crypto/hash/keccak.go b/crypto/hash/keccak.go deleted file mode 100644 index 6c515c02d83..00000000000 --- a/crypto/hash/keccak.go +++ /dev/null @@ -1,204 +0,0 @@ -package hash - -// Size returns the output size of the hash function in bytes. -func (d *spongeState) Size() int { - return d.outputLen -} - -// Algorithm returns the hashing algorithm of the instance. -func (s *spongeState) Algorithm() HashingAlgorithm { - return s.algo -} - -// ComputeHash calculates and returns the digest of the input. -// It updates the state (and therefore not thread-safe) and doesn't allow -// further writing without calling Reset(). -func (s *spongeState) ComputeHash(data []byte) Hash { - s.Reset() - s.write(data) - return s.sum() -} - -// SumHash returns the digest of the data written to the state. -// It updates the state and doesn't allow further writing without -// calling Reset(). -func (s *spongeState) SumHash() Hash { - return s.sum() -} - -// Write absorbs more data into the hash's state. -// It returns the number of bytes written and never errors. -func (d *spongeState) Write(p []byte) (int, error) { - d.write(p) - return len(p), nil -} - -// The functions below were copied and modified from golang.org/x/crypto/sha3. -// -// Copyright (c) 2009 The Go Authors. All rights reserved. 
- -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: - -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. - -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -type spongeState struct { - // the hashing algorithm name - algo HashingAlgorithm - - a [25]uint64 // main state of the hash - storage storageBuf // constant size array - // `buf` is a sub-slice that points into `storage` using `bufIndex` and `bufSize`: - // - `bufIndex` is the index of the first element of buf - // - `bufSize` is the size of buf - bufIndex int - bufSize int - rate int // the number of bytes of state to use - // dsbyte contains the domain separation bits (if any are defined) - // and the first bit of the 10*1 padding. - // Using a little-endian bit-ordering convention, it is 0b01 for SHA-3 - // and not defined for legacy Keccak. - // The padding 10*1 is applied to pad the message to a multiple - // of the rate, which involves adding a "1" bit, zero or more "0" bits, and - // a final "1" bit. We merge the first "1" bit from the padding into dsbyte, - // ( giving 0b00000110 for SHA-3 and 0b00000001 for legacy Keccak) - // [1] https://keccak.team/sponge_duplex.html - // "The sponge and duplex constructions" - dsByte byte // the domain separation byte with one bit padding - outputLen int // the default output size in bytes -} - -const ( - // maxRate is the maximum size of the internal buffer. SHA3-256 - // currently needs the largest buffer among supported sponge-based - // algorithms. - maxRate = rateSHA3_256 - - // initialization value of the buffer index - bufNilValue = -1 -) - -// returns the current buf -func (d *spongeState) buf() []byte { - return d.storage.asBytes()[d.bufIndex : d.bufIndex+d.bufSize] -} - -// setBuf assigns `buf` (sub-slice of `storage`) to a sub-slice of `storage` -// defined by a starting index and size. -func (d *spongeState) setBuf(start, size int) { - d.bufIndex = start - d.bufSize = size -} - -// checks if `buf` is nil (not yet set) -func (d *spongeState) bufIsNil() bool { - return d.bufSize == bufNilValue -} - -// appendBuf appends a slice to `buf` (sub-slice of `storage`) -// The function assumes the appended buffer still fits into `storage`. 
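To make the dsByte comment above concrete: the domain-separation bits and the first "1" of the 10*1 padding share a single byte, and the closing "1" lands in the most significant bit of the last byte of the rate-sized block. A minimal sketch of that merge (hypothetical helper mirroring the padAndPermute logic below; the tail is assumed shorter than the rate r):

// padFinalBlock pads a final partial block of a sponge with rate r.
// dsByte is 0x06 for SHA-3 and 0x01 for legacy Keccak.
func padFinalBlock(tail []byte, r int, dsByte byte) []byte {
	block := make([]byte, r)
	copy(block, tail)
	block[len(tail)] = dsByte // domain bits merged with the first "1" padding bit
	block[r-1] ^= 0x80        // closing "1" bit: MSB of the last byte
	return block
}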
-func (d *spongeState) appendBuf(slice []byte) { - copy(d.storage.asBytes()[d.bufIndex+d.bufSize:], slice) - d.bufSize += len(slice) -} - -// Reset clears the internal state. -func (d *spongeState) Reset() { - // Zero the permutation's state. - for i := range d.a { - d.a[i] = 0 - } - d.setBuf(0, 0) -} - -// permute applies the KeccakF-1600 permutation. -func (d *spongeState) permute() { - // xor the input into the state before applying the permutation. - xorIn(d, d.buf()) - d.setBuf(0, 0) - keccakF1600(&d.a) -} - -func (d *spongeState) write(p []byte) { - if d.bufIsNil() { - d.setBuf(0, 0) - } - - for len(p) > 0 { - if d.bufSize == 0 && len(p) >= d.rate { - // The fast path; absorb a full "rate" bytes of input and apply the permutation. - xorIn(d, p[:d.rate]) - p = p[d.rate:] - keccakF1600(&d.a) - } else { - // The slow path; buffer the input until we can fill the sponge, and then xor it in. - todo := d.rate - d.bufSize - if todo > len(p) { - todo = len(p) - } - d.appendBuf(p[:todo]) - p = p[todo:] - - // If the sponge is full, apply the permutation. - if d.bufSize == d.rate { - d.permute() - } - } - } -} - -// padAndPermute appends the domain separation bits in dsbyte, applies -// the multi-bitrate 10..1 padding rule, and permutes the state. -func (d *spongeState) padAndPermute() { - if d.bufIsNil() { - d.setBuf(0, 0) - } - // Pad this instance's input with dsbyte. We know that there's - // at least one byte of space in d.buf because, if it were full, - // permute would have been called to empty it. dsbyte also contains the - // first one bit for the padding. See the comment in the state struct. - d.appendBuf([]byte{d.dsByte}) - zerosStart := d.bufSize - d.setBuf(0, d.rate) - buf := d.buf() - for i := zerosStart; i < d.rate; i++ { - buf[i] = 0 - } - // This adds the final one bit for the padding. Because of the way that - // bits are numbered from the LSB upwards, the final bit is the MSB of - // the last byte. - buf[d.rate-1] ^= 0x80 - // Apply the permutation - d.permute() - d.setBuf(0, d.rate) -} - -// sum applies padding to the hash state and then squeezes out the desired -// number of output bytes. -func (d *spongeState) sum() []byte { - hash := make([]byte, d.outputLen) - d.padAndPermute() - copyOut(hash, d) - return hash -} diff --git a/crypto/hash/keccak.s b/crypto/hash/keccak.s deleted file mode 100644 index 01e35bb9a5c..00000000000 --- a/crypto/hash/keccak.s +++ /dev/null @@ -1,419 +0,0 @@ -// The functions below were copied from golang.org/x/crypto/sha3. -// -// Copyright (c) 2009 The Go Authors. All rights reserved. - -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: - -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. - -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -// +build amd64,!purego,gc - -// This code was translated into a form compatible with 6a from the public -// domain sources at https://github.com/gvanas/KeccakCodePackage - -// Offsets in state - -#define _ba (0*8) -#define _be (1*8) -#define _bi (2*8) -#define _bo (3*8) -#define _bu (4*8) -#define _ga (5*8) -#define _ge (6*8) -#define _gi (7*8) -#define _go (8*8) -#define _gu (9*8) -#define _ka (10*8) -#define _ke (11*8) -#define _ki (12*8) -#define _ko (13*8) -#define _ku (14*8) -#define _ma (15*8) -#define _me (16*8) -#define _mi (17*8) -#define _mo (18*8) -#define _mu (19*8) -#define _sa (20*8) -#define _se (21*8) -#define _si (22*8) -#define _so (23*8) -#define _su (24*8) - -// Temporary registers -#define rT1 AX - -// Round vars -#define rpState DI -#define rpStack SP - -#define rDa BX -#define rDe CX -#define rDi DX -#define rDo R8 -#define rDu R9 - -#define rBa R10 -#define rBe R11 -#define rBi R12 -#define rBo R13 -#define rBu R14 - -#define rCa SI -#define rCe BP -#define rCi rBi -#define rCo rBo -#define rCu R15 - -#define MOVQ_RBI_RCE MOVQ rBi, rCe -#define XORQ_RT1_RCA XORQ rT1, rCa -#define XORQ_RT1_RCE XORQ rT1, rCe -#define XORQ_RBA_RCU XORQ rBa, rCu -#define XORQ_RBE_RCU XORQ rBe, rCu -#define XORQ_RDU_RCU XORQ rDu, rCu -#define XORQ_RDA_RCA XORQ rDa, rCa -#define XORQ_RDE_RCE XORQ rDe, rCe - -#define mKeccakRound(iState, oState, rc, B_RBI_RCE, G_RT1_RCA, G_RT1_RCE, G_RBA_RCU, K_RT1_RCA, K_RT1_RCE, K_RBA_RCU, M_RT1_RCA, M_RT1_RCE, M_RBE_RCU, S_RDU_RCU, S_RDA_RCA, S_RDE_RCE) \ - /* Prepare round */ \ - MOVQ rCe, rDa; \ - ROLQ $1, rDa; \ - \ - MOVQ _bi(iState), rCi; \ - XORQ _gi(iState), rDi; \ - XORQ rCu, rDa; \ - XORQ _ki(iState), rCi; \ - XORQ _mi(iState), rDi; \ - XORQ rDi, rCi; \ - \ - MOVQ rCi, rDe; \ - ROLQ $1, rDe; \ - \ - MOVQ _bo(iState), rCo; \ - XORQ _go(iState), rDo; \ - XORQ rCa, rDe; \ - XORQ _ko(iState), rCo; \ - XORQ _mo(iState), rDo; \ - XORQ rDo, rCo; \ - \ - MOVQ rCo, rDi; \ - ROLQ $1, rDi; \ - \ - MOVQ rCu, rDo; \ - XORQ rCe, rDi; \ - ROLQ $1, rDo; \ - \ - MOVQ rCa, rDu; \ - XORQ rCi, rDo; \ - ROLQ $1, rDu; \ - \ - /* Result b */ \ - MOVQ _ba(iState), rBa; \ - MOVQ _ge(iState), rBe; \ - XORQ rCo, rDu; \ - MOVQ _ki(iState), rBi; \ - MOVQ _mo(iState), rBo; \ - MOVQ _su(iState), rBu; \ - XORQ rDe, rBe; \ - ROLQ $44, rBe; \ - XORQ rDi, rBi; \ - XORQ rDa, rBa; \ - ROLQ $43, rBi; \ - \ - MOVQ rBe, rCa; \ - MOVQ rc, rT1; \ - ORQ rBi, rCa; \ - XORQ rBa, rT1; \ - XORQ rT1, rCa; \ - MOVQ rCa, _ba(oState); \ - \ - XORQ rDu, rBu; \ - ROLQ $14, rBu; \ - MOVQ rBa, rCu; \ - ANDQ rBe, rCu; \ - XORQ rBu, rCu; \ - MOVQ rCu, _bu(oState); \ - \ - XORQ rDo, rBo; \ - ROLQ $21, rBo; \ - MOVQ rBo, rT1; \ - ANDQ rBu, rT1; \ - XORQ rBi, rT1; \ - MOVQ rT1, _bi(oState); \ - \ - NOTQ rBi; \ - ORQ rBa, rBu; \ - ORQ rBo, rBi; \ - XORQ rBo, rBu; \ - XORQ rBe, rBi; \ - MOVQ rBu, _bo(oState); \ - MOVQ rBi, _be(oState); \ - B_RBI_RCE; \ - \ - /* Result g */ \ - MOVQ _gu(iState), rBe; \ - XORQ rDu, rBe; \ - MOVQ _ka(iState), rBi; \ - ROLQ $20, rBe; \ - 
XORQ rDa, rBi; \ - ROLQ $3, rBi; \ - MOVQ _bo(iState), rBa; \ - MOVQ rBe, rT1; \ - ORQ rBi, rT1; \ - XORQ rDo, rBa; \ - MOVQ _me(iState), rBo; \ - MOVQ _si(iState), rBu; \ - ROLQ $28, rBa; \ - XORQ rBa, rT1; \ - MOVQ rT1, _ga(oState); \ - G_RT1_RCA; \ - \ - XORQ rDe, rBo; \ - ROLQ $45, rBo; \ - MOVQ rBi, rT1; \ - ANDQ rBo, rT1; \ - XORQ rBe, rT1; \ - MOVQ rT1, _ge(oState); \ - G_RT1_RCE; \ - \ - XORQ rDi, rBu; \ - ROLQ $61, rBu; \ - MOVQ rBu, rT1; \ - ORQ rBa, rT1; \ - XORQ rBo, rT1; \ - MOVQ rT1, _go(oState); \ - \ - ANDQ rBe, rBa; \ - XORQ rBu, rBa; \ - MOVQ rBa, _gu(oState); \ - NOTQ rBu; \ - G_RBA_RCU; \ - \ - ORQ rBu, rBo; \ - XORQ rBi, rBo; \ - MOVQ rBo, _gi(oState); \ - \ - /* Result k */ \ - MOVQ _be(iState), rBa; \ - MOVQ _gi(iState), rBe; \ - MOVQ _ko(iState), rBi; \ - MOVQ _mu(iState), rBo; \ - MOVQ _sa(iState), rBu; \ - XORQ rDi, rBe; \ - ROLQ $6, rBe; \ - XORQ rDo, rBi; \ - ROLQ $25, rBi; \ - MOVQ rBe, rT1; \ - ORQ rBi, rT1; \ - XORQ rDe, rBa; \ - ROLQ $1, rBa; \ - XORQ rBa, rT1; \ - MOVQ rT1, _ka(oState); \ - K_RT1_RCA; \ - \ - XORQ rDu, rBo; \ - ROLQ $8, rBo; \ - MOVQ rBi, rT1; \ - ANDQ rBo, rT1; \ - XORQ rBe, rT1; \ - MOVQ rT1, _ke(oState); \ - K_RT1_RCE; \ - \ - XORQ rDa, rBu; \ - ROLQ $18, rBu; \ - NOTQ rBo; \ - MOVQ rBo, rT1; \ - ANDQ rBu, rT1; \ - XORQ rBi, rT1; \ - MOVQ rT1, _ki(oState); \ - \ - MOVQ rBu, rT1; \ - ORQ rBa, rT1; \ - XORQ rBo, rT1; \ - MOVQ rT1, _ko(oState); \ - \ - ANDQ rBe, rBa; \ - XORQ rBu, rBa; \ - MOVQ rBa, _ku(oState); \ - K_RBA_RCU; \ - \ - /* Result m */ \ - MOVQ _ga(iState), rBe; \ - XORQ rDa, rBe; \ - MOVQ _ke(iState), rBi; \ - ROLQ $36, rBe; \ - XORQ rDe, rBi; \ - MOVQ _bu(iState), rBa; \ - ROLQ $10, rBi; \ - MOVQ rBe, rT1; \ - MOVQ _mi(iState), rBo; \ - ANDQ rBi, rT1; \ - XORQ rDu, rBa; \ - MOVQ _so(iState), rBu; \ - ROLQ $27, rBa; \ - XORQ rBa, rT1; \ - MOVQ rT1, _ma(oState); \ - M_RT1_RCA; \ - \ - XORQ rDi, rBo; \ - ROLQ $15, rBo; \ - MOVQ rBi, rT1; \ - ORQ rBo, rT1; \ - XORQ rBe, rT1; \ - MOVQ rT1, _me(oState); \ - M_RT1_RCE; \ - \ - XORQ rDo, rBu; \ - ROLQ $56, rBu; \ - NOTQ rBo; \ - MOVQ rBo, rT1; \ - ORQ rBu, rT1; \ - XORQ rBi, rT1; \ - MOVQ rT1, _mi(oState); \ - \ - ORQ rBa, rBe; \ - XORQ rBu, rBe; \ - MOVQ rBe, _mu(oState); \ - \ - ANDQ rBa, rBu; \ - XORQ rBo, rBu; \ - MOVQ rBu, _mo(oState); \ - M_RBE_RCU; \ - \ - /* Result s */ \ - MOVQ _bi(iState), rBa; \ - MOVQ _go(iState), rBe; \ - MOVQ _ku(iState), rBi; \ - XORQ rDi, rBa; \ - MOVQ _ma(iState), rBo; \ - ROLQ $62, rBa; \ - XORQ rDo, rBe; \ - MOVQ _se(iState), rBu; \ - ROLQ $55, rBe; \ - \ - XORQ rDu, rBi; \ - MOVQ rBa, rDu; \ - XORQ rDe, rBu; \ - ROLQ $2, rBu; \ - ANDQ rBe, rDu; \ - XORQ rBu, rDu; \ - MOVQ rDu, _su(oState); \ - \ - ROLQ $39, rBi; \ - S_RDU_RCU; \ - NOTQ rBe; \ - XORQ rDa, rBo; \ - MOVQ rBe, rDa; \ - ANDQ rBi, rDa; \ - XORQ rBa, rDa; \ - MOVQ rDa, _sa(oState); \ - S_RDA_RCA; \ - \ - ROLQ $41, rBo; \ - MOVQ rBi, rDe; \ - ORQ rBo, rDe; \ - XORQ rBe, rDe; \ - MOVQ rDe, _se(oState); \ - S_RDE_RCE; \ - \ - MOVQ rBo, rDi; \ - MOVQ rBu, rDo; \ - ANDQ rBu, rDi; \ - ORQ rBa, rDo; \ - XORQ rBi, rDi; \ - XORQ rBo, rDo; \ - MOVQ rDi, _si(oState); \ - MOVQ rDo, _so(oState) \ - -// func keccakF1600(state *[25]uint64) -TEXT ·keccakF1600(SB), 0, $200-8 - MOVQ state+0(FP), rpState - - // Convert the user state into an internal state - NOTQ _be(rpState) - NOTQ _bi(rpState) - NOTQ _go(rpState) - NOTQ _ki(rpState) - NOTQ _mi(rpState) - NOTQ _sa(rpState) - - // Execute the KeccakF permutation - MOVQ _ba(rpState), rCa - MOVQ _be(rpState), rCe - MOVQ _bu(rpState), rCu - - XORQ 
_ga(rpState), rCa - XORQ _ge(rpState), rCe - XORQ _gu(rpState), rCu - - XORQ _ka(rpState), rCa - XORQ _ke(rpState), rCe - XORQ _ku(rpState), rCu - - XORQ _ma(rpState), rCa - XORQ _me(rpState), rCe - XORQ _mu(rpState), rCu - - XORQ _sa(rpState), rCa - XORQ _se(rpState), rCe - MOVQ _si(rpState), rDi - MOVQ _so(rpState), rDo - XORQ _su(rpState), rCu - - mKeccakRound(rpState, rpStack, $0x0000000000000001, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x0000000000008082, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x800000000000808a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000080008000, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x000000000000808b, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x0000000080000001, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x8000000080008081, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000000008009, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x000000000000008a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x0000000000000088, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x0000000080008009, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x000000008000000a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x000000008000808b, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x800000000000008b, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, 
XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x8000000000008089, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000000008003, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x8000000000008002, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000000000080, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x000000000000800a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x800000008000000a, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x8000000080008081, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000000008080, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpState, rpStack, $0x0000000080000001, MOVQ_RBI_RCE, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBA_RCU, XORQ_RT1_RCA, XORQ_RT1_RCE, XORQ_RBE_RCU, XORQ_RDU_RCU, XORQ_RDA_RCA, XORQ_RDE_RCE) - mKeccakRound(rpStack, rpState, $0x8000000080008008, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP, NOP) - - // Revert the internal state to the user state - NOTQ _be(rpState) - NOTQ _bi(rpState) - NOTQ _go(rpState) - NOTQ _ki(rpState) - NOTQ _mi(rpState) - NOTQ _sa(rpState) - - RET - \ No newline at end of file diff --git a/crypto/hash/keccakf.go b/crypto/hash/keccakf.go deleted file mode 100644 index 76d0b9a1a5d..00000000000 --- a/crypto/hash/keccakf.go +++ /dev/null @@ -1,439 +0,0 @@ -// The functions below were copied from golang.org/x/crypto/sha3. -// -// Copyright (c) 2009 The Go Authors. All rights reserved. - -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: - -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. 
nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. - -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -//go:build !amd64 || purego || !gc -// +build !amd64 purego !gc - -package hash - -// rc stores the round constants for use in the ι step. -var rc = [24]uint64{ - 0x0000000000000001, - 0x0000000000008082, - 0x800000000000808A, - 0x8000000080008000, - 0x000000000000808B, - 0x0000000080000001, - 0x8000000080008081, - 0x8000000000008009, - 0x000000000000008A, - 0x0000000000000088, - 0x0000000080008009, - 0x000000008000000A, - 0x000000008000808B, - 0x800000000000008B, - 0x8000000000008089, - 0x8000000000008003, - 0x8000000000008002, - 0x8000000000000080, - 0x000000000000800A, - 0x800000008000000A, - 0x8000000080008081, - 0x8000000000008080, - 0x0000000080000001, - 0x8000000080008008, -} - -// keccakF1600 applies the Keccak permutation to a 1600b-wide -// state represented as a slice of 25 uint64s. -func keccakF1600(a *[25]uint64) { - // Implementation translated from Keccak-inplace.c - // in the keccak reference code. - var t, bc0, bc1, bc2, bc3, bc4, d0, d1, d2, d3, d4 uint64 - - for i := 0; i < 24; i += 4 { - // Combines the 5 steps in each round into 2 steps. - // Unrolls 4 rounds per loop and spreads some steps across rounds. 
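For reference while reading the unrolled blocks below, this is a sketch (illustrative, not part of the deleted file) of the plain five-step round (theta, rho, pi, chi, iota) that each unrolled block computes; applying it for i = 0..23 with rc[i] is equivalent to one keccakF1600 call:

import "math/bits"

// rho rotation offsets, indexed by lane x + 5*y.
var rotc = [25]int{
	0, 1, 62, 28, 27,
	36, 44, 6, 55, 20,
	3, 10, 43, 25, 39,
	41, 45, 15, 21, 8,
	18, 2, 61, 56, 14,
}

// keccakRound is the unoptimized reference round.
func keccakRound(a *[25]uint64, roundConst uint64) {
	// theta: xor each lane with the parity of two neighboring columns.
	var c [5]uint64
	for x := 0; x < 5; x++ {
		c[x] = a[x] ^ a[x+5] ^ a[x+10] ^ a[x+15] ^ a[x+20]
	}
	for x := 0; x < 5; x++ {
		d := c[(x+4)%5] ^ bits.RotateLeft64(c[(x+1)%5], 1)
		for y := 0; y < 25; y += 5 {
			a[x+y] ^= d
		}
	}
	// rho and pi: rotate each lane and move it to its new position.
	var b [25]uint64
	for x := 0; x < 5; x++ {
		for y := 0; y < 5; y++ {
			b[y+5*((2*x+3*y)%5)] = bits.RotateLeft64(a[x+5*y], rotc[x+5*y])
		}
	}
	// chi: non-linear mix within each row.
	for y := 0; y < 25; y += 5 {
		for x := 0; x < 5; x++ {
			a[x+y] = b[x+y] ^ (^b[(x+1)%5+y] & b[(x+2)%5+y])
		}
	}
	// iota: inject the round constant.
	a[0] ^= roundConst
}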
- - // Round 1 - bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20] - bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21] - bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22] - bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23] - bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24] - d0 = bc4 ^ (bc1<<1 | bc1>>63) - d1 = bc0 ^ (bc2<<1 | bc2>>63) - d2 = bc1 ^ (bc3<<1 | bc3>>63) - d3 = bc2 ^ (bc4<<1 | bc4>>63) - d4 = bc3 ^ (bc0<<1 | bc0>>63) - - bc0 = a[0] ^ d0 - t = a[6] ^ d1 - bc1 = t<<44 | t>>(64-44) - t = a[12] ^ d2 - bc2 = t<<43 | t>>(64-43) - t = a[18] ^ d3 - bc3 = t<<21 | t>>(64-21) - t = a[24] ^ d4 - bc4 = t<<14 | t>>(64-14) - a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i] - a[6] = bc1 ^ (bc3 &^ bc2) - a[12] = bc2 ^ (bc4 &^ bc3) - a[18] = bc3 ^ (bc0 &^ bc4) - a[24] = bc4 ^ (bc1 &^ bc0) - - t = a[10] ^ d0 - bc2 = t<<3 | t>>(64-3) - t = a[16] ^ d1 - bc3 = t<<45 | t>>(64-45) - t = a[22] ^ d2 - bc4 = t<<61 | t>>(64-61) - t = a[3] ^ d3 - bc0 = t<<28 | t>>(64-28) - t = a[9] ^ d4 - bc1 = t<<20 | t>>(64-20) - a[10] = bc0 ^ (bc2 &^ bc1) - a[16] = bc1 ^ (bc3 &^ bc2) - a[22] = bc2 ^ (bc4 &^ bc3) - a[3] = bc3 ^ (bc0 &^ bc4) - a[9] = bc4 ^ (bc1 &^ bc0) - - t = a[20] ^ d0 - bc4 = t<<18 | t>>(64-18) - t = a[1] ^ d1 - bc0 = t<<1 | t>>(64-1) - t = a[7] ^ d2 - bc1 = t<<6 | t>>(64-6) - t = a[13] ^ d3 - bc2 = t<<25 | t>>(64-25) - t = a[19] ^ d4 - bc3 = t<<8 | t>>(64-8) - a[20] = bc0 ^ (bc2 &^ bc1) - a[1] = bc1 ^ (bc3 &^ bc2) - a[7] = bc2 ^ (bc4 &^ bc3) - a[13] = bc3 ^ (bc0 &^ bc4) - a[19] = bc4 ^ (bc1 &^ bc0) - - t = a[5] ^ d0 - bc1 = t<<36 | t>>(64-36) - t = a[11] ^ d1 - bc2 = t<<10 | t>>(64-10) - t = a[17] ^ d2 - bc3 = t<<15 | t>>(64-15) - t = a[23] ^ d3 - bc4 = t<<56 | t>>(64-56) - t = a[4] ^ d4 - bc0 = t<<27 | t>>(64-27) - a[5] = bc0 ^ (bc2 &^ bc1) - a[11] = bc1 ^ (bc3 &^ bc2) - a[17] = bc2 ^ (bc4 &^ bc3) - a[23] = bc3 ^ (bc0 &^ bc4) - a[4] = bc4 ^ (bc1 &^ bc0) - - t = a[15] ^ d0 - bc3 = t<<41 | t>>(64-41) - t = a[21] ^ d1 - bc4 = t<<2 | t>>(64-2) - t = a[2] ^ d2 - bc0 = t<<62 | t>>(64-62) - t = a[8] ^ d3 - bc1 = t<<55 | t>>(64-55) - t = a[14] ^ d4 - bc2 = t<<39 | t>>(64-39) - a[15] = bc0 ^ (bc2 &^ bc1) - a[21] = bc1 ^ (bc3 &^ bc2) - a[2] = bc2 ^ (bc4 &^ bc3) - a[8] = bc3 ^ (bc0 &^ bc4) - a[14] = bc4 ^ (bc1 &^ bc0) - - // Round 2 - bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20] - bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21] - bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22] - bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23] - bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24] - d0 = bc4 ^ (bc1<<1 | bc1>>63) - d1 = bc0 ^ (bc2<<1 | bc2>>63) - d2 = bc1 ^ (bc3<<1 | bc3>>63) - d3 = bc2 ^ (bc4<<1 | bc4>>63) - d4 = bc3 ^ (bc0<<1 | bc0>>63) - - bc0 = a[0] ^ d0 - t = a[16] ^ d1 - bc1 = t<<44 | t>>(64-44) - t = a[7] ^ d2 - bc2 = t<<43 | t>>(64-43) - t = a[23] ^ d3 - bc3 = t<<21 | t>>(64-21) - t = a[14] ^ d4 - bc4 = t<<14 | t>>(64-14) - a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+1] - a[16] = bc1 ^ (bc3 &^ bc2) - a[7] = bc2 ^ (bc4 &^ bc3) - a[23] = bc3 ^ (bc0 &^ bc4) - a[14] = bc4 ^ (bc1 &^ bc0) - - t = a[20] ^ d0 - bc2 = t<<3 | t>>(64-3) - t = a[11] ^ d1 - bc3 = t<<45 | t>>(64-45) - t = a[2] ^ d2 - bc4 = t<<61 | t>>(64-61) - t = a[18] ^ d3 - bc0 = t<<28 | t>>(64-28) - t = a[9] ^ d4 - bc1 = t<<20 | t>>(64-20) - a[20] = bc0 ^ (bc2 &^ bc1) - a[11] = bc1 ^ (bc3 &^ bc2) - a[2] = bc2 ^ (bc4 &^ bc3) - a[18] = bc3 ^ (bc0 &^ bc4) - a[9] = bc4 ^ (bc1 &^ bc0) - - t = a[15] ^ d0 - bc4 = t<<18 | t>>(64-18) - t = a[6] ^ d1 - bc0 = t<<1 | t>>(64-1) - t = a[22] ^ d2 - bc1 = t<<6 | t>>(64-6) - t = a[13] ^ d3 - bc2 = t<<25 | t>>(64-25) - t = a[4] ^ d4 - bc3 = t<<8 | t>>(64-8) - a[15] = bc0 ^ (bc2 &^ bc1) - a[6] = bc1 ^ (bc3 &^ 
bc2) - a[22] = bc2 ^ (bc4 &^ bc3) - a[13] = bc3 ^ (bc0 &^ bc4) - a[4] = bc4 ^ (bc1 &^ bc0) - - t = a[10] ^ d0 - bc1 = t<<36 | t>>(64-36) - t = a[1] ^ d1 - bc2 = t<<10 | t>>(64-10) - t = a[17] ^ d2 - bc3 = t<<15 | t>>(64-15) - t = a[8] ^ d3 - bc4 = t<<56 | t>>(64-56) - t = a[24] ^ d4 - bc0 = t<<27 | t>>(64-27) - a[10] = bc0 ^ (bc2 &^ bc1) - a[1] = bc1 ^ (bc3 &^ bc2) - a[17] = bc2 ^ (bc4 &^ bc3) - a[8] = bc3 ^ (bc0 &^ bc4) - a[24] = bc4 ^ (bc1 &^ bc0) - - t = a[5] ^ d0 - bc3 = t<<41 | t>>(64-41) - t = a[21] ^ d1 - bc4 = t<<2 | t>>(64-2) - t = a[12] ^ d2 - bc0 = t<<62 | t>>(64-62) - t = a[3] ^ d3 - bc1 = t<<55 | t>>(64-55) - t = a[19] ^ d4 - bc2 = t<<39 | t>>(64-39) - a[5] = bc0 ^ (bc2 &^ bc1) - a[21] = bc1 ^ (bc3 &^ bc2) - a[12] = bc2 ^ (bc4 &^ bc3) - a[3] = bc3 ^ (bc0 &^ bc4) - a[19] = bc4 ^ (bc1 &^ bc0) - - // Round 3 - bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20] - bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21] - bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22] - bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23] - bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24] - d0 = bc4 ^ (bc1<<1 | bc1>>63) - d1 = bc0 ^ (bc2<<1 | bc2>>63) - d2 = bc1 ^ (bc3<<1 | bc3>>63) - d3 = bc2 ^ (bc4<<1 | bc4>>63) - d4 = bc3 ^ (bc0<<1 | bc0>>63) - - bc0 = a[0] ^ d0 - t = a[11] ^ d1 - bc1 = t<<44 | t>>(64-44) - t = a[22] ^ d2 - bc2 = t<<43 | t>>(64-43) - t = a[8] ^ d3 - bc3 = t<<21 | t>>(64-21) - t = a[19] ^ d4 - bc4 = t<<14 | t>>(64-14) - a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+2] - a[11] = bc1 ^ (bc3 &^ bc2) - a[22] = bc2 ^ (bc4 &^ bc3) - a[8] = bc3 ^ (bc0 &^ bc4) - a[19] = bc4 ^ (bc1 &^ bc0) - - t = a[15] ^ d0 - bc2 = t<<3 | t>>(64-3) - t = a[1] ^ d1 - bc3 = t<<45 | t>>(64-45) - t = a[12] ^ d2 - bc4 = t<<61 | t>>(64-61) - t = a[23] ^ d3 - bc0 = t<<28 | t>>(64-28) - t = a[9] ^ d4 - bc1 = t<<20 | t>>(64-20) - a[15] = bc0 ^ (bc2 &^ bc1) - a[1] = bc1 ^ (bc3 &^ bc2) - a[12] = bc2 ^ (bc4 &^ bc3) - a[23] = bc3 ^ (bc0 &^ bc4) - a[9] = bc4 ^ (bc1 &^ bc0) - - t = a[5] ^ d0 - bc4 = t<<18 | t>>(64-18) - t = a[16] ^ d1 - bc0 = t<<1 | t>>(64-1) - t = a[2] ^ d2 - bc1 = t<<6 | t>>(64-6) - t = a[13] ^ d3 - bc2 = t<<25 | t>>(64-25) - t = a[24] ^ d4 - bc3 = t<<8 | t>>(64-8) - a[5] = bc0 ^ (bc2 &^ bc1) - a[16] = bc1 ^ (bc3 &^ bc2) - a[2] = bc2 ^ (bc4 &^ bc3) - a[13] = bc3 ^ (bc0 &^ bc4) - a[24] = bc4 ^ (bc1 &^ bc0) - - t = a[20] ^ d0 - bc1 = t<<36 | t>>(64-36) - t = a[6] ^ d1 - bc2 = t<<10 | t>>(64-10) - t = a[17] ^ d2 - bc3 = t<<15 | t>>(64-15) - t = a[3] ^ d3 - bc4 = t<<56 | t>>(64-56) - t = a[14] ^ d4 - bc0 = t<<27 | t>>(64-27) - a[20] = bc0 ^ (bc2 &^ bc1) - a[6] = bc1 ^ (bc3 &^ bc2) - a[17] = bc2 ^ (bc4 &^ bc3) - a[3] = bc3 ^ (bc0 &^ bc4) - a[14] = bc4 ^ (bc1 &^ bc0) - - t = a[10] ^ d0 - bc3 = t<<41 | t>>(64-41) - t = a[21] ^ d1 - bc4 = t<<2 | t>>(64-2) - t = a[7] ^ d2 - bc0 = t<<62 | t>>(64-62) - t = a[18] ^ d3 - bc1 = t<<55 | t>>(64-55) - t = a[4] ^ d4 - bc2 = t<<39 | t>>(64-39) - a[10] = bc0 ^ (bc2 &^ bc1) - a[21] = bc1 ^ (bc3 &^ bc2) - a[7] = bc2 ^ (bc4 &^ bc3) - a[18] = bc3 ^ (bc0 &^ bc4) - a[4] = bc4 ^ (bc1 &^ bc0) - - // Round 4 - bc0 = a[0] ^ a[5] ^ a[10] ^ a[15] ^ a[20] - bc1 = a[1] ^ a[6] ^ a[11] ^ a[16] ^ a[21] - bc2 = a[2] ^ a[7] ^ a[12] ^ a[17] ^ a[22] - bc3 = a[3] ^ a[8] ^ a[13] ^ a[18] ^ a[23] - bc4 = a[4] ^ a[9] ^ a[14] ^ a[19] ^ a[24] - d0 = bc4 ^ (bc1<<1 | bc1>>63) - d1 = bc0 ^ (bc2<<1 | bc2>>63) - d2 = bc1 ^ (bc3<<1 | bc3>>63) - d3 = bc2 ^ (bc4<<1 | bc4>>63) - d4 = bc3 ^ (bc0<<1 | bc0>>63) - - bc0 = a[0] ^ d0 - t = a[1] ^ d1 - bc1 = t<<44 | t>>(64-44) - t = a[2] ^ d2 - bc2 = t<<43 | t>>(64-43) - t = a[3] ^ d3 - bc3 = t<<21 | t>>(64-21) - t = a[4] 
^ d4 - bc4 = t<<14 | t>>(64-14) - a[0] = bc0 ^ (bc2 &^ bc1) ^ rc[i+3] - a[1] = bc1 ^ (bc3 &^ bc2) - a[2] = bc2 ^ (bc4 &^ bc3) - a[3] = bc3 ^ (bc0 &^ bc4) - a[4] = bc4 ^ (bc1 &^ bc0) - - t = a[5] ^ d0 - bc2 = t<<3 | t>>(64-3) - t = a[6] ^ d1 - bc3 = t<<45 | t>>(64-45) - t = a[7] ^ d2 - bc4 = t<<61 | t>>(64-61) - t = a[8] ^ d3 - bc0 = t<<28 | t>>(64-28) - t = a[9] ^ d4 - bc1 = t<<20 | t>>(64-20) - a[5] = bc0 ^ (bc2 &^ bc1) - a[6] = bc1 ^ (bc3 &^ bc2) - a[7] = bc2 ^ (bc4 &^ bc3) - a[8] = bc3 ^ (bc0 &^ bc4) - a[9] = bc4 ^ (bc1 &^ bc0) - - t = a[10] ^ d0 - bc4 = t<<18 | t>>(64-18) - t = a[11] ^ d1 - bc0 = t<<1 | t>>(64-1) - t = a[12] ^ d2 - bc1 = t<<6 | t>>(64-6) - t = a[13] ^ d3 - bc2 = t<<25 | t>>(64-25) - t = a[14] ^ d4 - bc3 = t<<8 | t>>(64-8) - a[10] = bc0 ^ (bc2 &^ bc1) - a[11] = bc1 ^ (bc3 &^ bc2) - a[12] = bc2 ^ (bc4 &^ bc3) - a[13] = bc3 ^ (bc0 &^ bc4) - a[14] = bc4 ^ (bc1 &^ bc0) - - t = a[15] ^ d0 - bc1 = t<<36 | t>>(64-36) - t = a[16] ^ d1 - bc2 = t<<10 | t>>(64-10) - t = a[17] ^ d2 - bc3 = t<<15 | t>>(64-15) - t = a[18] ^ d3 - bc4 = t<<56 | t>>(64-56) - t = a[19] ^ d4 - bc0 = t<<27 | t>>(64-27) - a[15] = bc0 ^ (bc2 &^ bc1) - a[16] = bc1 ^ (bc3 &^ bc2) - a[17] = bc2 ^ (bc4 &^ bc3) - a[18] = bc3 ^ (bc0 &^ bc4) - a[19] = bc4 ^ (bc1 &^ bc0) - - t = a[20] ^ d0 - bc3 = t<<41 | t>>(64-41) - t = a[21] ^ d1 - bc4 = t<<2 | t>>(64-2) - t = a[22] ^ d2 - bc0 = t<<62 | t>>(64-62) - t = a[23] ^ d3 - bc1 = t<<55 | t>>(64-55) - t = a[24] ^ d4 - bc2 = t<<39 | t>>(64-39) - a[20] = bc0 ^ (bc2 &^ bc1) - a[21] = bc1 ^ (bc3 &^ bc2) - a[22] = bc2 ^ (bc4 &^ bc3) - a[23] = bc3 ^ (bc0 &^ bc4) - a[24] = bc4 ^ (bc1 &^ bc0) - } -} diff --git a/crypto/hash/keccakf_asm.go b/crypto/hash/keccakf_asm.go deleted file mode 100644 index 978d2b6c658..00000000000 --- a/crypto/hash/keccakf_asm.go +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright 2015 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -//go:build amd64 && !purego && gc -// +build amd64,!purego,gc - -package hash - -// keccakF1600 is the Keccak permutation function with -// a width of 1600 bits and 24 rounds. -// This function is implemented in keccak.s. - -//go:noescape - -func keccakF1600(a *[25]uint64) diff --git a/crypto/hash/kmac.go b/crypto/hash/kmac.go deleted file mode 100644 index 8814fda4e99..00000000000 --- a/crypto/hash/kmac.go +++ /dev/null @@ -1,175 +0,0 @@ -package hash - -import ( - "encoding/binary" - "fmt" - - "golang.org/x/crypto/sha3" -) - -// implements the interface sha3.ShakeHash -type kmac128 struct { - // the output size of KMAC - outputSize int - // embeds ShakeHash - // stores the encoding of the function name and customization string - // Using the io.Writer interface changes the internal state - // of the KMAC - sha3.ShakeHash - // the block initialized by NewKMAC_128 - // stores the encoding of the key - initBlock []byte -} - -// the cSHAKE128 rate as defined in NIST SP 800-185 -const cSHAKE128BlockSize = 168 - -// NewKMAC_128 returns a new KMAC instance -// - key is the KMAC key (the key size is compared to the security level, although -// the parameter is used as a domain tag in Flow and not as a security key). -// - customizer is the customization string. It can be left empty if no customizer -// is required.
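A usage sketch for the constructor documented above (illustrative, not from the deleted file; it assumes the pre-removal import path and caller-chosen key, customizer, and output size):

// The key must be at least KmacMinKeyLen (16) bytes; the customizer may be empty.
kmac, err := hash.NewKMAC_128(
	[]byte("flow-example-tag"), // 16-byte key (used as a domain tag in Flow)
	[]byte("my app"),           // customization string
	32,                         // output size in bytes
)
if err != nil {
	panic(err) // key too short or negative output size
}
tag := kmac.ComputeHash([]byte("message")) // 32-byte MAC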
-func NewKMAC_128(key []byte, customizer []byte, outputSize int) (Hasher, error) { - var k kmac128 - if outputSize < 0 { - return nil, - fmt.Errorf("kmac output cannot be negative, got %d", outputSize) - } - - // check the key size (required if the key is used as a security key) - if len(key) < KmacMinKeyLen { - return nil, - fmt.Errorf("kmac key size must be at least %d", KmacMinKeyLen) - } - - k.outputSize = outputSize - // initialize the cSHAKE128 instance - k.ShakeHash = sha3.NewCShake128([]byte("KMAC"), customizer) - - // store the encoding of the key - k.initBlock = bytepad(encodeString(key), cSHAKE128BlockSize) - _, _ = k.Write(k.initBlock) - return &k, nil -} - -func (k *kmac128) Algorithm() HashingAlgorithm { - return KMAC128 -} - -const maxEncodeLen = 9 - -// encode_string function as defined in NIST SP 800-185 (for value < 2^64) -func encodeString(s []byte) []byte { - // leftEncode returns max 9 bytes - out := make([]byte, 0, maxEncodeLen+len(s)) - out = append(out, leftEncode(uint64(len(s)*8))...) - out = append(out, s...) - return out -} - -// "left_encode" function as defined in NIST SP 800-185 (for value < 2^64) -// copied from golang.org/x/crypto/sha3 -// -// Copyright (c) 2009 The Go Authors. All rights reserved. - -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: - -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. - -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -func leftEncode(value uint64) []byte { - var b [maxEncodeLen]byte - binary.BigEndian.PutUint64(b[1:], value) - // Trim all but last leading zero bytes - i := byte(1) - for i < 8 && b[i] == 0 { - i++ - } - // Prepend number of encoded bytes - b[i-1] = maxEncodeLen - i - return b[i-1:] -} - -// bytepad function as defined in NIST SP 800-185 -// copied from golang.org/x/crypto/sha3 -// The caller must make sure parameter (w) is strictly positive. -// -// Copyright (c) 2009 The Go Authors. All rights reserved. -func bytepad(input []byte, w int) []byte { - // leftEncode always returns max 9 bytes - buf := make([]byte, 0, maxEncodeLen+len(input)+w) - buf = append(buf, leftEncode(uint64(w))...) - buf = append(buf, input...) 
- padlen := w - (len(buf) % w) - return append(buf, make([]byte, padlen)...) -} - -// "right_encode" function as defined in NIST SP 800-185 (for value < 2^64) -func rightEncode(value uint64) []byte { - var b [maxEncodeLen]byte - binary.BigEndian.PutUint64(b[:8], value) - // Trim all but last leading zero bytes - i := byte(0) - for i < 7 && b[i] == 0 { - i++ - } - // Append number of encoded bytes - b[8] = maxEncodeLen - 1 - i - return b[i:] -} - -// Reset resets the hash to initial state. -func (k *kmac128) Reset() { - k.ShakeHash.Reset() - _, _ = k.Write(k.initBlock) -} - -// ComputeHash computes the mac of the input data. -// It does not update the underlying hash state (the function is thread safe). -func (k *kmac128) ComputeHash(data []byte) Hash { - cshake := k.ShakeHash.Clone() - cshake.Reset() - _, _ = cshake.Write(k.initBlock) - _, _ = cshake.Write(data) - _, _ = cshake.Write(rightEncode(uint64(k.outputSize * 8))) - // read the cshake output - h := make([]byte, k.outputSize) - _, _ = cshake.Read(h) - return h -} - -// SumHash finalizes the mac computations and returns the output. -// It does not reset the state to allow further writing. -func (k *kmac128) SumHash() Hash { - cshake := k.ShakeHash.Clone() - _, _ = cshake.Write(rightEncode(uint64(k.outputSize * 8))) - // read the cshake output - h := make([]byte, k.outputSize) - _, _ = cshake.Read(h) - return h -} - -// Size returns the output length of the KMAC instance -func (k *kmac128) Size() int { - return k.outputSize -} diff --git a/crypto/hash/legacy_keccak.go b/crypto/hash/legacy_keccak.go deleted file mode 100644 index a333dcce00a..00000000000 --- a/crypto/hash/legacy_keccak.go +++ /dev/null @@ -1,19 +0,0 @@ -package hash - -const ( - rateKeccak_256 = 136 - - dsByteKeccak = byte(0x1) -) - -// NewKeccak_256 returns a new instance of legacy Keccak-256 hasher. -func NewKeccak_256() Hasher { - return &spongeState{ - algo: Keccak_256, - rate: rateKeccak_256, - dsByte: dsByteKeccak, - outputLen: HashLenKeccak_256, - bufIndex: bufNilValue, - bufSize: bufNilValue, - } -} diff --git a/crypto/hash/sha2.go b/crypto/hash/sha2.go deleted file mode 100644 index 3362face47a..00000000000 --- a/crypto/hash/sha2.go +++ /dev/null @@ -1,78 +0,0 @@ -package hash - -import ( - "crypto/sha256" - "crypto/sha512" - "hash" -) - -// sha2_256Algo -type sha2_256Algo struct { - hash.Hash -} - -// NewSHA2_256 returns a new instance of SHA2-256 hasher -func NewSHA2_256() Hasher { - return &sha2_256Algo{ - Hash: sha256.New()} -} - -func (s *sha2_256Algo) Algorithm() HashingAlgorithm { - return SHA2_256 -} - -// ComputeHash calculates and returns the SHA2-256 digest of the input. -// The function updates the state (and therefore not thread-safe) -// but does not reset the state to allow further writing. -func (s *sha2_256Algo) ComputeHash(data []byte) Hash { - s.Reset() - // `Write` delegates this call to sha256.digest's `Write` which does not return an error. - _, _ = s.Write(data) - return s.Sum(nil) -} - -// SumHash returns the SHA2-256 output. -// It does not reset the state to allow further writing. -func (s *sha2_256Algo) SumHash() Hash { - return s.Sum(nil) -} - -// sha2_384Algo -type sha2_384Algo struct { - hash.Hash -} - -// NewSHA2_384 returns a new instance of SHA2-384 hasher -func NewSHA2_384() Hasher { - return &sha2_384Algo{ - Hash: sha512.New384()} -} - -func (s *sha2_384Algo) Algorithm() HashingAlgorithm { - return SHA2_384 -} - -// ComputeHash calculates and returns the SHA2-384 digest of the input. 
-// It does not reset the state to allow further writing. -func (s *sha2_384Algo) ComputeHash(data []byte) Hash { - s.Reset() - // `Write` delegates this call to sha512.digest's `Write` which does not return an error. - _, _ = s.Write(data) - return s.Sum(nil) -} - -// SumHash returns the SHA2-384 output. -// It does not reset the state to allow further writing. -func (s *sha2_384Algo) SumHash() Hash { - return s.Sum(nil) -} - -// ComputeSHA2_256 computes the SHA2-256 (commonly known as SHA256) -// digest of data and copies the result to the result buffer. -// -// The function is not part of the Hasher API. It is a pure function -// for simple computation of a hash with minimal heap allocations. -func ComputeSHA2_256(result *[HashLenSHA2_256]byte, data []byte) { - hash := sha256.Sum256(data) - copy(result[:], hash[:]) -} diff --git a/crypto/hash/sha3.go b/crypto/hash/sha3.go deleted file mode 100644 index f5b6cd9fce4..00000000000 --- a/crypto/hash/sha3.go +++ /dev/null @@ -1,50 +0,0 @@ -package hash - -const ( - rateSHA3_256 = 136 - rateSHA3_384 = 104 - - dsByteSHA3 = byte(0x6) -) - -// NewSHA3_256 returns a new instance of SHA3-256 hasher. -func NewSHA3_256() Hasher { - return &spongeState{ - algo: SHA3_256, - rate: rateSHA3_256, - dsByte: dsByteSHA3, - outputLen: HashLenSHA3_256, - bufIndex: bufNilValue, - bufSize: bufNilValue, - } -} - -// NewSHA3_384 returns a new instance of SHA3-384 hasher. -func NewSHA3_384() Hasher { - return &spongeState{ - algo: SHA3_384, - rate: rateSHA3_384, - dsByte: dsByteSHA3, - outputLen: HashLenSHA3_384, - bufIndex: bufNilValue, - bufSize: bufNilValue, - } -} - -// ComputeSHA3_256 computes the SHA3-256 digest of data -// and copies the result to the result buffer. -// -// The function is not part of the Hasher API. It is a pure function -// for simple computation of a hash with minimal heap allocations. -func ComputeSHA3_256(result *[HashLenSHA3_256]byte, data []byte) { - state := &spongeState{ - rate: rateSHA3_256, - dsByte: dsByteSHA3, - outputLen: HashLenSHA3_256, - bufIndex: bufNilValue, - bufSize: bufNilValue, - } - state.write(data) - state.padAndPermute() - copyOut(result[:], state) -} diff --git a/crypto/hash/types.go b/crypto/hash/types.go deleted file mode 100644 index 709f8bdf364..00000000000 --- a/crypto/hash/types.go +++ /dev/null @@ -1,49 +0,0 @@ -package hash - -//revive:disable:var-naming - -// HashingAlgorithm is an identifier for a hashing algorithm. -type HashingAlgorithm int - -const ( - // Supported hashing algorithms - UnknownHashingAlgorithm HashingAlgorithm = iota - // SHA-2 - SHA2_256 - SHA2_384 - // SHA-3 - SHA3_256 - SHA3_384 - // KMAC (Keccak based MAC algorithm) - KMAC128 - // legacy Keccak - Keccak_256 -) - -// String returns the string representation of this hashing algorithm. -func (h HashingAlgorithm) String() string { - return [...]string{ - "UNKNOWN", - "SHA2_256", - "SHA2_384", - "SHA3_256", - "SHA3_384", - "KMAC128", - "Keccak_256"}[h] -} - -const ( - // minimum targeted bits of security - securityBits = 128 - - // Lengths of hash outputs in bytes - HashLenSHA2_256 = 32 - HashLenSHA2_384 = 48 - HashLenSHA3_256 = 32 - HashLenSHA3_384 = 48 - HashLenKeccak_256 = 32 - - // KMAC - // the minimum key length in bytes - KmacMinKeyLen = securityBits / 8 -) diff --git a/crypto/hash/xor_generic.go b/crypto/hash/xor_generic.go deleted file mode 100644 index 38f9d1863db..00000000000 --- a/crypto/hash/xor_generic.go +++ /dev/null @@ -1,64 +0,0 @@ -// The functions below were copied and modified from golang.org/x/crypto/sha3. 
-// -// Copyright (c) 2009 The Go Authors. All rights reserved. - -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: - -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. - -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -//go:build (!amd64 && !386 && !ppc64le) || purego -// +build !amd64,!386,!ppc64le purego - -package hash - -import "encoding/binary" - -// A storageBuf is an aligned array of maxRate bytes. -type storageBuf [maxRate]byte - -func (b *storageBuf) asBytes() *[maxRate]byte { - return (*[maxRate]byte)(b) -} - -// xorIn xors the bytes in buf into the state; it -// makes no non-portable assumptions about memory layout -// or alignment. -func xorIn(d *spongeState, buf []byte) { - n := len(buf) / 8 - - for i := 0; i < n; i++ { - a := binary.LittleEndian.Uint64(buf) - d.a[i] ^= a - buf = buf[8:] - } -} - -// copyOut copies uint64s to a byte buffer. -func copyOut(b []byte, d *spongeState) { - for i := 0; len(b) >= 8; i++ { - binary.LittleEndian.PutUint64(b, d.a[i]) - b = b[8:] - } -} diff --git a/crypto/hash/xor_unaligned.go b/crypto/hash/xor_unaligned.go deleted file mode 100644 index 3b9446c3037..00000000000 --- a/crypto/hash/xor_unaligned.go +++ /dev/null @@ -1,82 +0,0 @@ -// The functions below were copied and modified from golang.org/x/crypto/sha3. -// -// Copyright (c) 2009 The Go Authors. All rights reserved. - -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: - -// * Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// * Redistributions in binary form must reproduce the above -// copyright notice, this list of conditions and the following disclaimer -// in the documentation and/or other materials provided with the -// distribution. -// * Neither the name of Google Inc. nor the names of its -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission.
- -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -//go:build (amd64 || 386 || ppc64le) && !purego -// +build amd64 386 ppc64le -// +build !purego - -package hash - -import "unsafe" - -// A storageBuf is an aligned array of maxRate bytes. -type storageBuf [maxRate / 8]uint64 - -//go:nocheckptr ignore "pointer arithmetic result points to invalid allocation" -func (b *storageBuf) asBytes() *[maxRate]byte { - // re-using a trick from https://github.com/golang/go/blob/master/src/runtime/stubs.go#L178: - // to hide the input pointer from escape analysis and avoid - // an escape to the heap. The 0 xor tricks the escape analysis tool - // into thinking "ptr" and "b" are not related. - ptr := uintptr(unsafe.Pointer(b)) ^ 0 // nolint:staticcheck - return (*[maxRate]byte)(unsafe.Pointer(ptr)) -} - -// xorIn uses unaligned reads and writes to update d.a to contain d.a -// XOR buf. -func xorIn(d *spongeState, buf []byte) { - n := len(buf) - bw := (*[maxRate / 8]uint64)(unsafe.Pointer(&buf[0]))[: n/8 : n/8] - - d.a[0] ^= bw[0] - d.a[1] ^= bw[1] - d.a[2] ^= bw[2] - d.a[3] ^= bw[3] - d.a[4] ^= bw[4] - d.a[5] ^= bw[5] - d.a[6] ^= bw[6] - d.a[7] ^= bw[7] - d.a[8] ^= bw[8] - d.a[9] ^= bw[9] - d.a[10] ^= bw[10] - d.a[11] ^= bw[11] - d.a[12] ^= bw[12] - if n >= 136 { - d.a[13] ^= bw[13] - d.a[14] ^= bw[14] - d.a[15] ^= bw[15] - d.a[16] ^= bw[16] - } -} - -func copyOut(buf []byte, d *spongeState) { - ab := (*[maxRate]uint8)(unsafe.Pointer(&d.a[0])) - copy(buf, ab[:]) -} diff --git a/crypto/internal/blst/blst.go b/crypto/internal/blst/blst.go deleted file mode 100644 index c890f55e367..00000000000 --- a/crypto/internal/blst/blst.go +++ /dev/null @@ -1,3434 +0,0 @@ -//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -// DO NOT EDIT THIS FILE!! -// The file is generated from *.tgo by generate.py -//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ - -package blst - -// #cgo CFLAGS: -I${SRCDIR}/..
-I${SRCDIR}/../../build -I${SRCDIR}/../../src -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset -// #cgo amd64 CFLAGS: -D__ADX__ -mno-avx -// #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ -// #include "blst.h" -// -// #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) -// # include <signal.h> -// # include <unistd.h> -// static void handler(int signum) -// { ssize_t n = write(2, "Caught SIGILL in blst_cgo_init, " -// "consult <blst>/bindings/go/README.md.\n", 70); -// _exit(128+SIGILL); -// (void)n; -// } -// __attribute__((constructor)) static void blst_cgo_init() -// { blst_fp temp = { 0 }; -// struct sigaction act = { handler }, oact; -// sigaction(SIGILL, &act, &oact); -// blst_fp_sqr(&temp, &temp); -// sigaction(SIGILL, &oact, NULL); -// } -// #endif -// -// static size_t go_pairing_sizeof(size_t DST_len) -// { return (blst_pairing_sizeof() + DST_len + sizeof(blst_pairing) - 1) / -// sizeof(blst_pairing); -// } -// static void go_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, -// const byte *DST, size_t DST_len) -// { if (DST != NULL) { -// byte *dst = (byte*)new_ctx + blst_pairing_sizeof(); -// for(size_t i = 0; i < DST_len; i++) dst[i] = DST[i]; -// DST = dst; -// } -// blst_pairing_init(new_ctx, hash_or_encode, DST, DST_len); -// } -// static void go_pairing_as_fp12(blst_fp12 *pt, blst_pairing *ctx) -// { *pt = *blst_pairing_as_fp12(ctx); } -// -// static void go_p1slice_to_affine(blst_p1_affine dst[], -// const blst_p1 points[], size_t npoints) -// { const blst_p1 *ppoints[2] = { points, NULL }; -// blst_p1s_to_affine(dst, ppoints, npoints); -// } -// static void go_p1slice_add(blst_p1 *dst, const blst_p1_affine points[], -// size_t npoints) -// { const blst_p1_affine *ppoints[2] = { points, NULL }; -// blst_p1s_add(dst, ppoints, npoints); -// } -// static void go_p2slice_to_affine(blst_p2_affine dst[], -// const blst_p2 points[], size_t npoints) -// { const blst_p2 *ppoints[2] = { points, NULL }; -// blst_p2s_to_affine(dst, ppoints, npoints); -// } -// static void go_p2slice_add(blst_p2 *dst, const blst_p2_affine points[], -// size_t npoints) -// { const blst_p2_affine *ppoints[2] = { points, NULL }; -// blst_p2s_add(dst, ppoints, npoints); -// } -// -// static void go_p1_mult_n_acc(blst_p1 *acc, const blst_fp *x, bool affine, -// const byte *scalar, size_t nbits) -// { blst_p1 m[1]; -// const void *p = x; -// if (p == NULL) -// p = blst_p1_generator(); -// else if (affine) -// blst_p1_from_affine(m, p), p = m; -// blst_p1_mult(m, p, scalar, nbits); -// blst_p1_add_or_double(acc, acc, m); -// } -// static void go_p2_mult_n_acc(blst_p2 *acc, const blst_fp2 *x, bool affine, -// const byte *scalar, size_t nbits) -// { blst_p2 m[1]; -// const void *p = x; -// if (p == NULL) -// p = blst_p2_generator(); -// else if (affine) -// blst_p2_from_affine(m, p), p = m; -// blst_p2_mult(m, p, scalar, nbits); -// blst_p2_add_or_double(acc, acc, m); -// } -// -// static void go_p1_sub_assign(blst_p1 *a, const blst_fp *x, bool affine) -// { blst_p1 minus_b; -// if (affine) -// blst_p1_from_affine(&minus_b, (const blst_p1_affine*)x); -// else -// minus_b = *(const blst_p1*)x; -// blst_p1_cneg(&minus_b, 1); -// blst_p1_add_or_double(a, a, &minus_b); -// } -// -// static void go_p2_sub_assign(blst_p2 *a, const blst_fp2 *x, bool affine) -// { blst_p2 minus_b; -// if (affine) -// blst_p2_from_affine(&minus_b, (const blst_p2_affine*)x); -// else -// minus_b = *(const blst_p2*)x; -// blst_p2_cneg(&minus_b, 1); -// blst_p2_add_or_double(a, a, &minus_b); -// } -// -//
static bool go_scalar_from_bendian(blst_scalar *ret, const byte *in) -// { blst_scalar_from_bendian(ret, in); -// return blst_sk_check(ret); -// } -// static bool go_hash_to_scalar(blst_scalar *ret, -// const byte *msg, size_t msg_len, -// const byte *DST, size_t DST_len) -// { byte elem[48]; -// blst_expand_message_xmd(elem, sizeof(elem), msg, msg_len, DST, DST_len); -// return blst_scalar_from_be_bytes(ret, elem, sizeof(elem)); -// } -// static void go_miller_loop_n(blst_fp12 *dst, const blst_p2_affine Q[], -// const blst_p1_affine P[], -// size_t npoints, bool acc) -// { const blst_p2_affine *Qs[2] = { Q, NULL }; -// const blst_p1_affine *Ps[2] = { P, NULL }; -// if (acc) { -// blst_fp12 tmp; -// blst_miller_loop_n(&tmp, Qs, Ps, npoints); -// blst_fp12_mul(dst, dst, &tmp); -// } else { -// blst_miller_loop_n(dst, Qs, Ps, npoints); -// } -// } -// static void go_fp12slice_mul(blst_fp12 *dst, const blst_fp12 in[], size_t n) -// { size_t i; -// blst_fp12_mul(dst, &in[0], &in[1]); -// for (i = 2; i < n; i++) -// blst_fp12_mul(dst, dst, &in[i]); -// } -import "C" -import ( - "fmt" - "math/bits" - "runtime" - "sync" - "sync/atomic" -) - -const BLST_SCALAR_BYTES = 256 / 8 -const BLST_FP_BYTES = 384 / 8 -const BLST_P1_COMPRESS_BYTES = BLST_FP_BYTES -const BLST_P1_SERIALIZE_BYTES = BLST_FP_BYTES * 2 -const BLST_P2_COMPRESS_BYTES = BLST_FP_BYTES * 2 -const BLST_P2_SERIALIZE_BYTES = BLST_FP_BYTES * 4 - -type Scalar = C.blst_scalar -type Fp = C.blst_fp -type Fp2 = C.blst_fp2 -type Fp6 = C.blst_fp6 -type Fp12 = C.blst_fp12 -type P1 = C.blst_p1 -type P2 = C.blst_p2 -type P1Affine = C.blst_p1_affine -type P2Affine = C.blst_p2_affine -type Message = []byte -type Pairing = []C.blst_pairing -type SecretKey = Scalar -type P1s []P1 -type P2s []P2 -type P1Affines []P1Affine -type P2Affines []P2Affine - -// -// Configuration -// - -var maxProcs = initMaxProcs() - -func initMaxProcs() int { - maxProcs := runtime.GOMAXPROCS(0) - var version float32 - _, err := fmt.Sscanf(runtime.Version(), "go%f", &version) - if err != nil || version < 1.14 { - // be cooperative and leave one processor for the application - maxProcs -= 1 - } - if maxProcs <= 0 { - maxProcs = 1 - } - return maxProcs -} - -func SetMaxProcs(max int) { - if max <= 0 { - max = 1 - } - maxProcs = max -} - -// Secret key -func (sk *SecretKey) Zeroize() { - var zero SecretKey - *sk = zero -} - -func KeyGen(ikm []byte, optional ...[]byte) *SecretKey { - var sk SecretKey - var info []byte - var infoP *C.byte - if len(optional) > 0 { - info = optional[0] - if len(info) > 0 { - infoP = (*C.byte)(&info[0]) - } - } - if len(ikm) < 32 { - return nil - } - C.blst_keygen(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), - infoP, C.size_t(len(info))) - // Postponing secret key zeroing till garbage collection can be too - // late to be effective, but every little bit helps... - runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) - return &sk -} - -func KeyGenV3(ikm []byte, optional ...[]byte) *SecretKey { - if len(ikm) < 32 { - return nil - } - var sk SecretKey - var info []byte - var infoP *C.byte - if len(optional) > 0 { - info = optional[0] - if len(info) > 0 { - infoP = (*C.byte)(&info[0]) - } - } - C.blst_keygen_v3(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), - infoP, C.size_t(len(info))) - // Postponing secret key zeroing till garbage collection can be too - // late to be effective, but every little bit helps... 
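- // (A finalizer is best-effort: it runs only if and when the returned
- // key becomes unreachable to the GC, so callers wanting prompt
- // erasure should invoke Zeroize explicitly.)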
- runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) - return &sk -} - -func KeyGenV45(ikm []byte, salt []byte, optional ...[]byte) *SecretKey { - if len(ikm) < 32 { - return nil - } - var sk SecretKey - var info []byte - var infoP *C.byte - if len(optional) > 0 { - info = optional[0] - if len(info) > 0 { - infoP = (*C.byte)(&info[0]) - } - } - C.blst_keygen_v4_5(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), - (*C.byte)(&salt[0]), C.size_t(len(salt)), - infoP, C.size_t(len(info))) - // Postponing secret key zeroing till garbage collection can be too - // late to be effective, but every little bit helps... - runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) - return &sk -} - -func KeyGenV5(ikm []byte, salt []byte, optional ...[]byte) *SecretKey { - if len(ikm) < 32 { - return nil - } - var sk SecretKey - var info []byte - var infoP *C.byte - if len(optional) > 0 { - info = optional[0] - if len(info) > 0 { - infoP = (*C.byte)(&info[0]) - } - } - C.blst_keygen_v5(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), - (*C.byte)(&salt[0]), C.size_t(len(salt)), - infoP, C.size_t(len(info))) - // Postponing secret key zeroing till garbage collection can be too - // late to be effective, but every little bit helps... - runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) - return &sk -} - -func DeriveMasterEip2333(ikm []byte) *SecretKey { - if len(ikm) < 32 { - return nil - } - var sk SecretKey - C.blst_derive_master_eip2333(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm))) - // Postponing secret key zeroing till garbage collection can be too - // late to be effective, but every little bit helps... - runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) - return &sk -} - -func (master *SecretKey) DeriveChildEip2333(child_index uint32) *SecretKey { - var sk SecretKey - C.blst_derive_child_eip2333(&sk, master, C.uint(child_index)) - // Postponing secret key zeroing till garbage collection can be too - // late to be effective, but every little bit helps... 
- runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) - return &sk -} - -// Pairing -func PairingCtx(hash_or_encode bool, DST []byte) Pairing { - DST_len := C.size_t(len(DST)) - ctx := make([]C.blst_pairing, int(C.go_pairing_sizeof(DST_len))) - var uDST *C.byte - if DST_len > 0 { - uDST = (*C.byte)(&DST[0]) - } - C.go_pairing_init(&ctx[0], C.bool(hash_or_encode), uDST, DST_len) - return ctx -} - -func PairingCommit(ctx Pairing) { - C.blst_pairing_commit(&ctx[0]) -} - -func PairingMerge(ctx Pairing, ctx1 Pairing) int { - r := C.blst_pairing_merge(&ctx[0], &ctx1[0]) - return int(r) -} - -func PairingFinalVerify(ctx Pairing, optional ...*Fp12) bool { - var gtsig *Fp12 = nil - if len(optional) > 0 { - gtsig = optional[0] - } - return bool(C.blst_pairing_finalverify(&ctx[0], gtsig)) -} - -func PairingRawAggregate(ctx Pairing, q *P2Affine, p *P1Affine) { - C.blst_pairing_raw_aggregate(&ctx[0], q, p) -} - -func PairingAsFp12(ctx Pairing) *Fp12 { - var pt Fp12 - C.go_pairing_as_fp12(&pt, &ctx[0]) - return &pt -} - -func Fp12One() Fp12 { - return *C.blst_fp12_one() -} - -func Fp12FinalVerify(pt1 *Fp12, pt2 *Fp12) bool { - return bool(C.blst_fp12_finalverify(pt1, pt2)) -} - -func Fp12MillerLoop(q *P2Affine, p *P1Affine) *Fp12 { - var pt Fp12 - C.blst_miller_loop(&pt, q, p) - return &pt -} - -func Fp12MillerLoopN(qs []P2Affine, ps []P1Affine) *Fp12 { - if len(qs) != len(ps) || len(qs) == 0 { - panic("inputs' lengths mismatch") - } - - nElems := uint32(len(qs)) - nThreads := uint32(maxProcs) - - if nThreads == 1 || nElems == 1 { - var pt Fp12 - C.go_miller_loop_n(&pt, &qs[0], &ps[0], C.size_t(nElems), false) - return &pt - } - - stride := (nElems + nThreads - 1) / nThreads - if stride > 16 { - stride = 16 - } - - strides := (nElems + stride - 1) / stride - if nThreads > strides { - nThreads = strides - } - - msgsCh := make(chan Fp12, nThreads) - curElem := uint32(0) - - for tid := uint32(0); tid < nThreads; tid++ { - go func() { - acc := Fp12One() - first := true - for { - work := atomic.AddUint32(&curElem, stride) - stride - if work >= nElems { - break - } - n := nElems - work - if n > stride { - n = stride - } - C.go_miller_loop_n(&acc, &qs[work], &ps[work], C.size_t(n), - C.bool(!first)) - first = false - } - msgsCh <- acc - }() - } - - var ret = make([]Fp12, nThreads) - for i := range ret { - ret[i] = <-msgsCh - } - - var pt Fp12 - C.go_fp12slice_mul(&pt, &ret[0], C.size_t(nThreads)) - return &pt -} - -func (pt *Fp12) MulAssign(p *Fp12) { - C.blst_fp12_mul(pt, pt, p) -} - -func (pt *Fp12) FinalExp() { - C.blst_final_exp(pt, pt) -} - -func (pt *Fp12) InGroup() bool { - return bool(C.blst_fp12_in_group(pt)) -} - -func (pt *Fp12) ToBendian() []byte { - var out [BLST_FP_BYTES * 12]byte - C.blst_bendian_from_fp12((*C.byte)(&out[0]), pt) - return out[:] -} - -func (pt1 *Fp12) Equals(pt2 *Fp12) bool { - return *pt1 == *pt2 -} - -// -// MIN-PK -// - -// -// PublicKey -// - -func (pk *P1Affine) From(s *Scalar) *P1Affine { - C.blst_sk_to_pk2_in_g1(nil, pk, s) - return pk -} - -func (pk *P1Affine) KeyValidate() bool { - return !bool(C.blst_p1_affine_is_inf(pk)) && - bool(C.blst_p1_affine_in_g1(pk)) -} - -// sigInfcheck, check for infinity, is a way to avoid going -// into resource-consuming verification. Passing 'false' is -// always cryptographically safe, but application might want -// to guard against obviously bogus individual[!] signatures. 
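-// Note that the point at infinity is itself a valid subgroup element,
-// so the group-membership check alone never rejects it; only the
-// sigInfcheck path does.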
-func (sig *P2Affine) SigValidate(sigInfcheck bool) bool {
- if sigInfcheck && bool(C.blst_p2_affine_is_inf(sig)) {
- return false
- }
- return bool(C.blst_p2_affine_in_g2(sig))
-}
-
-//
-// Sign
-//
-
-func (sig *P2Affine) Sign(sk *SecretKey, msg []byte, dst []byte,
- optional ...interface{}) *P2Affine {
- augSingle, aug, useHash, ok := parseOpts(optional...)
- if !ok || len(aug) != 0 {
- return nil
- }
-
- var q *P2
- if useHash {
- q = HashToG2(msg, dst, augSingle)
- } else {
- q = EncodeToG2(msg, dst, augSingle)
- }
- C.blst_sign_pk2_in_g1(nil, sig, q, sk)
- return sig
-}
-
-//
-// Signature
-//
-
-// Functions to return a signature and public key+augmentation tuple.
-// This enables point decompression (if needed) to happen in parallel.
-type sigGetterP2 func() *P2Affine
-type pkGetterP1 func(i uint32, temp *P1Affine) (*P1Affine, []byte)
-
-// Single verify with decompressed pk
-func (sig *P2Affine) Verify(sigGroupcheck bool, pk *P1Affine, pkValidate bool,
- msg Message, dst []byte,
- optional ...interface{}) bool { // useHash bool, aug []byte
-
- aug, _, useHash, ok := parseOpts(optional...)
- if !ok {
- return false
- }
- return sig.AggregateVerify(sigGroupcheck, []*P1Affine{pk}, pkValidate,
- []Message{msg}, dst, useHash, [][]byte{aug})
-}
-
-// Single verify with compressed pk
-// Uses a dummy signature to get the correct type
-func (dummy *P2Affine) VerifyCompressed(sig []byte, sigGroupcheck bool,
- pk []byte, pkValidate bool, msg Message, dst []byte,
- optional ...bool) bool { // useHash bool, usePksAsAugs bool
-
- return dummy.AggregateVerifyCompressed(sig, sigGroupcheck,
- [][]byte{pk}, pkValidate,
- []Message{msg}, dst, optional...)
-}
-
-// Aggregate verify with uncompressed signature and public keys
-// Note that checking message uniqueness, if required, is left to the user.
-// Not all signature schemes require it and this keeps the binding minimal
-// and fast. Refer to the Uniq function for one method of performing
-// this check.
-func (sig *P2Affine) AggregateVerify(sigGroupcheck bool,
- pks []*P1Affine, pksVerify bool, msgs []Message, dst []byte,
- optional ...interface{}) bool { // useHash bool, augs [][]byte
-
- // sanity checks and argument parsing
- n := len(pks)
- if n == 0 || len(msgs) != n {
- return false
- }
- _, augs, useHash, ok := parseOpts(optional...) 
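- // (parseOpts, defined elsewhere in this file, unpacks the variadic
- // arguments; as used here, a bool selects hash-to-curve versus
- // encode-to-curve and a [][]byte supplies per-message augmentations.)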
- useAugs := len(augs) != 0 - if !ok || (useAugs && len(augs) != n) { - return false - } - - sigFn := func() *P2Affine { - return sig - } - - pkFn := func(i uint32, _ *P1Affine) (*P1Affine, []byte) { - if useAugs { - return pks[i], augs[i] - } else { - return pks[i], nil - } - } - - return coreAggregateVerifyPkInG1(sigFn, sigGroupcheck, pkFn, pksVerify, - msgs, dst, useHash) -} - -// Aggregate verify with compressed signature and public keys -// Uses a dummy signature to get the correct type -func (dummy *P2Affine) AggregateVerifyCompressed(sig []byte, sigGroupcheck bool, - pks [][]byte, pksVerify bool, msgs []Message, dst []byte, - optional ...bool) bool { // useHash bool, usePksAsAugs bool - - // sanity checks and argument parsing - if len(pks) != len(msgs) { - return false - } - useHash := true - if len(optional) > 0 { - useHash = optional[0] - } - usePksAsAugs := false - if len(optional) > 1 { - usePksAsAugs = optional[1] - } - - sigFn := func() *P2Affine { - sigP := new(P2Affine) - if sigP.Uncompress(sig) == nil { - return nil - } - return sigP - } - pkFn := func(i uint32, pk *P1Affine) (*P1Affine, []byte) { - bytes := pks[i] - if len(bytes) == BLST_P1_SERIALIZE_BYTES && (bytes[0]&0x80) == 0 { - // Not compressed - if pk.Deserialize(bytes) == nil { - return nil, nil - } - } else if len(bytes) == BLST_P1_COMPRESS_BYTES && (bytes[0]&0x80) != 0 { - if pk.Uncompress(bytes) == nil { - return nil, nil - } - } else { - return nil, nil - } - if usePksAsAugs { - return pk, bytes - } - return pk, nil - } - return coreAggregateVerifyPkInG1(sigFn, sigGroupcheck, pkFn, pksVerify, - msgs, dst, useHash) -} - -func coreAggregateVerifyPkInG1(sigFn sigGetterP2, sigGroupcheck bool, - pkFn pkGetterP1, pkValidate bool, msgs []Message, dst []byte, - optional ...bool) bool { // useHash - - n := len(msgs) - if n == 0 { - return false - } - - useHash := true - if len(optional) > 0 { - useHash = optional[0] - } - - numCores := runtime.GOMAXPROCS(0) - numThreads := maxProcs - if numThreads > numCores { - numThreads = numCores - } - if numThreads > n { - numThreads = n - } - // Each thread will determine next message to process by atomically - // incrementing curItem, process corresponding pk,msg[,aug] tuple and - // repeat until n is exceeded. The resulting accumulations will be - // fed into the msgsCh channel. - msgsCh := make(chan Pairing, numThreads) - valid := int32(1) - curItem := uint32(0) - mutex := sync.Mutex{} - - mutex.Lock() - for tid := 0; tid < numThreads; tid++ { - go func() { - pairing := PairingCtx(useHash, dst) - var temp P1Affine - for atomic.LoadInt32(&valid) > 0 { - // Get a work item - work := atomic.AddUint32(&curItem, 1) - 1 - if work >= uint32(n) { - break - } else if work == 0 && maxProcs == numCores-1 && - numThreads == maxProcs { - // Avoid consuming all cores by waiting until the - // main thread has completed its miller loop before - // proceeding. 
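- // The spawning goroutine holds the mutex until the signature has
- // been decompressed, group-checked and folded into gtsig below,
- // so worker 0 parks here until that serial prologue completes.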
- mutex.Lock()
- mutex.Unlock()
- }
-
- // Pull Public Key and augmentation blob
- curPk, aug := pkFn(work, &temp)
- if curPk == nil {
- atomic.StoreInt32(&valid, 0)
- break
- }
-
- // Pairing and accumulate
- ret := PairingAggregatePkInG1(pairing, curPk, pkValidate,
- nil, false, msgs[work], aug)
- if ret != C.BLST_SUCCESS {
- atomic.StoreInt32(&valid, 0)
- break
- }
-
- // application might have some async work to do
- runtime.Gosched()
- }
- if atomic.LoadInt32(&valid) > 0 {
- PairingCommit(pairing)
- msgsCh <- pairing
- } else {
- msgsCh <- nil
- }
- }()
- }
-
- // Uncompress and check signature
- var gtsig Fp12
- sig := sigFn()
- if sig == nil {
- atomic.StoreInt32(&valid, 0)
- }
- if atomic.LoadInt32(&valid) > 0 && sigGroupcheck &&
- !sig.SigValidate(false) {
- atomic.StoreInt32(&valid, 0)
- }
- if atomic.LoadInt32(&valid) > 0 {
- C.blst_aggregated_in_g2(&gtsig, sig)
- }
- mutex.Unlock()
-
- // Accumulate the thread results
- var pairings Pairing
- for i := 0; i < numThreads; i++ {
- msg := <-msgsCh
- if msg != nil {
- if pairings == nil {
- pairings = msg
- } else {
- ret := PairingMerge(pairings, msg)
- if ret != C.BLST_SUCCESS {
- atomic.StoreInt32(&valid, 0)
- }
- }
- }
- }
- if atomic.LoadInt32(&valid) == 0 || pairings == nil {
- return false
- }
-
- return PairingFinalVerify(pairings, &gtsig)
-}
-
-func CoreVerifyPkInG1(pk *P1Affine, sig *P2Affine, hash_or_encode bool,
- msg Message, dst []byte, optional ...[]byte) int {
-
- var aug []byte
- var uaug *C.byte
- if len(optional) > 0 {
- aug = optional[0]
- if len(aug) > 0 {
- uaug = (*C.byte)(&aug[0])
- }
- }
-
- if runtime.NumGoroutine() < maxProcs {
- sigFn := func() *P2Affine {
- return sig
- }
- pkFn := func(_ uint32, _ *P1Affine) (*P1Affine, []byte) {
- return pk, aug
- }
- if !coreAggregateVerifyPkInG1(sigFn, true, pkFn, true, []Message{msg},
- dst, hash_or_encode) {
- return C.BLST_VERIFY_FAIL
- }
- return C.BLST_SUCCESS
- }
-
- var udst *C.byte
- if len(dst) > 0 {
- udst = (*C.byte)(&dst[0])
- }
- var umsg *C.byte
- if len(msg) > 0 {
- umsg = (*C.byte)(&msg[0])
- }
-
- return int(C.blst_core_verify_pk_in_g1(pk, sig, C.bool(hash_or_encode),
- umsg, C.size_t(len(msg)),
- udst, C.size_t(len(dst)),
- uaug, C.size_t(len(aug))))
-}
-
-// pks are assumed to be verified for proof of possession,
-// which implies that they are already group-checked
-func (sig *P2Affine) FastAggregateVerify(sigGroupcheck bool,
- pks []*P1Affine, msg Message, dst []byte,
- optional ...interface{}) bool { // pass-through to Verify
- n := len(pks)
-
- // TODO: return value for length zero?
- if n == 0 {
- return false
- }
-
- aggregator := new(P1Aggregate)
- if !aggregator.Aggregate(pks, false) {
- return false
- }
- pkAff := aggregator.ToAffine()
-
- // Verify
- return sig.Verify(sigGroupcheck, pkAff, false, msg, dst, optional...)
-}
-
-func (dummy *P2Affine) MultipleAggregateVerify(sigs []*P2Affine,
- sigsGroupcheck bool, pks []*P1Affine, pksVerify bool,
- msgs []Message, dst []byte, randFn func(*Scalar), randBits int,
- optional ...interface{}) bool { // useHash
-
- // Sanity checks and argument parsing
- n := len(pks)
- if n == 0 || len(msgs) != n || len(sigs) != n {
- return false
- }
- _, augs, useHash, ok := parseOpts(optional...) 
- useAugs := len(augs) != 0 - if !ok || (useAugs && len(augs) != n) { - return false - } - - paramsFn := - func(work uint32, sig *P2Affine, pk *P1Affine, rand *Scalar) ( - *P2Affine, *P1Affine, *Scalar, []byte) { - randFn(rand) - var aug []byte - if useAugs { - aug = augs[work] - } - return sigs[work], pks[work], rand, aug - } - - return multipleAggregateVerifyPkInG1(paramsFn, sigsGroupcheck, pksVerify, - msgs, dst, randBits, useHash) -} - -type mulAggGetterPkInG1 func(work uint32, sig *P2Affine, pk *P1Affine, - rand *Scalar) (*P2Affine, *P1Affine, *Scalar, []byte) - -func multipleAggregateVerifyPkInG1(paramsFn mulAggGetterPkInG1, - sigsGroupcheck bool, pksVerify bool, msgs []Message, - dst []byte, randBits int, - optional ...bool) bool { // useHash - n := len(msgs) - if n == 0 { - return false - } - - useHash := true - if len(optional) > 0 { - useHash = optional[0] - } - - numCores := runtime.GOMAXPROCS(0) - numThreads := maxProcs - if numThreads > numCores { - numThreads = numCores - } - if numThreads > n { - numThreads = n - } - // Each thread will determine next message to process by atomically - // incrementing curItem, process corresponding pk,msg[,aug] tuple and - // repeat until n is exceeded. The resulting accumulations will be - // fed into the msgsCh channel. - msgsCh := make(chan Pairing, numThreads) - valid := int32(1) - curItem := uint32(0) - - for tid := 0; tid < numThreads; tid++ { - go func() { - pairing := PairingCtx(useHash, dst) - var tempRand Scalar - var tempPk P1Affine - var tempSig P2Affine - for atomic.LoadInt32(&valid) > 0 { - // Get a work item - work := atomic.AddUint32(&curItem, 1) - 1 - if work >= uint32(n) { - break - } - - curSig, curPk, curRand, aug := paramsFn(work, &tempSig, - &tempPk, &tempRand) - - if PairingMulNAggregatePkInG1(pairing, curPk, pksVerify, - curSig, sigsGroupcheck, curRand, - randBits, msgs[work], aug) != - C.BLST_SUCCESS { - atomic.StoreInt32(&valid, 0) - break - } - - // application might have some async work to do - runtime.Gosched() - } - if atomic.LoadInt32(&valid) > 0 { - PairingCommit(pairing) - msgsCh <- pairing - } else { - msgsCh <- nil - } - }() - } - - // Accumulate the thread results - var pairings Pairing - for i := 0; i < numThreads; i++ { - msg := <-msgsCh - if msg != nil { - if pairings == nil { - pairings = msg - } else { - ret := PairingMerge(pairings, msg) - if ret != C.BLST_SUCCESS { - atomic.StoreInt32(&valid, 0) - } - } - } - } - if atomic.LoadInt32(&valid) == 0 || pairings == nil { - return false - } - - return PairingFinalVerify(pairings, nil) -} - -// -// Aggregate P2 -// - -type aggGetterP2 func(i uint32, temp *P2Affine) *P2Affine -type P2Aggregate struct { - v *P2 -} - -// Aggregate uncompressed elements -func (agg *P2Aggregate) Aggregate(elmts []*P2Affine, - groupcheck bool) bool { - if len(elmts) == 0 { - return true - } - getter := func(i uint32, _ *P2Affine) *P2Affine { return elmts[i] } - return agg.aggregate(getter, groupcheck, len(elmts)) -} - -// Aggregate compressed elements -func (agg *P2Aggregate) AggregateCompressed(elmts [][]byte, - groupcheck bool) bool { - if len(elmts) == 0 { - return true - } - getter := func(i uint32, p *P2Affine) *P2Affine { - bytes := elmts[i] - if p.Uncompress(bytes) == nil { - return nil - } - return p - } - return agg.aggregate(getter, groupcheck, len(elmts)) -} - -func (agg *P2Aggregate) AddAggregate(other *P2Aggregate) { - if other.v == nil { - // do nothing - } else if agg.v == nil { - agg.v = other.v - } else { - C.blst_p2_add_or_double(agg.v, agg.v, other.v) - } -} - 
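-// A minimal usage sketch (hypothetical caller code; sigBytes is assumed
-// to hold compressed G2 signatures):
-//
-//	var agg P2Aggregate
-//	if !agg.AggregateCompressed(sigBytes, true) {
-//		// at least one input failed to decompress or group-check
-//	}
-//	aggregated := agg.ToAffine().Compress()
-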
-func (agg *P2Aggregate) Add(elmt *P2Affine, groupcheck bool) bool { - if groupcheck && !bool(C.blst_p2_affine_in_g2(elmt)) { - return false - } - if agg.v == nil { - agg.v = new(P2) - C.blst_p2_from_affine(agg.v, elmt) - } else { - C.blst_p2_add_or_double_affine(agg.v, agg.v, elmt) - } - return true -} - -func (agg *P2Aggregate) ToAffine() *P2Affine { - if agg.v == nil { - return new(P2Affine) - } - return agg.v.ToAffine() -} - -func (agg *P2Aggregate) aggregate(getter aggGetterP2, groupcheck bool, - n int) bool { - - if n == 0 { - return true - } - // operations are considered short enough for not to care about - // keeping one core free... - numThreads := runtime.GOMAXPROCS(0) - if numThreads > n { - numThreads = n - } - - valid := int32(1) - type result struct { - agg *P2 - empty bool - } - msgs := make(chan result, numThreads) - curItem := uint32(0) - for tid := 0; tid < numThreads; tid++ { - go func() { - first := true - var agg P2 - var temp P2Affine - for atomic.LoadInt32(&valid) > 0 { - // Get a work item - work := atomic.AddUint32(&curItem, 1) - 1 - if work >= uint32(n) { - break - } - - // Signature validate - curElmt := getter(work, &temp) - if curElmt == nil { - atomic.StoreInt32(&valid, 0) - break - } - if groupcheck && !bool(C.blst_p2_affine_in_g2(curElmt)) { - atomic.StoreInt32(&valid, 0) - break - } - if first { - C.blst_p2_from_affine(&agg, curElmt) - first = false - } else { - C.blst_p2_add_or_double_affine(&agg, &agg, curElmt) - } - // application might have some async work to do - runtime.Gosched() - } - if first { - msgs <- result{nil, true} - } else if atomic.LoadInt32(&valid) > 0 { - msgs <- result{&agg, false} - } else { - msgs <- result{nil, false} - } - }() - } - - // Accumulate the thread results - first := agg.v == nil - validLocal := true - for i := 0; i < numThreads; i++ { - msg := <-msgs - if !validLocal || msg.empty { - // do nothing - } else if msg.agg == nil { - validLocal = false - // This should be unnecessary but seems safer - atomic.StoreInt32(&valid, 0) - } else { - if first { - agg.v = msg.agg - first = false - } else { - C.blst_p2_add_or_double(agg.v, agg.v, msg.agg) - } - } - } - if atomic.LoadInt32(&valid) == 0 { - agg.v = nil - return false - } - return true -} - -// -// MIN-SIG -// - -// -// PublicKey -// - -func (pk *P2Affine) From(s *Scalar) *P2Affine { - C.blst_sk_to_pk2_in_g2(nil, pk, s) - return pk -} - -func (pk *P2Affine) KeyValidate() bool { - return !bool(C.blst_p2_affine_is_inf(pk)) && - bool(C.blst_p2_affine_in_g2(pk)) -} - -// sigInfcheck, check for infinity, is a way to avoid going -// into resource-consuming verification. Passing 'false' is -// always cryptographically safe, but application might want -// to guard against obviously bogus individual[!] signatures. -func (sig *P1Affine) SigValidate(sigInfcheck bool) bool { - if sigInfcheck && bool(C.blst_p1_affine_is_inf(sig)) { - return false - } - return bool(C.blst_p1_affine_in_g1(sig)) -} - -// -// Sign -// - -func (sig *P1Affine) Sign(sk *SecretKey, msg []byte, dst []byte, - optional ...interface{}) *P1Affine { - augSingle, aug, useHash, ok := parseOpts(optional...) - if !ok || len(aug) != 0 { - return nil - } - - var q *P1 - if useHash { - q = HashToG1(msg, dst, augSingle) - } else { - q = EncodeToG1(msg, dst, augSingle) - } - C.blst_sign_pk2_in_g2(nil, sig, q, sk) - return sig -} - -// -// Signature -// - -// Functions to return a signature and public key+augmentation tuple. -// This enables point decompression (if needed) to happen in parallel. 
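-// With this indirection, point decompression is deferred until the
-// verification routine runs, overlapping it with the pairing work.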
-type sigGetterP1 func() *P1Affine
-type pkGetterP2 func(i uint32, temp *P2Affine) (*P2Affine, []byte)
-
-// Single verify with decompressed pk
-func (sig *P1Affine) Verify(sigGroupcheck bool, pk *P2Affine, pkValidate bool,
- msg Message, dst []byte,
- optional ...interface{}) bool { // useHash bool, aug []byte
-
- aug, _, useHash, ok := parseOpts(optional...)
- if !ok {
- return false
- }
- return sig.AggregateVerify(sigGroupcheck, []*P2Affine{pk}, pkValidate,
- []Message{msg}, dst, useHash, [][]byte{aug})
-}
-
-// Single verify with compressed pk
-// Uses a dummy signature to get the correct type
-func (dummy *P1Affine) VerifyCompressed(sig []byte, sigGroupcheck bool,
- pk []byte, pkValidate bool, msg Message, dst []byte,
- optional ...bool) bool { // useHash bool, usePksAsAugs bool
-
- return dummy.AggregateVerifyCompressed(sig, sigGroupcheck,
- [][]byte{pk}, pkValidate,
- []Message{msg}, dst, optional...)
-}
-
-// Aggregate verify with uncompressed signature and public keys
-// Note that checking message uniqueness, if required, is left to the user.
-// Not all signature schemes require it and this keeps the binding minimal
-// and fast. Refer to the Uniq function for one method of performing
-// this check.
-func (sig *P1Affine) AggregateVerify(sigGroupcheck bool,
- pks []*P2Affine, pksVerify bool, msgs []Message, dst []byte,
- optional ...interface{}) bool { // useHash bool, augs [][]byte
-
- // sanity checks and argument parsing
- n := len(pks)
- if n == 0 || len(msgs) != n {
- return false
- }
- _, augs, useHash, ok := parseOpts(optional...)
- useAugs := len(augs) != 0
- if !ok || (useAugs && len(augs) != n) {
- return false
- }
-
- sigFn := func() *P1Affine {
- return sig
- }
-
- pkFn := func(i uint32, _ *P2Affine) (*P2Affine, []byte) {
- if useAugs {
- return pks[i], augs[i]
- } else {
- return pks[i], nil
- }
- }
-
- return coreAggregateVerifyPkInG2(sigFn, sigGroupcheck, pkFn, pksVerify,
- msgs, dst, useHash)
-}
-
-// Aggregate verify with compressed signature and public keys
-// Uses a dummy signature to get the correct type
-func (dummy *P1Affine) AggregateVerifyCompressed(sig []byte, sigGroupcheck bool,
- pks [][]byte, pksVerify bool, msgs []Message, dst []byte,
- optional ...bool) bool { // useHash bool, usePksAsAugs bool
-
- // sanity checks and argument parsing
- if len(pks) != len(msgs) {
- return false
- }
- useHash := true
- if len(optional) > 0 {
- useHash = optional[0]
- }
- usePksAsAugs := false
- if len(optional) > 1 {
- usePksAsAugs = optional[1]
- }
-
- sigFn := func() *P1Affine {
- sigP := new(P1Affine)
- if sigP.Uncompress(sig) == nil {
- return nil
- }
- return sigP
- }
- pkFn := func(i uint32, pk *P2Affine) (*P2Affine, []byte) {
- bytes := pks[i]
- if len(bytes) == BLST_P2_SERIALIZE_BYTES && (bytes[0]&0x80) == 0 {
- // Not compressed
- if pk.Deserialize(bytes) == nil {
- return nil, nil
- }
- } else if len(bytes) == BLST_P2_COMPRESS_BYTES && (bytes[0]&0x80) != 0 {
- if pk.Uncompress(bytes) == nil {
- return nil, nil
- }
- } else {
- return nil, nil
- }
- if usePksAsAugs {
- return pk, bytes
- }
- return pk, nil
- }
- return coreAggregateVerifyPkInG2(sigFn, sigGroupcheck, pkFn, pksVerify,
- msgs, dst, useHash)
-}
-
-func coreAggregateVerifyPkInG2(sigFn sigGetterP1, sigGroupcheck bool,
- pkFn pkGetterP2, pkValidate bool, msgs []Message, dst []byte,
- optional ...bool) bool { // useHash
-
- n := len(msgs)
- if n == 0 {
- return false
- }
-
- useHash := true
- if len(optional) > 0 {
- useHash = optional[0]
- }
-
- numCores := runtime.GOMAXPROCS(0)
- 
numThreads := maxProcs
- if numThreads > numCores {
- numThreads = numCores
- }
- if numThreads > n {
- numThreads = n
- }
- // Each thread will determine next message to process by atomically
- // incrementing curItem, process corresponding pk,msg[,aug] tuple and
- // repeat until n is exceeded. The resulting accumulations will be
- // fed into the msgsCh channel.
- msgsCh := make(chan Pairing, numThreads)
- valid := int32(1)
- curItem := uint32(0)
- mutex := sync.Mutex{}
-
- mutex.Lock()
- for tid := 0; tid < numThreads; tid++ {
- go func() {
- pairing := PairingCtx(useHash, dst)
- var temp P2Affine
- for atomic.LoadInt32(&valid) > 0 {
- // Get a work item
- work := atomic.AddUint32(&curItem, 1) - 1
- if work >= uint32(n) {
- break
- } else if work == 0 && maxProcs == numCores-1 &&
- numThreads == maxProcs {
- // Avoid consuming all cores by waiting until the
- // main thread has completed its miller loop before
- // proceeding.
- mutex.Lock()
- mutex.Unlock()
- }
-
- // Pull Public Key and augmentation blob
- curPk, aug := pkFn(work, &temp)
- if curPk == nil {
- atomic.StoreInt32(&valid, 0)
- break
- }
-
- // Pairing and accumulate
- ret := PairingAggregatePkInG2(pairing, curPk, pkValidate,
- nil, false, msgs[work], aug)
- if ret != C.BLST_SUCCESS {
- atomic.StoreInt32(&valid, 0)
- break
- }
-
- // application might have some async work to do
- runtime.Gosched()
- }
- if atomic.LoadInt32(&valid) > 0 {
- PairingCommit(pairing)
- msgsCh <- pairing
- } else {
- msgsCh <- nil
- }
- }()
- }
-
- // Uncompress and check signature
- var gtsig Fp12
- sig := sigFn()
- if sig == nil {
- atomic.StoreInt32(&valid, 0)
- }
- if atomic.LoadInt32(&valid) > 0 && sigGroupcheck &&
- !sig.SigValidate(false) {
- atomic.StoreInt32(&valid, 0)
- }
- if atomic.LoadInt32(&valid) > 0 {
- C.blst_aggregated_in_g1(&gtsig, sig)
- }
- mutex.Unlock()
-
- // Accumulate the thread results
- var pairings Pairing
- for i := 0; i < numThreads; i++ {
- msg := <-msgsCh
- if msg != nil {
- if pairings == nil {
- pairings = msg
- } else {
- ret := PairingMerge(pairings, msg)
- if ret != C.BLST_SUCCESS {
- atomic.StoreInt32(&valid, 0)
- }
- }
- }
- }
- if atomic.LoadInt32(&valid) == 0 || pairings == nil {
- return false
- }
-
- return PairingFinalVerify(pairings, &gtsig)
-}
-
-func CoreVerifyPkInG2(pk *P2Affine, sig *P1Affine, hash_or_encode bool,
- msg Message, dst []byte, optional ...[]byte) int {
-
- var aug []byte
- var uaug *C.byte
- if len(optional) > 0 {
- aug = optional[0]
- if len(aug) > 0 {
- uaug = (*C.byte)(&aug[0])
- }
- }
-
- if runtime.NumGoroutine() < maxProcs {
- sigFn := func() *P1Affine {
- return sig
- }
- pkFn := func(_ uint32, _ *P2Affine) (*P2Affine, []byte) {
- return pk, aug
- }
- if !coreAggregateVerifyPkInG2(sigFn, true, pkFn, true, []Message{msg},
- dst, hash_or_encode) {
- return C.BLST_VERIFY_FAIL
- }
- return C.BLST_SUCCESS
- }
-
- var udst *C.byte
- if len(dst) > 0 {
- udst = (*C.byte)(&dst[0])
- }
- var umsg *C.byte
- if len(msg) > 0 {
- umsg = (*C.byte)(&msg[0])
- }
-
- return int(C.blst_core_verify_pk_in_g2(pk, sig, C.bool(hash_or_encode),
- umsg, C.size_t(len(msg)),
- udst, C.size_t(len(dst)),
- uaug, C.size_t(len(aug))))
-}
-
-// pks are assumed to be verified for proof of possession,
-// which implies that they are already group-checked
-func (sig *P1Affine) FastAggregateVerify(sigGroupcheck bool,
- pks []*P2Affine, msg Message, dst []byte,
- optional ...interface{}) bool { // pass-through to Verify
- n := len(pks)
-
- // TODO: return value for length zero? 
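- // Returning false for an empty key set is the conservative answer
- // to the TODO above: an aggregate over zero keys attests to nothing.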
- if n == 0 { - return false - } - - aggregator := new(P2Aggregate) - if !aggregator.Aggregate(pks, false) { - return false - } - pkAff := aggregator.ToAffine() - - // Verify - return sig.Verify(sigGroupcheck, pkAff, false, msg, dst, optional...) -} - -func (dummy *P1Affine) MultipleAggregateVerify(sigs []*P1Affine, - sigsGroupcheck bool, pks []*P2Affine, pksVerify bool, - msgs []Message, dst []byte, randFn func(*Scalar), randBits int, - optional ...interface{}) bool { // useHash - - // Sanity checks and argument parsing - n := len(pks) - if n == 0 || len(msgs) != n || len(sigs) != n { - return false - } - _, augs, useHash, ok := parseOpts(optional...) - useAugs := len(augs) != 0 - if !ok || (useAugs && len(augs) != n) { - return false - } - - paramsFn := - func(work uint32, sig *P1Affine, pk *P2Affine, rand *Scalar) ( - *P1Affine, *P2Affine, *Scalar, []byte) { - randFn(rand) - var aug []byte - if useAugs { - aug = augs[work] - } - return sigs[work], pks[work], rand, aug - } - - return multipleAggregateVerifyPkInG2(paramsFn, sigsGroupcheck, pksVerify, - msgs, dst, randBits, useHash) -} - -type mulAggGetterPkInG2 func(work uint32, sig *P1Affine, pk *P2Affine, - rand *Scalar) (*P1Affine, *P2Affine, *Scalar, []byte) - -func multipleAggregateVerifyPkInG2(paramsFn mulAggGetterPkInG2, - sigsGroupcheck bool, pksVerify bool, msgs []Message, - dst []byte, randBits int, - optional ...bool) bool { // useHash - n := len(msgs) - if n == 0 { - return false - } - - useHash := true - if len(optional) > 0 { - useHash = optional[0] - } - - numCores := runtime.GOMAXPROCS(0) - numThreads := maxProcs - if numThreads > numCores { - numThreads = numCores - } - if numThreads > n { - numThreads = n - } - // Each thread will determine next message to process by atomically - // incrementing curItem, process corresponding pk,msg[,aug] tuple and - // repeat until n is exceeded. The resulting accumulations will be - // fed into the msgsCh channel. 
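- // A failure in any worker flips the shared valid flag to 0, which
- // makes the remaining workers bail out at their next loop check.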
- msgsCh := make(chan Pairing, numThreads) - valid := int32(1) - curItem := uint32(0) - - for tid := 0; tid < numThreads; tid++ { - go func() { - pairing := PairingCtx(useHash, dst) - var tempRand Scalar - var tempPk P2Affine - var tempSig P1Affine - for atomic.LoadInt32(&valid) > 0 { - // Get a work item - work := atomic.AddUint32(&curItem, 1) - 1 - if work >= uint32(n) { - break - } - - curSig, curPk, curRand, aug := paramsFn(work, &tempSig, - &tempPk, &tempRand) - - if PairingMulNAggregatePkInG2(pairing, curPk, pksVerify, - curSig, sigsGroupcheck, curRand, - randBits, msgs[work], aug) != - C.BLST_SUCCESS { - atomic.StoreInt32(&valid, 0) - break - } - - // application might have some async work to do - runtime.Gosched() - } - if atomic.LoadInt32(&valid) > 0 { - PairingCommit(pairing) - msgsCh <- pairing - } else { - msgsCh <- nil - } - }() - } - - // Accumulate the thread results - var pairings Pairing - for i := 0; i < numThreads; i++ { - msg := <-msgsCh - if msg != nil { - if pairings == nil { - pairings = msg - } else { - ret := PairingMerge(pairings, msg) - if ret != C.BLST_SUCCESS { - atomic.StoreInt32(&valid, 0) - } - } - } - } - if atomic.LoadInt32(&valid) == 0 || pairings == nil { - return false - } - - return PairingFinalVerify(pairings, nil) -} - -// -// Aggregate P1 -// - -type aggGetterP1 func(i uint32, temp *P1Affine) *P1Affine -type P1Aggregate struct { - v *P1 -} - -// Aggregate uncompressed elements -func (agg *P1Aggregate) Aggregate(elmts []*P1Affine, - groupcheck bool) bool { - if len(elmts) == 0 { - return true - } - getter := func(i uint32, _ *P1Affine) *P1Affine { return elmts[i] } - return agg.aggregate(getter, groupcheck, len(elmts)) -} - -// Aggregate compressed elements -func (agg *P1Aggregate) AggregateCompressed(elmts [][]byte, - groupcheck bool) bool { - if len(elmts) == 0 { - return true - } - getter := func(i uint32, p *P1Affine) *P1Affine { - bytes := elmts[i] - if p.Uncompress(bytes) == nil { - return nil - } - return p - } - return agg.aggregate(getter, groupcheck, len(elmts)) -} - -func (agg *P1Aggregate) AddAggregate(other *P1Aggregate) { - if other.v == nil { - // do nothing - } else if agg.v == nil { - agg.v = other.v - } else { - C.blst_p1_add_or_double(agg.v, agg.v, other.v) - } -} - -func (agg *P1Aggregate) Add(elmt *P1Affine, groupcheck bool) bool { - if groupcheck && !bool(C.blst_p1_affine_in_g1(elmt)) { - return false - } - if agg.v == nil { - agg.v = new(P1) - C.blst_p1_from_affine(agg.v, elmt) - } else { - C.blst_p1_add_or_double_affine(agg.v, agg.v, elmt) - } - return true -} - -func (agg *P1Aggregate) ToAffine() *P1Affine { - if agg.v == nil { - return new(P1Affine) - } - return agg.v.ToAffine() -} - -func (agg *P1Aggregate) aggregate(getter aggGetterP1, groupcheck bool, - n int) bool { - - if n == 0 { - return true - } - // operations are considered short enough for not to care about - // keeping one core free... 
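- // (Hence GOMAXPROCS is used directly here, rather than the maxProcs
- // cap applied on the verification paths.)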
- numThreads := runtime.GOMAXPROCS(0) - if numThreads > n { - numThreads = n - } - - valid := int32(1) - type result struct { - agg *P1 - empty bool - } - msgs := make(chan result, numThreads) - curItem := uint32(0) - for tid := 0; tid < numThreads; tid++ { - go func() { - first := true - var agg P1 - var temp P1Affine - for atomic.LoadInt32(&valid) > 0 { - // Get a work item - work := atomic.AddUint32(&curItem, 1) - 1 - if work >= uint32(n) { - break - } - - // Signature validate - curElmt := getter(work, &temp) - if curElmt == nil { - atomic.StoreInt32(&valid, 0) - break - } - if groupcheck && !bool(C.blst_p1_affine_in_g1(curElmt)) { - atomic.StoreInt32(&valid, 0) - break - } - if first { - C.blst_p1_from_affine(&agg, curElmt) - first = false - } else { - C.blst_p1_add_or_double_affine(&agg, &agg, curElmt) - } - // application might have some async work to do - runtime.Gosched() - } - if first { - msgs <- result{nil, true} - } else if atomic.LoadInt32(&valid) > 0 { - msgs <- result{&agg, false} - } else { - msgs <- result{nil, false} - } - }() - } - - // Accumulate the thread results - first := agg.v == nil - validLocal := true - for i := 0; i < numThreads; i++ { - msg := <-msgs - if !validLocal || msg.empty { - // do nothing - } else if msg.agg == nil { - validLocal = false - // This should be unnecessary but seems safer - atomic.StoreInt32(&valid, 0) - } else { - if first { - agg.v = msg.agg - first = false - } else { - C.blst_p1_add_or_double(agg.v, agg.v, msg.agg) - } - } - } - if atomic.LoadInt32(&valid) == 0 { - agg.v = nil - return false - } - return true -} -func PairingAggregatePkInG1(ctx Pairing, PK *P1Affine, pkValidate bool, - sig *P2Affine, sigGroupcheck bool, msg []byte, - optional ...[]byte) int { // aug - var aug []byte - var uaug *C.byte - if len(optional) > 0 { - aug = optional[0] - if len(aug) > 0 { - uaug = (*C.byte)(&aug[0]) - } - } - var umsg *C.byte - if len(msg) > 0 { - umsg = (*C.byte)(&msg[0]) - } - - r := C.blst_pairing_chk_n_aggr_pk_in_g1(&ctx[0], - PK, C.bool(pkValidate), - sig, C.bool(sigGroupcheck), - umsg, C.size_t(len(msg)), - uaug, C.size_t(len(aug))) - - return int(r) -} - -func PairingMulNAggregatePkInG1(ctx Pairing, PK *P1Affine, pkValidate bool, - sig *P2Affine, sigGroupcheck bool, - rand *Scalar, randBits int, msg []byte, - optional ...[]byte) int { // aug - var aug []byte - var uaug *C.byte - if len(optional) > 0 { - aug = optional[0] - if len(aug) > 0 { - uaug = (*C.byte)(&aug[0]) - } - } - var umsg *C.byte - if len(msg) > 0 { - umsg = (*C.byte)(&msg[0]) - } - - r := C.blst_pairing_chk_n_mul_n_aggr_pk_in_g1(&ctx[0], - PK, C.bool(pkValidate), - sig, C.bool(sigGroupcheck), - &rand.b[0], C.size_t(randBits), - umsg, C.size_t(len(msg)), - uaug, C.size_t(len(aug))) - - return int(r) -} - -// -// Serialization/Deserialization. 
-// - -// P1 Serdes -func (p1 *P1Affine) Serialize() []byte { - var out [BLST_P1_SERIALIZE_BYTES]byte - C.blst_p1_affine_serialize((*C.byte)(&out[0]), p1) - return out[:] -} - -func (p1 *P1Affine) Deserialize(in []byte) *P1Affine { - if len(in) != BLST_P1_SERIALIZE_BYTES { - return nil - } - if C.blst_p1_deserialize(p1, (*C.byte)(&in[0])) != C.BLST_SUCCESS { - return nil - } - return p1 -} -func (p1 *P1Affine) Compress() []byte { - var out [BLST_P1_COMPRESS_BYTES]byte - C.blst_p1_affine_compress((*C.byte)(&out[0]), p1) - return out[:] -} - -func (p1 *P1Affine) Uncompress(in []byte) *P1Affine { - if len(in) != BLST_P1_COMPRESS_BYTES { - return nil - } - if C.blst_p1_uncompress(p1, (*C.byte)(&in[0])) != C.BLST_SUCCESS { - return nil - } - return p1 -} - -func (p1 *P1Affine) InG1() bool { - return bool(C.blst_p1_affine_in_g1(p1)) -} - -func (dummy *P1Affine) BatchUncompress(in [][]byte) []*P1Affine { - // Allocate space for all of the resulting points. Later we'll save pointers - // and return those so that the result could be used in other functions, - // such as MultipleAggregateVerify. - n := len(in) - points := make([]P1Affine, n) - pointsPtrs := make([]*P1Affine, n) - - numCores := runtime.GOMAXPROCS(0) - numThreads := maxProcs - if numThreads > numCores { - numThreads = numCores - } - if numThreads > n { - numThreads = n - } - // Each thread will determine next message to process by atomically - // incrementing curItem, process corresponding point, and - // repeat until n is exceeded. Each thread will send a result (true for - // success, false for failure) into the channel when complete. - resCh := make(chan bool, numThreads) - valid := int32(1) - curItem := uint32(0) - for tid := 0; tid < numThreads; tid++ { - go func() { - for atomic.LoadInt32(&valid) > 0 { - // Get a work item - work := atomic.AddUint32(&curItem, 1) - 1 - if work >= uint32(n) { - break - } - if points[work].Uncompress(in[work]) == nil { - atomic.StoreInt32(&valid, 0) - break - } - pointsPtrs[work] = &points[work] - } - if atomic.LoadInt32(&valid) > 0 { - resCh <- true - } else { - resCh <- false - } - }() - } - - // Collect the threads - result := true - for i := 0; i < numThreads; i++ { - if !<-resCh { - result = false - } - } - if atomic.LoadInt32(&valid) == 0 || !result { - return nil - } - return pointsPtrs -} - -func (p1 *P1) Serialize() []byte { - var out [BLST_P1_SERIALIZE_BYTES]byte - C.blst_p1_serialize((*C.byte)(&out[0]), p1) - return out[:] -} -func (p1 *P1) Compress() []byte { - var out [BLST_P1_COMPRESS_BYTES]byte - C.blst_p1_compress((*C.byte)(&out[0]), p1) - return out[:] -} - -func (p1 *P1) MultAssign(scalarIf interface{}, optional ...int) *P1 { - var nbits int - var scalar *C.byte - switch val := scalarIf.(type) { - case []byte: - scalar = (*C.byte)(&val[0]) - nbits = len(val) * 8 - case *Scalar: - scalar = &val.b[0] - nbits = 255 - default: - panic(fmt.Sprintf("unsupported type %T", val)) - } - if len(optional) > 0 { - nbits = optional[0] - } - C.blst_p1_mult(p1, p1, scalar, C.size_t(nbits)) - return p1 -} - -func (p1 *P1) Mult(scalarIf interface{}, optional ...int) *P1 { - ret := *p1 - return ret.MultAssign(scalarIf, optional...) 
-} - -func (p1 *P1) AddAssign(pointIf interface{}) *P1 { - switch val := pointIf.(type) { - case *P1: - C.blst_p1_add_or_double(p1, p1, val) - case *P1Affine: - C.blst_p1_add_or_double_affine(p1, p1, val) - default: - panic(fmt.Sprintf("unsupported type %T", val)) - } - return p1 -} - -func (p1 *P1) Add(pointIf interface{}) *P1 { - ret := *p1 - return ret.AddAssign(pointIf) -} - -func (p1 *P1) SubAssign(pointIf interface{}) *P1 { - var x *Fp - var affine C.bool - switch val := pointIf.(type) { - case *P1: - x = &val.x - affine = false - case *P1Affine: - x = &val.x - affine = true - default: - panic(fmt.Sprintf("unsupported type %T", val)) - } - C.go_p1_sub_assign(p1, x, affine) - return p1 -} - -func (p1 *P1) Sub(pointIf interface{}) *P1 { - ret := *p1 - return ret.SubAssign(pointIf) -} - -func P1Generator() *P1 { - return C.blst_p1_generator() -} - -// 'acc += point * scalar', passing 'nil' for 'point' means "use the -// -// group generator point" -func (acc *P1) MultNAccumulate(pointIf interface{}, scalarIf interface{}, - optional ...int) *P1 { - var x *Fp - var affine C.bool - if pointIf != nil { - switch val := pointIf.(type) { - case *P1: - x = &val.x - affine = false - case *P1Affine: - x = &val.x - affine = true - default: - panic(fmt.Sprintf("unsupported type %T", val)) - } - } - var nbits int - var scalar *C.byte - switch val := scalarIf.(type) { - case []byte: - scalar = (*C.byte)(&val[0]) - nbits = len(val) * 8 - case *Scalar: - scalar = &val.b[0] - nbits = 255 - default: - panic(fmt.Sprintf("unsupported type %T", val)) - } - if len(optional) > 0 { - nbits = optional[0] - } - C.go_p1_mult_n_acc(acc, x, affine, scalar, C.size_t(nbits)) - return acc -} - -// -// Affine -// - -func (p *P1) ToAffine() *P1Affine { - var pa P1Affine - C.blst_p1_to_affine(&pa, p) - return &pa -} - -func (p *P1) FromAffine(pa *P1Affine) { - C.blst_p1_from_affine(p, pa) -} - -// Hash -func HashToG1(msg []byte, dst []byte, - optional ...[]byte) *P1 { // aug - var q P1 - - // Handle zero length message - var msgC *C.byte - if len(msg) > 0 { - msgC = (*C.byte)(&msg[0]) - } - - var dstC *C.byte - if len(dst) > 0 { - dstC = (*C.byte)(&dst[0]) - } - - var aug []byte - var augC *C.byte - if len(optional) > 0 { - aug = optional[0] - if len(aug) > 0 { - augC = (*C.byte)(&aug[0]) - } - } - - C.blst_hash_to_g1(&q, msgC, C.size_t(len(msg)), - dstC, C.size_t(len(dst)), - augC, C.size_t(len(aug))) - return &q -} - -func EncodeToG1(msg []byte, dst []byte, - optional ...[]byte) *P1 { // aug - var q P1 - - // Handle zero length message - var msgC *C.byte - if len(msg) > 0 { - msgC = (*C.byte)(&msg[0]) - } - - var dstC *C.byte - if len(dst) > 0 { - dstC = (*C.byte)(&dst[0]) - } - - var aug []byte - var augC *C.byte - if len(optional) > 0 { - aug = optional[0] - if len(aug) > 0 { - augC = (*C.byte)(&aug[0]) - } - } - - C.blst_encode_to_g1(&q, msgC, C.size_t(len(msg)), - dstC, C.size_t(len(dst)), - augC, C.size_t(len(aug))) - return &q -} - -// -// Multi-point/scalar operations -// - -func P1sToAffine(points []*P1, optional ...int) P1Affines { - var npoints int - if len(optional) > 0 { - npoints = optional[0] - } else { - npoints = len(points) - } - ret := make([]P1Affine, npoints) - _cgoCheckPointer := func(...interface{}) {} - C.blst_p1s_to_affine(&ret[0], &points[0], C.size_t(npoints)) - return ret -} - -func (points P1s) ToAffine(optional ...P1Affines) P1Affines { - npoints := len(points) - var ret P1Affines - - if len(optional) > 0 { // used in benchmark - ret = optional[0] - if len(ret) < npoints { - panic("npoints 
mismatch") - } - } else { - ret = make([]P1Affine, npoints) - } - - if maxProcs < 2 || npoints < 768 { - C.go_p1slice_to_affine(&ret[0], &points[0], C.size_t(npoints)) - return ret - } - - nslices := (npoints + 511) / 512 - if nslices > maxProcs { - nslices = maxProcs - } - delta, rem := npoints/nslices+1, npoints%nslices - - var wg sync.WaitGroup - wg.Add(nslices) - for x := 0; x < npoints; x += delta { - if rem == 0 { - delta -= 1 - } - rem -= 1 - go func(out *P1Affine, inp *P1, delta int) { - C.go_p1slice_to_affine(out, inp, C.size_t(delta)) - wg.Done() - }(&ret[x], &points[x], delta) - } - wg.Wait() - - return ret -} - -// -// Batch addition -// - -func P1AffinesAdd(points []*P1Affine, optional ...int) *P1 { - var npoints int - if len(optional) > 0 { - npoints = optional[0] - } else { - npoints = len(points) - } - var ret P1 - _cgoCheckPointer := func(...interface{}) {} - C.blst_p1s_add(&ret, &points[0], C.size_t(npoints)) - return &ret -} - -func (points P1Affines) Add() *P1 { - npoints := len(points) - if maxProcs < 2 || npoints < 768 { - var ret P1 - C.go_p1slice_add(&ret, &points[0], C.size_t(npoints)) - return &ret - } - - nslices := (npoints + 511) / 512 - if nslices > maxProcs { - nslices = maxProcs - } - delta, rem := npoints/nslices+1, npoints%nslices - - msgs := make(chan P1, nslices) - for x := 0; x < npoints; x += delta { - if rem == 0 { - delta -= 1 - } - rem -= 1 - go func(points *P1Affine, delta int) { - var ret P1 - C.go_p1slice_add(&ret, points, C.size_t(delta)) - msgs <- ret - }(&points[x], delta) - } - - ret := <-msgs - for i := 1; i < nslices; i++ { - msg := <-msgs - C.blst_p1_add_or_double(&ret, &ret, &msg) - } - return &ret -} - -func (points P1s) Add() *P1 { - return points.ToAffine().Add() -} - -// -// Multi-scalar multiplication -// - -func P1AffinesMult(pointsIf interface{}, scalarsIf interface{}, nbits int) *P1 { - var npoints int - switch val := pointsIf.(type) { - case []*P1Affine: - npoints = len(val) - case []P1Affine: - npoints = len(val) - case P1Affines: - npoints = len(val) - default: - panic(fmt.Sprintf("unsupported type %T", val)) - } - - nbytes := (nbits + 7) / 8 - var scalars []*C.byte - switch val := scalarsIf.(type) { - case []byte: - if len(val) < npoints*nbytes { - return nil - } - case [][]byte: - if len(val) < npoints { - return nil - } - scalars = make([]*C.byte, npoints) - for i := range scalars { - scalars[i] = (*C.byte)(&val[i][0]) - } - case []Scalar: - if len(val) < npoints { - return nil - } - if nbits <= 248 { - scalars = make([]*C.byte, npoints) - for i := range scalars { - scalars[i] = &val[i].b[0] - } - } - case []*Scalar: - if len(val) < npoints { - return nil - } - scalars = make([]*C.byte, npoints) - for i := range scalars { - scalars[i] = &val[i].b[0] - } - default: - panic(fmt.Sprintf("unsupported type %T", val)) - } - - numThreads := maxProcs - numCores := runtime.GOMAXPROCS(0) - if numCores < maxProcs { - numThreads = numCores - } - - if numThreads < 2 || npoints < 32 { - sz := int(C.blst_p1s_mult_pippenger_scratch_sizeof(C.size_t(npoints))) / 8 - scratch := make([]uint64, sz) - - pointsBySlice := [2]*P1Affine{nil, nil} - var p_points **P1Affine - switch val := pointsIf.(type) { - case []*P1Affine: - p_points = &val[0] - case []P1Affine: - pointsBySlice[0] = &val[0] - p_points = &pointsBySlice[0] - case P1Affines: - pointsBySlice[0] = &val[0] - p_points = &pointsBySlice[0] - } - - scalarsBySlice := [2]*C.byte{nil, nil} - var p_scalars **C.byte - switch val := scalarsIf.(type) { - case []byte: - scalarsBySlice[0] = 
(*C.byte)(&val[0])
- p_scalars = &scalarsBySlice[0]
- case [][]byte:
- p_scalars = &scalars[0]
- case []Scalar:
- if nbits > 248 {
- scalarsBySlice[0] = (*C.byte)(&val[0].b[0])
- p_scalars = &scalarsBySlice[0]
- } else {
- p_scalars = &scalars[0]
- }
- case []*Scalar:
- p_scalars = &scalars[0]
- }
-
- var ret P1
- _cgoCheckPointer := func(...interface{}) {}
- C.blst_p1s_mult_pippenger(&ret, p_points, C.size_t(npoints),
- p_scalars, C.size_t(nbits),
- (*C.limb_t)(&scratch[0]))
-
- for i := range scalars {
- scalars[i] = nil
- }
-
- return &ret
- }
-
- // this is sizeof(scratch[0])
- sz := int(C.blst_p1s_mult_pippenger_scratch_sizeof(0)) / 8
-
- nx, ny, window := breakdown(nbits, pippenger_window_size(npoints),
- numThreads)
-
- // |grid[]| holds "coordinates" and place for result
- grid := make([]struct {
- x, dx, y, dy int
- point P1
- }, nx*ny)
-
- dx := npoints / nx
- y := window * (ny - 1)
- total := 0
- for ; total < nx; total++ {
- grid[total].x = total * dx
- grid[total].dx = dx
- grid[total].y = y
- grid[total].dy = nbits - y
- }
- grid[total-1].dx = npoints - grid[total-1].x
-
- for y > 0 {
- y -= window
- for i := 0; i < nx; i++ {
- grid[total].x = grid[i].x
- grid[total].dx = grid[i].dx
- grid[total].y = y
- grid[total].dy = window
- total++
- }
- }
-
- if numThreads > total {
- numThreads = total
- }
-
- msgsCh := make(chan int, ny)
- rowSync := make([]int32, ny) // count up to |nx|
- curItem := int32(0)
- for tid := 0; tid < numThreads; tid++ {
- go func() {
- scratch := make([]uint64, sz<<uint(window-1))
- pointsBySlice := [2]*P1Affine{nil, nil}
- scalarsBySlice := [2]*C.byte{nil, nil}
- _cgoCheckPointer := func(...interface{}) {}
-
- for {
- workItem := atomic.AddInt32(&curItem, 1) - 1
- if int(workItem) >= total {
- break
- }
-
- x := grid[workItem].x
- y := grid[workItem].y
-
- var p_points **P1Affine
- switch val := pointsIf.(type) {
- case []*P1Affine:
- p_points = &val[x]
- case []P1Affine:
- pointsBySlice[0] = &val[x]
- p_points = &pointsBySlice[0]
- case P1Affines:
- pointsBySlice[0] = &val[x]
- p_points = &pointsBySlice[0]
- }
-
- var p_scalars **C.byte
- switch val := scalarsIf.(type) {
- case []byte:
- scalarsBySlice[0] = (*C.byte)(&val[x*nbytes])
- p_scalars = &scalarsBySlice[0]
- case [][]byte:
- p_scalars = &scalars[x]
- case []Scalar:
- if nbits > 248 {
- scalarsBySlice[0] = (*C.byte)(&val[x].b[0])
- p_scalars = &scalarsBySlice[0]
- } else {
- p_scalars = &scalars[x]
- }
- case []*Scalar:
- p_scalars = &scalars[x]
- }
-
- C.blst_p1s_tile_pippenger(&grid[workItem].point,
- p_points, C.size_t(grid[workItem].dx),
- p_scalars, C.size_t(nbits),
- (*C.limb_t)(&scratch[0]),
- C.size_t(y), C.size_t(window))
-
- if atomic.AddInt32(&rowSync[y/window], 1) == int32(nx) {
- msgsCh <- y // "row" is done
- } else {
- runtime.Gosched() // be nice to the application
- }
- }
-
- pointsBySlice[0] = nil
- scalarsBySlice[0] = nil
- }()
- }
-
- var ret P1
- rows := make([]bool, ny)
- row := 0 // actually index in |grid[]|
- for i := 0; i < ny; i++ { // we expect |ny| messages, one per "row"
- y := <-msgsCh
- rows[y/window] = true // mark the "row"
- for grid[row].y == y { // if it's current "row", process it
- for row < total && grid[row].y == y {
- C.blst_p1_add_or_double(&ret, &ret, &grid[row].point)
- row++
- }
- if y == 0 {
- break // one can as well 'return &ret' here
- }
- for j := 0; j < window; j++ {
- C.blst_p1_double(&ret, &ret)
- }
- y -= window
- if !rows[y/window] { // see if next "row" was marked already
- break
- }
- }
- }
-
- for i := range scalars {
- scalars[i] = nil
- }
-
- return &ret
-}
-
-func (points P1Affines) Mult(scalarsIf interface{}, nbits int) *P1 {
- return P1AffinesMult(points, scalarsIf, nbits)
-}
-
-func (points P1s) Mult(scalarsIf interface{}, nbits int) *P1 {
- return 
points.ToAffine().Mult(scalarsIf, nbits) -} -func PairingAggregatePkInG2(ctx Pairing, PK *P2Affine, pkValidate bool, - sig *P1Affine, sigGroupcheck bool, msg []byte, - optional ...[]byte) int { // aug - var aug []byte - var uaug *C.byte - if len(optional) > 0 { - aug = optional[0] - if len(aug) > 0 { - uaug = (*C.byte)(&aug[0]) - } - } - var umsg *C.byte - if len(msg) > 0 { - umsg = (*C.byte)(&msg[0]) - } - - r := C.blst_pairing_chk_n_aggr_pk_in_g2(&ctx[0], - PK, C.bool(pkValidate), - sig, C.bool(sigGroupcheck), - umsg, C.size_t(len(msg)), - uaug, C.size_t(len(aug))) - - return int(r) -} - -func PairingMulNAggregatePkInG2(ctx Pairing, PK *P2Affine, pkValidate bool, - sig *P1Affine, sigGroupcheck bool, - rand *Scalar, randBits int, msg []byte, - optional ...[]byte) int { // aug - var aug []byte - var uaug *C.byte - if len(optional) > 0 { - aug = optional[0] - if len(aug) > 0 { - uaug = (*C.byte)(&aug[0]) - } - } - var umsg *C.byte - if len(msg) > 0 { - umsg = (*C.byte)(&msg[0]) - } - - r := C.blst_pairing_chk_n_mul_n_aggr_pk_in_g2(&ctx[0], - PK, C.bool(pkValidate), - sig, C.bool(sigGroupcheck), - &rand.b[0], C.size_t(randBits), - umsg, C.size_t(len(msg)), - uaug, C.size_t(len(aug))) - - return int(r) -} - -// -// Serialization/Deserialization. -// - -// P2 Serdes -func (p2 *P2Affine) Serialize() []byte { - var out [BLST_P2_SERIALIZE_BYTES]byte - C.blst_p2_affine_serialize((*C.byte)(&out[0]), p2) - return out[:] -} - -func (p2 *P2Affine) Deserialize(in []byte) *P2Affine { - if len(in) != BLST_P2_SERIALIZE_BYTES { - return nil - } - if C.blst_p2_deserialize(p2, (*C.byte)(&in[0])) != C.BLST_SUCCESS { - return nil - } - return p2 -} -func (p2 *P2Affine) Compress() []byte { - var out [BLST_P2_COMPRESS_BYTES]byte - C.blst_p2_affine_compress((*C.byte)(&out[0]), p2) - return out[:] -} - -func (p2 *P2Affine) Uncompress(in []byte) *P2Affine { - if len(in) != BLST_P2_COMPRESS_BYTES { - return nil - } - if C.blst_p2_uncompress(p2, (*C.byte)(&in[0])) != C.BLST_SUCCESS { - return nil - } - return p2 -} - -func (p2 *P2Affine) InG2() bool { - return bool(C.blst_p2_affine_in_g2(p2)) -} - -func (dummy *P2Affine) BatchUncompress(in [][]byte) []*P2Affine { - // Allocate space for all of the resulting points. Later we'll save pointers - // and return those so that the result could be used in other functions, - // such as MultipleAggregateVerify. - n := len(in) - points := make([]P2Affine, n) - pointsPtrs := make([]*P2Affine, n) - - numCores := runtime.GOMAXPROCS(0) - numThreads := maxProcs - if numThreads > numCores { - numThreads = numCores - } - if numThreads > n { - numThreads = n - } - // Each thread will determine next message to process by atomically - // incrementing curItem, process corresponding point, and - // repeat until n is exceeded. Each thread will send a result (true for - // success, false for failure) into the channel when complete. 
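- // Decoding is all-or-nothing: a single malformed input invalidates
- // the whole batch and the caller receives nil, not a partial slice.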
- resCh := make(chan bool, numThreads) - valid := int32(1) - curItem := uint32(0) - for tid := 0; tid < numThreads; tid++ { - go func() { - for atomic.LoadInt32(&valid) > 0 { - // Get a work item - work := atomic.AddUint32(&curItem, 1) - 1 - if work >= uint32(n) { - break - } - if points[work].Uncompress(in[work]) == nil { - atomic.StoreInt32(&valid, 0) - break - } - pointsPtrs[work] = &points[work] - } - if atomic.LoadInt32(&valid) > 0 { - resCh <- true - } else { - resCh <- false - } - }() - } - - // Collect the threads - result := true - for i := 0; i < numThreads; i++ { - if !<-resCh { - result = false - } - } - if atomic.LoadInt32(&valid) == 0 || !result { - return nil - } - return pointsPtrs -} - -func (p2 *P2) Serialize() []byte { - var out [BLST_P2_SERIALIZE_BYTES]byte - C.blst_p2_serialize((*C.byte)(&out[0]), p2) - return out[:] -} -func (p2 *P2) Compress() []byte { - var out [BLST_P2_COMPRESS_BYTES]byte - C.blst_p2_compress((*C.byte)(&out[0]), p2) - return out[:] -} - -func (p2 *P2) MultAssign(scalarIf interface{}, optional ...int) *P2 { - var nbits int - var scalar *C.byte - switch val := scalarIf.(type) { - case []byte: - scalar = (*C.byte)(&val[0]) - nbits = len(val) * 8 - case *Scalar: - scalar = &val.b[0] - nbits = 255 - default: - panic(fmt.Sprintf("unsupported type %T", val)) - } - if len(optional) > 0 { - nbits = optional[0] - } - C.blst_p2_mult(p2, p2, scalar, C.size_t(nbits)) - return p2 -} - -func (p2 *P2) Mult(scalarIf interface{}, optional ...int) *P2 { - ret := *p2 - return ret.MultAssign(scalarIf, optional...) -} - -func (p2 *P2) AddAssign(pointIf interface{}) *P2 { - switch val := pointIf.(type) { - case *P2: - C.blst_p2_add_or_double(p2, p2, val) - case *P2Affine: - C.blst_p2_add_or_double_affine(p2, p2, val) - default: - panic(fmt.Sprintf("unsupported type %T", val)) - } - return p2 -} - -func (p2 *P2) Add(pointIf interface{}) *P2 { - ret := *p2 - return ret.AddAssign(pointIf) -} - -func (p2 *P2) SubAssign(pointIf interface{}) *P2 { - var x *Fp2 - var affine C.bool - switch val := pointIf.(type) { - case *P2: - x = &val.x - affine = false - case *P2Affine: - x = &val.x - affine = true - default: - panic(fmt.Sprintf("unsupported type %T", val)) - } - C.go_p2_sub_assign(p2, x, affine) - return p2 -} - -func (p2 *P2) Sub(pointIf interface{}) *P2 { - ret := *p2 - return ret.SubAssign(pointIf) -} - -func P2Generator() *P2 { - return C.blst_p2_generator() -} - -// 'acc += point * scalar', passing 'nil' for 'point' means "use the -// -// group generator point" -func (acc *P2) MultNAccumulate(pointIf interface{}, scalarIf interface{}, - optional ...int) *P2 { - var x *Fp2 - var affine C.bool - if pointIf != nil { - switch val := pointIf.(type) { - case *P2: - x = &val.x - affine = false - case *P2Affine: - x = &val.x - affine = true - default: - panic(fmt.Sprintf("unsupported type %T", val)) - } - } - var nbits int - var scalar *C.byte - switch val := scalarIf.(type) { - case []byte: - scalar = (*C.byte)(&val[0]) - nbits = len(val) * 8 - case *Scalar: - scalar = &val.b[0] - nbits = 255 - default: - panic(fmt.Sprintf("unsupported type %T", val)) - } - if len(optional) > 0 { - nbits = optional[0] - } - C.go_p2_mult_n_acc(acc, x, affine, scalar, C.size_t(nbits)) - return acc -} - -// -// Affine -// - -func (p *P2) ToAffine() *P2Affine { - var pa P2Affine - C.blst_p2_to_affine(&pa, p) - return &pa -} - -func (p *P2) FromAffine(pa *P2Affine) { - C.blst_p2_from_affine(p, pa) -} - -// Hash -func HashToG2(msg []byte, dst []byte, - optional ...[]byte) *P2 { // aug - var q P2 - - 
// Handle zero length message - var msgC *C.byte - if len(msg) > 0 { - msgC = (*C.byte)(&msg[0]) - } - - var dstC *C.byte - if len(dst) > 0 { - dstC = (*C.byte)(&dst[0]) - } - - var aug []byte - var augC *C.byte - if len(optional) > 0 { - aug = optional[0] - if len(aug) > 0 { - augC = (*C.byte)(&aug[0]) - } - } - - C.blst_hash_to_g2(&q, msgC, C.size_t(len(msg)), - dstC, C.size_t(len(dst)), - augC, C.size_t(len(aug))) - return &q -} - -func EncodeToG2(msg []byte, dst []byte, - optional ...[]byte) *P2 { // aug - var q P2 - - // Handle zero length message - var msgC *C.byte - if len(msg) > 0 { - msgC = (*C.byte)(&msg[0]) - } - - var dstC *C.byte - if len(dst) > 0 { - dstC = (*C.byte)(&dst[0]) - } - - var aug []byte - var augC *C.byte - if len(optional) > 0 { - aug = optional[0] - if len(aug) > 0 { - augC = (*C.byte)(&aug[0]) - } - } - - C.blst_encode_to_g2(&q, msgC, C.size_t(len(msg)), - dstC, C.size_t(len(dst)), - augC, C.size_t(len(aug))) - return &q -} - -// -// Multi-point/scalar operations -// - -func P2sToAffine(points []*P2, optional ...int) P2Affines { - var npoints int - if len(optional) > 0 { - npoints = optional[0] - } else { - npoints = len(points) - } - ret := make([]P2Affine, npoints) - _cgoCheckPointer := func(...interface{}) {} - C.blst_p2s_to_affine(&ret[0], &points[0], C.size_t(npoints)) - return ret -} - -func (points P2s) ToAffine(optional ...P2Affines) P2Affines { - npoints := len(points) - var ret P2Affines - - if len(optional) > 0 { // used in benchmark - ret = optional[0] - if len(ret) < npoints { - panic("npoints mismatch") - } - } else { - ret = make([]P2Affine, npoints) - } - - if maxProcs < 2 || npoints < 768 { - C.go_p2slice_to_affine(&ret[0], &points[0], C.size_t(npoints)) - return ret - } - - nslices := (npoints + 511) / 512 - if nslices > maxProcs { - nslices = maxProcs - } - delta, rem := npoints/nslices+1, npoints%nslices - - var wg sync.WaitGroup - wg.Add(nslices) - for x := 0; x < npoints; x += delta { - if rem == 0 { - delta -= 1 - } - rem -= 1 - go func(out *P2Affine, inp *P2, delta int) { - C.go_p2slice_to_affine(out, inp, C.size_t(delta)) - wg.Done() - }(&ret[x], &points[x], delta) - } - wg.Wait() - - return ret -} - -// -// Batch addition -// - -func P2AffinesAdd(points []*P2Affine, optional ...int) *P2 { - var npoints int - if len(optional) > 0 { - npoints = optional[0] - } else { - npoints = len(points) - } - var ret P2 - _cgoCheckPointer := func(...interface{}) {} - C.blst_p2s_add(&ret, &points[0], C.size_t(npoints)) - return &ret -} - -func (points P2Affines) Add() *P2 { - npoints := len(points) - if maxProcs < 2 || npoints < 768 { - var ret P2 - C.go_p2slice_add(&ret, &points[0], C.size_t(npoints)) - return &ret - } - - nslices := (npoints + 511) / 512 - if nslices > maxProcs { - nslices = maxProcs - } - delta, rem := npoints/nslices+1, npoints%nslices - - msgs := make(chan P2, nslices) - for x := 0; x < npoints; x += delta { - if rem == 0 { - delta -= 1 - } - rem -= 1 - go func(points *P2Affine, delta int) { - var ret P2 - C.go_p2slice_add(&ret, points, C.size_t(delta)) - msgs <- ret - }(&points[x], delta) - } - - ret := <-msgs - for i := 1; i < nslices; i++ { - msg := <-msgs - C.blst_p2_add_or_double(&ret, &ret, &msg) - } - return &ret -} - -func (points P2s) Add() *P2 { - return points.ToAffine().Add() -} - -// -// Multi-scalar multiplication -// - -func P2AffinesMult(pointsIf interface{}, scalarsIf interface{}, nbits int) *P2 { - var npoints int - switch val := pointsIf.(type) { - case []*P2Affine: - npoints = len(val) - case []P2Affine: - 
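The parallel `ToAffine` and `Add` paths above split `npoints` into roughly 512-point slices per goroutine using a delta/rem scheme. A self-contained toy showing that the scheme covers the range exactly, with chunk sizes differing by at most one:

```go
package main

import "fmt"

// partition mirrors the delta/rem slicing in ToAffine/Add above: npoints is
// split into contiguous (start, length) chunks whose sizes differ by at most
// one; the first 'rem' chunks carry the extra element.
func partition(npoints, nslices int) (chunks [][2]int) {
	delta, rem := npoints/nslices+1, npoints%nslices
	for x := 0; x < npoints; x += delta {
		if rem == 0 {
			delta -= 1
		}
		rem -= 1
		chunks = append(chunks, [2]int{x, delta})
	}
	return
}

func main() {
	fmt.Println(partition(10, 3)) // [[0 4] [4 3] [7 3]] — covers all 10 points
}
```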
npoints = len(val)
-	case P2Affines:
-		npoints = len(val)
-	default:
-		panic(fmt.Sprintf("unsupported type %T", val))
-	}
-
-	nbytes := (nbits + 7) / 8
-	var scalars []*C.byte
-	switch val := scalarsIf.(type) {
-	case []byte:
-		if len(val) < npoints*nbytes {
-			return nil
-		}
-	case [][]byte:
-		if len(val) < npoints {
-			return nil
-		}
-		scalars = make([]*C.byte, npoints)
-		for i := range scalars {
-			scalars[i] = (*C.byte)(&val[i][0])
-		}
-	case []Scalar:
-		if len(val) < npoints {
-			return nil
-		}
-		if nbits <= 248 {
-			scalars = make([]*C.byte, npoints)
-			for i := range scalars {
-				scalars[i] = &val[i].b[0]
-			}
-		}
-	case []*Scalar:
-		if len(val) < npoints {
-			return nil
-		}
-		scalars = make([]*C.byte, npoints)
-		for i := range scalars {
-			scalars[i] = &val[i].b[0]
-		}
-	default:
-		panic(fmt.Sprintf("unsupported type %T", val))
-	}
-
-	numThreads := maxProcs
-	numCores := runtime.GOMAXPROCS(0)
-	if numCores < maxProcs {
-		numThreads = numCores
-	}
-
-	if numThreads < 2 || npoints < 32 {
-		sz := int(C.blst_p2s_mult_pippenger_scratch_sizeof(C.size_t(npoints))) / 8
-		scratch := make([]uint64, sz)
-
-		pointsBySlice := [2]*P2Affine{nil, nil}
-		var p_points **P2Affine
-		switch val := pointsIf.(type) {
-		case []*P2Affine:
-			p_points = &val[0]
-		case []P2Affine:
-			pointsBySlice[0] = &val[0]
-			p_points = &pointsBySlice[0]
-		case P2Affines:
-			pointsBySlice[0] = &val[0]
-			p_points = &pointsBySlice[0]
-		}
-
-		scalarsBySlice := [2]*C.byte{nil, nil}
-		var p_scalars **C.byte
-		switch val := scalarsIf.(type) {
-		case []byte:
-			scalarsBySlice[0] = (*C.byte)(&val[0])
-			p_scalars = &scalarsBySlice[0]
-		case [][]byte:
-			p_scalars = &scalars[0]
-		case []Scalar:
-			if nbits > 248 {
-				scalarsBySlice[0] = (*C.byte)(&val[0].b[0])
-				p_scalars = &scalarsBySlice[0]
-			} else {
-				p_scalars = &scalars[0]
-			}
-		case []*Scalar:
-			p_scalars = &scalars[0]
-		}
-
-		var ret P2
-		_cgoCheckPointer := func(...interface{}) {}
-		C.blst_p2s_mult_pippenger(&ret, p_points, C.size_t(npoints),
-			p_scalars, C.size_t(nbits),
-			(*C.limb_t)(&scratch[0]))
-
-		for i := range scalars {
-			scalars[i] = nil
-		}
-
-		return &ret
-	}
-
-	// this is sizeof(scratch[0])
-	sz := int(C.blst_p2s_mult_pippenger_scratch_sizeof(0)) / 8
-
-	nx, ny, window := breakdown(nbits, pippenger_window_size(npoints),
-		numThreads)
-
-	// |grid[]| holds "coordinates" and place for result
-	grid := make([]struct {
-		x, dx, y, dy int
-		point        P2
-	}, nx*ny)
-
-	dx := npoints / nx
-	y := window * (ny - 1)
-	total := 0
-	for ; total < nx; total++ {
-		grid[total].x = total * dx
-		grid[total].dx = dx
-		grid[total].y = y
-		grid[total].dy = nbits - y
-	}
-	grid[total-1].dx = npoints - grid[total-1].x
-
-	for y > 0 {
-		y -= window
-		for i := 0; i < nx; i++ {
-			grid[total].x = grid[i].x
-			grid[total].dx = grid[i].dx
-			grid[total].y = y
-			grid[total].dy = window
-			total++
-		}
-	}
-
-	if numThreads > total {
-		numThreads = total
-	}
-
-	msgsCh := make(chan int, ny)
-	rowSync := make([]int32, ny) // count up to |nx|
-	curItem := int32(0)
-	for tid := 0; tid < numThreads; tid++ {
-		go func() {
-			scratch := make([]uint64, sz<<uint(window-1))
-			pointsBySlice := [2]*P2Affine{nil, nil}
-			scalarsBySlice := [2]*C.byte{nil, nil}
-			_cgoCheckPointer := func(...interface{}) {}
-
-			for {
-				workItem := atomic.AddInt32(&curItem, 1) - 1
-				if int(workItem) >= total {
-					break
-				}
-
-				x := grid[workItem].x
-				y := grid[workItem].y
-
-				var p_points **P2Affine
-				switch val := pointsIf.(type) {
-				case []*P2Affine:
-					p_points = &val[x]
-				case []P2Affine:
-					pointsBySlice[0] = &val[x]
-					p_points = &pointsBySlice[0]
-				case P2Affines:
-					pointsBySlice[0] = &val[x]
-					p_points = &pointsBySlice[0]
-				}
-
-				var p_scalars **C.byte
-				switch val := scalarsIf.(type) {
-				case []byte:
-					scalarsBySlice[0] = (*C.byte)(&val[x*nbytes])
-					p_scalars
= &scalarsBySlice[0] - case [][]byte: - p_scalars = &scalars[x] - case []Scalar: - if nbits > 248 { - scalarsBySlice[0] = (*C.byte)(&val[x].b[0]) - p_scalars = &scalarsBySlice[0] - } else { - p_scalars = &scalars[x] - } - case []*Scalar: - p_scalars = &scalars[x] - } - - C.blst_p2s_tile_pippenger(&grid[workItem].point, - p_points, C.size_t(grid[workItem].dx), - p_scalars, C.size_t(nbits), - (*C.limb_t)(&scratch[0]), - C.size_t(y), C.size_t(window)) - - if atomic.AddInt32(&rowSync[y/window], 1) == int32(nx) { - msgsCh <- y // "row" is done - } else { - runtime.Gosched() // be nice to the application - } - } - - pointsBySlice[0] = nil - scalarsBySlice[0] = nil - }() - } - - var ret P2 - rows := make([]bool, ny) - row := 0 // actually index in |grid[]| - for i := 0; i < ny; i++ { // we expect |ny| messages, one per "row" - y := <-msgsCh - rows[y/window] = true // mark the "row" - for grid[row].y == y { // if it's current "row", process it - for row < total && grid[row].y == y { - C.blst_p2_add_or_double(&ret, &ret, &grid[row].point) - row++ - } - if y == 0 { - break // one can as well 'return &ret' here - } - for j := 0; j < window; j++ { - C.blst_p2_double(&ret, &ret) - } - y -= window - if !rows[y/window] { // see if next "row" was marked already - break - } - } - } - - for i := range scalars { - scalars[i] = nil - } - - return &ret -} - -func (points P2Affines) Mult(scalarsIf interface{}, nbits int) *P2 { - return P2AffinesMult(points, scalarsIf, nbits) -} - -func (points P2s) Mult(scalarsIf interface{}, nbits int) *P2 { - return points.ToAffine().Mult(scalarsIf, nbits) -} - -func parseOpts(optional ...interface{}) ([]byte, [][]byte, bool, bool) { - var aug [][]byte // For aggregate verify - var augSingle []byte // For signing - useHash := true // hash (true), encode (false) - - for _, arg := range optional { - switch v := arg.(type) { - case []byte: - augSingle = v - case [][]byte: - aug = v - case bool: - useHash = v - default: - return nil, nil, useHash, false - } - } - return augSingle, aug, useHash, true -} - -func bytesAllZero(s []byte) bool { - for _, v := range s { - if v != 0 { - return false - } - } - return true -} - -// These methods are inefficient because of cgo call overhead. For this -// reason they should be used primarily for prototyping with a goal to -// formulate interfaces that would process multiple scalars per cgo call. -func (a *Scalar) MulAssign(b *Scalar) (*Scalar, bool) { - return a, bool(C.blst_sk_mul_n_check(a, a, b)) -} - -func (a *Scalar) Mul(b *Scalar) (*Scalar, bool) { - var ret Scalar - return &ret, bool(C.blst_sk_mul_n_check(&ret, a, b)) -} - -func (a *Scalar) AddAssign(b *Scalar) (*Scalar, bool) { - return a, bool(C.blst_sk_add_n_check(a, a, b)) -} - -func (a *Scalar) Add(b *Scalar) (*Scalar, bool) { - var ret Scalar - return &ret, bool(C.blst_sk_add_n_check(&ret, a, b)) -} - -func (a *Scalar) SubAssign(b *Scalar) (*Scalar, bool) { - return a, bool(C.blst_sk_sub_n_check(a, a, b)) -} - -func (a *Scalar) Sub(b *Scalar) (*Scalar, bool) { - var ret Scalar - return &ret, bool(C.blst_sk_sub_n_check(&ret, a, b)) -} - -func (a *Scalar) Inverse() *Scalar { - var ret Scalar - C.blst_sk_inverse(&ret, a) - return &ret -} - -// -// Serialization/Deserialization. 
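The row-collection loop above folds per-window partial sums into the final result, doubling the accumulator `window` times before each lower row is added. A toy scalar version of the same fold, with left-shifts standing in for point doublings:

```go
package main

import "fmt"

// combineRows mirrors the tail of P2AffinesMult: one partial sum per
// bit-window, folded top-down, shifting (standing in for 'window' point
// doublings) before each lower row is added.
func combineRows(rows []uint64, window int) uint64 {
	var acc uint64
	for i := len(rows) - 1; i >= 0; i-- {
		acc <<= uint(window)
		acc += rows[i]
	}
	return acc
}

func main() {
	// scalar 0b1101_0110 split into 4-bit windows: low row 0x6, high row 0xD
	fmt.Printf("%#x\n", combineRows([]uint64{0x6, 0xD}, 4)) // expect 0xd6
}
```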
-// - -// Scalar serdes -func (s *Scalar) Serialize() []byte { - var out [BLST_SCALAR_BYTES]byte - C.blst_bendian_from_scalar((*C.byte)(&out[0]), s) - return out[:] -} - -func (s *Scalar) Deserialize(in []byte) *Scalar { - if len(in) != BLST_SCALAR_BYTES || - !C.go_scalar_from_bendian(s, (*C.byte)(&in[0])) { - return nil - } - return s -} - -func (s *Scalar) Valid() bool { - return bool(C.blst_sk_check(s)) -} - -func (s *Scalar) HashTo(msg []byte, dst []byte) bool { - ret := HashToScalar(msg, dst) - if ret != nil { - *s = *ret - return true - } - return false -} - -func HashToScalar(msg []byte, dst []byte) *Scalar { - var ret Scalar - - var msgC *C.byte - if len(msg) > 0 { - msgC = (*C.byte)(&msg[0]) - } - - var dstC *C.byte - if len(dst) > 0 { - dstC = (*C.byte)(&dst[0]) - } - - if C.go_hash_to_scalar(&ret, msgC, C.size_t(len(msg)), - dstC, C.size_t(len(dst))) { - return &ret - } - - return nil -} - -// -// LEndian -// - -func (fr *Scalar) ToLEndian() []byte { - var arr [BLST_SCALAR_BYTES]byte - C.blst_lendian_from_scalar((*C.byte)(&arr[0]), fr) - return arr[:] -} - -func (fp *Fp) ToLEndian() []byte { - var arr [BLST_FP_BYTES]byte - C.blst_lendian_from_fp((*C.byte)(&arr[0]), fp) - return arr[:] -} - -func (fr *Scalar) FromLEndian(arr []byte) *Scalar { - nbytes := len(arr) - if nbytes < BLST_SCALAR_BYTES || - !C.blst_scalar_from_le_bytes(fr, (*C.byte)(&arr[0]), C.size_t(nbytes)) { - return nil - } - return fr -} - -func (fp *Fp) FromLEndian(arr []byte) *Fp { - if len(arr) != BLST_FP_BYTES { - return nil - } - C.blst_fp_from_lendian(fp, (*C.byte)(&arr[0])) - return fp -} - -// -// BEndian -// - -func (fr *Scalar) ToBEndian() []byte { - var arr [BLST_SCALAR_BYTES]byte - C.blst_bendian_from_scalar((*C.byte)(&arr[0]), fr) - return arr[:] -} - -func (fp *Fp) ToBEndian() []byte { - var arr [BLST_FP_BYTES]byte - C.blst_bendian_from_fp((*C.byte)(&arr[0]), fp) - return arr[:] -} - -func (fr *Scalar) FromBEndian(arr []byte) *Scalar { - nbytes := len(arr) - if nbytes < BLST_SCALAR_BYTES || - !C.blst_scalar_from_be_bytes(fr, (*C.byte)(&arr[0]), C.size_t(nbytes)) { - return nil - } - return fr -} - -func (fp *Fp) FromBEndian(arr []byte) *Fp { - if len(arr) != BLST_FP_BYTES { - return nil - } - C.blst_fp_from_bendian(fp, (*C.byte)(&arr[0])) - return fp -} - -// -// Printing -// - -func PrintBytes(val []byte, name string) { - fmt.Printf("%s = %02x\n", name, val) -} - -func (s *Scalar) Print(name string) { - arr := s.ToBEndian() - PrintBytes(arr[:], name) -} - -func (p *P1Affine) Print(name string) { - fmt.Printf("%s:\n", name) - arr := p.x.ToBEndian() - PrintBytes(arr, " x") - arr = p.y.ToBEndian() - PrintBytes(arr, " y") -} - -func (p *P1) Print(name string) { - fmt.Printf("%s:\n", name) - aff := p.ToAffine() - aff.Print(name) -} - -func (f *Fp2) Print(name string) { - fmt.Printf("%s:\n", name) - arr := f.fp[0].ToBEndian() - PrintBytes(arr, " 0") - arr = f.fp[1].ToBEndian() - PrintBytes(arr, " 1") -} - -func (p *P2Affine) Print(name string) { - fmt.Printf("%s:\n", name) - p.x.Print(" x") - p.y.Print(" y") -} - -func (p *P2) Print(name string) { - fmt.Printf("%s:\n", name) - aff := p.ToAffine() - aff.Print(name) -} - -// -// Equality -// - -func (s1 *Scalar) Equals(s2 *Scalar) bool { - return *s1 == *s2 -} - -func (e1 *Fp) Equals(e2 *Fp) bool { - return *e1 == *e2 -} - -func (e1 *Fp2) Equals(e2 *Fp2) bool { - return *e1 == *e2 -} - -func (e1 *P1Affine) Equals(e2 *P1Affine) bool { - return bool(C.blst_p1_affine_is_equal(e1, e2)) -} - -func (e1 *P1) Equals(e2 *P1) bool { - return 
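The `To/FromLEndian` and `To/FromBEndian` helpers above expose scalars and field elements in both byte orders. A quick standard-library reminder of what the two orders look like for the same value:

```go
package main

import (
	"encoding/binary"
	"fmt"
)

func main() {
	// The same 64-bit value in the two byte orders used by the
	// To/FromLEndian and To/FromBEndian helpers above.
	var le, be [8]byte
	binary.LittleEndian.PutUint64(le[:], 0x0102030405060708)
	binary.BigEndian.PutUint64(be[:], 0x0102030405060708)
	fmt.Printf("little-endian: % x\n", le) // 08 07 06 05 04 03 02 01
	fmt.Printf("big-endian:    % x\n", be) // 01 02 03 04 05 06 07 08
}
```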
bool(C.blst_p1_is_equal(e1, e2)) -} - -func (e1 *P2Affine) Equals(e2 *P2Affine) bool { - return bool(C.blst_p2_affine_is_equal(e1, e2)) -} - -func (e1 *P2) Equals(e2 *P2) bool { - return bool(C.blst_p2_is_equal(e1, e2)) -} - -// private thunk for testing - -func expandMessageXmd(msg []byte, dst []byte, len_in_bytes int) []byte { - ret := make([]byte, len_in_bytes) - - var msgC *C.byte - if len(msg) > 0 { - msgC = (*C.byte)(&msg[0]) - } - - var dstC *C.byte - if len(dst) > 0 { - dstC = (*C.byte)(&dst[0]) - } - - C.blst_expand_message_xmd((*C.byte)(&ret[0]), C.size_t(len(ret)), - msgC, C.size_t(len(msg)), - dstC, C.size_t(len(dst))) - return ret -} - -func breakdown(nbits, window, ncpus int) (int, int, int) { - var nx, ny, wnd int - - if nbits > window*ncpus { - nx = 1 - wnd = bits.Len(uint(ncpus) / 4) - if (window + wnd) > 18 { - wnd = window - wnd - } else { - wnd = (nbits/window + ncpus - 1) / ncpus - if (nbits/(window+1)+ncpus-1)/ncpus < wnd { - wnd = window + 1 - } else { - wnd = window - } - } - } else { - nx = 2 - wnd = window - 2 - for (nbits/wnd+1)*nx < ncpus { - nx += 1 - wnd = window - bits.Len(3*uint(nx)/2) - } - nx -= 1 - wnd = window - bits.Len(3*uint(nx)/2) - } - ny = nbits/wnd + 1 - wnd = nbits/ny + 1 - - return nx, ny, wnd -} - -func pippenger_window_size(npoints int) int { - wbits := bits.Len(uint(npoints)) - - if wbits > 13 { - return wbits - 4 - } - if wbits > 5 { - return wbits - 3 - } - return 2 -} diff --git a/crypto/internal/blst/blst.h b/crypto/internal/blst/blst.h deleted file mode 100644 index 1349896a3f8..00000000000 --- a/crypto/internal/blst/blst.h +++ /dev/null @@ -1,482 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef __BLST_H__ -#define __BLST_H__ - -#ifdef __SIZE_TYPE__ -typedef __SIZE_TYPE__ size_t; -#else -#include -#endif - -#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \ - && defined(__UINT64_TYPE__) -typedef __UINT8_TYPE__ uint8_t; -typedef __UINT32_TYPE__ uint32_t; -typedef __UINT64_TYPE__ uint64_t; -#else -#include -#endif - -#ifdef __cplusplus -extern "C" { -#elif defined(__BLST_CGO__) -typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ -#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901 -# define bool _Bool -#else -# define bool int -#endif - -#ifdef SWIG -# define DEFNULL =NULL -#elif defined __cplusplus -# define DEFNULL =0 -#else -# define DEFNULL -#endif - -typedef enum { - BLST_SUCCESS = 0, - BLST_BAD_ENCODING, - BLST_POINT_NOT_ON_CURVE, - BLST_POINT_NOT_IN_GROUP, - BLST_AGGR_TYPE_MISMATCH, - BLST_VERIFY_FAIL, - BLST_PK_IS_INFINITY, - BLST_BAD_SCALAR, -} BLST_ERROR; - -typedef uint8_t byte; -typedef uint64_t limb_t; - -typedef struct { byte b[256/8]; } blst_scalar; -typedef struct { limb_t l[256/8/sizeof(limb_t)]; } blst_fr; -typedef struct { limb_t l[384/8/sizeof(limb_t)]; } blst_fp; -/* 0 is "real" part, 1 is "imaginary" */ -typedef struct { blst_fp fp[2]; } blst_fp2; -typedef struct { blst_fp2 fp2[3]; } blst_fp6; -typedef struct { blst_fp6 fp6[2]; } blst_fp12; - -void blst_scalar_from_uint32(blst_scalar *out, const uint32_t a[8]); -void blst_uint32_from_scalar(uint32_t out[8], const blst_scalar *a); -void blst_scalar_from_uint64(blst_scalar *out, const uint64_t a[4]); -void blst_uint64_from_scalar(uint64_t out[4], const blst_scalar *a); -void blst_scalar_from_bendian(blst_scalar *out, const byte a[32]); -void blst_bendian_from_scalar(byte out[32], const blst_scalar *a); 
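`pippenger_window_size`, defined in the Go file just above, picks the bucket-window width from log2(npoints). A standalone copy that prints the chosen width for a few sizes; the values follow directly from its branches:

```go
package main

import (
	"fmt"
	"math/bits"
)

// windowSize reproduces pippenger_window_size above: the window width grows
// with log2(npoints), tapering differently for very large inputs.
func windowSize(npoints int) int {
	wbits := bits.Len(uint(npoints))
	if wbits > 13 {
		return wbits - 4
	}
	if wbits > 5 {
		return wbits - 3
	}
	return 2
}

func main() {
	for _, n := range []int{16, 1 << 10, 1 << 16} {
		fmt.Printf("npoints=%6d window=%d\n", n, windowSize(n)) // 2, 8, 13
	}
}
```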
-void blst_scalar_from_lendian(blst_scalar *out, const byte a[32]); -void blst_lendian_from_scalar(byte out[32], const blst_scalar *a); -bool blst_scalar_fr_check(const blst_scalar *a); -bool blst_sk_check(const blst_scalar *a); -bool blst_sk_add_n_check(blst_scalar *out, const blst_scalar *a, - const blst_scalar *b); -bool blst_sk_sub_n_check(blst_scalar *out, const blst_scalar *a, - const blst_scalar *b); -bool blst_sk_mul_n_check(blst_scalar *out, const blst_scalar *a, - const blst_scalar *b); -void blst_sk_inverse(blst_scalar *out, const blst_scalar *a); -bool blst_scalar_from_le_bytes(blst_scalar *out, const byte *in, size_t len); -bool blst_scalar_from_be_bytes(blst_scalar *out, const byte *in, size_t len); - -#ifndef SWIG -/* - * BLS12-381-specific Fr operations. - */ -void blst_fr_add(blst_fr *ret, const blst_fr *a, const blst_fr *b); -void blst_fr_sub(blst_fr *ret, const blst_fr *a, const blst_fr *b); -void blst_fr_mul_by_3(blst_fr *ret, const blst_fr *a); -void blst_fr_lshift(blst_fr *ret, const blst_fr *a, size_t count); -void blst_fr_rshift(blst_fr *ret, const blst_fr *a, size_t count); -void blst_fr_mul(blst_fr *ret, const blst_fr *a, const blst_fr *b); -void blst_fr_sqr(blst_fr *ret, const blst_fr *a); -void blst_fr_cneg(blst_fr *ret, const blst_fr *a, bool flag); -void blst_fr_eucl_inverse(blst_fr *ret, const blst_fr *a); -void blst_fr_inverse(blst_fr *ret, const blst_fr *a); - -void blst_fr_from_uint64(blst_fr *ret, const uint64_t a[4]); -void blst_uint64_from_fr(uint64_t ret[4], const blst_fr *a); -void blst_fr_from_scalar(blst_fr *ret, const blst_scalar *a); -void blst_scalar_from_fr(blst_scalar *ret, const blst_fr *a); - -/* - * BLS12-381-specific Fp operations. - */ -void blst_fp_add(blst_fp *ret, const blst_fp *a, const blst_fp *b); -void blst_fp_sub(blst_fp *ret, const blst_fp *a, const blst_fp *b); -void blst_fp_mul_by_3(blst_fp *ret, const blst_fp *a); -void blst_fp_mul_by_8(blst_fp *ret, const blst_fp *a); -void blst_fp_lshift(blst_fp *ret, const blst_fp *a, size_t count); -void blst_fp_mul(blst_fp *ret, const blst_fp *a, const blst_fp *b); -void blst_fp_sqr(blst_fp *ret, const blst_fp *a); -void blst_fp_cneg(blst_fp *ret, const blst_fp *a, bool flag); -void blst_fp_eucl_inverse(blst_fp *ret, const blst_fp *a); -void blst_fp_inverse(blst_fp *ret, const blst_fp *a); -bool blst_fp_sqrt(blst_fp *ret, const blst_fp *a); - -void blst_fp_from_uint32(blst_fp *ret, const uint32_t a[12]); -void blst_uint32_from_fp(uint32_t ret[12], const blst_fp *a); -void blst_fp_from_uint64(blst_fp *ret, const uint64_t a[6]); -void blst_uint64_from_fp(uint64_t ret[6], const blst_fp *a); -void blst_fp_from_bendian(blst_fp *ret, const byte a[48]); -void blst_bendian_from_fp(byte ret[48], const blst_fp *a); -void blst_fp_from_lendian(blst_fp *ret, const byte a[48]); -void blst_lendian_from_fp(byte ret[48], const blst_fp *a); - -/* - * BLS12-381-specific Fp2 operations. 
- */ -void blst_fp2_add(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); -void blst_fp2_sub(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); -void blst_fp2_mul_by_3(blst_fp2 *ret, const blst_fp2 *a); -void blst_fp2_mul_by_8(blst_fp2 *ret, const blst_fp2 *a); -void blst_fp2_lshift(blst_fp2 *ret, const blst_fp2 *a, size_t count); -void blst_fp2_mul(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); -void blst_fp2_sqr(blst_fp2 *ret, const blst_fp2 *a); -void blst_fp2_cneg(blst_fp2 *ret, const blst_fp2 *a, bool flag); -void blst_fp2_eucl_inverse(blst_fp2 *ret, const blst_fp2 *a); -void blst_fp2_inverse(blst_fp2 *ret, const blst_fp2 *a); -bool blst_fp2_sqrt(blst_fp2 *ret, const blst_fp2 *a); - -/* - * BLS12-381-specific Fp12 operations. - */ -void blst_fp12_sqr(blst_fp12 *ret, const blst_fp12 *a); -void blst_fp12_cyclotomic_sqr(blst_fp12 *ret, const blst_fp12 *a); -void blst_fp12_mul(blst_fp12 *ret, const blst_fp12 *a, const blst_fp12 *b); -void blst_fp12_mul_by_xy00z0(blst_fp12 *ret, const blst_fp12 *a, - const blst_fp6 *xy00z0); -void blst_fp12_conjugate(blst_fp12 *a); -void blst_fp12_inverse(blst_fp12 *ret, const blst_fp12 *a); -/* caveat lector! |n| has to be non-zero and not more than 3! */ -void blst_fp12_frobenius_map(blst_fp12 *ret, const blst_fp12 *a, size_t n); -bool blst_fp12_is_equal(const blst_fp12 *a, const blst_fp12 *b); -bool blst_fp12_is_one(const blst_fp12 *a); -bool blst_fp12_in_group(const blst_fp12 *a); -const blst_fp12 *blst_fp12_one(void); -#endif // SWIG - -/* - * BLS12-381-specific point operations. - */ -typedef struct { blst_fp x, y, z; } blst_p1; -typedef struct { blst_fp x, y; } blst_p1_affine; - -void blst_p1_add(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); -void blst_p1_add_or_double(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); -void blst_p1_add_affine(blst_p1 *out, const blst_p1 *a, - const blst_p1_affine *b); -void blst_p1_add_or_double_affine(blst_p1 *out, const blst_p1 *a, - const blst_p1_affine *b); -void blst_p1_double(blst_p1 *out, const blst_p1 *a); -void blst_p1_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, - size_t nbits); -void blst_p1_cneg(blst_p1 *p, bool cbit); -void blst_p1_to_affine(blst_p1_affine *out, const blst_p1 *in); -void blst_p1_from_affine(blst_p1 *out, const blst_p1_affine *in); -bool blst_p1_on_curve(const blst_p1 *p); -bool blst_p1_in_g1(const blst_p1 *p); -bool blst_p1_is_equal(const blst_p1 *a, const blst_p1 *b); -bool blst_p1_is_inf(const blst_p1 *a); -const blst_p1 *blst_p1_generator(void); - -bool blst_p1_affine_on_curve(const blst_p1_affine *p); -bool blst_p1_affine_in_g1(const blst_p1_affine *p); -bool blst_p1_affine_is_equal(const blst_p1_affine *a, const blst_p1_affine *b); -bool blst_p1_affine_is_inf(const blst_p1_affine *a); -const blst_p1_affine *blst_p1_affine_generator(void); - -typedef struct { blst_fp2 x, y, z; } blst_p2; -typedef struct { blst_fp2 x, y; } blst_p2_affine; - -void blst_p2_add(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); -void blst_p2_add_or_double(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); -void blst_p2_add_affine(blst_p2 *out, const blst_p2 *a, - const blst_p2_affine *b); -void blst_p2_add_or_double_affine(blst_p2 *out, const blst_p2 *a, - const blst_p2_affine *b); -void blst_p2_double(blst_p2 *out, const blst_p2 *a); -void blst_p2_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, - size_t nbits); -void blst_p2_cneg(blst_p2 *p, bool cbit); -void blst_p2_to_affine(blst_p2_affine *out, const blst_p2 *in); -void blst_p2_from_affine(blst_p2 *out, const 
blst_p2_affine *in); -bool blst_p2_on_curve(const blst_p2 *p); -bool blst_p2_in_g2(const blst_p2 *p); -bool blst_p2_is_equal(const blst_p2 *a, const blst_p2 *b); -bool blst_p2_is_inf(const blst_p2 *a); -const blst_p2 *blst_p2_generator(void); - -bool blst_p2_affine_on_curve(const blst_p2_affine *p); -bool blst_p2_affine_in_g2(const blst_p2_affine *p); -bool blst_p2_affine_is_equal(const blst_p2_affine *a, const blst_p2_affine *b); -bool blst_p2_affine_is_inf(const blst_p2_affine *a); -const blst_p2_affine *blst_p2_affine_generator(void); - -/* - * Multi-scalar multiplications and other multi-point operations. - */ - -void blst_p1s_to_affine(blst_p1_affine dst[], const blst_p1 *const points[], - size_t npoints); -void blst_p1s_add(blst_p1 *ret, const blst_p1_affine *const points[], - size_t npoints); - -size_t blst_p1s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); -void blst_p1s_mult_wbits_precompute(blst_p1_affine table[], size_t wbits, - const blst_p1_affine *const points[], - size_t npoints); -size_t blst_p1s_mult_wbits_scratch_sizeof(size_t npoints); -void blst_p1s_mult_wbits(blst_p1 *ret, const blst_p1_affine table[], - size_t wbits, size_t npoints, - const byte *const scalars[], size_t nbits, - limb_t *scratch); - -size_t blst_p1s_mult_pippenger_scratch_sizeof(size_t npoints); -void blst_p1s_mult_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], - size_t npoints, const byte *const scalars[], - size_t nbits, limb_t *scratch); -void blst_p1s_tile_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], - size_t npoints, const byte *const scalars[], - size_t nbits, limb_t *scratch, - size_t bit0, size_t window); - -void blst_p2s_to_affine(blst_p2_affine dst[], const blst_p2 *const points[], - size_t npoints); -void blst_p2s_add(blst_p2 *ret, const blst_p2_affine *const points[], - size_t npoints); - -size_t blst_p2s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); -void blst_p2s_mult_wbits_precompute(blst_p2_affine table[], size_t wbits, - const blst_p2_affine *const points[], - size_t npoints); -size_t blst_p2s_mult_wbits_scratch_sizeof(size_t npoints); -void blst_p2s_mult_wbits(blst_p2 *ret, const blst_p2_affine table[], - size_t wbits, size_t npoints, - const byte *const scalars[], size_t nbits, - limb_t *scratch); - -size_t blst_p2s_mult_pippenger_scratch_sizeof(size_t npoints); -void blst_p2s_mult_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], - size_t npoints, const byte *const scalars[], - size_t nbits, limb_t *scratch); -void blst_p2s_tile_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], - size_t npoints, const byte *const scalars[], - size_t nbits, limb_t *scratch, - size_t bit0, size_t window); - -/* - * Hash-to-curve operations. 
- */ -#ifndef SWIG -void blst_map_to_g1(blst_p1 *out, const blst_fp *u, const blst_fp *v DEFNULL); -void blst_map_to_g2(blst_p2 *out, const blst_fp2 *u, const blst_fp2 *v DEFNULL); -#endif - -void blst_encode_to_g1(blst_p1 *out, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, size_t DST_len DEFNULL, - const byte *aug DEFNULL, size_t aug_len DEFNULL); -void blst_hash_to_g1(blst_p1 *out, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, size_t DST_len DEFNULL, - const byte *aug DEFNULL, size_t aug_len DEFNULL); - -void blst_encode_to_g2(blst_p2 *out, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, size_t DST_len DEFNULL, - const byte *aug DEFNULL, size_t aug_len DEFNULL); -void blst_hash_to_g2(blst_p2 *out, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, size_t DST_len DEFNULL, - const byte *aug DEFNULL, size_t aug_len DEFNULL); - -/* - * Zcash-compatible serialization/deserialization. - */ -void blst_p1_serialize(byte out[96], const blst_p1 *in); -void blst_p1_compress(byte out[48], const blst_p1 *in); -void blst_p1_affine_serialize(byte out[96], const blst_p1_affine *in); -void blst_p1_affine_compress(byte out[48], const blst_p1_affine *in); -BLST_ERROR blst_p1_uncompress(blst_p1_affine *out, const byte in[48]); -BLST_ERROR blst_p1_deserialize(blst_p1_affine *out, const byte in[96]); - -void blst_p2_serialize(byte out[192], const blst_p2 *in); -void blst_p2_compress(byte out[96], const blst_p2 *in); -void blst_p2_affine_serialize(byte out[192], const blst_p2_affine *in); -void blst_p2_affine_compress(byte out[96], const blst_p2_affine *in); -BLST_ERROR blst_p2_uncompress(blst_p2_affine *out, const byte in[96]); -BLST_ERROR blst_p2_deserialize(blst_p2_affine *out, const byte in[192]); - -/* - * Specification defines two variants, 'minimal-signature-size' and - * 'minimal-pubkey-size'. To unify appearance we choose to distinguish - * them by suffix referring to the public key type, more specifically - * _pk_in_g1 corresponds to 'minimal-pubkey-size' and _pk_in_g2 - to - * 'minimal-signature-size'. It might appear a bit counterintuitive - * in sign call, but no matter how you twist it, something is bound to - * turn a little odd. - */ -/* - * Secret-key operations. - */ -void blst_keygen(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, - const byte *info DEFNULL, size_t info_len DEFNULL); -void blst_sk_to_pk_in_g1(blst_p1 *out_pk, const blst_scalar *SK); -void blst_sign_pk_in_g1(blst_p2 *out_sig, const blst_p2 *hash, - const blst_scalar *SK); -void blst_sk_to_pk_in_g2(blst_p2 *out_pk, const blst_scalar *SK); -void blst_sign_pk_in_g2(blst_p1 *out_sig, const blst_p1 *hash, - const blst_scalar *SK); - -/* - * Pairing interface. 
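The secret-key declarations above are the entry points of the 'minimal-pubkey-size' flow (public key in G1, signature in G2). A hypothetical cgo sketch of keygen, public-key derivation, and signing; the `#cgo` paths assume a local blst checkout and are illustrative only:

```go
package main

/*
// Hypothetical build flags: point these at a local blst checkout.
#cgo CFLAGS: -I./blst/bindings
#cgo LDFLAGS: ./blst/libblst.a
#include "blst.h"
*/
import "C"

import "fmt"

func main() {
	ikm := make([]byte, 32) // demo only: use real entropy in practice
	var sk C.blst_scalar
	C.blst_keygen(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), nil, 0)

	var pk C.blst_p1
	C.blst_sk_to_pk_in_g1(&pk, &sk)

	msg := []byte("hello")
	dst := []byte("BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_")
	var hash C.blst_p2
	C.blst_hash_to_g2(&hash, (*C.byte)(&msg[0]), C.size_t(len(msg)),
		(*C.byte)(&dst[0]), C.size_t(len(dst)), nil, 0)

	var sig C.blst_p2
	C.blst_sign_pk_in_g1(&sig, &hash, &sk)

	fmt.Println("signed; verification would go through",
		"blst_core_verify_pk_in_g1 on the affine encodings")
}
```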
- */ -#ifndef SWIG -void blst_miller_loop(blst_fp12 *ret, const blst_p2_affine *Q, - const blst_p1_affine *P); -void blst_miller_loop_n(blst_fp12 *ret, const blst_p2_affine *const Qs[], - const blst_p1_affine *const Ps[], - size_t n); -void blst_final_exp(blst_fp12 *ret, const blst_fp12 *f); -void blst_precompute_lines(blst_fp6 Qlines[68], const blst_p2_affine *Q); -void blst_miller_loop_lines(blst_fp12 *ret, const blst_fp6 Qlines[68], - const blst_p1_affine *P); -bool blst_fp12_finalverify(const blst_fp12 *gt1, const blst_fp12 *gt2); -#endif - -#ifdef __BLST_CGO__ -typedef limb_t blst_pairing; -#elif defined(__BLST_RUST_BINDGEN__) -typedef struct {} blst_pairing; -#else -typedef struct blst_opaque blst_pairing; -#endif - -size_t blst_pairing_sizeof(void); -void blst_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, - const byte *DST DEFNULL, size_t DST_len DEFNULL); -const byte *blst_pairing_get_dst(const blst_pairing *ctx); -void blst_pairing_commit(blst_pairing *ctx); -BLST_ERROR blst_pairing_aggregate_pk_in_g2(blst_pairing *ctx, - const blst_p2_affine *PK, - const blst_p1_affine *signature, - const byte *msg, size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(blst_pairing *ctx, - const blst_p2_affine *PK, - bool pk_grpchk, - const blst_p1_affine *signature, - bool sig_grpchk, - const byte *msg, size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(blst_pairing *ctx, - const blst_p2_affine *PK, - const blst_p1_affine *sig, - const byte *scalar, - size_t nbits, - const byte *msg, - size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(blst_pairing *ctx, - const blst_p2_affine *PK, - bool pk_grpchk, - const blst_p1_affine *sig, - bool sig_grpchk, - const byte *scalar, - size_t nbits, - const byte *msg, - size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_aggregate_pk_in_g1(blst_pairing *ctx, - const blst_p1_affine *PK, - const blst_p2_affine *signature, - const byte *msg, size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(blst_pairing *ctx, - const blst_p1_affine *PK, - bool pk_grpchk, - const blst_p2_affine *signature, - bool sig_grpchk, - const byte *msg, size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(blst_pairing *ctx, - const blst_p1_affine *PK, - const blst_p2_affine *sig, - const byte *scalar, - size_t nbits, - const byte *msg, - size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(blst_pairing *ctx, - const blst_p1_affine *PK, - bool pk_grpchk, - const blst_p2_affine *sig, - bool sig_grpchk, - const byte *scalar, - size_t nbits, - const byte *msg, - size_t msg_len, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_pairing_merge(blst_pairing *ctx, const blst_pairing *ctx1); -bool blst_pairing_finalverify(const blst_pairing *ctx, - const blst_fp12 *gtsig DEFNULL); - - -/* - * Customarily applications aggregate signatures separately. - * In which case application would have to pass NULLs for |signature| - * to blst_pairing_aggregate calls and pass aggregated signature - * collected with these calls to blst_pairing_finalverify. Inputs are - * Zcash-compatible "straight-from-wire" byte vectors, compressed or - * not. 
- */ -BLST_ERROR blst_aggregate_in_g1(blst_p1 *out, const blst_p1 *in, - const byte *zwire); -BLST_ERROR blst_aggregate_in_g2(blst_p2 *out, const blst_p2 *in, - const byte *zwire); - -void blst_aggregated_in_g1(blst_fp12 *out, const blst_p1_affine *signature); -void blst_aggregated_in_g2(blst_fp12 *out, const blst_p2_affine *signature); - -/* - * "One-shot" CoreVerify entry points. - */ -BLST_ERROR blst_core_verify_pk_in_g1(const blst_p1_affine *pk, - const blst_p2_affine *signature, - bool hash_or_encode, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, - size_t DST_len DEFNULL, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); -BLST_ERROR blst_core_verify_pk_in_g2(const blst_p2_affine *pk, - const blst_p1_affine *signature, - bool hash_or_encode, - const byte *msg, size_t msg_len, - const byte *DST DEFNULL, - size_t DST_len DEFNULL, - const byte *aug DEFNULL, - size_t aug_len DEFNULL); - -extern const blst_p1_affine BLS12_381_G1; -extern const blst_p1_affine BLS12_381_NEG_G1; -extern const blst_p2_affine BLS12_381_G2; -extern const blst_p2_affine BLS12_381_NEG_G2; - -#include "blst_aux.h" - -#ifdef __cplusplus -} -#endif -#endif diff --git a/crypto/internal/blst/blst_aux.h b/crypto/internal/blst/blst_aux.h deleted file mode 100644 index 3de0850e330..00000000000 --- a/crypto/internal/blst/blst_aux.h +++ /dev/null @@ -1,117 +0,0 @@ -/* - * Copyright Supranational LLC - * Licensed under the Apache License, Version 2.0, see LICENSE for details. - * SPDX-License-Identifier: Apache-2.0 - */ -#ifndef __BLST_AUX_H__ -#define __BLST_AUX_H__ -/* - * This file lists interfaces that might be promoted to blst.h or removed, - * depending on their proven/unproven worthiness. - */ - -void blst_fr_ct_bfly(blst_fr *x0, blst_fr *x1, const blst_fr *twiddle); -void blst_fr_gs_bfly(blst_fr *x0, blst_fr *x1, const blst_fr *twiddle); -void blst_fr_to(blst_fr *ret, const blst_fr *a); -void blst_fr_from(blst_fr *ret, const blst_fr *a); -#ifdef BLST_FR_PENTAROOT -void blst_fr_pentaroot(blst_fr *ret, const blst_fr *a); -void blst_fr_pentapow(blst_fr *ret, const blst_fr *a); -#endif - -void blst_fp_to(blst_fp *ret, const blst_fp *a); -void blst_fp_from(blst_fp *ret, const blst_fp *a); - -bool blst_fp_is_square(const blst_fp *a); -bool blst_fp2_is_square(const blst_fp2 *a); - -void blst_p1_from_jacobian(blst_p1 *out, const blst_p1 *in); -void blst_p2_from_jacobian(blst_p2 *out, const blst_p2 *in); - -/* - * Below functions produce both point and deserialized outcome of - * SkToPk and Sign. However, deserialized outputs are pre-decorated - * with sign and infinity bits. This means that you have to bring the - * output into compliance prior returning to application. If you want - * compressed point value, then do [equivalent of] - * - * byte temp[96]; - * blst_sk_to_pk2_in_g1(temp, out_pk, SK); - * temp[0] |= 0x80; - * memcpy(out, temp, 48); - * - * Otherwise do - * - * blst_sk_to_pk2_in_g1(out, out_pk, SK); - * out[0] &= ~0x20; - * - * Either |out| or |out_| can be NULL. 
- */ -void blst_sk_to_pk2_in_g1(byte out[96], blst_p1_affine *out_pk, - const blst_scalar *SK); -void blst_sign_pk2_in_g1(byte out[192], blst_p2_affine *out_sig, - const blst_p2 *hash, const blst_scalar *SK); -void blst_sk_to_pk2_in_g2(byte out[192], blst_p2_affine *out_pk, - const blst_scalar *SK); -void blst_sign_pk2_in_g2(byte out[96], blst_p1_affine *out_sig, - const blst_p1 *hash, const blst_scalar *SK); - -#ifdef __BLST_RUST_BINDGEN__ -typedef struct {} blst_uniq; -#else -typedef struct blst_opaque blst_uniq; -#endif - -size_t blst_uniq_sizeof(size_t n_nodes); -void blst_uniq_init(blst_uniq *tree); -bool blst_uniq_test(blst_uniq *tree, const byte *msg, size_t len); - -#ifdef expand_message_xmd -void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, - const unsigned char *aug, size_t aug_len, - const unsigned char *msg, size_t msg_len, - const unsigned char *DST, size_t DST_len); -#else -void blst_expand_message_xmd(byte *out, size_t out_len, - const byte *msg, size_t msg_len, - const byte *DST, size_t DST_len); -#endif - -void blst_p1_unchecked_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, - size_t nbits); -void blst_p2_unchecked_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, - size_t nbits); - -void blst_pairing_raw_aggregate(blst_pairing *ctx, const blst_p2_affine *q, - const blst_p1_affine *p); -blst_fp12 *blst_pairing_as_fp12(blst_pairing *ctx); -void blst_bendian_from_fp12(byte out[48*12], const blst_fp12 *a); - -void blst_keygen_v3(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, - const byte *info DEFNULL, size_t info_len DEFNULL); -void blst_keygen_v4_5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, - const byte *salt, size_t salt_len, - const byte *info DEFNULL, size_t info_len DEFNULL); -void blst_keygen_v5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, - const byte *salt, size_t salt_len, - const byte *info DEFNULL, size_t info_len DEFNULL); -void blst_derive_master_eip2333(blst_scalar *out_SK, - const byte *IKM, size_t IKM_len); -void blst_derive_child_eip2333(blst_scalar *out_SK, const blst_scalar *SK, - uint32_t child_index); - -void blst_scalar_from_hexascii(blst_scalar *out, const byte *hex); -void blst_fr_from_hexascii(blst_fr *ret, const byte *hex); -void blst_fp_from_hexascii(blst_fp *ret, const byte *hex); - -size_t blst_p1_sizeof(void); -size_t blst_p1_affine_sizeof(void); -size_t blst_p2_sizeof(void); -size_t blst_p2_affine_sizeof(void); -size_t blst_fp12_sizeof(void); - -/* - * Single-shot SHA-256 hash function. - */ -void blst_sha256(byte out[32], const byte *msg, size_t msg_len); -#endif diff --git a/crypto/random/chacha20.go b/crypto/random/chacha20.go deleted file mode 100644 index ae834057b81..00000000000 --- a/crypto/random/chacha20.go +++ /dev/null @@ -1,194 +0,0 @@ -package random - -import ( - "encoding/binary" - "fmt" - - "golang.org/x/crypto/chacha20" -) - -// We use Chacha20, to build a cryptographically secure random number generator -// that uses the ChaCha algorithm. -// -// ChaCha is a stream cipher designed by Daniel J. Bernstein[^1], that we use as a PRG. It is -// an improved variant of the Salsa20 cipher family. -// -// We use Chacha20 with a 256-bit key, a 192-bit stream identifier and a 32-bit counter as -// as specified in RFC 8439 [^2]. -// The encryption key is used as the PRG seed while the stream identifier is used as a nonce -// to customize the PRG. The PRG outputs are the successive encryptions of a constant message. 
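Among the auxiliary interfaces above is a single-shot SHA-256. A hypothetical cgo sketch of calling it; again, the `#cgo` paths assume a local blst checkout:

```go
package main

/*
// Hypothetical build flags: point these at a local blst checkout.
#cgo CFLAGS: -I./blst/bindings
#cgo LDFLAGS: ./blst/libblst.a
#include "blst.h"
*/
import "C"

import "fmt"

func main() {
	msg := []byte("abc")
	var out [32]C.byte
	C.blst_sha256(&out[0], (*C.byte)(&msg[0]), C.size_t(len(msg)))
	fmt.Printf("%x\n", out) // expect the well-known SHA-256("abc") digest
}
```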
-// -// A 32-bit counter over 64-byte blocks allows 256 GiB of output before cycling, -// and the stream identifier allows 2^192 unique streams of output per seed. -// It is the caller's responsibility to avoid the PRG output cycling. -// -// [^1]: D. J. Bernstein, [*ChaCha, a variant of Salsa20*]( -// https://cr.yp.to/chacha.html) -// -// [^2]: [RFC 8439: ChaCha20 and Poly1305 for IETF Protocols]( -// https://datatracker.ietf.org/doc/html/rfc8439) - -// The PRG core, implements the randCore interface -type chachaCore struct { - cipher chacha20.Cipher - - // empty message added to minimize allocations and buffer clearing - emptyMessage [lenEmptyMessage]byte - - // Only used for State/Restore functionality - - // Counter of bytes encrypted so far by the sream cipher. - // Note this is different than the internal 32-bits counter of the chacha state - // that counts the encrypted blocks of 512 bits. - bytesCounter uint64 - // initial seed - seed [keySize]byte - // initial customizer - customizer [nonceSize]byte -} - -// The main PRG, implements the Rand interface -type chachaPRG struct { - genericPRG - core *chachaCore -} - -const ( - keySize = chacha20.KeySize - nonceSize = chacha20.NonceSize - - // Chacha20SeedLen is the seed length of the Chacha based PRG, it is fixed to 32 bytes. - Chacha20SeedLen = keySize - // Chacha20CustomizerMaxLen is the maximum length of the nonce used as a PRG customizer, it is fixed to 24 bytes. - // Shorter customizers are padded by zeros to 24 bytes. - Chacha20CustomizerMaxLen = nonceSize -) - -// NewChacha20PRG returns a new Chacha20-based PRG, seeded with -// the input seed (32 bytes) and a customizer (up to 12 bytes). -// -// It is recommended to sample the seed uniformly at random. -// The function errors if the seed is different than 32 bytes, -// or if the customizer is larger than 12 bytes. -// Shorter customizers than 12 bytes are padded by zero bytes. -func NewChacha20PRG(seed []byte, customizer []byte) (*chachaPRG, error) { - - // check the key size - if len(seed) != Chacha20SeedLen { - return nil, fmt.Errorf("chacha20 seed length should be %d, got %d", Chacha20SeedLen, len(seed)) - } - - // check the nonce size - if len(customizer) > Chacha20CustomizerMaxLen { - return nil, fmt.Errorf("chacha20 streamID should be less than %d bytes", Chacha20CustomizerMaxLen) - } - - // init the state core - var core chachaCore - // core.bytesCounter is set to 0 - copy(core.seed[:], seed) - copy(core.customizer[:], customizer) // pad the customizer with zero bytes when it's short - - // create the Chacha20 state, initialized with the seed as a key, and the customizer as a streamID. - chacha, err := chacha20.NewUnauthenticatedCipher(core.seed[:], core.customizer[:]) - if err != nil { - return nil, fmt.Errorf("chacha20 instance creation failed: %w", err) - } - core.cipher = *chacha - - prg := &chachaPRG{ - genericPRG: genericPRG{ - randCore: &core, - }, - core: &core, - } - return prg, nil -} - -const lenEmptyMessage = 64 - -// Read pulls random bytes from the pseudo-random source. -// The randoms are copied into the input buffer, the number of bytes read -// is equal to the buffer input length. -// -// The stream cipher encrypts a stream of a constant message (empty for simplicity). 
-func (c *chachaCore) Read(buffer []byte) { - // message to encrypt - var message []byte - - if len(buffer) <= lenEmptyMessage { - // use a constant message (used for most of the calls) - message = c.emptyMessage[:len(buffer)] - } else { - // when buffer is large, use is as the message to encrypt, - // but this requires clearing it first. - for i := 0; i < len(buffer); i++ { - buffer[i] = 0 - } - message = buffer - } - c.cipher.XORKeyStream(buffer, message) - // increase the counter - c.bytesCounter += uint64(len(buffer)) -} - -// counter is stored over 8 bytes -const counterBytesLen = 8 - -// Store returns the internal state of the concatenated Chacha20s -// This is used for serialization/deserialization purposes. -func (c *chachaPRG) Store() []byte { - bytes := make([]byte, 0, keySize+nonceSize+counterBytesLen) - counter := make([]byte, counterBytesLen) - binary.LittleEndian.PutUint64(counter, c.core.bytesCounter) - // output is seed || streamID || counter - bytes = append(bytes, c.core.seed[:]...) - bytes = append(bytes, c.core.customizer[:]...) - bytes = append(bytes, counter...) - return bytes -} - -// RestoreChacha20PRG creates a chacha20 base PRG based on a previously stored state. -// The created PRG is restored at the same state where the previous PRG was stored. -func RestoreChacha20PRG(stateBytes []byte) (*chachaPRG, error) { - // input should be seed (32 bytes) || streamID (12 bytes) || bytesCounter (8 bytes) - const expectedLen = keySize + nonceSize + counterBytesLen - - // check input length - if len(stateBytes) != expectedLen { - return nil, fmt.Errorf("Rand state length should be of %d bytes, got %d", expectedLen, len(stateBytes)) - } - - seed := stateBytes[:keySize] - streamID := stateBytes[keySize : keySize+nonceSize] - bytesCounter := binary.LittleEndian.Uint64(stateBytes[keySize+nonceSize:]) - - // create the Chacha20 instance with seed and streamID - chacha, err := chacha20.NewUnauthenticatedCipher(seed, streamID) - if err != nil { - return nil, fmt.Errorf("Chacha20 instance creation failed: %w", err) - } - // set the block counter, each chacha internal block is 512 bits - const bytesPerBlock = 512 >> 3 - blockCount := uint32(bytesCounter / bytesPerBlock) - remainingBytes := bytesCounter % bytesPerBlock - chacha.SetCounter(blockCount) - // query the remaining bytes and to catch the stored chacha state - remainderStream := make([]byte, remainingBytes) - chacha.XORKeyStream(remainderStream, remainderStream) - - core := &chachaCore{ - cipher: *chacha, - bytesCounter: bytesCounter, - } - copy(core.seed[:], seed) - copy(core.customizer[:], streamID) - - prg := &chachaPRG{ - genericPRG: genericPRG{ - randCore: core, - }, - core: core, - } - return prg, nil -} diff --git a/crypto/random/rand.go b/crypto/random/rand.go deleted file mode 100644 index 370b7416dd6..00000000000 --- a/crypto/random/rand.go +++ /dev/null @@ -1,173 +0,0 @@ -package random - -import ( - "encoding/binary" - "fmt" -) - -// Rand is a pseudo random number generator -// All methods update the internal state of the PRG -// which makes the PRGs implementing this interface -// non concurrent-safe. -type Rand interface { - // Read fills the input slice with random bytes. 
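`Store` and `RestoreChacha20PRG` above serialize the PRG as seed || streamID || byte counter; restoring replays any partial block so output continues exactly where it left off. A sketch of the round trip, assuming the package's new import path is `github.com/onflow/crypto/random` (an assumption; adjust the import to wherever the code now lives):

```go
package main

import (
	"bytes"
	"fmt"

	"github.com/onflow/crypto/random" // assumed new home of this deleted package
)

func main() {
	seed := make([]byte, random.Chacha20SeedLen) // all-zero seed: demo only
	prg, err := random.NewChacha20PRG(seed, nil)
	if err != nil {
		panic(err)
	}

	warmup := make([]byte, 13) // advance mid-block to exercise counter replay
	prg.Read(warmup)

	state := prg.Store() // seed || streamID || byte counter

	next1 := make([]byte, 16)
	prg.Read(next1)

	restored, err := random.RestoreChacha20PRG(state)
	if err != nil {
		panic(err)
	}
	next2 := make([]byte, 16)
	restored.Read(next2)

	fmt.Println("restored PRG continues identically:", bytes.Equal(next1, next2)) // true
}
```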
- Read([]byte) - - // UintN returns a random number between 0 and N (exclusive) - UintN(uint64) uint64 - - // Permutation returns a permutation of the set [0,n-1] - // the theoretical output space grows very fast with (!n) so that input (n) should be chosen carefully - // to make sure the function output space covers a big chunk of the theoretical outputs. - // The function errors if the parameter is a negative integer. - Permutation(n int) ([]int, error) - - // SubPermutation returns the m first elements of a permutation of [0,n-1] - // the theoretical output space can be large (n!/(n-m)!) so that the inputs should be chosen carefully - // to make sure the function output space covers a big chunk of the theoretical outputs. - // The function errors if the parameter is a negative integer. - SubPermutation(n int, m int) ([]int, error) - - // Shuffle permutes an ordered data structure of an arbitrary type in place. The main use-case is - // permuting slice or array elements. (n) is the size of the data structure. - // the theoretical output space grows very fast with the slice size (n!) so that input (n) should be chosen carefully - // to make sure the function output space covers a big chunk of the theoretical outputs. - // The function errors if any of the parameters is a negative integer. - Shuffle(n int, swap func(i, j int)) error - - // Samples picks (m) random ordered elements of a data structure of an arbitrary type of total size (n). The (m) elements are placed - // in the indices 0 to (m-1) with in place swapping. The data structure ends up being a permutation of the initial (n) elements. - // While the sampling of the (m) elements is pseudo-uniformly random, there is no guarantee about the uniformity of the permutation of - // the (n) elements. The function Shuffle should be used in case the entire (n) elements need to be shuffled. - // The main use-case of the data structure is a slice or array. - // The theoretical output space grows very fast with the slice size (n!/(n-m)!) so that inputs should be chosen carefully - // to make sure the function output space covers a big chunk of the theoretical outputs. - // The function errors if any of the parameters is a negative integer. - Samples(n int, m int, swap func(i, j int)) error - - // Store returns the internal state of the random generator. - // The internal state can be used as a seed input for the function - // Restore to restore an identical PRG (with the same internal state) - Store() []byte -} - -// randCore is PRG providing the core Read function of a PRG. -// All other Rand methods use the core Read method. -// -// In order to add a new Rand implementation, -// it should be enough to implement randCore. -type randCore interface { - // Read fills the input slice with random bytes. - Read([]byte) -} - -// genericPRG implements all the Rand methods using the embedded randCore method. -// All implementations of the Rand interface should embed the genericPRG struct. -type genericPRG struct { - randCore - // buffer used by UintN function to avoid extra memory allocation - uintnBuffer [8]byte -} - -// UintN returns an uint64 pseudo-random number in [0,n-1], -// using `p` as an entropy source. -// The function panics if input `n` is zero. 
-func (p *genericPRG) UintN(n uint64) uint64 { - if n == 0 { - panic("input to UintN can't be 0") - } - // the max returned random is n-1 - max := n - 1 - // count the size of max in bytes - size := 0 - for tmp := max; tmp != 0; tmp >>= 8 { - size++ - } - // get the bit size of max - mask := uint64(0) - for max&mask != max { - mask = (mask << 1) | 1 - } - - // For a better uniformity of the result, loop till a sample is less or equal to `max`. - // This means the function might take longer time to output a random. - // Using the size of `max` in bits helps the loop end earlier. - // (a different approach would be to pull at least 128 bits from the random source - // and use big number modular reduction by `n`) - random := n - for random > max { - p.Read(p.uintnBuffer[:size]) // adjust to the size of max in bytes - random = binary.LittleEndian.Uint64(p.uintnBuffer[:]) - random &= mask // adjust to the size of max in bits - } - - return random -} - -// Permutation returns a permutation of the set [0,n-1]. -// It implements Fisher-Yates Shuffle (inside-out variant) using `p` as a random source. -// The output space grows very fast with (!n) so that input `n` should be chosen carefully -// to guarantee a good uniformity of the output. -// -// O(n) space and O(n) time. -func (p *genericPRG) Permutation(n int) ([]int, error) { - if n < 0 { - return nil, fmt.Errorf("population size cannot be negative") - } - items := make([]int, n) - for i := 0; i < n; i++ { - j := p.UintN(uint64(i + 1)) - items[i] = items[j] - items[j] = i - } - return items, nil -} - -// SubPermutation returns the `m` first elements of a permutation of [0,n-1]. -// -// It implements Fisher-Yates Shuffle using `p` as a source of randoms. -// -// O(n) space and O(n) time -func (p *genericPRG) SubPermutation(n int, m int) ([]int, error) { - if m < 0 { - return nil, fmt.Errorf("sample size cannot be negative") - } - if n < m { - return nil, fmt.Errorf("sample size (%d) cannot be larger than entire population (%d)", m, n) - } - // condition n >= 0 is enforced by function Permutation(n) - items, _ := p.Permutation(n) - return items[:m], nil -} - -// Shuffle permutes the given slice in place. -// -// It implements Fisher-Yates Shuffle using `p` as a source of randoms. -// -// O(1) space and O(n) time -func (p *genericPRG) Shuffle(n int, swap func(i, j int)) error { - if n < 0 { - return fmt.Errorf("population size cannot be negative") - } - return p.Samples(n, n, swap) -} - -// Samples picks randomly m elements out of n elements and places them -// in random order at indices [0,m-1], the swapping being implemented in place. -// -// It implements the first (m) elements of Fisher-Yates Shuffle using `p` as a source of randoms. 
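`UintN` above rejection-samples: it draws values masked to max's bit-width until one lands in [0, max], which keeps the result uniform without modular bias. A self-contained toy of the same loop (it always reads 8 bytes per draw, skipping the byte-size optimization used above):

```go
package main

import (
	"crypto/rand"
	"encoding/binary"
	"fmt"
)

// uintN is a toy of the rejection-sampling loop in UintN above.
// n must be non-zero.
func uintN(n uint64) uint64 {
	max := n - 1
	// build a mask covering exactly the bit-width of max
	mask := uint64(0)
	for max&mask != max {
		mask = mask<<1 | 1
	}
	var buf [8]byte
	random := n
	for random > max { // reject out-of-range draws
		if _, err := rand.Read(buf[:]); err != nil { // stands in for the PRG core
			panic(err)
		}
		random = binary.LittleEndian.Uint64(buf[:]) & mask
	}
	return random
}

func main() {
	fmt.Println(uintN(10)) // uniform over [0,9], no modular bias
}
```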
-// -// O(1) space and O(m) time -func (p *genericPRG) Samples(n int, m int, swap func(i, j int)) error { - if m < 0 { - return fmt.Errorf("sample size cannot be negative") - } - if n < m { - return fmt.Errorf("sample size (%d) cannot be larger than entire population (%d)", m, n) - } - for i := 0; i < m; i++ { - j := p.UintN(uint64(n - i)) - swap(i, i+int(j)) - } - return nil -} diff --git a/crypto/random/rand_test.go b/crypto/random/rand_test.go deleted file mode 100644 index 626e0188f70..00000000000 --- a/crypto/random/rand_test.go +++ /dev/null @@ -1,415 +0,0 @@ -package random - -import ( - "bytes" - mrand "math/rand" - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "golang.org/x/crypto/chacha20" -) - -// sanity check for the underlying implementation of Chacha20 -// to make sure the implementation is compliant the RFC 7539. -func TestChacha20Compliance(t *testing.T) { - - t.Run("key and nonce length", func(t *testing.T) { - - assert.Equal(t, Chacha20SeedLen, 32) - assert.Equal(t, Chacha20CustomizerMaxLen, 12) - }) - - t.Run("RFC test vector", func(t *testing.T) { - - key := []byte{ - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, - } - nonce := []byte{0, 0, 0, 0, 0, 0, 0, 0x4a, 0, 0, 0, 0} - counter := uint32(1) - plaintext := []byte{ - 0x4c, 0x61, 0x64, 0x69, 0x65, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x47, 0x65, 0x6e, 0x74, 0x6c, - 0x65, 0x6d, 0x65, 0x6e, 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6c, 0x61, 0x73, - 0x73, 0x20, 0x6f, 0x66, 0x20, 0x27, 0x39, 0x39, 0x3a, 0x20, 0x49, 0x66, 0x20, 0x49, 0x20, 0x63, - 0x6f, 0x75, 0x6c, 0x64, 0x20, 0x6f, 0x66, 0x66, 0x65, 0x72, 0x20, 0x79, 0x6f, 0x75, 0x20, 0x6f, - 0x6e, 0x6c, 0x79, 0x20, 0x6f, 0x6e, 0x65, 0x20, 0x74, 0x69, 0x70, 0x20, 0x66, 0x6f, 0x72, 0x20, - 0x74, 0x68, 0x65, 0x20, 0x66, 0x75, 0x74, 0x75, 0x72, 0x65, 0x2c, 0x20, 0x73, 0x75, 0x6e, 0x73, - 0x63, 0x72, 0x65, 0x65, 0x6e, 0x20, 0x77, 0x6f, 0x75, 0x6c, 0x64, 0x20, 0x62, 0x65, 0x20, 0x69, - } - ciphertext := []byte{ - 0x6e, 0x2e, 0x35, 0x9a, 0x25, 0x68, 0xf9, 0x80, 0x41, 0xba, 0x07, 0x28, 0xdd, 0x0d, 0x69, 0x81, - 0xe9, 0x7e, 0x7a, 0xec, 0x1d, 0x43, 0x60, 0xc2, 0x0a, 0x27, 0xaf, 0xcc, 0xfd, 0x9f, 0xae, 0x0b, - 0xf9, 0x1b, 0x65, 0xc5, 0x52, 0x47, 0x33, 0xab, 0x8f, 0x59, 0x3d, 0xab, 0xcd, 0x62, 0xb3, 0x57, - 0x16, 0x39, 0xd6, 0x24, 0xe6, 0x51, 0x52, 0xab, 0x8f, 0x53, 0x0c, 0x35, 0x9f, 0x08, 0x61, 0xd8, - 0x07, 0xca, 0x0d, 0xbf, 0x50, 0x0d, 0x6a, 0x61, 0x56, 0xa3, 0x8e, 0x08, 0x8a, 0x22, 0xb6, 0x5e, - 0x52, 0xbc, 0x51, 0x4d, 0x16, 0xcc, 0xf8, 0x06, 0x81, 0x8c, 0xe9, 0x1a, 0xb7, 0x79, 0x37, 0x36, - 0x5a, 0xf9, 0x0b, 0xbf, 0x74, 0xa3, 0x5b, 0xe6, 0xb4, 0x0b, 0x8e, 0xed, 0xf2, 0x78, 0x5e, 0x42, - } - - chacha, err := chacha20.NewUnauthenticatedCipher(key, nonce) - require.NoError(t, err) - chacha.SetCounter(counter) - chacha.XORKeyStream(plaintext, plaintext) - assert.Equal(t, plaintext, ciphertext) - - }) - - t.Run("invalid constructor inputs", func(t *testing.T) { - seed := make([]byte, Chacha20SeedLen+1) - customizer := make([]byte, Chacha20CustomizerMaxLen+1) - - // long seed - _, err := NewChacha20PRG(seed, customizer[:Chacha20CustomizerMaxLen]) - assert.Error(t, err) - // long nonce - _, err = NewChacha20PRG(seed[:Chacha20SeedLen], customizer) - assert.Error(t, err) - }) - - t.Run("short nonce", func(t *testing.T) { - seed := make([]byte, Chacha20SeedLen) - customizer 
:= make([]byte, Chacha20CustomizerMaxLen) - - // short nonces should be accepted - _, err := NewChacha20PRG(seed, customizer[:Chacha20CustomizerMaxLen-1]) - assert.NoError(t, err) - _, err = NewChacha20PRG(seed, customizer[:0]) - assert.NoError(t, err) - }) -} - -func getPRG(t *testing.T) *mrand.Rand { - random := time.Now().UnixNano() - t.Logf("rng seed is %d", random) - rng := mrand.New(mrand.NewSource(random)) - return rng -} - -// The tests are targeting the PRG implementations in the package. -// For now, the tests are only used for Chacha20 PRG, but can be ported -// to test another PRG implementation. - -// Simple unit testing of UintN using a basic randomness test. -// It doesn't perform advanced statistical tests. -func TestUintN(t *testing.T) { - rand := getPRG(t) - seed := make([]byte, Chacha20SeedLen) - _, err := rand.Read(seed) - require.NoError(t, err) - customizer := make([]byte, Chacha20CustomizerMaxLen) - _, err = rand.Read(customizer) - require.NoError(t, err) - - rng, err := NewChacha20PRG(seed, customizer) - require.NoError(t, err) - - t.Run("basic uniformity", func(t *testing.T) { - maxN := uint64(1000) - mod := mrand.Uint64() - var n, classWidth uint64 - if mod < maxN { // `mod` is too small so that we can consider `mod` classes - n = mod - classWidth = 1 - } else { // `mod` is big enough so that we can partition [0,mod-1] into `maxN` classes - n = maxN - mod = (mod / n) * n // adjust `mod` to make sure it is a multiple of n for a more accurate test - classWidth = mod / n - } - - uintNf := func() (uint64, error) { - return uint64(rng.UintN(mod)), nil - } - BasicDistributionTest(t, n, classWidth, uintNf) - }) - - t.Run("zero n", func(t *testing.T) { - assert.Panics(t, func() { - rng.UintN(0) - }) - }) -} - -// Simple unit testing of SubPermutation using a basic randomness test. -// It doesn't perform advanced statistical tests. -// -// SubPermutation tests cover Permutation as well. 
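These tests lean on `BasicDistributionTest`/`EvaluateDistributionUniformity` helpers defined elsewhere in the repo. A toy version of the underlying idea, comparing bucket counts against the expected mean with a crude tolerance (a demo, not a statistical test):

```go
package main

import (
	"fmt"
	"math"
	mrand "math/rand"
)

// basicUniformity samples into n buckets and requires every count to stay
// within a fixed tolerance of the expected mean.
func basicUniformity(n, samples int, draw func() int) bool {
	counts := make([]float64, n)
	for i := 0; i < samples; i++ {
		counts[draw()]++
	}
	mean := float64(samples) / float64(n)
	for _, c := range counts {
		if math.Abs(c-mean) > 0.1*mean { // 10% tolerance: demo only
			return false
		}
	}
	return true
}

func main() {
	rng := mrand.New(mrand.NewSource(1))
	fmt.Println(basicUniformity(10, 1_000_000, func() int { return rng.Intn(10) })) // true
}
```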
-func TestSubPermutation(t *testing.T) { - rand := getPRG(t) - - seed := make([]byte, Chacha20SeedLen) - _, err := rand.Read(seed) - require.NoError(t, err) - customizer := make([]byte, Chacha20CustomizerMaxLen) - _, err = rand.Read(customizer) - require.NoError(t, err) - - rng, err := NewChacha20PRG(seed, customizer) - require.NoError(t, err) - - t.Run("basic randomness", func(t *testing.T) { - listSize := 100 - subsetSize := 20 - sampleSize := 85000 - // tests the subset sampling randomness - samplingDistribution := make([]float64, listSize) - // tests the subset ordering randomness (using a particular element testElement) - orderingDistribution := make([]float64, subsetSize) - testElement := rand.Intn(listSize) - - for i := 0; i < sampleSize; i++ { - shuffledlist, err := rng.SubPermutation(listSize, subsetSize) - require.NoError(t, err) - require.Equal(t, len(shuffledlist), subsetSize) - has := make(map[int]struct{}) - for j, e := range shuffledlist { - // check for repetition - _, ok := has[e] - require.False(t, ok, "duplicated item") - has[e] = struct{}{} - // fill the distribution - samplingDistribution[e] += 1.0 - if e == testElement { - orderingDistribution[j] += 1.0 - } - } - } - EvaluateDistributionUniformity(t, samplingDistribution) - EvaluateDistributionUniformity(t, orderingDistribution) - }) - - // Evaluate that - // - permuting an empty set returns an empty list - // - drawing a sample of size zero from a non-empty set returns an empty list - t.Run("empty sets", func(t *testing.T) { - - // verify that permuting an empty set returns an empty list - res, err := rng.SubPermutation(0, 0) - require.NoError(t, err) - assert.True(t, len(res) == 0) - - // verify that drawing a sample of size zero from a non-empty set returns an empty list - res, err = rng.SubPermutation(10, 0) - require.NoError(t, err) - assert.True(t, len(res) == 0) - }) - - t.Run("negative inputs", func(t *testing.T) { - res, err := rng.Permutation(-3) - require.Error(t, err) - assert.Nil(t, res) - - res, err = rng.SubPermutation(5, -3) - require.Error(t, err) - assert.Nil(t, res) - - res, err = rng.SubPermutation(-3, 5) - require.Error(t, err) - assert.Nil(t, res) - }) -} - -// Simple unit testing of Shuffle using a basic randomness test. -// It doesn't perform advanced statistical tests. -func TestShuffle(t *testing.T) { - rand := getPRG(t) - - seed := make([]byte, Chacha20SeedLen) - _, err := rand.Read(seed) - require.NoError(t, err) - customizer := make([]byte, Chacha20CustomizerMaxLen) - _, err = rand.Read(customizer) - require.NoError(t, err) - - rng, err := NewChacha20PRG(seed, customizer) - require.NoError(t, err) - - t.Run("basic uniformity", func(t *testing.T) { - // compute n! 
- fact := func(n int) int { - f := 1 - for i := 1; i <= n; i++ { - f *= i - } - return f - } - - for listSize := 2; listSize <= 6; listSize++ { - factN := uint64(fact(listSize)) - t.Logf("permutation size is %d (factorial is %d)", listSize, factN) - t.Run("shuffle a random permutation", func(t *testing.T) { - list := make([]int, 0, listSize) - for i := 0; i < listSize; i++ { - list = append(list, i) - } - permEncoding := func() (uint64, error) { - err = rng.Shuffle(listSize, func(i, j int) { - list[i], list[j] = list[j], list[i] - }) - return uint64(EncodePermutation(list)), err - } - BasicDistributionTest(t, factN, 1, permEncoding) - }) - - t.Run("shuffle a same permutation", func(t *testing.T) { - list := make([]int, listSize) - permEncoding := func() (uint64, error) { - // reinit the permutation to the same value - for i := 0; i < listSize; i++ { - list[i] = i - } - err = rng.Shuffle(listSize, func(i, j int) { - list[i], list[j] = list[j], list[i] - }) - return uint64(EncodePermutation(list)), err - } - BasicDistributionTest(t, factN, 1, permEncoding) - }) - } - }) - - t.Run("empty slice", func(t *testing.T) { - emptySlice := make([]float64, 0) - err = rng.Shuffle(len(emptySlice), func(i, j int) { - emptySlice[i], emptySlice[j] = emptySlice[j], emptySlice[i] - }) - require.NoError(t, err) - assert.True(t, len(emptySlice) == 0) - }) - - t.Run("negative inputs", func(t *testing.T) { - emptySlice := make([]float64, 5) - err = rng.Shuffle(-3, func(i, j int) { - emptySlice[i], emptySlice[j] = emptySlice[j], emptySlice[i] - }) - require.Error(t, err) - }) -} - -func TestSamples(t *testing.T) { - rand := getPRG(t) - - seed := make([]byte, Chacha20SeedLen) - _, err := rand.Read(seed) - require.NoError(t, err) - customizer := make([]byte, Chacha20CustomizerMaxLen) - _, err = rand.Read(customizer) - require.NoError(t, err) - - rng, err := NewChacha20PRG(seed, customizer) - require.NoError(t, err) - - t.Run("basic uniformity", func(t *testing.T) { - listSize := 100 - samplesSize := 20 - sampleSize := 100000 - // tests the subset sampling randomness - samplingDistribution := make([]float64, listSize) - // tests the subset ordering randomness (using a particular element testElement) - orderingDistribution := make([]float64, samplesSize) - testElement := rand.Intn(listSize) - // Slice to shuffle - list := make([]int, 0, listSize) - for i := 0; i < listSize; i++ { - list = append(list, i) - } - - for i := 0; i < sampleSize; i++ { - err = rng.Samples(listSize, samplesSize, func(i, j int) { - list[i], list[j] = list[j], list[i] - }) - require.NoError(t, err) - has := make(map[int]struct{}) - for j, e := range list[:samplesSize] { - // check for repetition - _, ok := has[e] - require.False(t, ok, "duplicated item") - has[e] = struct{}{} - // fill the distribution - samplingDistribution[e] += 1.0 - if e == testElement { - orderingDistribution[j] += 1.0 - } - } - } - EvaluateDistributionUniformity(t, samplingDistribution) - EvaluateDistributionUniformity(t, orderingDistribution) - }) - - t.Run("zero edge cases", func(t *testing.T) { - // Sampling from an empty set - emptySlice := make([]float64, 0) - err = rng.Samples(len(emptySlice), len(emptySlice), func(i, j int) { - emptySlice[i], emptySlice[j] = emptySlice[j], emptySlice[i] - }) - require.NoError(t, err) - assert.True(t, len(emptySlice) == 0) - - // drawing a sample of size zero from an non-empty list should leave the original list unmodified - constant := []float64{0, 1, 2, 3, 4, 5} - fullSlice := constant - err = rng.Samples(len(fullSlice), 0, 
func(i, j int) { // would modify fullSlice in-place; the swap is never called since the sample size is 0 - fullSlice[i], fullSlice[j] = fullSlice[j], fullSlice[i] - }) - require.NoError(t, err) - assert.Equal(t, constant, fullSlice) - }) - - t.Run("negative inputs", func(t *testing.T) { - emptySlice := make([]float64, 5) - err = rng.Samples(-3, 5, func(i, j int) { - emptySlice[i], emptySlice[j] = emptySlice[j], emptySlice[i] - }) - require.Error(t, err) - - err = rng.Samples(-5, 3, func(i, j int) { - emptySlice[i], emptySlice[j] = emptySlice[j], emptySlice[i] - }) - require.Error(t, err) - }) -} - -// TestStateRestore tests the serialization and deserialization functions -// Store and Restore -func TestStateRestore(t *testing.T) { - rand := getPRG(t) - - // generate a seed - seed := make([]byte, Chacha20SeedLen) - _, err := rand.Read(seed) - require.NoError(t, err) - customizer := make([]byte, Chacha20CustomizerMaxLen) - _, err = rand.Read(customizer) - require.NoError(t, err) - t.Logf("seed is %x, customizer is %x\n", seed, customizer) - - // create an rng - rng, err := NewChacha20PRG(seed, customizer) - require.NoError(t, err) - - // evolve the internal state of the rng - iterations := rand.Intn(1000) - for i := 0; i < iterations; i++ { - _ = rng.UintN(1024) - } - // get the internal state of the rng - state := rng.Store() - - // check the state is deterministic - state_clone := rng.Store() - assert.True(t, bytes.Equal(state, state_clone), "Store is not deterministic") - - // check Store is the Restore reverse function - secondRng, err := RestoreChacha20PRG(state) - require.NoError(t, err) - assert.True(t, bytes.Equal(state, secondRng.Store()), "Store o Restore is not identity") - - // check the 2 PRGs are generating identical outputs - iterations = rand.Intn(1000) - for i := 0; i < iterations; i++ { - rand1 := rng.UintN(1024) - rand2 := secondRng.UintN(1024) - assert.Equal(t, rand1, rand2, "the 2 rngs are not identical on round %d", i) - } -} diff --git a/crypto/random/rand_utils.go b/crypto/random/rand_utils.go deleted file mode 100644 index fbad033e04f..00000000000 --- a/crypto/random/rand_utils.go +++ /dev/null @@ -1,81 +0,0 @@ -package random - -import ( - "fmt" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - "gonum.org/v1/gonum/stat" -) - -// this constant should be increased if tests are flaky; the higher the constant, -// the slower the test -const sampleSizeConstant = 85000 -const sampleCoefficient = sampleSizeConstant / 85 -
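(A usage sketch may help read the distribution helper defined just below. This is a hypothetical test, not part of the original file; a fair six-sided die maps to `n = 6` classes of width 1, and `math/rand` is used purely for illustration.)

```go
// Hypothetical usage sketch of BasicDistributionTest (illustration only).
// The helper draws a large sample from randf and asserts the per-class
// frequencies are close to uniform.
func TestDieUniformity(t *testing.T) {
	die := func() (uint64, error) {
		return mrand.Uint64() % 6, nil // math/rand is fine for a demo, not for crypto
	}
	BasicDistributionTest(t, 6, 1, die)
}
```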
-// BasicDistributionTest is a test function that runs a basic statistical test on `randf` output. -// `randf` is a function that outputs random integers. -// It partitions all outputs into `n` continuous classes and computes the distribution -// over the partition. Each class has a width of `classWidth`: the first class is [0..classWidth-1], -// the second class is [classWidth..2*classWidth-1], etc. -// It computes the frequency of outputs in the `n` classes and computes the -// standard deviation of frequencies. A small standard deviation is a necessary -// condition for a uniform distribution of `randf` (but is not a guarantee of -// uniformity) -func BasicDistributionTest(t *testing.T, n uint64, classWidth uint64, randf func() (uint64, error)) { - // sample size should ideally be a large multiple of `n` - sampleSize := sampleCoefficient * n - if n < 80 { - // but if `n` is too small, we use a "high enough" sample size - sampleSize = ((sampleSizeConstant) / n) * n // highest multiple of n not greater than 85000 - } - distribution := make([]float64, n) - // populate the distribution - for i := uint64(0); i < sampleSize; i++ { - r, err := randf() - require.NoError(t, err) - if n*classWidth != 0 { - require.Less(t, r, n*classWidth) - } - distribution[r/classWidth] += 1.0 - } - EvaluateDistributionUniformity(t, distribution) -} - -// EvaluateDistributionUniformity evaluates if the input distribution is close to uniform -// through a basic quick test. -// The test computes the standard deviation and checks it is small enough compared -// to the distribution mean. -func EvaluateDistributionUniformity(t *testing.T, distribution []float64) { - tolerance := 0.05 - stdev := stat.StdDev(distribution, nil) - mean := stat.Mean(distribution, nil) - assert.Greater(t, tolerance*mean, stdev, fmt.Sprintf("basic randomness test failed: n: %d, stdev: %v, mean: %v", len(distribution), stdev, mean)) -} - -// EncodePermutation computes a bijection from the set of all permutations -// into the set [0, n!-1] (where `n` is the size of input `perm`). -// input `perm` is assumed to be a correct permutation of the set [0,n-1] -// (not checked in this function). -func EncodePermutation(perm []int) int { - r := make([]int, len(perm)) - // generate Lehmer code - // (for details https://en.wikipedia.org/wiki/Lehmer_code) - for i, x := range perm { - for _, y := range perm[i+1:] { - if y < x { - r[i]++ - } - } - } - // Convert to an integer following the factorial number system - // (for details https://en.wikipedia.org/wiki/Factorial_number_system) - m := 0 - fact := 1 - for i := len(perm) - 1; i >= 0; i-- { - m += r[i] * fact - fact *= len(perm) - i - } - return m -}
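To make the bijection concrete, here is a small worked example; the standalone helper below reproduces the same logic as `EncodePermutation` so it can run outside the package.

```go
package main

import "fmt"

// encodePermutation mirrors EncodePermutation above (reproduced for illustration).
func encodePermutation(perm []int) int {
	r := make([]int, len(perm))
	for i, x := range perm {
		for _, y := range perm[i+1:] {
			if y < x {
				r[i]++ // count smaller elements to the right of position i
			}
		}
	}
	m, fact := 0, 1
	for i := len(perm) - 1; i >= 0; i-- {
		m += r[i] * fact
		fact *= len(perm) - i
	}
	return m
}

func main() {
	// The Lehmer code of [2 0 1] is [2 0 0]; read in the factorial number
	// system this is 2*2! + 0*1! + 0*0! = 4.
	fmt.Println(encodePermutation([]int{2, 0, 1})) // 4
}
```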
diff --git a/crypto/sign.go b/crypto/sign.go deleted file mode 100644 index d400898d97d..00000000000 --- a/crypto/sign.go +++ /dev/null @@ -1,230 +0,0 @@ -// Package crypto ... -package crypto - -import ( - "crypto/elliptic" - "fmt" - - "github.com/btcsuite/btcd/btcec/v2" - - "github.com/onflow/flow-go/crypto/hash" -) - -// revive:disable:var-naming - -// revive:enable - -// SigningAlgorithm is an identifier for a signing algorithm -// (and parameters if applicable) -type SigningAlgorithm int - -const ( - // Supported signing algorithms - UnknownSigningAlgorithm SigningAlgorithm = iota - // BLSBLS12381 is BLS on BLS 12-381 curve - BLSBLS12381 - // ECDSAP256 is ECDSA on NIST P-256 curve - ECDSAP256 - // ECDSASecp256k1 is ECDSA on secp256k1 curve - ECDSASecp256k1 -) - -// String returns the string representation of this signing algorithm. -func (f SigningAlgorithm) String() string { - return [...]string{"UNKNOWN", "BLS_BLS12381", "ECDSA_P256", "ECDSA_secp256k1"}[f] -} - -// Signature is a generic type, regardless of the signature scheme -type Signature []byte - -// signer is the internal interface implemented by all supported signing algorithms -type signer interface { - // generatePrivateKey generates a private key - generatePrivateKey([]byte) (PrivateKey, error) - // decodePrivateKey loads a private key from a byte array - decodePrivateKey([]byte) (PrivateKey, error) - // decodePublicKey loads a public key from a byte array - decodePublicKey([]byte) (PublicKey, error) - // decodePublicKeyCompressed loads a public key from a byte array representing a point in compressed form - decodePublicKeyCompressed([]byte) (PublicKey, error) -} - -// newSigner returns a signer instance -func newSigner(algo SigningAlgorithm) (signer, error) { - switch algo { - case ECDSAP256: - return p256Instance, nil - case ECDSASecp256k1: - return secp256k1Instance, nil - case BLSBLS12381: - return blsInstance, nil - default: - return nil, invalidInputsErrorf("the signature scheme %s is not supported", algo) - } -} - -// init initializes the context of all supported algorithms -func init() { - // ECDSA - p256Instance = &(ecdsaAlgo{ - curve: elliptic.P256(), - algo: ECDSAP256, - }) - secp256k1Instance = &(ecdsaAlgo{ - curve: btcec.S256(), - algo: ECDSASecp256k1, - }) - - // BLS - initBLS12381() - blsInstance = &blsBLS12381Algo{ - algo: BLSBLS12381, - } -} - -// SignatureFormatCheck verifies the format of a serialized signature, -// regardless of messages or public keys. -// -// This function is only defined for ECDSA algos for now. -// -// If SignatureFormatCheck returns false then the input is not a valid -// signature and will fail a verification against any message and public key. -func SignatureFormatCheck(algo SigningAlgorithm, s Signature) (bool, error) { - switch algo { - case ECDSAP256: - return p256Instance.signatureFormatCheck(s), nil - case ECDSASecp256k1: - return secp256k1Instance.signatureFormatCheck(s), nil - default: - return false, invalidInputsErrorf( - "the signature scheme %s is not supported", - algo) - } -} - -// GeneratePrivateKey generates a private key of the algorithm using the entropy of the given seed. -// -// The seed minimum length is 32 bytes and it should have enough entropy. -// It is recommended to use a secure crypto RNG to generate the seed. -// -// The function returns: -// - (nil, invalidInputsError) if the signing algorithm is not supported or -// if the seed length is not valid (less than 32 bytes or larger than 256 bytes) -// - (nil, error) if an unexpected error occurs -// - (sk, nil) if key generation was successful -func GeneratePrivateKey(algo SigningAlgorithm, seed []byte) (PrivateKey, error) { - signer, err := newSigner(algo) - if err != nil { - return nil, fmt.Errorf("key generation failed: %w", err) - } - return signer.generatePrivateKey(seed) -} -
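`GeneratePrivateKey`, `Sign`, and `Verify` together form the package's core flow. As a hedged sketch (not from the original file): the import paths assume the pre-removal module layout, and the SHA3-256 hasher constructor is assumed to live in the `crypto/hash` sub-package.

```go
package main

import (
	crand "crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto"      // assumed pre-removal import path
	"github.com/onflow/flow-go/crypto/hash" // assumed pre-removal import path
)

func main() {
	// the seed must be at least KeyGenSeedMinLen (32) bytes from a secure RNG
	seed := make([]byte, crypto.KeyGenSeedMinLen)
	if _, err := crand.Read(seed); err != nil {
		panic(err)
	}
	sk, err := crypto.GeneratePrivateKey(crypto.ECDSAP256, seed)
	if err != nil {
		panic(err)
	}

	hasher := hash.NewSHA3_256() // assumed 256-bit hasher constructor
	msg := []byte("hello")
	sig, err := sk.Sign(msg, hasher)
	if err != nil {
		panic(err)
	}

	valid, err := sk.PublicKey().Verify(sig, msg, hasher)
	fmt.Println(valid, err) // true <nil>
}
```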
-// DecodePrivateKey decodes an array of bytes into a private key of the given algorithm -// -// The function returns: -// - (nil, invalidInputsError) if the signing algorithm is not supported -// - (nil, invalidInputsError) if the input does not serialize a valid private key: -// - ECDSA: bytes(x) where bytes() is the big-endian encoding padded to the curve order. -// - BLS: bytes(x) where bytes() is the big-endian encoding padded to the order of BLS12-381. -// for all supported algorithms, the input is the big-endian encoding -// of the private scalar, less than the curve order and left-padded to 32 bytes -// - (nil, error) if an unexpected error occurs -// - (sk, nil) otherwise -func DecodePrivateKey(algo SigningAlgorithm, data []byte) (PrivateKey, error) { - signer, err := newSigner(algo) - if err != nil { - return nil, fmt.Errorf("decode private key failed: %w", err) - } - return signer.decodePrivateKey(data) -} - -// DecodePublicKey decodes an array of bytes into a public key of the given algorithm -// -// The function returns: -// - (nil, invalidInputsError) if the signing algorithm is not supported -// - (nil, invalidInputsError) if the input does not serialize a valid public key: -// - ECDSA: bytes(x)||bytes(y) where bytes() is the big-endian encoding padded to the field size. -// - BLS: compressed serialization of a G2 point following https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format- -// - (nil, error) if an unexpected error occurs -// - (pk, nil) otherwise -func DecodePublicKey(algo SigningAlgorithm, data []byte) (PublicKey, error) { - signer, err := newSigner(algo) - if err != nil { - return nil, fmt.Errorf("decode public key failed: %w", err) - } - return signer.decodePublicKey(data) -} - -// DecodePublicKeyCompressed decodes an array of bytes given in a compressed representation into a public key of the given algorithm -// Only ECDSA is supported (BLS uses the compressed serialization by default). -// -// The function returns: -// - (nil, invalidInputsError) if the signing algorithm is not supported (is not ECDSA) -// - (nil, invalidInputsError) if the input does not serialize a valid public key: -// - ECDSA: sign_byte||bytes(x) according to X9.62 section 4.3.6. -// - (nil, error) if an unexpected error occurs -// - (pk, nil) otherwise -func DecodePublicKeyCompressed(algo SigningAlgorithm, data []byte) (PublicKey, error) { - signer, err := newSigner(algo) - if err != nil { - return nil, fmt.Errorf("decode public key failed: %w", err) - } - return signer.decodePublicKeyCompressed(data) -} - -// Signature type tools - -// Bytes returns a byte array of the signature data -func (s Signature) Bytes() []byte { - return s[:] -} - -// String returns a string representation of the signature data -func (s Signature) String() string { - return fmt.Sprintf("%#x", s.Bytes()) -} - -// Key Pair - -// PrivateKey is an unspecified signature scheme private key -type PrivateKey interface { - // Algorithm returns the signing algorithm related to the private key. - Algorithm() SigningAlgorithm - // Size returns the key size in bytes. - Size() int - // String returns a hex representation of the key - String() string - // Sign generates a signature using the provided hasher. - Sign([]byte, hash.Hasher) (Signature, error) - // PublicKey returns the public key. - PublicKey() PublicKey - // Encode returns a byte representation of the private key - Encode() []byte - // Equals returns true if the given PrivateKeys are equal. Keys are considered unequal if their algorithms are - // unequal or if their encoded representations are unequal. If the encoding of either key fails, they are considered - // unequal as well. - Equals(PrivateKey) bool -} - -// PublicKey is an unspecified signature scheme public key. -type PublicKey interface { - // Algorithm returns the signing algorithm related to the public key. - Algorithm() SigningAlgorithm - // Size returns the key size in bytes.
- Size() int - // String returns a hex representation of the key - String() string - // Verify verifies a signature of an input message using the provided hasher. - Verify(Signature, []byte, hash.Hasher) (bool, error) - // Encode returns a byte representation of the public key. - Encode() []byte - // EncodeCompressed returns a compressed byte representation of the public key. - // The compressed serialization concept is generic to elliptic curves, - // but we refer to individual curve parameters for details of the compressed format - EncodeCompressed() []byte - // Equals returns true if the given PublicKeys are equal. Keys are considered unequal if their algorithms are - // unequal or if their encoded representations are unequal. If the encoding of either key fails, they are considered - // unequal as well. - Equals(PublicKey) bool -} diff --git a/crypto/sign_test_utils.go b/crypto/sign_test_utils.go deleted file mode 100644 index 9ecc684a4be..00000000000 --- a/crypto/sign_test_utils.go +++ /dev/null @@ -1,366 +0,0 @@ -package crypto - -import ( - crand "crypto/rand" - "fmt" - mrand "math/rand" - "testing" - "time" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" - - "github.com/onflow/flow-go/crypto/hash" -) - -func getPRG(t *testing.T) *mrand.Rand { - random := time.Now().UnixNano() - t.Logf("rng seed is %d", random) - rng := mrand.New(mrand.NewSource(random)) - return rng -} - -func TestKeyGenErrors(t *testing.T) { - seed := make([]byte, 50) - invalidSigAlgo := SigningAlgorithm(20) - sk, err := GeneratePrivateKey(invalidSigAlgo, seed) - assert.Nil(t, sk) - assert.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) -} - -func TestHasherErrors(t *testing.T) { - t.Run("nilHasher error sanity", func(t *testing.T) { - err := nilHasherError - invInpError := invalidInputsErrorf("") - otherError := fmt.Errorf("some error") - assert.True(t, IsNilHasherError(err)) - assert.False(t, IsInvalidInputsError(err)) - assert.False(t, IsNilHasherError(invInpError)) - assert.False(t, IsNilHasherError(otherError)) - assert.False(t, IsNilHasherError(nil)) - }) - - t.Run("invalidHasherSize error sanity", func(t *testing.T) { - err := invalidHasherSizeErrorf("") - invInpError := invalidInputsErrorf("") - otherError := fmt.Errorf("some error") - assert.True(t, IsInvalidHasherSizeError(err)) - assert.False(t, IsInvalidInputsError(err)) - assert.False(t, IsInvalidHasherSizeError(invInpError)) - assert.False(t, IsInvalidHasherSizeError(otherError)) - assert.False(t, IsInvalidHasherSizeError(nil)) - }) -} - -// tests sign and verify are consistent for multiple generated keys and messages -func testGenSignVerify(t *testing.T, salg SigningAlgorithm, halg hash.Hasher) { - t.Run(fmt.Sprintf("Generation/Signature/Verification for %s", salg), func(t *testing.T) { - seed := make([]byte, KeyGenSeedMinLen) - input := make([]byte, 100) - rand := getPRG(t) - - loops := 50 - for j := 0; j < loops; j++ { - n, err := rand.Read(seed) - require.Equal(t, n, KeyGenSeedMinLen) - require.NoError(t, err) - sk, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - _, err = rand.Read(input) - require.NoError(t, err) - s, err := sk.Sign(input, halg) - require.NoError(t, err) - pk := sk.PublicKey() - - // test a valid signature - result, err := pk.Verify(s, input, halg) - require.NoError(t, err) - assert.True(t, result) - - // test with a different message - input[0] ^= 1 - result, err = pk.Verify(s, input, halg) - require.NoError(t, err) - assert.False(t, result) - input[0] ^= 1 - - // test with a
valid but different key - seed[0] ^= 1 - wrongSk, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - result, err = wrongSk.PublicKey().Verify(s, input, halg) - require.NoError(t, err) - assert.False(t, result) - - // test a wrong signature length - invalidLen := rand.Intn(2 * len(s)) // try random invalid lengths - if invalidLen == len(s) { // map to an invalid length - invalidLen = 0 - } - invalidSig := make([]byte, invalidLen) - result, err = pk.Verify(invalidSig, input, halg) - require.NoError(t, err) - assert.False(t, result) - } - }) -} - -// tests the key generation constraints with regards to the input seed, mainly -// the seed length constraints and the result determinicity. -func testKeyGenSeed(t *testing.T, salg SigningAlgorithm, minLen int, maxLen int) { - t.Run("seed length check", func(t *testing.T) { - // valid seed lengths - seed := make([]byte, minLen) - _, err := GeneratePrivateKey(salg, seed) - assert.NoError(t, err) - if maxLen > 0 { - seed = make([]byte, maxLen) - _, err = GeneratePrivateKey(salg, seed) - assert.NoError(t, err) - } - // invalid seed lengths - seed = make([]byte, minLen-1) - _, err = GeneratePrivateKey(salg, seed) - assert.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - if maxLen > 0 { - seed = make([]byte, maxLen+1) - _, err = GeneratePrivateKey(salg, seed) - assert.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - } - }) - - t.Run("deterministic generation", func(t *testing.T) { - // same seed results in the same key - seed := make([]byte, minLen) - read, err := crand.Read(seed) - require.Equal(t, read, minLen) - require.NoError(t, err) - sk1, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - sk2, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - assert.True(t, sk1.Equals(sk2)) - // different seed results in a different key - seed[0] ^= 1 // alter a seed bit - sk2, err = GeneratePrivateKey(salg, seed) - require.NoError(t, err) - assert.False(t, sk1.Equals(sk2)) - }) -} - -var BLS12381Order = []byte{0x73, 0xED, 0xA7, 0x53, 0x29, 0x9D, 0x7D, 0x48, 0x33, 0x39, - 0xD8, 0x08, 0x09, 0xA1, 0xD8, 0x05, 0x53, 0xBD, 0xA4, 0x02, 0xFF, 0xFE, - 0x5B, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x01} - -func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { - t.Run(fmt.Sprintf("generic encode/decode for %s", salg), func(t *testing.T) { - rand := getPRG(t) - - t.Run("happy path tests", func(t *testing.T) { - loops := 50 - for j := 0; j < loops; j++ { - // generate a private key - seed := make([]byte, KeyGenSeedMinLen) - read, err := rand.Read(seed) - require.Equal(t, read, KeyGenSeedMinLen) - require.NoError(t, err) - sk, err := GeneratePrivateKey(salg, seed) - assert.Nil(t, err) - seed[0] ^= 1 // alter the seed to get a new private key - distinctSk, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - - // check private key encoding - skBytes := sk.Encode() - skCheck, err := DecodePrivateKey(salg, skBytes) - require.Nil(t, err) - assert.True(t, sk.Equals(skCheck)) - skCheckBytes := skCheck.Encode() - assert.Equal(t, skBytes, skCheckBytes) - distinctSkBytes := distinctSk.Encode() - assert.NotEqual(t, skBytes, distinctSkBytes) - - // check public key encoding - pk := sk.PublicKey() - pkBytes := pk.Encode() - pkCheck, err := DecodePublicKey(salg, pkBytes) - require.Nil(t, err) - assert.True(t, pk.Equals(pkCheck)) - pkCheckBytes := pkCheck.Encode() - assert.Equal(t, pkBytes, pkCheckBytes) - distinctPkBytes := distinctSk.PublicKey().Encode() - assert.NotEqual(t, pkBytes, 
distinctPkBytes) - - // same for the compressed encoding - // skip is BLS is used and compression isn't supported - if !(salg == BLSBLS12381 && !isG2Compressed()) { - pkComprBytes := pk.EncodeCompressed() - pkComprCheck, err := DecodePublicKeyCompressed(salg, pkComprBytes) - require.Nil(t, err) - assert.True(t, pk.Equals(pkComprCheck)) - pkCheckComprBytes := pkComprCheck.EncodeCompressed() - assert.Equal(t, pkComprBytes, pkCheckComprBytes) - distinctPkComprBytes := distinctSk.PublicKey().EncodeCompressed() - assert.NotEqual(t, pkComprBytes, distinctPkComprBytes) - } - } - }) - - // test invalid private keys (equal to the curve group order) - - t.Run("private keys equal to the group order", func(t *testing.T) { - groupOrder := make(map[SigningAlgorithm][]byte) - groupOrder[ECDSAP256] = []byte{255, 255, 255, 255, 0, 0, 0, 0, 255, 255, 255, - 255, 255, 255, 255, 255, 188, 230, 250, 173, 167, - 23, 158, 132, 243, 185, 202, 194, 252, 99, 37, 81} - - groupOrder[ECDSASecp256k1] = []byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 254, 186, 174, 220, 230, - 175, 72, 160, 59, 191, 210, 94, 140, 208, 54, 65, 65} - - groupOrder[BLSBLS12381] = BLS12381Order - - sk, err := DecodePrivateKey(salg, groupOrder[salg]) - require.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, sk) - }) - - // test invalid private and public keys (invalid length) - t.Run("invalid key length", func(t *testing.T) { - // private key - skLens := make(map[SigningAlgorithm]int) - skLens[ECDSAP256] = PrKeyLenECDSAP256 - skLens[ECDSASecp256k1] = PrKeyLenECDSASecp256k1 - skLens[BLSBLS12381] = 32 - - bytes := make([]byte, skLens[salg]+1) - sk, err := DecodePrivateKey(salg, bytes) - require.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, sk) - - // public key - pkLens := make(map[SigningAlgorithm]int) - pkLens[ECDSAP256] = PubKeyLenECDSAP256 - pkLens[ECDSASecp256k1] = PubKeyLenECDSASecp256k1 - pkLens[BLSBLS12381] = 96 - - bytes = make([]byte, pkLens[salg]+1) - pk, err := DecodePublicKey(salg, bytes) - require.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, pk) - }) - }) -} - -func testEquals(t *testing.T, salg SigningAlgorithm, otherSigAlgo SigningAlgorithm) { - t.Run(fmt.Sprintf("equals for %s", salg), func(t *testing.T) { - rand := getPRG(t) - // generate a key pair - seed := make([]byte, KeyGenSeedMinLen) - n, err := rand.Read(seed) - require.Equal(t, n, KeyGenSeedMinLen) - require.NoError(t, err) - - // first pair - sk1, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - pk1 := sk1.PublicKey() - - // second pair without changing the seed - sk2, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - pk2 := sk2.PublicKey() - - // unrelated algo pair - sk3, err := GeneratePrivateKey(otherSigAlgo, seed) - require.NoError(t, err) - pk3 := sk3.PublicKey() - - // fourth pair with same algo but a different seed - seed[0] ^= 1 - sk4, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - pk4 := sk4.PublicKey() - - // tests - assert.True(t, sk1.Equals(sk2)) - assert.True(t, pk1.Equals(pk2)) - assert.False(t, sk1.Equals(sk3)) - assert.False(t, pk1.Equals(pk3)) - assert.False(t, sk1.Equals(sk4)) - assert.False(t, pk1.Equals(pk4)) - }) -} - -func testKeysAlgorithm(t *testing.T, sk PrivateKey, salg SigningAlgorithm) { - t.Run(fmt.Sprintf("key.Algorithm for %s", salg), func(t *testing.T) { - alg := sk.Algorithm() - assert.Equal(t, alg, salg) - alg = sk.PublicKey().Algorithm() - assert.Equal(t, alg, 
salg) - }) -} - -func testKeySize(t *testing.T, sk PrivateKey, skLen int, pkLen int) { - t.Run(fmt.Sprintf("key.Size for %s", sk.Algorithm()), func(t *testing.T) { - size := sk.Size() - assert.Equal(t, size, skLen) - size = sk.PublicKey().Size() - assert.Equal(t, size, pkLen) - }) -} - -func benchVerify(b *testing.B, algo SigningAlgorithm, halg hash.Hasher) { - seed := make([]byte, 48) - for j := 0; j < len(seed); j++ { - seed[j] = byte(j) - } - sk, err := GeneratePrivateKey(algo, seed) - require.NoError(b, err) - pk := sk.PublicKey() - - input := []byte("Bench input") - s, err := sk.Sign(input, halg) - require.NoError(b, err) - var result bool - - b.ResetTimer() - for i := 0; i < b.N; i++ { - result, err = pk.Verify(s, input, halg) - require.NoError(b, err) - } - // sanity check - require.True(b, result) - - b.StopTimer() -} - -func benchSign(b *testing.B, algo SigningAlgorithm, halg hash.Hasher) { - seed := make([]byte, 48) - for j := 0; j < len(seed); j++ { - seed[j] = byte(j) - } - sk, err := GeneratePrivateKey(algo, seed) - require.NoError(b, err) - - input := []byte("Bench input") - var signature []byte - - b.ResetTimer() - for i := 0; i < b.N; i++ { - signature, err = sk.Sign(input, halg) - require.NoError(b, err) - } - // sanity check - result, err := sk.PublicKey().Verify(signature, input, halg) - require.NoError(b, err) - require.True(b, result) - - b.StopTimer() -} diff --git a/crypto/spock.go b/crypto/spock.go deleted file mode 100644 index da269c23ac1..00000000000 --- a/crypto/spock.go +++ /dev/null @@ -1,101 +0,0 @@ -package crypto - -// SPoCK design based on the BLS signature scheme. -// BLS uses the BLS12-381 curve and the same settings as in bls.go. - -// #include "bls_include.h" -import "C" -import ( - "fmt" - - "github.com/onflow/flow-go/crypto/hash" -) - -// SPOCKProve generates a SPoCK proof for data under the private key sk. -// -// The function returns: -// - (nil, nilHasherError) if the hasher is nil -// - (nil, invalidHasherSizeError) if the hasher's output size is not 128 bytes -// - (nil, notBLSKeyError) if the input key is not a BLS key -// - (nil, error) if an unexpected error occurs -// - (proof, nil) otherwise -func SPOCKProve(sk PrivateKey, data []byte, kmac hash.Hasher) (Signature, error) { - if sk.Algorithm() != BLSBLS12381 { - return nil, notBLSKeyError - } - - // BLS signature of data - return sk.Sign(data, kmac) -} - -// SPOCKVerifyAgainstData verifies that a SPoCK proof is generated from the given data -// and the prover's public key. -// -// This is a simple BLS signature verification of the proof under the input data -// and public key. -// -// The function returns: -// - (false, notBLSKeyError) if the input key is not a BLS key -// - (false, nilHasherError) if the hasher is nil -// - (false, invalidHasherSizeError) if the hasher's output size is not 128 bytes -// - (false, error) if an unexpected error occurs -// - (validity, nil) otherwise -func SPOCKVerifyAgainstData(pk PublicKey, proof Signature, data []byte, kmac hash.Hasher) (bool, error) { - if pk.Algorithm() != BLSBLS12381 { - return false, notBLSKeyError - } - // BLS verification of data - return pk.Verify(proof, data, kmac) -} - -// SPOCKVerify checks whether two couples of (SPoCK proof, public key) are consistent. -// -// Two (SPoCK proof, public key) couples are consistent if there exists a message such -// that each proof could be generated from the message and the private key corresponding -// to the respective public key.
-// -// If the input proof slices have an invalid length or fail to deserialize into curve -// points, the function returns false without an error. -// The proofs' membership checks in G1 are included in the verification. -// -// The function does not check the public keys' membership in G2 because it is -// guaranteed by the package. However, the caller must make sure each input public key has been -// verified against a proof of possession prior to calling this function. -// -// The function returns: -// - (false, notBLSKeyError) if at least one key is not a BLS key. -// - (false, error) if an unexpected error occurs. -// - (validity, nil) otherwise -func SPOCKVerify(pk1 PublicKey, proof1 Signature, pk2 PublicKey, proof2 Signature) (bool, error) { - blsPk1, ok1 := pk1.(*pubKeyBLSBLS12381) - blsPk2, ok2 := pk2.(*pubKeyBLSBLS12381) - if !(ok1 && ok2) { - return false, notBLSKeyError - } - - if len(proof1) != g1BytesLen || len(proof2) != g1BytesLen { - return false, nil - } - - // if pk1 and proof1 are identities of their respective groups, any couple (pk2, proof2) would - // verify the pairing equality which breaks the unforgeability of the SPoCK scheme. This edge case - // is avoided by not allowing an identity pk1. Similarly, an identity pk2 is not allowed. - if blsPk1.isIdentity || blsPk2.isIdentity { - return false, nil - } - - // verify the spock proof using the secret data - verif := C.bls_spock_verify((*C.E2)(&blsPk1.point), - (*C.uchar)(&proof1[0]), - (*C.E2)(&blsPk2.point), - (*C.uchar)(&proof2[0])) - - switch verif { - case invalid: - return false, nil - case valid: - return true, nil - default: - return false, fmt.Errorf("SPoCK verification failed") - } -} diff --git a/crypto/spock_test.go b/crypto/spock_test.go deleted file mode 100644 index 59498a42f6f..00000000000 --- a/crypto/spock_test.go +++ /dev/null @@ -1,182 +0,0 @@ -package crypto - -import ( - crand "crypto/rand" - "testing" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestSPOCKProveVerifyAgainstData(t *testing.T) { - // test the consistency with different data - seed := make([]byte, KeyGenSeedMinLen) - data := make([]byte, 100) - - n, err := crand.Read(seed) - require.Equal(t, n, KeyGenSeedMinLen) - require.NoError(t, err) - sk, err := GeneratePrivateKey(BLSBLS12381, seed) - require.NoError(t, err) - _, err = crand.Read(data) - require.NoError(t, err) - - // generate a SPoCK proof - kmac := NewExpandMsgXOFKMAC128("spock test") - s, err := SPOCKProve(sk, data, kmac) - require.NoError(t, err) - pk := sk.PublicKey() - - // SPoCK verify against the data (happy path) - t.Run("correctness check", func(t *testing.T) { - result, err := SPOCKVerifyAgainstData(pk, s, data, kmac) - require.NoError(t, err) - assert.True(t, result, - "Verification should succeed:\n signature:%s\n message:%s\n private key:%s", s, data, sk) - }) - - // test with a different message (unhappy path) - t.Run("invalid message", func(t *testing.T) { - data[0] ^= 1 - result, err := SPOCKVerifyAgainstData(pk, s, data, kmac) - require.NoError(t, err) - assert.False(t, result, - "Verification should fail:\n signature:%s\n message:%s\n private key:%s", s, data, sk) - data[0] ^= 1 - }) - - // test with a valid but different key (unhappy path) - t.Run("invalid key", func(t *testing.T) { - seed[0] ^= 1 - wrongSk, err := GeneratePrivateKey(BLSBLS12381, seed) - require.NoError(t, err) - result, err := SPOCKVerifyAgainstData(wrongSk.PublicKey(), s, data, kmac) - require.NoError(t, err) - assert.False(t, result,
"Verification should fail:\n signature:%s\n message:%s\n private key:%s", s, data, sk) - }) - - // test with an invalid key type - t.Run("invalid key type", func(t *testing.T) { - wrongSk := invalidSK(t) - result, err := SPOCKVerifyAgainstData(wrongSk.PublicKey(), s, data, kmac) - require.Error(t, err) - assert.True(t, IsNotBLSKeyError(err)) - assert.False(t, result) - }) - - // test with an identity public key - t.Run("identity proof", func(t *testing.T) { - // verifying with a pair of (proof, publicKey) equal to (identity_signature, identity_key) should - // return false - identityProof := g1Serialization - result, err := SPOCKVerifyAgainstData(IdentityBLSPublicKey(), identityProof, data, kmac) - assert.NoError(t, err) - assert.False(t, result) - }) -} - -// tests of happy and unhappy paths of SPOCKVerify -func TestSPOCKProveVerify(t *testing.T) { - // test the consistency with different data - seed1 := make([]byte, KeyGenSeedMinLen) - seed2 := make([]byte, KeyGenSeedMinLen) - data := make([]byte, 100) - - // data - _, err := crand.Read(data) - require.NoError(t, err) - // sk1 - n, err := crand.Read(seed1) - require.Equal(t, n, KeyGenSeedMinLen) - require.NoError(t, err) - sk1, err := GeneratePrivateKey(BLSBLS12381, seed1) - require.NoError(t, err) - // sk2 - n, err = crand.Read(seed2) - require.Equal(t, n, KeyGenSeedMinLen) - require.NoError(t, err) - sk2, err := GeneratePrivateKey(BLSBLS12381, seed2) - require.NoError(t, err) - - // generate SPoCK proofs - kmac := NewExpandMsgXOFKMAC128("spock test") - pr1, err := SPOCKProve(sk1, data, kmac) - require.NoError(t, err) - pr2, err := SPOCKProve(sk2, data, kmac) - require.NoError(t, err) - - // SPoCK verify against the data, happy path - t.Run("correctness check", func(t *testing.T) { - result, err := SPOCKVerify(sk1.PublicKey(), pr1, sk2.PublicKey(), pr2) - require.NoError(t, err) - assert.True(t, result, - "Verification should succeed:\n proofs:%s\n %s\n private keys:%s\n %s\n data:%x", - pr1, pr2, sk1, sk2, data) - }) - - // test with a different message, verification should fail for proofs - // of different messages. - t.Run("inconsistent proofs", func(t *testing.T) { - data[0] ^= 1 // alter the data - pr2bis, err := SPOCKProve(sk2, data, kmac) - require.NoError(t, err) - result, err := SPOCKVerify(sk1.PublicKey(), pr1, sk2.PublicKey(), pr2bis) - require.NoError(t, err) - assert.False(t, result, - "Verification should fail:\n proofs:%s\n %s\n private keys:%s\n %s \n data:%x", - pr1, pr2bis, sk1, sk2, data) - data[0] ^= 1 // restore the data - }) - - // test with a different key, verification should fail if the public keys are not - // matching the private keys used to generate the proofs. 
- t.Run("invalid public key", func(t *testing.T) { - seed2[0] ^= 1 // alter the seed - sk2bis, err := GeneratePrivateKey(BLSBLS12381, seed2) - require.NoError(t, err) - result, err := SPOCKVerify(sk1.PublicKey(), pr1, sk2bis.PublicKey(), pr2) - require.NoError(t, err) - assert.False(t, result, - "Verification should succeed:\n proofs:%s\n %s\n private keys:%s\n %s \n data:%s", - pr1, pr2, sk1, sk2bis, data) - }) - - // test with an invalid key type - t.Run("invalid key type", func(t *testing.T) { - wrongSk := invalidSK(t) - - pr, err := SPOCKProve(wrongSk, data, kmac) - require.Error(t, err) - assert.True(t, IsNotBLSKeyError(err)) - assert.Nil(t, pr) - - result, err := SPOCKVerify(wrongSk.PublicKey(), pr1, sk2.PublicKey(), pr2) - require.Error(t, err) - assert.True(t, IsNotBLSKeyError(err)) - assert.False(t, result) - - result, err = SPOCKVerify(sk1.PublicKey(), pr1, wrongSk.PublicKey(), pr2) - require.Error(t, err) - assert.True(t, IsNotBLSKeyError(err)) - assert.False(t, result) - }) - - // test with identity public key and proof - t.Run("identity proof", func(t *testing.T) { - // verifying with either pair of (proof, publicKey) equal to (identity_signature, identity_key) should - // return falsen with any other (proof, key) pair. - identityProof := g1Serialization - result, err := SPOCKVerify(IdentityBLSPublicKey(), identityProof, sk2.PublicKey(), pr2) - assert.NoError(t, err) - assert.False(t, result) - - result, err = SPOCKVerify(sk1.PublicKey(), pr1, IdentityBLSPublicKey(), identityProof) - assert.NoError(t, err) - assert.False(t, result) - - result, err = SPOCKVerify(IdentityBLSPublicKey(), identityProof, IdentityBLSPublicKey(), identityProof) - assert.NoError(t, err) - assert.False(t, result) - }) -} diff --git a/crypto/thresholdsign.go b/crypto/thresholdsign.go deleted file mode 100644 index 2dae7061b76..00000000000 --- a/crypto/thresholdsign.go +++ /dev/null @@ -1,146 +0,0 @@ -package crypto - -import ( - "errors" - "fmt" -) - -// A threshold signature scheme allows any subset of (t+1) -// valid signature shares to reconstruct the threshold signature. -// Up to (t) shares do not reveal any information about the threshold -// signature. -// Although the API allows using arbitrary values of (t), -// the threshold signature scheme is secure in the presence of up to (t) -// malicious participants when (t < n/2). -// In order to optimize equally for unforgeability and robustness, -// the input threshold value (t) should be set to t = floor((n-1)/2). - -const ( - // ThresholdSignMinSize is the minimum size of a group participating in a threshold signature protocol - ThresholdSignMinSize = MinimumThreshold + 1 - // ThresholdSignMaxSize is the maximum size of a group participating in a threshold signature protocol - ThresholdSignMaxSize = DKGMaxSize -) - -// ThresholdSignatureInspector is an inspector of the threshold signature protocol. -// The interface only allows inspecting the threshold signing protocol without taking part in it. -type ThresholdSignatureInspector interface { - // VerifyShare verifies the input signature against the stored message and stored - // key at the input index. This function does not update the internal state. - // The function is thread-safe. 
- // Returns: - // - (true, nil) if the signature is valid - // - (false, nil) if `orig` is a valid index but the signature share is invalid - // - (false, InvalidInputsError) if `orig` is an invalid index value - // - (false, error) for all other unexpected errors - VerifyShare(orig int, share Signature) (bool, error) - - // VerifyThresholdSignature verifies the input signature against the stored - // message and stored group public key. It does not update the internal state. - // The function is thread-safe. - // Returns: - // - (true, nil) if the signature is valid - // - (false, nil) if the signature is invalid - // - (false, error) for all other unexpected errors - VerifyThresholdSignature(thresholdSignature Signature) (bool, error) - - // EnoughShares indicates whether enough shares have been accumulated in order to reconstruct - // a group signature. This function is thread safe and locks the internal state. - // Returns: - // - true if and only if at least (threshold+1) shares were added - EnoughShares() bool - - // TrustedAdd adds a signature share to the internal pool of shares - // without verifying the signature against the message and the participant's - // public key. This function is thread safe and locks the internal state. - // - // The share is only added if the signer index is valid and has not been - // added yet. Moreover, the share is added only if not enough shares were collected. - // The function returns: - // - (true, nil) if enough signature shares were already collected and no error occurred - // - (false, nil) if not enough shares were collected and no error occurred - // - (false, InvalidInputsError) if index is invalid - // - (false, duplicatedSignerError) if a signature for the index was previously added - TrustedAdd(orig int, share Signature) (bool, error) - - // VerifyAndAdd verifies a signature share (same as `VerifyShare`), - // and may or may not add the share to the local pool of shares. - // This function is thread safe and locks the internal state. - // - // The share is only added if the signature is valid, the signer index is valid and has not been - // added yet. Moreover, the share is added only if not enough shares were collected. - // Boolean returns: - // - First boolean output is true if the share is valid and no error is returned, and false otherwise. - // - Second boolean output is true if enough shares were collected and no error is returned, and false otherwise. - // Error returns: - // - invalidInputsError if input index is invalid. A signature that doesn't verify against the signer's - // public key is not considered an invalid input. - // - duplicatedSignerError if signer was already added. - // - other errors if an unexpected exception occurred. - VerifyAndAdd(orig int, share Signature) (bool, bool, error) - - // HasShare checks whether the internal map contains the share of the given index. - // This function is thread safe. - // The function errors with InvalidInputsError if the index is invalid. - HasShare(orig int) (bool, error) - - // ThresholdSignature returns the threshold signature if the threshold was reached. - // The threshold signature is reconstructed only once and is cached for subsequent calls. - // - // Returns: - // - (signature, nil) if no error occurred - // - (nil, notEnoughSharesError) if not enough shares were collected - // - (nil, invalidSignatureError) if at least one collected share does not serialize to a valid BLS signature. 
- // - (nil, invalidInputsError) if the constructed signature failed to verify against the group public key and stored message. This post-verification - // is required for safety, as `TrustedAdd` allows adding invalid signatures. - // - (nil, error) for any other unexpected error. - ThresholdSignature() (Signature, error) -} - -// ThresholdSignatureParticipant is a participant in a threshold signature protocol. -// A participant is able to participate in a threshold signing protocol as well as inspect the -// protocol. -type ThresholdSignatureParticipant interface { - ThresholdSignatureInspector - // SignShare generates a signature share using the current private key share. - // - // The function does not add the share to the internal pool of shares and does - // not update the internal state. - // This function is thread-safe. - // No error is expected unless an unexpected exception occurs. - SignShare() (Signature, error) -} - -// duplicatedSignerError is an error returned when TrustedAdd or VerifyAndAdd encounter -// a signature share that has already been added to the internal state. -type duplicatedSignerError struct { - error -} - -// duplicatedSignerErrorf constructs a new duplicatedSignerError -func duplicatedSignerErrorf(msg string, args ...interface{}) error { - return &duplicatedSignerError{error: fmt.Errorf(msg, args...)} -} - -// IsDuplicatedSignerError checks if the input error is a duplicatedSignerError -func IsDuplicatedSignerError(err error) bool { - var target *duplicatedSignerError - return errors.As(err, &target) -} - -// notEnoughSharesError is an error returned when ThresholdSignature is called -// and not enough shares have been collected. -type notEnoughSharesError struct { - error -} - -// notEnoughSharesErrorf constructs a new notEnoughSharesError -func notEnoughSharesErrorf(msg string, args ...interface{}) error { - return &notEnoughSharesError{error: fmt.Errorf(msg, args...)} -} - -// IsNotEnoughSharesError checks if the input error is a notEnoughSharesError -func IsNotEnoughSharesError(err error) bool { - var target *notEnoughSharesError - return errors.As(err, &target) -}
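The interfaces above describe the full share-collection flow. As a hedged illustration (not from the original file): the `ShareMsg` type, the signer index `myIndex`, and the share transport channel are assumptions of this sketch, which would live inside the package; only the interface calls are from the package itself.

```go
// ShareMsg carries a signature share and its signer index (assumption of this sketch).
type ShareMsg struct {
	Orig  int
	Share Signature
}

// collectAndReconstruct drives a ThresholdSignatureParticipant: it contributes
// its own share, verifies and accumulates incoming shares until (threshold+1)
// shares are collected, then reconstructs the group signature.
func collectAndReconstruct(me ThresholdSignatureParticipant, myIndex int, shares <-chan ShareMsg) (Signature, error) {
	// contribute our own share first; it is trusted since we produced it
	own, err := me.SignShare()
	if err != nil {
		return nil, err
	}
	if _, err := me.TrustedAdd(myIndex, own); err != nil {
		return nil, err
	}
	// verify and add incoming shares until enough shares are accumulated
	for !me.EnoughShares() {
		msg := <-shares
		_, _, err := me.VerifyAndAdd(msg.Orig, msg.Share)
		if err != nil && !IsDuplicatedSignerError(err) {
			return nil, err
		}
	}
	// reconstruct; the result is post-verified against the group public key
	return me.ThresholdSignature()
}
```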
diff --git a/crypto_adx_flag.mk b/crypto_adx_flag.mk index 667c8d493d3..277a4c3fbb4 100644 --- a/crypto_adx_flag.mk +++ b/crypto_adx_flag.mk @@ -1,4 +1,11 @@ -# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. +# This script can be imported by Makefiles in order to set the `CRYPTO_FLAG` automatically. +# The `CRYPTO_FLAG` is a Go command flag that should be used when the machine's CPU executing +# the command may not support ADX instructions. +# For new machines that support ADX instructions, the `CRYPTO_FLAG` flag is not needed (or can be set +# to an empty string). + +# First, detect ADX support: +# `ADX_SUPPORT` is 1 if ADX instructions are supported on the current machine and 0 otherwise. ifeq ($(shell uname -s),Linux) # detect ADX support on the CURRENT linux machine. ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) @@ -7,11 +14,12 @@ else ADX_SUPPORT := 1 endif +# Then, set `CRYPTO_FLAG` # the crypto package uses BLST source files underneath which may use ADX instructions. ifeq ($(ADX_SUPPORT), 1) -# if ADX instructions are supported, default is to use a fast ADX BLST implementation +# if ADX instructions are supported on the current machine, the default is to use the fast ADX implementation CRYPTO_FLAG := "" else -# if ADX instructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation +# if ADX instructions aren't supported, this CGO flag selects a slower, portable non-ADX implementation CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" endif \ No newline at end of file
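For contributors who want the same detection outside Make, here is a hedged Go translation of the check above; it mirrors the Makefile's `grep` of `/proc/cpuinfo` and its non-Linux default, and is an illustration rather than part of the repository.

```go
package main

import (
	"fmt"
	"os"
	"regexp"
	"runtime"
)

// adxSupported reports whether the current CPU advertises ADX instructions,
// mirroring the Makefile logic: grep /proc/cpuinfo on Linux, assume support elsewhere.
func adxSupported() bool {
	if runtime.GOOS != "linux" {
		return true // same default as the Makefile on non-Linux machines
	}
	data, err := os.ReadFile("/proc/cpuinfo")
	if err != nil {
		return false
	}
	return regexp.MustCompile(`(?m)^flags.*\badx\b`).Match(data)
}

func main() {
	if adxSupported() {
		fmt.Println(`CRYPTO_FLAG=""`) // fast ADX BLST backend
	} else {
		fmt.Println(`CRYPTO_FLAG="-O -D__BLST_PORTABLE__"`) // portable non-ADX backend
	}
}
```

When non-empty, the flag is presumably passed to the Go toolchain through `CGO_CFLAGS`, so that the BLST C sources are compiled with the `__BLST_PORTABLE__` define.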