diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 8fb8b9c94b9..62231a7afeb 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -17,8 +17,6 @@ jobs: go-version: "1.20" - name: Checkout repo uses: actions/checkout@v2 - - name: Build relic - run: make crypto_setup_gopath # Provide Google Service Account credentials to Github Action, allowing interaction with the Google Container Registry # Logging in as github-actions@dl-flow.iam.gserviceaccount.com - id: auth diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f865bc9f0a5..904eebaebe0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -41,8 +41,10 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic - run: make crypto_setup_gopath + - name: Install C formatter + run: sudo apt-get install -y clang-format + - name: Run C formatter and sanitizer for ./crypto + run: make -C crypto c-format && make -C crypto c-sanitize - name: Run go generate run: go generate working-directory: ${{ matrix.dir }} @@ -51,10 +53,11 @@ jobs: with: # Required: the version of golangci-lint is required and must be specified without patch version: we always use the latest patch version. version: v1.54 - args: -v --build-tags relic + args: -v working-directory: ${{ matrix.dir }} # https://github.com/golangci/golangci-lint-action/issues/244 skip-cache: true + tidy: name: Tidy @@ -72,18 +75,6 @@ jobs: - name: code sanity check run: make code-sanity-check - shell-check: - name: ShellCheck - runs-on: ubuntu-latest - steps: - - name: Checkout repo - uses: actions/checkout@v3 - - name: Run ShellCheck - uses: ludeeus/action-shellcheck@203a3fd018dfe73f8ae7e3aa8da2c149a5f41c33 - with: - scandir: './crypto' - ignore: 'relic' - create-dynamic-test-matrix: name: Create Dynamic Test Matrix runs-on: ubuntu-latest @@ -144,20 +135,17 @@ jobs: matrix: include: - name: crypto - make1: -C crypto setup - make2: unittest + setup: noop retries: 1 race: 1 runner: ubuntu-latest - name: insecure - make1: install-tools - make2: test + setup: install-tools retries: 5 race: 0 runner: buildjet-4vcpu-ubuntu-2204 - name: integration - make1: install-tools - make2: test + setup: install-tools retries: 5 race: 0 runner: buildjet-4vcpu-ubuntu-2204 @@ -171,7 +159,7 @@ jobs: go-version: ${{ env.GO_VERSION }} cache: true - name: Setup tests (${{ matrix.name }}) - run: make ${{ matrix.make1 }} + run: make ${{ matrix.setup }} - name: Run tests (${{ matrix.name }}) env: RACE_DETECTOR: ${{ matrix.race }} @@ -179,8 +167,8 @@ jobs: with: timeout_minutes: 35 max_attempts: ${{ matrix.retries }} - # run `make2` target inside each module's root - command: VERBOSE=1 make -C ${{ matrix.name }} ${{ matrix.make2 }} + # run test target inside each module's root + command: VERBOSE=1 make -C ${{ matrix.name }} test - name: Upload coverage report uses: codecov/codecov-action@v3 with: @@ -202,8 +190,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic - run: make crypto_setup_gopath - name: Docker build run: make docker-build-flow docker-build-flow-corrupt - name: Save Docker images @@ -294,8 +280,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic - run: make crypto_setup_gopath - name: Load cached Docker images uses: actions/cache@v3 with: diff --git a/.github/workflows/flaky-test-monitor.yml b/.github/workflows/flaky-test-monitor.yml index 06731f77b9a..b3e380beaaa 100644 --- a/.github/workflows/flaky-test-monitor.yml +++ 
b/.github/workflows/flaky-test-monitor.yml @@ -82,18 +82,15 @@ jobs: matrix: include: - name: crypto - make1: -C crypto setup - make2: unittest + setup: noop race: 1 test_category: unit-crypto - name: insecure - make1: install-tools - make2: test + setup: install-tools race: 0 test_category: unit-insecure - name: integration - make1: install-tools - make2: test + setup: install-tools race: 0 test_category: unit-integration runs-on: ubuntu-latest @@ -106,11 +103,11 @@ jobs: go-version: ${{ env.GO_VERSION }} cache: true - name: Setup tests (${{ matrix.name }}) - run: make ${{ matrix.make1 }} + run: make ${{ matrix.setup }} - name: Run tests (${{ matrix.name }}) env: RACE_DETECTOR: ${{ matrix.race }} - run: make -es -C ${{ matrix.name }} ${{ matrix.make2 }} > test-output + run: make -es -C ${{ matrix.name }} test > test-output timeout-minutes: 100 continue-on-error: true - name: Process test results (${{ matrix.name }}) @@ -167,8 +164,6 @@ jobs: with: go-version: ${{ env.GO_VERSION }} cache: true - - name: Build relic - run: make crypto_setup_gopath - name: Docker build run: make docker-build-flow docker-build-flow-corrupt - name: Run tests diff --git a/.github/workflows/tools.yml b/.github/workflows/tools.yml index 77d27066919..c9cfdfbfd5d 100644 --- a/.github/workflows/tools.yml +++ b/.github/workflows/tools.yml @@ -38,8 +38,6 @@ jobs: # to accurately get the version tag fetch-depth: 0 ref: ${{ inputs.tag }} - - name: Build relic - run: make crypto_setup_gopath - name: Build and upload boot-tools run: | make tool-bootstrap tool-transit diff --git a/.gitignore b/.gitignore index 1be2e18a99f..0c025be2692 100644 --- a/.gitignore +++ b/.gitignore @@ -7,8 +7,6 @@ /cmd/util/util /cmd/bootstrap/bootstrap -# crypto relic folder -crypto/relic/ # Test binary, build with `go test -c` *.test diff --git a/Makefile b/Makefile index 6fd27764cc8..9a68b66a9ac 100644 --- a/Makefile +++ b/Makefile @@ -39,16 +39,20 @@ K8S_YAMLS_LOCATION_STAGING=./k8s/staging export CONTAINER_REGISTRY := gcr.io/flow-container-registry export DOCKER_BUILDKIT := 1 -# setup the crypto package under the GOPATH: needed to test packages importing flow-go/crypto -.PHONY: crypto_setup_gopath -crypto_setup_gopath: - bash crypto_setup.sh +include crypto_adx_flag.mk + +CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) + +# needed for CI +.PHONY: noop +noop: + @echo "This is a no-op target" cmd/collection/collection: - go build -o cmd/collection/collection cmd/collection/main.go + $(CGO_FLAG) go build -o cmd/collection/collection cmd/collection/main.go cmd/util/util: - go build -o cmd/util/util --tags relic cmd/util/main.go + $(CGO_FLAG) go build -o cmd/util/util cmd/util/main.go .PHONY: update-core-contracts-version update-core-contracts-version: @@ -64,13 +68,10 @@ update-cadence-version: ./scripts/update-cadence.sh $(CC_VERSION) make tidy -############################################################################################ -# CAUTION: DO NOT MODIFY THESE TARGETS! 
DOING SO WILL BREAK THE FLAKY TEST MONITOR - .PHONY: unittest-main unittest-main: - # test all packages with Relic library enabled - go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) -covermode=atomic $(if $(RACE_DETECTOR),-race,) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic $(GO_TEST_PACKAGES) + # test all packages + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) -covermode=atomic $(if $(RACE_DETECTOR),-race,) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(GO_TEST_PACKAGES) .PHONY: install-mock-generators install-mock-generators: @@ -79,7 +80,7 @@ install-mock-generators: go install github.com/golang/mock/mockgen@v1.6.0; .PHONY: install-tools -install-tools: crypto_setup_gopath check-go-version install-mock-generators +install-tools: check-go-version install-mock-generators cd ${GOPATH}; \ go install github.com/golang/protobuf/protoc-gen-go@v1.3.2; \ go install github.com/uber/prototool/cmd/prototool@v1.9.0; \ @@ -90,13 +91,6 @@ install-tools: crypto_setup_gopath check-go-version install-mock-generators verify-mocks: tidy generate-mocks git diff --exit-code -############################################################################################ - -.PHONY: emulator-norelic-check -emulator-norelic-check: - # test the fvm package compiles with Relic library disabled (required for the emulator build) - cd ./fvm && go test ./... -run=NoTestHasThisPrefix - .SILENT: go-math-rand-check go-math-rand-check: # check that the insecure math/rand Go package isn't used by production code. @@ -112,12 +106,12 @@ go-math-rand-check: fi .PHONY: code-sanity-check -code-sanity-check: go-math-rand-check emulator-norelic-check +code-sanity-check: go-math-rand-check .PHONY: fuzz-fvm fuzz-fvm: # run fuzz tests in the fvm package - cd ./fvm && go test -fuzz=Fuzz -run ^$$ --tags relic + cd ./fvm && $(CGO_FLAG) go test -fuzz=Fuzz -run ^$$ .PHONY: test test: verify-mocks unittest-main @@ -155,14 +149,14 @@ generate-proto: .PHONY: generate-fvm-env-wrappers generate-fvm-env-wrappers: - go run ./fvm/environment/generate-wrappers fvm/environment/parse_restricted_checker.go + $(CGO_FLAG) go run ./fvm/environment/generate-wrappers fvm/environment/parse_restricted_checker.go .PHONY: generate-mocks generate-mocks: install-mock-generators mockery --name '(Connector|PingInfoProvider)' --dir=network/p2p --case=underscore --output="./network/mocknetwork" --outpkg="mocknetwork" - mockgen -destination=storage/mocks/storage.go -package=mocks github.com/onflow/flow-go/storage Blocks,Headers,Payloads,Collections,Commits,Events,ServiceEvents,TransactionResults - mockgen -destination=module/mocks/network.go -package=mocks github.com/onflow/flow-go/module Local,Requester - mockgen -destination=network/mocknetwork/mock_network.go -package=mocknetwork github.com/onflow/flow-go/network EngineRegistry + $(CGO_FLAG) mockgen -destination=storage/mocks/storage.go -package=mocks github.com/onflow/flow-go/storage Blocks,Headers,Payloads,Collections,Commits,Events,ServiceEvents,TransactionResults + $(CGO_FLAG) mockgen -destination=module/mocks/network.go -package=mocks github.com/onflow/flow-go/module Local,Requester + $(CGO_FLAG) mockgen -destination=network/mocknetwork/mock_network.go -package=mocknetwork github.com/onflow/flow-go/network EngineRegistry mockery --name='.*' --dir=integration/benchmark/mocksiface --case=underscore --output="integration/benchmark/mock" --outpkg="mock" mockery --name=ExecutionDataStore 
--dir=module/executiondatasync/execution_data --case=underscore --output="./module/executiondatasync/execution_data/mock" --outpkg="mock" mockery --name=Downloader --dir=module/executiondatasync/execution_data --case=underscore --output="./module/executiondatasync/execution_data/mock" --outpkg="mock" @@ -177,7 +171,7 @@ generate-mocks: install-mock-generators mockery --name 'ProviderEngine' --dir=engine/execution/provider --case=underscore --output="engine/execution/provider/mock" --outpkg="mock" (cd ./crypto && mockery --name 'PublicKey' --case=underscore --output="../module/mock" --outpkg="mock") mockery --name '.*' --dir=state/cluster --case=underscore --output="state/cluster/mock" --outpkg="mock" - mockery --name '.*' --dir=module --case=underscore --tags="relic" --output="./module/mock" --outpkg="mock" + mockery --name '.*' --dir=module --case=underscore --output="./module/mock" --outpkg="mock" mockery --name '.*' --dir=module/mempool --case=underscore --output="./module/mempool/mock" --outpkg="mempool" mockery --name '.*' --dir=module/component --case=underscore --output="./module/component/mock" --outpkg="component" mockery --name '.*' --dir=network --case=underscore --output="./network/mocknetwork" --outpkg="mocknetwork" @@ -233,12 +227,12 @@ tidy: .PHONY: lint lint: tidy # revive -config revive.toml -exclude storage/ledger/trie ./... - golangci-lint run -v --build-tags relic ./... + golangci-lint run -v ./... .PHONY: fix-lint fix-lint: # revive -config revive.toml -exclude storage/ledger/trie ./... - golangci-lint run -v --build-tags relic --fix ./... + golangci-lint run -v --fix ./... # Runs unit tests with different list of packages as passed by CI so they run in parallel .PHONY: ci @@ -246,7 +240,7 @@ ci: install-tools test # Runs integration tests .PHONY: ci-integration -ci-integration: crypto_setup_gopath +ci-integration: $(MAKE) -C integration ci-integration-test # Runs benchmark tests @@ -268,7 +262,6 @@ docker-ci: # Runs integration tests in Docker (for mac) .PHONY: docker-ci-integration docker-ci-integration: - rm -rf crypto/relic docker run \ --env DOCKER_API_VERSION='1.39' \ --network host \ @@ -281,59 +274,59 @@ docker-ci-integration: .PHONY: docker-build-collection docker-build-collection: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/collection:latest" -t "$(CONTAINER_REGISTRY)/collection:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/collection:$(IMAGE_TAG)" . 
.PHONY: docker-build-collection-without-netgo docker-build-collection-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/collection:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-collection-debug docker-build-collection-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/collection --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/collection-debug:latest" -t "$(CONTAINER_REGISTRY)/collection-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/collection-debug:$(IMAGE_TAG)" . .PHONY: docker-build-consensus docker-build-consensus: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/consensus:latest" -t "$(CONTAINER_REGISTRY)/consensus:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/consensus:$(IMAGE_TAG)" . .PHONY: docker-build-consensus-without-netgo docker-build-consensus-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/consensus:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-consensus-debug docker-build-consensus-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/consensus --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/consensus-debug:latest" -t "$(CONTAINER_REGISTRY)/consensus-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/consensus-debug:$(IMAGE_TAG)" . 
.PHONY: docker-build-execution docker-build-execution: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/execution:latest" -t "$(CONTAINER_REGISTRY)/execution:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/execution:$(IMAGE_TAG)" . .PHONY: docker-build-execution-without-netgo docker-build-execution-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/execution:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-execution-debug docker-build-execution-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/execution-debug:latest" -t "$(CONTAINER_REGISTRY)/execution-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/execution-debug:$(IMAGE_TAG)" . # build corrupt execution node for BFT testing @@ -341,28 +334,28 @@ docker-build-execution-debug: docker-build-execution-corrupt: # temporarily make insecure/ a non-module to allow Docker to use corrupt builders there ./insecure/cmd/mods_override.sh - docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/execution --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/execution-corrupted:latest" -t "$(CONTAINER_REGISTRY)/execution-corrupted:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/execution-corrupted:$(IMAGE_TAG)" . 
./insecure/cmd/mods_restore.sh .PHONY: docker-build-verification docker-build-verification: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/verification:latest" -t "$(CONTAINER_REGISTRY)/verification:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/verification:$(IMAGE_TAG)" . .PHONY: docker-build-verification-without-netgo docker-build-verification-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/verification:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-verification-debug docker-build-verification-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/verification-debug:latest" -t "$(CONTAINER_REGISTRY)/verification-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/verification-debug:$(IMAGE_TAG)" . # build corrupt verification node for BFT testing @@ -370,28 +363,28 @@ docker-build-verification-debug: docker-build-verification-corrupt: # temporarily make insecure/ a non-module to allow Docker to use corrupt builders there ./insecure/cmd/mods_override.sh - docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/verification --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/verification-corrupted:latest" -t "$(CONTAINER_REGISTRY)/verification-corrupted:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/verification-corrupted:$(IMAGE_TAG)" . 
./insecure/cmd/mods_restore.sh .PHONY: docker-build-access docker-build-access: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/access:latest" -t "$(CONTAINER_REGISTRY)/access:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/access:$(IMAGE_TAG)" . .PHONY: docker-build-access-without-netgo docker-build-access-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/access:$(IMAGE_TAG_NO_NETGO)" . .PHONY: docker-build-access-debug docker-build-access-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/access-debug:latest" -t "$(CONTAINER_REGISTRY)/access-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/access-debug:$(IMAGE_TAG)" . # build corrupt access node for BFT testing @@ -399,21 +392,21 @@ docker-build-access-debug: docker-build-access-corrupt: #temporarily make insecure/ a non-module to allow Docker to use corrupt builders there ./insecure/cmd/mods_override.sh - docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./insecure/cmd/access --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/access-corrupted:latest" -t "$(CONTAINER_REGISTRY)/access-corrupted:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/access-corrupted:$(IMAGE_TAG)" . 
./insecure/cmd/mods_restore.sh .PHONY: docker-build-observer docker-build-observer: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/observer:latest" -t "$(CONTAINER_REGISTRY)/observer:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/observer:$(IMAGE_TAG)" . .PHONY: docker-build-observer-without-netgo docker-build-observer-without-netgo: - docker build -f cmd/Dockerfile --build-arg TAGS=relic --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/observer --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG_NO_NETGO) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --secret id=git_creds,env=GITHUB_CREDS --build-arg GOPRIVATE=$(GOPRIVATE) \ --label "git_commit=${COMMIT}" --label "git_tag=$(IMAGE_TAG_NO_NETGO)" \ -t "$(CONTAINER_REGISTRY)/observer:$(IMAGE_TAG_NO_NETGO)" . @@ -421,18 +414,18 @@ docker-build-observer-without-netgo: .PHONY: docker-build-ghost docker-build-ghost: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/ghost:latest" -t "$(CONTAINER_REGISTRY)/ghost:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/ghost:$(IMAGE_TAG)" . .PHONY: docker-build-ghost-debug docker-build-ghost-debug: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --target debug \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/ghost --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(IMAGE_TAG) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target debug \ -t "$(CONTAINER_REGISTRY)/ghost-debug:latest" -t "$(CONTAINER_REGISTRY)/ghost-debug:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/ghost-debug:$(IMAGE_TAG)" . PHONY: docker-build-bootstrap docker-build-bootstrap: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap --build-arg GOARCH=$(GOARCH) --build-arg VERSION=$(IMAGE_TAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap --build-arg GOARCH=$(GOARCH) --build-arg VERSION=$(IMAGE_TAG) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/bootstrap:latest" -t "$(CONTAINER_REGISTRY)/bootstrap:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/bootstrap:$(IMAGE_TAG)" . 
@@ -442,7 +435,7 @@ tool-bootstrap: docker-build-bootstrap .PHONY: docker-build-bootstrap-transit docker-build-bootstrap-transit: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap/transit --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(VERSION) --build-arg GOARCH=$(GOARCH) --no-cache \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/bootstrap/transit --build-arg COMMIT=$(COMMIT) --build-arg VERSION=$(VERSION) --build-arg GOARCH=$(GOARCH) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --no-cache \ --target production \ -t "$(CONTAINER_REGISTRY)/bootstrap-transit:latest" -t "$(CONTAINER_REGISTRY)/bootstrap-transit:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/bootstrap-transit:$(IMAGE_TAG)" . @@ -452,7 +445,7 @@ tool-transit: docker-build-bootstrap-transit .PHONY: docker-build-loader docker-build-loader: - docker build -f ./integration/benchmark/cmd/manual/Dockerfile --build-arg TARGET=./benchmark/cmd/manual --target production \ + docker build -f ./integration/benchmark/cmd/manual/Dockerfile --build-arg TARGET=./benchmark/cmd/manual --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ --label "git_commit=${COMMIT}" --label "git_tag=${IMAGE_TAG}" \ -t "$(CONTAINER_REGISTRY)/loader:latest" -t "$(CONTAINER_REGISTRY)/loader:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/loader:$(IMAGE_TAG)" . @@ -632,7 +625,7 @@ docker-all-tools: tool-util tool-remove-execution-fork PHONY: docker-build-util docker-build-util: - docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/util --build-arg GOARCH=$(GOARCH) --build-arg VERSION=$(IMAGE_TAG) --target production \ + docker build -f cmd/Dockerfile --build-arg TARGET=./cmd/util --build-arg GOARCH=$(GOARCH) --build-arg VERSION=$(IMAGE_TAG) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ -t "$(CONTAINER_REGISTRY)/util:latest" -t "$(CONTAINER_REGISTRY)/util:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/util:$(IMAGE_TAG)" . PHONY: tool-util @@ -641,7 +634,7 @@ tool-util: docker-build-util PHONY: docker-build-remove-execution-fork docker-build-remove-execution-fork: - docker build -f cmd/Dockerfile --ssh default --build-arg TARGET=./cmd/util/cmd/remove-execution-fork --build-arg GOARCH=$(GOARCH) --build-arg VERSION=$(IMAGE_TAG) --target production \ + docker build -f cmd/Dockerfile --ssh default --build-arg TARGET=./cmd/util/cmd/remove-execution-fork --build-arg GOARCH=$(GOARCH) --build-arg VERSION=$(IMAGE_TAG) --build-arg CGO_FLAG=$(CRYPTO_FLAG) --target production \ -t "$(CONTAINER_REGISTRY)/remove-execution-fork:latest" -t "$(CONTAINER_REGISTRY)/remove-execution-fork:$(SHORT_COMMIT)" -t "$(CONTAINER_REGISTRY)/remove-execution-fork:$(IMAGE_TAG)" . PHONY: tool-remove-execution-fork diff --git a/README.md b/README.md index 39bd7a13e3e..291e45de347 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,6 @@ The following table lists all work streams and links to their home directory and - Clone this repository - Install [Go](https://golang.org/doc/install) (Flow supports Go 1.18 and later) -- Install [CMake](https://cmake.org/install/), which is used for building the crypto library - Install [Docker](https://docs.docker.com/get-docker/), which is used for running a local network and integration tests - Make sure the [`GOPATH`](https://golang.org/cmd/go/#hdr-GOPATH_environment_variable) and `GOBIN` environment variables are set, and `GOBIN` is added to your path: @@ -75,12 +74,6 @@ The following table lists all work streams and links to their home directory and At this point, you should be ready to build, test, and run Flow! 
🎉 -Note: Whenever the crypto module version imported by "go.mod" is updated to a version that was never locally imported before, the crypto dependency needs to be set-up. If not, you should notice errors about "relic" or "crypto". Run the following command to set-up the new module version: - -```bash -make crypto_setup_gopath -``` - ## Development Workflow ### Testing diff --git a/cmd/Dockerfile b/cmd/Dockerfile index d9d7800546c..5f72b5c1c48 100644 --- a/cmd/Dockerfile +++ b/cmd/Dockerfile @@ -6,7 +6,7 @@ FROM golang:1.20-bullseye AS build-setup RUN apt-get update -RUN apt-get -y install cmake zip +RUN apt-get -y install zip ## (2) Setup crypto dependencies FROM build-setup AS build-env @@ -25,8 +25,7 @@ COPY . . RUN --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ - --mount=type=secret,id=git_creds,dst=/root/.netrc \ - make crypto_setup_gopath + --mount=type=secret,id=git_creds,dst=/root/.netrc #################################### ## (3) Build the production app binary @@ -36,14 +35,16 @@ WORKDIR /app ARG GOARCH=amd64 # TAGS can be overriden to modify the go build tags (e.g. build without netgo) -ARG TAGS="relic,netgo" +ARG TAGS="netgo" +# CGO_FLAG can be overwritten +ARG CGO_FLAG # Keep Go's build cache between builds. # https://github.com/golang/go/issues/27719#issuecomment-514747274 RUN --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ --mount=type=secret,id=git_creds,dst=/root/.netrc \ - CGO_ENABLED=1 GOOS=linux go build --tags "${TAGS}" -ldflags "-extldflags -static \ + CGO_ENABLED=1 GOOS=linux CGO_FLAGS="${CGO_FLAG}" go build --tags "${TAGS}" -ldflags "-extldflags -static \ -X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' -X 'github.com/onflow/flow-go/cmd/build.semver=${VERSION}'" \ -o ./app ${TARGET} @@ -64,7 +65,7 @@ ARG GOARCH=amd64 RUN --mount=type=ssh \ --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ - CGO_ENABLED=1 GOOS=linux go build --tags "relic,netgo" -ldflags "-extldflags -static \ + CGO_ENABLED=1 GOOS=linux CGO_FLAGS="${CGO_FLAG}" go build --tags "netgo" -ldflags "-extldflags -static \ -X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' -X 'github.com/onflow/flow-go/cmd/build.semver=${VERSION}'" \ -gcflags="all=-N -l" -o ./app ${TARGET} diff --git a/cmd/bootstrap/README.md b/cmd/bootstrap/README.md index 14339cc91ac..6b138946ca1 100644 --- a/cmd/bootstrap/README.md +++ b/cmd/bootstrap/README.md @@ -46,7 +46,7 @@ _Each cluster_ of collector nodes needs to have its own root Block and root QC # Usage -`go run -tags relic ./cmd/bootstrap` prints usage information +`go run ./cmd/bootstrap` prints usage information ## Phase 1: Generate networking and staking keys for partner nodes: @@ -65,7 +65,7 @@ If seeds are not provided, the CLI will try to use the system's pseudo-random nu #### Example ```bash -go run -tags relic ./cmd/bootstrap key --address "example.com:1234" --role "consensus" -o ./bootstrap/partner-node-infos +go run ./cmd/bootstrap key --address "example.com:1234" --role "consensus" -o ./bootstrap/partner-node-infos ``` #### Generated output files @@ -97,7 +97,7 @@ Each input is a config file specified as a command line parameter: #### Example ```bash -go run -tags relic ./cmd/bootstrap finalize \ +go run ./cmd/bootstrap finalize \ --root-chain main \ --root-height 0 \ --root-parent 0000000000000000000000000000000000000000000000000000000000000000 \ @@ -152,7 +152,7 @@ go run -tags 
relic ./cmd/bootstrap finalize \ This generates the networking key used by observers to connect to the public libp2p network. It is a different key format than staked nodes and should only be used for Observers. ```bash -go run -tags relic ./cmd/bootstrap observer-network-key -f ./path/network-key +go run ./cmd/bootstrap observer-network-key -f ./path/network-key ``` This key must be kept secret as it's used to encrypt and sign network requests sent by the observers. diff --git a/cmd/bootstrap/cmd/dkg.go b/cmd/bootstrap/cmd/dkg.go index d7069534e64..f87cbde2492 100644 --- a/cmd/bootstrap/cmd/dkg.go +++ b/cmd/bootstrap/cmd/dkg.go @@ -19,7 +19,7 @@ func runBeaconKG(nodes []model.NodeInfo) dkg.DKGData { log.Debug().Msgf("will run DKG") var dkgData dkg.DKGData var err error - dkgData, err = bootstrapDKG.RandomBeaconKG(n, GenerateRandomSeed(crypto.SeedMinLenDKG)) + dkgData, err = bootstrapDKG.RandomBeaconKG(n, GenerateRandomSeed(crypto.KeyGenSeedMinLen)) if err != nil { log.Fatal().Err(err).Msg("error running DKG") } diff --git a/cmd/bootstrap/cmd/genconfig.go b/cmd/bootstrap/cmd/genconfig.go index 404bd5e873e..ccf66104ecc 100644 --- a/cmd/bootstrap/cmd/genconfig.go +++ b/cmd/bootstrap/cmd/genconfig.go @@ -63,7 +63,7 @@ func genconfigCmdRun(_ *cobra.Command, _ []string) { var genconfigCmd = &cobra.Command{ Use: "genconfig", Short: "Generate node-config.json", - Long: "example: go run -tags relic ./cmd/bootstrap genconfig --address-format \"%s-%03d.devnet19.nodes.onflow.org:3569\" --access 2 --collection 3 --consensus 3 --execution 2 --verification 1 --weight 100", + Long: "example: go run ./cmd/bootstrap genconfig --address-format \"%s-%03d.devnet19.nodes.onflow.org:3569\" --access 2 --collection 3 --consensus 3 --execution 2 --verification 1 --weight 100", Run: genconfigCmdRun, } diff --git a/cmd/bootstrap/dkg/dkg_test.go b/cmd/bootstrap/dkg/dkg_test.go index a5d5a56de18..fb92aad0ee0 100644 --- a/cmd/bootstrap/dkg/dkg_test.go +++ b/cmd/bootstrap/dkg/dkg_test.go @@ -10,7 +10,7 @@ import ( ) func TestBeaconKG(t *testing.T) { - seed := unittest.SeedFixture(2 * crypto.SeedMinLenDKG) + seed := unittest.SeedFixture(2 * crypto.KeyGenSeedMinLen) // n = 0 _, err := RandomBeaconKG(0, seed) diff --git a/cmd/bootstrap/run/qc_test.go b/cmd/bootstrap/run/qc_test.go index 5deed36d1ed..4f925a5e793 100644 --- a/cmd/bootstrap/run/qc_test.go +++ b/cmd/bootstrap/run/qc_test.go @@ -50,7 +50,7 @@ func createSignerData(t *testing.T, n int) *ParticipantData { networkingKeys := unittest.NetworkingKeys(n) stakingKeys := unittest.StakingKeys(n) - seed := make([]byte, crypto.SeedMinLenDKG) + seed := make([]byte, crypto.KeyGenSeedMinLen) _, err := rand.Read(seed) require.NoError(t, err) randomBSKs, randomBPKs, groupKey, err := crypto.BLSThresholdKeyGen(n, diff --git a/config/README.md b/config/README.md index 3a4fe42c918..8308efcde8a 100644 --- a/config/README.md +++ b/config/README.md @@ -15,12 +15,12 @@ defined. A single default value can be overridden by setting the CLI flag for th config to false. Override entire config file. ```shell -go build -tags relic -o flow-access-node ./cmd/access +go build -o flow-access-node ./cmd/access ./flow-access-node --config-file=config/config.yml ``` Override a single configuration value. 
```shell -go build -tags relic -o flow-access-node ./cmd/access +go build -o flow-access-node ./cmd/access ./flow-access-node --networking-connection-pruning=false ``` ### Adding a new config value diff --git a/consensus/hotstuff/signature/randombeacon_inspector_test.go b/consensus/hotstuff/signature/randombeacon_inspector_test.go index 5df5b897289..3aead48f822 100644 --- a/consensus/hotstuff/signature/randombeacon_inspector_test.go +++ b/consensus/hotstuff/signature/randombeacon_inspector_test.go @@ -40,7 +40,7 @@ func (rs *randomBeaconSuite) SetupTest() { // generate threshold keys rs.rng = unittest.GetPRG(rs.T()) - seed := make([]byte, crypto.SeedMinLenDKG) + seed := make([]byte, crypto.KeyGenSeedMinLen) _, err := rs.rng.Read(seed) require.NoError(rs.T(), err) rs.skShares, rs.pkShares, rs.pkGroup, err = crypto.BLSThresholdKeyGen(rs.n, rs.threshold, seed) diff --git a/consensus/hotstuff/verification/combined_verifier_v2.go b/consensus/hotstuff/verification/combined_verifier_v2.go index ee67a4ea36a..560cb1f8ece 100644 --- a/consensus/hotstuff/verification/combined_verifier_v2.go +++ b/consensus/hotstuff/verification/combined_verifier_v2.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package verification import ( diff --git a/consensus/hotstuff/verification/combined_verifier_v3.go b/consensus/hotstuff/verification/combined_verifier_v3.go index 8f5f9acd8f0..39af088ae0d 100644 --- a/consensus/hotstuff/verification/combined_verifier_v3.go +++ b/consensus/hotstuff/verification/combined_verifier_v3.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package verification import ( diff --git a/consensus/hotstuff/verification/staking_verifier.go b/consensus/hotstuff/verification/staking_verifier.go index 60b2f45f4d5..ecd5013f171 100644 --- a/consensus/hotstuff/verification/staking_verifier.go +++ b/consensus/hotstuff/verification/staking_verifier.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package verification import ( diff --git a/crypto/.dockerignore b/crypto/.dockerignore deleted file mode 100644 index 5c75f82093a..00000000000 --- a/crypto/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -relic/build diff --git a/crypto/Dockerfile b/crypto/Dockerfile index d75e9543de4..9c3fbff6363 100644 --- a/crypto/Dockerfile +++ b/crypto/Dockerfile @@ -2,7 +2,7 @@ FROM golang:1.20-buster RUN apt-get update -RUN apt-get -y install cmake zip +RUN apt-get -y install zip RUN go install github.com/axw/gocov/gocov@latest RUN go install github.com/matm/gocov-html@latest WORKDIR /go/src/flow diff --git a/crypto/Makefile b/crypto/Makefile index c66774e1033..14016e40619 100644 --- a/crypto/Makefile +++ b/crypto/Makefile @@ -3,6 +3,9 @@ COVER_PROFILE := cover.out IMAGE_TAG := v0.0.7 +# OS +UNAME := $(shell uname -s) + # allows CI to specify whether to have race detection on / off ifeq ($(RACE_DETECTOR),1) RACE_FLAG := -race @@ -10,42 +13,89 @@ else RACE_FLAG := endif -ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) - -.PHONY: setup -setup: - go generate +# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. +ifeq ($(UNAME),Linux) +# detect ADX support on the CURRENT linux machine. 
+ ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) +else +# on non-linux machines, set the flag to 1 by default + ADX_SUPPORT := 1 +endif -# test BLS-related functionalities requiring the Relic library (and hence relic Go build flag) -.PHONY: relic_tests -relic_tests: +# the crypto package uses BLST source files underneath which may use ADX instructions. ifeq ($(ADX_SUPPORT), 1) - go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic $(if $(VERBOSE),-v,) +# if ADX instructions are supported, default is to use a fast ADX BLST implementation + CRYPTO_FLAG := "" else - CGO_CFLAGS="-D__BLST_PORTABLE__" go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic $(if $(VERBOSE),-v,) +# if ADX instructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation + CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" endif +CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) -# test all packages that do not require Relic library (all functionalities except the BLS-related ones) -.PHONY: non_relic_tests -non_relic_tests: -# root package without relic - go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) -# sub packages - go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./hash - go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./random +# format C code +.PHONY: c-format +c-format: + clang-format -style=llvm -dump-config > .clang-format + clang-format -i *.c + clang-format -i *.h + rm -f .clang-format + git diff --exit-code -############################################################################################ -# CAUTION: DO NOT MODIFY THIS TARGET! DOING SO WILL BREAK THE FLAKY TEST MONITOR +# address sanitization and other checks +.SILENT: c-asan +c-asan: +# - address sanitization and other checks (only on linux) + if [ $(UNAME) = "Linux" ]; then \ + $(CGO_FLAG) CC="clang -O0 -g -fsanitize=address -fno-omit-frame-pointer -fsanitize=leak -fsanitize=undefined -fno-sanitize-recover=all -fsanitize=float-divide-by-zero -fsanitize=float-cast-overflow -fno-sanitize=null -fno-sanitize=alignment" \ + LD="-fsanitize=address -fsanitize=leak" go test; \ + if [ $$? -ne 0 ]; then exit 1; fi; \ + else \ + echo "sanitization is only supported on Linux"; \ + fi; \ -# sets up the crypto module and runs all tests -.PHONY: test -test: setup unittest +# memory sanitization +.SILENT: c-msan +c-msan: +# - memory sanitization (only on linux and using clang) - (could use go test -msan) +# currently, this leads to many false positives, most likely because of assembly code not handled properly +# by asan. If you would like to run this command, you can use `NO_MSAN` to diable msan in some C functions. +# For instance "void NO_MSAN f() {...}" disables msan in function f. `NO_MSAN` is already defined in +# bls12381_utils.h + if [ $(UNAME) = "Linux" ]; then \ + $(CGO_FLAG) CC="clang -DMSAN -O0 -g -fsanitize=memory -fno-omit-frame-pointer -fsanitize-memory-track-origins" \ + LD="-fsanitize=memory" go test; \ + if [ $$? 
-ne 0 ]; then exit 1; fi; \ + else \ + echo "sanitization is only supported on Linux"; \ + fi; \ + +# sanitize C code +.SILENT: c-sanitize +c-sanitize: c-asan +# - address sanitization and other checks (only on linux) +# - memory sanitization (target m-san) is disabled because of multiple false positives -# runs the unit tests of the module (assumes the module was set up) -.PHONY: unittest -unittest: relic_tests non_relic_tests +# Go tidy +.PHONY: go-tidy +go-tidy: + go mod tidy -v + git diff --exit-code -############################################################################################ +# Go lint +.PHONY: go-lint +go-lint: +lint: go-tidy + # revive -config revive.toml + golangci-lint run -v ./... + +# test all packages +.PHONY: test +test: +# root package + $(CGO_FLAG) go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) +# sub packages + $(CGO_FLAG) go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./hash + $(CGO_FLAG) go test -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) $(if $(VERBOSE),-v,) ./random .PHONY: docker-build docker-build: diff --git a/crypto/README.md b/crypto/README.md index 9f29ad03e16..c15d0a36462 100644 --- a/crypto/README.md +++ b/crypto/README.md @@ -6,86 +6,22 @@ Most of the primitives and protocols can be used in other projects and are not s Flow is an ongoing project, which means that new features will still be added and modifications will still be made to improve security and performance of the cryptography package. Notes: - - The package has been audited for security in January 2021 on [this version](https://github.com/onflow/flow-go/tree/2707acdabb851138e298b2d186e73f47df8a14dd). The package had a few improvements since. + - The package has been audited for security in January 2021 on [this version](https://github.com/onflow/flow-go/tree/2707acdabb851138e298b2d186e73f47df8a14dd). The package had a major refactor to switch all the BLS12-381 curve implementation to use [BLST](https://github.com/supranational/blst/tree/master/src) starting from [this version](TODO: link the commit/tag). - The package does not provide security against side channel or fault attacks. ## Package import -Cloning Flow repository and following the [installation steps](https://github.com/onflow/flow-go) builds the necessary tools to use Flow cryptography. +To use the Flow cryptography package, you can: -If you wish to only import the Flow cryptography package into your Go project, please follow the following steps: - -- Get Flow cryptography package +- get the package ``` go get github.com/onflow/flow-go/crypto ``` -or simply import the package to your Go project +- or simply import the package to your Go project ``` import "github.com/onflow/flow-go/crypto" ``` -This is enough to run the package code for many functionalities. However, this isn't enough if BLS signature related functionalities are used. The BLS features rely on an extrnal C library ([Relic](https://github.com/relic-toolkit/relic)) for lower level mathematical operations. Building your project at this stage including BLS functionalities would result in build errors related to missing "relic" files. For instance: -``` -fatal error: 'relic.h' file not found -#include "relic.h" - ^~~~~~~~~ -``` - - An extra step is required to compile the external dependency (Relic) locally. 
- -- Install [CMake](https://cmake.org/install/), which is used for building the package. The build also requires [Git](http://git-scm.com/) and bash scripting. -- From the Go package directory in `$GOPATH/pkg/mod/github.com/onflow/flow-go/crypto@/`, build the package dependencies. `version-tag` is the imported package version. -For instance: -``` -cd $GOPATH/pkg/mod/github.com/onflow/flow-go/crypto@v0.25.0/ -go generate -``` - -Below is a bash script example to automate the above steps. The script can be copied into your Go project root directory. -It extracts the imported pacakage version from your project's go.mod file and performs the remaining steps. -```bash -#!/bin/bash - -# crypto package -PKG_NAME="github.com/onflow/flow-go/crypto" - -# go get the package -go get ${PKG_NAME} - -# go.mod -MOD_FILE="./go.mod" - -# the version of onflow/flow-go/crypto used in the project is read from the go.mod file -if [ -f "${MOD_FILE}" ] -then - # extract the version from the go.mod file - VERSION="$(grep ${PKG_NAME} < ${MOD_FILE} | cut -d' ' -f 2)" - # using the right version, get the package directory path - PKG_DIR="$(go env GOPATH)/pkg/mod/${PKG_NAME}@${VERSION}" -else - { echo "couldn't find go.mod file - make sure the script is in the project root directory"; exit 1; } -fi - -# grant permissions if not existant -if [[ ! -r ${PKG_DIR} || ! -w ${PKG_DIR} || ! -x ${PKG_DIR} ]]; then - sudo chmod -R 755 "${PKG_DIR}" -fi - -# get into the package directory and set up the external dependencies -( - cd "${PKG_DIR}" || { echo "cd into the GOPATH package folder failed"; exit 1; } - go generate -) -``` - - -Finally, when building your project and including any BLS functionality, adding a Go build tag to include the BLS files in the build is required. -The tag is not required when the package is used without BLS functions. It was introduced to avoid build errors when BLS (and therefore Relic) is not needed. - -``` -go build -tags=relic -``` - ## Algorithms ### Hashing and Message Authentication Code: @@ -103,11 +39,11 @@ All signature schemes use the generic interfaces of `PrivateKey` and `PublicKey` * ECDSA * public keys are compressed or uncompressed. - * ephemeral key is derived from the private key, hash and an external entropy using a CSPRNG (based on https://golang.org/pkg/crypto/ecdsa/). + * ephemeral key is derived from the private key, hash and the system entropy (based on https://golang.org/pkg/crypto/ecdsa/). * supports NIST P-256 (secp256r1) and secp256k1 curves. * BLS - * supports [BLS 12-381](https://electriccoin.co/blog/new-snark-curve/) curve. + * supports [BLS12-381](https://electriccoin.co/blog/new-snark-curve/) curve. * is implementing the minimal-signature-size variant: signatures in G1 and public keys in G2. * default set-up uses [compressed](https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) G1/G2 points, @@ -119,18 +55,13 @@ All signature schemes use the generic interfaces of `PrivateKey` and `PublicKey` and BLS_POP_BLS12381G1_XOF:KMAC128_SSWU_RO_POP_ for proofs of possession. * signature verification includes the signature membership check in G1. * public key membership check in G2 is provided outside of the signature verification. 
- * membership check in G1 is using [Bowe's fast check](https://eprint.iacr.org/2019/814.pdf), while membership check in G2 is using a simple scalar multiplication by the group order (both will be updated to use Scott's method) - * non-interactive aggregation of signatures, public keys and private keys. - * multi-signature verification of an aggregated signature of a single message under multiple public keys. - * multi-signature verification of an aggregated signature of multiple messages under multiple public keys. + * aggregation of signatures, public keys and private keys. + * verification of an aggregated signature of a single message under multiple public keys. + * verification of an aggregated signature of multiple messages under multiple public keys. * batch verification of multiple signatures of a single message under multiple - public keys: use a binary tree of aggregations to find the invalid signatures. + public keys, using a binary tree of aggregations. * SPoCK scheme based on BLS: verifies two signatures have been generated from the same message that is unknown to the verifier. - * Future features: - * membership checks in G1/G2 using [Scotts's method](https://eprint.iacr.org/2021/1130.pdf). - * support minimal-pubkey-size variant - ### PRNG * ChaCha20-based CSPRNG @@ -146,9 +77,6 @@ All signature schemes use the generic interfaces of `PrivateKey` and `PublicKey` * key generation (single dealer) to provide the set of keys. * provides a stateless api and a stateful api. - * Future features: - * support a partial signature reconstruction in the stateful api to avoid a long final reconstruction. - ### Discrete-Log based distributed key generation @@ -158,7 +86,7 @@ All supported Distributed Key Generation protocols are [discrete log based](http * simple verifiable secret sharing with a single dealer. * the library does not implement the communication channels between participants. The caller should implement the methods `PrivateSend` (1-to-1 messaging) and `Broadcast` (1-to-n messaging) * 1-to-1 messaging must be a private channel, the caller must make sure the channel preserves confidentialiy and authenticates the sender. - * 1-to-n broadcasting assume all destination participants receive the same copy of the message. The channel should also authenticate the broadcaster. + * 1-to-n broadcasting is a reliable broadcast, where honest senders are able to reach all honest receivers, and where all honest receivers end up with the same received messages. The channel should also authenticate the broadcaster. * It is recommended that both communication channels are unique per protocol instance. This could be achieved by prepending the messages to send/broadcast by a unique protocol instance ID. * Feldman VSS Qual. * an extension of the simple Feldman VSS. diff --git a/crypto/bls.go b/crypto/bls.go index 1e009304fe2..27ddd881bfd 100644 --- a/crypto/bls.go +++ b/crypto/bls.go @@ -1,15 +1,13 @@ -//go:build relic -// +build relic - package crypto -// BLS signature scheme implementation using BLS12-381 curve -// ([zcash]https://electriccoin.co/blog/new-snark-curve/) -// Pairing, ellipic curve and modular arithmetic is using Relic library. -// This implementation does not include any security against side-channel attacks. +// BLS signature scheme implementation using the BLS12-381 curve +// ([zcash]https://electriccoin.co/blog/new-snark-curve/). +// Pairing, ellipic curve and modular arithmetic are using [BLST](https://github.com/supranational/blst/tree/master/src) +// tools underneath. 
+// This implementation does not include security against side-channel or fault attacks. -// existing features: -// - the implementation variant is minimal-signature-size signatures: +// Existing features: +// - the implementation variant is minimal-signature-size: // shorter signatures in G1, longer public keys in G2 // - serialization of points on G1 and G2 is compressed ([zcash] // https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) @@ -21,25 +19,16 @@ package crypto // and BLS_POP_BLS12381G1_XOF:KMAC128_SSWU_RO_POP_ for proofs of possession. // - signature verification checks the membership of signature in G1. // - the public key membership check in G2 is implemented separately from the signature verification. -// - membership check in G1 is implemented using fast Bowe's check (to be updated to Scott's check). -// - membership check in G2 is using a simple scalar multiplication with the group order (to be updated to Scott's check). // - multi-signature tools are defined in bls_multisg.go -// - SPoCK scheme based on BLS: verifies two signatures have been generated from the same message, -// that is unknown to the verifier. - -// future features: -// - membership checks G2 using Bowe's method (https://eprint.iacr.org/2019/814.pdf) -// - implement a G1/G2 swap (signatures on G2 and public keys on G1) +// - SPoCK scheme based on BLS: verifies two signatures are generated from the same message, +// even though the message is unknown to the verifier. -// #cgo CFLAGS: -g -Wall -std=c99 -// #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "bls_include.h" import "C" import ( "bytes" "crypto/sha256" - "errors" "fmt" "golang.org/x/crypto/hkdf" @@ -48,26 +37,15 @@ import ( ) const ( - // BLS12-381 - // p size in bytes, where G1 is defined over the field Zp - fieldSize = 48 - // - // 1 for compressed, 0 for uncompressed - values should not be changed - uncompressed = 0 //nolint - compressed = 1 - // Points compression when serialized - serializationG1 = compressed - serializationG2 = compressed - // - // SignatureLenBLSBLS12381 is the size of G1 elements - SignatureLenBLSBLS12381 = fieldSize * (2 - serializationG1) // the length is divided by 2 if compression is on - PrKeyLenBLSBLS12381 = 32 - // PubKeyLenBLSBLS12381 is the size of G2 elements - PubKeyLenBLSBLS12381 = 2 * fieldSize * (2 - serializationG2) // the length is divided by 2 if compression is on + // SignatureLenBLSBLS12381 is the serialization size of a `G_1` element. + SignatureLenBLSBLS12381 = g1BytesLen + // PubKeyLenBLSBLS12381 is the serialization size of a `G_2` element. + PubKeyLenBLSBLS12381 = g2BytesLen + // PrKeyLenBLSBLS12381 is the serialization size of a `F_r` element, + // where `r` is the order of `G_1` and `G_2`. 
+ PrKeyLenBLSBLS12381 = frBytesLen // Hash to curve params - // expandMsgOutput is the output length of the expand_message step as required by the hash_to_curve algorithm - expandMsgOutput = 2 * (fieldSize + (securityBits / 8)) // hash to curve suite ID of the form : CurveID_ || HashID_ || MapID_ || encodingVariant_ h2cSuiteID = "BLS12381G1_XOF:KMAC128_SSWU_RO_" // scheme implemented as a countermasure for rogue attacks of the form : SchemeTag_ @@ -77,12 +55,13 @@ const ( // Cipher suite used for BLS PoP of the form : BLS_POP_ || h2cSuiteID || SchemeTag_ // The PoP cipher suite is guaranteed to be different than all signature ciphersuites blsPOPCipherSuite = "BLS_POP_" + h2cSuiteID + schemeTag + // expandMsgOutput is the output length of the expand_message step as required by the + // hash_to_curve algorithm (and the map to G1 step). + expandMsgOutput = int(C.MAP_TO_G1_INPUT_LEN) ) // blsBLS12381Algo, embeds SignAlgo type blsBLS12381Algo struct { - // points to Relic context of BLS12-381 with all the parameters - context ctx // the signing algo and parameters algo SigningAlgorithm } @@ -165,12 +144,9 @@ func (sk *prKeyBLSBLS12381) Sign(data []byte, kmac hash.Hasher) (Signature, erro // hash the input to 128 bytes h := kmac.ComputeHash(data) - // set BLS context - blsInstance.reInit() - s := make([]byte, SignatureLenBLSBLS12381) C.bls_sign((*C.uchar)(&s[0]), - (*C.bn_st)(&sk.scalar), + (*C.Fr)(&sk.scalar), (*C.uchar)(&h[0]), (C.int)(len(h))) return s, nil @@ -202,10 +178,7 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) return false, err } - // intialize BLS context - blsInstance.reInit() - - if len(s) != signatureLengthBLSBLS12381 { + if len(s) != SignatureLenBLSBLS12381 { return false, nil } @@ -217,7 +190,7 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) return false, nil } - verif := C.bls_verify((*C.ep2_st)(&pk.point), + verif := C.bls_verify((*C.E2)(&pk.point), (*C.uchar)(&s[0]), (*C.uchar)(&h[0]), (C.int)(len(h))) @@ -228,15 +201,10 @@ func (pk *pubKeyBLSBLS12381) Verify(s Signature, data []byte, kmac hash.Hasher) case valid: return true, nil default: - return false, fmt.Errorf("signature verification failed") + return false, fmt.Errorf("signature verification failed: code %d", verif) } } -// 0xC0 is the header of the point at infinity serialization (either in G1 or G2) -const infinityPointHeader = 0xC0 - -var identityBLSSignature = append([]byte{infinityPointHeader}, make([]byte, signatureLengthBLSBLS12381-1)...) - // IsBLSSignatureIdentity checks whether the input signature is // the identity signature (point at infinity in G1). // @@ -246,7 +214,7 @@ var identityBLSSignature = append([]byte{infinityPointHeader}, make([]byte, sign // suspected to be equal to identity, which avoids failing the aggregated // signature verification. func IsBLSSignatureIdentity(s Signature) bool { - return bytes.Equal(s, identityBLSSignature) + return bytes.Equal(s, g1Serialization) } // generatePrivateKey deterministically generates a private key for BLS on BLS12-381 curve. 
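To make the identity-signature comparison above concrete, the following is a self-contained sketch (local constant and helper, not the package's internals) of the byte pattern `IsBLSSignatureIdentity` checks against: under the Zcash format, the G1 point at infinity serializes to a first byte with the compression and infinity bits set (0xC0) followed by zero bytes, 48 bytes in total.

```go
package main

import (
	"bytes"
	"fmt"
)

const g1SerLen = 48 // compressed G1 serialization length (SignatureLenBLSBLS12381)

// identityG1Serialization is the expected encoding of the identity signature:
// compression bit | infinity bit = 0xC0, followed by zeros.
var identityG1Serialization = append([]byte{0xC0}, make([]byte, g1SerLen-1)...)

// isIdentitySig mirrors the byte comparison performed by IsBLSSignatureIdentity.
func isIdentitySig(s []byte) bool {
	return bytes.Equal(s, identityG1Serialization)
}

func main() {
	fmt.Println(isIdentitySig(identityG1Serialization)) // true
	fmt.Println(isIdentitySig(make([]byte, g1SerLen)))  // false: header byte is 0x00
}
```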
@@ -277,7 +245,7 @@ func (a *blsBLS12381Algo) generatePrivateKey(ikm []byte) (PrivateKey, error) { // L is the OKM length // L = ceil((3 * ceil(log2(r))) / 16) which makes L (security_bits/8)-larger than r size - okmLength := (3 * PrKeyLenBLSBLS12381) / 2 + okmLength := (3 * frBytesLen) / 2 // HKDF secret = IKM || I2OSP(0, 1) secret := make([]byte, len(ikm)+1) @@ -299,8 +267,9 @@ func (a *blsBLS12381Algo) generatePrivateKey(ikm []byte) (PrivateKey, error) { } defer overwrite(okm) // overwrite okm - // map the bytes to a private key : SK = OS2IP(OKM) mod r - isZero := mapToZr(&sk.scalar, okm) + // map the bytes to a private key using modular reduction + // SK = OS2IP(OKM) mod r + isZero := mapToFr(&sk.scalar, okm) if !isZero { return sk, nil } @@ -315,31 +284,27 @@ func (a *blsBLS12381Algo) generatePrivateKey(ikm []byte) (PrivateKey, error) { const invalidBLSSignatureHeader = byte(0xE0) // BLSInvalidSignature returns an invalid signature that fails when verified -// with any message and public key. +// with any message and public key, which can be used for testing. // // The signature bytes represent an invalid serialization of a point which // makes the verification fail early. The verification would return (false, nil). func BLSInvalidSignature() Signature { signature := make([]byte, SignatureLenBLSBLS12381) - signature[0] = invalidBLSSignatureHeader // invalid header as per C.ep_read_bin_compact + signature[0] = invalidBLSSignatureHeader // invalid header as per the Zcash serialization return signature } // decodePrivateKey decodes a slice of bytes into a private key. +// Decoding assumes a bytes big endian format. // It checks the scalar is non-zero and is less than the group order. func (a *blsBLS12381Algo) decodePrivateKey(privateKeyBytes []byte) (PrivateKey, error) { - if len(privateKeyBytes) != prKeyLengthBLSBLS12381 { - return nil, invalidInputsErrorf("input length must be %d, got %d", - prKeyLengthBLSBLS12381, len(privateKeyBytes)) - } sk := newPrKeyBLSBLS12381(nil) - readScalar(&sk.scalar, privateKeyBytes) - if C.check_membership_Zr_star((*C.bn_st)(&sk.scalar)) == valid { - return sk, nil + err := readScalarFrStar(&sk.scalar, privateKeyBytes) + if err != nil { + return nil, fmt.Errorf("failed to read the private key: %w", err) } - - return nil, invalidInputsErrorf("the private key is not a valid BLS12-381 curve key") + return sk, nil } // decodePublicKey decodes a slice of bytes into a public key. @@ -350,18 +315,18 @@ func (a *blsBLS12381Algo) decodePrivateKey(privateKeyBytes []byte) (PrivateKey, // a faster check during signature verifications. Any verification against an identity // public key outputs `false`. 
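For readers following the key-generation hunk above, the sketch below illustrates the same reduction idea in isolation: expand the IKM through HKDF into 48 bytes (1.5 times the 32-byte scalar size, so the modular bias is negligible), interpret the output as a big-endian integer, reduce it modulo the group order r, and retry in the unlikely zero case. The salt value and re-salting step are taken from the IETF BLS key-generation draft and are only illustrative; they are not claimed to match the exact salt/info schedule of `generatePrivateKey`.

```go
package main

import (
	"crypto/sha256"
	"fmt"
	"io"
	"math/big"

	"golang.org/x/crypto/hkdf"
)

// r is the prime order of G1 and G2 on BLS12-381.
var r, _ = new(big.Int).SetString(
	"73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)

// deriveScalar sketches SK = OS2IP(OKM) mod r, retrying when the reduction
// yields zero (illustrative salt schedule, see the note above).
func deriveScalar(ikm []byte) *big.Int {
	secret := append(append([]byte{}, ikm...), 0x00) // IKM || I2OSP(0, 1)
	salt := []byte("BLS-SIG-KEYGEN-SALT-")
	for {
		okm := make([]byte, 48) // 1.5 x the 32-byte scalar size keeps the bias negligible
		if _, err := io.ReadFull(hkdf.New(sha256.New, secret, salt, nil), okm); err != nil {
			panic(err)
		}
		sk := new(big.Int).Mod(new(big.Int).SetBytes(okm), r)
		if sk.Sign() != 0 {
			return sk
		}
		h := sha256.Sum256(salt) // re-salt and retry on the rare zero output
		salt = h[:]
	}
}

func main() {
	fmt.Printf("%064x\n", deriveScalar([]byte("example-ikm-with-at-least-32-bytes-of-entropy")))
}
```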
func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, error) { - if len(publicKeyBytes) != pubKeyLengthBLSBLS12381 { + if len(publicKeyBytes) != PubKeyLenBLSBLS12381 { return nil, invalidInputsErrorf("input length must be %d, got %d", - pubKeyLengthBLSBLS12381, len(publicKeyBytes)) + PubKeyLenBLSBLS12381, len(publicKeyBytes)) } var pk pubKeyBLSBLS12381 - err := readPointG2(&pk.point, publicKeyBytes) + err := readPointE2(&pk.point, publicKeyBytes) if err != nil { - return nil, fmt.Errorf("decode public key failed %w", err) + return nil, fmt.Errorf("decode public key failed: %w", err) } // membership check in G2 - if C.check_membership_G2((*C.ep2_st)(&pk.point)) != valid { + if !bool(C.E2_in_G2((*C.E2)(&pk.point))) { return nil, invalidInputsErrorf("input key is infinity or does not encode a BLS12-381 point in the valid group") } @@ -374,7 +339,7 @@ func (a *blsBLS12381Algo) decodePublicKey(publicKeyBytes []byte) (PublicKey, err // decodePublicKeyCompressed decodes a slice of bytes into a public key. // since we use the compressed representation by default, this checks the default and delegates to decodePublicKeyCompressed func (a *blsBLS12381Algo) decodePublicKeyCompressed(publicKeyBytes []byte) (PublicKey, error) { - if serializationG2 != compressed { + if !isG2Compressed() { panic("library is not configured to use compressed public key serialization") } return a.decodePublicKey(publicKeyBytes) @@ -388,20 +353,19 @@ type prKeyBLSBLS12381 struct { scalar scalar } +var _ PrivateKey = (*prKeyBLSBLS12381)(nil) + // newPrKeyBLSBLS12381 creates a new BLS private key with the given scalar. // If no scalar is provided, the function allocates an // empty scalar. func newPrKeyBLSBLS12381(x *scalar) *prKeyBLSBLS12381 { - var sk prKeyBLSBLS12381 - if x == nil { - // initialize the scalar - C.bn_new_wrapper((*C.bn_st)(&sk.scalar)) - } else { - // set the scalar - sk.scalar = *x + if x != nil { + return &prKeyBLSBLS12381{ + // the embedded public key is only computed when needed + scalar: *x, + } } - // the embedded public key is only computed when needed - return &sk + return &prKeyBLSBLS12381{} } // Algorithm returns the Signing Algorithm @@ -440,7 +404,7 @@ func (sk *prKeyBLSBLS12381) PublicKey() PublicKey { // Encode returns a byte encoding of the private key. // The encoding is a raw encoding in big endian padded to the group order func (a *prKeyBLSBLS12381) Encode() []byte { - dest := make([]byte, prKeyLengthBLSBLS12381) + dest := make([]byte, frBytesLen) writeScalar(dest, &a.scalar) return dest } @@ -451,12 +415,12 @@ func (sk *prKeyBLSBLS12381) Equals(other PrivateKey) bool { if !ok { return false } - return sk.scalar.equals(&otherBLS.scalar) + return (&sk.scalar).equals(&otherBLS.scalar) } // String returns the hex string representation of the key. func (sk *prKeyBLSBLS12381) String() string { - return fmt.Sprintf("%#x", sk.Encode()) + return sk.scalar.String() } // pubKeyBLSBLS12381 is the public key of BLS using BLS12_381, @@ -472,15 +436,17 @@ type pubKeyBLSBLS12381 struct { // sure the comparison is performed after an instance is created. // // public key G2 point - point pointG2 + point pointE2 // G2 identity check cache isIdentity bool } +var _ PublicKey = (*pubKeyBLSBLS12381)(nil) + // newPubKeyBLSBLS12381 creates a new BLS public key with the given point. // If no scalar is provided, the function allocates an // empty scalar. 
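The public-key import path shown above follows a fixed validation order. The sketch below restates that order with hypothetical placeholders (`e2Point`, `parseE2`, `inG2`) standing in for the cgo-backed `readPointE2` and `E2_in_G2` calls, since the real point type lives behind cgo: reject wrong lengths first, then require the bytes to decode to a point on the curve E2, and only accept the key once the G2 subgroup membership check passes.

```go
package main

import "fmt"

// e2Point, parseE2 and inG2 are hypothetical placeholders for the C-backed
// point type and the readPointE2 / E2_in_G2 calls used by decodePublicKey.
type e2Point struct{ compressed [96]byte }

func parseE2(b []byte) (*e2Point, error) { // on-curve deserialization only
	var p e2Point
	copy(p.compressed[:], b)
	return &p, nil
}

func inG2(p *e2Point) bool { return true } // placeholder for the subgroup check

// decodeBLSPublicKey sketches the validation order: length, curve, subgroup.
func decodeBLSPublicKey(b []byte) (*e2Point, error) {
	const pubKeyLen = 96 // PubKeyLenBLSBLS12381
	if len(b) != pubKeyLen {
		return nil, fmt.Errorf("input length must be %d, got %d", pubKeyLen, len(b))
	}
	p, err := parseE2(b)
	if err != nil {
		return nil, fmt.Errorf("decode public key failed: %w", err)
	}
	if !inG2(p) {
		return nil, fmt.Errorf("input key is infinity or not in the valid subgroup")
	}
	return p, nil
}

func main() {
	_, err := decodeBLSPublicKey(make([]byte, 96))
	fmt.Println(err)
}
```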
-func newPubKeyBLSBLS12381(p *pointG2) *pubKeyBLSBLS12381 { +func newPubKeyBLSBLS12381(p *pointE2) *pubKeyBLSBLS12381 { if p != nil { key := &pubKeyBLSBLS12381{ point: *p, @@ -507,17 +473,19 @@ func (pk *pubKeyBLSBLS12381) Size() int { // The encoding is a compressed encoding of the point // [zcash] https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format- func (a *pubKeyBLSBLS12381) EncodeCompressed() []byte { - if serializationG2 != compressed { + if !isG2Compressed() { panic("library is not configured to use compressed public key serialization") } return a.Encode() } -// Encode returns a byte encoding of the public key. -// Since we use a compressed encoding by default, this delegates to EncodeCompressed +// Encode returns a byte encoding of the public key (a G2 point). +// The current encoding is a compressed serialization of G2 following [zcash] https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format- +// +// The function should evolve in the future to support uncompressed compresion too. func (a *pubKeyBLSBLS12381) Encode() []byte { - dest := make([]byte, pubKeyLengthBLSBLS12381) - writePointG2(dest, &a.point) + dest := make([]byte, g2BytesLen) + writePointE2(dest, &a.point) return dest } @@ -532,46 +500,7 @@ func (pk *pubKeyBLSBLS12381) Equals(other PublicKey) bool { // String returns the hex string representation of the key. func (pk *pubKeyBLSBLS12381) String() string { - return fmt.Sprintf("%#x", pk.Encode()) -} - -// Get Macro definitions from the C layer as Cgo does not export macros -var signatureLengthBLSBLS12381 = int(C.get_signature_len()) -var pubKeyLengthBLSBLS12381 = int(C.get_pk_len()) -var prKeyLengthBLSBLS12381 = int(C.get_sk_len()) - -// init sets the context of BLS12-381 curve -func (a *blsBLS12381Algo) init() error { - // initializes relic context and sets the B12_381 parameters - if err := a.context.initContext(); err != nil { - return err - } - - // compare the Go and C layer constants as a sanity check - if signatureLengthBLSBLS12381 != SignatureLenBLSBLS12381 || - pubKeyLengthBLSBLS12381 != PubKeyLenBLSBLS12381 || - prKeyLengthBLSBLS12381 != PrKeyLenBLSBLS12381 { - return errors.New("BLS-12381 length settings in Go and C are not consistent, check hardcoded lengths and compressions") - } - return nil -} - -// set the context of BLS 12-381 curve in the lower C and Relic layers assuming the context -// was previously initialized with a call to init(). -// -// If the implementation evolves to support multiple contexts, -// reinit should be called at every blsBLS12381Algo operation. -func (a *blsBLS12381Algo) reInit() { - a.context.setContext() -} - -// This is only a TEST/DEBUG/BENCH function. -// It returns the hash to G1 point from a slice of 128 bytes -func mapToG1(data []byte) *pointG1 { - l := len(data) - var h pointG1 - C.map_to_G1((*C.ep_st)(&h), (*C.uchar)(&data[0]), (C.int)(l)) - return &h + return pk.point.String() } // This is only a TEST function. 
@@ -592,7 +521,7 @@ func (sk *prKeyBLSBLS12381) signWithXMDSHA256(data []byte) Signature { // sign the hash s := make([]byte, SignatureLenBLSBLS12381) C.bls_sign((*C.uchar)(&s[0]), - (*C.bn_st)(&sk.scalar), + (*C.Fr)(&sk.scalar), (*C.uchar)(&hash[0]), (C.int)(len(hash))) return s diff --git a/crypto/bls12381_hashtocurve.c b/crypto/bls12381_hashtocurve.c deleted file mode 100644 index 229f9c009de..00000000000 --- a/crypto/bls12381_hashtocurve.c +++ /dev/null @@ -1,338 +0,0 @@ -// +build relic - -#include "bls12381_utils.h" -#include "bls_include.h" - -extern prec_st* bls_prec; - -#if (hashToPoint== LOCAL_SSWU) - -// These constants are taken from https://github.com/kwantam/bls12-381_hash -// and converted to the Mongtomery domain. -// Copyright 2019 Riad S. Wahby -const uint64_t iso_Nx_data[ELLP_Nx_LEN][Fp_DIGITS] = { - {0x4d18b6f3af00131c, 0x19fa219793fee28c, 0x3f2885f1467f19ae, - 0x23dcea34f2ffb304, 0xd15b58d2ffc00054, 0x0913be200a20bef4,}, - {0x898985385cdbbd8b, 0x3c79e43cc7d966aa, 0x1597e193f4cd233a, - 0x8637ef1e4d6623ad, 0x11b22deed20d827b, 0x07097bc5998784ad,}, - {0xa542583a480b664b, 0xfc7169c026e568c6, 0x5ba2ef314ed8b5a6, - 0x5b5491c05102f0e7, 0xdf6e99707d2a0079, 0x0784151ed7605524,}, - {0x494e212870f72741, 0xab9be52fbda43021, 0x26f5577994e34c3d, - 0x049dfee82aefbd60, 0x65dadd7828505289, 0x0e93d431ea011aeb,}, - {0x90ee774bd6a74d45, 0x7ada1c8a41bfb185, 0x0f1a8953b325f464, - 0x104c24211be4805c, 0x169139d319ea7a8f, 0x09f20ead8e532bf6,}, - {0x6ddd93e2f43626b7, 0xa5482c9aa1ccd7bd, 0x143245631883f4bd, - 0x2e0a94ccf77ec0db, 0xb0282d480e56489f, 0x18f4bfcbb4368929,}, - {0x23c5f0c953402dfd, 0x7a43ff6958ce4fe9, 0x2c390d3d2da5df63, - 0xd0df5c98e1f9d70f, 0xffd89869a572b297, 0x1277ffc72f25e8fe,}, - {0x79f4f0490f06a8a6, 0x85f894a88030fd81, 0x12da3054b18b6410, - 0xe2a57f6505880d65, 0xbba074f260e400f1, 0x08b76279f621d028,}, - {0xe67245ba78d5b00b, 0x8456ba9a1f186475, 0x7888bff6e6b33bb4, - 0xe21585b9a30f86cb, 0x05a69cdcef55feee, 0x09e699dd9adfa5ac,}, - {0x0de5c357bff57107, 0x0a0db4ae6b1a10b2, 0xe256bb67b3b3cd8d, - 0x8ad456574e9db24f, 0x0443915f50fd4179, 0x098c4bf7de8b6375,}, - {0xe6b0617e7dd929c7, 0xfe6e37d442537375, 0x1dafdeda137a489e, - 0xe4efd1ad3f767ceb, 0x4a51d8667f0fe1cf, 0x054fdf4bbf1d821c,}, - {0x72db2a50658d767b, 0x8abf91faa257b3d5, 0xe969d6833764ab47, - 0x464170142a1009eb, 0xb14f01aadb30be2f, 0x18ae6a856f40715d,}, -}; - -const uint64_t iso_Ny_data[ELLP_Ny_LEN][Fp_DIGITS] = { - {0x2b567ff3e2837267, 0x1d4d9e57b958a767, 0xce028fea04bd7373, - 0xcc31a30a0b6cd3df, 0x7d7b18a682692693, 0x0d300744d42a0310,}, - {0x99c2555fa542493f, 0xfe7f53cc4874f878, 0x5df0608b8f97608a, - 0x14e03832052b49c8, 0x706326a6957dd5a4, 0x0a8dadd9c2414555,}, - {0x13d942922a5cf63a, 0x357e33e36e261e7d, 0xcf05a27c8456088d, - 0x0000bd1de7ba50f0, 0x83d0c7532f8c1fde, 0x13f70bf38bbf2905,}, - {0x5c57fd95bfafbdbb, 0x28a359a65e541707, 0x3983ceb4f6360b6d, - 0xafe19ff6f97e6d53, 0xb3468f4550192bf7, 0x0bb6cde49d8ba257,}, - {0x590b62c7ff8a513f, 0x314b4ce372cacefd, 0x6bef32ce94b8a800, - 0x6ddf84a095713d5f, 0x64eace4cb0982191, 0x0386213c651b888d,}, - {0xa5310a31111bbcdd, 0xa14ac0f5da148982, 0xf9ad9cc95423d2e9, - 0xaa6ec095283ee4a7, 0xcf5b1f022e1c9107, 0x01fddf5aed881793,}, - {0x65a572b0d7a7d950, 0xe25c2d8183473a19, 0xc2fcebe7cb877dbd, - 0x05b2d36c769a89b0, 0xba12961be86e9efb, 0x07eb1b29c1dfde1f,}, - {0x93e09572f7c4cd24, 0x364e929076795091, 0x8569467e68af51b5, - 0xa47da89439f5340f, 0xf4fa918082e44d64, 0x0ad52ba3e6695a79,}, - {0x911429844e0d5f54, 0xd03f51a3516bb233, 0x3d587e5640536e66, - 0xfa86d2a3a9a73482, 0xa90ed5adf1ed5537, 
0x149c9c326a5e7393,}, - {0x462bbeb03c12921a, 0xdc9af5fa0a274a17, 0x9a558ebde836ebed, - 0x649ef8f11a4fae46, 0x8100e1652b3cdc62, 0x1862bd62c291dacb,}, - {0x05c9b8ca89f12c26, 0x0194160fa9b9ac4f, 0x6a643d5a6879fa2c, - 0x14665bdd8846e19d, 0xbb1d0d53af3ff6bf, 0x12c7e1c3b28962e5,}, - {0xb55ebf900b8a3e17, 0xfedc77ec1a9201c4, 0x1f07db10ea1a4df4, - 0x0dfbd15dc41a594d, 0x389547f2334a5391, 0x02419f98165871a4,}, - {0xb416af000745fc20, 0x8e563e9d1ea6d0f5, 0x7c763e17763a0652, - 0x01458ef0159ebbef, 0x8346fe421f96bb13, 0x0d2d7b829ce324d2,}, - {0x93096bb538d64615, 0x6f2a2619951d823a, 0x8f66b3ea59514fa4, - 0xf563e63704f7092f, 0x724b136c4cf2d9fa, 0x046959cfcfd0bf49,}, - {0xea748d4b6e405346, 0x91e9079c2c02d58f, 0x41064965946d9b59, - 0xa06731f1d2bbe1ee, 0x07f897e267a33f1b, 0x1017290919210e5f,}, - {0x872aa6c17d985097, 0xeecc53161264562a, 0x07afe37afff55002, - 0x54759078e5be6838, 0xc4b92d15db8acca8, 0x106d87d1b51d13b9,}, -}; - -// sqrt_ration optimized for p mod 4 = 3. -// Check if (U/V) is a square, return 1 if yes, 0 otherwise -// If 1 is returned, out contains sqrt(U/V), -// otherwise out is sqrt(z*U/V) -// out should not be the same as U, or V -static int sqrt_ratio_3mod4(fp_t out, const fp_t u, const fp_t v) { - fp_t t0, t1, t2; - - fp_sqr(t1, v); // V^2 - fp_mul(t2, u, v); // U*V - fp_mul(t1, t1, t2); // U*V^3 - fp_exp(out, t1, &bls_prec->p_3div4); // (U*V^3)^((p-3)/4) - fp_mul(out, out, t2); // (U*V)*(U*V^3)^((p-3)/4) = U^((p+1)/4) * V^(3p-5)/4 - - fp_sqr(t0, out); // out^2 - fp_mul(t0, t0, v); // out^2 * V - - int res = 1; - if (fp_cmp(t0, u) != RLC_EQ) { // check whether U/V is a quadratic residue - fp_mul(out, out, bls_prec->sqrt_z); // sqrt(-z)*U*V(UV^3)^((p-3)/4) - res = 0; - } - - return res; -} - -// returns 1 if input is odd and 0 if input is even -static int sign_0(const fp_t in) { -#if FP_RDC == MONTY - bn_t tmp; - fp_prime_back(tmp, in); // TODO: entire reduction may not be needed to get the parity - return bn_is_even(tmp); -#endif - return in[0]&1; -} - -// Maps the field element t to a point p in E1(Fp) where E1: y^2 = g(x) = x^3 + a1*x + b1 -// using optimized non-constant-time Simplified SWU implementation (A.B = 0) -// Outout point p is in Jacobian coordinates to avoid extra inversions. 
-static inline void map_to_E1_osswu(ep_t p, const fp_t t) { - fp_t t0, t1, t2, t3, t4; - - // get the isogeny map coefficients - ctx_t* ctx = core_get(); - fp_t *a1 = &ctx->ep_iso.a; - fp_t *b1 = &ctx->ep_iso.b; - fp_t *z = &ctx->ep_map_u; - - // compute numerator and denominator of X0(t) = N / D - fp_sqr(t1, t); // t^2 - fp_mul(t1, t1, *z); // z * t^2 - fp_sqr(t2, t1); // z^2 * t^4 - fp_add(t2, t2, t1); // z * t^2 + z^2 * t^4 - fp_add(t3, t2, bls_prec->r); // z * t^2 + z^2 * t^4 + 1 - fp_mul(t3, t3, *b1); // N = b * (z * t^2 + z^2 * t^4 + 1) - - if (fp_is_zero(t2)) { - fp_copy(p->z, bls_prec->a1z); // D = a * z - } else { - fp_mul(p->z, t2, bls_prec->minus_a1); // D = - a * (z * t^2 + z^2 * t^4) - } - - // compute numerator and denominator of g(X0(t)) = U / V - // U = N^3 + a1 * N * D^2 + b1 * D^3 - // V = D^3 - fp_sqr(t2, t3); // N^2 - fp_sqr(t0, p->z); // D^2 - fp_mul(t4, *a1, t0); // a * D^2 - fp_add(t2, t4, t2); // N^2 + a * D^2 - fp_mul(t2, t3, t2); // N^3 + a * N * D^2 - fp_mul(t0, t0, p->z); // V = D^3 - fp_mul(t4, *b1, t0); // b * V = b * D^3 - fp_add(t2, t4, t2); // U = N^3 + a1 * N * D^2 + b1 * D^3 - - // compute sqrt(U/V) - int is_sqr = sqrt_ratio_3mod4(p->y, t2, t0); - if (is_sqr) { - fp_copy(p->x, t3); // x = N - } else { - fp_mul(p->x, t1, t3); // x = N * z * t^2 - fp_mul(t1, t1, t); // z * t^3 - fp_mul(p->y, p->y, t1); // y = z * t^3 * sqrt(r * U/V) where r is 1 or map coefficient z - } - - // negate y to be the same sign of t - if (sign_0(t) != sign_0(p->y)) { - fp_neg(p->y, p->y); // -y - } - - // convert (x/D, y) into Jacobian (X,Y,Z) where Z=D to avoid inversion. - // Z = D, X = x/D * D^2 = x*D , Y = y*D^3 - fp_mul(p->x, p->x, p->z); // X = N*D - fp_mul(p->y, p->y, t0); // Y = y*D^3 - // p->z is already equal to D - p->coord = JACOB; -} - -// This code is taken from https://github.com/kwantam/bls12-381_hash -// and adapted to use Relic modular arithemtic. -// Copyright 2019 Riad S. Wahby -static inline void hornerPolynomial(fp_t accumulator, const fp_t x, const int start_val, const fp_t fp_tmp[]) { - for (int i = start_val; i >= 0; --i) { - fp_mul(accumulator, accumulator, x); // acc *= x - fp_add(accumulator, accumulator, fp_tmp[i]); // acc += next_val - } -} - -// This code is taken from https://github.com/kwantam/bls12-381_hash -// and adapted to use Relic modular arithemtic. -// Copyright 2019 Riad S. Wahby -static inline void compute_map_zvals(fp_t out[], const fp_t inv[], const fp_t zv[], const unsigned len) { - for (unsigned i = 0; i < len; ++i) { - fp_mul(out[i], inv[i], zv[i]); - } -} - -// 11-isogeny map -// computes the mapping of p and stores the result in r -// -// This code is taken from https://github.com/kwantam/bls12-381_hash -// and adapted to use Relic modular arithemtic. The constant tables -// iso_D and iso_N were converted to the Montgomery domain. -// -// Copyright 2019 Riad S. Wahby -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-static inline void eval_iso11(ep_t r, const ep_t p) { - fp_t fp_tmp[32]; - - // precompute even powers of Z up to Z^30 in fp_tmp[31]..fp_tmp[17] - fp_sqr(fp_tmp[31], p->z); // Z^2 - fp_sqr(fp_tmp[30], fp_tmp[31]); // Z^4 - fp_mul(fp_tmp[29], fp_tmp[30], fp_tmp[31]); // Z^6 - fp_sqr(fp_tmp[28], fp_tmp[30]); // Z^8 - fp_mul(fp_tmp[27], fp_tmp[28], fp_tmp[31]); // Z^10 - fp_sqr(fp_tmp[26], fp_tmp[29]); // Z^12 - fp_mul(fp_tmp[25], fp_tmp[26], fp_tmp[31]); // Z^14 - fp_sqr(fp_tmp[24], fp_tmp[28]); // Z^16 - fp_mul(fp_tmp[23], fp_tmp[24], fp_tmp[31]); // Z^18 - fp_sqr(fp_tmp[22], fp_tmp[27]); // Z^20 - fp_mul(fp_tmp[21], fp_tmp[22], fp_tmp[31]); // Z^22 - fp_sqr(fp_tmp[20], fp_tmp[26]); // Z^24 - fp_mul(fp_tmp[19], fp_tmp[20], fp_tmp[31]); // Z^26 - fp_sqr(fp_tmp[18], fp_tmp[25]); // Z^28 - fp_mul(fp_tmp[17], fp_tmp[18], fp_tmp[31]); // Z^30 - - // get isogeny map coefficients - iso_t iso = ep_curve_get_iso(); - // hardcode the constant to avoid warnings of gcc -Wstringop-overread - const int deg_dy = 15; // also equal to iso->deg_yd; - const int deg_dx = 10; // also equal to iso->deg_xd; - // TODO: get N coefficient from Relic and update N computations - - // y = Ny/Dy - // compute Dy - compute_map_zvals(fp_tmp, iso->yd, fp_tmp + 17, deg_dy); // k_(15-i) Z^(2i) - fp_add(fp_tmp[16], p->x, fp_tmp[deg_dy - 1]); // X + k_14 Z^2 - hornerPolynomial(fp_tmp[16], p->x, deg_dy - 2, fp_tmp); // Horner for the rest - fp_mul(fp_tmp[15], fp_tmp[16], fp_tmp[31]); // Dy * Z^2 - fp_mul(fp_tmp[15], fp_tmp[15], p->z); // Dy * Z^3 - - // compute Ny - compute_map_zvals(fp_tmp, bls_prec->iso_Ny, fp_tmp + 17, ELLP_Ny_LEN - 1); // k_(15-i) Z^(2i) - fp_mul(fp_tmp[16], p->x, bls_prec->iso_Ny[ELLP_Ny_LEN - 1]); // k_15 * X - fp_add(fp_tmp[16], fp_tmp[16], fp_tmp[ELLP_Ny_LEN - 2]); // k_15 * X + k_14 Z^2 - hornerPolynomial(fp_tmp[16], p->x, ELLP_Ny_LEN - 3, fp_tmp); // Horner for the rest - fp_mul(fp_tmp[16], fp_tmp[16], p->y); // Ny * Y - - // x = Nx/Dx - // compute Dx - compute_map_zvals(fp_tmp, iso->xd, fp_tmp + 22, deg_dx); // k_(10-i) Z^(2i) - fp_add(fp_tmp[14], p->x, fp_tmp[deg_dx - 1]); // X + k_9 Z^2 - hornerPolynomial(fp_tmp[14], p->x, deg_dx - 2, fp_tmp); // Horner for the rest - fp_mul(fp_tmp[14], fp_tmp[14], fp_tmp[31]); // Dx * Z^2 - - // compute Nx - compute_map_zvals(fp_tmp, bls_prec->iso_Nx, fp_tmp + 21, ELLP_Nx_LEN - 1); // k_(11-i) Z^(2i) - fp_mul(fp_tmp[13], p->x, bls_prec->iso_Nx[ELLP_Nx_LEN - 1]); // k_11 * X - fp_add(fp_tmp[13], fp_tmp[13], fp_tmp[ELLP_Nx_LEN - 2]); // k_11 * X + k_10 * Z^2 - hornerPolynomial(fp_tmp[13], p->x, ELLP_Nx_LEN - 3, fp_tmp); // Dy: Horner for the rest - - // compute the resulting point (Xo,Yo,Zo) - fp_mul(r->z, fp_tmp[14], fp_tmp[15]); // Zo = Dx Dy - fp_mul(r->x, fp_tmp[13], fp_tmp[15]); // Nx Dy - fp_mul(r->x, r->x, r->z); // Xo = Nx Dy Z - fp_sqr(fp_tmp[12], r->z); // Zo^2 - fp_mul(r->y, fp_tmp[16], fp_tmp[14]); // Ny Dx - fp_mul(r->y, r->y, fp_tmp[12]); // Yo = Ny Dx Zo^2 - r->coord = JACOB; -} - -// map an input point in E to a point in G1 by clearing the cofactor of G1 -static void clear_cofactor(ep_t out, const ep_t in) { - bn_t z; - bn_new(z); - fp_prime_get_par(z); - // compute 1-z - bn_neg(z, z); - bn_add_dig(z, z, 1); - ep_mul_dig(out, in, z->dp[0]); // z fits in 64 bits - bn_free(z); -} - -// construction 2 section 5 in in https://eprint.iacr.org/2019/403.pdf -// evaluate the optimized SSWU map twice, add resulting points, apply isogeny map, clear cofactor -// the result is stored in p -// msg is the input message to hash, must be at least 2*(FP_BYTES+16) = 
128 bytes -static void map_to_G1_local(ep_t p, const uint8_t *msg, int len) { - RLC_TRY { - if (len < 2*(Fp_BYTES+16)) { - RLC_THROW(ERR_NO_BUFFER); - } - - fp_t t1, t2; - bn_t tmp; - bn_new(tmp); - bn_read_bin(tmp, msg, len/2); - fp_prime_conv(t1, tmp); - bn_read_bin(tmp, msg + len/2, len - len/2); - fp_prime_conv(t2, tmp); - bn_free(tmp); - - ep_t p_temp; - ep_new(p_temp); - // first mapping - map_to_E1_osswu(p_temp, t1); // map to E1 - eval_iso11(p_temp, p_temp); // map to E - - // second mapping - map_to_E1_osswu(p, t2); // map to E1 - eval_iso11(p, p); // map to E - // sum - // TODO: implement point addition in E1 and apply the isogeny map only once. - // Gives 4% improvement for map-to-curve overall - ep_add_jacob(p, p, p_temp); - - // clear the cofactor - clear_cofactor(p, p); // map to G1 - ep_free(p_temp); - } - RLC_CATCH_ANY { - RLC_THROW(ERR_CAUGHT); - } -} -#endif - -// computes a hash of input data to G1 -// construction 2 from section 5 in https://eprint.iacr.org/2019/403.pdf -void map_to_G1(ep_t h, const byte* data, const int len) { - #if hashToPoint==LOCAL_SSWU - map_to_G1_local(h, data, len); - #elif hashToPoint==RELIC_SSWU - ep_map_from_field(h, data, len); - #endif -} diff --git a/crypto/bls12381_utils.c b/crypto/bls12381_utils.c index 19a1b730b5e..fc29046e47f 100644 --- a/crypto/bls12381_utils.c +++ b/crypto/bls12381_utils.c @@ -1,852 +1,1174 @@ -// +build relic - // this file contains utility functions for the curve BLS 12-381 -// these tools are shared by the BLS signature scheme, the BLS based threshold signature -// and the BLS distributed key generation protocols +// these tools are shared by the BLS signature scheme, the BLS based threshold +// signature, BLS-SPoCK and the BLS distributed key generation protocols #include "bls12381_utils.h" -#include "bls_include.h" #include "assert.h" +#include "bls_include.h" -// The functions are tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) +// compile all blst C src along with this file +#include "blst_src.c" + +// make sure flow crypto types are consistent with BLST types +void types_sanity(void) { + assert(sizeof(Fr) == sizeof(vec256)); + assert(sizeof(Fp) == sizeof(vec384)); + assert(sizeof(Fp2) == sizeof(vec384x)); + assert(sizeof(E1) == sizeof(POINTonE1)); + assert(sizeof(E2) == sizeof(POINTonE2)); + assert(sizeof(Fp12) == sizeof(vec384fp12)); +} -// return macro values to the upper Go Layer -int get_valid() { - return VALID; +// ------------------- Fr utilities + +// Montgomery constant R related to the curve order r +// R = (1<<256) mod r +const Fr BLS12_381_rR = {{ + TO_LIMB_T(0x1824b159acc5056f), + TO_LIMB_T(0x998c4fefecbc4ff5), + TO_LIMB_T(0x5884b7fa00034802), + TO_LIMB_T(0x00000001fffffffe), +}}; + +// returns true if a is zero and false otherwise +bool Fr_is_zero(const Fr *a) { return vec_is_zero(a, sizeof(Fr)); } + +// returns true if a == b and false otherwise +bool Fr_is_equal(const Fr *a, const Fr *b) { + return vec_is_equal(a, b, sizeof(Fr)); } -int get_invalid() { - return INVALID; +// sets `a` to limb `l` +void Fr_set_limb(Fr *a, const limb_t l) { + vec_zero((byte *)a + sizeof(limb_t), sizeof(Fr) - sizeof(limb_t)); + *((limb_t *)a) = l; } -void bn_new_wrapper(bn_t a) { - bn_new(a); +void Fr_copy(Fr *res, const Fr *a) { + if ((uptr_t)a == (uptr_t)res) { + return; + } + vec_copy((byte *)res, (byte *)a, sizeof(Fr)); } -// global variable of the pre-computed data -prec_st bls_prec_st; -prec_st* bls_prec = NULL; +// sets `a` to 0 +void Fr_set_zero(Fr *a) { vec_zero((byte *)a, sizeof(Fr)); } -// required 
constants for the optimized SWU hash to curve -#if (hashToPoint == LOCAL_SSWU) -extern const uint64_t iso_Nx_data[ELLP_Nx_LEN][Fp_DIGITS]; -extern const uint64_t iso_Ny_data[ELLP_Ny_LEN][Fp_DIGITS]; -#endif +void Fr_add(Fr *res, const Fr *a, const Fr *b) { + add_mod_256((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_r); +} -#if (MEMBERSHIP_CHECK_G1 == BOWE) -extern const uint64_t beta_data[Fp_DIGITS]; -extern const uint64_t z2_1_by3_data[2]; -#endif +void Fr_sub(Fr *res, const Fr *a, const Fr *b) { + sub_mod_256((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_r); +} -// sets the global variable to input -void precomputed_data_set(const prec_st* p) { - bls_prec = (prec_st*)p; -} - -// Reads a prime field element from a digit vector in big endian format. -// There is no conversion to Montgomery domain in this function. - #define fp_read_raw(a, data_pointer) dv_copy((a), (data_pointer), Fp_DIGITS) - -// pre-compute some data required for curve BLS12-381 -prec_st* init_precomputed_data_BLS12_381() { - bls_prec = &bls_prec_st; - ctx_t* ctx = core_get(); - - // (p-1)/2 - bn_div_dig(&bls_prec->p_1div2, &ctx->prime, 2); - #if (hashToPoint == LOCAL_SSWU) - // (p-3)/4 - bn_div_dig(&bls_prec->p_3div4, &bls_prec->p_1div2, 2); - // sqrt(-z) - fp_neg(bls_prec->sqrt_z, ctx->ep_map_u); - fp_srt(bls_prec->sqrt_z, bls_prec->sqrt_z); - // -a1 and a1*z - fp_neg(bls_prec->minus_a1, ctx->ep_iso.a); - fp_mul(bls_prec->a1z, ctx->ep_iso.a, ctx->ep_map_u); - - for (int i=0; iiso_Nx[i], iso_Nx_data[i]); - for (int i=0; iiso_Ny[i], iso_Ny_data[i]); - #endif - - #if (MEMBERSHIP_CHECK_G1 == BOWE) - bn_new(&bls_prec->beta); - bn_read_raw(&bls_prec->beta, beta_data, Fp_DIGITS); - bn_new(&bls_prec->z2_1_by3); - bn_read_raw(&bls_prec->z2_1_by3, z2_1_by3_data, 2); - #endif - - // Montgomery constant R - fp_set_dig(bls_prec->r, 1); - return bls_prec; -} - -// Initializes Relic context with BLS12-381 parameters -ctx_t* relic_init_BLS12_381() { - // check Relic was compiled with the right conf - assert(ALLOC == AUTO); - - // sanity check of Relic constants the package is relying on - assert(RLC_OK == RLC_EQ); - - // initialize relic core with a new context - ctx_t* bls_ctx = (ctx_t*) calloc(1, sizeof(ctx_t)); - if (!bls_ctx) return NULL; - core_set(bls_ctx); - if (core_init() != RLC_OK) return NULL; - - // init BLS curve - int ret = RLC_OK; - #if (FP_PRIME == 381) - ret = ep_param_set_any_pairf(); // sets B12_P381 if FP_PRIME = 381 in relic config - #else - ep_param_set(B12_P381); - ep2_curve_set_twist(EP_MTYPE); // Multiplicative twist - #endif - - if (ret != RLC_OK) return NULL; - return core_get(); -} - -// seeds relic PRG -void seed_relic(byte* seed, int len) { - #if RAND == HASHD - // instantiate a new DRBG - ctx_t *ctx = core_get(); - ctx->seeded = 0; - #endif - rand_seed(seed, len); -} - -// Exponentiation of a generic point p in G1 -void ep_mult(ep_t res, const ep_t p, const bn_t expo) { - // Using window NAF of size 2 - ep_mul_lwnaf(res, p, expo); -} - -// Exponentiation of generator g1 in G1 -// These two function are here for bench purposes only -void ep_mult_gen_bench(ep_t res, const bn_t expo) { - // Using precomputed table of size 4 - ep_mul_gen(res, (bn_st *)expo); -} - -void ep_mult_generic_bench(ep_t res, const bn_t expo) { - // generic point multiplication - ep_mult(res, &core_get()->ep_g, expo); -} - -// Exponentiation of a generic point p in G2 -void ep2_mult(ep2_t res, ep2_t p, bn_t expo) { - // Using window NAF of size 2 - ep2_mul_lwnaf(res, p, expo); -} - -// Exponentiation of fixed g2 in G2 -void 
ep2_mult_gen(ep2_t res, const bn_t expo) { - // Using precomputed table of size 4 - g2_mul_gen(res, (bn_st*)expo); -} - -// DEBUG printing functions -void bytes_print_(char* s, byte* data, int len) { - printf("[%s]:\n", s); - for (int i=0; iep_r); -} - -// Reads a scalar from an array and maps it to Zr. -// The resulting scalar `a` satisfies 0 <= a < r. -// `len` must be less than BITS_TO_BYTES(RLC_BN_BITS). -// It returns VALID if scalar is zero and INVALID otherwise -int bn_map_to_Zr(bn_t a, const uint8_t* bin, int len) { - bn_t tmp; - bn_new(tmp); - bn_new_size(tmp, BYTES_TO_DIGITS(len)); - bn_read_bin(tmp, bin, len); - bn_mod(a, tmp, &core_get()->ep_r); - bn_rand(tmp, RLC_POS, len << 3); // overwrite tmp - bn_free(tmp); - if (bn_cmp_dig(a, 0) == RLC_EQ) { - return VALID; - } - return INVALID; +// res = a*R^(-1) +void Fr_from_montg(Fr *res, const Fr *a) { + from_mont_256((limb_t *)res, (limb_t *)a, BLS12_381_r, r0); } -// Reads a scalar from an array and maps it to Zr*. -// The resulting scalar `a` satisfies 0 < a < r. -// `len` must be less than BITS_TO_BYTES(RLC_BN_BITS) -void bn_map_to_Zr_star(bn_t a, const uint8_t* bin, int len) { - bn_t tmp; - bn_new(tmp); - bn_new_size(tmp, BYTES_TO_DIGITS(len)); - bn_read_bin(tmp, bin, len); - bn_t r_1; - bn_new(r_1); - bn_sub_dig(r_1, &core_get()->ep_r, 1); - bn_mod_basic(a,tmp,r_1); - bn_add_dig(a,a,1); - bn_rand(tmp, RLC_POS, len << 3); // overwrite tmp - bn_free(tmp); - bn_free(r_1); -} - -// returns the sign of y. -// 1 if y > (p - 1)/2 and 0 otherwise. -static int fp_get_sign(const fp_t y) { - bn_t bn_y; - bn_new(bn_y); - fp_prime_back(bn_y, y); - return bn_cmp(bn_y, &bls_prec->p_1div2) == RLC_GT; -} - -// ep_write_bin_compact exports a point a in E(Fp) to a buffer bin in a compressed or uncompressed form. -// len is the allocated size of the buffer bin. -// The serialization is following: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -// The code is a modified version of Relic ep_write_bin -void ep_write_bin_compact(byte *bin, const ep_t a, const int len) { - const int G1_size = (G1_BYTES/(G1_SERIALIZATION+1)); - - if (len!=G1_size) { - RLC_THROW(ERR_NO_BUFFER); - return; - } - - if (ep_is_infty(a)) { - // set the infinity bit - bin[0] = (G1_SERIALIZATION << 7) | 0x40; - memset(bin+1, 0, G1_size-1); - return; - } +// res = a^(-1)*R +void Fr_inv_montg_eucl(Fr *res, const Fr *a) { + // copied and modified from BLST code + // Copyright Supranational LLC + static const vec256 rx2 = { + /* left-aligned value of the modulus */ + TO_LIMB_T(0xfffffffe00000002), + TO_LIMB_T(0xa77b4805fffcb7fd), + TO_LIMB_T(0x6673b0101343b00a), + TO_LIMB_T(0xe7db4ea6533afa90), + }; + vec512 temp; + ct_inverse_mod_256(temp, (limb_t *)a, BLS12_381_r, rx2); + redc_mont_256((limb_t *)res, temp, BLS12_381_r, r0); +} - RLC_TRY { - ep_t t; - ep_null(t); - ep_new(t); - ep_norm(t, a); - fp_write_bin(bin, Fp_BYTES, t->x); - - if (G1_SERIALIZATION == COMPRESSED) { - bin[0] |= (fp_get_sign(t->y) << 5); - } else { - fp_write_bin(bin + Fp_BYTES, Fp_BYTES, t->y); - } - ep_free(t); - } RLC_CATCH_ANY { - RLC_THROW(ERR_CAUGHT); - } +// computes the sum of the array elements and writes the sum in jointx +void Fr_sum_vector(Fr *jointx, const Fr x[], const int x_len) { + Fr_set_zero(jointx); + for (int i = 0; i < x_len; i++) { + Fr_add(jointx, jointx, &x[i]); + } +} - bin[0] |= (G1_SERIALIZATION << 7); - } - -// fp_read_bin_safe is a modified version of Relic's (void fp_read_bin). 
-// It reads a field element from a buffer and makes sure the big number read can be -// written as a field element (is reduced modulo p). -// Unlike Relic's versions, the function does not reduce the read integer modulo p and does -// not throw an exception for an integer larger than p. The function returns RLC_OK if the input -// corresponds to a field element, and returns RLC_ERR otherwise. -static int fp_read_bin_safe(fp_t a, const uint8_t *bin, int len) { - if (len != Fp_BYTES) { - return RLC_ERR; +// internal type of BLST `pow256` uses bytes little endian. +// input is bytes big endian as used by Flow crypto lib external scalars. +static void pow256_from_be_bytes(pow256 ret, const byte a[Fr_BYTES]) { + byte *b = (byte *)a + Fr_BYTES - 1; + if ((uptr_t)ret == (uptr_t)a) { // swap in place + for (int i = 0; i < Fr_BYTES / 2; i++) { + byte tmp = *ret; + *(ret++) = *b; + *(b--) = tmp; } + } else { + for (int i = 0; i < Fr_BYTES; i++) { + *(ret++) = *(b--); + } + } +} - int ret = RLC_ERR; - bn_t t; - bn_new(t); - bn_read_bin(t, bin, Fp_BYTES); +// internal type of BLST `pow256` uses bytes little endian. +static void pow256_from_Fr(pow256 ret, const Fr *in) { + le_bytes_from_limbs(ret, (limb_t *)in, Fr_BYTES); +} - // make sure read bn is reduced modulo p - // first check is sanity check, since current implementation of `bn_read_bin` insures - // output bn is positive - if (bn_sign(t) == RLC_NEG || bn_cmp(t, &core_get()->prime) != RLC_LT) { - goto out; - } +// reads a scalar in `a` and checks it is a valid Fr element (a < r). +// input is bytes-big-endian. +// returns: +// - BAD_ENCODING if the length is invalid +// - BAD_VALUE if the scalar isn't in Fr +// - VALID if the scalar is valid +ERROR Fr_read_bytes(Fr *a, const byte *in, int in_len) { + if (in_len != Fr_BYTES) { + return BAD_ENCODING; + } + // compare to r using BLST internal function + pow256 tmp; + pow256_from_be_bytes(tmp, in); + // (check_mod_256 compares pow256 against a vec256!) + if (!check_mod_256(tmp, BLS12_381_r)) { + return BAD_VALUE; + } + vec_zero(tmp, sizeof(tmp)); + limbs_from_be_bytes((limb_t *)a, in, Fr_BYTES); + return VALID; +} - if (bn_is_zero(t)) { - fp_zero(a); - } else { - if (t->used == 1) { - fp_prime_conv_dig(a, t->dp[0]); - } else { - fp_prime_conv(a, t); - } - } - ret = RLC_OK; -out: - bn_free(t); +// reads a scalar in `a` and checks it is a valid Fr_star element (0 < a < r). +// input bytes are big endian. +// returns: +// - BAD_ENCODING if the length is invalid +// - BAD_VALUE if the scalar isn't in Fr_star +// - VALID if the scalar is valid +ERROR Fr_star_read_bytes(Fr *a, const byte *in, int in_len) { + int ret = Fr_read_bytes(a, in, in_len); + if (ret != VALID) { return ret; + } + // check if a=0 + if (Fr_is_zero(a)) { + return BAD_VALUE; + } + return VALID; } -// ep_read_bin_compact imports a point from a buffer in a compressed or uncompressed form. -// len is the size of the input buffer. +// write Fr element `a` in big endian bytes. +void Fr_write_bytes(byte *out, const Fr *a) { + // be_bytes_from_limbs works for both limb endianness types + be_bytes_from_limbs(out, (limb_t *)a, Fr_BYTES); +} + +// maps big-endian bytes of any size into an Fr element using modular reduction. +// Input is byte-big-endian, output is Fr (internally vec256). // -// The resulting point is guaranteed to be on the curve E1. 
-// The serialization follows: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -// The code is a modified version of Relic ep_read_bin +// Note: could use redc_mont_256(vec256 ret, const vec512 a, const vec256 p, +// limb_t n0) to reduce 512 bits at a time. +static void Fr_from_be_bytes(Fr *out, const byte *in, const int in_len) { + // input can be written in base 2^|R|, with R the Montgomery constant + // N = l_1 + L_2*2^|R| .. + L_n*2^(|R|*(n-1)) + // Therefore N mod p can be expressed using R as: + // N mod p = l_1 + L_2*R .. + L_n*R^(n-1) + Fr digit, radix; + Fr_set_zero(out); + Fr_copy(&radix, (Fr *)BLS12_381_rRR); // R^2 + + int n = in_len; + byte *p = (byte *)in + in_len; + while (n > Fr_BYTES) { + // limbs_from_be_bytes works for both limb endiannesses + limbs_from_be_bytes((limb_t *)&digit, p -= Fr_BYTES, Fr_BYTES); // l_i + Fr_mul_montg(&digit, &digit, + &radix); // l_i * R^i (i is the loop number starting at 1) + Fr_add(out, out, &digit); + Fr_mul_montg(&radix, &radix, (Fr *)BLS12_381_rRR); // R^(i+1) + n -= Fr_BYTES; + } + Fr_set_zero(&digit); + limbs_from_be_bytes((limb_t *)&digit, p - n, n); + Fr_mul_montg(&digit, &digit, &radix); + Fr_add(out, out, &digit); + // at this point : out = l_1*R + L_2*R^2 .. + L_n*R^n, + // reduce the extra R + Fr_from_montg(out, out); + // clean up possible sensitive data + Fr_set_zero(&digit); +} + +// Reads a scalar from an array and maps it to Fr using modular reduction. +// Input is byte-big-endian as used by the external APIs. +// It returns true if scalar is zero and false otherwise. +bool map_bytes_to_Fr(Fr *a, const byte *in, int in_len) { + Fr_from_be_bytes(a, in, in_len); + return Fr_is_zero(a); +} + +// ------------------- Fp utilities + +// Montgomery constants related to the prime p +const Fp BLS12_381_pR = {ONE_MONT_P}; /* R mod p = (1<<384)%p */ + +// sets `a` to 0 +static void Fp_set_zero(Fp *a) { vec_zero((byte *)a, sizeof(Fp)); } + +// sets `a` to limb `l` +static void Fp_set_limb(Fp *a, const limb_t l) { + vec_zero((byte *)a + sizeof(limb_t), sizeof(Fp) - sizeof(limb_t)); + *((limb_t *)a) = l; +} + +void Fp_copy(Fp *res, const Fp *a) { + if ((uptr_t)a == (uptr_t)res) { + return; + } + vec_copy((byte *)res, (byte *)a, sizeof(Fp)); +} + +static void Fp_add(Fp *res, const Fp *a, const Fp *b) { + add_mod_384((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_P); +} + +static void Fp_sub(Fp *res, const Fp *a, const Fp *b) { + sub_mod_384((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_P); +} + +static void Fp_neg(Fp *res, const Fp *a) { + cneg_mod_384((limb_t *)res, (limb_t *)a, 1, BLS12_381_P); +} + +// checks if `a` is a quadratic residue in Fp. If yes, it computes +// the square root in `res`. // -// It returns RLC_OK if the inputs are valid (input buffer lengths are valid and coordinates correspond -// to a point on curve) and the execution completes, and RLC_ERR otherwise. -int ep_read_bin_compact(ep_t a, const byte *bin, const int len) { - // check the length - const int G1_size = (G1_BYTES/(G1_SERIALIZATION+1)); - if (len!=G1_size) { - return RLC_ERR; - } +// The boolean output is valid whether `a` is in Montgomery form or not, +// since montgomery constant `R` is a quadratic residue. +// However, the square root is valid only if `a` is in montgomery form. 
+static bool Fp_sqrt_montg(Fp *res, const Fp *a) { + return sqrt_fp((limb_t *)res, (limb_t *)a); +} - // check the compression bit - int compressed = bin[0] >> 7; - if ((compressed == 1) != (G1_SERIALIZATION == COMPRESSED)) { - return RLC_ERR; - } - - // check if the point is infinity - int is_infinity = bin[0] & 0x40; - if (is_infinity) { - // check if the remaining bits are cleared - if (bin[0] & 0x3F) { - return RLC_ERR; - } - for (int i=1; i> 5) & 1; - if (y_sign && (!compressed)) { - return RLC_ERR; - } - - a->coord = BASIC; - fp_set_dig(a->z, 1); - // use a temporary buffer to mask the header bits and read a.x - byte temp[Fp_BYTES]; - memcpy(temp, bin, Fp_BYTES); - temp[0] &= 0x1F; - if (fp_read_bin_safe(a->x, temp, sizeof(temp)) != RLC_OK) { - return RLC_ERR; - } +static bool Fp_check(const Fp *a) { + // use same method as in BLST internal function + // which seems the most efficient. The method uses the assembly-based + // modular addition instead of limbs comparison + Fp temp; + Fp_add(&temp, a, &ZERO_384); + return vec_is_equal(&temp, a, Fp_BYTES); + // no need to clear `tmp` as no current use-case involves sensitive data being + // passed as `a` +} - if (G1_SERIALIZATION == UNCOMPRESSED) { - if (fp_read_bin_safe(a->y, bin + Fp_BYTES, Fp_BYTES) != RLC_OK) { - return RLC_ERR; - } - // check read point is on curve - if (!ep_on_curve(a)) { - return RLC_ERR; - } - return RLC_OK; - } - fp_zero(a->y); - fp_set_bit(a->y, 0, y_sign); - if (ep_upk(a, a) == 1) { - // resulting point is guaranteed to be on curve - return RLC_OK; - } - return RLC_ERR; +// res = a*b*R^(-1) +void Fp_mul_montg(Fp *res, const Fp *a, const Fp *b) { + mul_mont_384((limb_t *)res, (limb_t *)a, (limb_t *)b, BLS12_381_P, p0); } +// res = a^2 * R^(-1) +void Fp_squ_montg(Fp *res, const Fp *a) { + sqr_mont_384((limb_t *)res, (limb_t *)a, BLS12_381_P, p0); +} -// returns the sign of y. -// sign(y_0) if y_1 = 0, else sign(y_1) -static int fp2_get_sign(fp2_t y) { - if (fp_is_zero(y[1])) { // no need to convert back as the montgomery form of 0 is 0 - return fp_get_sign(y[0]); - } - return fp_get_sign(y[1]); +// res = a*R +void Fp_to_montg(Fp *res, const Fp *a) { + mul_mont_384((limb_t *)res, (limb_t *)a, BLS12_381_RR, BLS12_381_P, p0); } -// ep2_write_bin_compact exports a point in E(Fp^2) to a buffer in a compressed or uncompressed form. -// len is the allocated size of the buffer bin. -// The serialization is following: -// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) -// The code is a modified version of Relic ep2_write_bin -void ep2_write_bin_compact(byte *bin, const ep2_t a, const int len) { - ep2_t t; - ep2_null(t); - const int G2_size = (G2_BYTES/(G2_SERIALIZATION+1)); - - if (len!=G2_size) { - RLC_THROW(ERR_NO_BUFFER); - return; - } - - if (ep2_is_infty((ep2_st *)a)) { - // set the infinity bit - bin[0] = (G2_SERIALIZATION << 7) | 0x40; - memset(bin+1, 0, G2_size-1); - return; - } +// res = a*R^(-1) +void Fp_from_montg(Fp *res, const Fp *a) { + from_mont_384((limb_t *)res, (limb_t *)a, BLS12_381_P, p0); +} - RLC_TRY { - ep2_new(t); - ep2_norm(t, (ep2_st *)a); - fp2_write_bin(bin, Fp2_BYTES, t->x, 0); - - if (G2_SERIALIZATION == COMPRESSED) { - bin[0] |= (fp2_get_sign(t->y) << 5); - } else { - fp2_write_bin(bin + Fp2_BYTES, Fp2_BYTES, t->y, 0); - } - } RLC_CATCH_ANY { - RLC_THROW(ERR_CAUGHT); - } +// reads a scalar in `out` and checks it is a valid Fp element (out < p). +// input is bytes-big-endian. 
+// returns: +// - BAD_ENCODING if the length is invalid +// - BAD_VALUE if the scalar isn't in Fp +// - VALID if the scalar is valid +ERROR Fp_read_bytes(Fp *out, const byte *in, int in_len) { + if (in_len != Fp_BYTES) { + return BAD_ENCODING; + } + limbs_from_be_bytes((limb_t *)out, in, Fp_BYTES); + // compare read scalar to p + if (!Fp_check(out)) { + return BAD_VALUE; + } + return VALID; +} - bin[0] |= (G2_SERIALIZATION << 7); - ep_free(t); +// write Fp element to `out`, +// assuming `out` has `Fp_BYTES` allocated bytes. +void Fp_write_bytes(byte *out, const Fp *a) { + be_bytes_from_limbs(out, (limb_t *)a, Fp_BYTES); } -// fp2_read_bin_safe is a modified version of Relic's (void fp2_read_bin). -// It reads an Fp^2 element from a buffer and makes sure the big numbers read can be -// written as field elements (are reduced modulo p). -// Unlike Relic's versions, the function does not reduce the read integers modulo p and does -// not throw an exception for integers larger than p. The function returns RLC_OK if the input -// corresponds to a field element in Fp^2, and returns RLC_ERR otherwise. -static int fp2_read_bin_safe(fp2_t a, const uint8_t *bin, int len) { - if (len != Fp2_BYTES) { - return RLC_ERR; - } - if (fp_read_bin_safe(a[0], bin, Fp_BYTES) != RLC_OK) { - return RLC_ERR; - } - if (fp_read_bin_safe(a[1], bin + Fp_BYTES, Fp_BYTES) != RLC_OK) { - return RLC_ERR; - } - return RLC_OK; +// returns the sign of y: +// 1 if y > (p - 1)/2 and 0 otherwise. +// y is in montgomery form! +static byte Fp_get_sign(const Fp *y) { + // - BLST's sgn0_pty_mont_384 requires input to be in Montg form. + // - The needed sign bit is on position 1 + return (sgn0_pty_mont_384((const limb_t *)y, BLS12_381_P, p0) >> 1) & 1; } -// ep2_read_bin_compact imports a point from a buffer in a compressed or uncompressed form. -// The resulting point is guaranteed to be on curve E2. +// ------------------- Fp^2 utilities + +// sets `a` to limb `l` +static void Fp2_set_limb(Fp2 *a, const limb_t l) { + Fp_set_limb(&real(a), l); + Fp_set_zero(&imag(a)); +} + +static void Fp2_add(Fp2 *res, const Fp2 *a, const Fp2 *b) { + add_mod_384x((vec384 *)res, (vec384 *)a, (vec384 *)b, BLS12_381_P); +} + +static void Fp2_sub(Fp2 *res, const Fp2 *a, const Fp2 *b) { + sub_mod_384x((vec384 *)res, (vec384 *)a, (vec384 *)b, BLS12_381_P); +} + +static void Fp2_neg(Fp2 *res, const Fp2 *a) { + cneg_mod_384(real(res), real(a), 1, BLS12_381_P); + cneg_mod_384(imag(res), imag(a), 1, BLS12_381_P); +} + +// res = a*b in montgomery form +static void Fp2_mul_montg(Fp2 *res, const Fp2 *a, const Fp2 *b) { + mul_mont_384x((vec384 *)res, (vec384 *)a, (vec384 *)b, BLS12_381_P, p0); +} + +// res = a^2 in montgomery form +static void Fp2_squ_montg(Fp2 *res, const Fp2 *a) { + sqr_mont_384x((vec384 *)res, (vec384 *)a, BLS12_381_P, p0); +} + +// checks if `a` is a quadratic residue in Fp^2. If yes, it computes +// the square root in `res`. // -// It returns RLC_OK if the inputs are valid (input buffer lengths are valid and read coordinates -// correspond to a point on curve) and the execution completes and RLC_ERR otherwise. -// The code is a modified version of Relic ep2_read_bin -int ep2_read_bin_compact(ep2_t a, const byte *bin, const int len) { - // check the length - const int G2size = (G2_BYTES/(G2_SERIALIZATION+1)); - if (len!=G2size) { - return RLC_ERR; - } +// The boolean output is valid whether `a` is in Montgomery form or not, +// since montgomery constant `R` is itself a quadratic residue. 
+// However, the square root is correct only if `a` is in montgomery form +// (the square root would be in montgomery form too). +static bool Fp2_sqrt_montg(Fp2 *res, const Fp2 *a) { + return sqrt_fp2((vec384 *)res, (vec384 *)a); +} - // check the compression bit - int compressed = bin[0] >> 7; - if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { - return RLC_ERR; - } - - // check if the point in infinity - int is_infinity = bin[0] & 0x40; - if (is_infinity) { - // the remaining bits need to be cleared - if (bin[0] & 0x3F) { - return RLC_ERR; - } - for (int i=1; i> 5) & 1; - if (y_sign && (!compressed)) { - return RLC_ERR; - } - - a->coord = BASIC; - fp2_set_dig(a->z, 1); // a.z - // use a temporary buffer to mask the header bits and read a.x - byte temp[Fp2_BYTES]; - memcpy(temp, bin, Fp2_BYTES); - temp[0] &= 0x1F; // clear the header bits - if (fp2_read_bin_safe(a->x, temp, sizeof(temp)) != RLC_OK) { - return RLC_ERR; - } +// returns the sign of y: +// sign(y_0) if y_1 = 0, else sign(y_1). +// y coordinates must be in montgomery form! +static byte Fp2_get_sign(Fp2 *y) { + // - BLST's sgn0_pty_mont_384x requires input to be in montgomery form. + // - the sign bit is on position 1 + return (sgn0_pty_mont_384x((vec384 *)y, BLS12_381_P, p0) >> 1) & 1; +} - if (G2_SERIALIZATION == UNCOMPRESSED) { - if (fp2_read_bin_safe(a->y, bin + Fp2_BYTES, Fp2_BYTES) != RLC_OK){ - return RLC_ERR; - } - // check read point is on curve - if (!ep2_on_curve(a)) { - return RLC_ERR; - } - return RLC_OK; - } - - fp2_zero(a->y); - fp_set_bit(a->y[0], 0, y_sign); - fp_zero(a->y[1]); - if (ep2_upk(a, a) == 1) { - // resulting point is guaranteed to be on curve - return RLC_OK; - } - return RLC_ERR; +// reads an Fp^2 element in `a`. +// input is a serialization of real(a) concatenated to serializetion of imag(a). +// a[i] are both Fp elements. +// returns: +// - BAD_ENCODING if the length is invalid +// - BAD_VALUE if the scalar isn't in Fp +// - VALID if the scalar is valid +static ERROR Fp2_read_bytes(Fp2 *a, const byte *in, int in_len) { + if (in_len != Fp2_BYTES) { + return BAD_ENCODING; + } + ERROR ret = Fp_read_bytes(&real(a), in, Fp_BYTES); + if (ret != VALID) { + return ret; + } + ret = Fp_read_bytes(&imag(a), in + Fp_BYTES, Fp_BYTES); + if (ret != VALID) { + return ret; + } + return VALID; } -// reads a scalar in a and checks it is a valid Zr element (a < r) -// returns RLC_OK if the scalar is valid and RLC_ERR otherwise. -int bn_read_Zr_bin(bn_t a, const uint8_t *bin, int len) { - if (len!=Fr_BYTES) { - return RLC_ERR; - } - bn_read_bin(a, bin, Fr_BYTES); - bn_t r; - bn_new(r); - g2_get_ord(r); - if (bn_cmp(a, r) == RLC_LT) { - return RLC_OK; +// write Fp2 element to bin and assume `bin` has `Fp2_BYTES` allocated bytes. 
+void Fp2_write_bytes(byte *out, const Fp2 *a) { + Fp_write_bytes(out, &real(a)); + Fp_write_bytes(out + Fp_BYTES, &imag(a)); +} + +// ------------------- E1 utilities + +void E1_copy(E1 *res, const E1 *p) { + if ((uptr_t)p == (uptr_t)res) { + return; + } + vec_copy(res, p, sizeof(E1)); +} + +// checks p1 == p2 +bool E1_is_equal(const E1 *p1, const E1 *p2) { + // `POINTonE1_is_equal` includes the infinity case + return POINTonE1_is_equal((const POINTonE1 *)p1, (const POINTonE1 *)p2); +} + +// compare `p` to infinity +bool E1_is_infty(const E1 *p) { + // BLST infinity points are defined by Z=0 + return vec_is_zero(p->z, sizeof(p->z)); +} + +// set `p` to infinity +void E1_set_infty(E1 *p) { + // BLST infinity points are defined by Z=0 + vec_zero(p->z, sizeof(p->z)); +} + +// converts an E1 point from Jacobian into affine coordinates (z=1) +void E1_to_affine(E1 *res, const E1 *p) { + // optimize in case coordinates are already affine + if (vec_is_equal(p->z, BLS12_381_pR, Fp_BYTES)) { + E1_copy(res, p); + return; + } + // convert from Jacobian + POINTonE1_from_Jacobian((POINTonE1 *)res, (const POINTonE1 *)p); +} + +// checks affine point `p` is in E1 +bool E1_affine_on_curve(const E1 *p) { + // BLST's `POINTonE1_affine_on_curve` does not include the infinity case! + return POINTonE1_affine_on_curve((POINTonE1_affine *)p) | E1_is_infty(p); +} + +// checks if input E1 point is on the subgroup G1. +// It assumes input `p` is on E1. +bool E1_in_G1(const E1 *p) { + // currently uses Scott method + return POINTonE1_in_G1((const POINTonE1 *)p); +} + +// E1_read_bytes imports a E1(Fp) point from a buffer in a compressed or +// uncompressed form. The resulting point is guaranteed to be on curve E1 (no G1 +// check is included). Expected serialization follows: +// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) +// +// returns: +// - BAD_ENCODING if the length is invalid or serialization header bits are +// invalid +// - BAD_VALUE if Fp coordinates couldn't deserialize +// - POINT_NOT_ON_CURVE if deserialized point isn't on E1 +// - VALID if deserialization is valid + +// Note: could use POINTonE1_Deserialize_BE and POINTonE1_Uncompress_Z, +// but needs to update the logic around G2 subgroup check +ERROR E1_read_bytes(E1 *a, const byte *in, const int in_len) { + // check the length + if (in_len != G1_SER_BYTES) { + return BAD_ENCODING; + } + + // check the compression bit + int compressed = in[0] >> 7; + if ((compressed == 1) != (G1_SERIALIZATION == COMPRESSED)) { + return BAD_ENCODING; + } + + // check if the point in infinity + int is_infinity = in[0] & 0x40; + if (is_infinity) { + // the remaining bits need to be cleared + if (in[0] & 0x3F) { + return BAD_ENCODING; } - return RLC_ERR; -} - -// computes the sum of the array elements x and writes the sum in jointx -// the sum is computed in Zr -void bn_sum_vector(bn_t jointx, const bn_st* x, const int len) { - bn_t r; - bn_new(r); - g2_get_ord(r); - bn_set_dig(jointx, 0); - bn_new_size(jointx, BITS_TO_DIGITS(Fr_BITS+1)); - for (int i=0; i> 5) & 1; + if (y_sign && (!compressed)) { + return BAD_ENCODING; + } + + // use a temporary buffer to mask the header bits and read a.x + byte temp[Fp_BYTES]; + memcpy(temp, in, Fp_BYTES); + temp[0] &= 0x1F; // clear the header bits + ERROR ret = Fp_read_bytes(&a->x, temp, sizeof(temp)); + if (ret != VALID) { + return ret; + } + Fp_to_montg(&a->x, &a->x); -// computes the sum of the G2 array elements y and writes the sum in jointy -void 
ep2_sum_vector(ep2_t jointy, ep2_st* y, const int len){ - ep2_set_infty(jointy); - for (int i=0; iz, &BLS12_381_pR); + + if (G1_SERIALIZATION == UNCOMPRESSED) { + ret = Fp_read_bytes(&a->y, in + Fp_BYTES, sizeof(a->y)); + if (ret != VALID) { + return ret; } - ep2_norm(jointy, jointy); // not necessary but left here to optimize the - // multiple pairing computations with the same - // public key -} - -// Verifies the validity of 2 SPoCK proofs and 2 public keys. -// Membership check in G1 of both proofs is verified in this function. -// Membership check in G2 of both keys is not verified in this function. -// the membership check in G2 is separated to allow optimizing multiple verifications -// using the same public keys. -int bls_spock_verify(const ep2_t pk1, const byte* sig1, const ep2_t pk2, const byte* sig2) { - ep_t elemsG1[2]; - ep2_t elemsG2[2]; - - // elemsG1[0] = s1 - ep_new(elemsG1[0]); - int read_ret = ep_read_bin_compact(elemsG1[0], sig1, SIGNATURE_LEN); - if (read_ret != RLC_OK) - return read_ret; - - // check s1 is in G1 - if (check_membership_G1(elemsG1[0]) != VALID) // only enabled if MEMBERSHIP_CHECK==1 - return INVALID; - - // elemsG1[1] = s2 - ep_new(elemsG1[1]); - read_ret = ep_read_bin_compact(elemsG1[1], sig2, SIGNATURE_LEN); - if (read_ret != RLC_OK) - return read_ret; - - // check s2 in G1 - if (check_membership_G1(elemsG1[1]) != VALID) // only enabled if MEMBERSHIP_CHECK==1 - return INVALID; - - // elemsG2[1] = pk1 - ep2_new(elemsG2[1]); - ep2_copy(elemsG2[1], (ep2_st*)pk1); - - // elemsG2[0] = pk2 - ep2_new(elemsG2[0]); - ep2_copy(elemsG2[0], (ep2_st*)pk2); - -#if DOUBLE_PAIRING - // elemsG2[0] = -pk2 - ep2_neg(elemsG2[0], elemsG2[0]); - - fp12_t pair; - fp12_new(&pair); - // double pairing with Optimal Ate - pp_map_sim_oatep_k12(pair, (ep_t*)(elemsG1) , (ep2_t*)(elemsG2), 2); - - // compare the result to 1 - int res = fp12_cmp_dig(pair, 1); - -#elif SINGLE_PAIRING - fp12_t pair1, pair2; - fp12_new(&pair1); fp12_new(&pair2); - pp_map_oatep_k12(pair1, elemsG1[0], elemsG2[0]); - pp_map_oatep_k12(pair2, elemsG1[1], elemsG2[1]); - - int res = fp12_cmp(pair1, pair2); -#endif - fp12_free(&one); - ep_free(elemsG1[0]); - ep_free(elemsG1[1]); - ep2_free(elemsG2[0]); - ep2_free(elemsG2[1]); - - if (core_get()->code == RLC_OK) { - if (res == RLC_EQ) return VALID; - return INVALID; + Fp_to_montg(&a->y, &a->y); + // check read point is on curve + if (!E1_affine_on_curve(a)) { + return POINT_NOT_ON_CURVE; } - return UNDEFINED; + return VALID; + } + + // compute the possible square root + Fp_squ_montg(&a->y, &a->x); + Fp_mul_montg(&a->y, &a->y, &a->x); // x^3 + Fp_add(&a->y, &a->y, &B_E1); // B_E1 is already in montg form + // check whether x^3+b is a quadratic residue + if (!Fp_sqrt_montg(&a->y, &a->y)) { + return POINT_NOT_ON_CURVE; + } + + // resulting (x,y) is guaranteed to be on curve (y is already in montg form) + if (Fp_get_sign(&a->y) != y_sign) { + Fp_neg(&a->y, &a->y); // flip y sign if needed + } + return VALID; } -// Subtracts the sum of a G2 array elements y from an element x and writes the -// result in res -void ep2_subtract_vector(ep2_t res, ep2_t x, ep2_st* y, const int len){ - ep2_sum_vector(res, y, len); - ep2_neg(res, res); - ep2_add_projc(res, x, res); +// E1_write_bytes exports a point in E1(Fp) to a buffer in a compressed or +// uncompressed form. 
It assumes buffer is of length G1_SER_BYTES The +// serialization follows: +// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) +void E1_write_bytes(byte *out, const E1 *a) { + if (E1_is_infty(a)) { + memset(out, 0, G1_SER_BYTES); + // set the infinity bit + out[0] = (G1_SERIALIZATION << 7) | (1 << 6); + return; + } + E1 tmp; + E1_to_affine(&tmp, a); + + Fp_from_montg(&tmp.x, &tmp.x); + Fp_write_bytes(out, &tmp.x); + + if (G1_SERIALIZATION == COMPRESSED) { + out[0] |= (Fp_get_sign(&tmp.y) << 5); + } else { + Fp_from_montg(&tmp.y, &tmp.y); + Fp_write_bytes(out + Fp_BYTES, &tmp.y); + } + // compression bit + out[0] |= (G1_SERIALIZATION << 7); } -// computes the sum of the G1 array elements y and writes the sum in jointy -void ep_sum_vector(ep_t jointx, ep_st* x, const int len) { - ep_set_infty(jointx); - for (int i=0; iep_r); - if (!ep_is_infty(inf)){ - ep_free(inf); - return INVALID; - } - ep_free(inf); - return VALID; + return error; } -// uses a simple scalar multiplication by G1's order -// to check whether a point on the curve E2 is in G2. -int simple_subgroup_check_G2(const ep2_t p){ - ep2_t inf; - ep2_new(inf); - // check p^order == infinity - // use basic double & add as lwnaf reduces the expo modulo r - ep2_mul_basic(inf, (ep2_st*)p, &core_get()->ep_r); - if (!ep2_is_infty(inf)){ - ep2_free(inf); - return INVALID; - } - ep2_free(inf); - return VALID; +// Exponentiation of generator g1 of G1, res = expo.g1 +void G1_mult_gen(E1 *res, const Fr *expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE1_mult_glv((POINTonE1 *)res, &BLS12_381_G1, tmp); + vec_zero(&tmp, sizeof(tmp)); +} + +// Reads a scalar bytes and maps it to Fp using modular reduction. +// output is in Montgomery form. +// `in_len` must be less or equal to 96 bytes and must be a multiple of 8. +// This function is only used by `map_to_G1` where input is 64 bytes. +// input `in_len` is not checked to satisfy the conditions above. +static void map_96_bytes_to_Fp(Fp *a, const byte *in, int in_len) { + vec768 tmp; + vec_zero(&tmp, sizeof(tmp)); + limbs_from_be_bytes((limb_t *)tmp, in, in_len); + redc_mont_384((limb_t *)a, tmp, BLS12_381_P, p0); // aR^(-2) + Fp_mul_montg(a, a, (Fp *)BLS12_381_RRRR); // aR +} + +// maps bytes input `hash` to G1. +// `hash` must be `MAP_TO_G1_INPUT_LEN` (128 bytes) +// It uses construction 2 from section 5 in https://eprint.iacr.org/2019/403.pdf +int map_to_G1(E1 *h, const byte *hash, const int hash_len) { + // sanity check of length + if (hash_len != MAP_TO_G1_INPUT_LEN) { + return INVALID; + } + // map to field elements + Fp u[2]; + const int half = MAP_TO_G1_INPUT_LEN / 2; + map_96_bytes_to_Fp(&u[0], hash, half); + map_96_bytes_to_Fp(&u[1], hash + half, half); + // map field elements to G1 + // inputs must be in Montgomery form + map_to_g1((POINTonE1 *)h, (limb_t *)&u[0], (limb_t *)&u[1]); + return VALID; +} + +// maps the bytes to a point in G1. +// `len` should be at least Fr_BYTES. +// this is a testing file only, should not be used in any protocol! 
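E1_read_bytes and E1_write_bytes above rely on the three ZCash header bits of the first serialized byte: bit 7 flags compression, bit 6 flags the point at infinity, and bit 5 carries the sign of y for compressed encodings. The standalone Go sketch below illustrates only those header checks for a 48-byte compressed G1 encoding; the function and constant names are illustrative and not part of the package, and no field or curve arithmetic is performed.

package main

import (
	"errors"
	"fmt"
)

// decodeG1Header inspects the three ZCash header bits of a 48-byte
// compressed G1 encoding, mirroring the checks in E1_read_bytes above.
func decodeG1Header(in []byte) (isInfinity, ySign bool, err error) {
	const g1CompressedLen = 48 // Fp_BYTES for BLS12-381
	if len(in) != g1CompressedLen {
		return false, false, errors.New("invalid length")
	}
	if in[0]>>7 != 1 {
		return false, false, errors.New("expected the compression bit to be set")
	}
	isInfinity = in[0]&0x40 != 0
	ySign = (in[0]>>5)&1 == 1
	if isInfinity {
		// canonical infinity: all remaining bits must be zero
		if in[0]&0x3F != 0 {
			return false, false, errors.New("non-canonical infinity encoding")
		}
		for _, b := range in[1:] {
			if b != 0 {
				return false, false, errors.New("non-canonical infinity encoding")
			}
		}
	}
	return isInfinity, ySign, nil
}

func main() {
	identity := make([]byte, 48)
	identity[0] = 0xC0 // compression bit | infinity bit
	fmt.Println(decodeG1Header(identity))
}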
+void unsafe_map_bytes_to_G1(E1 *p, const byte *bytes, int len) { + assert(len >= Fr_BYTES); + // map to Fr + Fr log; + map_bytes_to_Fr(&log, bytes, len); + // multiplies G1 generator by a random scalar + G1_mult_gen(p, &log); } -#if (MEMBERSHIP_CHECK_G1 == BOWE) -// beta such that beta^3 == 1 mod p -// beta is in the Montgomery form -const uint64_t beta_data[Fp_DIGITS] = { - 0xcd03c9e48671f071, 0x5dab22461fcda5d2, 0x587042afd3851b95, - 0x8eb60ebe01bacb9e, 0x03f97d6e83d050d2, 0x18f0206554638741, -}; - - -// (z^2-1)/3 with z being the parameter of bls12-381 -const uint64_t z2_1_by3_data[2] = { - 0x0000000055555555, 0x396c8c005555e156 -}; - -// uses Bowe's check from section 3.2 from https://eprint.iacr.org/2019/814.pdf -// to check whether a point on the curve E1 is in G1. -int bowe_subgroup_check_G1(const ep_t p){ - if (ep_is_infty(p) == 1) - return VALID; - fp_t b; - dv_copy(b, beta_data, Fp_DIGITS); - ep_t sigma, sigma2, p_inv; - ep_new(sigma); - ep_new(sigma2); - ep_new(p_inv); - - // si(p) - ep_copy(sigma, p); - fp_mul(sigma[0].x, sigma[0].x, b); - // -si^2(p) - ep_copy(sigma2, sigma); - fp_mul(sigma2[0].x, sigma2[0].x, b); - fp_neg(sigma2[0].y, sigma2[0].y); - ep_dbl(sigma, sigma); - // -p - ep_copy(p_inv, p); - fp_neg(p_inv[0].y, p_inv[0].y); - // (z^2-1)/3 (2*si(p) - p - si^2(p)) - si^2(p) - ep_add(sigma, sigma, p_inv); - ep_add(sigma, sigma, sigma2); - // TODO: multiplication using a chain? - ep_mul_lwnaf(sigma, sigma, &bls_prec->z2_1_by3); - ep_add(sigma, sigma, sigma2); - - ep_free(sigma2); - ep_free(p_inv); - // check result against infinity - if (!ep_is_infty(sigma)){ - ep_free(sigma); - return INVALID; +// maps bytes to a point in E1\G1. +// `len` must be at least 96 bytes. +// this is a testing function only, should not be used in any protocol! +void unsafe_map_bytes_to_G1complement(E1 *p, const byte *in, int in_len) { + assert(in_len >= 96); + Fp u; + map_96_bytes_to_Fp(&u, in, 96); + // map to E1's isogenous and then to E1 + map_to_isogenous_E1((POINTonE1 *)p, u); + isogeny_map_to_E1((POINTonE1 *)p, (POINTonE1 *)p); + // clear G1 order + E1_mult(p, p, (Fr *)&BLS12_381_r); +} + +// ------------------- E2 utilities + +const E2 *BLS12_381_g2 = (const E2 *)&BLS12_381_G2; +const E2 *BLS12_381_minus_g2 = (const E2 *)&BLS12_381_NEG_G2; + +// E2_read_bytes imports a E2(Fp^2) point from a buffer in a compressed or +// uncompressed form. The resulting point is guaranteed to be on curve E2 (no G2 +// check is included). +// E2 point is in affine coordinates. This avoids further conversions +// when the point is used in multiple pairing computation. +// +// returns: +// - BAD_ENCODING if the length is invalid or serialization header bits are +// invalid +// - BAD_VALUE if Fp^2 coordinates couldn't deserialize +// - POINT_NOT_ON_CURVE if deserialized point isn't on E2 +// - VALID if deserialization is valid +// +// Note: can use with POINTonE2_Deserialize_BE and POINTonE2_Uncompress_Z, +// and update the logic around G2 subgroup check. 
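map_96_bytes_to_Fp and map_to_G1 above implement hash-to-field by splitting the 128-byte expanded message into two 64-byte halves and reducing each half modulo p (big-endian), which keeps the modular-reduction bias negligible. A minimal Go sketch of that reduction using math/big follows; the names blsP and hashToTwoFieldElements are illustrative, and the real C code additionally converts the results to Montgomery form.

package main

import (
	"fmt"
	"math/big"
)

// BLS12-381 base field modulus p (illustrative constant name).
var blsP, _ = new(big.Int).SetString(
	"1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624"+
		"1eabfffeb153ffffb9feffffffffaaab", 16)

// hashToTwoFieldElements mimics the structure of map_to_G1 above:
// a 128-byte expanded message is split into two 64-byte halves and each
// half is reduced modulo p, so the statistical bias of the result is
// on the order of 2^-128 or smaller.
func hashToTwoFieldElements(expanded []byte) (*big.Int, *big.Int, error) {
	if len(expanded) != 128 {
		return nil, nil, fmt.Errorf("expected 128 bytes, got %d", len(expanded))
	}
	u0 := new(big.Int).SetBytes(expanded[:64])
	u1 := new(big.Int).SetBytes(expanded[64:])
	return u0.Mod(u0, blsP), u1.Mod(u1, blsP), nil
}

func main() {
	msg := make([]byte, 128)
	for i := range msg {
		msg[i] = byte(i)
	}
	u0, u1, _ := hashToTwoFieldElements(msg)
	fmt.Println(u0.BitLen(), u1.BitLen())
}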
+ERROR E2_read_bytes(E2 *a, const byte *in, const int in_len) { + // check the length + if (in_len != G2_SER_BYTES) { + return BAD_ENCODING; + } + + // check the compression bit + int compressed = in[0] >> 7; + if ((compressed == 1) != (G2_SERIALIZATION == COMPRESSED)) { + return BAD_ENCODING; + } + + // check if the point in infinity + int is_infinity = in[0] & 0x40; + if (is_infinity) { + // the remaining bits need to be cleared + if (in[0] & 0x3F) { + return BAD_ENCODING; + } + for (int i = 1; i < G2_SER_BYTES - 1; i++) { + if (in[i]) { + return BAD_ENCODING; + } } - ep_free(sigma); + E2_set_infty(a); return VALID; + } + + // read the sign bit and check for consistency + int y_sign = (in[0] >> 5) & 1; + if (y_sign && (!compressed)) { + return BAD_ENCODING; + } + + // use a temporary buffer to mask the header bits and read a.x + byte temp[Fp2_BYTES]; + memcpy(temp, in, Fp2_BYTES); + temp[0] &= 0x1F; // clear the header bits + ERROR ret = Fp2_read_bytes(&a->x, temp, sizeof(temp)); + if (ret != VALID) { + return ret; + } + Fp2 *a_x = &(a->x); + Fp_to_montg(&real(a_x), &real(a_x)); + Fp_to_montg(&imag(a_x), &imag(a_x)); + + // set a.z to 1 + Fp2 *a_z = &(a->z); + Fp_copy(&real(a_z), &BLS12_381_pR); + Fp_set_zero(&imag(a_z)); + + Fp2 *a_y = &(a->y); + if (G2_SERIALIZATION == UNCOMPRESSED) { + ret = Fp2_read_bytes(a_y, in + Fp2_BYTES, sizeof(a->y)); + if (ret != VALID) { + return ret; + } + Fp_to_montg(&real(a_y), &real(a_y)); + Fp_to_montg(&imag(a_y), &imag(a_y)); + // check read point is on curve + if (!E2_affine_on_curve(a)) { + return POINT_NOT_ON_CURVE; + } + return VALID; + } + + // compute the possible square root + Fp2_squ_montg(a_y, a_x); + Fp2_mul_montg(a_y, a_y, a_x); // x^3 + Fp2_add(a_y, a_y, &B_E2); // B_E2 is already in Montg form + if (!Fp2_sqrt_montg(a_y, a_y)) // check whether x^3+b is a quadratic residue + return POINT_NOT_ON_CURVE; + + // resulting (x,y) is guaranteed to be on curve (y is already in Montg form) + if (Fp2_get_sign(a_y) != y_sign) { + Fp2_neg(a_y, a_y); // flip y sign if needed + } + return VALID; +} + +// E2_write_bytes exports a point in E2(Fp^2) to a buffer in a compressed or +// uncompressed form. It assumes buffer is of length G2_SER_BYTES The +// serialization follows: +// https://www.ietf.org/archive/id/draft-irtf-cfrg-pairing-friendly-curves-08.html#name-zcash-serialization-format-) +void E2_write_bytes(byte *out, const E2 *a) { + if (E2_is_infty(a)) { + // set the infinity bit + out[0] = (G2_SERIALIZATION << 7) | (1 << 6); + memset(out + 1, 0, G2_SER_BYTES - 1); + return; + } + E2 tmp; + E2_to_affine(&tmp, a); + + Fp2 *t_x = &(tmp.x); + Fp_from_montg(&real(t_x), &real(t_x)); + Fp_from_montg(&imag(t_x), &imag(t_x)); + Fp2_write_bytes(out, t_x); + + Fp2 *t_y = &(tmp.y); + if (G2_SERIALIZATION == COMPRESSED) { + out[0] |= (Fp2_get_sign(t_y) << 5); + } else { + Fp_from_montg(&real(t_y), &real(t_y)); + Fp_from_montg(&imag(t_y), &imag(t_y)); + Fp2_write_bytes(out + Fp2_BYTES, t_y); + } + + out[0] |= (G2_SERIALIZATION << 7); +} + +// set p to infinity +void E2_set_infty(E2 *p) { + // BLST infinity points are defined by Z=0 + vec_zero(p->z, sizeof(p->z)); +} + +// check if `p` is infinity +bool E2_is_infty(const E2 *p) { + // BLST infinity points are defined by Z=0 + return vec_is_zero(p->z, sizeof(p->z)); +} + +// checks affine point `p` is in E2 +bool E2_affine_on_curve(const E2 *p) { + // BLST's `POINTonE2_affine_on_curve` does not include the infinity case! 
+ return POINTonE2_affine_on_curve((POINTonE2_affine *)p) | E2_is_infty(p); +} + +// checks p1 == p2 +bool E2_is_equal(const E2 *p1, const E2 *p2) { + // `POINTonE2_is_equal` includes the infinity case + return POINTonE2_is_equal((const POINTonE2 *)p1, (const POINTonE2 *)p2); +} + +// res = p +void E2_copy(E2 *res, const E2 *p) { + if ((uptr_t)p == (uptr_t)res) { + return; + } + vec_copy(res, p, sizeof(E2)); +} + +// converts an E2 point from Jacobian into affine coordinates (z=1) +void E2_to_affine(E2 *res, const E2 *p) { + // optimize in case coordinates are already affine + if (vec_is_equal(p->z, BLS12_381_Rx.p2, sizeof(p->z))) { + E2_copy(res, p); + return; + } + // convert from Jacobian + POINTonE2_from_Jacobian((POINTonE2 *)res, (const POINTonE2 *)p); +} + +// generic point addition that must handle doubling and points at infinity +void E2_add(E2 *res, const E2 *a, const E2 *b) { + POINTonE2_dadd((POINTonE2 *)res, (POINTonE2 *)a, (POINTonE2 *)b, NULL); +} + +// generic point double that must handle point at infinity +static void E2_double(E2 *res, const E2 *a) { + POINTonE2_double((POINTonE2 *)res, (POINTonE2 *)a); +} + +// Point negation: res = -a +void E2_neg(E2 *res, const E2 *a) { + E2_copy(res, a); + POINTonE2_cneg((POINTonE2 *)res, 1); } -#endif -// generates a random point in G1 and stores it in p -void ep_rand_G1(ep_t p) { - // multiplies G1 generator by a random scalar - ep_rand(p); -} - -// generates a random point in E1\G1 and stores it in p -void ep_rand_G1complement(ep_t p) { - // generate a random point in E1 - p->coord = BASIC; - fp_set_dig(p->z, 1); - do { - fp_rand(p->x); // set x to a random field element - byte r; - rand_bytes(&r, 1); - fp_zero(p->y); - fp_set_bit(p->y, 0, r&1); // set y randomly to 0 or 1 +// Exponentiation of a generic point `a` in E2, res = expo.a +void E2_mult(E2 *res, const E2 *p, const Fr *expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE2_mult_gls((POINTonE2 *)res, (POINTonE2 *)p, tmp); + vec_zero(&tmp, sizeof(tmp)); +} + +// Exponentiation of a generic point `a` in E2 by a byte exponent, +// using a classic double-and-add algorithm (non constant-time) +void E2_mult_small_expo(E2 *res, const E2 *p, const byte expo) { + // return early if expo is zero + if (expo == 0) { + E2_set_infty(res); + return; + } + // expo is non zero + + byte mask = 1 << 7; + // process the most significant zero bits + while ((expo & mask) == 0) { + mask >>= 1; + } + + // process the first `1` bit + E2 tmp; + E2_copy(&tmp, p); + mask >>= 1; + // scan the remaining bits + for (; mask != 0; mask >>= 1) { + E2_double(&tmp, &tmp); + if (expo & mask) { + E2_add(&tmp, &tmp, p); } - while (ep_upk(p, p) == 0); // make sure p is in E1 + } + E2_copy(res, &tmp); +} - // map the point to E1\G1 by clearing G1 order - ep_mul_basic(p, p, &core_get()->ep_r); +// Exponentiation of generator g2 of G2, res = expo.g2 +void G2_mult_gen(E2 *res, const Fr *expo) { + pow256 tmp; + pow256_from_Fr(tmp, expo); + POINTonE2_mult_gls((POINTonE2 *)res, (POINTonE2 *)BLS12_381_g2, tmp); + vec_zero(&tmp, sizeof(tmp)); +} + +// Exponentiation of generator g2 of G2, res = expo.g2. +// +// Result is converted to affine. This is useful for results being used multiple +// times in pairings. Conversion to affine saves later pre-pairing conversions. +void G2_mult_gen_to_affine(E2 *res, const Fr *expo) { + G2_mult_gen(res, expo); + E2_to_affine(res, res); +} - assert(ep_on_curve(p)); // sanity check to make sure p is in E1 +// checks if input E2 point is on the subgroup G2. 
+// It assumes input `p` is on E2. +bool E2_in_G2(const E2 *p) { + // currently uses Scott method + return POINTonE2_in_G2((const POINTonE2 *)p); } -// generates a random point in G2 and stores it in p -void ep2_rand_G2(ep2_t p) { - // multiplies G2 generator by a random scalar - ep2_rand(p); +// computes the sum of the E2 array elements `y[i]` and writes it in `sum` +void E2_sum_vector(E2 *sum, const E2 *y, const int y_len) { + E2_set_infty(sum); + for (int i = 0; i < y_len; i++) { + E2_add(sum, sum, &y[i]); + } } -// generates a random point in E2\G2 and stores it in p -void ep2_rand_G2complement(ep2_t p) { - // generate a random point in E2 - p->coord = BASIC; - fp_set_dig(p->z[0], 1); - fp_zero(p->z[1]); - do { - fp2_rand(p->x); // set x to a random field element - byte r; - rand_bytes(&r, 1); - fp2_zero(p->y); - fp_set_bit(p->y[0], 0, r&1); // set y randomly to 0 or 1 +// computes the sum of the E2 array elements `y[i]`, converts it +// to affine coordinates, and writes it in `sum`. +// +// Result is converted to affine. This is useful for results being used multiple +// times in pairings. Conversion to affine saves later pre-pairing conversions. +void E2_sum_vector_to_affine(E2 *sum, const E2 *y, const int y_len) { + E2_sum_vector(sum, y, y_len); + E2_to_affine(sum, sum); +} + +// Subtracts all G2 array elements `y` from an element `x` and writes the +// result in res. +void E2_subtract_vector(E2 *res, const E2 *x, const E2 *y, const int y_len) { + E2_sum_vector(res, y, y_len); + E2_neg(res, res); + E2_add(res, x, res); +} + +// maps the bytes to a point in G2. +// `in_len` should be at least Fr_BYTES. +// this is a testing tool only, it should not be used in any protocol! +void unsafe_map_bytes_to_G2(E2 *p, const byte *in, int in_len) { + assert(in_len >= Fr_BYTES); + // map to Fr + Fr log; + map_bytes_to_Fr(&log, in, in_len); + // multiplies G2 generator by a random scalar + G2_mult_gen(p, &log); +} + +// maps `in` to a point in E2\G2 and stores it in p. +// `len` should be at least 192. +// this is a testing tool only, it should not be used in any protocol! +void unsafe_map_bytes_to_G2complement(E2 *p, const byte *in, int in_len) { + assert(in_len >= 192); + Fp2 u; + map_96_bytes_to_Fp(&real(&u), in, 96); + map_96_bytes_to_Fp(&imag(&u), in + 96, 96); + // map to E2's isogenous and then to E2 + map_to_isogenous_E2((POINTonE2 *)p, u); + isogeny_map_to_E2((POINTonE2 *)p, (POINTonE2 *)p); + // clear G2 order + E2_mult(p, p, (Fr *)&BLS12_381_r); +} + +// ------------------- Pairing utilities + +bool Fp12_is_one(Fp12 *a) { + return vec_is_equal(a, BLS12_381_Rx.p12, sizeof(Fp12)); +} + +void Fp12_set_one(Fp12 *a) { vec_copy(a, BLS12_381_Rx.p12, sizeof(Fp12)); } + +// computes e(p[0], q[0]) * ... * e(q[len-1], q[len-1]) +// by optimizing a common final exponentiation for all pairings. +// result is stored in `res`. +// It assumes `p` and `q` are correctly initialized and all +// p[i] and q[i] are respectively on G1 and G2 (it does not +// check their memberships). +void Fp12_multi_pairing(Fp12 *res, const E1 *p, const E2 *q, const int len) { + // easier access pointer + vec384fp6 *res_vec = (vec384fp6 *)res; + // N_MAX is defined within BLST. It should represent a good tradeoff of the + // max number of miller loops to be batched in one call to `miller_loop_n`. + // miller_loop_n expects an array of `POINTonEx_affine`. 
+ POINTonE1_affine p_aff[N_MAX]; + POINTonE2_affine q_aff[N_MAX]; + int n = 0; // the number of couples (p,q) held in p_aff and q_aff + int init_flag = 0; + + for (int i = 0; i < len; i++) { + if (E1_is_infty(p + i) || E2_is_infty(q + i)) { + continue; + } + // `miller_loop_n` expects affine coordinates in a `POINTonEx_affine` array. + // `POINTonEx_affine` has a different size than `POINTonEx` and `Ex` ! + E1 tmp1; + E1_to_affine(&tmp1, p + i); + vec_copy(p_aff + n, &tmp1, sizeof(POINTonE1_affine)); + E2 tmp2; + E2_to_affine(&tmp2, q + i); + vec_copy(q_aff + n, &tmp2, sizeof(POINTonE2_affine)); + n++; + // if p_aff and q_aff are filled, batch `N_MAX` miller loops + if (n == N_MAX) { + if (!init_flag) { + miller_loop_n(res_vec, q_aff, p_aff, N_MAX); + init_flag = 1; + } else { + vec384fp12 tmp; + miller_loop_n(tmp, q_aff, p_aff, N_MAX); + mul_fp12(res_vec, res_vec, tmp); + } + n = 0; } - while (ep2_upk(p, p) == 0); // make sure p is in E1 + } + // if p_aff and q_aff aren't empty, + // the remaining couples are also batched in `n` miller loops + if (n > 0) { + if (!init_flag) { + miller_loop_n(res_vec, q_aff, p_aff, n); + init_flag = 1; + } else { + vec384fp12 tmp; + miller_loop_n(tmp, q_aff, p_aff, n); + mul_fp12(res_vec, res_vec, tmp); + } + } + + // check if no miller loop was computed + if (!init_flag) { + Fp12_set_one(res); + } + final_exp(res_vec, res_vec); +} + +// ------------------- Other utilities + +// This is a testing function and is not used in exported functions +// It uses an expand message XMD based on SHA2-256. +void xmd_sha256(byte *hash, int len_hash, byte *msg, int len_msg, byte *dst, + int len_dst) { + expand_message_xmd(hash, len_hash, NULL, 0, msg, len_msg, dst, len_dst); +} + +// DEBUG printing functions +#ifdef DEBUG +void bytes_print_(char *s, byte *data, int len) { + if (strlen(s)) + printf("[%s]:\n", s); + for (int i = 0; i < len; i++) + printf("%02X,", data[i]); + printf("\n"); +} + +void Fr_print_(char *s, Fr *a) { + if (strlen(s)) + printf("[%s]:\n", s); + limb_t *p = (limb_t *)(a) + Fr_LIMBS; + for (int i = 0; i < Fr_LIMBS; i++) + printf("%016llX", *(--p)); + printf("\n"); +} - // map the point to E1\G1 by clearing G1 order - ep2_mul_basic(p, p, &core_get()->ep_r); +void Fp_print_(char *s, const Fp *a) { + if (strlen(s)) + printf("[%s]:\n", s); + Fp tmp; + Fp_from_montg(&tmp, a); + limb_t *p = (limb_t *)(&tmp) + Fp_LIMBS; + for (int i = 0; i < Fp_LIMBS; i++) + printf("%016llX ", *(--p)); + printf("\n"); +} - assert(ep2_on_curve(p)); // sanity check to make sure p is in E1 +void Fp2_print_(char *s, const Fp2 *a) { + if (strlen(s)) + printf("[%s]:\n", s); + Fp_print_("", &real(a)); + Fp_print_("", &imag(a)); } -// This is a testing function. -// It wraps a call to a Relic macro since cgo can't call macros. 
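E2_mult_small_expo above is a textbook MSB-first double-and-add: leading zero bits of the byte exponent are skipped, then each remaining bit doubles the accumulator and conditionally adds the base point. The Go sketch below reproduces the same loop over the additive group Z_m as a stand-in for the curve group; it is purely illustrative and performs no elliptic-curve arithmetic.

package main

import "fmt"

// doubleAndAdd computes expo*p in the additive group Z_m using the same
// MSB-first loop as E2_mult_small_expo above: skip leading zero bits,
// then for each remaining bit double the accumulator and add p when the
// bit is set. Z_m only stands in for the curve group here.
func doubleAndAdd(p, expo, m uint) uint {
	if expo == 0 {
		return 0 // identity (the point at infinity in the curve setting)
	}
	mask := uint(1) << 7 // byte-sized exponent, as in E2_mult_small_expo
	for expo&mask == 0 {
		mask >>= 1
	}
	acc := p % m // first set bit: acc = p
	for mask >>= 1; mask != 0; mask >>= 1 {
		acc = (2 * acc) % m // "double"
		if expo&mask != 0 {
			acc = (acc + p) % m // "add"
		}
	}
	return acc
}

func main() {
	// 13 = 0b1101, so doubleAndAdd(7, 13, 101) should equal (13*7) mod 101
	fmt.Println(doubleAndAdd(7, 13, 101), (13*7)%101)
}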
-void xmd_sha256(uint8_t *hash, int len_hash, uint8_t *msg, int len_msg, uint8_t *dst, int len_dst){ - md_xmd_sh256(hash, len_hash, msg, len_msg, dst, len_dst); +void Fp12_print_(char *s, const Fp12 *a) { + if (strlen(s)) + printf("[%s]:\n", s); + for (int i = 0; i < 2; i++) { + vec384fp6 *a_ = (vec384fp6 *)a + i; + for (int j = 0; j < 3; j++) { + vec384fp2 *a__ = (vec384fp2 *)a_ + j; + Fp2_print_("", a__); + } + } } + +void E1_print_(char *s, const E1 *p, const int jacob) { + E1 a; + E1_copy(&a, p); + if (!jacob) + E1_to_affine(&a, &a); + if (strlen(s)) + printf("[%s]:\n", s); + Fp_print_(".x", &(a.x)); + Fp_print_(".y", &(a.y)); + if (jacob) + Fp_print_(".z", &(a.z)); +} + +void E2_print_(char *s, const E2 *p, const int jacob) { + E2 a; + E2_copy(&a, p); + if (!jacob) + E2_to_affine(&a, &a); + if (strlen(s)) + printf("[%s]:\n", s); + Fp2_print_("", &(a.x)); + Fp2_print_("", &(a.y)); + if (jacob) + Fp2_print_("", &(a.z)); +} + +#endif diff --git a/crypto/bls12381_utils.go b/crypto/bls12381_utils.go index 50676fc2c04..65a54bb9dd4 100644 --- a/crypto/bls12381_utils.go +++ b/crypto/bls12381_utils.go @@ -1,239 +1,316 @@ -//go:build relic -// +build relic - package crypto // this file contains utility functions for the curve BLS 12-381 // these tools are shared by the BLS signature scheme, the BLS based threshold signature // and the BLS distributed key generation protocols -// #cgo CFLAGS: -g -Wall -std=c99 -I${SRCDIR}/ -I${SRCDIR}/relic/build/include -I${SRCDIR}/relic/include -I${SRCDIR}/relic/include/low -// #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s +// #cgo CFLAGS: -I${SRCDIR}/ -I${SRCDIR}/blst_src -I${SRCDIR}/blst_src/build -D__BLST_CGO__ -Wall -fno-builtin-memcpy -fno-builtin-memset -Wno-unused-function -Wno-unused-macros -Wno-unused-variable +// #cgo amd64 CFLAGS: -D__ADX__ -mno-avx +// #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ // #include "bls12381_utils.h" -// #include "bls_include.h" +// +// #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) +// # include +// # include +// # include +// static void handler(int signum) +// { char text[1024] = "Caught SIGILL in blst_cgo_init, BLST library (used by flow-go/crypto) requires ADX support, build with CGO_CFLAGS=\"-O -D__BLST_PORTABLE__\"\n"; +// ssize_t n = write(2, &text, strlen(text)); +// _exit(128+SIGILL); +// (void)n; +// } +// __attribute__((constructor)) static void flow_crypto_cgo_init() +// { Fp temp = { 0 }; +// struct sigaction act = {{ handler }}, oact; +// sigaction(SIGILL, &act, &oact); +// Fp_squ_montg(&temp, &temp); +// sigaction(SIGILL, &oact, NULL); +// } +// #endif +// import "C" import ( "errors" + "fmt" + + "github.com/onflow/flow-go/crypto/random" ) -// Go wrappers to Relic C types -// Relic is compiled with ALLOC=AUTO -type pointG1 C.ep_st -type pointG2 C.ep2_st -type scalar C.bn_st +// Go wrappers around BLST C types +type pointE1 C.E1 +type pointE2 C.E2 +type scalar C.Fr + +// Note that scalars and field elements F_r are represented in Go by the same type +// called `scalar`, which is internally represented by C type `Fr`. Scalars used by the +// Go layer are all reduced modulo the curve order `r`. 
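Every Go wrapper in this file follows the same pattern: the Go struct pointer is cast to the matching C type and the C return value is converted back to a Go type, as the scalar and E2 wrappers below show. As an illustration, a hypothetical wrapper for the C function E1_is_infty (declared in bls12381_utils.h) could look like the following; it is not part of the package and assumes it lives in the same cgo file.

// isInfinity is an illustrative wrapper, written in the same style as
// pointE2.isInfinity below: cast the Go pointer to the C type and
// convert the C boolean to a Go bool.
func (p *pointE1) isInfinity() bool {
	return bool(C.E1_is_infty((*C.E1)(p)))
}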
+ +const ( + // BLS12-381 related lengths imported from the C layer + frBytesLen = int(C.Fr_BYTES) + fpBytesLen = int(C.Fp_BYTES) + g1BytesLen = int(C.G1_SER_BYTES) + g2BytesLen = int(C.G2_SER_BYTES) + + // error constants imported from the C layer + valid = C.VALID + invalid = C.INVALID + badEncoding = C.BAD_ENCODING + badValue = C.BAD_VALUE + pointNotOnCurve = C.POINT_NOT_ON_CURVE +) -// context required for the BLS set-up -type ctx struct { - relicCtx *C.ctx_t - precCtx *C.prec_st -} +// header of the point at infinity serializations +var g1SerHeader byte // g1 (G1 identity) +var g2SerHeader byte // g2 (G2 identity) -// get some constants from the C layer -// (Cgo does not export C macros) -var valid = C.get_valid() -var invalid = C.get_invalid() +// `g1` serialization +var g1Serialization []byte -// initContext sets relic B12_381 parameters and precomputes some data in the C layer -func (ct *ctx) initContext() error { - c := C.relic_init_BLS12_381() - if c == nil { - return errors.New("Relic core init failed") - } - ct.relicCtx = c - ct.precCtx = C.init_precomputed_data_BLS12_381() - return nil -} +var g2PublicKey pubKeyBLSBLS12381 + +// initialization of BLS12-381 curve +func initBLS12381() { + C.types_sanity() -// seeds the internal relic random function. -// relic context must be initialized before seeding. -func seedRelic(seed []byte) error { - if len(seed) < (securityBits / 8) { - return invalidInputsErrorf( - "seed length needs to be larger than %d", - securityBits/8) + if isG1Compressed() { + g1SerHeader = 0xC0 + } else { + g1SerHeader = 0x40 } - if len(seed) > maxRelicPrgSeed { - return invalidInputsErrorf( - "seed length needs to be less than %x", - maxRelicPrgSeed) + g1Serialization = append([]byte{g1SerHeader}, make([]byte, g1BytesLen-1)...) + if isG2Compressed() { + g2SerHeader = 0xC0 + } else { + g2SerHeader = 0x40 } - C.seed_relic((*C.uchar)(&seed[0]), (C.int)(len(seed))) - return nil + // set a global point to infinity + C.E2_set_infty((*C.E2)(&g2PublicKey.point)) + g2PublicKey.isIdentity = true } -// setContext sets the context (previously initialized) of the C layer with -// pre-saved data. -func (ct *ctx) setContext() { - C.core_set(ct.relicCtx) - C.precomputed_data_set(ct.precCtx) +// String returns a hex-encoded representation of the scalar. +func (a *scalar) String() string { + encoding := make([]byte, frBytesLen) + writeScalar(encoding, a) + return fmt.Sprintf("%#x", encoding) } -// Exponentiation in G1 (scalar point multiplication) -func (p *pointG1) scalarMultG1(res *pointG1, expo *scalar) { - C.ep_mult((*C.ep_st)(res), (*C.ep_st)(p), (*C.bn_st)(expo)) +// String returns a hex-encoded representation of the E2 point. 
+func (p *pointE2) String() string { + encoding := make([]byte, g2BytesLen) + writePointE2(encoding, p) + return fmt.Sprintf("%#x", encoding) } -// This function is for TEST only -// Exponentiation of g1 in G1 -func generatorScalarMultG1(res *pointG1, expo *scalar) { - C.ep_mult_gen_bench((*C.ep_st)(res), (*C.bn_st)(expo)) +// Scalar multiplication of a generic point `p` in E1 +func (p *pointE1) scalarMultE1(res *pointE1, expo *scalar) { + C.E1_mult((*C.E1)(res), (*C.E1)(p), (*C.Fr)(expo)) } -// This function is for TEST only -// Generic Exponentiation G1 -func genericScalarMultG1(res *pointG1, expo *scalar) { - C.ep_mult_generic_bench((*C.ep_st)(res), (*C.bn_st)(expo)) +// Scalar multiplication of generator g1 in G1 +func generatorScalarMultG1(res *pointE1, expo *scalar) { + C.G1_mult_gen((*C.E1)(res), (*C.Fr)(expo)) } -// Exponentiation of g2 in G2 -func generatorScalarMultG2(res *pointG2, expo *scalar) { - C.ep2_mult_gen((*C.ep2_st)(res), (*C.bn_st)(expo)) +// Scalar multiplication of generator g2 in G2 +// +// This often results in a public key that is used in +// multiple pairing computation. Therefore, convert the +// resulting point to affine coordinate to save pre-pairing +// conversions. +func generatorScalarMultG2(res *pointE2, expo *scalar) { + C.G2_mult_gen_to_affine((*C.E2)(res), (*C.Fr)(expo)) } -// comparison in Zr where r is the group order of G1/G2 +// comparison in Fr where r is the group order of G1/G2 // (both scalars should be reduced mod r) func (x *scalar) equals(other *scalar) bool { - return C.bn_cmp((*C.bn_st)(x), (*C.bn_st)(other)) == valid + return bool(C.Fr_is_equal((*C.Fr)(x), (*C.Fr)(other))) +} + +// comparison in E1 +func (p *pointE1) equals(other *pointE1) bool { + return bool(C.E1_is_equal((*C.E1)(p), (*C.E1)(other))) } -// comparison in G2 -func (p *pointG2) equals(other *pointG2) bool { - return C.ep2_cmp((*C.ep2_st)(p), (*C.ep2_st)(other)) == valid +// comparison in E2 +func (p *pointE2) equals(other *pointE2) bool { + return bool(C.E2_is_equal((*C.E2)(p), (*C.E2)(other))) } -// Comparison to zero in Zr. +// Comparison to zero in Fr. // Scalar must be already reduced modulo r func (x *scalar) isZero() bool { - return C.bn_is_zero((*C.bn_st)(x)) == 1 + return bool(C.Fr_is_zero((*C.Fr)(x))) } // Comparison to point at infinity in G2. -func (p *pointG2) isInfinity() bool { - return C.ep2_is_infty((*C.ep2_st)(p)) == 1 +func (p *pointE2) isInfinity() bool { + return bool(C.E2_is_infty((*C.E2)(p))) } -// returns a random number in Zr -func randZr(x *scalar) { - C.bn_randZr((*C.bn_st)(x)) +// generates a random element in F_r using input random source, +// and saves the random in `x`. +// returns `true` if generated element is zero. +func randFr(x *scalar, rand random.Rand) bool { + // use extra 128 bits to reduce the modular reduction bias + bytes := make([]byte, frBytesLen+securityBits/8) + rand.Read(bytes) + // modular reduction + return mapToFr(x, bytes) } -// returns a random non-zero number in Zr -func randZrStar(x *scalar) { - C.bn_randZr_star((*C.bn_st)(x)) +// generates a random element in F_r* using input random source, +// and saves the random in `x`. +func randFrStar(x *scalar, rand random.Rand) { + isZero := true + // extremely unlikely this loop runs more than once, + // but force the output to be non-zero instead of propagating an error. + for isZero { + isZero = randFr(x, rand) + } } -// mapToZr reads a scalar from a slice of bytes and maps it to Zr. -// The resulting scalar `k` satisfies 0 <= k < r. 
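randFr above draws securityBits/8 = 16 extra bytes beyond the 32-byte scalar size before reducing modulo the group order r, so the bias introduced by the modular reduction is bounded by roughly 2^-128; randFrStar simply retries on the practically impossible zero outcome. A self-contained Go sketch of the same idea using math/big, with illustrative names:

package main

import (
	"crypto/rand"
	"fmt"
	"math/big"
)

// BLS12-381 group order r (illustrative constant name).
var blsR, _ = new(big.Int).SetString(
	"73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)

// randomScalar sketches the approach of randFr above: draw 16 extra bytes
// (128 bits) beyond the 32-byte scalar size and reduce modulo r, keeping
// the modular-reduction bias around 2^-128.
func randomScalar() (*big.Int, error) {
	buf := make([]byte, 32+16)
	if _, err := rand.Read(buf); err != nil {
		return nil, err
	}
	k := new(big.Int).SetBytes(buf)
	return k.Mod(k, blsR), nil
}

// randomNonZeroScalar mirrors randFrStar: retry until the result is
// non-zero (the loop body is expected to run exactly once in practice).
func randomNonZeroScalar() (*big.Int, error) {
	for {
		k, err := randomScalar()
		if err != nil {
			return nil, err
		}
		if k.Sign() != 0 {
			return k, nil
		}
	}
}

func main() {
	k, _ := randomNonZeroScalar()
	fmt.Println(k.BitLen() <= 255)
}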
+// mapToFr reads a scalar from a slice of bytes and maps it to Fr using modular reduction. +// The resulting element `k` therefore satisfies 0 <= k < r. // It returns true if scalar is zero and false otherwise. -func mapToZr(x *scalar, src []byte) bool { - isZero := C.bn_map_to_Zr((*C.bn_st)(x), +func mapToFr(x *scalar, src []byte) bool { + isZero := C.map_bytes_to_Fr((*C.Fr)(x), (*C.uchar)(&src[0]), (C.int)(len(src))) - return isZero == valid + return bool(isZero) } -// writeScalar writes a G2 point in a slice of bytes +// writeScalar writes a scalar in a slice of bytes func writeScalar(dest []byte, x *scalar) { - C.bn_write_bin((*C.uchar)(&dest[0]), - (C.ulong)(prKeyLengthBLSBLS12381), - (*C.bn_st)(x), - ) + C.Fr_write_bytes((*C.uchar)(&dest[0]), (*C.Fr)(x)) } -// readScalar reads a scalar from a slice of bytes -func readScalar(x *scalar, src []byte) { - C.bn_read_bin((*C.bn_st)(x), - (*C.uchar)(&src[0]), - (C.ulong)(len(src)), - ) +// writePointE2 writes a G2 point in a slice of bytes +// The slice should be of size g2BytesLen and the serialization +// follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves +func writePointE2(dest []byte, a *pointE2) { + C.E2_write_bytes((*C.uchar)(&dest[0]), (*C.E2)(a)) } -// writePointG2 writes a G2 point in a slice of bytes -// The slice should be of size PubKeyLenBLSBLS12381 and the serialization will -// follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func writePointG2(dest []byte, a *pointG2) { - C.ep2_write_bin_compact((*C.uchar)(&dest[0]), - (*C.ep2_st)(a), - (C.int)(pubKeyLengthBLSBLS12381), - ) +// writePointE1 writes a G1 point in a slice of bytes +// The slice should be of size g1BytesLen and the serialization +// follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves +func writePointE1(dest []byte, a *pointE1) { + C.E1_write_bytes((*C.uchar)(&dest[0]), (*C.E1)(a)) } -// writePointG1 writes a G1 point in a slice of bytes -// The slice should be of size SignatureLenBLSBLS12381 and the serialization will -// follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func writePointG1(dest []byte, a *pointG1) { - C.ep_write_bin_compact((*C.uchar)(&dest[0]), - (*C.ep_st)(a), - (C.int)(signatureLengthBLSBLS12381), - ) +// read an Fr* element from a byte slice +// and stores it into a `scalar` type element. +func readScalarFrStar(a *scalar, src []byte) error { + read := C.Fr_star_read_bytes( + (*C.Fr)(a), + (*C.uchar)(&src[0]), + (C.int)(len(src))) + + switch read { + case valid: + return nil + case badEncoding: + return invalidInputsErrorf("input length must be %d, got %d", + frBytesLen, len(src)) + case badValue: + return invalidInputsErrorf("scalar is not in the correct range") + default: + return invalidInputsErrorf("reading the scalar failed") + } } -// readPointG2 reads a G2 point from a slice of bytes -// The slice is expected to be of size PubKeyLenBLSBLS12381 and the deserialization will -// follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func readPointG2(a *pointG2, src []byte) error { - switch C.ep2_read_bin_compact((*C.ep2_st)(a), +// readPointE2 reads a E2 point from a slice of bytes +// The slice is expected to be of size g2BytesLen and the deserialization +// follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves. +// No G2 membership check is performed. 
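The serialization pair writePointE2/readPointE2 below can be exercised with a round-trip in the style of TestReadWriteG1 further down. The sketch assumes it lives inside the crypto package (it uses unexported identifiers and the test file's imports: testing, crypto/rand and testify) and is not part of the actual test suite.

// testReadWriteE2 is an illustrative round-trip for the E2 serialization
// pair, mirroring what TestReadWriteG1 does for G1.
func testReadWriteE2(t *testing.T) {
	seed := make([]byte, frBytesLen)
	_, err := rand.Read(seed)
	require.NoError(t, err)

	var p, q pointE2
	unsafeMapToG2(&p, seed) // test-only mapping of bytes to a G2 point

	bytes := make([]byte, g2BytesLen)
	writePointE2(bytes, &p) // Zcash-format serialization
	require.NoError(t, readPointE2(&q, bytes)) // no G2 membership check
	assert.True(t, p.equals(&q))
}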
+func readPointE2(a *pointE2, src []byte) error { + read := C.E2_read_bytes((*C.E2)(a), (*C.uchar)(&src[0]), - (C.int)(len(src))) { + (C.int)(len(src))) + + switch read { case valid: return nil - case invalid: - return invalidInputsErrorf("input is not a G2 point") + case badEncoding, badValue: + return invalidInputsErrorf("input could not deserialize to an E2 point") + case pointNotOnCurve: + return invalidInputsErrorf("input is not a point on curve E2") default: - return errors.New("reading a G2 point failed") + return errors.New("reading E2 point failed") } } -// readPointG1 reads a G1 point from a slice of bytes -// The slice should be of size SignatureLenBLSBLS12381 and the deserialization will -// follow the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves -func readPointG1(a *pointG1, src []byte) error { - switch C.ep_read_bin_compact((*C.ep_st)(a), +// readPointE1 reads a E1 point from a slice of bytes +// The slice should be of size g1BytesLen and the deserialization +// follows the Zcash format specified in draft-irtf-cfrg-pairing-friendly-curves. +// No G1 membership check is performed. +func readPointE1(a *pointE1, src []byte) error { + read := C.E1_read_bytes((*C.E1)(a), (*C.uchar)(&src[0]), - (C.int)(len(src))) { + (C.int)(len(src))) + + switch read { case valid: return nil - case invalid: - return invalidInputsErrorf("input is not a G1 point") + case badEncoding, badValue: + return invalidInputsErrorf("input could not deserialize to a E1 point") + case pointNotOnCurve: + return invalidInputsErrorf("input is not a point on curve E1") default: - return errors.New("reading a G1 point failed") + return errors.New("reading E1 point failed") } } // checkMembershipG1 wraps a call to a subgroup check in G1 since cgo can't be used // in go test files. -func checkMembershipG1(pt *pointG1) int { - return int(C.check_membership_G1((*C.ep_st)(pt))) +func checkMembershipG1(pt *pointE1) bool { + return bool(C.E1_in_G1((*C.E1)(pt))) } // checkMembershipG2 wraps a call to a subgroup check in G2 since cgo can't be used // in go test files. -func checkMembershipG2(pt *pointG2) int { - return int(C.check_membership_G2((*C.ep2_st)(pt))) +func checkMembershipG2(pt *pointE2) bool { + return bool(C.E2_in_G2((*C.E2)(pt))) +} + +// This is only a TEST/DEBUG/BENCH function. +// It returns the hash-to-G1 point from a slice of 128 bytes +func mapToG1(data []byte) *pointE1 { + l := len(data) + var h pointE1 + if C.map_to_G1((*C.E1)(&h), (*C.uchar)(&data[0]), (C.int)(l)) != valid { + return nil + } + return &h } -// randPointG1 wraps a call to C since cgo can't be used in go test files. -// It generates a random point in G1 and stores it in input point. -func randPointG1(pt *pointG1) { - C.ep_rand_G1((*C.ep_st)(pt)) +// mapToG1 is a test function, it wraps a call to C since cgo can't be used in go test files. +// It maps input bytes to a point in G2 and stores it in input point. +// THIS IS NOT the kind of mapping function that is used in BLS signature. +func unsafeMapToG1(pt *pointE1, seed []byte) { + C.unsafe_map_bytes_to_G1((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) } -// randPointG1Complement wraps a call to C since cgo can't be used in go test files. -// It generates a random point in E1\G1 and stores it in input point. -func randPointG1Complement(pt *pointG1) { - C.ep_rand_G1complement((*C.ep_st)(pt)) +// unsafeMapToG1Complement is a test function, it wraps a call to C since cgo can't be used in go test files. 
+// It generates a random point in E2\G2 and stores it in input point. +func unsafeMapToG1Complement(pt *pointE1, seed []byte) { + C.unsafe_map_bytes_to_G1complement((*C.E1)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) } -// randPointG2 wraps a call to C since cgo can't be used in go test files. -// It generates a random point in G2 and stores it in input point. -func randPointG2(pt *pointG2) { - C.ep2_rand_G2((*C.ep2_st)(pt)) +// unsafeMapToG2 is a test function, it wraps a call to C since cgo can't be used in go test files. +// It maps input bytes to a point in G2 and stores it in input point. +// THIS IS NOT the kind of mapping function that is used in BLS signature. +func unsafeMapToG2(pt *pointE2, seed []byte) { + C.unsafe_map_bytes_to_G2((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) } -// randPointG1Complement wraps a call to C since cgo can't be used in go test files. +// unsafeMapToG2Complement is a test function, it wraps a call to C since cgo can't be used in go test files. // It generates a random point in E2\G2 and stores it in input point. -func randPointG2Complement(pt *pointG2) { - C.ep2_rand_G2complement((*C.ep2_st)(pt)) +func unsafeMapToG2Complement(pt *pointE2, seed []byte) { + C.unsafe_map_bytes_to_G2complement((*C.E2)(pt), (*C.uchar)(&seed[0]), (C.int)(len(seed))) } // This is only a TEST function. @@ -254,11 +331,21 @@ func hashToG1Bytes(data, dst []byte) []byte { (*C.uchar)(&dst[0]), (C.int)(len(dst))) // map the hash to G1 - var point pointG1 - C.map_to_G1((*C.ep_st)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) + var point pointE1 + if C.map_to_G1((*C.E1)(&point), (*C.uchar)(&hash[0]), (C.int)(len(hash))) != valid { + return nil + } // serialize the point - pointBytes := make([]byte, signatureLengthBLSBLS12381) - writePointG1(pointBytes, &point) + pointBytes := make([]byte, g1BytesLen) + writePointE1(pointBytes, &point) return pointBytes } + +func isG1Compressed() bool { + return g1BytesLen == fpBytesLen +} + +func isG2Compressed() bool { + return g2BytesLen == 2*fpBytesLen +} diff --git a/crypto/bls12381_utils.h b/crypto/bls12381_utils.h index 2c96503654c..923208ef3f3 100644 --- a/crypto/bls12381_utils.h +++ b/crypto/bls12381_utils.h @@ -1,143 +1,165 @@ -// +build relic - // this file contains utility functions for the curve BLS 12-381 -// these tools are shared by the BLS signature scheme, the BLS based threshold signature -// and the BLS distributed key generation protocols +// these tools are shared by the BLS signature scheme, the BLS based threshold +// signature, BLS-SPoCK and the BLS distributed key generation protocols -#ifndef _REL_MISC_INCLUDE_H -#define _REL_MISC_INCLUDE_H +#ifndef _BLS12_381_UTILS_H +#define _BLS12_381_UTILS_H -#include "relic.h" +#include "blst_include.h" +#include +#include typedef uint8_t byte; - -#define VALID RLC_OK -#define INVALID RLC_ERR -#define UNDEFINED (((VALID&1)^1) | ((INVALID&2)^2)) // different value than RLC_OK and RLC_ERR - -#define BITS_TO_BYTES(x) ((x+7)>>3) -#define BITS_TO_DIGITS(x) ((x+63)>>6) -#define BYTES_TO_DIGITS(x) ((x+7)>>3) -#define MIN(a,b) ((a)>(b)?(b):(a)) +typedef _Bool bool; // assuming cgo is using a modern enough compiler + +// minimum targeted security level +#define SEC_BITS 128 + +typedef enum { + VALID = 0, + INVALID, + BAD_ENCODING, + BAD_VALUE, + POINT_NOT_ON_CURVE, + POINT_NOT_IN_GROUP, + UNDEFINED, +} ERROR; + +#define BITS_TO_BYTES(x) ((x + 7) >> 3) +#define BITS_TO_LIMBS(x) ((x + 63) >> 6) +#define BYTES_TO_LIMBS(x) ((x + 7) >> 3) +#define LIMBS_TO_BYTES(x) ((x) << 3) 
+#define MIN(a, b) ((a) > (b) ? (b) : (a)) // Fields and Group serialization lengths -#define SEC_BITS 128 -#define Fp_BITS 381 -#define Fr_BITS 255 -#define Fp_BYTES BITS_TO_BYTES(Fp_BITS) -#define Fp2_BYTES (2*Fp_BYTES) -#define Fp_DIGITS BITS_TO_DIGITS(Fp_BITS) -#define Fr_BYTES BITS_TO_BYTES(Fr_BITS) +#define Fp_BITS 381 +#define Fp2_BYTES (2 * Fp_BYTES) +#define Fp_LIMBS BITS_TO_LIMBS(Fp_BITS) +#define Fp_BYTES LIMBS_TO_BYTES(Fp_LIMBS) // BLST implements Fp as a limb array +#define Fr_BITS 255 +#define Fr_LIMBS BITS_TO_LIMBS(Fr_BITS) +#define Fr_BYTES LIMBS_TO_BYTES(Fr_LIMBS) // BLST implements Fr as a limb array -#define G1_BYTES (2*Fp_BYTES) -#define G2_BYTES (2*Fp2_BYTES) +#define G1_BYTES (2 * Fp_BYTES) +#define G2_BYTES (2 * Fp2_BYTES) // Compressed and uncompressed points -#define COMPRESSED 1 -#define UNCOMPRESSED 0 -#define G1_SERIALIZATION COMPRESSED -#define G2_SERIALIZATION COMPRESSED - -// Subgroup membership check method -#define EXP_ORDER 0 -#define BOWE 1 -#define MEMBERSHIP_CHECK_G1 BOWE -#define MEMBERSHIP_CHECK_G2 EXP_ORDER - - -// constants used in the optimized SWU hash to curve -#if (hashToPoint == LOCAL_SSWU) - #define ELLP_Nx_LEN 12 - #define ELLP_Dx_LEN 10 - #define ELLP_Ny_LEN 16 - #define ELLP_Dy_LEN 15 -#endif - - -// Structure of precomputed data -typedef struct prec_ { - #if (hashToPoint == LOCAL_SSWU) - // constants needed in optimized SSWU - bn_st p_3div4; - fp_st sqrt_z; - // related hardcoded constants for faster access, - // where a1 is the coefficient of isogenous curve E1 - fp_st minus_a1; - fp_st a1z; - // coefficients of the isogeny map - fp_st iso_Nx[ELLP_Nx_LEN]; - fp_st iso_Ny[ELLP_Ny_LEN]; - #endif - #if (MEMBERSHIP_CHECK_G1 == BOWE) - bn_st beta; - bn_st z2_1_by3; - #endif - // other field-related constants - bn_st p_1div2; - fp_t r; // Montgomery multiplication constant -} prec_st; - -// BLS based SPoCK -int bls_spock_verify(const ep2_t, const byte*, const ep2_t, const byte*); - -// hash to curve functions (functions in bls12381_hashtocurve.c) -void map_to_G1(ep_t, const byte*, const int); - -// Utility functions -int get_valid(); -int get_invalid(); -void bn_new_wrapper(bn_t a); - -ctx_t* relic_init_BLS12_381(); -prec_st* init_precomputed_data_BLS12_381(); -void precomputed_data_set(const prec_st* p); -void seed_relic(byte*, int); - -int ep_read_bin_compact(ep_t, const byte *, const int); -void ep_write_bin_compact(byte *, const ep_t, const int); -int ep2_read_bin_compact(ep2_t, const byte *, const int); -void ep2_write_bin_compact(byte *, const ep2_t, const int); -int bn_read_Zr_bin(bn_t, const uint8_t *, int ); - -void ep_mult_gen_bench(ep_t, const bn_t); -void ep_mult_generic_bench(ep_t, const bn_t); -void ep_mult(ep_t, const ep_t, const bn_t); -void ep2_mult_gen(ep2_t, const bn_t); - -void bn_randZr(bn_t); -void bn_randZr_star(bn_t); -int bn_map_to_Zr(bn_t, const uint8_t*, int); -void bn_map_to_Zr_star(bn_t, const uint8_t*, int); - -void bn_sum_vector(bn_t, const bn_st*, const int); -void ep_sum_vector(ep_t, ep_st*, const int); -void ep2_sum_vector(ep2_t, ep2_st*, const int); -int ep_sum_vector_byte(byte*, const byte*, const int); -void ep2_subtract_vector(ep2_t res, ep2_t x, ep2_st* y, const int len); - -// membership checks -int check_membership_G1(const ep_t); -int check_membership_G2(const ep2_t); -int check_membership_Zr_star(const bn_t); - -int simple_subgroup_check_G1(const ep_t); -int simple_subgroup_check_G2(const ep2_t); -void ep_rand_G1(ep_t); -void ep_rand_G1complement( ep_t); -void ep2_rand_G2(ep2_t); -void 
ep2_rand_G2complement( ep2_t); -#if (MEMBERSHIP_CHECK_G1 == BOWE) -int bowe_subgroup_check_G1(const ep_t); -#endif +#define UNCOMPRESSED 0 +#define COMPRESSED (UNCOMPRESSED ^ 1) +#define G1_SERIALIZATION (COMPRESSED) +#define G2_SERIALIZATION (COMPRESSED) +#define G1_SER_BYTES \ + (G1_SERIALIZATION == UNCOMPRESSED ? G1_BYTES : (G1_BYTES / 2)) +#define G2_SER_BYTES \ + (G2_SERIALIZATION == UNCOMPRESSED ? G2_BYTES : (G2_BYTES / 2)) + +// init-related functions +void types_sanity(void); + +// Fr utilities +extern const Fr BLS12_381_rR; +bool Fr_is_zero(const Fr *a); +bool Fr_is_equal(const Fr *a, const Fr *b); +void Fr_set_limb(Fr *, const limb_t); +void Fr_copy(Fr *, const Fr *); +void Fr_set_zero(Fr *); +void Fr_add(Fr *res, const Fr *a, const Fr *b); +void Fr_sub(Fr *res, const Fr *a, const Fr *b); +void Fr_neg(Fr *res, const Fr *a); +void Fr_sum_vector(Fr *, const Fr x[], const int); +void Fr_mul_montg(Fr *res, const Fr *a, const Fr *b); +void Fr_squ_montg(Fr *res, const Fr *a); +void Fr_to_montg(Fr *res, const Fr *a); +void Fr_from_montg(Fr *res, const Fr *a); +void Fr_inv_montg_eucl(Fr *res, const Fr *a); +ERROR Fr_read_bytes(Fr *a, const byte *bin, int len); +ERROR Fr_star_read_bytes(Fr *a, const byte *bin, int len); +void Fr_write_bytes(byte *bin, const Fr *a); +bool map_bytes_to_Fr(Fr *, const byte *, int); + +// Fp utilities +void Fp_mul_montg(Fp *, const Fp *, const Fp *); +void Fp_squ_montg(Fp *, const Fp *); + +// E1 and G1 utilities +void E1_copy(E1 *, const E1 *); +bool E1_is_equal(const E1 *, const E1 *); +void E1_set_infty(E1 *); +bool E1_is_infty(const E1 *); +void E1_to_affine(E1 *, const E1 *); +bool E1_affine_on_curve(const E1 *); +bool E1_in_G1(const E1 *); +void E1_mult(E1 *, const E1 *, const Fr *); +void E1_add(E1 *, const E1 *, const E1 *); +void E1_neg(E1 *, const E1 *); +void E1_sum_vector(E1 *, const E1 *, const int); +int E1_sum_vector_byte(byte *, const byte *, const int); +void G1_mult_gen(E1 *, const Fr *); +ERROR E1_read_bytes(E1 *, const byte *, const int); +void E1_write_bytes(byte *, const E1 *); +void unsafe_map_bytes_to_G1(E1 *, const byte *, int); +void unsafe_map_bytes_to_G1complement(E1 *, const byte *, int); + +#define MAP_TO_G1_INPUT_LEN (2 * (Fp_BYTES + SEC_BITS / 8)) +int map_to_G1(E1 *, const byte *, const int); + +// E2 and G2 utilities +void E2_set_infty(E2 *p); +bool E2_is_infty(const E2 *); +bool E2_affine_on_curve(const E2 *); +bool E2_is_equal(const E2 *, const E2 *); +void E2_copy(E2 *, const E2 *); +void E2_to_affine(E2 *, const E2 *); +ERROR E2_read_bytes(E2 *, const byte *, const int); +void E2_write_bytes(byte *, const E2 *); +void G2_mult_gen(E2 *, const Fr *); +void G2_mult_gen_to_affine(E2 *, const Fr *); +void E2_mult(E2 *, const E2 *, const Fr *); +void E2_mult_small_expo(E2 *, const E2 *, const byte); +void E2_add(E2 *res, const E2 *a, const E2 *b); +void E2_neg(E2 *, const E2 *); +void E2_sum_vector(E2 *, const E2 *, const int); +void E2_sum_vector_to_affine(E2 *, const E2 *, const int); +void E2_subtract_vector(E2 *res, const E2 *x, const E2 *y, const int len); +bool E2_in_G2(const E2 *); +void unsafe_map_bytes_to_G2(E2 *, const byte *, int); +void unsafe_map_bytes_to_G2complement(E2 *, const byte *, int); + +// pairing and Fp12 +bool Fp12_is_one(Fp12 *); +void Fp12_set_one(Fp12 *); +void Fp12_multi_pairing(Fp12 *, const E1 *, const E2 *, const int); // utility testing function -void xmd_sha256(uint8_t *, int, uint8_t *, int, uint8_t *, int); +void xmd_sha256(byte *, int, byte *, int, byte *, int); // Debugging related 
functions -void bytes_print_(char*, byte*, int); -void fp_print_(char*, fp_t); -void bn_print_(char*, bn_st*); -void ep_print_(char*, ep_st*); -void ep2_print_(char*, ep2_st*); - -#endif \ No newline at end of file +// DEBUG can be enabled directly from the Go command: CC="clang -DDEBUG" go test +#ifdef DEBUG +#include +void bytes_print_(char *, byte *, int); +void Fr_print_(char *, Fr *); +void Fp_print_(char *, const Fp *); +void Fp2_print_(char *, const Fp2 *); +void Fp12_print_(char *, const Fp12 *); +void E1_print_(char *, const E1 *, const int); +void E2_print_(char *, const E2 *, const int); + +#endif /* DEBUG */ + +// memory sanitization disabler +#define NO_MSAN +#ifdef MSAN +/* add NO_MSAN to a function defintion to disable MSAN in that function ( void + * NO_MSAN f(..) {} ) */ +#if defined(__has_feature) +#if __has_feature(memory_sanitizer) +// disable memory sanitization in this function because of a +// use-of-uninitialized-value false positive. +#undef NO_MSAN +#define NO_MSAN __attribute__((no_sanitize("memory"))) +#endif /* __has_feature(memory_sanitizer) */ +#endif /* __has_feature*/ +#endif /*MSAN*/ + +#endif /* BLS12_381_UTILS */ \ No newline at end of file diff --git a/crypto/bls12381_utils_test.go b/crypto/bls12381_utils_test.go index f8278414e4a..a528e240363 100644 --- a/crypto/bls12381_utils_test.go +++ b/crypto/bls12381_utils_test.go @@ -1,10 +1,7 @@ -//go:build relic -// +build relic - package crypto import ( - crand "crypto/rand" + "crypto/rand" "encoding/hex" "testing" @@ -12,85 +9,96 @@ import ( "github.com/stretchr/testify/require" ) -func TestDeterministicKeyGen(t *testing.T) { - // 2 keys generated with the same seed should be equal - seed := make([]byte, KeyGenSeedMinLen) - n, err := crand.Read(seed) - require.Equal(t, n, KeyGenSeedMinLen) +// Sanity check of G1 and G2 scalar multiplication +func TestScalarMultBLS12381(t *testing.T) { + expoBytes, err := hex.DecodeString("444465cb6cc2dba9474e6beeb6a9013fbf1260d073429fb14a31e63e89129390") require.NoError(t, err) - sk1, err := GeneratePrivateKey(BLSBLS12381, seed) - require.Nil(t, err) - sk2, err := GeneratePrivateKey(BLSBLS12381, seed) - require.Nil(t, err) - assert.True(t, sk1.Equals(sk2), "private keys should be equal") -} -// test the deterministicity of the relic PRG (used by the DKG polynomials) -func TestPRGseeding(t *testing.T) { - blsInstance.reInit() - // 2 scalars generated with the same seed should be equal - seed := make([]byte, KeyGenSeedMinLen) - n, err := crand.Read(seed) - require.Equal(t, n, KeyGenSeedMinLen) - require.NoError(t, err) - // 1st scalar (wrapped in a private key) - err = seedRelic(seed) - require.Nil(t, err) - var sk1 prKeyBLSBLS12381 - randZr(&sk1.scalar) - // 2nd scalar (wrapped in a private key) - err = seedRelic(seed) - require.Nil(t, err) - var sk2 prKeyBLSBLS12381 - randZr(&sk2.scalar) - // compare the 2 scalars (by comparing the private keys) - assert.True(t, sk1.Equals(&sk2), "private keys should be equal") + var expo scalar + isZero := mapToFr(&expo, expoBytes) + require.False(t, isZero) + + // G1 generator multiplication + // Note that generator and random point multiplications + // are implemented with the same algorithm + t.Run("G1", func(t *testing.T) { + if !isG1Compressed() { + t.Skip() + } + var p pointE1 + generatorScalarMultG1(&p, &expo) + expected, err := hex.DecodeString("96484ca50719f5d2533047960878b6bae8289646c0f00a942a1e6992be9981a9e0c7a51e9918f9b19d178cf04a8018a4") + require.NoError(t, err) + pBytes := make([]byte, g1BytesLen) + writePointE1(pBytes, &p) + 
assert.Equal(t, pBytes, expected) + }) + + // G2 generator multiplication + // Note that generator and random point multiplications + // are implemented with the same algorithm + t.Run("G2", func(t *testing.T) { + if !isG2Compressed() { + t.Skip() + } + var p pointE2 + generatorScalarMultG2(&p, &expo) + expected, err := hex.DecodeString("b35f5043f166848805b98da62dcb9c5d2f25e497bd0d9c461d4a00d19e4e67cc1e813de3c99479d5a2c62fb754fd7df40c4fd60c46834c8ae665343a3ff7dc3cc929de34ad62b7b55974f4e3fd20990d3e564b96e4d33de87716052d58cf823e") + require.NoError(t, err) + pBytes := make([]byte, g2BytesLen) + writePointE2(pBytes, &p) + assert.Equal(t, pBytes, expected) + }) } // G1 and G2 scalar multiplication -func BenchmarkScalarMultG1G2(b *testing.B) { - blsInstance.reInit() +func BenchmarkScalarMult(b *testing.B) { seed := make([]byte, securityBits/8) - _, err := crand.Read(seed) + _, err := rand.Read(seed) require.NoError(b, err) - _ = seedRelic(seed) + var expo scalar - randZr(&expo) + _ = mapToFr(&expo, seed) // G1 generator multiplication + // Note that generator and random point multiplications + // are implemented with the same algorithm + var res pointE1 b.Run("G1 gen", func(b *testing.B) { - var res pointG1 b.ResetTimer() for i := 0; i < b.N; i++ { generatorScalarMultG1(&res, &expo) } - b.StopTimer() }) - // G1 base point multiplication - b.Run("G1 generic", func(b *testing.B) { - var res pointG1 + // E1 random point multiplication + // Note that generator and random point multiplications + // are implemented with the same algorithm + b.Run("E1 rand", func(b *testing.B) { + var res pointE1 b.ResetTimer() for i := 0; i < b.N; i++ { - genericScalarMultG1(&res, &expo) + res.scalarMultE1(&res, &expo) } - b.StopTimer() }) - // G2 base point multiplication + // G2 generator multiplication + // Note that generator and random point multiplications + // are implemented with the same algorithm b.Run("G2 gen", func(b *testing.B) { - var res pointG2 + var res pointE2 b.ResetTimer() for i := 0; i < b.N; i++ { generatorScalarMultG2(&res, &expo) } - b.StopTimer() }) } // Sanity-check of the map-to-G1 with regards to the IETF draft hash-to-curve func TestMapToG1(t *testing.T) { - + if !isG1Compressed() { + t.Skip() + } // test vectors from https://datatracker.ietf.org/doc/html/draft-irtf-cfrg-hash-to-curve-14#appendix-J.9.1 dst := []byte("QUUX-V01-CS02-with-BLS12381G1_XMD:SHA-256_SSWU_RO_") @@ -112,6 +120,7 @@ func TestMapToG1(t *testing.T) { for i, msg := range msgs { pointBytes := hashToG1Bytes(msg, dst) + require.NotNil(t, pointBytes) expectedPointBytes, err := hex.DecodeString(expectedPointString[i]) require.NoError(t, err) @@ -123,69 +132,142 @@ func TestMapToG1(t *testing.T) { // Hashing to G1 bench func BenchmarkMapToG1(b *testing.B) { - blsInstance.reInit() input := make([]byte, expandMsgOutput) for i := 0; i < len(input); i++ { input[i] = byte(i) } b.ResetTimer() + var p *pointE1 for i := 0; i < b.N; i++ { - mapToG1(input) + p = mapToG1(input) } - b.StopTimer() + require.NotNil(b, p) } // test subgroup membership check in G1 and G2 func TestSubgroupCheck(t *testing.T) { - blsInstance.reInit() - // seed Relic PRG - seed := make([]byte, securityBits/8) - _, err := crand.Read(seed) + prg := getPRG(t) + seed := make([]byte, 192) + _, err := prg.Read(seed) require.NoError(t, err) - _ = seedRelic(seed) t.Run("G1", func(t *testing.T) { - var p pointG1 - randPointG1(&p) // point in G1 - res := checkMembershipG1(&p) - assert.Equal(t, res, int(valid)) - randPointG1Complement(&p) // point in E1\G1 - res = 
checkMembershipG1(&p) - assert.Equal(t, res, int(invalid)) + var p pointE1 + unsafeMapToG1(&p, seed) // point in G1 + assert.True(t, checkMembershipG1(&p)) + + unsafeMapToG1Complement(&p, seed) // point in E2\G2 + assert.False(t, checkMembershipG1(&p)) }) t.Run("G2", func(t *testing.T) { - var p pointG2 - randPointG2(&p) // point in G2 - res := checkMembershipG2(&p) - assert.Equal(t, res, int(valid)) - randPointG2Complement(&p) // point in E2\G2 - res = checkMembershipG2(&p) - assert.Equal(t, res, int(invalid)) + var p pointE2 + unsafeMapToG2(&p, seed) // point in G2 + assert.True(t, checkMembershipG2(&p)) + + unsafeMapToG2Complement(&p, seed) // point in E2\G2 + assert.False(t, checkMembershipG2(&p)) }) } // subgroup membership check bench func BenchmarkSubgroupCheck(b *testing.B) { - blsInstance.reInit() + seed := make([]byte, g2BytesLen) + _, err := rand.Read(seed) + require.NoError(b, err) b.Run("G1", func(b *testing.B) { - var p pointG1 - randPointG1(&p) + var p pointE1 + unsafeMapToG1(&p, seed) // point in G1 b.ResetTimer() for i := 0; i < b.N; i++ { _ = checkMembershipG1(&p) // G1 } - b.StopTimer() }) b.Run("G2", func(b *testing.B) { - var p pointG2 - randPointG2(&p) + var p pointE2 + unsafeMapToG2(&p, seed) // point in G2 b.ResetTimer() for i := 0; i < b.N; i++ { _ = checkMembershipG2(&p) // G2 } - b.StopTimer() }) } + +// specific test of G1 points Encode and decode (BLS signature since the library is set for min_sig). +// G2 points read and write are implicitly tested by public keys Encode/Decode. +func TestReadWriteG1(t *testing.T) { + prg := getPRG(t) + seed := make([]byte, frBytesLen) + bytes := make([]byte, g1BytesLen) + // generate a random G1 point, encode it, decode it, + // and compare it the original point + t.Run("random points", func(t *testing.T) { + iterations := 50 + for i := 0; i < iterations; i++ { + var p, q pointE1 + _, err := prg.Read(seed) + unsafeMapToG1(&p, seed) + require.NoError(t, err) + writePointE1(bytes, &p) + err = readPointE1(&q, bytes) + require.NoError(t, err) + assert.True(t, p.equals(&q)) + } + }) + + t.Run("infinity", func(t *testing.T) { + var p, q pointE1 + seed := make([]byte, frBytesLen) + unsafeMapToG1(&p, seed) // this results in the infinity point given how `unsafeMapToG1` works with an empty scalar + writePointE1(bytes, &p) + require.True(t, IsBLSSignatureIdentity(bytes)) // sanity check + err := readPointE1(&q, bytes) + require.NoError(t, err) + assert.True(t, p.equals(&q)) + }) +} + +// test some edge cases of MapToFr to validate modular reduction and endianness: +// - inputs `0` and curve order `r` +// - inputs `1` and `r+1` +func TestMapToFr(t *testing.T) { + var x scalar + offset := 10 + bytes := make([]byte, frBytesLen+offset) + expectedEncoding := make([]byte, frBytesLen) + // zero bytes + isZero := mapToFr(&x, bytes) + assert.True(t, isZero) + assert.True(t, x.isZero()) + assert.Equal(t, expectedEncoding, newPrKeyBLSBLS12381(&x).Encode()) + // curve order bytes + copy(bytes[offset:], BLS12381Order) + isZero = mapToFr(&x, bytes) + assert.True(t, isZero) + assert.True(t, x.isZero()) + assert.Equal(t, expectedEncoding, newPrKeyBLSBLS12381(&x).Encode()) + // curve order + 1 + g1, err := hex.DecodeString("824aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb813e02b6052719f607dacd3a088274f65596bd0d09920b61ab5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e") + require.NoError(t, err) + bytes[len(bytes)-1] += 1 + isZero = mapToFr(&x, bytes) + assert.False(t, isZero) + assert.False(t, x.isZero()) + 
expectedEncoding[frBytesLen-1] = 1 + sk := newPrKeyBLSBLS12381(&x) + assert.Equal(t, expectedEncoding, sk.Encode()) + // check scalar is equal to "1" in the lower layer (scalar multiplication) + assert.Equal(t, sk.PublicKey().Encode(), g1, "scalar should be 1, check endianness in the C layer") + // 1 + copy(bytes[offset:], expectedEncoding) + isZero = mapToFr(&x, bytes) + assert.False(t, isZero) + assert.False(t, x.isZero()) + expectedEncoding[frBytesLen-1] = 1 + sk = newPrKeyBLSBLS12381(&x) + assert.Equal(t, expectedEncoding, sk.Encode()) + // check scalar is equal to "1" in the lower layer (scalar multiplication) + assert.Equal(t, sk.PublicKey().Encode(), g1, "scalar should be 1, check endianness in the C layer") +} diff --git a/crypto/bls_core.c b/crypto/bls_core.c index 32b56a5d03d..65f510f5987 100644 --- a/crypto/bls_core.c +++ b/crypto/bls_core.c @@ -1,541 +1,506 @@ -// +build relic - #include "bls_include.h" // this file is about the core functions required by the BLS signature scheme -// The functions are tested for ALLOC=AUTO (not for ALLOC=DYNAMIC) - -// functions to export macros to the Go layer (because cgo does not import macros) -int get_signature_len() { - return SIGNATURE_LEN; +// Compute a BLS signature from a G1 point (not checked) and writes it in `out`. +// `out` must be allocated properly with `G1_SER_BYTES` bytes. +static void bls_sign_E1(byte *out, const Fr *sk, const E1 *h) { + // s = h^sk + E1 s; + E1_mult(&s, h, sk); + E1_write_bytes(out, &s); } -int get_pk_len() { - return PK_LEN; +// Computes a BLS signature from a hash and writes it in `out`. +// `hash` represents the hashed message with length `hash_len` equal to +// `MAP_TO_G1_INPUT_LEN`. +// `out` must be allocated properly with `G1_SER_BYTES` bytes. +int bls_sign(byte *out, const Fr *sk, const byte *hash, const int hash_len) { + // hash to G1 + E1 h; + if (map_to_G1(&h, hash, hash_len) != VALID) { + return INVALID; + } + // s = h^sk + bls_sign_E1(out, sk, &h); + return VALID; } -int get_sk_len() { - return SK_LEN; -} - -// checks an input scalar a satisfies 0 < a < r -// where (r) is the order of G1/G2 -int check_membership_Zr_star(const bn_t a){ - if (bn_cmp(a, &core_get()->ep_r) != RLC_LT || bn_cmp_dig(a, 0) != RLC_GT) { - return INVALID; - } - return VALID; -} - -// Checks if input point p is in the subgroup G1. -// The function assumes the input is known to be on the curve E1. -int check_membership_G1(const ep_t p){ -#if MEMBERSHIP_CHECK - #if MEMBERSHIP_CHECK_G1 == EXP_ORDER - return simple_subgroup_check_G1(p); - #elif MEMBERSHIP_CHECK_G1 == BOWE - // section 3.2 from https://eprint.iacr.org/2019/814.pdf - return bowe_subgroup_check_G1(p); - #else - return UNDEFINED; - #endif -#endif - return VALID; -} - -// checks if input point s is on the curve E2 -// and is in the subgroup G2. -// -// membership check in G2 is using a scalar multiplication by the group order. 
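bls_sign_E1 above computes s = sk.H(m), and the verifier below (bls_verify_E1) accepts exactly when e(s, -g2) * e(H(m), pk) equals the identity of GT. The toy Go sketch below replaces the pairing by exponent arithmetic modulo a small prime to show why bilinearity makes that product collapse to the identity for a well-formed signature; it performs no curve or pairing arithmetic.

package main

import (
	"fmt"
	"math/big"
)

// Toy model of the BLS verification equation: points are represented by
// their discrete logs, so e(a.P1, b.P2) is modeled as (a*b) mod r and the
// identity of the target group is 0. This only illustrates the algebra.
func main() {
	r := big.NewInt(101) // toy group order
	sk := big.NewInt(37) // secret key
	h := big.NewInt(64)  // discrete log of the hashed message H(m)

	// signing: s = sk.H(m)   (bls_sign_E1)
	s := new(big.Int).Mod(new(big.Int).Mul(sk, h), r)
	// public key: pk = sk.g2, represented by the exponent sk
	pk := new(big.Int).Set(sk)

	// verification: e(s, -g2) * e(H(m), pk) == 1
	// in the exponent model: (-1)*s + h*pk == 0 (mod r)
	left := new(big.Int).Neg(s)
	left.Add(left, new(big.Int).Mul(h, pk))
	left.Mod(left, r)
	fmt.Println("pairing product is identity:", left.Sign() == 0)
}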
-// TODO: switch to the faster Bowe check -int check_membership_G2(const ep2_t p){ -#if MEMBERSHIP_CHECK - // check p is on curve - if (!ep2_on_curve((ep2_st*)p)) - return INVALID; - // check p is in G2 - #if MEMBERSHIP_CHECK_G2 == EXP_ORDER - return simple_subgroup_check_G2(p); - #elif MEMBERSHIP_CHECK_G2 == BOWE - // TODO: implement Bowe's check - return UNDEFINED; - #else - return UNDEFINED; - #endif -#endif - return VALID; -} - -// Computes a BLS signature from a G1 point -static void bls_sign_ep(byte* s, const bn_t sk, const ep_t h) { - ep_t p; - ep_new(p); - // s = h^sk - ep_mult(p, h, sk); - ep_write_bin_compact(s, p, SIGNATURE_LEN); - ep_free(p); -} - -// Computes a BLS signature from a hash -void bls_sign(byte* s, const bn_t sk, const byte* data, const int len) { - ep_t h; - ep_new(h); - // hash to G1 - map_to_G1(h, data, len); - // s = h^sk - bls_sign_ep(s, sk, h); - ep_free(h); -} +extern const E2 *BLS12_381_minus_g2; // Verifies a BLS signature (G1 point) against a public key (G2 point) -// and a message data. -// The signature and public key are assumed to be in G1 and G2 respectively. This -// function only checks the pairing equality. -static int bls_verify_ep(const ep2_t pk, const ep_t s, const byte* data, const int len) { - - ep_t elemsG1[2]; - ep2_t elemsG2[2]; - - // elemsG1[0] = s - ep_new(elemsG1[0]); - ep_copy(elemsG1[0], (ep_st*)s); - - // elemsG1[1] = h - ep_new(elemsG1[1]); - // hash to G1 - map_to_G1(elemsG1[1], data, len); - - // elemsG2[1] = pk - ep2_new(elemsG2[1]); - ep2_copy(elemsG2[1], (ep2_st*)pk); - ep2_new(&elemsG2[0]); - - int ret = UNDEFINED; - -#if DOUBLE_PAIRING - // elemsG2[0] = -g2 - ep2_neg(elemsG2[0], core_get()->ep2_g); // could be hardcoded - - fp12_t pair; - fp12_new(&pair); - // double pairing with Optimal Ate - pp_map_sim_oatep_k12(pair, (ep_t*)(elemsG1) , (ep2_t*)(elemsG2), 2); - - // compare the result to 1 - int res = fp12_cmp_dig(pair, 1); - -#elif SINGLE_PAIRING - fp12_t pair1, pair2; - fp12_new(&pair1); fp12_new(&pair2); - pp_map_oatep_k12(pair1, elemsG1[0], core_get()->ep2_g); - pp_map_oatep_k12(pair2, elemsG1[1], elemsG2[1]); - - int res = fp12_cmp(pair1, pair2); -#endif - if (core_get()->code == RLC_OK) { - if (res == RLC_EQ) { - ret = VALID; - goto out; - } else { - ret = INVALID; - goto out; - } - } - -out: - ep_free(elemsG1[0]); - ep_free(elemsG1[1]); - ep2_free(elemsG2[0]); - ep2_free(elemsG2[1]); - - return ret; +// and a message hash `h` (G1 point). +// Hash, signature and public key are assumed to be in G1, G1 and G2 +// respectively. +// This function only checks the pairing equality. +static int bls_verify_E1(const E2 *pk, const E1 *s, const E1 *h) { + E1 elemsG1[2]; + E2 elemsG2[2]; + + // elemsG1[0] = s, elemsG1[1] = h + E1_copy(&elemsG1[0], s); + E1_copy(&elemsG1[1], h); + + // elemsG2[0] = -g2, elemsG2[1] = pk + E2_copy(&elemsG2[0], BLS12_381_minus_g2); + E2_copy(&elemsG2[1], pk); + + // double pairing + Fp12 e; + Fp12_multi_pairing(&e, elemsG1, elemsG2, 2); + if (Fp12_is_one(&e)) { + return VALID; + } + return INVALID; } - // Verifies the validity of an aggregated BLS signature under distinct messages. // -// Each message is mapped to a set of public keys, so that the verification equation is -// optimized to compute one pairing per message. +// Each message is mapped to a set of public keys, so that the verification +// equation is optimized to compute one pairing per message. // - sig is the signature. 
// - nb_hashes is the number of the messages (hashes) in the map -// - hashes is pointer to all flattened hashes in order where the hash at index i has a byte length len_hashes[i], -// is mapped to pks_per_hash[i] public keys. +// - hashes is pointer to all flattened hashes in order where the hash at index +// i has a byte length len_hashes[i], +// is mapped to pks_per_hash[i] public keys. // - the keys are flattened in pks in the same hashes order. // // membership check of the signature in G1 is verified in this function // membership check of pks in G2 is not verified in this function -// the membership check is separated to allow optimizing multiple verifications using the same pks -int bls_verifyPerDistinctMessage(const byte* sig, - const int nb_hashes, const byte* hashes, const uint32_t* len_hashes, - const uint32_t* pks_per_hash, const ep2_st* pks) { - - int ret = UNDEFINED; // return value - - ep_t* elemsG1 = (ep_t*)malloc((nb_hashes + 1) * sizeof(ep_t)); - if (!elemsG1) goto outG1; - ep2_t* elemsG2 = (ep2_t*)malloc((nb_hashes + 1) * sizeof(ep2_t)); - if (!elemsG2) goto outG2; - - for (int i=0; i < nb_hashes+1; i++) { - ep_new(elemsG1[i]); - ep2_new(elemsG2[i]); - } - - // elemsG1[0] = sig - ret = ep_read_bin_compact(elemsG1[0], sig, SIGNATURE_LEN); - if (ret != RLC_OK) goto out; - - // check s is in G1 - ret = check_membership_G1(elemsG1[0]); // only enabled if MEMBERSHIP_CHECK==1 - if (ret != VALID) goto out; - - // elemsG2[0] = -g2 - ep2_neg(elemsG2[0], core_get()->ep2_g); // could be hardcoded - - // map all hashes to G1 - int offset = 0; - for (int i=1; i < nb_hashes+1; i++) { - // elemsG1[i] = h - // hash to G1 - map_to_G1(elemsG1[i], &hashes[offset], len_hashes[i-1]); - offset += len_hashes[i-1]; - } - - // aggregate public keys mapping to the same hash - offset = 0; - for (int i=1; i < nb_hashes+1; i++) { - // elemsG2[i] = agg_pk[i] - ep2_sum_vector(elemsG2[i], (ep2_st*) &pks[offset] , pks_per_hash[i-1]); - offset += pks_per_hash[i-1]; - } - - fp12_t pair; - fp12_new(&pair); - // double pairing with Optimal Ate - pp_map_sim_oatep_k12(pair, (ep_t*)(elemsG1) , (ep2_t*)(elemsG2), nb_hashes+1); - - // compare the result to 1 - int cmp_res = fp12_cmp_dig(pair, 1); - - if (core_get()->code == RLC_OK) { - if (cmp_res == RLC_EQ) ret = VALID; - else ret = INVALID; - } else { - ret = UNDEFINED; - } +// the membership check is separated to allow optimizing multiple verifications +// using the same pks +int bls_verifyPerDistinctMessage(const byte *sig, const int nb_hashes, + const byte *hashes, const uint32_t *len_hashes, + const uint32_t *pks_per_hash, const E2 *pks) { + + int ret = UNDEFINED; // return value + + E1 *elemsG1 = (E1 *)malloc((nb_hashes + 1) * sizeof(E1)); + if (!elemsG1) + goto outG1; + E2 *elemsG2 = (E2 *)malloc((nb_hashes + 1) * sizeof(E2)); + if (!elemsG2) + goto outG2; + + // elemsG1[0] = sig + if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != VALID) { + ret = INVALID; + goto out; + } + + // check signature is in G1 + if (!E1_in_G1(&elemsG1[0])) { + ret = INVALID; + goto out; + } + + // elemsG2[0] = -g2 + E2_copy(&elemsG2[0], BLS12_381_minus_g2); + + // map all hashes to G1 + int offset = 0; + for (int i = 1; i < nb_hashes + 1; i++) { + // elemsG1[i] = h + // hash to G1 + map_to_G1(&elemsG1[i], &hashes[offset], len_hashes[i - 1]); + offset += len_hashes[i - 1]; + } + + // aggregate public keys mapping to the same hash + offset = 0; + for (int i = 1; i < nb_hashes + 1; i++) { + // elemsG2[i] = agg_pk[i] + E2_sum_vector(&elemsG2[i], &pks[offset], pks_per_hash[i - 
1]); + offset += pks_per_hash[i - 1]; + } + + // multi pairing + Fp12 e; + Fp12_multi_pairing(&e, elemsG1, elemsG2, nb_hashes + 1); + if (Fp12_is_one(&e)) { + ret = VALID; + } else { + ret = INVALID; + } out: - for (int i=0; i < nb_hashes+1; i++) { - ep_free(elemsG1[i]); - ep2_free(elemsG2[i]); - } - free(elemsG2); + free(elemsG2); outG2: - free(elemsG1); + free(elemsG1); outG1: - return ret; + return ret; } - -// Verifies the validity of an aggregated BLS signature under distinct public keys. +// Verifies the validity of an aggregated BLS signature under distinct public +// keys. // -// Each key is mapped to a set of messages, so that the verification equation is -// optimized to compute one pairing per public key. +// Each key is mapped to a set of messages, so that the verification equation is +// optimized to compute one pairing per public key. // - nb_pks is the number of the public keys in the map. // - pks is pointer to all pks in order where the key at index i -// is mapped to hashes_per_pk[i] hashes. +// is mapped to hashes_per_pk[i] hashes. // - the messages (hashes) are flattened in hashes in the same public key order, // each with a length in len_hashes. // // membership check of the signature in G1 is verified in this function // membership check of pks in G2 is not verified in this function -// the membership check is separated to allow optimizing multiple verifications using the same pks -int bls_verifyPerDistinctKey(const byte* sig, - const int nb_pks, const ep2_st* pks, const uint32_t* hashes_per_pk, - const byte* hashes, const uint32_t* len_hashes){ - - int ret = UNDEFINED; // return value - - ep_t* elemsG1 = (ep_t*)malloc((nb_pks + 1) * sizeof(ep_t)); - if (!elemsG1) goto outG1; - ep2_t* elemsG2 = (ep2_t*)malloc((nb_pks + 1) * sizeof(ep2_t)); - if (!elemsG2) goto outG2; - for (int i=0; i < nb_pks+1; i++) { - ep_new(elemsG1[i]); - ep2_new(elemsG2[i]); +// the membership check is separated to allow optimizing multiple verifications +// using the same pks +int bls_verifyPerDistinctKey(const byte *sig, const int nb_pks, const E2 *pks, + const uint32_t *hashes_per_pk, const byte *hashes, + const uint32_t *len_hashes) { + + int ret = UNDEFINED; // return value + + E1 *elemsG1 = (E1 *)malloc((nb_pks + 1) * sizeof(E1)); + if (!elemsG1) + goto outG1; + E2 *elemsG2 = (E2 *)malloc((nb_pks + 1) * sizeof(E2)); + if (!elemsG2) + goto outG2; + + // elemsG1[0] = s + if (E1_read_bytes(&elemsG1[0], sig, G1_SER_BYTES) != VALID) { + ret = INVALID; + goto out; + } + + // check s in G1 + if (!E1_in_G1(&elemsG1[0])) { + ret = INVALID; + goto out; + } + + // elemsG2[0] = -g2 + E2_copy(&elemsG2[0], BLS12_381_minus_g2); + + // set the public keys + for (int i = 1; i < nb_pks + 1; i++) { + E2_copy(&elemsG2[i], &pks[i - 1]); + } + + // map all hashes to G1 and aggregate the ones with the same public key + + // tmp_hashes is a temporary array of all hashes under a same key mapped to a + // G1 point. tmp_hashes size is set to the maximum possible size to minimize + // malloc calls. 
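For callers of the Go package, these per-distinct-message and per-distinct-key code paths sit behind `VerifyBLSSignatureManyMessages`, which checks one aggregated signature over several (public key, message) pairs. A hedged usage sketch; the seed length and the `GeneratePrivateKey` call reflect my reading of the public API and should be checked against the package documentation:

```go
package main

import (
	"crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto"
	"github.com/onflow/flow-go/crypto/hash"
)

func main() {
	kmac := crypto.NewExpandMsgXOFKMAC128("example tag")
	messages := [][]byte{[]byte("message A"), []byte("message B")}

	sigs := make([]crypto.Signature, 0, len(messages))
	pks := make([]crypto.PublicKey, 0, len(messages))
	hashers := make([]hash.Hasher, 0, len(messages))
	for i := range messages {
		seed := make([]byte, 64) // assumed to satisfy the library's seed length requirements
		if _, err := rand.Read(seed); err != nil {
			panic(err)
		}
		sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed)
		if err != nil {
			panic(err)
		}
		sig, err := sk.Sign(messages[i], kmac)
		if err != nil {
			panic(err)
		}
		sigs = append(sigs, sig)
		pks = append(pks, sk.PublicKey())
		hashers = append(hashers, kmac)
	}

	// aggregate the signatures, then verify against all (pk, message) pairs at once
	aggSig, err := crypto.AggregateBLSSignatures(sigs)
	if err != nil {
		panic(err)
	}
	valid, err := crypto.VerifyBLSSignatureManyMessages(pks, aggSig, messages, hashers)
	fmt.Println(valid, err) // expected: true <nil>
}
```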
+ int tmp_hashes_size = hashes_per_pk[0]; + for (int i = 1; i < nb_pks; i++) { + if (hashes_per_pk[i] > tmp_hashes_size) { + tmp_hashes_size = hashes_per_pk[i]; } - - // elemsG1[0] = s - ret = ep_read_bin_compact(elemsG1[0], sig, SIGNATURE_LEN); - if (ret != RLC_OK) goto out; - - // check s in G1 - ret = check_membership_G1(elemsG1[0]); // only enabled if MEMBERSHIP_CHECK==1 - if (ret != VALID) goto out; - - // elemsG2[0] = -g2 - ep2_neg(elemsG2[0], core_get()->ep2_g); // could be hardcoded - - // set the public keys - for (int i=1; i < nb_pks+1; i++) { - ep2_copy(elemsG2[i], (ep2_st*) &pks[i-1]); + } + E1 *tmp_hashes = (E1 *)malloc(tmp_hashes_size * sizeof(E1)); + if (!tmp_hashes) { + ret = UNDEFINED; + goto out; + } + + // sum hashes under the same key + int data_offset = 0; + int index_offset = 0; + for (int i = 1; i < nb_pks + 1; i++) { + for (int j = 0; j < hashes_per_pk[i - 1]; j++) { + // map the hash to G1 + map_to_G1(&tmp_hashes[j], &hashes[data_offset], len_hashes[index_offset]); + data_offset += len_hashes[index_offset]; + index_offset++; } + // aggregate all the points of the array + E1_sum_vector(&elemsG1[i], tmp_hashes, hashes_per_pk[i - 1]); + } + free(tmp_hashes); - // map all hashes to G1 and aggregate the ones with the same public key - - // tmp_hashes is a temporary array of all hashes under a same key mapped to a G1 point. - // tmp_hashes size is set to the maximum possible size to minimize malloc calls. - int tmp_hashes_size = hashes_per_pk[0]; - for (int i=1; i tmp_hashes_size) - tmp_hashes_size = hashes_per_pk[i]; - ep_st* tmp_hashes = (ep_st*)malloc(tmp_hashes_size * sizeof(ep_st)); - if (!tmp_hashes) { - ret = UNDEFINED; - goto out; - } + // multi pairing + Fp12 e; + Fp12_multi_pairing(&e, elemsG1, elemsG2, nb_pks + 1); - // sum hashes under the same key - for (int i=0; icode == RLC_OK) { - if (cmp_res == RLC_EQ) ret = VALID; - else ret = INVALID; - } else { - ret = UNDEFINED; - } + if (Fp12_is_one(&e)) { + ret = VALID; + } else { + ret = INVALID; + } out: - for (int i=0; i < nb_pks+1; i++) { - ep_free(elemsG1[i]); - ep2_free(elemsG2[i]); - } - free(elemsG2); + free(elemsG2); outG2: - free(elemsG1); + free(elemsG1); outG1: - return ret; + return ret; } // Verifies a BLS signature in a byte buffer. // membership check of the signature in G1 is verified. // membership check of pk in G2 is not verified in this function. -// the membership check in G2 is separated to allow optimizing multiple verifications using the same key. -int bls_verify(const ep2_t pk, const byte* sig, const byte* data, const int len) { - ep_t s; - ep_new(s); - - // deserialize the signature into a curve point - int read_ret = ep_read_bin_compact(s, sig, SIGNATURE_LEN); - if (read_ret != RLC_OK) - return read_ret; - - // check s is in G1 - if (check_membership_G1(s) != VALID) // only enabled if MEMBERSHIP_CHECK==1 - return INVALID; - - return bls_verify_ep(pk, s, data, len); +// the membership check in G2 is separated to optimize multiple verifications +// using the same key. `hash` represents the hashed message with length +// `hash_len` equal to `MAP_TO_G1_INPUT_LEN`. 
+int bls_verify(const E2 *pk, const byte *sig, const byte *hash, + const int hash_len) { + E1 s, h; + // deserialize the signature into a curve point + if (E1_read_bytes(&s, sig, G1_SER_BYTES) != VALID) { + return INVALID; + } + + // check s is in G1 + if (!E1_in_G1(&s)) { + return INVALID; + } + + if (map_to_G1(&h, hash, hash_len) != VALID) { + return INVALID; + } + + return bls_verify_E1(pk, &s, &h); } // binary tree structure to be used by bls_batch verify. -// Each node contains a signature and a public key, the signature (resp. the public key) -// being the aggregated signature of the two children's signature (resp. public keys). -// The leaves contain the initial signatures and public keys. -typedef struct st_node { - ep_st* sig; - ep2_st* pk; - struct st_node* left; - struct st_node* right; +// Each node contains a signature and a public key, the signature (resp. the +// public key) being the aggregated signature of the two children's signature +// (resp. public keys). The leaves contain the initial signatures and public +// keys. +typedef struct st_node { + E1 *sig; + E2 *pk; + struct st_node *left; + struct st_node *right; } node; -static node* new_node(const ep2_st* pk, const ep_st* sig){ - node* t = (node*) malloc(sizeof(node)); - if (t) { - t->pk = (ep2_st*)pk; - t->sig = (ep_st*)sig; - t->right = t->left = NULL; - } - return t; +static node *new_node(const E2 *pk, const E1 *sig) { + node *t = (node *)malloc(sizeof(node)); + if (t) { + t->pk = (E2 *)pk; + t->sig = (E1 *)sig; + t->right = t->left = NULL; + } + return t; } -static void free_tree(node* root) { - if (!root) return; - - // only free pks and sigs of non-leafs, data of leafs are allocated - // as an entire array in `bls_batchVerify`. - if (root->left) { // no need to check the right child for the leaf check because - // the recursive build starts with the left side first - // relic free - if (root->sig) ep_free(root->sig); - if (root->pk) ep2_free(root->pk); - // pointer free - free(root->sig); - free(root->pk); - // free the children nodes - free_tree(root->left); - free_tree(root->right); - } - free(root); +static void free_tree(node *root) { + if (!root) + return; + + // only free pks and sigs of non-leafs, data of leafs are allocated + // as an entire array in `bls_batch_verify`. + if (root->left) { // no need to check the right child for the leaf check + // because + // the recursive build starts with the left side first + // pointer free + free(root->sig); + free(root->pk); + // free the children nodes + free_tree(root->left); + free_tree(root->right); + } + free(root); } -// builds a binary tree of aggregation of signatures and public keys recursively. 
-static node* build_tree(const int len, const ep2_st* pks, const ep_st* sigs) { - // check if a leaf is reached - if (len == 1) { - return new_node(&pks[0], &sigs[0]); // use the first element of the arrays - } - - // a leaf is not reached yet, - int right_len = len/2; - int left_len = len - right_len; - - // create a new node with new points - ep2_st* new_pk = (ep2_st*)malloc(sizeof(ep2_st)); - if (!new_pk) goto error; - ep_st* new_sig = (ep_st*)malloc(sizeof(ep_st)); - if (!new_sig) goto error_sig; - - node* t = new_node(new_pk, new_sig); - if (!t) goto error_node; - ep_new(t->sig); - ep2_new(t->pk); - - // build the tree in a top-down way - t->left = build_tree(left_len, &pks[0], &sigs[0]); - if (!t->left) { free_tree(t); goto error; } - - t->right = build_tree(right_len, &pks[left_len], &sigs[left_len]); - if (!t->right) { free_tree(t); goto error; } - // sum the children - ep_add_jacob(t->sig, t->left->sig, t->right->sig); - ep2_add_projc(t->pk, t->left->pk, t->right->pk); - return t; +// builds a binary tree of aggregation of signatures and public keys +// recursively. +static node *build_tree(const int len, const E2 *pks, const E1 *sigs) { + // check if a leaf is reached + if (len == 1) { + return new_node(&pks[0], &sigs[0]); // use the first element of the arrays + } + + // a leaf is not reached yet, + int right_len = len / 2; + int left_len = len - right_len; + + // create a new node with new points + E2 *new_pk = (E2 *)malloc(sizeof(E2)); + if (!new_pk) { + goto error; + } + E1 *new_sig = (E1 *)malloc(sizeof(E1)); + if (!new_sig) { + goto error_sig; + } + + node *t = new_node(new_pk, new_sig); + if (!t) + goto error_node; + + // build the tree in a top-down way + t->left = build_tree(left_len, &pks[0], &sigs[0]); + if (!t->left) { + free_tree(t); + goto error; + } + + t->right = build_tree(right_len, &pks[left_len], &sigs[left_len]); + if (!t->right) { + free_tree(t); + goto error; + } + // sum the children + E1_add(t->sig, t->left->sig, t->right->sig); + E2_add(t->pk, t->left->pk, t->right->pk); + return t; error_node: - free(new_sig); + free(new_sig); error_sig: - free(new_pk); + free(new_pk); error: - return NULL; + return NULL; } -// verify the binary tree and fill the results using recursive batch verifications. -static void bls_batchVerify_tree(const node* root, const int len, byte* results, - const byte* data, const int data_len) { - - // verify the aggregated signature against the aggregated public key. - int res = bls_verify_ep(root->pk, root->sig, data, data_len); - - // if the result is valid, all the subtree signatures are valid. - if (res == VALID) { - for (int i=0; i < len; i++) { - if (results[i] == UNDEFINED) results[i] = VALID; // do not overwrite invalid results - } - return; +// verify the binary tree and fill the results using recursive batch +// verifications. +static void bls_batch_verify_tree(const node *root, const int len, + byte *results, const E1 *h) { + // verify the aggregated signature against the aggregated public key. + int res = bls_verify_E1(root->pk, root->sig, h); + + // if the result is valid, all the subtree signatures are valid. + if (res == VALID) { + for (int i = 0; i < len; i++) { + if (results[i] == UNDEFINED) + results[i] = VALID; // do not overwrite invalid results } - - // check if root is a leaf - if (root->left == NULL) { // no need to check the right side - *results = INVALID; - return; - } - - // otherwise, at least one of the subtree signatures is invalid. - // use the binary tree structure to find the invalid signatures. 
- int right_len = len/2; - int left_len = len - right_len; - bls_batchVerify_tree(root->left, left_len, &results[0], data, data_len); - bls_batchVerify_tree(root->right, right_len, &results[left_len], data, data_len); + return; + } + + // check if root is a leaf + if (root->left == NULL) { // no need to check the right side + *results = INVALID; + return; + } + + // otherwise, at least one of the subtree signatures is invalid. + // use the binary tree structure to find the invalid signatures. + int right_len = len / 2; + int left_len = len - right_len; + bls_batch_verify_tree(root->left, left_len, &results[0], h); + bls_batch_verify_tree(root->right, right_len, &results[left_len], h); } -// Batch verifies the validity of a multiple BLS signatures of the -// same message under multiple public keys. +// Batch verifies the validity of a multiple BLS signatures of the +// same message under multiple public keys. Each signature at index `i` is +// verified against the public key at index `i`. `seed` is used as the entropy +// source for randoms required by the computation. The function assumes the +// source size is at least (16*sigs_len) of random bytes of entropy at least 128 +// bits. // // - membership checks of all signatures is verified upfront. -// - use random coefficients for signatures and public keys at the same index. -// - optimize the verification by verifying an aggregated signature against an aggregated -// public key, and use a recursive verification to find invalid signatures. -void bls_batchVerify(const int sigs_len, byte* results, const ep2_st* pks_input, - const byte* sigs_bytes, const byte* data, const int data_len) { - - // initialize results to undefined - memset(results, UNDEFINED, sigs_len); - - // build the arrays of G1 and G2 elements to verify - ep2_st* pks = (ep2_st*) malloc(sigs_len * sizeof(ep2_st)); - if (!pks) return; - ep_st* sigs = (ep_st*) malloc(sigs_len * sizeof(ep_st)); - if (!sigs) goto out_sigs; - for (int i=0; i < sigs_len; i++) { - ep_new(sigs[i]); - ep2_new(pks[i]); - } - bn_t r; bn_new(r); - - for (int i=0; i < sigs_len; i++) { - // convert the signature points: - // - invalid points are stored as infinity points with an invalid result, so that - // the tree aggregations remain valid. - // - valid points are multiplied by a random scalar (same for public keys at same index) - // to make sure a signature at index (i) is verified against the public key at the same index. - int read_ret = ep_read_bin_compact(&sigs[i], &sigs_bytes[SIGNATURE_LEN*i], SIGNATURE_LEN); - if ( read_ret != RLC_OK || check_membership_G1(&sigs[i]) != VALID) { - if (read_ret == UNDEFINED) // unexpected error case - goto out; - // set signature and key to infinity (no effect on the aggregation tree) - // and set result to invalid - ep_set_infty(&sigs[i]); - ep2_set_infty(&pks[i]); - results[i] = INVALID; - // multiply signatures and public keys at the same index by random coefficients - } else { - // random non-zero coefficient of a least 128 bits - bn_rand(r, RLC_POS, SEC_BITS); - bn_add_dig(r, r, 1); - ep_mul_lwnaf(&sigs[i], &sigs[i], r); - ep2_mul_lwnaf(&pks[i], (ep2_st*) &pks_input[i], r); - } +// - use random coefficients for signatures and public keys at the same index to +// prevent +// indices mixup. +// - optimize the verification by verifying an aggregated signature against an +// aggregated +// public key, and use a top-down recursive verification to find invalid +// signatures. 
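The top-down recursion described above does not depend on pairings at all: verify an aggregate, and only when it fails split the range and recurse, so a batch with few invalid entries needs far fewer verifications than checking every signature individually. A small illustrative sketch of that search strategy with a plain predicate standing in for the aggregate pairing check (names and the exact split are illustrative; the C code splits as `len - len/2` / `len/2`):

```go
package main

import "fmt"

// findInvalid returns the indices of invalid items in items[lo:hi), assuming
// aggregateValid(lo, hi) reports whether the aggregate of that range verifies.
// This mirrors the recursive split used by bls_batch_verify_tree.
func findInvalid(lo, hi int, aggregateValid func(lo, hi int) bool) []int {
	if aggregateValid(lo, hi) {
		return nil // the whole range is valid
	}
	if hi-lo == 1 {
		return []int{lo} // a single invalid leaf
	}
	mid := lo + (hi-lo)/2
	return append(findInvalid(lo, mid, aggregateValid), findInvalid(mid, hi, aggregateValid)...)
}

func main() {
	// toy "signatures": true means valid
	sigs := []bool{true, true, false, true, true, true, true, false}
	aggregateValid := func(lo, hi int) bool {
		for _, ok := range sigs[lo:hi] {
			if !ok {
				return false
			}
		}
		return true
	}
	fmt.Println(findInvalid(0, len(sigs), aggregateValid)) // [2 7]
}
```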
+void bls_batch_verify(const int sigs_len, byte *results, const E2 *pks_input, + const byte *sigs_bytes, const byte *data, + const int data_len, const byte *seed) { + + // initialize results to undefined + memset(results, UNDEFINED, sigs_len); + + // build the arrays of G1 and G2 elements to verify + E2 *pks = (E2 *)malloc(sigs_len * sizeof(E2)); + if (!pks) { + return; + } + E1 *sigs = (E1 *)malloc(sigs_len * sizeof(E1)); + if (!sigs) { + goto out_sigs; + } + + E1 h; + if (map_to_G1(&h, data, data_len) != VALID) { + goto out; + } + + for (int i = 0; i < sigs_len; i++) { + // convert the signature points: + // - invalid points are stored as infinity points with an invalid result, so + // that the tree aggregations remain valid. + // - valid points are multiplied by a random scalar (same for public keys at + // same index) to make sure a signature at index (i) is verified against the + // public key at the same index. + int read_ret = + E1_read_bytes(&sigs[i], &sigs_bytes[G1_SER_BYTES * i], G1_SER_BYTES); + if (read_ret != VALID || !E1_in_G1(&sigs[i])) { + // set signature and key to infinity (no effect on the aggregation tree) + // and set result to invalid (result won't be overwritten) + E2_set_infty(&pks[i]); + E1_set_infty(&sigs[i]); + results[i] = INVALID; + } else { + // choose a random non-zero coefficient of at least 128 bits + Fr r, one; + // r = random, i-th seed is used for i-th signature + Fr_set_zero(&r); + const int seed_len = SEC_BITS / 8; + limbs_from_be_bytes((limb_t *)&r, seed + (seed_len * i), + seed_len); // faster shortcut than Fr_map_bytes + // r = random + 1 + Fr_set_limb(&one, 1); + Fr_add(&r, &r, &one); + // multiply public key and signature by the same random exponent r + E2_mult(&pks[i], &pks_input[i], &r); + E1_mult(&sigs[i], &sigs[i], &r); } - - // build a binary tree of aggreagtions - node* root = build_tree(sigs_len, &pks[0], &sigs[0]); - if (!root) goto out; - - // verify the binary tree and fill the results using batch verification - bls_batchVerify_tree(root, sigs_len, &results[0], data, data_len); - // free the allocated tree - free_tree(root); - + } + // build a binary tree of aggregations + node *root = build_tree(sigs_len, &pks[0], &sigs[0]); + if (!root) { + goto out; + } + + // verify the binary tree and fill the results using batch verification + bls_batch_verify_tree(root, sigs_len, &results[0], &h); + // free the allocated tree + free_tree(root); out: - bn_free(r); - for (int i=0; i < sigs_len; i++) { - ep_free(sigs[i]); - ep2_free(pks[i]); - } - free(sigs); + free(sigs); out_sigs: - free(pks); + free(pks); +} + +// Verifies the validity of 2 SPoCK proofs and 2 public keys. +// Membership check in G1 of both proofs is verified in this function. +// Membership check in G2 of both keys is not verified in this function. +// the membership check in G2 is separated to allow optimizing multiple +// verifications using the same public keys. 
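On the Go side this pairing equation backs the SPoCK API: each prover signs the same secret data, and a verifier checks the two proofs against the two public keys without learning the data. A hedged sketch of that flow; the key generation and KMAC tag choice are assumptions, only the `SPOCKProve`/`SPOCKVerify` signatures are taken from this package:

```go
package main

import (
	"crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto"
)

func main() {
	newKey := func() crypto.PrivateKey {
		seed := make([]byte, 64) // assumed to satisfy the library's seed length requirements
		if _, err := rand.Read(seed); err != nil {
			panic(err)
		}
		sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed)
		if err != nil {
			panic(err)
		}
		return sk
	}

	sk1, sk2 := newKey(), newKey()
	kmac := crypto.NewExpandMsgXOFKMAC128("spock example tag")
	secretData := []byte("data known to both provers")

	proof1, err := crypto.SPOCKProve(sk1, secretData, kmac)
	if err != nil {
		panic(err)
	}
	proof2, err := crypto.SPOCKProve(sk2, secretData, kmac)
	if err != nil {
		panic(err)
	}

	// the verifier only needs the two public keys and the two proofs
	ok, err := crypto.SPOCKVerify(sk1.PublicKey(), proof1, sk2.PublicKey(), proof2)
	fmt.Println(ok, err) // expected: true <nil>
}
```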
+int bls_spock_verify(const E2 *pk1, const byte *sig1, const E2 *pk2, + const byte *sig2) { + E1 elemsG1[2]; + E2 elemsG2[2]; + + // elemsG1[0] = s1 + if (E1_read_bytes(&elemsG1[0], sig1, G1_SER_BYTES) != VALID) { + return INVALID; + }; + // check s1 is in G1 + if (!E1_in_G1(&elemsG1[0])) { + return INVALID; + } + + // elemsG1[1] = s2 + if (E1_read_bytes(&elemsG1[1], sig2, G1_SER_BYTES) != VALID) { + return INVALID; + }; + // check s2 is in G1 + if (!E1_in_G1(&elemsG1[1])) { + return INVALID; + } + + // elemsG2[1] = pk1 + E2_copy(&elemsG2[1], pk1); + + // elemsG2[0] = -pk2 + E2_neg(&elemsG2[0], pk2); + + // double pairing + Fp12 e; + Fp12_multi_pairing(&e, elemsG1, elemsG2, 2); + + if (Fp12_is_one(&e)) { + return VALID; + } + return INVALID; } diff --git a/crypto/bls_crossBLST_test.go b/crypto/bls_crossBLST_test.go index 5ac9e996cc1..3b3939eaf6c 100644 --- a/crypto/bls_crossBLST_test.go +++ b/crypto/bls_crossBLST_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto // This file contains tests against the library BLST (https://github.com/supranational/blst). @@ -21,8 +18,9 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" - blst "github.com/supranational/blst/bindings/go" "pgregory.net/rapid" + + "github.com/onflow/flow-go/crypto/internal/blst" ) // validPrivateKeyBytesFlow generates bytes of a valid private key in Flow library @@ -82,7 +80,7 @@ func validSignatureBytesBLST(t *rapid.T) []byte { // testEncodeDecodePrivateKeyCrossBLST tests encoding and decoding of private keys are consistent with BLST. // This test assumes private key serialization is identical to the one in BLST. func testEncodeDecodePrivateKeyCrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), prKeyLengthBLSBLS12381, prKeyLengthBLSBLS12381) + randomSlice := rapid.SliceOfN(rapid.Byte(), PrKeyLenBLSBLS12381, PrKeyLenBLSBLS12381) validSliceFlow := rapid.Custom(validPrivateKeyBytesFlow) validSliceBLST := rapid.Custom(validPrivateKeyBytesBLST) // skBytes are bytes of either a valid or a random private key @@ -129,39 +127,36 @@ func testEncodeDecodePublicKeyCrossBLST(t *rapid.T) { if flowPass && blstPass { pkFlowOutBytes := pkFlow.Encode() pkBLSTOutBytes := pkBLST.Compress() - assert.Equal(t, pkFlowOutBytes, pkBLSTOutBytes) } } -// testEncodeDecodeSignatureCrossBLST tests encoding and decoding of signatures are consistent with BLST. -// This test assumes signature serialization is identical to the one in BLST. -func testEncodeDecodeSignatureCrossBLST(t *rapid.T) { - randomSlice := rapid.SliceOfN(rapid.Byte(), SignatureLenBLSBLS12381, SignatureLenBLSBLS12381) +// testEncodeDecodeG1CrossBLST tests encoding and decoding of G1 points are consistent with BLST. +// This test assumes signature serialization is identical to BLST. 
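The 48-byte compressed encoding compared in this test follows the widely used ZCash-style BLS12-381 serialization, where the top three bits of the first byte carry a compression flag, an infinity flag, and the sign of y; this is also why the identity signature encodes as `0xc0` followed by zero bytes (compare the `IsBLSSignatureIdentity` check earlier in this diff). A small sketch decoding just those header bits (helper names are illustrative and not part of the library):

```go
package main

import "fmt"

// splitG1Header extracts the three flag bits of a ZCash-style compressed
// BLS12-381 G1 serialization (48 bytes): compression, infinity, y-sign.
func splitG1Header(sig []byte) (compressed, infinity, ySign bool) {
	h := sig[0]
	return h&0x80 != 0, h&0x40 != 0, h&0x20 != 0
}

// isIdentitySerialization mirrors what IsBLSSignatureIdentity is expected to
// accept: the infinity flag set and every remaining bit zero.
func isIdentitySerialization(sig []byte) bool {
	if len(sig) != 48 || sig[0] != 0xc0 {
		return false
	}
	for _, b := range sig[1:] {
		if b != 0 {
			return false
		}
	}
	return true
}

func main() {
	identity := make([]byte, 48)
	identity[0] = 0xc0
	fmt.Println(splitG1Header(identity))           // true true false
	fmt.Println(isIdentitySerialization(identity)) // true
}
```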
+func testEncodeDecodeG1CrossBLST(t *rapid.T) { + randomSlice := rapid.SliceOfN(rapid.Byte(), g1BytesLen, g1BytesLen) validSignatureFlow := rapid.Custom(validSignatureBytesFlow) validSignatureBLST := rapid.Custom(validSignatureBytesBLST) - // sigBytes are bytes of either a valid or a random signature + // sigBytes are bytes of either a valid serialization of a E1/G1 point, or random bytes sigBytes := rapid.OneOf(randomSlice, validSignatureFlow, validSignatureBLST).Example().([]byte) // check decoding results are consistent - var pointFlow pointG1 - // here we test readPointG1 rather than the simple Signature type alias - err := readPointG1(&pointFlow, sigBytes) - flowPass := (err == nil) && (checkMembershipG1(&pointFlow) == int(valid)) + var pointFlow pointE1 + err := readPointE1(&pointFlow, sigBytes) + flowPass := (err == nil) && (checkMembershipG1(&pointFlow)) var pointBLST blst.P1Affine + // res is non-nil iff point is in G1 res := pointBLST.Uncompress(sigBytes) - // flow validation has no infinity rejection for G1 blstPass := (res != nil) && pointBLST.SigValidate(false) - require.Equal(t, flowPass, blstPass, "deserialization of signature %x differs", sigBytes) + require.Equal(t, flowPass, blstPass, "deserialization of G1 %x differs", sigBytes) - // check both signatures (G1 points) are equal + // check both serializations of G1 points are equal if flowPass && blstPass { - sigFlowOutBytes := make([]byte, signatureLengthBLSBLS12381) - writePointG1(sigFlowOutBytes, &pointFlow) + sigFlowOutBytes := make([]byte, g1BytesLen) + writePointE1(sigFlowOutBytes, &pointFlow) sigBLSTOutBytes := pointBLST.Compress() - assert.Equal(t, sigFlowOutBytes, sigBLSTOutBytes) } } @@ -177,10 +172,10 @@ func testEncodeDecodeSignatureCrossBLST(t *rapid.T) { // // The test also assumes Flow signature serialization is identical to the one in BLST. 
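The same property-based setup can also drive BLST-free checks; for example, a round-trip property on private-key encoding needs no external oracle. A hedged sketch in the style of the tests above (the test name is illustrative; it assumes `Encode` returns exactly the bytes accepted by `DecodePrivateKey`):

```go
package crypto

import (
	"testing"

	"github.com/stretchr/testify/assert"
	"pgregory.net/rapid"
)

// TestPrivateKeyRoundTrip is a sketch of a BLST-free property: whenever random
// bytes decode to a valid private key, re-encoding must return the same bytes.
func TestPrivateKeyRoundTrip(t *testing.T) {
	rapid.Check(t, func(t *rapid.T) {
		skBytes := rapid.SliceOfN(rapid.Byte(), PrKeyLenBLSBLS12381, PrKeyLenBLSBLS12381).
			Example().([]byte)
		sk, err := DecodePrivateKey(BLSBLS12381, skBytes)
		if err != nil {
			// random bytes are frequently not a valid non-zero scalar; skip those draws
			return
		}
		assert.Equal(t, skBytes, sk.Encode())
	})
}
```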
func testSignHashCrossBLST(t *rapid.T) { - // generate two private keys from the same seed + // decode two private keys from the same bytes skBytes := rapid.Custom(validPrivateKeyBytesFlow).Example().([]byte) - skFlow, err := DecodePrivateKey(BLSBLS12381, skBytes) + require.NoError(t, err) var skBLST blst.Scalar res := skBLST.Deserialize(skBytes) @@ -195,7 +190,7 @@ func testSignHashCrossBLST(t *rapid.T) { sigBytesBLST := sigBLST.Compress() skFlowBLS, ok := skFlow.(*prKeyBLSBLS12381) - require.True(t, ok, "incoherent key type assertion") + require.True(t, ok) sigFlow := skFlowBLS.signWithXMDSHA256(message) sigBytesFlow := sigFlow.Bytes() @@ -214,10 +209,10 @@ func testKeyGenCrossBLST(t *rapid.T) { assert.Equal(t, skFlow.Encode(), skBLST.Serialize()) } -func TestAgainstBLST(t *testing.T) { +func TestCrossBLST(t *testing.T) { rapid.Check(t, testKeyGenCrossBLST) rapid.Check(t, testEncodeDecodePrivateKeyCrossBLST) rapid.Check(t, testEncodeDecodePublicKeyCrossBLST) - rapid.Check(t, testEncodeDecodeSignatureCrossBLST) + rapid.Check(t, testEncodeDecodeG1CrossBLST) rapid.Check(t, testSignHashCrossBLST) } diff --git a/crypto/bls_include.h b/crypto/bls_include.h index 016845719e1..af380735237 100644 --- a/crypto/bls_include.h +++ b/crypto/bls_include.h @@ -1,48 +1,22 @@ -// +build relic - // this file is about the core functions required by the BLS signature scheme -#ifndef _REL_BLS_INCLUDE_H -#define _REL_BLS_INCLUDE_H +#ifndef _BLS_INCLUDE_H +#define _BLS_INCLUDE_H -#include "relic.h" #include "bls12381_utils.h" -// Signature, Public key and Private key lengths -#define FULL_SIGNATURE_LEN G1_BYTES -#define FULL_PK_LEN G2_BYTES -#define SIGNATURE_LEN (FULL_SIGNATURE_LEN/(G1_SERIALIZATION+1)) -#define PK_LEN (FULL_PK_LEN/(G2_SERIALIZATION+1)) -#define SK_BITS (Fr_BITS) -#define SK_LEN BITS_TO_BYTES(SK_BITS) - -// Simultaneous Pairing in verification -#define DOUBLE_PAIRING 1 -#define SINGLE_PAIRING (DOUBLE_PAIRING^1) - -// Signature and public key membership check -#define MEMBERSHIP_CHECK 1 - -// algorithm choice for the hashing to G1 -// both methods are similar implementations of the same optimzed SSWU -// but offer different timings. 
-#define RELIC_SSWU 1 // relic library implementation -#define LOCAL_SSWU 2 // local implementation -#define hashToPoint LOCAL_SSWU - -// bls core (functions in bls_core.c) -int get_signature_len(); -int get_pk_len(); -int get_sk_len(); - -void bls_sign(byte*, const bn_t, const byte*, const int); -int bls_verify(const ep2_t, const byte*, const byte*, const int); -int bls_verifyPerDistinctMessage(const byte*, const int, const byte*, const uint32_t*, - const uint32_t*, const ep2_st*); -int bls_verifyPerDistinctKey(const byte*, - const int, const ep2_st*, const uint32_t*, - const byte*, const uint32_t*); -void bls_batchVerify(const int, byte*, const ep2_st*, - const byte*, const byte*, const int); +// BLS signature core (functions in bls_core.c) +int bls_sign(byte *, const Fr *, const byte *, const int); +int bls_verify(const E2 *, const byte *, const byte *, const int); +int bls_verifyPerDistinctMessage(const byte *, const int, const byte *, + const uint32_t *, const uint32_t *, + const E2 *); +int bls_verifyPerDistinctKey(const byte *, const int, const E2 *, + const uint32_t *, const byte *, const uint32_t *); +void bls_batch_verify(const int, byte *, const E2 *, const byte *, const byte *, + const int, const byte *); + +// BLS based SPoCK +int bls_spock_verify(const E2 *, const byte *, const E2 *, const byte *); #endif diff --git a/crypto/bls_multisig.go b/crypto/bls_multisig.go index af2c6ce2f3c..ea534f790f1 100644 --- a/crypto/bls_multisig.go +++ b/crypto/bls_multisig.go @@ -1,9 +1,7 @@ -//go:build relic -// +build relic - package crypto import ( + "crypto/rand" "errors" "fmt" @@ -12,23 +10,23 @@ import ( // BLS multi-signature using BLS12-381 curve // ([zcash]https://github.com/zkcrypto/pairing/blob/master/src/bls12_381/README.md#bls12-381) -// Pairing, ellipic curve and modular arithmetic is using Relic library. -// This implementation does not include any security against side-channel attacks. +// Pairing, ellipic curve and modular arithmetic are using [BLST](https://github.com/supranational/blst/tree/master/src) +// tools underneath. +// This implementation does not include any security against side-channel side-channel or fault attacks. -// existing features: +// Existing features: // - the same BLS set-up in bls.go // - Use the proof of possession scheme (PoP) to prevent against rogue public-key attack. -// - Non-interactive aggregation of private keys, public keys and signatures. -// - Non-interactive subtraction of multiple public keys from an (aggregated) public key. +// - Aggregation of private keys, public keys and signatures. +// - Subtraction of multiple public keys from an (aggregated) public key. // - Multi-signature verification of an aggregated signature of a single message // under multiple public keys. // - Multi-signature verification of an aggregated signature of multiple messages under // multiple public keys. // - batch verification of multiple signatures of a single message under multiple -// public keys: use a binary tree of aggregations to find the invalid signatures. +// public keys, using a binary tree of aggregations. 
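Among the features listed above, the proof of possession is what protects the aggregation APIs against rogue-key attacks: a key is only admitted into an aggregate after its PoP verifies. A hedged sketch of that handshake using the exported functions; key generation details are assumed as in the earlier sketches:

```go
package main

import (
	"crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto"
)

func main() {
	seed := make([]byte, 64) // assumed to satisfy the library's seed length requirements
	if _, err := rand.Read(seed); err != nil {
		panic(err)
	}
	sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed)
	if err != nil {
		panic(err)
	}

	// the key owner publishes (public key, PoP)
	pop, err := crypto.BLSGeneratePOP(sk)
	if err != nil {
		panic(err)
	}

	// anyone can check the PoP before accepting the key into an aggregate
	valid, err := crypto.BLSVerifyPOP(sk.PublicKey(), pop)
	fmt.Println(valid, err) // expected: true <nil>
}
```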
-// #cgo CFLAGS: -g -Wall -std=c99 -// #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s +// #include "bls12381_utils.h" // #include "bls_include.h" import "C" @@ -92,29 +90,26 @@ func BLSVerifyPOP(pk PublicKey, s Signature) (bool, error) { // - (nil, error) if an unexpected error occurs // - (aggregated_signature, nil) otherwise func AggregateBLSSignatures(sigs []Signature) (Signature, error) { - // set BLS context - blsInstance.reInit() - // check for empty list if len(sigs) == 0 { return nil, blsAggregateEmptyListError } // flatten the shares (required by the C layer) - flatSigs := make([]byte, 0, signatureLengthBLSBLS12381*len(sigs)) + flatSigs := make([]byte, 0, SignatureLenBLSBLS12381*len(sigs)) for i, sig := range sigs { - if len(sig) != signatureLengthBLSBLS12381 { + if len(sig) != SignatureLenBLSBLS12381 { return nil, fmt.Errorf("signature at index %d has an invalid length: %w", i, invalidSignatureError) } flatSigs = append(flatSigs, sig...) } - aggregatedSig := make([]byte, signatureLengthBLSBLS12381) + aggregatedSig := make([]byte, SignatureLenBLSBLS12381) // add the points in the C layer - result := C.ep_sum_vector_byte( + result := C.E1_sum_vector_byte( (*C.uchar)(&aggregatedSig[0]), (*C.uchar)(&flatSigs[0]), - (C.int)(len(sigs))) + (C.int)(len(flatSigs))) switch result { case valid: @@ -139,9 +134,6 @@ func AggregateBLSSignatures(sigs []Signature) (Signature, error) { // - (nil, blsAggregateEmptyListError) if no keys are provided (input slice is empty) // - (aggregated_key, nil) otherwise func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { - // set BLS context - blsInstance.reInit() - // check for empty list if len(keys) == 0 { return nil, blsAggregateEmptyListError @@ -157,8 +149,7 @@ func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { } var sum scalar - C.bn_new_wrapper((*C.bn_st)(&sum)) - C.bn_sum_vector((*C.bn_st)(&sum), (*C.bn_st)(&scalars[0]), + C.Fr_sum_vector((*C.Fr)(&sum), (*C.Fr)(&scalars[0]), (C.int)(len(scalars))) return newPrKeyBLSBLS12381(&sum), nil } @@ -177,15 +168,13 @@ func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { // - (nil, blsAggregateEmptyListError) no keys are provided (input slice is empty) // - (aggregated_key, nil) otherwise func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { - // set BLS context - blsInstance.reInit() // check for empty list if len(keys) == 0 { return nil, blsAggregateEmptyListError } - points := make([]pointG2, 0, len(keys)) + points := make([]pointE2, 0, len(keys)) for i, pk := range keys { pkBLS, ok := pk.(*pubKeyBLSBLS12381) if !ok { @@ -194,8 +183,8 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { points = append(points, pkBLS.point) } - var sum pointG2 - C.ep2_sum_vector((*C.ep2_st)(&sum), (*C.ep2_st)(&points[0]), + var sum pointE2 + C.E2_sum_vector_to_affine((*C.E2)(&sum), (*C.E2)(&points[0]), (C.int)(len(points))) sumKey := newPubKeyBLSBLS12381(&sum) @@ -203,16 +192,9 @@ func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { } // IdentityBLSPublicKey returns an identity public key which corresponds to the point -// at infinity in G2 (identity element of G2). +// at infinity in G2 (identity element g2). 
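The identity key acts as the neutral element of public-key aggregation, which is also what makes key subtraction well defined. A hedged sketch of both properties; `RemoveBLSPublicKeys` returning the identity when a key is removed from itself is my expectation from the subtraction semantics rather than a documented guarantee:

```go
package main

import (
	"crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto"
)

func main() {
	seed := make([]byte, 64) // assumed to satisfy the library's seed length requirements
	if _, err := rand.Read(seed); err != nil {
		panic(err)
	}
	sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed)
	if err != nil {
		panic(err)
	}
	pk := sk.PublicKey()
	identity := crypto.IdentityBLSPublicKey()

	// aggregating with the identity is a no-op
	agg, err := crypto.AggregateBLSPublicKeys([]crypto.PublicKey{pk, identity})
	if err != nil {
		panic(err)
	}
	fmt.Println(agg.Equals(pk)) // expected: true

	// removing a key from itself should yield the identity
	rest, err := crypto.RemoveBLSPublicKeys(pk, []crypto.PublicKey{pk})
	if err != nil {
		panic(err)
	}
	fmt.Println(rest.Equals(identity)) // expected: true
}
```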
func IdentityBLSPublicKey() PublicKey { - // set BLS context - blsInstance.reInit() - - identity := *newPubKeyBLSBLS12381(nil) - // set the point to infinity - C.ep2_set_infty((*C.ep2_st)(&identity.point)) - identity.isIdentity = true - return &identity + return &g2PublicKey } // RemoveBLSPublicKeys removes multiple BLS public keys from a given (aggregated) public key. @@ -230,15 +212,13 @@ func IdentityBLSPublicKey() PublicKey { // - (nil, notBLSKeyError) if at least one input key is not of type BLS BLS12-381 // - (remaining_key, nil) otherwise func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, error) { - // set BLS context - blsInstance.reInit() aggPKBLS, ok := aggKey.(*pubKeyBLSBLS12381) if !ok { return nil, notBLSKeyError } - pointsToSubtract := make([]pointG2, 0, len(keysToRemove)) + pointsToSubtract := make([]pointE2, 0, len(keysToRemove)) for i, pk := range keysToRemove { pkBLS, ok := pk.(*pubKeyBLSBLS12381) if !ok { @@ -252,9 +232,9 @@ func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, return aggKey, nil } - var resultPoint pointG2 - C.ep2_subtract_vector((*C.ep2_st)(&resultPoint), (*C.ep2_st)(&aggPKBLS.point), - (*C.ep2_st)(&pointsToSubtract[0]), (C.int)(len(pointsToSubtract))) + var resultPoint pointE2 + C.E2_subtract_vector((*C.E2)(&resultPoint), (*C.E2)(&aggPKBLS.point), + (*C.E2)(&pointsToSubtract[0]), (C.int)(len(pointsToSubtract))) resultKey := newPubKeyBLSBLS12381(&resultPoint) return resultKey, nil @@ -330,11 +310,9 @@ func VerifyBLSSignatureOneMessage( func VerifyBLSSignatureManyMessages( pks []PublicKey, s Signature, messages [][]byte, kmac []hash.Hasher, ) (bool, error) { - // set BLS context - blsInstance.reInit() // check signature length - if len(s) != signatureLengthBLSBLS12381 { + if len(s) != SignatureLenBLSBLS12381 { return false, nil } // check the list lengths @@ -363,13 +341,13 @@ func VerifyBLSSignatureManyMessages( // The comparison of the maps length minimizes the number of pairings to // compute by aggregating either public keys or the message hashes in // the verification equation. - mapPerHash := make(map[string][]pointG2) - mapPerPk := make(map[pointG2][][]byte) + mapPerHash := make(map[string][]pointE2) + mapPerPk := make(map[pointE2][][]byte) // Note: mapPerPk is using a cgo structure as map keys which may lead to 2 equal public keys // being considered distinct. This does not make the verification equation wrong but leads to // computing extra pairings. This case is considered unlikely to happen since a caller is likely // to use the same struct for a same public key. - // One way to fix this is to use the public key encoding as the map keys and store the "pointG2" + // One way to fix this is to use the public key encoding as the map keys and store the "pointE2" // structure with the map value, which adds more complexity and processing time. // fill the 2 maps @@ -397,7 +375,7 @@ func VerifyBLSSignatureManyMessages( flatDistinctHashes := make([]byte, 0) lenHashes := make([]uint32, 0) pkPerHash := make([]uint32, 0, len(mapPerHash)) - allPks := make([]pointG2, 0) + allPks := make([]pointE2, 0) for hash, pksVal := range mapPerHash { flatDistinctHashes = append(flatDistinctHashes, []byte(hash)...) 
lenHashes = append(lenHashes, uint32(len([]byte(hash)))) @@ -410,13 +388,13 @@ func VerifyBLSSignatureManyMessages( (*C.uchar)(&flatDistinctHashes[0]), (*C.uint32_t)(&lenHashes[0]), (*C.uint32_t)(&pkPerHash[0]), - (*C.ep2_st)(&allPks[0]), + (*C.E2)(&allPks[0]), ) } else { // aggregate hashes per distinct key // using the linearity of the pairing on the G1 variables. - distinctPks := make([]pointG2, 0, len(mapPerPk)) + distinctPks := make([]pointE2, 0, len(mapPerPk)) hashPerPk := make([]uint32, 0, len(mapPerPk)) flatHashes := make([]byte, 0) lenHashes := make([]uint32, 0) @@ -432,7 +410,7 @@ func VerifyBLSSignatureManyMessages( verif = C.bls_verifyPerDistinctKey( (*C.uchar)(&s[0]), (C.int)(len(mapPerPk)), - (*C.ep2_st)(&distinctPks[0]), + (*C.E2)(&distinctPks[0]), (*C.uint32_t)(&hashPerPk[0]), (*C.uchar)(&flatHashes[0]), (*C.uint32_t)(&lenHashes[0])) @@ -482,9 +460,6 @@ func VerifyBLSSignatureManyMessages( func BatchVerifyBLSSignaturesOneMessage( pks []PublicKey, sigs []Signature, message []byte, kmac hash.Hasher, ) ([]bool, error) { - // set BLS context - blsInstance.reInit() - // boolean array returned when errors occur falseSlice := make([]bool, len(sigs)) @@ -505,10 +480,10 @@ func BatchVerifyBLSSignaturesOneMessage( } // flatten the shares (required by the C layer) - flatSigs := make([]byte, 0, signatureLengthBLSBLS12381*len(sigs)) - pkPoints := make([]pointG2, 0, len(pks)) + flatSigs := make([]byte, 0, SignatureLenBLSBLS12381*len(sigs)) + pkPoints := make([]pointE2, 0, len(pks)) - getIdentityPoint := func() pointG2 { + getIdentityPoint := func() pointE2 { pk, _ := IdentityBLSPublicKey().(*pubKeyBLSBLS12381) // second value is guaranteed to be true return pk.point } @@ -520,13 +495,13 @@ func BatchVerifyBLSSignaturesOneMessage( return falseSlice, fmt.Errorf("key at index %d is invalid: %w", i, notBLSKeyError) } - if len(sigs[i]) != signatureLengthBLSBLS12381 || pkBLS.isIdentity { + if len(sigs[i]) != SignatureLenBLSBLS12381 || pkBLS.isIdentity { // case of invalid signature: set the signature and public key at index `i` // to identities so that there is no effect on the aggregation tree computation. // However, the boolean return for index `i` is set to `false` and won't be overwritten. returnBool[i] = false pkPoints = append(pkPoints, getIdentityPoint()) - flatSigs = append(flatSigs, identityBLSSignature...) + flatSigs = append(flatSigs, g1Serialization...) } else { returnBool[i] = true // default to true pkPoints = append(pkPoints, pkBLS.point) @@ -537,14 +512,22 @@ func BatchVerifyBLSSignaturesOneMessage( // hash the input to 128 bytes h := kmac.ComputeHash(message) verifInt := make([]byte, len(sigs)) + // internal non-determministic entropy source required by bls_batch_verify + // specific length of the seed is required by bls_batch_verify. 
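From a caller's point of view, the seeding and signature flattening above are internal details; batch verification is one call returning a boolean per signature, with invalid entries reported individually instead of failing the whole batch. A hedged usage sketch (key generation assumed as before; the corrupted entry relies on `BLSInvalidSignature` producing bytes that fail deserialization):

```go
package main

import (
	"crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto"
)

func main() {
	message := []byte("batched message")
	kmac := crypto.NewExpandMsgXOFKMAC128("batch example tag")

	n := 4
	pks := make([]crypto.PublicKey, 0, n)
	sigs := make([]crypto.Signature, 0, n)
	for i := 0; i < n; i++ {
		seed := make([]byte, 64) // assumed to satisfy the library's seed length requirements
		if _, err := rand.Read(seed); err != nil {
			panic(err)
		}
		sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed)
		if err != nil {
			panic(err)
		}
		sig, err := sk.Sign(message, kmac)
		if err != nil {
			panic(err)
		}
		pks = append(pks, sk.PublicKey())
		sigs = append(sigs, sig)
	}

	// corrupt one signature so the batch contains a single invalid entry
	sigs[2] = crypto.BLSInvalidSignature()

	results, err := crypto.BatchVerifyBLSSignaturesOneMessage(pks, sigs, message, kmac)
	fmt.Println(results, err) // expected: [true true false true] <nil>
}
```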
+ seed := make([]byte, (securityBits/8)*len(verifInt)) + _, err := rand.Read(seed) + if err != nil { + return falseSlice, fmt.Errorf("generating randoms failed: %w", err) + } - C.bls_batchVerify( + C.bls_batch_verify( (C.int)(len(verifInt)), (*C.uchar)(&verifInt[0]), - (*C.ep2_st)(&pkPoints[0]), + (*C.E2)(&pkPoints[0]), (*C.uchar)(&flatSigs[0]), (*C.uchar)(&h[0]), (C.int)(len(h)), + (*C.uchar)(&seed[0]), ) for i, v := range verifInt { diff --git a/crypto/bls_no_relic.go b/crypto/bls_no_relic.go deleted file mode 100644 index fed6c216398..00000000000 --- a/crypto/bls_no_relic.go +++ /dev/null @@ -1,156 +0,0 @@ -//go:build !relic -// +build !relic - -package crypto - -import ( - "github.com/onflow/flow-go/crypto/hash" -) - -// The functions below are the non-Relic versions of the public APIs -// requiring the Relic library. -// All BLS functionalities in the package require the Relic dependency, -// and therefore the "relic" build tag. -// Building without the "relic" tag is successful, but and calling one of the -// BLS functions results in a runtime panic. This allows projects depending on the -// crypto library to build successfully with or without the "relic" tag. - -const relic_panic = "function is not supported when building without \"relic\" Go build tag" - -const ( - SignatureLenBLSBLS12381 = 48 -) - -// bls.go functions -func NewExpandMsgXOFKMAC128(tag string) hash.Hasher { - panic(relic_panic) -} - -func BLSInvalidSignature() Signature { - panic(relic_panic) -} - -// bls_multisig.go functions -func BLSGeneratePOP(sk PrivateKey) (Signature, error) { - panic(relic_panic) -} - -func BLSVerifyPOP(pk PublicKey, s Signature) (bool, error) { - panic(relic_panic) -} - -func AggregateBLSSignatures(sigs []Signature) (Signature, error) { - panic(relic_panic) -} - -func AggregateBLSPrivateKeys(keys []PrivateKey) (PrivateKey, error) { - panic(relic_panic) -} - -func AggregateBLSPublicKeys(keys []PublicKey) (PublicKey, error) { - panic(relic_panic) -} - -func IdentityBLSPublicKey() PublicKey { - panic(relic_panic) -} - -func IsBLSAggregateEmptyListError(err error) bool { - panic(relic_panic) -} - -func IsInvalidSignatureError(err error) bool { - panic(relic_panic) -} - -func IsNotBLSKeyError(err error) bool { - panic(relic_panic) -} - -func IsBLSSignatureIdentity(s Signature) bool { - panic(relic_panic) -} - -func RemoveBLSPublicKeys(aggKey PublicKey, keysToRemove []PublicKey) (PublicKey, error) { - panic(relic_panic) -} - -func VerifyBLSSignatureOneMessage(pks []PublicKey, s Signature, - message []byte, kmac hash.Hasher) (bool, error) { - panic(relic_panic) -} - -func VerifyBLSSignatureManyMessages(pks []PublicKey, s Signature, - messages [][]byte, kmac []hash.Hasher) (bool, error) { - panic(relic_panic) -} - -func BatchVerifyBLSSignaturesOneMessage(pks []PublicKey, sigs []Signature, - message []byte, kmac hash.Hasher) ([]bool, error) { - panic(relic_panic) -} - -func SPOCKProve(sk PrivateKey, data []byte, kmac hash.Hasher) (Signature, error) { - panic(relic_panic) -} - -func SPOCKVerifyAgainstData(pk PublicKey, proof Signature, data []byte, kmac hash.Hasher) (bool, error) { - panic(relic_panic) -} - -func SPOCKVerify(pk1 PublicKey, proof1 Signature, pk2 PublicKey, proof2 Signature) (bool, error) { - panic(relic_panic) -} - -// bls_threshold.go functions -func NewBLSThresholdSignatureParticipant( - groupPublicKey PublicKey, - sharePublicKeys []PublicKey, - threshold int, - myIndex int, - myPrivateKey PrivateKey, - message []byte, - dsTag string, -) (ThresholdSignatureParticipant, error) { - 
panic(relic_panic) -} - -func NewBLSThresholdSignatureInspector( - groupPublicKey PublicKey, - sharePublicKeys []PublicKey, - threshold int, - message []byte, - dsTag string, -) (ThresholdSignatureInspector, error) { - panic(relic_panic) -} - -func BLSReconstructThresholdSignature(size int, threshold int, - shares []Signature, signers []int) (Signature, error) { - panic(relic_panic) -} - -func EnoughShares(threshold int, sharesNumber int) (bool, error) { - panic(relic_panic) -} - -func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, - []PublicKey, PublicKey, error) { - panic(relic_panic) -} - -// dkg.go functions -func NewFeldmanVSS(size int, threshold int, myIndex int, - processor DKGProcessor, dealerIndex int) (DKGState, error) { - panic(relic_panic) -} - -func NewFeldmanVSSQual(size int, threshold int, myIndex int, - processor DKGProcessor, dealerIndex int) (DKGState, error) { - panic(relic_panic) -} - -func NewJointFeldman(size int, threshold int, myIndex int, - processor DKGProcessor) (DKGState, error) { - panic(relic_panic) -} diff --git a/crypto/bls_no_relic_test.go b/crypto/bls_no_relic_test.go deleted file mode 100644 index 47f8120060f..00000000000 --- a/crypto/bls_no_relic_test.go +++ /dev/null @@ -1,42 +0,0 @@ -//go:build !relic -// +build !relic - -package crypto - -import ( - "testing" - - "github.com/stretchr/testify/assert" -) - -// Test for all public APIs requiring relic build tag. -// These functions should panic if build without the relic tag. -func TestNoRelicPanic(t *testing.T) { - assert.PanicsWithValue(t, relic_panic, func() { NewExpandMsgXOFKMAC128("") }) - assert.PanicsWithValue(t, relic_panic, func() { BLSInvalidSignature() }) - assert.PanicsWithValue(t, relic_panic, func() { BLSGeneratePOP(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { BLSVerifyPOP(nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { AggregateBLSSignatures(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { AggregateBLSPrivateKeys(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { AggregateBLSPublicKeys(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { IdentityBLSPublicKey() }) - assert.PanicsWithValue(t, relic_panic, func() { IsBLSAggregateEmptyListError(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { IsInvalidSignatureError(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { IsNotBLSKeyError(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { IsBLSSignatureIdentity(nil) }) - assert.PanicsWithValue(t, relic_panic, func() { RemoveBLSPublicKeys(nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { VerifyBLSSignatureOneMessage(nil, nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { VerifyBLSSignatureManyMessages(nil, nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { BatchVerifyBLSSignaturesOneMessage(nil, nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { SPOCKProve(nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { SPOCKVerify(nil, nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { SPOCKVerifyAgainstData(nil, nil, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { NewBLSThresholdSignatureParticipant(nil, nil, 0, 0, nil, nil, "") }) - assert.PanicsWithValue(t, relic_panic, func() { NewBLSThresholdSignatureInspector(nil, nil, 0, nil, "") }) - assert.PanicsWithValue(t, relic_panic, func() { BLSReconstructThresholdSignature(0, 0, nil, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { EnoughShares(0, 0) 
}) - assert.PanicsWithValue(t, relic_panic, func() { BLSThresholdKeyGen(0, 0, nil) }) - assert.PanicsWithValue(t, relic_panic, func() { NewFeldmanVSS(0, 0, 0, nil, 0) }) - assert.PanicsWithValue(t, relic_panic, func() { NewFeldmanVSSQual(0, 0, 0, nil, 0) }) - assert.PanicsWithValue(t, relic_panic, func() { NewJointFeldman(0, 0, 0, nil) }) -} diff --git a/crypto/bls_test.go b/crypto/bls_test.go index c967546f640..4fa02958496 100644 --- a/crypto/bls_test.go +++ b/crypto/bls_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto import ( @@ -30,7 +27,11 @@ func TestBLSMainMethods(t *testing.T) { // This test checks that: // - signature decoding handles input x-coordinates larger than p (doesn't result in an exception) // - signature decoding only accepts reduced x-coordinates to avoid signature malleability + t.Run("invalid x coordinate larger than p", func(t *testing.T) { + if !isG1Compressed() || !isG2Compressed() { + t.Skip() + } msg, err := hex.DecodeString("7f26ba692dc2da7ff828ef4675ff1cd6ab855fca0637b6dab295f1df8e51bc8bb1b8f0c6610aabd486cf1f098f2ddbc6691d94e10f928816f890a3d366ce46249836a595c7ea1828af52e899ba2ab627ab667113bb563918c5d5a787c414399487b4e3a7") require.NoError(t, err) validSig, err := hex.DecodeString("80b0cac2a0f4f8881913edf2b29065675dfed6f6f4e17e9b5d860a845d4e7d476b277d06a493b81482e63d8131f9f2fa") @@ -74,8 +75,7 @@ func TestBLSMainMethods(t *testing.T) { // test a valid signature result, err := pk.Verify(s, input, hasher) assert.NoError(t, err) - assert.True(t, result, - "Verification should succeed:\n signature:%s\n message:%x\n private key:%s", s, input, sk) + assert.True(t, result) } }) } @@ -187,26 +187,35 @@ func TestBLSEncodeDecode(t *testing.T) { // specific tests for BLS // zero private key - skBytes := make([]byte, PrKeyLenBLSBLS12381) - sk, err := DecodePrivateKey(BLSBLS12381, skBytes) - require.Error(t, err, "decoding identity private key should fail") - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, sk) + t.Run("zero private key", func(t *testing.T) { + skBytes := make([]byte, PrKeyLenBLSBLS12381) + sk, err := DecodePrivateKey(BLSBLS12381, skBytes) + require.Error(t, err, "decoding identity private key should fail") + assert.True(t, IsInvalidInputsError(err)) + assert.Nil(t, sk) + }) // identity public key - pkBytes := make([]byte, PubKeyLenBLSBLS12381) - pkBytes[0] = infinityPointHeader - pk, err := DecodePublicKey(BLSBLS12381, pkBytes) - require.NoError(t, err, "decoding identity public key should succeed") - assert.True(t, pk.Equals(IdentityBLSPublicKey())) + t.Run("infinity public key", func(t *testing.T) { + // decode an identity public key + pkBytes := make([]byte, PubKeyLenBLSBLS12381) + pkBytes[0] = g2SerHeader + pk, err := DecodePublicKey(BLSBLS12381, pkBytes) + require.NoError(t, err, "decoding identity public key should succeed") + assert.True(t, pk.Equals(IdentityBLSPublicKey())) + // encode an identity public key + assert.Equal(t, pk.Encode(), pkBytes) + }) // invalid point - pkBytes = make([]byte, PubKeyLenBLSBLS12381) - pkBytes[0] = invalidBLSSignatureHeader - pk, err = DecodePublicKey(BLSBLS12381, pkBytes) - require.Error(t, err, "the key decoding should fail - key value is invalid") - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, pk) + t.Run("invalid public key", func(t *testing.T) { + pkBytes := make([]byte, PubKeyLenBLSBLS12381) + pkBytes[0] = invalidBLSSignatureHeader + pk, err := DecodePublicKey(BLSBLS12381, pkBytes) + require.Error(t, err, "the key decoding should fail - key value is invalid") + 
assert.True(t, IsInvalidInputsError(err)) + assert.Nil(t, pk) + }) // Test a public key serialization with a point encoded with a coordinate x with // x[0] or x[1] not reduced mod p. @@ -217,21 +226,26 @@ func TestBLSEncodeDecode(t *testing.T) { // Although uniqueness of public key respresentation isn't a security property, some implementations // may implicitely rely on the property. - // valid pk with x[0] < p and x[1] < p - validPk, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b8038360809700d36d761cb266af6babe9a069dc7364d3502e84536bd893d5f09ec2dd4f07cae1f8a178ffacc450f9b9a2") - require.NoError(t, err) - _, err = DecodePublicKey(BLSBLS12381, validPk) - assert.NoError(t, err) - // invalidpk1 with x[0]+p and same x[1] - invalidPk1, err := hex.DecodeString("9B8E840277BE772540D913E47A94F94C00003BBE60C4CEEB0C0ABCC9E876034089000EC7AF5AB6D81AF62EC9363D5E63038360809700d36d761cb266af6babe9a069dc7364d3502e84536bd893d5f09ec2dd4f07cae1f8a178ffacc450f9b9a2") - require.NoError(t, err) - _, err = DecodePublicKey(BLSBLS12381, invalidPk1) - assert.Error(t, err) - // invalidpk1 with same x[0] and x[1]+p - invalidPk2, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b81D84726AD080BA07C1385A1CF2B758C104E127F8585862EDEB843E798A86E6C2E1894F067C35F8A132FEACC450F9644D") - require.NoError(t, err) - _, err = DecodePublicKey(BLSBLS12381, invalidPk2) - assert.Error(t, err) + t.Run("public key with non-reduced coordinates", func(t *testing.T) { + if !isG2Compressed() { + t.Skip() + } + // valid pk with x[0] < p and x[1] < p + validPk, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b8038360809700d36d761cb266af6babe9a069dc7364d3502e84536bd893d5f09ec2dd4f07cae1f8a178ffacc450f9b9a2") + require.NoError(t, err) + _, err = DecodePublicKey(BLSBLS12381, validPk) + assert.NoError(t, err) + // invalidpk1 with x[0]+p and same x[1] + invalidPk1, err := hex.DecodeString("9B8E840277BE772540D913E47A94F94C00003BBE60C4CEEB0C0ABCC9E876034089000EC7AF5AB6D81AF62EC9363D5E63038360809700d36d761cb266af6babe9a069dc7364d3502e84536bd893d5f09ec2dd4f07cae1f8a178ffacc450f9b9a2") + require.NoError(t, err) + _, err = DecodePublicKey(BLSBLS12381, invalidPk1) + assert.Error(t, err) + // invalidpk1 with same x[0] and x[1]+p + invalidPk2, err := hex.DecodeString("818d72183e3e908af5bd6c2e37494c749b88f0396d3fbc2ba4d9ea28f1c50d1c6a540ec8fe06b6d860f72ec9363db3b81D84726AD080BA07C1385A1CF2B758C104E127F8585862EDEB843E798A86E6C2E1894F067C35F8A132FEACC450F9644D") + require.NoError(t, err) + _, err = DecodePublicKey(BLSBLS12381, invalidPk2) + assert.Error(t, err) + }) } // TestBLSEquals tests equal for BLS keys @@ -273,7 +287,7 @@ func TestBLSPOP(t *testing.T) { // test a valid PoP result, err := BLSVerifyPOP(pk, s) require.NoError(t, err) - assert.True(t, result, "Verification should succeed:\n signature:%s\n private key:%s", s, sk) + assert.True(t, result) // test with a valid but different key seed[0] ^= 1 @@ -281,7 +295,7 @@ func TestBLSPOP(t *testing.T) { require.NoError(t, err) result, err = BLSVerifyPOP(wrongSk.PublicKey(), s) require.NoError(t, err) - assert.False(t, result, "Verification should fail:\n signature:%s\n private key:%s", s, sk) + assert.False(t, result) } }) @@ -300,7 +314,7 @@ func TestBLSPOP(t *testing.T) { } // BLS multi-signature -// signature aggregation sanity check +// signature aggregation with the same message sanity 
check // // Aggregate n signatures of the same message under different keys, and compare // it against the signature of the message under an aggregated private key. @@ -315,7 +329,7 @@ func TestBLSAggregateSignatures(t *testing.T) { // hasher kmac := NewExpandMsgXOFKMAC128("test tag") // number of signatures to aggregate - sigsNum := mrand.Intn(100) + 1 + sigsNum := rand.Intn(100) + 1 sigs := make([]Signature, 0, sigsNum) sks := make([]PrivateKey, 0, sigsNum) pks := make([]PublicKey, 0, sigsNum) @@ -342,40 +356,34 @@ func TestBLSAggregateSignatures(t *testing.T) { aggSig, err := AggregateBLSSignatures(sigs) require.NoError(t, err) // First check: check the signatures are equal - assert.Equal(t, aggSig, expectedSig, - "incorrect signature %s, should be %s, private keys are %s, input is %x", - aggSig, expectedSig, sks, input) + assert.Equal(t, aggSig, expectedSig) // Second check: Verify the aggregated signature valid, err := VerifyBLSSignatureOneMessage(pks, aggSig, input, kmac) require.NoError(t, err) - assert.True(t, valid, - "Verification of %s failed, signature should be %s private keys are %s, input is %x", - aggSig, expectedSig, sks, input) + assert.True(t, valid) }) // check if one signature is not correct t.Run("one invalid signature", func(t *testing.T) { input[0] ^= 1 - randomIndex := mrand.Intn(sigsNum) - sigs[randomIndex], err = sks[randomIndex].Sign(input, kmac) + randomIndex := rand.Intn(sigsNum) + sigs[randomIndex], err = sks[randomIndex].Sign(input, kmac) // sign a different message input[0] ^= 1 aggSig, err = AggregateBLSSignatures(sigs) require.NoError(t, err) - assert.NotEqual(t, aggSig, expectedSig, - "signature %s shouldn't be %s private keys are %s, input is %x", - aggSig, expectedSig, sks, input) + // First check: check the signatures are not equal + assert.NotEqual(t, aggSig, expectedSig) + // Second check: multi-verification should fail valid, err := VerifyBLSSignatureOneMessage(pks, aggSig, input, kmac) require.NoError(t, err) - assert.False(t, valid, - "verification of signature %s should fail, it shouldn't be %s private keys are %s, input is %x", - aggSig, expectedSig, sks, input) - sigs[randomIndex], err = sks[randomIndex].Sign(input, kmac) + assert.False(t, valid) + sigs[randomIndex], err = sks[randomIndex].Sign(input, kmac) // rebuild the correct signature require.NoError(t, err) }) // check if one the public keys is not correct t.Run("one invalid public key", func(t *testing.T) { - randomIndex := mrand.Intn(sigsNum) + randomIndex := rand.Intn(sigsNum) newSk := randomSK(t, rand) sks[randomIndex] = newSk pks[randomIndex] = newSk.PublicKey() @@ -383,14 +391,10 @@ func TestBLSAggregateSignatures(t *testing.T) { require.NoError(t, err) expectedSig, err = aggSk.Sign(input, kmac) require.NoError(t, err) - assert.NotEqual(t, aggSig, expectedSig, - "signature %s shouldn't be %s, private keys are %s, input is %x, wrong key is of index %d", - aggSig, expectedSig, sks, input, randomIndex) + assert.NotEqual(t, aggSig, expectedSig) valid, err := VerifyBLSSignatureOneMessage(pks, aggSig, input, kmac) require.NoError(t, err) - assert.False(t, valid, - "signature %s should fail, shouldn't be %s, private keys are %s, input is %x, wrong key is of index %d", - aggSig, expectedSig, sks, input, randomIndex) + assert.False(t, valid) }) t.Run("invalid inputs", func(t *testing.T) { @@ -407,7 +411,7 @@ func TestBLSAggregateSignatures(t *testing.T) { assert.False(t, result) // test with a signature of a wrong length - shortSig := sigs[0][:signatureLengthBLSBLS12381-1] + shortSig := 
sigs[0][:SignatureLenBLSBLS12381-1] aggSig, err = AggregateBLSSignatures([]Signature{shortSig}) assert.Error(t, err) assert.True(t, IsInvalidSignatureError(err)) @@ -441,10 +445,10 @@ func TestBLSAggregateSignatures(t *testing.T) { // Aggregate n public keys and their respective private keys and compare // the public key of the aggregated private key is equal to the aggregated // public key -func TestBLSAggregatePubKeys(t *testing.T) { +func TestBLSAggregatePublicKeys(t *testing.T) { rand := getPRG(t) // number of keys to aggregate - pkNum := mrand.Intn(100) + 1 + pkNum := rand.Intn(100) + 1 pks := make([]PublicKey, 0, pkNum) sks := make([]PrivateKey, 0, pkNum) @@ -490,9 +494,7 @@ func TestBLSAggregatePubKeys(t *testing.T) { keys := []PublicKey{pks[0], IdentityBLSPublicKey()} aggPkWithIdentity, err := AggregateBLSPublicKeys(keys) assert.NoError(t, err) - assert.True(t, aggPkWithIdentity.Equals(pks[0]), - "incorrect public key %s, should be %s", - aggPkWithIdentity, pks[0]) + assert.True(t, aggPkWithIdentity.Equals(pks[0])) }) t.Run("invalid inputs", func(t *testing.T) { @@ -512,8 +514,8 @@ func TestBLSAggregatePubKeys(t *testing.T) { // check that the public key corresponding to the zero private key is indeed identity // The package doesn't allow to generate a zero private key. One way to obtain a zero - // private key is via aggrgeting opposite private keys - t.Run("public key of zero private key", func(t *testing.T) { + // private key is via aggregating opposite private keys + t.Run("Identity public key from identity private key", func(t *testing.T) { // sk1 is group order of bls12-381 minus one groupOrderMinus1 := []byte{0x73, 0xED, 0xA7, 0x53, 0x29, 0x9D, 0x7D, 0x48, 0x33, 0x39, 0xD8, 0x08, 0x09, 0xA1, 0xD8, 0x05, 0x53, 0xBD, 0xA4, 0x02, 0xFF, 0xFE, @@ -525,9 +527,42 @@ func TestBLSAggregatePubKeys(t *testing.T) { one[PrKeyLenBLSBLS12381-1] = 1 sk2, err := DecodePrivateKey(BLSBLS12381, one) require.NoError(t, err) + // public key of aggregated private keys aggSK, err := AggregateBLSPrivateKeys([]PrivateKey{sk1, sk2}) require.NoError(t, err) assert.True(t, aggSK.PublicKey().Equals(IdentityBLSPublicKey())) + // aggregated public keys + aggPK, err := AggregateBLSPublicKeys([]PublicKey{sk1.PublicKey(), sk2.PublicKey()}) + require.NoError(t, err) + assert.True(t, aggPK.Equals(IdentityBLSPublicKey())) + // check of internal identity flag + blsKey, ok := aggPK.(*pubKeyBLSBLS12381) + require.True(t, ok) + assert.True(t, blsKey.isIdentity) + // check of encoding header + pkBytes := aggPK.Encode() + assert.Equal(t, g2SerHeader, pkBytes[0]) + }) + + t.Run("Identity public key from opposite points", func(t *testing.T) { + if !isG2Compressed() { + t.Skip() + } + pkBytes := pks[0].Encode() + negateCompressedPoint(pkBytes) + minusPk, err := DecodePublicKey(BLSBLS12381, pkBytes) + require.NoError(t, err) + // aggregated public keys + aggPK, err := AggregateBLSPublicKeys([]PublicKey{pks[0], minusPk}) + require.NoError(t, err) + assert.True(t, aggPK.Equals(IdentityBLSPublicKey())) + // check of internal identity flag + blsKey, ok := aggPK.(*pubKeyBLSBLS12381) + require.True(t, ok) + assert.True(t, blsKey.isIdentity) + // check of encoding header + pkBytes = aggPK.Encode() + assert.Equal(t, g2SerHeader, pkBytes[0]) }) } @@ -536,7 +571,7 @@ func TestBLSAggregatePubKeys(t *testing.T) { func TestBLSRemovePubKeys(t *testing.T) { rand := getPRG(t) // number of keys to aggregate - pkNum := mrand.Intn(100) + 1 + pkNum := rand.Intn(100) + 1 pks := make([]PublicKey, 0, pkNum) // generate public keys @@ -549,7 
+584,7 @@ func TestBLSRemovePubKeys(t *testing.T) { require.NoError(t, err) // random number of keys to remove (at least one key is left) - pkToRemoveNum := mrand.Intn(pkNum) + pkToRemoveNum := rand.Intn(pkNum) expectedPatrialPk, err := AggregateBLSPublicKeys(pks[pkToRemoveNum:]) require.NoError(t, err) @@ -561,9 +596,7 @@ func TestBLSRemovePubKeys(t *testing.T) { BLSkey, ok := expectedPatrialPk.(*pubKeyBLSBLS12381) require.True(t, ok) - assert.True(t, BLSkey.Equals(partialPk), - "incorrect key %s, should be %s, keys are %s, index is %d", - partialPk, BLSkey, pks, pkToRemoveNum) + assert.True(t, BLSkey.Equals(partialPk)) }) // remove an extra key and check inequality @@ -574,9 +607,7 @@ func TestBLSRemovePubKeys(t *testing.T) { BLSkey, ok := expectedPatrialPk.(*pubKeyBLSBLS12381) require.True(t, ok) - assert.False(t, BLSkey.Equals(partialPk), - "incorrect key %s, should not be %s, keys are %s, index is %d, extra key is %s", - partialPk, BLSkey, pks, pkToRemoveNum, extraPk) + assert.False(t, BLSkey.Equals(partialPk)) }) // specific test to remove all keys @@ -591,9 +622,7 @@ func TestBLSRemovePubKeys(t *testing.T) { BLSRandomPk, ok := randomPk.(*pubKeyBLSBLS12381) require.True(t, ok) - assert.True(t, BLSRandomPk.Equals(randomPkPlusIdentityPk), - "incorrect key %s, should be infinity point, keys are %s", - identityPk, pks) + assert.True(t, BLSRandomPk.Equals(randomPkPlusIdentityPk)) }) // specific test with an empty slice of keys to remove @@ -604,9 +633,7 @@ func TestBLSRemovePubKeys(t *testing.T) { aggBLSkey, ok := aggPk.(*pubKeyBLSBLS12381) require.True(t, ok) - assert.True(t, aggBLSkey.Equals(partialPk), - "incorrect key %s, should be %s", - partialPk, aggBLSkey) + assert.True(t, aggBLSkey.Equals(partialPk)) }) t.Run("invalid inputs", func(t *testing.T) { @@ -640,7 +667,6 @@ func TestBLSBatchVerify(t *testing.T) { // number of signatures to aggregate sigsNum := rand.Intn(100) + 2 sigs := make([]Signature, 0, sigsNum) - sks := make([]PrivateKey, 0, sigsNum) pks := make([]PublicKey, 0, sigsNum) expectedValid := make([]bool, 0, sigsNum) @@ -650,7 +676,6 @@ func TestBLSBatchVerify(t *testing.T) { s, err := sk.Sign(input, kmac) require.NoError(t, err) sigs = append(sigs, s) - sks = append(sks, sk) pks = append(pks, sk.PublicKey()) expectedValid = append(expectedValid, true) } @@ -659,9 +684,26 @@ func TestBLSBatchVerify(t *testing.T) { t.Run("all signatures are valid", func(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) require.NoError(t, err) - assert.Equal(t, valid, expectedValid, - "Verification of %s failed, private keys are %s, input is %x, results is %v", - sigs, sks, input, valid) + assert.Equal(t, valid, expectedValid) + }) + + // valid signatures but indices aren't correct: sig[i] is correct under pks[j] + // and sig[j] is correct under pks[j]. + // implementations simply aggregating all signatures and keys would fail this test. 
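One way to see why the swap in the sub-test below must be caught: for a single message m, plain aggregation only checks the relation

    e(sig_1 + ... + sig_n, g2) == e(H(m), pk_1 + ... + pk_n)

and both sums are invariant under swapping pk_i and pk_j, so a verifier that merely aggregates everything would still report every pair as valid. A batch verifier that pinpoints invalid pairs has to bind each signature to its own key, for instance by weighting each pair with an independent random scalar r_i and checking

    e(r_1*sig_1 + ... + r_n*sig_n, g2) == e(H(m), r_1*pk_1 + ... + r_n*pk_n)

which fails with overwhelming probability whenever any pair is mismatched. This is a generic argument; the batching strategy actually used by the C layer is not shown in this hunk.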
+ t.Run("valid signatures with incorrect indices", func(t *testing.T) { + i := rand.Intn(sigsNum-1) + 1 + j := rand.Intn(i) + // swap correct keys + pks[i], pks[j] = pks[j], pks[i] + + valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) + require.NoError(t, err) + expectedValid[i], expectedValid[j] = false, false + assert.Equal(t, valid, expectedValid) + + // restore keys + pks[i], pks[j] = pks[j], pks[i] + expectedValid[i], expectedValid[j] = true, true }) // valid signatures but indices aren't correct: sig[i] is correct under pks[j] @@ -676,9 +718,7 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) require.NoError(t, err) expectedValid[i], expectedValid[j] = false, false - assert.Equal(t, valid, expectedValid, - "Verification of %s failed, private keys are %s, input is %x, results is %v", - sigs, sks, input, valid) + assert.Equal(t, valid, expectedValid) // restore keys pks[i], pks[j] = pks[j], pks[i] @@ -689,9 +729,7 @@ func TestBLSBatchVerify(t *testing.T) { t.Run("one valid signature", func(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:1], sigs[:1], input, kmac) require.NoError(t, err) - assert.Equal(t, valid, expectedValid[:1], - "Verification of %s failed, private keys are %s, input is %x, results is %v", - sigs, sks, input, valid) + assert.Equal(t, expectedValid[:1], valid) }) // pick a random number of invalid signatures @@ -715,9 +753,7 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) require.NoError(t, err) - assert.Equal(t, expectedValid, valid, - "Verification of %s failed\n private keys are %s\n input is %x\n results is %v", - sigs, sks, input, valid) + assert.Equal(t, expectedValid, valid) }) // all signatures are invalid @@ -732,9 +768,7 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks, sigs, input, kmac) require.NoError(t, err) - assert.Equal(t, valid, expectedValid, - "Verification of %s failed, private keys are %s, input is %x, results is %v", - sigs, sks, input, valid) + assert.Equal(t, valid, expectedValid) }) // test the empty list case @@ -742,8 +776,7 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:0], sigs[:0], input, kmac) require.Error(t, err) assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.Equal(t, valid, expectedValid[:0], - "verification should fail with empty list key, got %v", valid) + assert.Equal(t, valid, expectedValid[:0]) }) // test incorrect inputs @@ -754,8 +787,7 @@ func TestBLSBatchVerify(t *testing.T) { valid, err := BatchVerifyBLSSignaturesOneMessage(pks[:len(pks)-1], sigs, input, kmac) require.Error(t, err) assert.True(t, IsInvalidInputsError(err)) - assert.Equal(t, valid, expectedValid, - "verification should fail with incorrect input lenghts, got %v", valid) + assert.Equal(t, valid, expectedValid) }) // test wrong hasher @@ -767,8 +799,7 @@ func TestBLSBatchVerify(t *testing.T) { require.Error(t, err) assert.True(t, IsNilHasherError(err)) - assert.Equal(t, valid, expectedValid, - "verification should fail with nil hasher, got %v", valid) + assert.Equal(t, valid, expectedValid) }) // test wrong key @@ -781,11 +812,17 @@ func TestBLSBatchVerify(t *testing.T) { require.Error(t, err) assert.True(t, IsNotBLSKeyError(err)) - assert.Equal(t, valid, expectedValid, - "verification should fail with invalid key, got %v", valid) + assert.Equal(t, valid, expectedValid) }) } +// 
Utility function that flips a point sign bit to negate the point +// this is shortcut which works only for zcash BLS12-381 compressed serialization. +// Applicable to both signatures and public keys. +func negateCompressedPoint(pointbytes []byte) { + pointbytes[0] ^= 0x20 +} + // alter or fix a signature func alterSignature(s Signature) { // this causes the signature to remain in G1 and be invalid @@ -855,16 +892,15 @@ func BenchmarkBatchVerify(b *testing.B) { // // Aggregate n signatures of distinct messages under different keys, // and verify the aggregated signature using the multi-signature verification with -// many message. +// many messages. func TestBLSAggregateSignaturesManyMessages(t *testing.T) { rand := getPRG(t) - // number of signatures to aggregate - sigsNum := mrand.Intn(20) + 1 + sigsNum := rand.Intn(40) + 1 sigs := make([]Signature, 0, sigsNum) - // number of keys - keysNum := mrand.Intn(sigsNum) + 1 + // number of keys (less than the number of signatures) + keysNum := rand.Intn(sigsNum) + 1 sks := make([]PrivateKey, 0, keysNum) // generate the keys for i := 0; i < keysNum; i++ { @@ -873,7 +909,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { } // number of messages (could be larger or smaller than the number of keys) - msgsNum := mrand.Intn(sigsNum) + 1 + msgsNum := rand.Intn(sigsNum) + 1 messages := make([][20]byte, msgsNum) for i := 0; i < msgsNum; i++ { _, err := rand.Read(messages[i][:]) @@ -888,10 +924,10 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { for i := 0; i < sigsNum; i++ { kmac := NewExpandMsgXOFKMAC128("test tag") // pick a key randomly from the list - skRand := mrand.Intn(keysNum) + skRand := rand.Intn(keysNum) sk := sks[skRand] // pick a message randomly from the list - msgRand := mrand.Intn(msgsNum) + msgRand := rand.Intn(msgsNum) msg := messages[msgRand][:] // generate a signature s, err := sk.Sign(msg, kmac) @@ -912,15 +948,13 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { // Verify the aggregated signature valid, err := VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs, inputKmacs) require.NoError(t, err) - assert.True(t, valid, - "Verification of %s failed, should be valid, private keys are %s, inputs are %x, input public keys are %s", - aggSig, sks, inputMsgs, inputPks) + assert.True(t, valid) }) // check if one of the signatures is not correct t.Run("one signature is invalid", func(t *testing.T) { - randomIndex := mrand.Intn(sigsNum) // pick a random signature - messages[0][0] ^= 1 // make sure the signature is different + randomIndex := rand.Intn(sigsNum) // pick a random signature + messages[0][0] ^= 1 // make sure the signature is different var err error sigs[randomIndex], err = sks[0].Sign(messages[0][:], inputKmacs[0]) require.NoError(t, err) @@ -929,9 +963,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { require.NoError(t, err) valid, err := VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs, inputKmacs) require.NoError(t, err) - assert.False(t, valid, - "Verification of %s should fail, private keys are %s, inputs are %x, input public keys are %s", - aggSig, sks, inputMsgs, inputPks) + assert.False(t, valid) }) // test the empty keys case @@ -939,8 +971,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { valid, err := VerifyBLSSignatureManyMessages(inputPks[:0], aggSig, inputMsgs, inputKmacs) assert.Error(t, err) assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.False(t, valid, - "verification should fail with an empty key list") + 
assert.False(t, valid) }) // test inconsistent input arrays @@ -949,13 +980,13 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { valid, err := VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs[:sigsNum-1], inputKmacs) assert.Error(t, err) assert.True(t, IsInvalidInputsError(err)) - assert.False(t, valid, "verification should fail with inconsistent messages and hashers") + assert.False(t, valid) // empty key list valid, err = VerifyBLSSignatureManyMessages(inputPks[:0], aggSig, inputMsgs, inputKmacs) assert.Error(t, err) assert.True(t, IsBLSAggregateEmptyListError(err)) - assert.False(t, valid, "verification should fail with empty list key") + assert.False(t, valid) // nil hasher tmp := inputKmacs[0] @@ -963,7 +994,7 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { valid, err = VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs, inputKmacs) assert.Error(t, err) assert.True(t, IsNilHasherError(err)) - assert.False(t, valid, "verification should fail with nil hasher") + assert.False(t, valid) inputKmacs[0] = tmp // wrong key @@ -972,9 +1003,48 @@ func TestBLSAggregateSignaturesManyMessages(t *testing.T) { valid, err = VerifyBLSSignatureManyMessages(inputPks, aggSig, inputMsgs, inputKmacs) assert.Error(t, err) assert.True(t, IsNotBLSKeyError(err)) - assert.False(t, valid, "verification should fail with nil hasher") + assert.False(t, valid) inputPks[0] = tmpPK }) + + t.Run("variable number of distinct keys and messages", func(t *testing.T) { + // use a specific PRG for easier reproduction + prg := getPRG(t) + // number of signatures to aggregate + N := 100 + sigs := make([]Signature, 0, N) + msgs := make([][]byte, 0, N) + pks := make([]PublicKey, 0, N) + kmacs := make([]hash.Hasher, 0, N) + kmac := NewExpandMsgXOFKMAC128("test tag") + for i := 0; i < N; i++ { + // distinct message + msg := make([]byte, 20) + msgs = append(msgs, msg) + _, err := prg.Read(msg) + require.NoError(t, err) + // distinct key + sk := randomSK(t, prg) + pks = append(pks, sk.PublicKey()) + // generate a signature + s, err := sk.Sign(msg, kmac) + require.NoError(t, err) + sigs = append(sigs, s) + kmacs = append(kmacs, kmac) + } + + // go through all numbers of couples (msg, key) + for i := 1; i < N; i++ { + // aggregate signatures + var err error + aggSig, err = AggregateBLSSignatures(sigs[:i]) + require.NoError(t, err) + // Verify the aggregated signature + valid, err := VerifyBLSSignatureManyMessages(pks[:i], aggSig, msgs[:i], kmacs[:i]) + require.NoError(t, err, "verification errored with %d couples (msg,key)", i) + assert.True(t, valid, "verification failed with %d couples (msg,key)", i) + } + }) } // TestBLSErrorTypes verifies working of error-type-detecting functions @@ -1111,17 +1181,22 @@ func TestBLSIdentity(t *testing.T) { hasher := NewExpandMsgXOFKMAC128("") t.Run("identity signature comparison", func(t *testing.T) { + if !isG1Compressed() { + t.Skip() + } // verify that constructed identity signatures are recognized as such by IsBLSSignatureIdentity. // construct identity signature by summing (aggregating) a random signature and its inverse. 
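As background for the negation trick used below (assuming the zcash-style compressed encoding, which the isG1Compressed guard above already requires): the first serialized byte packs three flag bits ahead of the x coordinate, 0x80 for compression, 0x40 for the point at infinity, and 0x20 for the sign of y. Flipping 0x20 therefore re-encodes the same x with the opposite y, i.e. the negated point. A minimal Go sketch of how the identity signature is obtained; the helper name is made up for illustration:

    // identityFromSig aggregates a signature with its negation, which sums to
    // the identity (infinity) point, mirroring the steps the test performs by hand.
    func identityFromSig(sig Signature) (Signature, error) {
        opposite := make(Signature, len(sig))
        copy(opposite, sig)
        opposite[0] ^= 0x20 // flip the y-sign bit: compressed encoding of -sig
        return AggregateBLSSignatures([]Signature{sig, opposite})
    }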
- assert.True(t, IsBLSSignatureIdentity(identityBLSSignature)) + + // sanity check to start + assert.True(t, IsBLSSignatureIdentity(g1Serialization)) // sum up a random signature and its inverse to get identity sk := randomSK(t, rand) sig, err := sk.Sign(msg, hasher) require.NoError(t, err) - oppositeSig := make([]byte, signatureLengthBLSBLS12381) + oppositeSig := make([]byte, SignatureLenBLSBLS12381) copy(oppositeSig, sig) - negatePoint(oppositeSig) + negateCompressedPoint(oppositeSig) aggSig, err := AggregateBLSSignatures([]Signature{sig, oppositeSig}) require.NoError(t, err) assert.True(t, IsBLSSignatureIdentity(aggSig)) diff --git a/crypto/bls_thresholdsign.go b/crypto/bls_thresholdsign.go index 4256af84ab9..412f06f962a 100644 --- a/crypto/bls_thresholdsign.go +++ b/crypto/bls_thresholdsign.go @@ -1,9 +1,5 @@ -//go:build relic -// +build relic - package crypto -// #cgo CFLAGS: -g -Wall -std=c99 // #include "bls_thresholdsign_include.h" import "C" @@ -46,6 +42,8 @@ type blsThresholdSignatureParticipant struct { myPrivateKey PrivateKey } +var _ ThresholdSignatureParticipant = (*blsThresholdSignatureParticipant)(nil) + // blsThresholdSignatureInspector implements ThresholdSignatureInspector // based on the BLS signature scheme type blsThresholdSignatureInspector struct { @@ -72,6 +70,8 @@ type blsThresholdSignatureInspector struct { lock sync.RWMutex } +var _ ThresholdSignatureInspector = (*blsThresholdSignatureInspector)(nil) + // NewBLSThresholdSignatureParticipant creates a new instance of Threshold signature Participant using BLS. // A participant is able to participate in a threshold signing protocol as well as following the // protocol. @@ -82,8 +82,8 @@ type blsThresholdSignatureInspector struct { // participant is indexed by `myIndex` and holds the input private key // where n is the length of the public key shares slice. // -// The function returns -// - (nil, invalidInputsError) if: +// The function returns: +// - (nil, invalidInputsError) if: // - n is not in [`ThresholdSignMinSize`, `ThresholdSignMaxSize`] // - threshold value is not in interval [1, n-1] // - input private key and public key at my index do not match @@ -138,8 +138,8 @@ func NewBLSThresholdSignatureParticipant( // Participants are defined by their public key share, and are indexed from 0 to n-1 // where n is the length of the public key shares slice. // -// The function returns -// - (nil, invalidInputsError) if: +// The function returns: +// - (nil, invalidInputsError) if: // - n is not in [`ThresholdSignMinSize`, `ThresholdSignMaxSize`] // - threshold value is not in interval [1, n-1] // - (nil, notBLSKeyError) at least one public key is not of type pubKeyBLSBLS12381 @@ -402,24 +402,21 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat return nil, notEnoughSharesErrorf("number of signature shares %d is not enough, %d are required", len(s.shares), s.threshold+1) } - thresholdSignature := make([]byte, signatureLengthBLSBLS12381) + thresholdSignature := make([]byte, SignatureLenBLSBLS12381) // prepare the C layer inputs - shares := make([]byte, 0, len(s.shares)*signatureLengthBLSBLS12381) + shares := make([]byte, 0, len(s.shares)*SignatureLenBLSBLS12381) signers := make([]index, 0, len(s.shares)) for index, share := range s.shares { shares = append(shares, share...) 
- signers = append(signers, index) + signers = append(signers, index+1) } - // set BLS settings - blsInstance.reInit() - // Lagrange Interpolate at point 0 - result := C.G1_lagrangeInterpolateAtZero( + result := C.E1_lagrange_interpolate_at_zero_write( (*C.uchar)(&thresholdSignature[0]), (*C.uchar)(&shares[0]), - (*C.uint8_t)(&signers[0]), (C.int)(s.threshold+1)) + (*C.uint8_t)(&signers[0]), (C.int)(s.threshold)) if result != valid { return nil, invalidSignatureError @@ -443,10 +440,14 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat // // size is the number of participants, it must be in the range [ThresholdSignMinSize..ThresholdSignMaxSize]. // threshold is the threshold value, it must be in the range [MinimumThreshold..size-1]. -// The function does not check the validity of the shares, and does not check -// the validity of the resulting signature. +// The function does not accept any input public key. Therefore, it does not check the validity of the +// shares against individual public keys, and does not check the validity of the resulting signature +// against the group public key. // BLSReconstructThresholdSignature returns: -// - (nil, error) if the inputs are not in the correct range, if the threshold is not reached +// - (nil, invalidInputsError) if : +// -- numbers of shares does not match the number of signers +// -- the inputs are not in the correct range. +// - (nil, notEnoughSharesError) if the threshold is not reached. // - (nil, duplicatedSignerError) if input signers are not distinct. // - (nil, invalidSignatureError) if at least one of the first (threshold+1) signatures. // does not serialize to a valid E1 point. @@ -456,8 +457,6 @@ func (s *blsThresholdSignatureInspector) reconstructThresholdSignature() (Signat // are considered to reconstruct the signature. func BLSReconstructThresholdSignature(size int, threshold int, shares []Signature, signers []int) (Signature, error) { - // set BLS settings - blsInstance.reInit() if size < ThresholdSignMinSize || size > ThresholdSignMaxSize { return nil, invalidInputsErrorf( @@ -478,15 +477,15 @@ func BLSReconstructThresholdSignature(size int, threshold int, } if len(shares) < threshold+1 { - return nil, invalidInputsErrorf( - "the number of signatures does not reach the threshold") + return nil, notEnoughSharesErrorf( + "the number of signatures %d is less than the minimum %d", len(shares), threshold+1) } // map to check signers are distinct m := make(map[index]bool) // flatten the shares (required by the C layer) - flatShares := make([]byte, 0, signatureLengthBLSBLS12381*(threshold+1)) + flatShares := make([]byte, 0, SignatureLenBLSBLS12381*(threshold+1)) indexSigners := make([]index, 0, threshold+1) for i, share := range shares { flatShares = append(flatShares, share...) 
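The interpolation in this function follows standard Shamir reconstruction at zero: with signer evaluation points x_i = index+1 (the +1 shift introduced above keeps every evaluation point non-zero), the group signature is sigma = sum_i L_i(0) * sigma_i with L_i(0) = prod_{j != i} x_j / (x_j - x_i) mod r. A rough Go sketch of these coefficients, assuming the BLS12-381 group order r; this is plain big-integer arithmetic for illustration, not the constant-time field code used by the C routine:

    package main

    import (
        "fmt"
        "math/big"
    )

    // r is the prime order of the BLS12-381 groups G1 and G2.
    var r, _ = new(big.Int).SetString(
        "73eda753299d7d483339d80809a1d80553bda402fffe5bfeffffffff00000001", 16)

    // lagrangeAtZero returns L_i(0) mod r for the signer set `indices`,
    // i.e. the coefficient applied to the i-th signature share when
    // reconstructing the group signature sigma = sum_i L_i(0) * sigma_i.
    func lagrangeAtZero(i int, indices []int64) *big.Int {
        num, den := big.NewInt(1), big.NewInt(1)
        xi := big.NewInt(indices[i])
        for j, xjVal := range indices {
            if j == i {
                continue
            }
            xj := big.NewInt(xjVal)
            num.Mul(num, xj)
            num.Mod(num, r)
            d := new(big.Int).Sub(xj, xi)
            den.Mul(den, d)
            den.Mod(den, r)
        }
        den.ModInverse(den, r)
        return num.Mul(num, den).Mod(num, r)
    }

    func main() {
        // shares held by signers with (1-based) indices 2, 5 and 7
        indices := []int64{2, 5, 7}
        for i := range indices {
            fmt.Printf("L_%d(0) = %s\n", indices[i], lagrangeAtZero(i, indices))
        }
    }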
@@ -501,15 +500,15 @@ func BLSReconstructThresholdSignature(size int, threshold int, "%d is a duplicate signer", index(signers[i])) } m[index(signers[i])] = true - indexSigners = append(indexSigners, index(signers[i])) + indexSigners = append(indexSigners, index(signers[i])+1) } - thresholdSignature := make([]byte, signatureLengthBLSBLS12381) + thresholdSignature := make([]byte, SignatureLenBLSBLS12381) // Lagrange Interpolate at point 0 - if C.G1_lagrangeInterpolateAtZero( + if C.E1_lagrange_interpolate_at_zero_write( (*C.uchar)(&thresholdSignature[0]), (*C.uchar)(&flatShares[0]), - (*C.uint8_t)(&indexSigners[0]), (C.int)(threshold+1), + (*C.uint8_t)(&indexSigners[0]), (C.int)(threshold), ) != valid { return nil, invalidSignatureError } @@ -536,13 +535,15 @@ func EnoughShares(threshold int, sharesNumber int) (bool, error) { // BLSThresholdKeyGen is a key generation for a BLS-based // threshold signature scheme with a trusted dealer. // -// The function returns : -// - (nil, nil, nil, invalidInputsErrorf) if: +// The function returns: +// - (nil, nil, nil, invalidInputsErrorf) if: +// - seed is too short // - n is not in [`ThresholdSignMinSize`, `ThresholdSignMaxSize`] // - threshold value is not in interval [1, n-1] // - (groupPrivKey, []pubKeyShares, groupPubKey, nil) otherwise func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, []PublicKey, PublicKey, error) { + if size < ThresholdSignMinSize || size > ThresholdSignMaxSize { return nil, nil, nil, invalidInputsErrorf( "size should be between %d and %d, got %d", @@ -558,33 +559,23 @@ func BLSThresholdKeyGen(size int, threshold int, seed []byte) ([]PrivateKey, threshold) } - // set BLS settings - blsInstance.reInit() - // the scalars x and G2 points y x := make([]scalar, size) - y := make([]pointG2, size) - var X0 pointG2 - - // seed relic - if err := seedRelic(seed); err != nil { - return nil, nil, nil, fmt.Errorf("seeding relic failed: %w", err) - } - // Generate a polynomial P in Zr[X] of degree t - a := make([]scalar, threshold+1) - randZrStar(&a[0]) // non-identity key - if threshold > 0 { - for i := 1; i < threshold; i++ { - randZr(&a[i]) - } - randZrStar(&a[threshold]) // enforce the polynomial degree + y := make([]pointE2, size) + var X0 pointE2 + + // Generate a polynomial P in Fr[X] of degree t + a, err := generateFrPolynomial(seed, threshold) + if err != nil { + return nil, nil, nil, fmt.Errorf("failed to generate random polynomial: %w", err) } + // compute the shares for i := index(1); int(i) <= size; i++ { - C.Zr_polynomialImage( - (*C.bn_st)(&x[i-1]), - (*C.ep2_st)(&y[i-1]), - (*C.bn_st)(&a[0]), (C.int)(len(a)), + C.Fr_polynomial_image( + (*C.Fr)(&x[i-1]), + (*C.E2)(&y[i-1]), + (*C.Fr)(&a[0]), (C.int)(len(a)-1), (C.uint8_t)(i), ) } diff --git a/crypto/bls_thresholdsign_core.c b/crypto/bls_thresholdsign_core.c index dc57355df47..7c1d809d228 100644 --- a/crypto/bls_thresholdsign_core.c +++ b/crypto/bls_thresholdsign_core.c @@ -1,123 +1,118 @@ -// +build relic - #include "bls_thresholdsign_include.h" -// Computes the Lagrange coefficient L(i+1) at 0 with regards to the range [signers(0)+1..signers(t)+1] -// and stores it in res, where t is the degree of the polynomial P -static void Zr_lagrangeCoefficientAtZero(bn_t res, const int i, const uint8_t* signers, const int len){ - // r is the order of G1 and G2 - bn_t r, r_2; - bn_new(r); - g2_get_ord(r); - // (r-2) is needed to compute the inverse in Zr - // using little Fermat theorem - bn_new(r_2); - bn_sub_dig(r_2, r, 2); - //#define MOD_METHOD MONTY - 
#define MOD_METHOD BASIC +// the highest index of a threshold participant +#define MAX_IND 255 +#define MAX_IND_BITS 8 // equal to ceiling(log_2(MAX_IND)) + +// Computes the Lagrange coefficient L_i(0) in Fr with regards to the range +// [indices(0)..indices(t)] and stores it in `res`, where t is the degree of the +// polynomial P. +// `degree` is equal to the polynomial degree `t`. +static void Fr_lagrange_coeff_at_zero(Fr *res, const int i, + const byte indices[], const int degree) { - #if MOD_METHOD == MONTY - bn_t u; - bn_new(u) - // Montgomery reduction constant - // TODO: hardcode u - bn_mod_pre_monty(u, r); - #endif + // coefficient is computed as N * D^(-1) + Fr numerator; // eventually would represent N*R^k + Fr denominator; // eventually would represent D*R^k - // temp buffers - bn_t acc, inv, base, numerator; - bn_new(inv); - bn_new(base); - bn_new_size(base, BITS_TO_DIGITS(Fr_BITS)) - bn_new(acc); - bn_new(numerator); - bn_new_size(acc, BITS_TO_DIGITS(3*Fr_BITS)); + // Initialize N and D to Montgomery constant R + Fr_copy(&numerator, &BLS12_381_rR); + Fr_copy(&denominator, &BLS12_381_rR); - // the accumulator of the largarnge coeffiecient - // the sign (sign of acc) is equal to 1 if acc is positive, 0 otherwise - bn_set_dig(acc, 1); - int sign = 1; + // sign of D: 0 for positive and 1 for negative + int sign = 0; - // loops is the maximum number of loops that takes the accumulator to - // overflow modulo r, mainly the highest k such that fact(MAX_IND)/fact(MAX_IND-k) < r - const int loops = MAX_IND_LOOPS; - int k,j = 0; - while (j/src/*.c` and `/src/*.h` files (C source files) but `server.c`. +- `server.c` is replaced by `./blst_src.c` (which lists only the files needed by Flow crypto). +- all `/build` (assembly generated files). +- this `README` file. + +To upgrade the BLST version: +- [ ] audit all BLST updates, with focus on `/src`: https://github.com/supranational/blst/compare/v0.3.11... +- [ ] delete all files in this folder `./blst_src/` but `blst_src.c` and `README.md`. +- [ ] delete all files in `./internal/blst/`. +- [ ] open BLST repository on the new version. +- [ ] copy all `.c` and `.h` files from `/src/` into `./blst_src/`. +- [ ] delete newly copied `./blst_src/server.c`. +- [ ] copy the folder `/build/` into this folder `./blst_src`. +- [ ] copy `/bindings/blst.h`, `/bindings/blst_aux.h`, and `/bindings/go/blst.go` into `./internal/blst/.`. +- [ ] check that C flags in `./bls12381_utils.go` still include the C flags in `/bindings/go/blst.go`. +- [ ] update `./blst_src/blst_src.c` if needed. +- [ ] solve all breaking changes that may occur. +- [ ] update the commit version on this `./blst_src/README`. + +Note that Flow crypto is using non exported internal functions from BLST. Checking for interfaces breaking changes in BLST should be done along with auditing changes between the old and new versions. This includes checking logical changes and assumptions beyond interfaces, and assessing their security and performance impact on protocols implemented in Flow crypto. diff --git a/crypto/blst_src/aggregate.c b/crypto/blst_src/aggregate.c new file mode 100644 index 00000000000..ca78876acad --- /dev/null +++ b/crypto/blst_src/aggregate.c @@ -0,0 +1,673 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * Usage pattern on single-processor system is + * + * blst_pairing_init(ctx, hash_or_encode, DST); + * blst_pairing_aggregate_pk_in_g1(ctx, PK[0], aggregated_signature, msg[0]); + * blst_pairing_aggregate_pk_in_g1(ctx, PK[1], NULL, msg[1]); + * ... + * blst_pairing_commit(ctx); + * blst_pairing_finalverify(ctx, NULL); + * + *********************************************************************** + * Usage pattern on multi-processor system is + * + * blst_pairing_init(pk[0], hash_or_encode, DST); + * blst_pairing_init(pk[1], hash_or_encode, DST); + * ... + * start threads each processing an N/nthreads slice of PKs and messages: + * blst_pairing_aggregate_pk_in_g1(pk[i], PK[i*n+0], NULL, msg[i*n+0]); + * blst_pairing_aggregate_pk_in_g1(pk[i], PK[i*n+1], NULL, msg[i*n+1]); + * ... + * blst_pairing_commit(pkx); + * ... + * meanwhile in main thread + * blst_fp12 gtsig; + * blst_aggregated_in_g2(>sig, aggregated_signature); + * join threads and merge their contexts: + * blst_pairing_merge(pk[0], pk[1]); + * blst_pairing_merge(pk[0], pk[2]); + * ... + * blst_pairing_finalverify(pk[0], gtsig); + */ + +#ifndef N_MAX +# define N_MAX 8 +#endif + +typedef union { POINTonE1 e1; POINTonE2 e2; } AggregatedSignature; +typedef struct { + unsigned int ctrl; + unsigned int nelems; + const void *DST; + size_t DST_len; + vec384fp12 GT; + AggregatedSignature AggrSign; + POINTonE2_affine Q[N_MAX]; + POINTonE1_affine P[N_MAX]; +} PAIRING; + +enum { AGGR_UNDEFINED = 0, + AGGR_MIN_SIG = 1, + AGGR_MIN_PK = 2, + AGGR_SIGN_SET = 0x10, + AGGR_GT_SET = 0x20, + AGGR_HASH_OR_ENCODE = 0x40 }; +#define MIN_SIG_OR_PK (AGGR_MIN_SIG | AGGR_MIN_PK) + +static const size_t sizeof_pairing = (sizeof(PAIRING) + 7) & ~(size_t)7; + +size_t blst_pairing_sizeof(void) +{ return sizeof_pairing; } + +void blst_pairing_init(PAIRING *ctx, int hash_or_encode, + const void *DST, size_t DST_len) +{ + ctx->ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx->nelems = 0; + ctx->DST = (uptr_t)DST==(uptr_t)((byte *)ctx+sizeof_pairing) ? (void *)42 + : DST; + ctx->DST_len = DST_len; +} + +static const void *pairing_get_dst(const PAIRING *ctx) +{ return (uptr_t)ctx->DST==(uptr_t)42 ? (const byte *)ctx+sizeof_pairing + : ctx->DST; +} + +const void *blst_pairing_get_dst(const PAIRING *ctx) +{ return pairing_get_dst(ctx); } + +#define FROM_AFFINE(out,in) do { \ + vec_copy((out)->X, in->X, 2*sizeof(in->X)), \ + vec_select((out)->Z, in->X, BLS12_381_Rx.p, sizeof(in->X), \ + vec_is_zero(in->X, 2*sizeof(in->X))); } while(0) + +/* + * Optional |nbits|-wide |scalar| is used to facilitate multiple aggregated + * signature verification as discussed at + * https://ethresear.ch/t/fast-verification-of-multiple-bls-signatures/5407. + * Usage pattern is not finalized yet, because (sig != NULL) is better and + * will be handled separately... + */ +static BLST_ERROR PAIRING_Aggregate_PK_in_G2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_groupcheck, + const POINTonE1_affine *sig, + size_t sig_groupcheck, + const byte *scalar, size_t nbits, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ + if (ctx->ctrl & AGGR_MIN_PK) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= AGGR_MIN_SIG; + + /* + * Since we don't know if the signature is individual or aggregated, + * the only sensible thing to do is to skip over infinite one and + * count on the corresponding infinite public key to be rejected, + * in case the signature is non-aggregated that is. 
+ */ + if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) { + POINTonE1 *S = &ctx->AggrSign.e1; + POINTonE1 P[1]; + + FROM_AFFINE(P, sig); + + if (sig_groupcheck && !POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (ctx->ctrl & AGGR_SIGN_SET) { + if (nbits != 0 && scalar != NULL) { + POINTonE1_mult_w5(P, P, scalar, nbits); + POINTonE1_dadd(S, S, P, NULL); + } else { + POINTonE1_dadd_affine(S, S, sig); + } + } else { + ctx->ctrl |= AGGR_SIGN_SET; + if (nbits != 0 && scalar != NULL) + POINTonE1_mult_w5(S, P, scalar, nbits); + else + vec_copy(S, P, sizeof(P)); + } + } + + if (PK != NULL) { + unsigned int n; + POINTonE1 H[1]; + const void *DST = pairing_get_dst(ctx); + + /* + * Reject infinite public keys. + */ + if (vec_is_zero(PK, sizeof(*PK))) + return BLST_PK_IS_INFINITY; + + if (pk_groupcheck) { + POINTonE2 P[1]; + + FROM_AFFINE(P, PK); + if (!POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + } + + if (ctx->ctrl & AGGR_HASH_OR_ENCODE) + Hash_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + else + Encode_to_G1(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + + if (nbits != 0 && scalar != NULL) + POINTonE1_mult_w5(H, H, scalar, nbits); + + POINTonE1_from_Jacobian(H, H); + + n = ctx->nelems; + vec_copy(ctx->Q + n, PK, sizeof(POINTonE2_affine)); + vec_copy(ctx->P + n, H, sizeof(POINTonE1_affine)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; + } + + return BLST_SUCCESS; +} + +BLST_ERROR blst_pairing_aggregate_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + const POINTonE1_affine *signature, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + const POINTonE1_affine *sig, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, 0, sig, 1, scalar, nbits, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_grpchk, + const POINTonE1_affine *signature, + size_t sig_grpchk, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, signature, sig_grpchk, + NULL, 0, msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(PAIRING *ctx, + const POINTonE2_affine *PK, + size_t pk_grpchk, + const POINTonE1_affine *sig, + size_t sig_grpchk, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G2(ctx, PK, pk_grpchk, sig, sig_grpchk, + scalar, nbits, + msg, msg_len, aug, aug_len); +} + +static BLST_ERROR PAIRING_Aggregate_PK_in_G1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_groupcheck, + const POINTonE2_affine *sig, + size_t sig_groupcheck, + const byte *scalar, size_t nbits, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ + if (ctx->ctrl & AGGR_MIN_SIG) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= AGGR_MIN_PK; + + /* + * Since we don't know if the signature is individual or aggregated, + * the only sensible thing to do is 
to skip over infinite one and + * count on the corresponding infinite public key to be rejected, + * in case the signature is non-aggregated that is. + */ + if (sig != NULL && !vec_is_zero(sig, sizeof(*sig))) { + POINTonE2 *S = &ctx->AggrSign.e2; + POINTonE2 P[1]; + + FROM_AFFINE(P, sig); + + if (sig_groupcheck && !POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (ctx->ctrl & AGGR_SIGN_SET) { + if (nbits != 0 && scalar != NULL) { + + POINTonE2_mult_w5(P, P, scalar, nbits); + POINTonE2_dadd(S, S, P, NULL); + } else { + POINTonE2_dadd_affine(S, S, sig); + } + } else { + ctx->ctrl |= AGGR_SIGN_SET; + if (nbits != 0 && scalar != NULL) + POINTonE2_mult_w5(S, P, scalar, nbits); + else + vec_copy(S, P, sizeof(P)); + } + } + + if (PK != NULL) { + unsigned int n; + POINTonE2 H[1]; + POINTonE1 pk[1]; + const void *DST = pairing_get_dst(ctx); + + /* + * Reject infinite public keys. + */ + if (vec_is_zero(PK, sizeof(*PK))) + return BLST_PK_IS_INFINITY; + + if (pk_groupcheck) { + POINTonE1 P[1]; + + FROM_AFFINE(P, PK); + if (!POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + } + + if (ctx->ctrl & AGGR_HASH_OR_ENCODE) + Hash_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + else + Encode_to_G2(H, msg, msg_len, DST, ctx->DST_len, aug, aug_len); + + POINTonE2_from_Jacobian(H, H); + + if (nbits != 0 && scalar != NULL) { + FROM_AFFINE(pk, PK); + POINTonE1_mult_w5(pk, pk, scalar, nbits); + POINTonE1_from_Jacobian(pk, pk); + PK = (const POINTonE1_affine *)pk; + } + + n = ctx->nelems; + vec_copy(ctx->Q + n, H, sizeof(POINTonE2_affine)); + vec_copy(ctx->P + n, PK, sizeof(POINTonE1_affine)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; + } + + return BLST_SUCCESS; +} + +BLST_ERROR blst_pairing_aggregate_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + const POINTonE2_affine *signature, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + const POINTonE2_affine *sig, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, 0, sig, 1, scalar, nbits, + msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_grpchk, + const POINTonE2_affine *signature, + size_t sig_grpchk, + const void *msg, size_t msg_len, + const void *aug, size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, signature, sig_grpchk, + NULL, 0, msg, msg_len, aug, aug_len); +} + +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(PAIRING *ctx, + const POINTonE1_affine *PK, + size_t pk_grpchk, + const POINTonE2_affine *sig, + size_t sig_grpchk, + const byte *scalar, + size_t nbits, + const void *msg, + size_t msg_len, + const void *aug, + size_t aug_len) +{ return PAIRING_Aggregate_PK_in_G1(ctx, PK, pk_grpchk, sig, sig_grpchk, + scalar, nbits, + msg, msg_len, aug, aug_len); +} + +static void PAIRING_Commit(PAIRING *ctx) +{ + unsigned int n; + + if ((n = ctx->nelems) != 0) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, 
GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + ctx->nelems = 0; + } +} + +void blst_pairing_commit(PAIRING *ctx) +{ PAIRING_Commit(ctx); } + +BLST_ERROR blst_pairing_merge(PAIRING *ctx, const PAIRING *ctx1) +{ + if ((ctx->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED + && (ctx1->ctrl & MIN_SIG_OR_PK) != AGGR_UNDEFINED + && (ctx->ctrl & ctx1->ctrl & MIN_SIG_OR_PK) == 0) + return BLST_AGGR_TYPE_MISMATCH; + + /* context producers are expected to have called blst_pairing_commit */ + if (ctx->nelems || ctx1->nelems) + return BLST_AGGR_TYPE_MISMATCH; + + ctx->ctrl |= ctx1->ctrl & MIN_SIG_OR_PK; + + switch (ctx->ctrl & MIN_SIG_OR_PK) { + case AGGR_MIN_SIG: + if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) { + POINTonE1_dadd(&ctx->AggrSign.e1, &ctx->AggrSign.e1, + &ctx1->AggrSign.e1, NULL); + } else if (ctx1->ctrl & AGGR_SIGN_SET) { + ctx->ctrl |= AGGR_SIGN_SET; + vec_copy(&ctx->AggrSign.e1, &ctx1->AggrSign.e1, + sizeof(ctx->AggrSign.e1)); + } + break; + case AGGR_MIN_PK: + if (ctx->ctrl & ctx1->ctrl & AGGR_SIGN_SET) { + POINTonE2_dadd(&ctx->AggrSign.e2, &ctx->AggrSign.e2, + &ctx1->AggrSign.e2, NULL); + } else if (ctx1->ctrl & AGGR_SIGN_SET) { + ctx->ctrl |= AGGR_SIGN_SET; + vec_copy(&ctx->AggrSign.e2, &ctx1->AggrSign.e2, + sizeof(ctx->AggrSign.e2)); + } + break; + case AGGR_UNDEFINED: + break; + default: + return BLST_AGGR_TYPE_MISMATCH; + } + + if (ctx->ctrl & ctx1->ctrl & AGGR_GT_SET) { + mul_fp12(ctx->GT, ctx->GT, ctx1->GT); + } else if (ctx1->ctrl & AGGR_GT_SET) { + ctx->ctrl |= AGGR_GT_SET; + vec_copy(ctx->GT, ctx1->GT, sizeof(ctx->GT)); + } + + return BLST_SUCCESS; +} + +static bool_t PAIRING_FinalVerify(const PAIRING *ctx, const vec384fp12 GTsig) +{ + vec384fp12 GT; + + if (!(ctx->ctrl & AGGR_GT_SET)) + return 0; + + if (GTsig != NULL) { + vec_copy(GT, GTsig, sizeof(GT)); + } else if (ctx->ctrl & AGGR_SIGN_SET) { + AggregatedSignature AggrSign; + + switch (ctx->ctrl & MIN_SIG_OR_PK) { + case AGGR_MIN_SIG: + POINTonE1_from_Jacobian(&AggrSign.e1, &ctx->AggrSign.e1); + miller_loop_n(GT, (const POINTonE2_affine *)&BLS12_381_G2, + (const POINTonE1_affine *)&AggrSign.e1, 1); + break; + case AGGR_MIN_PK: + POINTonE2_from_Jacobian(&AggrSign.e2, &ctx->AggrSign.e2); + miller_loop_n(GT, (const POINTonE2_affine *)&AggrSign.e2, + (const POINTonE1_affine *)&BLS12_381_G1, 1); + break; + default: + return 0; + } + } else { + /* + * The aggregated signature was infinite, relation between the + * hashes and the public keys has to be VERY special... 
+ */ + vec_copy(GT, BLS12_381_Rx.p12, sizeof(GT)); + } + + conjugate_fp12(GT); + mul_fp12(GT, GT, ctx->GT); + final_exp(GT, GT); + + /* return GT==1 */ + return vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & + vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0])); +} + +int blst_pairing_finalverify(const PAIRING *ctx, const vec384fp12 GTsig) +{ return (int)PAIRING_FinalVerify(ctx, GTsig); } + +int blst_fp12_finalverify(const vec384fp12 GT1, const vec384fp12 GT2) +{ + vec384fp12 GT; + + vec_copy(GT, GT1, sizeof(GT)); + conjugate_fp12(GT); + mul_fp12(GT, GT, GT2); + final_exp(GT, GT); + + /* return GT==1 */ + return (int)(vec_is_equal(GT[0][0], BLS12_381_Rx.p2, sizeof(GT[0][0])) & + vec_is_zero(GT[0][1], sizeof(GT) - sizeof(GT[0][0]))); +} + +void blst_pairing_raw_aggregate(PAIRING *ctx, const POINTonE2_affine *q, + const POINTonE1_affine *p) +{ + unsigned int n; + + if (vec_is_zero(q, sizeof(*q)) & vec_is_zero(p, sizeof(*p))) + return; + + n = ctx->nelems; + vec_copy(ctx->Q + n, q, sizeof(*q)); + vec_copy(ctx->P + n, p, sizeof(*p)); + if (++n == N_MAX) { + if (ctx->ctrl & AGGR_GT_SET) { + vec384fp12 GT; + miller_loop_n(GT, ctx->Q, ctx->P, n); + mul_fp12(ctx->GT, ctx->GT, GT); + } else { + miller_loop_n(ctx->GT, ctx->Q, ctx->P, n); + ctx->ctrl |= AGGR_GT_SET; + } + n = 0; + } + ctx->nelems = n; +} + +vec384fp12 *blst_pairing_as_fp12(PAIRING *ctx) +{ + PAIRING_Commit(ctx); + return (vec384fp12 *)ctx->GT; +} + +/* + * PAIRING context-free entry points. + * + * To perform FastAggregateVerify, aggregate all public keys and + * signatures with corresponding blst_aggregate_in_g{12}, convert + * result to affine and call suitable blst_core_verify_pk_in_g{12} + * or blst_aggregated_in_g{12}... + */ +BLST_ERROR blst_aggregate_in_g1(POINTonE1 *out, const POINTonE1 *in, + const unsigned char *zwire) +{ + POINTonE1 P[1]; + BLST_ERROR ret; + + ret = POINTonE1_Deserialize_Z((POINTonE1_affine *)P, zwire); + + if (ret != BLST_SUCCESS) + return ret; + + if (vec_is_zero(P, sizeof(POINTonE1_affine))) { + if (in == NULL) + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z)); + + if (!POINTonE1_in_G1(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (in == NULL) + vec_copy(out, P, sizeof(P)); + else + POINTonE1_dadd_affine(out, in, (POINTonE1_affine *)P); + + return BLST_SUCCESS; +} + +BLST_ERROR blst_aggregate_in_g2(POINTonE2 *out, const POINTonE2 *in, + const unsigned char *zwire) +{ + POINTonE2 P[1]; + BLST_ERROR ret; + + ret = POINTonE2_Deserialize_Z((POINTonE2_affine *)P, zwire); + + if (ret != BLST_SUCCESS) + return ret; + + if (vec_is_zero(P, sizeof(POINTonE2_affine))) { + if (in == NULL) + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + + vec_copy(P->Z, BLS12_381_Rx.p, sizeof(P->Z)); + + if (!POINTonE2_in_G2(P)) + return BLST_POINT_NOT_IN_GROUP; + + if (in == NULL) { + vec_copy(out, P, sizeof(P)); + } else { + POINTonE2_dadd_affine(out, in, (POINTonE2_affine *)P); + } + return BLST_SUCCESS; +} + +void blst_aggregated_in_g1(vec384fp12 ret, const POINTonE1_affine *sig) +{ miller_loop_n(ret, (const POINTonE2_affine *)&BLS12_381_G2, sig, 1); } + +void blst_aggregated_in_g2(vec384fp12 ret, const POINTonE2_affine *sig) +{ miller_loop_n(ret, sig, (const POINTonE1_affine *)&BLS12_381_G1, 1); } + +BLST_ERROR blst_core_verify_pk_in_g1(const POINTonE1_affine *pk, + const POINTonE2_affine *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ + PAIRING ctx; + 
BLST_ERROR ret; + + ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx.nelems = 0; + ctx.DST = DST; + ctx.DST_len = DST_len; + + ret = PAIRING_Aggregate_PK_in_G1(&ctx, pk, 1, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); + if (ret != BLST_SUCCESS) + return ret; + + PAIRING_Commit(&ctx); + + return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; +} + +BLST_ERROR blst_core_verify_pk_in_g2(const POINTonE2_affine *pk, + const POINTonE1_affine *signature, + int hash_or_encode, + const void *msg, size_t msg_len, + const void *DST, size_t DST_len, + const void *aug, size_t aug_len) +{ + PAIRING ctx; + BLST_ERROR ret; + + ctx.ctrl = AGGR_UNDEFINED | (hash_or_encode ? AGGR_HASH_OR_ENCODE : 0); + ctx.nelems = 0; + ctx.DST = DST; + ctx.DST_len = DST_len; + + ret = PAIRING_Aggregate_PK_in_G2(&ctx, pk, 1, signature, 1, NULL, 0, + msg, msg_len, aug, aug_len); + if (ret != BLST_SUCCESS) + return ret; + + PAIRING_Commit(&ctx); + + return PAIRING_FinalVerify(&ctx, NULL) ? BLST_SUCCESS : BLST_VERIFY_FAIL; +} diff --git a/crypto/blst_src/blst_src.c b/crypto/blst_src/blst_src.c new file mode 100644 index 00000000000..9e064657e72 --- /dev/null +++ b/crypto/blst_src/blst_src.c @@ -0,0 +1,24 @@ +// This file contains all BLST lib C files needed for +// Flow crypto. +// +// The list may need to be updated in a new version of BLST is used. + +#include "keygen.c" +#include "hash_to_field.c" +#include "e1.c" +#include "map_to_g1.c" +#include "e2.c" +#include "map_to_g2.c" +#include "fp12_tower.c" +#include "pairing.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "aggregate.c" +#include "bulk_addition.c" +#include "multi_scalar.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" + + diff --git a/crypto/blst_src/build/assembly.S b/crypto/blst_src/build/assembly.S new file mode 100644 index 00000000000..c0c5db30850 --- /dev/null +++ b/crypto/blst_src/build/assembly.S @@ -0,0 +1,116 @@ +#if defined(__x86_64) || defined(__x86_64__) +# if defined(__ELF__) +# if defined(__BLST_PORTABLE__) +# include "elf/sha256-portable-x86_64.s" +# define blst_sha256_block_data_order blst_sha256_block_ssse3 +# endif +# include "elf/sha256-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "elf/ctx_inverse_mod_384-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "elf/ctq_inverse_mod_384-x86_64.s" +# endif +# include "elf/add_mod_384-x86_64.s" +# include "elf/add_mod_384x384-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "elf/mulx_mont_384-x86_64.s" +# include "elf/mulx_mont_256-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "elf/mulq_mont_384-x86_64.s" +# include "elf/mulq_mont_256-x86_64.s" +# endif +# include "elf/add_mod_256-x86_64.s" +# include "elf/ct_inverse_mod_256-x86_64.s" +# include "elf/div3w-x86_64.s" +# include "elf/ct_is_square_mod_384-x86_64.s" +# elif defined(_WIN64) || defined(__CYGWIN__) +# include "coff/sha256-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "coff/ctx_inverse_mod_384-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "coff/ctq_inverse_mod_384-x86_64.s" +# endif +# include "coff/add_mod_384-x86_64.s" +# include "coff/add_mod_384x384-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "coff/mulx_mont_384-x86_64.s" +# include "coff/mulx_mont_256-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# 
include "coff/mulq_mont_384-x86_64.s" +# include "coff/mulq_mont_256-x86_64.s" +# endif +# include "coff/add_mod_256-x86_64.s" +# include "coff/ct_inverse_mod_256-x86_64.s" +# include "coff/div3w-x86_64.s" +# include "coff/ct_is_square_mod_384-x86_64.s" +# elif defined(__APPLE__) +# include "mach-o/sha256-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "mach-o/ctx_inverse_mod_384-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "mach-o/ctq_inverse_mod_384-x86_64.s" +# endif +# include "mach-o/add_mod_384-x86_64.s" +# include "mach-o/add_mod_384x384-x86_64.s" +# if defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "mach-o/mulx_mont_384-x86_64.s" +# include "mach-o/mulx_mont_256-x86_64.s" +# endif +# if !defined(__ADX__) || defined(__BLST_PORTABLE__) +# include "mach-o/mulq_mont_384-x86_64.s" +# include "mach-o/mulq_mont_256-x86_64.s" +# endif +# include "mach-o/add_mod_256-x86_64.s" +# include "mach-o/ct_inverse_mod_256-x86_64.s" +# include "mach-o/div3w-x86_64.s" +# include "mach-o/ct_is_square_mod_384-x86_64.s" +# endif +#elif defined(__aarch64__) +# if defined(__ELF__) +# include "elf/sha256-armv8.S" +# include "elf/ct_inverse_mod_384-armv8.S" +# include "elf/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "elf/mul_mont_384-armv8.S" +# include "elf/mul_mont_256-armv8.S" +# include "elf/add_mod_256-armv8.S" +# include "elf/ct_inverse_mod_256-armv8.S" +# include "elf/div3w-armv8.S" +# include "elf/ct_is_square_mod_384-armv8.S" +# elif defined(_WIN64) +# include "coff/sha256-armv8.S" +# include "coff/ct_inverse_mod_384-armv8.S" +# include "coff/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "coff/mul_mont_384-armv8.S" +# include "coff/mul_mont_256-armv8.S" +# include "coff/add_mod_256-armv8.S" +# include "coff/ct_inverse_mod_256-armv8.S" +# include "coff/div3w-armv8.S" +# include "coff/ct_is_square_mod_384-armv8.S" +# elif defined(__APPLE__) +# include "mach-o/sha256-armv8.S" +# include "mach-o/ct_inverse_mod_384-armv8.S" +# include "mach-o/add_mod_384-armv8.S" +# define __add_mod_384 __add_mont_384 +# define __sub_mod_384 __sub_mont_384 +# include "mach-o/mul_mont_384-armv8.S" +# include "mach-o/mul_mont_256-armv8.S" +# include "mach-o/add_mod_256-armv8.S" +# include "mach-o/ct_inverse_mod_256-armv8.S" +# include "mach-o/div3w-armv8.S" +# include "mach-o/ct_is_square_mod_384-armv8.S" +# endif +#elif defined(__BLST_NO_ASM__) || \ + (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__==4) +/* inaccurate way to detect a 32-bit processor, but it's close enough */ +#else +# error "unsupported platform" +#endif diff --git a/crypto/blst_src/build/bindings_trim.pl b/crypto/blst_src/build/bindings_trim.pl new file mode 100755 index 00000000000..0880352d79e --- /dev/null +++ b/crypto/blst_src/build/bindings_trim.pl @@ -0,0 +1,40 @@ +#!/usr/bin/env perl + +# read whole file +while(<>) { push @file, $_; } + +# traverse and remove auto-generated PartialEq for chosen types +for (my $i = 0; $i <= $#file; $i++) { + if (@file[$i] =~ m/pub\s+(?:struct|enum)\s+(\w+)/) { + push @structs, $1; + } + + if (@file[$i] =~ m/struct\s+blst_p[12]/) { + @file[$i-1] =~ s/,\s*PartialEq//; + } elsif (@file[$i] =~ m/struct\s+blst_fp12/) { + @file[$i-1] =~ s/,\s*(?:Default|PartialEq)//g; + } elsif (@file[$i] =~ m/struct\s+(blst_pairing|blst_uniq)/) { + @file[$i-1] =~ s/,\s*(?:Copy|Clone|Eq|PartialEq)//g; + } elsif (@file[$i] =~ 
m/struct\s+blst_scalar/) { + @file[$i-1] =~ s/,\s*Copy//; + @file[$i-1] =~ s/\)/, Zeroize\)/; + splice @file, $i, 0, "#[zeroize(drop)]\n"; $i++; + } else { + @file[$i] =~ s/::std::/::core::/g; + } +} + +print @file; + +print << '___'; +#[test] +fn bindgen_test_normal_types() { + // from "Rust for Rustaceans" by Jon Gjengset + fn is_normal() {} +___ +for (@structs) { + print " is_normal::<$_>();\n"; +} +print "}\n"; + +close STDOUT; diff --git a/crypto/blst_src/build/coff/add_mod_256-armv8.S b/crypto/blst_src/build/coff/add_mod_256-armv8.S new file mode 100644 index 00000000000..27b64ef4ca4 --- /dev/null +++ b/crypto/blst_src/build/coff/add_mod_256-armv8.S @@ -0,0 +1,397 @@ +.text + +.globl add_mod_256 + +.def add_mod_256; +.type 32; +.endef +.p2align 5 +add_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret + + +.globl mul_by_3_mod_256 + +.def mul_by_3_mod_256; +.type 32; +.endef +.p2align 5 +mul_by_3_mod_256: + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret + + +.globl lshift_mod_256 + +.def lshift_mod_256; +.type 32; +.endef +.p2align 5 +lshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_lshift_mod_256: + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs x15,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x12,lo + csel x9,x9,x13,lo + csel x10,x10,x14,lo + csel x11,x11,x15,lo + + cbnz x2,.Loop_lshift_mod_256 + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + + +.globl rshift_mod_256 + +.def rshift_mod_256; +.type 32; +.endef +.p2align 5 +rshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_rshift: + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + csel x12,x12,x8,ne + csel x13,x13,x9,ne + csel x14,x14,x10,ne + csel x15,x15,x11,ne + csel x3,x3,xzr,ne + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,.Loop_rshift + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + + +.globl cneg_mod_256 + +.def cneg_mod_256; +.type 32; +.endef +.p2align 5 +cneg_mod_256: + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x8,x8,x12,eq + csel x9,x9,x13,eq + csel x10,x10,x14,eq + stp x8,x9,[x0] + csel x11,x11,x15,eq + 
stp x10,x11,[x0,#16] + + ret + + +.globl sub_mod_256 + +.def sub_mod_256; +.type 32; +.endef +.p2align 5 +sub_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret + + +.globl check_mod_256 + +.def check_mod_256; +.type 32; +.endef +.p2align 5 +check_mod_256: + ldp x8,x9,[x0] + ldp x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,x1 + + ret + + +.globl add_n_check_mod_256 + +.def add_n_check_mod_256; +.type 32; +.endef +.p2align 5 +add_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret + + +.globl sub_n_check_mod_256 + +.def sub_n_check_mod_256; +.type 32; +.endef +.p2align 5 +sub_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret + diff --git a/crypto/blst_src/build/coff/add_mod_256-x86_64.s b/crypto/blst_src/build/coff/add_mod_256-x86_64.s new file mode 100644 index 00000000000..c2c83502a18 --- /dev/null +++ b/crypto/blst_src/build/coff/add_mod_256-x86_64.s @@ -0,0 +1,924 @@ +.text + +.globl add_mod_256 + +.def add_mod_256; .scl 2; .type 32; .endef +.p2align 5 +add_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_256: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + subq $8,%rsp + +.LSEH_body_add_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq 
%r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_add_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_256: + + +.globl mul_by_3_mod_256 + +.def mul_by_3_mod_256; .scl 2; .type 32; .endef +.p2align 5 +mul_by_3_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_3_mod_256: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + pushq %rbx + + pushq %r12 + +.LSEH_body_mul_by_3_mod_256: + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 + + jmp .Loaded_a_add_mod_256 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_mul_by_3_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_3_mod_256: + +.def __lshift_mod_256; .scl 3; .type 32; .endef +.p2align 5 +__lshift_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq %r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 + + + +.globl lshift_mod_256 + +.def lshift_mod_256; .scl 2; .type 32; .endef +.p2align 5 +lshift_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_lshift_mod_256: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + pushq %r12 + +.LSEH_body_lshift_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz .Loop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_lshift_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_lshift_mod_256: + + +.globl rshift_mod_256 + +.def rshift_mod_256; .scl 2; .type 32; .endef +.p2align 5 +rshift_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_rshift_mod_256: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + subq $8,%rsp + +.LSEH_body_rshift_mod_256: + + + movq 0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 + movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz .Loop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq 
%r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_rshift_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_rshift_mod_256: + + +.globl cneg_mod_256 + +.def cneg_mod_256; .scl 2; .type 32; .endef +.p2align 5 +cneg_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_cneg_mod_256: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + pushq %r12 + +.LSEH_body_cneg_mod_256: + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq %r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_cneg_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_cneg_mod_256: + + +.globl sub_mod_256 + +.def sub_mod_256; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_256: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + subq $8,%rsp + +.LSEH_body_sub_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_sub_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_256: + + +.globl check_mod_256 + +.def check_mod_256; .scl 2; .type 32; .endef +.p2align 5 +check_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_check_mod_256: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq 0(%rdi),%rax + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq %rsi,%rsi + + movq $1,%rdx + cmpq $0,%rax + cmovneq %rdx,%rax + andq %rsi,%rax +.LSEH_epilogue_check_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_check_mod_256: + + +.globl add_n_check_mod_256 + +.def add_n_check_mod_256; .scl 2; .type 32; .endef +.p2align 5 +add_n_check_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_n_check_mod_256: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + subq $8,%rsp + +.LSEH_body_add_n_check_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 
0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_add_n_check_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_n_check_mod_256: + + +.globl sub_n_check_mod_256 + +.def sub_n_check_mod_256; .scl 2; .type 32; .endef +.p2align 5 +sub_n_check_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_n_check_mod_256: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + subq $8,%rsp + +.LSEH_body_sub_n_check_mod_256: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_sub_n_check_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_n_check_mod_256: +.section .pdata +.p2align 2 +.rva .LSEH_begin_add_mod_256 +.rva .LSEH_body_add_mod_256 +.rva .LSEH_info_add_mod_256_prologue + +.rva .LSEH_body_add_mod_256 +.rva .LSEH_epilogue_add_mod_256 +.rva .LSEH_info_add_mod_256_body + +.rva .LSEH_epilogue_add_mod_256 +.rva .LSEH_end_add_mod_256 +.rva .LSEH_info_add_mod_256_epilogue + +.rva .LSEH_begin_mul_by_3_mod_256 +.rva .LSEH_body_mul_by_3_mod_256 +.rva .LSEH_info_mul_by_3_mod_256_prologue + +.rva .LSEH_body_mul_by_3_mod_256 +.rva .LSEH_epilogue_mul_by_3_mod_256 +.rva .LSEH_info_mul_by_3_mod_256_body + +.rva .LSEH_epilogue_mul_by_3_mod_256 +.rva .LSEH_end_mul_by_3_mod_256 +.rva .LSEH_info_mul_by_3_mod_256_epilogue + +.rva .LSEH_begin_lshift_mod_256 +.rva .LSEH_body_lshift_mod_256 +.rva .LSEH_info_lshift_mod_256_prologue + +.rva .LSEH_body_lshift_mod_256 +.rva .LSEH_epilogue_lshift_mod_256 +.rva .LSEH_info_lshift_mod_256_body + +.rva .LSEH_epilogue_lshift_mod_256 +.rva .LSEH_end_lshift_mod_256 +.rva .LSEH_info_lshift_mod_256_epilogue + +.rva .LSEH_begin_rshift_mod_256 +.rva .LSEH_body_rshift_mod_256 +.rva .LSEH_info_rshift_mod_256_prologue + +.rva .LSEH_body_rshift_mod_256 +.rva .LSEH_epilogue_rshift_mod_256 +.rva .LSEH_info_rshift_mod_256_body + +.rva .LSEH_epilogue_rshift_mod_256 +.rva .LSEH_end_rshift_mod_256 +.rva .LSEH_info_rshift_mod_256_epilogue + +.rva .LSEH_begin_cneg_mod_256 +.rva .LSEH_body_cneg_mod_256 +.rva .LSEH_info_cneg_mod_256_prologue + +.rva .LSEH_body_cneg_mod_256 +.rva .LSEH_epilogue_cneg_mod_256 +.rva .LSEH_info_cneg_mod_256_body + +.rva .LSEH_epilogue_cneg_mod_256 +.rva .LSEH_end_cneg_mod_256 +.rva .LSEH_info_cneg_mod_256_epilogue + +.rva .LSEH_begin_sub_mod_256 +.rva .LSEH_body_sub_mod_256 +.rva 
.LSEH_info_sub_mod_256_prologue + +.rva .LSEH_body_sub_mod_256 +.rva .LSEH_epilogue_sub_mod_256 +.rva .LSEH_info_sub_mod_256_body + +.rva .LSEH_epilogue_sub_mod_256 +.rva .LSEH_end_sub_mod_256 +.rva .LSEH_info_sub_mod_256_epilogue + +.rva .LSEH_epilogue_check_mod_256 +.rva .LSEH_end_check_mod_256 +.rva .LSEH_info_check_mod_256_epilogue + +.rva .LSEH_begin_add_n_check_mod_256 +.rva .LSEH_body_add_n_check_mod_256 +.rva .LSEH_info_add_n_check_mod_256_prologue + +.rva .LSEH_body_add_n_check_mod_256 +.rva .LSEH_epilogue_add_n_check_mod_256 +.rva .LSEH_info_add_n_check_mod_256_body + +.rva .LSEH_epilogue_add_n_check_mod_256 +.rva .LSEH_end_add_n_check_mod_256 +.rva .LSEH_info_add_n_check_mod_256_epilogue + +.rva .LSEH_begin_sub_n_check_mod_256 +.rva .LSEH_body_sub_n_check_mod_256 +.rva .LSEH_info_sub_n_check_mod_256_prologue + +.rva .LSEH_body_sub_n_check_mod_256 +.rva .LSEH_epilogue_sub_n_check_mod_256 +.rva .LSEH_info_sub_n_check_mod_256_body + +.rva .LSEH_epilogue_sub_n_check_mod_256 +.rva .LSEH_end_sub_n_check_mod_256 +.rva .LSEH_info_sub_n_check_mod_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_add_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_add_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_add_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_3_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_mul_by_3_mod_256_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_mul_by_3_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_lshift_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_lshift_mod_256_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_lshift_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_rshift_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_rshift_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_rshift_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_cneg_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_cneg_mod_256_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_cneg_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 
0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sub_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sub_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_check_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_add_n_check_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_add_n_check_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_add_n_check_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_n_check_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sub_n_check_mod_256_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sub_n_check_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/add_mod_384-armv8.S b/crypto/blst_src/build/coff/add_mod_384-armv8.S new file mode 100644 index 00000000000..2eff0677f54 --- /dev/null +++ b/crypto/blst_src/build/coff/add_mod_384-armv8.S @@ -0,0 +1,1056 @@ +.text + +.globl add_mod_384 + +.def add_mod_384; +.type 32; +.endef +.p2align 5 +add_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.def __add_mod_384; +.type 32; +.endef +.p2align 5 +__add_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +__add_mod_384_ab_are_loaded: + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret + + +.globl add_mod_384x + +.def add_mod_384x; +.type 32; +.endef +.p2align 5 +add_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl rshift_mod_384 + +.def rshift_mod_384; +.type 32; +.endef +.p2align 5 +rshift_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_rshift_mod_384: + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,.Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.def __rshift_mod_384; +.type 32; +.endef +.p2align 5 +__rshift_mod_384: + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret + + +.globl div_by_2_mod_384 + +.def div_by_2_mod_384; +.type 32; +.endef +.p2align 5 +div_by_2_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl lshift_mod_384 + +.def lshift_mod_384; +.type 32; +.endef +.p2align 5 +lshift_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_lshift_mod_384: + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,.Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.def __lshift_mod_384; +.type 32; +.endef +.p2align 5 +__lshift_mod_384: + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret + + +.globl mul_by_3_mod_384 + +.def mul_by_3_mod_384; +.type 32; +.endef +.p2align 5 +mul_by_3_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl mul_by_8_mod_384 + +.def mul_by_8_mod_384; +.type 32; +.endef +.p2align 5 +mul_by_8_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl mul_by_3_mod_384x + +.def mul_by_3_mod_384x; +.type 32; +.endef +.p2align 5 +mul_by_3_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl mul_by_8_mod_384x + +.def mul_by_8_mod_384x; +.type 32; +.endef +.p2align 5 +mul_by_8_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl cneg_mod_384 + +.def cneg_mod_384; +.type 32; +.endef +.p2align 5 +cneg_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x10,x10,x16,eq + csel x11,x11,x17,eq + csel x12,x12,x19,eq + csel x13,x13,x20,eq + stp x10,x11,[x0] + csel x14,x14,x21,eq + stp x12,x13,[x0,#16] + csel x15,x15,x22,eq + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl sub_mod_384 + +.def sub_mod_384; +.type 32; +.endef +.p2align 5 +sub_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.def __sub_mod_384; +.type 32; +.endef +.p2align 5 +__sub_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret + + +.globl sub_mod_384x + +.def sub_mod_384x; +.type 32; +.endef +.p2align 5 +sub_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl mul_by_1_plus_i_mod_384x + +.def mul_by_1_plus_i_mod_384x; +.type 32; +.endef +.p2align 5 +mul_by_1_plus_i_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl sgn0_pty_mod_384 + +.def sgn0_pty_mod_384; +.type 32; +.endef +.p2align 5 +sgn0_pty_mod_384: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret + + +.globl sgn0_pty_mod_384x + +.def sgn0_pty_mod_384x; +.type 32; +.endef +.p2align 5 +sgn0_pty_mod_384x: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret + +.globl vec_select_32 + +.def vec_select_32; +.type 32; +.endef +.p2align 5 +vec_select_32: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl vec_select_48 + +.def vec_select_48; +.type 32; +.endef +.p2align 5 +vec_select_48: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl vec_select_96 + +.def vec_select_96; +.type 32; +.endef +.p2align 5 +vec_select_96: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl vec_select_192 + +.def vec_select_192; +.type 32; +.endef +.p2align 5 +vec_select_192: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl vec_select_144 + +.def vec_select_144; +.type 32; +.endef +.p2align 5 +vec_select_144: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl vec_select_288 + +.def vec_select_288; +.type 32; +.endef +.p2align 5 +vec_select_288: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + 
bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl vec_prefetch + +.def vec_prefetch; +.type 32; +.endef +.p2align 5 +vec_prefetch: + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + prfm pldl1keep, [x0] + ret + +.globl vec_is_zero_16x + +.def vec_is_zero_16x; +.type 32; +.endef +.p2align 5 +vec_is_zero_16x: + ld1 {v0.2d}, [x0], #16 + lsr x1, x1, #4 + sub x1, x1, #1 + cbz x1, .Loop_is_zero_done + +.Loop_is_zero: + ld1 {v1.2d}, [x0], #16 + orr v0.16b, v0.16b, v1.16b + sub x1, x1, #1 + cbnz x1, .Loop_is_zero + +.Loop_is_zero_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret + +.globl vec_is_equal_16x + +.def vec_is_equal_16x; +.type 32; +.endef +.p2align 5 +vec_is_equal_16x: + ld1 {v0.2d}, [x0], #16 + ld1 {v1.2d}, [x1], #16 + lsr x2, x2, #4 + eor v0.16b, v0.16b, v1.16b + +.Loop_is_equal: + sub x2, x2, #1 + cbz x2, .Loop_is_equal_done + ld1 {v1.2d}, [x0], #16 + ld1 {v2.2d}, [x1], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b .Loop_is_equal + nop + +.Loop_is_equal_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret + diff --git a/crypto/blst_src/build/coff/add_mod_384-x86_64.s b/crypto/blst_src/build/coff/add_mod_384-x86_64.s new file mode 100644 index 00000000000..3ef562a3bf2 --- /dev/null +++ b/crypto/blst_src/build/coff/add_mod_384-x86_64.s @@ -0,0 +1,2510 @@ +.text + +.globl add_mod_384 + +.def add_mod_384; .scl 2; .type 32; .endef +.p2align 5 +add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_384: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_add_mod_384: + + + call __add_mod_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_add_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + 
.byte 0xf3,0xc3 + +.LSEH_end_add_mod_384: + +.def __add_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__add_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl add_mod_384x + +.def add_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +add_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_384x: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $24,%rsp + +.LSEH_body_add_mod_384x: + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 + + movq 24+8(%rsp),%r14 + + movq 24+16(%rsp),%r13 + + movq 24+24(%rsp),%r12 + + movq 24+32(%rsp),%rbx + + movq 24+40(%rsp),%rbp + + leaq 24+48(%rsp),%rsp + +.LSEH_epilogue_add_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_384x: + + +.globl rshift_mod_384 + +.def rshift_mod_384; .scl 2; .type 32; .endef +.p2align 5 +rshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_rshift_mod_384: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_rshift_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_rshift_mod_384: + call __rshift_mod_384 + decl %edx + jnz .Loop_rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_rshift_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_rshift_mod_384: + +.def __rshift_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__rshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rsi + movq 0(%rcx),%r14 + andq %r8,%rsi + movq 8(%rcx),%r15 + negq %rsi + movq 16(%rcx),%rax + andq %rsi,%r14 + movq 24(%rcx),%rbx + andq %rsi,%r15 + movq 32(%rcx),%rbp + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rsi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rsi,%r12 + shrq 
$1,%rsi + shlq $63,%r8 + shlq $63,%r9 + orq %r14,%r8 + shlq $63,%r10 + orq %r15,%r9 + shlq $63,%r11 + orq %rax,%r10 + shlq $63,%r12 + orq %rbx,%r11 + shlq $63,%r13 + orq %rbp,%r12 + orq %rsi,%r13 + + .byte 0xf3,0xc3 + + +.globl div_by_2_mod_384 + +.def div_by_2_mod_384; .scl 2; .type 32; .endef +.p2align 5 +div_by_2_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_div_by_2_mod_384: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_div_by_2_mod_384: + + + movq 0(%rsi),%r8 + movq %rdx,%rcx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + call __rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_div_by_2_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_div_by_2_mod_384: + + +.globl lshift_mod_384 + +.def lshift_mod_384; .scl 2; .type 32; .endef +.p2align 5 +lshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_lshift_mod_384: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_lshift_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz .Loop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_lshift_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_lshift_mod_384: + +.def __lshift_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__lshift_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 + + + +.globl mul_by_3_mod_384 + +.def mul_by_3_mod_384; .scl 2; .type 32; .endef +.p2align 5 +mul_by_3_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_3_mod_384: + + 
+ pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_3_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_3_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_3_mod_384: + +.globl mul_by_8_mod_384 + +.def mul_by_8_mod_384; .scl 2; .type 32; .endef +.p2align 5 +mul_by_8_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_8_mod_384: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_mul_by_8_mod_384: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_8_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_8_mod_384: + + +.globl mul_by_3_mod_384x + +.def mul_by_3_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_by_3_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_3_mod_384x: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_3_mod_384x: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_3_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_3_mod_384x: + +.globl mul_by_8_mod_384x + +.def mul_by_8_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_by_8_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_8_mod_384x: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_mul_by_8_mod_384x: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call 
__lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_by_8_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_8_mod_384x: + + +.globl cneg_mod_384 + +.def cneg_mod_384; .scl 2; .type 32; .endef +.p2align 5 +cneg_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_cneg_mod_384: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdx + +.LSEH_body_cneg_mod_384: + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq %rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq %r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq %r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_cneg_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_cneg_mod_384: + + +.globl sub_mod_384 + +.def sub_mod_384; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_384: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sub_mod_384: + + + call __sub_mod_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sub_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_384: + +.def __sub_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq 
%rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl sub_mod_384x + +.def sub_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_384x: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $24,%rsp + +.LSEH_body_sub_mod_384x: + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 + + movq 24+8(%rsp),%r14 + + movq 24+16(%rsp),%r13 + + movq 24+24(%rsp),%r12 + + movq 24+32(%rsp),%rbx + + movq 24+40(%rsp),%rbp + + leaq 24+48(%rsp),%rsp + +.LSEH_epilogue_sub_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_384x: +.globl mul_by_1_plus_i_mod_384x + +.def mul_by_1_plus_i_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_by_1_plus_i_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_by_1_plus_i_mod_384x: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $56,%rsp + +.LSEH_body_mul_by_1_plus_i_mod_384x: + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq %r10,%rax + movq 16(%rsp),%r10 + adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 + + movq 56+8(%rsp),%r14 + + movq 56+16(%rsp),%r13 + + movq 
56+24(%rsp),%r12 + + movq 56+32(%rsp),%rbx + + movq 56+40(%rsp),%rbp + + leaq 56+48(%rsp),%rsp + +.LSEH_epilogue_mul_by_1_plus_i_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_by_1_plus_i_mod_384x: +.globl sgn0_pty_mod_384 + +.def sgn0_pty_mod_384; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mod_384: + + + movq %rcx,%rdi + movq %rdx,%rsi +.LSEH_body_sgn0_pty_mod_384: + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + +.LSEH_epilogue_sgn0_pty_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mod_384: + +.globl sgn0_pty_mod_384x + +.def sgn0_pty_mod_384x; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mod_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mod_384x: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + pushq %rbx + + subq $8,%rsp + +.LSEH_body_sgn0_pty_mod_384x: + + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + leaq 0(%rdi),%rax + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq %r8,0(%rsp) + notq %rdi + andq $1,%rbp + andq $2,%rdi + orq %rbp,%rdi + + movq 0(%rax),%r8 + movq 8(%rax),%r9 + movq 16(%rax),%r10 + movq 24(%rax),%r11 + movq 32(%rax),%rcx + movq 40(%rax),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq 0(%rsp),%rbx + + notq %rax + + testq %r8,%r8 + cmovzq %rdi,%rbp + + testq %rbx,%rbx + cmovnzq %rdi,%rax + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_sgn0_pty_mod_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mod_384x: +.globl vec_select_32 + +.def vec_select_32; .scl 2; .type 32; .endef +.p2align 5 +vec_select_32: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 16(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 16(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 16(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-16(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-16(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-16(%rcx) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,16-16(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_48 + +.def 
vec_select_48; .scl 2; .type 32; .endef +.p2align 5 +vec_select_48: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 24(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 24(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 24(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-24(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-24(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-24(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-24(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-24(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-24(%rcx) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,32-24(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_96 + +.def vec_select_96; .scl 2; .type 32; .endef +.p2align 5 +vec_select_96: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 48(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 48(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 48(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-48(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-48(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-48(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-48(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-48(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-48(%rcx) + pand %xmm4,%xmm0 + movdqu 32+16-48(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-48(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-48(%rcx) + pand %xmm4,%xmm2 + movdqu 48+16-48(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-48(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-48(%rcx) + pand %xmm4,%xmm0 + movdqu 64+16-48(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-48(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-48(%rcx) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,80-48(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_192 + +.def vec_select_192; .scl 2; .type 32; .endef +.p2align 5 +vec_select_192: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 96(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 96(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 96(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-96(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-96(%rcx) + pand %xmm4,%xmm0 + movdqu 32+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-96(%rcx) + pand %xmm4,%xmm2 + movdqu 48+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-96(%rcx) + pand %xmm4,%xmm0 + movdqu 64+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-96(%rcx) + pand %xmm4,%xmm2 + movdqu 80+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-96(%rcx) + pand %xmm4,%xmm0 + movdqu 96+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-96(%rcx) + pand %xmm4,%xmm2 + movdqu 112+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-96(%rcx) + pand %xmm4,%xmm0 + movdqu 128+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-96(%rcx) + pand %xmm4,%xmm2 + movdqu 144+16-96(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 
144+16-96(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-96(%rcx) + pand %xmm4,%xmm0 + movdqu 160+16-96(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-96(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-96(%rcx) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,176-96(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_144 + +.def vec_select_144; .scl 2; .type 32; .endef +.p2align 5 +vec_select_144: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 72(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 72(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 72(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-72(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-72(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-72(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-72(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-72(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-72(%rcx) + pand %xmm4,%xmm0 + movdqu 32+16-72(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-72(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-72(%rcx) + pand %xmm4,%xmm2 + movdqu 48+16-72(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-72(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-72(%rcx) + pand %xmm4,%xmm0 + movdqu 64+16-72(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-72(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-72(%rcx) + pand %xmm4,%xmm2 + movdqu 80+16-72(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-72(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-72(%rcx) + pand %xmm4,%xmm0 + movdqu 96+16-72(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-72(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-72(%rcx) + pand %xmm4,%xmm2 + movdqu 112+16-72(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-72(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-72(%rcx) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,128-72(%rcx) + .byte 0xf3,0xc3 + +.globl vec_select_288 + +.def vec_select_288; .scl 2; .type 32; .endef +.p2align 5 +vec_select_288: + .byte 0xf3,0x0f,0x1e,0xfa + + movd %r9d,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rdx),%xmm0 + leaq 144(%rdx),%rdx + pcmpeqd %xmm4,%xmm5 + movdqu (%r8),%xmm1 + leaq 144(%r8),%r8 + pcmpeqd %xmm5,%xmm4 + leaq 144(%rcx),%rcx + pand %xmm4,%xmm0 + movdqu 0+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-144(%rcx) + pand %xmm4,%xmm2 + movdqu 16+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-144(%rcx) + pand %xmm4,%xmm0 + movdqu 32+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-144(%rcx) + pand %xmm4,%xmm2 + movdqu 48+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-144(%rcx) + pand %xmm4,%xmm0 + movdqu 64+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-144(%rcx) + pand %xmm4,%xmm2 + movdqu 80+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-144(%rcx) + pand %xmm4,%xmm0 + movdqu 96+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-144(%rcx) + pand %xmm4,%xmm2 + movdqu 112+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-144(%rcx) + pand %xmm4,%xmm0 + movdqu 128+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-144(%rcx) + pand 
%xmm4,%xmm2 + movdqu 144+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-144(%rcx) + pand %xmm4,%xmm0 + movdqu 160+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-144(%rcx) + pand %xmm4,%xmm2 + movdqu 176+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 176+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,176-144(%rcx) + pand %xmm4,%xmm0 + movdqu 192+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 192+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,192-144(%rcx) + pand %xmm4,%xmm2 + movdqu 208+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 208+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,208-144(%rcx) + pand %xmm4,%xmm0 + movdqu 224+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 224+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,224-144(%rcx) + pand %xmm4,%xmm2 + movdqu 240+16-144(%rdx),%xmm0 + pand %xmm5,%xmm3 + movdqu 240+16-144(%r8),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,240-144(%rcx) + pand %xmm4,%xmm0 + movdqu 256+16-144(%rdx),%xmm2 + pand %xmm5,%xmm1 + movdqu 256+16-144(%r8),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,256-144(%rcx) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,272-144(%rcx) + .byte 0xf3,0xc3 + +.globl vec_prefetch + +.def vec_prefetch; .scl 2; .type 32; .endef +.p2align 5 +vec_prefetch: + .byte 0xf3,0x0f,0x1e,0xfa + + leaq -1(%rcx,%rdx,1),%rdx + movq $64,%rax + xorq %r8,%r8 + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + cmovaq %r8,%rax + prefetchnta (%rcx) + leaq (%rcx,%rax,1),%rcx + cmpq %rdx,%rcx + cmovaq %rdx,%rcx + prefetchnta (%rcx) + .byte 0xf3,0xc3 + +.globl vec_is_zero_16x + +.def vec_is_zero_16x; .scl 2; .type 32; .endef +.p2align 5 +vec_is_zero_16x: + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%edx + movdqu (%rcx),%xmm0 + leaq 16(%rcx),%rcx + +.Loop_is_zero: + decl %edx + jz .Loop_is_zero_done + movdqu (%rcx),%xmm1 + leaq 16(%rcx),%rcx + por %xmm1,%xmm0 + jmp .Loop_is_zero + +.Loop_is_zero_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %edx + testq %rax,%rax + cmovnzl %edx,%eax + xorl $1,%eax + .byte 0xf3,0xc3 + +.globl vec_is_equal_16x + +.def vec_is_equal_16x; .scl 2; .type 32; .endef +.p2align 5 +vec_is_equal_16x: + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%r8d + movdqu (%rcx),%xmm0 + movdqu (%rdx),%xmm1 + subq %rcx,%rdx + leaq 16(%rcx),%rcx + pxor %xmm1,%xmm0 + +.Loop_is_equal: + decl %r8d + jz .Loop_is_equal_done + movdqu (%rcx),%xmm1 + movdqu (%rcx,%rdx,1),%xmm2 + leaq 16(%rcx),%rcx + pxor %xmm2,%xmm1 + por %xmm1,%xmm0 + jmp .Loop_is_equal + +.Loop_is_equal_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %r8d + testq %rax,%rax + cmovnzl %r8d,%eax + xorl $1,%eax + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_add_mod_384 +.rva .LSEH_body_add_mod_384 +.rva .LSEH_info_add_mod_384_prologue + +.rva .LSEH_body_add_mod_384 +.rva .LSEH_epilogue_add_mod_384 +.rva .LSEH_info_add_mod_384_body + +.rva .LSEH_epilogue_add_mod_384 +.rva .LSEH_end_add_mod_384 +.rva .LSEH_info_add_mod_384_epilogue + +.rva 
.LSEH_begin_add_mod_384x +.rva .LSEH_body_add_mod_384x +.rva .LSEH_info_add_mod_384x_prologue + +.rva .LSEH_body_add_mod_384x +.rva .LSEH_epilogue_add_mod_384x +.rva .LSEH_info_add_mod_384x_body + +.rva .LSEH_epilogue_add_mod_384x +.rva .LSEH_end_add_mod_384x +.rva .LSEH_info_add_mod_384x_epilogue + +.rva .LSEH_begin_rshift_mod_384 +.rva .LSEH_body_rshift_mod_384 +.rva .LSEH_info_rshift_mod_384_prologue + +.rva .LSEH_body_rshift_mod_384 +.rva .LSEH_epilogue_rshift_mod_384 +.rva .LSEH_info_rshift_mod_384_body + +.rva .LSEH_epilogue_rshift_mod_384 +.rva .LSEH_end_rshift_mod_384 +.rva .LSEH_info_rshift_mod_384_epilogue + +.rva .LSEH_begin_div_by_2_mod_384 +.rva .LSEH_body_div_by_2_mod_384 +.rva .LSEH_info_div_by_2_mod_384_prologue + +.rva .LSEH_body_div_by_2_mod_384 +.rva .LSEH_epilogue_div_by_2_mod_384 +.rva .LSEH_info_div_by_2_mod_384_body + +.rva .LSEH_epilogue_div_by_2_mod_384 +.rva .LSEH_end_div_by_2_mod_384 +.rva .LSEH_info_div_by_2_mod_384_epilogue + +.rva .LSEH_begin_lshift_mod_384 +.rva .LSEH_body_lshift_mod_384 +.rva .LSEH_info_lshift_mod_384_prologue + +.rva .LSEH_body_lshift_mod_384 +.rva .LSEH_epilogue_lshift_mod_384 +.rva .LSEH_info_lshift_mod_384_body + +.rva .LSEH_epilogue_lshift_mod_384 +.rva .LSEH_end_lshift_mod_384 +.rva .LSEH_info_lshift_mod_384_epilogue + +.rva .LSEH_begin_mul_by_3_mod_384 +.rva .LSEH_body_mul_by_3_mod_384 +.rva .LSEH_info_mul_by_3_mod_384_prologue + +.rva .LSEH_body_mul_by_3_mod_384 +.rva .LSEH_epilogue_mul_by_3_mod_384 +.rva .LSEH_info_mul_by_3_mod_384_body + +.rva .LSEH_epilogue_mul_by_3_mod_384 +.rva .LSEH_end_mul_by_3_mod_384 +.rva .LSEH_info_mul_by_3_mod_384_epilogue + +.rva .LSEH_begin_mul_by_8_mod_384 +.rva .LSEH_body_mul_by_8_mod_384 +.rva .LSEH_info_mul_by_8_mod_384_prologue + +.rva .LSEH_body_mul_by_8_mod_384 +.rva .LSEH_epilogue_mul_by_8_mod_384 +.rva .LSEH_info_mul_by_8_mod_384_body + +.rva .LSEH_epilogue_mul_by_8_mod_384 +.rva .LSEH_end_mul_by_8_mod_384 +.rva .LSEH_info_mul_by_8_mod_384_epilogue + +.rva .LSEH_begin_mul_by_3_mod_384x +.rva .LSEH_body_mul_by_3_mod_384x +.rva .LSEH_info_mul_by_3_mod_384x_prologue + +.rva .LSEH_body_mul_by_3_mod_384x +.rva .LSEH_epilogue_mul_by_3_mod_384x +.rva .LSEH_info_mul_by_3_mod_384x_body + +.rva .LSEH_epilogue_mul_by_3_mod_384x +.rva .LSEH_end_mul_by_3_mod_384x +.rva .LSEH_info_mul_by_3_mod_384x_epilogue + +.rva .LSEH_begin_mul_by_8_mod_384x +.rva .LSEH_body_mul_by_8_mod_384x +.rva .LSEH_info_mul_by_8_mod_384x_prologue + +.rva .LSEH_body_mul_by_8_mod_384x +.rva .LSEH_epilogue_mul_by_8_mod_384x +.rva .LSEH_info_mul_by_8_mod_384x_body + +.rva .LSEH_epilogue_mul_by_8_mod_384x +.rva .LSEH_end_mul_by_8_mod_384x +.rva .LSEH_info_mul_by_8_mod_384x_epilogue + +.rva .LSEH_begin_cneg_mod_384 +.rva .LSEH_body_cneg_mod_384 +.rva .LSEH_info_cneg_mod_384_prologue + +.rva .LSEH_body_cneg_mod_384 +.rva .LSEH_epilogue_cneg_mod_384 +.rva .LSEH_info_cneg_mod_384_body + +.rva .LSEH_epilogue_cneg_mod_384 +.rva .LSEH_end_cneg_mod_384 +.rva .LSEH_info_cneg_mod_384_epilogue + +.rva .LSEH_begin_sub_mod_384 +.rva .LSEH_body_sub_mod_384 +.rva .LSEH_info_sub_mod_384_prologue + +.rva .LSEH_body_sub_mod_384 +.rva .LSEH_epilogue_sub_mod_384 +.rva .LSEH_info_sub_mod_384_body + +.rva .LSEH_epilogue_sub_mod_384 +.rva .LSEH_end_sub_mod_384 +.rva .LSEH_info_sub_mod_384_epilogue + +.rva .LSEH_begin_sub_mod_384x +.rva .LSEH_body_sub_mod_384x +.rva .LSEH_info_sub_mod_384x_prologue + +.rva .LSEH_body_sub_mod_384x +.rva .LSEH_epilogue_sub_mod_384x +.rva .LSEH_info_sub_mod_384x_body + +.rva .LSEH_epilogue_sub_mod_384x +.rva .LSEH_end_sub_mod_384x 
+.rva .LSEH_info_sub_mod_384x_epilogue + +.rva .LSEH_begin_mul_by_1_plus_i_mod_384x +.rva .LSEH_body_mul_by_1_plus_i_mod_384x +.rva .LSEH_info_mul_by_1_plus_i_mod_384x_prologue + +.rva .LSEH_body_mul_by_1_plus_i_mod_384x +.rva .LSEH_epilogue_mul_by_1_plus_i_mod_384x +.rva .LSEH_info_mul_by_1_plus_i_mod_384x_body + +.rva .LSEH_epilogue_mul_by_1_plus_i_mod_384x +.rva .LSEH_end_mul_by_1_plus_i_mod_384x +.rva .LSEH_info_mul_by_1_plus_i_mod_384x_epilogue + +.rva .LSEH_begin_sgn0_pty_mod_384 +.rva .LSEH_body_sgn0_pty_mod_384 +.rva .LSEH_info_sgn0_pty_mod_384_prologue + +.rva .LSEH_body_sgn0_pty_mod_384 +.rva .LSEH_epilogue_sgn0_pty_mod_384 +.rva .LSEH_info_sgn0_pty_mod_384_body + +.rva .LSEH_epilogue_sgn0_pty_mod_384 +.rva .LSEH_end_sgn0_pty_mod_384 +.rva .LSEH_info_sgn0_pty_mod_384_epilogue + +.rva .LSEH_begin_sgn0_pty_mod_384x +.rva .LSEH_body_sgn0_pty_mod_384x +.rva .LSEH_info_sgn0_pty_mod_384x_prologue + +.rva .LSEH_body_sgn0_pty_mod_384x +.rva .LSEH_epilogue_sgn0_pty_mod_384x +.rva .LSEH_info_sgn0_pty_mod_384x_body + +.rva .LSEH_epilogue_sgn0_pty_mod_384x +.rva .LSEH_end_sgn0_pty_mod_384x +.rva .LSEH_info_sgn0_pty_mod_384x_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_add_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_add_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_add_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_add_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_add_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_add_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_rshift_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_rshift_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_rshift_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_div_by_2_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_div_by_2_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_div_by_2_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 
0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_lshift_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_lshift_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_lshift_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_3_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_mul_by_3_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_mul_by_3_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_8_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_mul_by_8_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_mul_by_8_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_3_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_mul_by_3_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_mul_by_3_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_8_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_mul_by_8_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_mul_by_8_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_cneg_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_cneg_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 
0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_cneg_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sub_mod_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sub_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sub_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sub_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_by_1_plus_i_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_mul_by_1_plus_i_mod_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x07,0x00 +.byte 0x00,0xe4,0x08,0x00 +.byte 0x00,0xd4,0x09,0x00 +.byte 0x00,0xc4,0x0a,0x00 +.byte 0x00,0x34,0x0b,0x00 +.byte 0x00,0x54,0x0c,0x00 +.byte 0x00,0x74,0x0e,0x00 +.byte 0x00,0x64,0x0f,0x00 +.byte 0x00,0xc2 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_mul_by_1_plus_i_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sgn0_pty_mod_384_body: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sgn0_pty_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mod_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sgn0_pty_mod_384x_body: +.byte 1,0,9,0 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sgn0_pty_mod_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s b/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s new file mode 100644 index 00000000000..53662b4a56a --- /dev/null +++ b/crypto/blst_src/build/coff/add_mod_384x384-x86_64.s @@ -0,0 +1,330 @@ +.text + +.def __add_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__add_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 
16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq %r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq %rax,64(%rdi) + cmovcq %r13,%rsi + movq %rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.def __sub_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.globl add_mod_384x384 + +.def add_mod_384x384; .scl 2; .type 32; .endef +.p2align 5 +add_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_add_mod_384x384: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_add_mod_384x384: + + + call __add_mod_384x384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_add_mod_384x384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_add_mod_384x384: + +.globl sub_mod_384x384 + +.def sub_mod_384x384; .scl 2; .type 32; .endef +.p2align 5 +sub_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sub_mod_384x384: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sub_mod_384x384: + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sub_mod_384x384: + mov 8(%rsp),%rdi + mov 
16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sub_mod_384x384: +.section .pdata +.p2align 2 +.rva .LSEH_begin_add_mod_384x384 +.rva .LSEH_body_add_mod_384x384 +.rva .LSEH_info_add_mod_384x384_prologue + +.rva .LSEH_body_add_mod_384x384 +.rva .LSEH_epilogue_add_mod_384x384 +.rva .LSEH_info_add_mod_384x384_body + +.rva .LSEH_epilogue_add_mod_384x384 +.rva .LSEH_end_add_mod_384x384 +.rva .LSEH_info_add_mod_384x384_epilogue + +.rva .LSEH_begin_sub_mod_384x384 +.rva .LSEH_body_sub_mod_384x384 +.rva .LSEH_info_sub_mod_384x384_prologue + +.rva .LSEH_body_sub_mod_384x384 +.rva .LSEH_epilogue_sub_mod_384x384 +.rva .LSEH_info_sub_mod_384x384_body + +.rva .LSEH_epilogue_sub_mod_384x384 +.rva .LSEH_end_sub_mod_384x384 +.rva .LSEH_info_sub_mod_384x384_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_add_mod_384x384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_add_mod_384x384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_add_mod_384x384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sub_mod_384x384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sub_mod_384x384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sub_mod_384x384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S new file mode 100644 index 00000000000..d2fd83182b4 --- /dev/null +++ b/crypto/blst_src/build/coff/ct_inverse_mod_256-armv8.S @@ -0,0 +1,799 @@ +.text + +.globl ct_inverse_mod_256 + +.def ct_inverse_mod_256; +.type 32; +.endef +.p2align 5 +ct_inverse_mod_256: +.long 3573752639 + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl .Lab_approximation_31_256_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extension + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extension + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst 
|a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc 
x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] 
last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 +.long 3573752767 + ret + + +//////////////////////////////////////////////////////////////////////// +.def __smul_256x63; +.type 32; +.endef +.p2align 5 +__smul_256x63: + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + csel x22, x22, xzr, ne + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in __smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + csel x23, x23, xzr, ne + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret + + +.def __smul_512x63_tail; +.type 32; +.endef +.p2align 5 +__smul_512x63_tail: + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, [x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume 
|v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret + + +.def __smul_256_n_shift_by_31; +.type 32; +.endef +.p2align 5 +__smul_256_n_shift_by_31: + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret + +.def __ab_approximation_31_256; +.type 32; +.endef +.p2align 4 +__ab_approximation_31_256: + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +.Lab_approximation_31_256_loaded: + orr x19, x7, x11 // check top-most limbs, ... + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x5, ne + orr x19, x7, x11 // and ones before top-most, ... + csel x10, x10, x9, ne + + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x4, ne + orr x19, x7, x11 // and one more, ... 
+ csel x10, x10, x8, ne + + clz x19, x19 + cmp x19, #64 + csel x19, x19, xzr, ne + csel x7, x7, x6, ne + csel x11, x11, x10, ne + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret + + +.def __inner_loop_31_256; +.type 32; +.endef +.p2align 4 +__inner_loop_31_256: + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +.Loop_31_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x15, x15, x13, hs // exchange |fg0| and |fg1| + csel x13, x13, x19, hs + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, .Loop_31_256 + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret + + +.def __inner_loop_62_256; +.type 32; +.endef +.p2align 4 +__inner_loop_62_256: + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +.Loop_62_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov x20, x13 + csel x12, x12, x14, hs // exchange |f0| and |f1| + csel x14, x14, x19, hs + csel x13, x13, x15, hs // exchange |g0| and |g1| + csel x15, x15, x20, hs + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, .Loop_62_256 + + ret + diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s new file mode 100644 index 00000000000..d1aa7597bc0 --- /dev/null +++ b/crypto/blst_src/build/coff/ct_inverse_mod_256-x86_64.s @@ -0,0 +1,1213 @@ +.text + +.globl ct_inverse_mod_256 + +.def ct_inverse_mod_256; .scl 2; .type 32; .endef +.p2align 5 +ct_inverse_mod_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_ct_inverse_mod_256: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $1072,%rsp + +.LSEH_body_ct_inverse_mod_256: + + + leaq 48+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + movq 0(%rdx),%r12 + movq 8(%rdx),%r13 + movq 16(%rdx),%r14 + movq 24(%rdx),%r15 + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + + movq %r12,32(%rax) + movq %r13,40(%rax) + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rax,%rsi + + + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,64(%rdi) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,72(%rdi) + + + xorq $256,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + + movq 64(%rsi),%r8 + movq 104(%rsi),%r12 + movq %r8,%r9 + imulq 0(%rsp),%r8 + movq %r12,%r13 + imulq 8(%rsp),%r12 + addq %r12,%r8 + movq %r8,32(%rdi) + sarq $63,%r8 + movq %r8,40(%rdi) + movq %r8,48(%rdi) + movq %r8,56(%rdi) + movq %r8,64(%rdi) + leaq 64(%rsi),%rsi + + imulq %rdx,%r9 + imulq %rcx,%r13 + addq %r13,%r9 + movq %r9,72(%rdi) + sarq $63,%r9 + movq %r9,80(%rdi) + movq %r9,88(%rdi) + movq %r9,96(%rdi) + movq %r9,104(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call 
__smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + sarq $63,%rbp + movq %rbp,40(%rdi) + movq %rbp,48(%rdi) + movq %rbp,56(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + 
call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + + xorq $256+64,%rsi + movl $47,%edx + + movq 0(%rsi),%r8 + + movq 32(%rsi),%r10 + + call __inner_loop_62_256 + + + + + + + + leaq 64(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_512x63 + adcq %rbp,%rdx + + movq 40(%rsp),%rsi + movq %rdx,%rax + sarq $63,%rdx + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + adcq $0,%rax + + movq %rax,%rdx + negq %rax + orq %rax,%rdx + sarq $63,%rax + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + xorq %rax,%r8 + xorq %rcx,%rcx + xorq %rax,%r9 + subq %rax,%rcx + xorq %rax,%r10 + xorq %rax,%rdx + addq %rcx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 1072(%rsp),%r8 + movq 
0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_ct_inverse_mod_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_ct_inverse_mod_256: +.def __smulq_512x63; .scl 3; .type 32; .endef +.p2align 5 +__smulq_512x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %r9,8(%rdi) + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %r10,16(%rdi) + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %r11,24(%rdi) + + movq 40(%rsi),%r8 + movq 48(%rsi),%r9 + movq 56(%rsi),%r10 + movq 64(%rsi),%r11 + movq 72(%rsi),%r12 + movq 80(%rsi),%r13 + movq 88(%rsi),%r14 + movq 96(%rsi),%r15 + + movq %rcx,%rdx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rcx + addq %rax,%rcx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rcx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rcx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rcx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rcx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rcx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rcx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rcx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + imulq %rcx + addq %rax,%r15 + adcq $0,%rdx + + movq %rbp,%rbx + sarq $63,%rbp + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %rbp,%r14 + adcq %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 + + +.def __smulq_256x63; .scl 3; .type 32; .endef +.p2align 5 +__smulq_256x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %rcx,%rdx + movq 40+0(%rsi),%r12 + movq 40+8(%rsi),%r13 + movq 40+16(%rsi),%r14 + movq 40+24(%rsi),%r15 + movq 40+32(%rsi),%rcx + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq 
%rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rcx + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rcx + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + andq %rbx,%rcx + negq %rcx + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rbp,32(%rdi) + + .byte 0xf3,0xc3 + +.def __smulq_256_n_shift_by_31; .scl 3; .type 32; .endef +.p2align 5 +__smulq_256_n_shift_by_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,0(%rdi) + movq %rcx,8(%rdi) + movq %rdx,%rbp + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + + movq %rbp,%rbx + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rbx + addq %rax,%rbx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + andq %rbx,%rbp + negq %rbp + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r14 + movq 32+24(%rsi),%r15 + + movq %rcx,%rbx + sarq $63,%rcx + xorq %rax,%rax + subq %rcx,%rax + + xorq %rcx,%rbx + addq %rax,%rbx + + xorq %rcx,%r12 + xorq %rcx,%r13 + xorq %rcx,%r14 + xorq %rcx,%r15 + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + andq %rbx,%rcx + negq %rcx + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq 0(%rdi),%rdx + movq 8(%rdi),%rcx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%rbp,%r11 + + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + xorq %rbp,%rdx + xorq %rbp,%rcx + addq %rax,%rdx + addq %rax,%rcx + + .byte 0xf3,0xc3 + +.def __ab_approximation_31_256; .scl 3; .type 32; .endef +.p2align 5 +__ab_approximation_31_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 24(%rsi),%r9 + movq 56(%rsi),%r11 + movq 16(%rsi),%rbx + movq 48(%rsi),%rbp + movq 8(%rsi),%r8 + movq 40(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 32(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + notq %rax + andq %rax,%r9 + andq %rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp 
__inner_loop_31_256 + + .byte 0xf3,0xc3 + +.def __inner_loop_31_256; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_31_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31_256: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edx + jnz .Loop_31_256 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 + + +.def __inner_loop_62_256; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_62_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movl %edx,%r15d + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq %rdx,%r13 + movq %rdx,%r14 + +.Loop_62_256: + xorq %rax,%rax + testq %r14,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq %r14,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%r15d + jnz .Loop_62_256 + + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_ct_inverse_mod_256 +.rva .LSEH_body_ct_inverse_mod_256 +.rva .LSEH_info_ct_inverse_mod_256_prologue + +.rva .LSEH_body_ct_inverse_mod_256 +.rva .LSEH_epilogue_ct_inverse_mod_256 +.rva .LSEH_info_ct_inverse_mod_256_body + +.rva .LSEH_epilogue_ct_inverse_mod_256 +.rva .LSEH_end_ct_inverse_mod_256 +.rva .LSEH_info_ct_inverse_mod_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_ct_inverse_mod_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_ct_inverse_mod_256_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x86,0x00 +.byte 0x00,0xe4,0x87,0x00 +.byte 0x00,0xd4,0x88,0x00 +.byte 0x00,0xc4,0x89,0x00 +.byte 0x00,0x34,0x8a,0x00 +.byte 0x00,0x54,0x8b,0x00 +.byte 0x00,0x74,0x8d,0x00 +.byte 0x00,0x64,0x8e,0x00 +.byte 0x00,0x01,0x8c,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_ct_inverse_mod_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S new file mode 100644 index 00000000000..86fdc405828 --- /dev/null +++ b/crypto/blst_src/build/coff/ct_inverse_mod_384-armv8.S @@ -0,0 +1,730 @@ +.text + +.globl ct_inverse_mod_383 + +.def ct_inverse_mod_383; +.type 32; +.endef +.p2align 5 +ct_inverse_mod_383: +.long 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl .Lab_approximation_62_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extension + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extension + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // 
corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src 
|a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 +.long 3573752767 + ret + + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
+.def __smul_383x63; +.type 32; +.endef +.p2align 5 +__smul_383x63: + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret + + +.def __smul_767x63_tail; +.type 32; +.endef +.p2align 5 +__smul_767x63_tail: + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret + + +.def __smul_383_n_shift_by_62; +.type 32; +.endef +.p2align 5 +__smul_383_n_shift_by_62: + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, 
x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + adcs x6, x6, xzr + umulh x23, x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret + +.def __ab_approximation_62; +.type 32; +.endef +.p2align 4 +__ab_approximation_62: + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +.Lab_approximation_62_loaded: + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x22, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x22, x8, x14 // ... and ones before that ... 
+ csel x13, x13, x11, ne + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x22, x8, x14 + csel x13, x13, x10, ne + + clz x22, x22 + cmp x22, #64 + csel x22, x22, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret + +.def __inner_loop_62; +.type 32; +.endef +.p2align 4 +__inner_loop_62: + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +.Loop_62: + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + csel x9, x9, x3, hs // |b_| = |a_| + csel x14, x14, x8, hs + csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x8, x27, x25, hs + csel x15, x15, x17, hs // exchange |f0| and |f1| + csel x17, x17, x22, hs + csel x16, x16, x19, hs // exchange |g0| and |g1| + csel x19, x19, x23, hs + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, .Loop_62 + + ret + diff --git a/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S new file mode 100644 index 00000000000..efe90a82144 --- /dev/null +++ b/crypto/blst_src/build/coff/ct_is_square_mod_384-armv8.S @@ -0,0 +1,335 @@ +.text + +.globl ct_is_square_mod_384 + +.def ct_is_square_mod_384; +.type 32; +.endef +.p2align 5 +ct_is_square_mod_384: +.long 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the .Legendre symbol + mov x15, #24 // 24 is 768/30-1 + b .Loop_is_square + +.p2align 4 +.Loop_is_square: + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, .Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // and loaded + //ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 +.long 3573752767 + ret + + +.def __smul_384_n_shift_by_30; +.type 32; +.endef +.p2align 5 +__smul_384_n_shift_by_30: + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr x7, x8, x7, #30 + extr x8, x9, x8, #30 + + eor x3, 
x3, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret + +.def __ab_approximation_30; +.type 32; +.endef +.p2align 4 +__ab_approximation_30: + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x21, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x21, x8, x14 // ... and ones before that ... + csel x13, x13, x11, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x21, x8, x14 // and one more, ... + csel x13, x13, x10, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x3, ne + orr x21, x8, x14 + csel x13, x13, x9, ne + + clz x21, x21 + cmp x21, #64 + csel x21, x21, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret + + +.def __inner_loop_30; +.type 32; +.endef +.p2align 4 +__inner_loop_30: + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +.Loop_30: + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + csel x14, x14, x8, hs // |b_| = |a_| + csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x20, x20, x17, hs // exchange |fg0| and |fg1| + csel x17, x17, x21, hs + csel x2, x2, x25, hs + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, .Loop_30 + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret + +.def __inner_loop_48; +.type 32; +.endef +.p2align 4 +__inner_loop_48: +.Loop_48: + sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x3, x9 + sub x15, x15, #1 + and x21, x9, x24 + sub x22, x9, x3 // |b_|-|a_| + subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + csel x9, x9, x3, hs // |b_| = |a_| + csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x2, x2, x25, hs + add x23, x9, #2 + lsr x3, x3, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, .Loop_48 + + ret + diff --git a/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s new file mode 100644 index 00000000000..9ac32f50852 --- 
/dev/null +++ b/crypto/blst_src/build/coff/ct_is_square_mod_384-x86_64.s @@ -0,0 +1,509 @@ +.text + +.globl ct_is_square_mod_384 + +.def ct_is_square_mod_384; .scl 2; .type 32; .endef +.p2align 5 +ct_is_square_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_ct_is_square_mod_384: + + + pushq %rbp + + movq %rcx,%rdi + movq %rdx,%rsi + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $536,%rsp + +.LSEH_body_ct_is_square_mod_384: + + + leaq 24+255(%rsp),%rax + andq $-256,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbx + movq 24(%rsi),%rcx + movq 32(%rsi),%rdx + movq 40(%rsi),%rdi + movq %rax,%rsi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rcx,72(%rax) + movq %rdx,80(%rax) + movq %rdi,88(%rax) + + xorq %rbp,%rbp + movl $24,%ecx + jmp .Loop_is_square + +.p2align 5 +.Loop_is_square: + movl %ecx,16(%rsp) + + call __ab_approximation_30 + movq %rax,0(%rsp) + movq %rbx,8(%rsp) + + movq $128+48,%rdi + xorq %rsi,%rdi + call __smulq_384_n_shift_by_30 + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq -48(%rdi),%rdi + call __smulq_384_n_shift_by_30 + + movl 16(%rsp),%ecx + xorq $128,%rsi + + andq 48(%rdi),%r14 + shrq $1,%r14 + addq %r14,%rbp + + subl $1,%ecx + jnz .Loop_is_square + + + + + movq 48(%rsi),%r9 + call __inner_loop_48 + + movq $1,%rax + andq %rbp,%rax + xorq $1,%rax + + leaq 536(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_ct_is_square_mod_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_ct_is_square_mod_384: + +.def __smulq_384_n_shift_by_30; .scl 3; .type 32; .endef +.p2align 5 +__smulq_384_n_shift_by_30: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r14 + andq %rbx,%r14 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r14 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r14 + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq 
$0,%r12 + adcq $0,%r13 + + movq %rdx,%r15 + andq %rbx,%r15 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r15 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r15 + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %r15,%r14 + + shrdq $30,%r9,%r8 + shrdq $30,%r10,%r9 + shrdq $30,%r11,%r10 + shrdq $30,%r12,%r11 + shrdq $30,%r13,%r12 + shrdq $30,%r14,%r13 + + sarq $63,%r14 + xorq %rbx,%rbx + subq %r14,%rbx + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.def __ab_approximation_30; .scl 3; .type 32; .endef +.p2align 5 +__ab_approximation_30: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 88(%rsi),%rbx + movq 80(%rsi),%r15 + movq 72(%rsi),%r14 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r11,%r12 + movq 64(%rsi),%r11 + cmovzq %r14,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r10,%r12 + movq 56(%rsi),%r10 + cmovzq %r11,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r9,%r12 + movq 48(%rsi),%r9 + cmovzq %r10,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r8,%r12 + cmovzq %r9,%r15 + + movq %r13,%rax + orq %rbx,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r13 + cmovzq %r9,%rbx + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%r12,%r13 + shldq %cl,%r15,%rbx + + movq $0xFFFFFFFF00000000,%rax + movl %r8d,%r8d + movl %r9d,%r9d + andq %rax,%r13 + andq %rax,%rbx + orq %r13,%r8 + orq %rbx,%r9 + + jmp __inner_loop_30 + + .byte 0xf3,0xc3 + +.def __inner_loop_30; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_30: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rbx + movq $0x800000007FFFFFFF,%rcx + leaq -1(%rbx),%r15 + movl $30,%edi + +.Loop_30: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbx,%r12 + movq %rcx,%r13 + movq %rbp,%r14 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rcx,%rbx + cmovbq %r12,%rcx + cmovbq %rax,%rbp + + subq %r9,%r8 + subq %rcx,%rbx + addq %r15,%rbx + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbx + cmovzq %r13,%rcx + cmovzq %r14,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rcx,%rcx + leaq (%rax,%rbp,1),%rbp + subq %r15,%rcx + + subl $1,%edi + jnz .Loop_30 + + shrq $32,%r15 + movl %ebx,%eax + shrq $32,%rbx + movl %ecx,%edx + shrq $32,%rcx + subq %r15,%rax + subq %r15,%rbx + subq %r15,%rdx + subq %r15,%rcx + + .byte 0xf3,0xc3 + + +.def __inner_loop_48; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_48: + .byte 0xf3,0x0f,0x1e,0xfa + + movl $48,%edi + +.Loop_48: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbp,%r12 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rax,%rbp + + subq %r9,%r8 + + testq $1,%r10 
+ cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rax,%rbp + + subl $1,%edi + jnz .Loop_48 + + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_ct_is_square_mod_384 +.rva .LSEH_body_ct_is_square_mod_384 +.rva .LSEH_info_ct_is_square_mod_384_prologue + +.rva .LSEH_body_ct_is_square_mod_384 +.rva .LSEH_epilogue_ct_is_square_mod_384 +.rva .LSEH_info_ct_is_square_mod_384_body + +.rva .LSEH_epilogue_ct_is_square_mod_384 +.rva .LSEH_end_ct_is_square_mod_384 +.rva .LSEH_info_ct_is_square_mod_384_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_ct_is_square_mod_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_ct_is_square_mod_384_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x43,0x00 +.byte 0x00,0xe4,0x44,0x00 +.byte 0x00,0xd4,0x45,0x00 +.byte 0x00,0xc4,0x46,0x00 +.byte 0x00,0x34,0x47,0x00 +.byte 0x00,0x54,0x48,0x00 +.byte 0x00,0x74,0x4a,0x00 +.byte 0x00,0x64,0x4b,0x00 +.byte 0x00,0x01,0x49,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_ct_is_square_mod_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..d027a6dc5c0 --- /dev/null +++ b/crypto/blst_src/build/coff/ctq_inverse_mod_384-x86_64.s @@ -0,0 +1,1230 @@ +.comm __blst_platform_cap,4 +.text + +.globl ct_inverse_mod_383 + +.def ct_inverse_mod_383; .scl 2; .type 32; .endef +.p2align 5 +ct_inverse_mod_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_ct_inverse_mod_383: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz ct_inverse_mod_383$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $1112,%rsp + +.LSEH_body_ct_inverse_mod_383: + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + 
movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + 
movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + xorq $256+96,%rsi + movl $62,%edi + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 48(%rsi),%r10 + movq 56(%rsi),%r11 + call __inner_loop_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + movq %r8,0(%rdi) + movq %r10,48(%rdi) + + + + leaq 96(%rsi),%rsi + leaq 96(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + + xorq $256+96,%rsi + movl $22,%edi + + movq 0(%rsi),%r8 + xorq %r9,%r9 + movq 48(%rsi),%r10 + xorq %r11,%r11 + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_ct_inverse_mod_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_ct_inverse_mod_383: +.def __smulq_767x63; .scl 3; .type 32; .endef +.p2align 5 
+__smulq_767x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + movq %r9,8(%rdi) + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + movq %r10,16(%rdi) + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %r11,24(%rdi) + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + movq %r12,32(%rdi) + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + movq %r13,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + movq %rdx,%rsi + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rsi + addq %rax,%rsi + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rbx + xorq %rdx,%rbp + xorq %rdx,%rcx + xorq %rdx,%rdi + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulq %rsi + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rsi + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rsi + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rsi + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rsi + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rsi + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %rdx,%rbx + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + mulq %rsi + addq %rax,%rbp + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rcx + mulq %rsi + addq %rax,%rcx + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%rdi + movq 8(%rsp),%rdx + imulq %rsi,%rax + movq 16(%rsp),%rsi + addq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 + +.def __smulq_383x63; .scl 3; .type 32; .endef +.p2align 5 +__smulq_383x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 
40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.def __smulq_383_n_shift_by_62; .scl 3; .type 32; .endef +.p2align 5 +__smulq_383_n_shift_by_62: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq 48(%rsi),%rsi + movq %rdx,%r14 + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq 
$0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $62,%r9,%r8 + shrdq $62,%r10,%r9 + shrdq $62,%r11,%r10 + shrdq $62,%r12,%r11 + shrdq $62,%r13,%r12 + shrdq $62,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 + +.def __ab_approximation_62; .scl 3; .type 32; .endef +.p2align 5 +__ab_approximation_62: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 16(%rsi),%r8 + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 8(%rsi),%r8 + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 0(%rsi),%r8 + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + jmp __inner_loop_62 + + .byte 0xf3,0xc3 + +.def __inner_loop_62; .scl 3; .type 32; .endef +.p2align 3 +.long 0 +__inner_loop_62: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + movq %rsi,8(%rsp) + +.Loop_62: + xorq %rax,%rax + xorq %rbx,%rbx + testq $1,%r8 + movq %r10,%rbp + movq %r11,%r14 + cmovnzq %r10,%rax + cmovnzq %r11,%rbx + subq %r8,%rbp + sbbq %r9,%r14 + movq %r8,%r15 + movq %r9,%rsi + subq %rax,%r8 + sbbq %rbx,%r9 + cmovcq %rbp,%r8 + cmovcq %r14,%r9 + cmovcq %r15,%r10 + cmovcq %rsi,%r11 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrdq $1,%r9,%r8 + shrq $1,%r9 + testq $1,%r15 + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + movq 8(%rsp),%rsi + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_ct_inverse_mod_383 +.rva .LSEH_body_ct_inverse_mod_383 +.rva .LSEH_info_ct_inverse_mod_383_prologue + +.rva .LSEH_body_ct_inverse_mod_383 +.rva .LSEH_epilogue_ct_inverse_mod_383 +.rva .LSEH_info_ct_inverse_mod_383_body + +.rva .LSEH_epilogue_ct_inverse_mod_383 +.rva .LSEH_end_ct_inverse_mod_383 +.rva .LSEH_info_ct_inverse_mod_383_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_ct_inverse_mod_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 
0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_ct_inverse_mod_383_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x8b,0x00 +.byte 0x00,0xe4,0x8c,0x00 +.byte 0x00,0xd4,0x8d,0x00 +.byte 0x00,0xc4,0x8e,0x00 +.byte 0x00,0x34,0x8f,0x00 +.byte 0x00,0x54,0x90,0x00 +.byte 0x00,0x74,0x92,0x00 +.byte 0x00,0x64,0x93,0x00 +.byte 0x00,0x01,0x91,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_ct_inverse_mod_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..4f7dd6d1552 --- /dev/null +++ b/crypto/blst_src/build/coff/ctx_inverse_mod_384-x86_64.s @@ -0,0 +1,1601 @@ +.text + +.globl ctx_inverse_mod_383 + +.def ctx_inverse_mod_383; .scl 2; .type 32; .endef +.p2align 5 +ctx_inverse_mod_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_ctx_inverse_mod_383: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +ct_inverse_mod_383$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $1112,%rsp + +.LSEH_body_ctx_inverse_mod_383: + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 
96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + 
+ movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq 
%rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq 
%rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + + xorq $256+96,%rsi + movl $53,%edi + + movq 0(%rsi),%r8 + + movq 48(%rsi),%r10 + + call __tail_loop_53 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulx_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_ctx_inverse_mod_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_ctx_inverse_mod_383: +.def __smulx_767x63; .scl 3; .type 32; .endef +.p2align 5 +__smulx_767x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + movq %rcx,%rax + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + sarq $63,%rax + xorq %rsi,%rsi + subq %rax,%rsi + + xorq %rax,%rdx + addq %rsi,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %rax,%r13 + xorq %rax,%r14 + xorq %rax,%r15 + xorq %rax,%rbx + xorq %rax,%rbp + xorq %rax,%rcx + xorq %rax,%rdi + addq %rsi,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq 
$0,%rdi + + mulxq %r8,%r8,%rax + mulxq %r9,%r9,%rsi + addq %rax,%r9 + mulxq %r10,%r10,%rax + adcq %rsi,%r10 + mulxq %r11,%r11,%rsi + adcq %rax,%r11 + mulxq %r12,%r12,%rax + adcq %rsi,%r12 + mulxq %r13,%r13,%rsi + adcq %rax,%r13 + mulxq %r14,%r14,%rax + adcq %rsi,%r14 + mulxq %r15,%r15,%rsi + adcq %rax,%r15 + mulxq %rbx,%rbx,%rax + adcq %rsi,%rbx + mulxq %rbp,%rbp,%rsi + adcq %rax,%rbp + mulxq %rcx,%rcx,%rax + adcq %rsi,%rcx + mulxq %rdi,%rdi,%rsi + movq 8(%rsp),%rdx + movq 16(%rsp),%rsi + adcq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 + +.def __smulx_383x63; .scl 3; .type 32; .endef +.p2align 5 +__smulx_383x63: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + movq %rcx,%rdx + adcq %rbp,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + adcq %rbp,%r13 + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.def __smulx_383_n_shift_by_31; .scl 3; .type 32; .endef +.p2align 5 +__smulx_383_n_shift_by_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + xorq %r14,%r14 + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + 
adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq %rdx,%r14 + + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%rax + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%r12,%r11 + shrdq $31,%rax,%r12 + shrdq $31,%r14,%rax + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 + +.def __smulx_191_n_shift_by_31; .scl 3; .type 32; .endef +.p2align 5 +__smulx_191_n_shift_by_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %r10,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r10 + addq %rbp,%r9 + adcq $0,%r10 + imulq %rdx + addq %rax,%r10 + adcq $0,%rdx + movq %rdx,%r14 + movq %rcx,%rdx + movq 48+0(%rsi),%r11 + movq 48+8(%rsi),%r12 + movq 48+16(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r11,%r11,%rbp + mulxq %r12,%r12,%r13 + addq %rbp,%r12 + adcq $0,%r13 + imulq %rdx + addq %rax,%r13 + adcq $0,%rdx + addq %r8,%r11 + adcq %r9,%r12 + adcq %r10,%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r12,%r11 + shrdq $31,%r13,%r12 + shrdq $31,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 + +.def __ab_approximation_31; .scl 3; .type 32; .endef +.p2align 5 +__ab_approximation_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 16(%rsi),%r8 + cmovzq %r10,%rbp + movq 64(%rsi),%r10 + + movq 
%r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 8(%rsi),%r8 + cmovzq %r10,%rbp + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + andnq %r9,%rax,%r9 + andnq %r11,%rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31 + + .byte 0xf3,0xc3 + +.def __inner_loop_31; .scl 3; .type 32; .endef +.p2align 5 +__inner_loop_31: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edi + jnz .Loop_31 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 + + +.def __tail_loop_53; .scl 3; .type 32; .endef +.p2align 5 +__tail_loop_53: + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + +.Loop_53: + xorq %rax,%rax + testq $1,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq $1,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_53 + + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_ctx_inverse_mod_383 +.rva .LSEH_body_ctx_inverse_mod_383 +.rva .LSEH_info_ctx_inverse_mod_383_prologue + +.rva .LSEH_body_ctx_inverse_mod_383 +.rva .LSEH_epilogue_ctx_inverse_mod_383 +.rva .LSEH_info_ctx_inverse_mod_383_body + +.rva .LSEH_epilogue_ctx_inverse_mod_383 +.rva .LSEH_end_ctx_inverse_mod_383 +.rva .LSEH_info_ctx_inverse_mod_383_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_ctx_inverse_mod_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_ctx_inverse_mod_383_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x8b,0x00 +.byte 0x00,0xe4,0x8c,0x00 +.byte 0x00,0xd4,0x8d,0x00 +.byte 0x00,0xc4,0x8e,0x00 +.byte 0x00,0x34,0x8f,0x00 +.byte 0x00,0x54,0x90,0x00 +.byte 0x00,0x74,0x92,0x00 +.byte 0x00,0x64,0x93,0x00 +.byte 0x00,0x01,0x91,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_ctx_inverse_mod_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/div3w-armv8.S b/crypto/blst_src/build/coff/div3w-armv8.S new file mode 100644 index 00000000000..2e5d7045d6a --- /dev/null +++ b/crypto/blst_src/build/coff/div3w-armv8.S @@ -0,0 +1,94 @@ +.text + +.globl div_3_limbs +.def div_3_limbs; +.type 32; +.endef 
+.p2align 5 +div_3_limbs: + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +.Loop: + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csel x4,x4,x6,lo // select between R and R - D + extr x1,x2,x1,#1 // D >>= 1 + csel x5,x5,x7,lo + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,.Loop + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + speculative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret + +.globl quot_rem_128 +.def quot_rem_128; +.type 32; +.endef +.p2align 5 +quot_rem_128: + ldp x3,x4,[x1] + + mul x5,x3,x2 // divisor[0:1} * quotient + umulh x6,x3,x2 + mul x11, x4,x2 + umulh x7,x4,x2 + + ldp x8,x9,[x0] // load 3 limbs of the dividend + ldr x10,[x0,#16] + + adds x6,x6,x11 + adc x7,x7,xzr + + subs x8,x8,x5 // dividend - divisor * quotient + sbcs x9,x9,x6 + sbcs x10,x10,x7 + sbc x5,xzr,xzr // borrow -> mask + + add x2,x2,x5 // if borrowed, adjust the quotient ... + and x3,x3,x5 + and x4,x4,x5 + adds x8,x8,x3 // ... and add divisor + adc x9,x9,x4 + + stp x8,x9,[x0] // save 2 limbs of the remainder + str x2,[x0,#16] // and one limb of the quotient + + mov x0,x2 // return adjusted quotient + + ret + + +.globl quot_rem_64 +.def quot_rem_64; +.type 32; +.endef +.p2align 5 +quot_rem_64: + ldr x3,[x1] + ldr x8,[x0] // load 1 limb of the dividend + + mul x5,x3,x2 // divisor * quotient + + sub x8,x8,x5 // dividend - divisor * quotient + + stp x8,x2,[x0] // save remainder and quotient + + mov x0,x2 // return quotient + + ret + diff --git a/crypto/blst_src/build/coff/div3w-x86_64.s b/crypto/blst_src/build/coff/div3w-x86_64.s new file mode 100644 index 00000000000..033d1eb3055 --- /dev/null +++ b/crypto/blst_src/build/coff/div3w-x86_64.s @@ -0,0 +1,248 @@ +.text + +.globl div_3_limbs + +.def div_3_limbs; .scl 2; .type 32; .endef +.p2align 5 +div_3_limbs: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_div_3_limbs: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx +.LSEH_body_div_3_limbs: + + movq (%rdi),%r8 + movq 8(%rdi),%r9 + xorq %rax,%rax + movl $64,%ecx + +.Loop: + movq %r8,%r10 + subq %rsi,%r8 + movq %r9,%r11 + sbbq %rdx,%r9 + leaq 1(%rax,%rax,1),%rax + movq %rdx,%rdi + cmovcq %r10,%r8 + cmovcq %r11,%r9 + sbbq $0,%rax + shlq $63,%rdi + shrq $1,%rsi + shrq $1,%rdx + orq %rdi,%rsi + subl $1,%ecx + jnz .Loop + + leaq 1(%rax,%rax,1),%rcx + sarq $63,%rax + + subq %rsi,%r8 + sbbq %rdx,%r9 + sbbq $0,%rcx + + orq %rcx,%rax + +.LSEH_epilogue_div_3_limbs: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_div_3_limbs: +.globl quot_rem_128 + +.def quot_rem_128; .scl 2; .type 32; .endef +.p2align 5 +quot_rem_128: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_quot_rem_128: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx +.LSEH_body_quot_rem_128: + + movq %rdx,%rax + movq %rdx,%rcx + + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + adcq $0,%rdx + + movq 0(%rdi),%r10 + movq 8(%rdi),%r11 + movq 16(%rdi),%rax + + subq %r8,%r10 + sbbq %r9,%r11 + sbbq %rdx,%rax + sbbq %r8,%r8 + + addq %r8,%rcx + movq %r8,%r9 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + addq %r8,%r10 + adcq %r9,%r11 + + movq %r10,0(%rdi) + movq %r11,8(%rdi) + movq %rcx,16(%rdi) + + movq %rcx,%rax + 
+.LSEH_epilogue_quot_rem_128: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_quot_rem_128: + + + + + +.globl quot_rem_64 + +.def quot_rem_64; .scl 2; .type 32; .endef +.p2align 5 +quot_rem_64: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_quot_rem_64: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx +.LSEH_body_quot_rem_64: + + movq %rdx,%rax + imulq 0(%rsi),%rdx + + movq 0(%rdi),%r10 + + subq %rdx,%r10 + + movq %r10,0(%rdi) + movq %rax,8(%rdi) + +.LSEH_epilogue_quot_rem_64: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_quot_rem_64: +.section .pdata +.p2align 2 +.rva .LSEH_begin_div_3_limbs +.rva .LSEH_body_div_3_limbs +.rva .LSEH_info_div_3_limbs_prologue + +.rva .LSEH_body_div_3_limbs +.rva .LSEH_epilogue_div_3_limbs +.rva .LSEH_info_div_3_limbs_body + +.rva .LSEH_epilogue_div_3_limbs +.rva .LSEH_end_div_3_limbs +.rva .LSEH_info_div_3_limbs_epilogue + +.rva .LSEH_begin_quot_rem_128 +.rva .LSEH_body_quot_rem_128 +.rva .LSEH_info_quot_rem_128_prologue + +.rva .LSEH_body_quot_rem_128 +.rva .LSEH_epilogue_quot_rem_128 +.rva .LSEH_info_quot_rem_128_body + +.rva .LSEH_epilogue_quot_rem_128 +.rva .LSEH_end_quot_rem_128 +.rva .LSEH_info_quot_rem_128_epilogue + +.rva .LSEH_begin_quot_rem_64 +.rva .LSEH_body_quot_rem_64 +.rva .LSEH_info_quot_rem_64_prologue + +.rva .LSEH_body_quot_rem_64 +.rva .LSEH_epilogue_quot_rem_64 +.rva .LSEH_info_quot_rem_64_body + +.rva .LSEH_epilogue_quot_rem_64 +.rva .LSEH_end_quot_rem_64 +.rva .LSEH_info_quot_rem_64_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_div_3_limbs_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_div_3_limbs_body: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_div_3_limbs_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_quot_rem_128_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_quot_rem_128_body: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_quot_rem_128_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_quot_rem_64_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_quot_rem_64_body: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_quot_rem_64_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/mul_mont_256-armv8.S b/crypto/blst_src/build/coff/mul_mont_256-armv8.S new file mode 100644 index 00000000000..8cadbb89344 --- /dev/null +++ b/crypto/blst_src/build/coff/mul_mont_256-armv8.S @@ -0,0 +1,474 @@ +.text + +.globl mul_mont_sparse_256 + +.def mul_mont_sparse_256; +.type 32; +.endef +.p2align 5 +mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csel x19,x19,x14,lo + csel x20,x20,x15,lo + csel x21,x21,x16,lo + csel x22,x22,x17,lo + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret + +.globl sqr_mont_sparse_256 + +.def sqr_mont_sparse_256; +.type 32; +.endef +.p2align 5 +sqr_mont_sparse_256: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + +.globl from_mont_256 + +.def from_mont_256; +.type 32; +.endef +.p2align 5 +from_mont_256: +.long 3573752639 + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 +.long 3573752767 + ret + + +.globl redc_mont_256 + +.def redc_mont_256; +.type 32; +.endef +.p2align 5 +redc_mont_256: +.long 3573752639 + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 +.long 3573752767 + ret + + +.def __mul_by_1_mont_256; +.type 32; +.endef +.p2align 5 +__mul_by_1_mont_256: + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret + diff --git a/crypto/blst_src/build/coff/mul_mont_384-armv8.S b/crypto/blst_src/build/coff/mul_mont_384-armv8.S new file mode 100644 index 00000000000..074f38c495c --- /dev/null +++ b/crypto/blst_src/build/coff/mul_mont_384-armv8.S @@ -0,0 +1,2424 @@ +.text + +.globl add_mod_384x384 +.def add_mod_384x384; +.type 32; +.endef +.p2align 5 +add_mod_384x384: +.long 3573752639 + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 +.long 3573752767 + ret + + +.def __add_mod_384x384; +.type 32; +.endef +.p2align 5 +__add_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + stp x11,x12,[x0,#48] + csel x15,x15,x23,lo + stp x13,x14,[x0,#64] + csel x16,x16,x24,lo + stp x15,x16,[x0,#80] + + ret + + +.globl sub_mod_384x384 +.def sub_mod_384x384; +.type 32; +.endef +.p2align 5 +sub_mod_384x384: +.long 3573752639 + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 +.long 3573752767 + ret + + +.def __sub_mod_384x384; +.type 32; +.endef +.p2align 5 +__sub_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret + + +.def __add_mod_384; +.type 32; +.endef +.p2align 5 +__add_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + stp x11,x12,[x0] + csel x16,x16,x24,lo + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.def __sub_mod_384; +.type 32; +.endef +.p2align 5 +__sub_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp 
x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.globl mul_mont_384x + +.def mul_mont_384x; +.type 32; +.endef +.p2align 5 +mul_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sqr_mont_384x + +.def sqr_mont_384x; +.type 32; +.endef +.p2align 5 +sqr_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x19,x11,x19,lo + csel x20,x12,x20,lo + csel x21,x13,x21,lo + ldp x11,x12,[sp] + csel x22,x14,x22,lo + ldr x17, [sp,#48] + csel x23,x15,x23,lo + ldp x13,x14,[sp,#16] + csel x24,x16,x24,lo + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl mul_mont_384 + +.def mul_mont_384; +.type 32; +.endef +.p2align 5 +mul_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.def __mul_mont_384; +.type 32; +.endef +.p2align 5 +__mul_mont_384: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh 
x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs 
x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csel x11,x19,x26,lo + csel x12,x20,x27,lo + csel x13,x21,x28,lo + csel x14,x22,x0,lo + csel x15,x23,x1,lo + csel x16,x24,x3,lo + ret + + +.globl sqr_mont_384 + +.def sqr_mont_384; +.type 32; +.endef +.p2align 5 +sqr_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sqr_n_mul_mont_383 + +.def sqr_n_mul_mont_383; +.type 32; +.endef +.p2align 5 +sqr_n_mul_mont_383: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +.Loop_sqr_383: + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,.Loop_sqr_383 + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + +.def __sqr_384; +.type 32; +.endef +.p2align 5 +__sqr_384: + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret + +.globl sqr_384 + +.def sqr_384; +.type 32; +.endef +.p2align 5 +sqr_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl redc_mont_384 + +.def redc_mont_384; +.type 32; +.endef +.p2align 5 +redc_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl from_mont_384 + +.def from_mont_384; +.type 32; +.endef +.p2align 5 +from_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.def __mul_by_1_mont_384; +.type 32; +.endef +.p2align 5 +__mul_by_1_mont_384: + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + 
adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret + + +.def __redc_tail_mont_384; +.type 32; +.endef +.p2align 5 +__redc_tail_mont_384: + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.globl mul_384 + +.def mul_384; +.type 32; +.endef +.p2align 5 +mul_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.def __mul_384; +.type 32; +.endef +.p2align 5 +__mul_384: + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret + + +.globl mul_382x + +.def mul_382x; +.type 32; +.endef +.p2align 5 +mul_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sqr_382x + +.def sqr_382x; +.type 32; +.endef +.p2align 5 +sqr_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sqr_mont_382x + +.def sqr_mont_382x; +.type 32; +.endef +.p2align 5 +sqr_mont_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.def __mul_mont_383_nonred; +.type 32; +.endef +.p2align 5 +__mul_mont_383_nonred: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc 
x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs 
x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret + + +.globl sgn0_pty_mont_384 + +.def sgn0_pty_mont_384; +.type 32; +.endef +.p2align 5 +sgn0_pty_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl sgn0_pty_mont_384x + +.def sgn0_pty_mont_384x; +.type 32; +.endef +.p2align 5 +sgn0_pty_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + diff --git a/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s b/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s new file mode 100644 index 00000000000..2dd30bc5b5d --- /dev/null +++ b/crypto/blst_src/build/coff/mulq_mont_256-x86_64.s @@ -0,0 +1,897 @@ +.comm __blst_platform_cap,4 +.text + +.globl mul_mont_sparse_256 + +.def mul_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +mul_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_mont_sparse_256: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_sparse_256$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_mul_mont_sparse_256: + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mul_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_mont_sparse_256: + +.globl sqr_mont_sparse_256 + +.def sqr_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_sparse_256: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_sparse_256$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_sqr_mont_sparse_256: + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq %rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqr_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_sparse_256: +.def __mulq_mont_sparse_256; .scl 3; .type 32; .endef +.p2align 5 +__mulq_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq 
%rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq $0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq %r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 + + +.globl from_mont_256 + +.def from_mont_256; .scl 2; .type 32; .endef +.p2align 5 +from_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_from_mont_256: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz from_mont_256$1 +#endif + pushq 
%rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_from_mont_256: + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_from_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_from_mont_256: + +.globl redc_mont_256 + +.def redc_mont_256; .scl 2; .type 32; .endef +.p2align 5 +redc_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redc_mont_256: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz redc_mont_256$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redc_mont_256: + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redc_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redc_mont_256: +.def __mulq_by_1_mont_256; .scl 3; .type 32; .endef +.p2align 5 +__mulq_by_1_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r9 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 
+ movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_mul_mont_sparse_256 +.rva .LSEH_body_mul_mont_sparse_256 +.rva .LSEH_info_mul_mont_sparse_256_prologue + +.rva .LSEH_body_mul_mont_sparse_256 +.rva .LSEH_epilogue_mul_mont_sparse_256 +.rva .LSEH_info_mul_mont_sparse_256_body + +.rva .LSEH_epilogue_mul_mont_sparse_256 +.rva .LSEH_end_mul_mont_sparse_256 +.rva .LSEH_info_mul_mont_sparse_256_epilogue + +.rva .LSEH_begin_sqr_mont_sparse_256 +.rva .LSEH_body_sqr_mont_sparse_256 +.rva .LSEH_info_sqr_mont_sparse_256_prologue + +.rva .LSEH_body_sqr_mont_sparse_256 +.rva .LSEH_epilogue_sqr_mont_sparse_256 +.rva .LSEH_info_sqr_mont_sparse_256_body + +.rva .LSEH_epilogue_sqr_mont_sparse_256 +.rva .LSEH_end_sqr_mont_sparse_256 +.rva .LSEH_info_sqr_mont_sparse_256_epilogue + +.rva .LSEH_begin_from_mont_256 +.rva .LSEH_body_from_mont_256 +.rva .LSEH_info_from_mont_256_prologue + +.rva .LSEH_body_from_mont_256 +.rva .LSEH_epilogue_from_mont_256 +.rva .LSEH_info_from_mont_256_body + +.rva .LSEH_epilogue_from_mont_256 +.rva .LSEH_end_from_mont_256 +.rva .LSEH_info_from_mont_256_epilogue + +.rva .LSEH_begin_redc_mont_256 +.rva .LSEH_body_redc_mont_256 +.rva .LSEH_info_redc_mont_256_prologue + +.rva .LSEH_body_redc_mont_256 +.rva .LSEH_epilogue_redc_mont_256 +.rva .LSEH_info_redc_mont_256_body + +.rva .LSEH_epilogue_redc_mont_256 +.rva .LSEH_end_redc_mont_256 +.rva .LSEH_info_redc_mont_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mul_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_mul_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_mul_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sqr_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sqr_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_from_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_from_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 
0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_from_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redc_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_redc_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_redc_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s b/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s new file mode 100644 index 00000000000..ee646f5b137 --- /dev/null +++ b/crypto/blst_src/build/coff/mulq_mont_384-x86_64.s @@ -0,0 +1,4303 @@ +.comm __blst_platform_cap,4 +.text + + + + + + + +.def __subq_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__subq_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.def __addq_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__addq_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.def __subq_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__subq_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__subq_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 
8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.globl mul_mont_384x + +.def mul_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +mul_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_mont_384x: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_384x$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $328,%rsp + +.LSEH_body_mul_mont_384x: + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __addq_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __addq_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __subq_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __subq_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __subq_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mul_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_mont_384x: +.globl sqr_mont_384x + +.def sqr_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_384x: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_384x$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_mont_384x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __addq_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __subq_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 
8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_384x: + +.globl mul_382x + +.def mul_382x; .scl 2; .type 32; .endef +.p2align 5 +mul_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_382x: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_382x$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_mul_382x: + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __subq_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __subq_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __subq_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mul_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_382x: +.globl sqr_382x + +.def sqr_382x; .scl 2; .type 32; .endef +.p2align 5 +sqr_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_382x: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_382x$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_sqr_382x: + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + 
movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __subq_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqr_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_382x: +.globl mul_384 + +.def mul_384; .scl 2; .type 32; .endef +.p2align 5 +mul_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_384: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_384$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + +.LSEH_body_mul_384: + + + movq %rdx,%rbx + call __mulq_384 + + movq 0(%rsp),%r12 + + movq 8(%rsp),%rbx + + movq 16(%rsp),%rbp + + leaq 24(%rsp),%rsp + +.LSEH_epilogue_mul_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_384: + +.def __mulq_384; .scl 3; .type 32; .endef +.p2align 5 +__mulq_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp + mulq 0(%rsi) + movq %rax,0(%rdi) + movq %rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 
16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 + +.globl sqr_384 + +.def sqr_384; .scl 2; .type 32; .endef +.p2align 5 +sqr_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_384: + + + movq %rcx,%rdi + movq %rdx,%rsi +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_384$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sqr_384: + + + call __sqrq_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + 
movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqr_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_384: + +.def __sqrq_384; .scl 3; .type 32; .endef +.p2align 5 +__sqrq_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%rbx + + mulq %rax + xorq %r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq %r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 + + +.globl sqr_mont_384 + +.def sqr_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_384: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_384$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $120,%rsp + +.LSEH_body_sqr_mont_384: + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 
96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_384: + + + +.globl redc_mont_384 + +.def redc_mont_384; .scl 2; .type 32; .endef +.p2align 5 +redc_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redc_mont_384: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz redc_mont_384$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redc_mont_384: + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redc_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redc_mont_384: + + + + +.globl from_mont_384 + +.def from_mont_384; .scl 2; .type 32; .endef +.p2align 5 +from_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_from_mont_384: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz from_mont_384$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_from_mont_384: + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_from_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_from_mont_384: +.def __mulq_by_1_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulq_by_1_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 
0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 + + +.def __redq_tail_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__redq_tail_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 
72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl sgn0_pty_mont_384 + +.def sgn0_pty_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mont_384: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sgn0_pty_mont_384$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0_pty_mont_384: + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0_pty_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mont_384: + +.globl sgn0_pty_mont_384x + +.def sgn0_pty_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sgn0_pty_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0_pty_mont_384x: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sgn0_pty_mont_384x$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0_pty_mont_384x: + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 
32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0_pty_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0_pty_mont_384x: +.globl mul_mont_384 + +.def mul_mont_384; .scl 2; .type 32; .endef +.p2align 5 +mul_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mul_mont_384: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_384$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $24,%rsp + +.LSEH_body_mul_mont_384: + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + + call __mulq_mont_384 + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +.LSEH_epilogue_mul_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mul_mont_384: +.def __mulq_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulq_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq $0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq 
$0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 
32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + + movq 16(%rsp),%rdi + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 + +.globl sqr_n_mul_mont_384 + +.def sqr_n_mul_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqr_n_mul_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_n_mul_mont_384: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_n_mul_mont_384$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_n_mul_mont_384: + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 
0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz .Loop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_n_mul_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_n_mul_mont_384: + +.globl sqr_n_mul_mont_383 + +.def sqr_n_mul_mont_383; .scl 2; .type 32; .endef +.p2align 5 +sqr_n_mul_mont_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_n_mul_mont_383: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_n_mul_mont_383$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_n_mul_mont_383: + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz .Loop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_n_mul_mont_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_n_mul_mont_383: +.def __mulq_mont_383_nonred; .scl 3; .type 32; .endef +.p2align 5 +__mulq_mont_383_nonred: + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq %rdx,%r15 + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r9 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq 
%rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r11,%r10 + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + 
adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 + +.globl sqr_mont_382x + +.def sqr_mont_382x; .scl 2; .type 32; .endef +.p2align 5 +sqr_mont_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqr_mont_382x: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_382x$1 +#endif + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 
+ + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqr_mont_382x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi + + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + sbbq %rsi,%rsi + + andq %rsi,%r12 + andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqr_mont_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqr_mont_382x: +.section .pdata +.p2align 2 +.rva .LSEH_begin_mul_mont_384x +.rva .LSEH_body_mul_mont_384x +.rva .LSEH_info_mul_mont_384x_prologue + +.rva .LSEH_body_mul_mont_384x +.rva .LSEH_epilogue_mul_mont_384x +.rva .LSEH_info_mul_mont_384x_body + +.rva .LSEH_epilogue_mul_mont_384x +.rva .LSEH_end_mul_mont_384x +.rva .LSEH_info_mul_mont_384x_epilogue + +.rva .LSEH_begin_sqr_mont_384x +.rva .LSEH_body_sqr_mont_384x +.rva .LSEH_info_sqr_mont_384x_prologue + +.rva .LSEH_body_sqr_mont_384x +.rva .LSEH_epilogue_sqr_mont_384x +.rva .LSEH_info_sqr_mont_384x_body + +.rva .LSEH_epilogue_sqr_mont_384x +.rva .LSEH_end_sqr_mont_384x +.rva .LSEH_info_sqr_mont_384x_epilogue + +.rva .LSEH_begin_mul_382x +.rva .LSEH_body_mul_382x +.rva .LSEH_info_mul_382x_prologue + +.rva .LSEH_body_mul_382x +.rva .LSEH_epilogue_mul_382x +.rva .LSEH_info_mul_382x_body + +.rva .LSEH_epilogue_mul_382x +.rva 
.LSEH_end_mul_382x +.rva .LSEH_info_mul_382x_epilogue + +.rva .LSEH_begin_sqr_382x +.rva .LSEH_body_sqr_382x +.rva .LSEH_info_sqr_382x_prologue + +.rva .LSEH_body_sqr_382x +.rva .LSEH_epilogue_sqr_382x +.rva .LSEH_info_sqr_382x_body + +.rva .LSEH_epilogue_sqr_382x +.rva .LSEH_end_sqr_382x +.rva .LSEH_info_sqr_382x_epilogue + +.rva .LSEH_begin_mul_384 +.rva .LSEH_body_mul_384 +.rva .LSEH_info_mul_384_prologue + +.rva .LSEH_body_mul_384 +.rva .LSEH_epilogue_mul_384 +.rva .LSEH_info_mul_384_body + +.rva .LSEH_epilogue_mul_384 +.rva .LSEH_end_mul_384 +.rva .LSEH_info_mul_384_epilogue + +.rva .LSEH_begin_sqr_384 +.rva .LSEH_body_sqr_384 +.rva .LSEH_info_sqr_384_prologue + +.rva .LSEH_body_sqr_384 +.rva .LSEH_epilogue_sqr_384 +.rva .LSEH_info_sqr_384_body + +.rva .LSEH_epilogue_sqr_384 +.rva .LSEH_end_sqr_384 +.rva .LSEH_info_sqr_384_epilogue + +.rva .LSEH_begin_sqr_mont_384 +.rva .LSEH_body_sqr_mont_384 +.rva .LSEH_info_sqr_mont_384_prologue + +.rva .LSEH_body_sqr_mont_384 +.rva .LSEH_epilogue_sqr_mont_384 +.rva .LSEH_info_sqr_mont_384_body + +.rva .LSEH_epilogue_sqr_mont_384 +.rva .LSEH_end_sqr_mont_384 +.rva .LSEH_info_sqr_mont_384_epilogue + +.rva .LSEH_begin_redc_mont_384 +.rva .LSEH_body_redc_mont_384 +.rva .LSEH_info_redc_mont_384_prologue + +.rva .LSEH_body_redc_mont_384 +.rva .LSEH_epilogue_redc_mont_384 +.rva .LSEH_info_redc_mont_384_body + +.rva .LSEH_epilogue_redc_mont_384 +.rva .LSEH_end_redc_mont_384 +.rva .LSEH_info_redc_mont_384_epilogue + +.rva .LSEH_begin_from_mont_384 +.rva .LSEH_body_from_mont_384 +.rva .LSEH_info_from_mont_384_prologue + +.rva .LSEH_body_from_mont_384 +.rva .LSEH_epilogue_from_mont_384 +.rva .LSEH_info_from_mont_384_body + +.rva .LSEH_epilogue_from_mont_384 +.rva .LSEH_end_from_mont_384 +.rva .LSEH_info_from_mont_384_epilogue + +.rva .LSEH_begin_sgn0_pty_mont_384 +.rva .LSEH_body_sgn0_pty_mont_384 +.rva .LSEH_info_sgn0_pty_mont_384_prologue + +.rva .LSEH_body_sgn0_pty_mont_384 +.rva .LSEH_epilogue_sgn0_pty_mont_384 +.rva .LSEH_info_sgn0_pty_mont_384_body + +.rva .LSEH_epilogue_sgn0_pty_mont_384 +.rva .LSEH_end_sgn0_pty_mont_384 +.rva .LSEH_info_sgn0_pty_mont_384_epilogue + +.rva .LSEH_begin_sgn0_pty_mont_384x +.rva .LSEH_body_sgn0_pty_mont_384x +.rva .LSEH_info_sgn0_pty_mont_384x_prologue + +.rva .LSEH_body_sgn0_pty_mont_384x +.rva .LSEH_epilogue_sgn0_pty_mont_384x +.rva .LSEH_info_sgn0_pty_mont_384x_body + +.rva .LSEH_epilogue_sgn0_pty_mont_384x +.rva .LSEH_end_sgn0_pty_mont_384x +.rva .LSEH_info_sgn0_pty_mont_384x_epilogue + +.rva .LSEH_begin_mul_mont_384 +.rva .LSEH_body_mul_mont_384 +.rva .LSEH_info_mul_mont_384_prologue + +.rva .LSEH_body_mul_mont_384 +.rva .LSEH_epilogue_mul_mont_384 +.rva .LSEH_info_mul_mont_384_body + +.rva .LSEH_epilogue_mul_mont_384 +.rva .LSEH_end_mul_mont_384 +.rva .LSEH_info_mul_mont_384_epilogue + +.rva .LSEH_begin_sqr_n_mul_mont_384 +.rva .LSEH_body_sqr_n_mul_mont_384 +.rva .LSEH_info_sqr_n_mul_mont_384_prologue + +.rva .LSEH_body_sqr_n_mul_mont_384 +.rva .LSEH_epilogue_sqr_n_mul_mont_384 +.rva .LSEH_info_sqr_n_mul_mont_384_body + +.rva .LSEH_epilogue_sqr_n_mul_mont_384 +.rva .LSEH_end_sqr_n_mul_mont_384 +.rva .LSEH_info_sqr_n_mul_mont_384_epilogue + +.rva .LSEH_begin_sqr_n_mul_mont_383 +.rva .LSEH_body_sqr_n_mul_mont_383 +.rva .LSEH_info_sqr_n_mul_mont_383_prologue + +.rva .LSEH_body_sqr_n_mul_mont_383 +.rva .LSEH_epilogue_sqr_n_mul_mont_383 +.rva .LSEH_info_sqr_n_mul_mont_383_body + +.rva .LSEH_epilogue_sqr_n_mul_mont_383 +.rva .LSEH_end_sqr_n_mul_mont_383 +.rva .LSEH_info_sqr_n_mul_mont_383_epilogue + +.rva 
.LSEH_begin_sqr_mont_382x +.rva .LSEH_body_sqr_mont_382x +.rva .LSEH_info_sqr_mont_382x_prologue + +.rva .LSEH_body_sqr_mont_382x +.rva .LSEH_epilogue_sqr_mont_382x +.rva .LSEH_info_sqr_mont_382x_body + +.rva .LSEH_epilogue_sqr_mont_382x +.rva .LSEH_end_sqr_mont_382x +.rva .LSEH_info_sqr_mont_382x_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mul_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_mul_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x29,0x00 +.byte 0x00,0xe4,0x2a,0x00 +.byte 0x00,0xd4,0x2b,0x00 +.byte 0x00,0xc4,0x2c,0x00 +.byte 0x00,0x34,0x2d,0x00 +.byte 0x00,0x54,0x2e,0x00 +.byte 0x00,0x74,0x30,0x00 +.byte 0x00,0x64,0x31,0x00 +.byte 0x00,0x01,0x2f,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_mul_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sqr_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sqr_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_mul_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_mul_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sqr_382x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sqr_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_mul_384_body: +.byte 1,0,11,0 +.byte 0x00,0xc4,0x00,0x00 +.byte 0x00,0x34,0x01,0x00 +.byte 0x00,0x54,0x02,0x00 +.byte 0x00,0x74,0x04,0x00 +.byte 0x00,0x64,0x05,0x00 +.byte 0x00,0x22 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.LSEH_info_mul_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sqr_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 
0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sqr_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sqr_mont_384_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x0f,0x00 +.byte 0x00,0xe4,0x10,0x00 +.byte 0x00,0xd4,0x11,0x00 +.byte 0x00,0xc4,0x12,0x00 +.byte 0x00,0x34,0x13,0x00 +.byte 0x00,0x54,0x14,0x00 +.byte 0x00,0x74,0x16,0x00 +.byte 0x00,0x64,0x17,0x00 +.byte 0x00,0x01,0x15,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sqr_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redc_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_redc_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_redc_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_from_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_from_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_from_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sgn0_pty_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sgn0_pty_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0_pty_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sgn0_pty_mont_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sgn0_pty_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mul_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_mul_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 
0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_mul_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_n_mul_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sqr_n_mul_mont_384_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sqr_n_mul_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_n_mul_mont_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sqr_n_mul_mont_383_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sqr_n_mul_mont_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqr_mont_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sqr_mont_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sqr_mont_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s b/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s new file mode 100644 index 00000000000..cba65569c52 --- /dev/null +++ b/crypto/blst_src/build/coff/mulx_mont_256-x86_64.s @@ -0,0 +1,796 @@ +.text + +.globl mulx_mont_sparse_256 + +.def mulx_mont_sparse_256; .scl 2; .type 32; .endef +.p2align 5 +mulx_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_mont_sparse_256: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 +mul_mont_sparse_256$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_mulx_mont_sparse_256: + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_mulx_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_mont_sparse_256: + +.globl sqrx_mont_sparse_256 + +.def sqrx_mont_sparse_256; 
.scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_sparse_256: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +sqr_mont_sparse_256$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sqrx_mont_sparse_256: + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqrx_mont_sparse_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_sparse_256: +.def __mulx_mont_sparse_256; .scl 3; .type 32; .endef +.p2align 5 +__mulx_mont_sparse_256: + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq %rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq %r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 + imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq 
%rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq %r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 + +.globl fromx_mont_256 + +.def fromx_mont_256; .scl 2; .type 32; .endef +.p2align 5 +fromx_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_fromx_mont_256: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +from_mont_256$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_fromx_mont_256: + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_fromx_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_fromx_mont_256: + +.globl redcx_mont_256 + +.def redcx_mont_256; .scl 2; .type 32; .endef +.p2align 5 +redcx_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redcx_mont_256: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +redc_mont_256$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redcx_mont_256: + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redcx_mont_256: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redcx_mont_256: +.def __mulx_by_1_mont_256; .scl 3; .type 32; .endef +.p2align 5 +__mulx_by_1_mont_256: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + 
adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_mulx_mont_sparse_256 +.rva .LSEH_body_mulx_mont_sparse_256 +.rva .LSEH_info_mulx_mont_sparse_256_prologue + +.rva .LSEH_body_mulx_mont_sparse_256 +.rva .LSEH_epilogue_mulx_mont_sparse_256 +.rva .LSEH_info_mulx_mont_sparse_256_body + +.rva .LSEH_epilogue_mulx_mont_sparse_256 +.rva .LSEH_end_mulx_mont_sparse_256 +.rva .LSEH_info_mulx_mont_sparse_256_epilogue + +.rva .LSEH_begin_sqrx_mont_sparse_256 +.rva .LSEH_body_sqrx_mont_sparse_256 +.rva .LSEH_info_sqrx_mont_sparse_256_prologue + +.rva .LSEH_body_sqrx_mont_sparse_256 +.rva .LSEH_epilogue_sqrx_mont_sparse_256 +.rva .LSEH_info_sqrx_mont_sparse_256_body + +.rva .LSEH_epilogue_sqrx_mont_sparse_256 +.rva .LSEH_end_sqrx_mont_sparse_256 +.rva .LSEH_info_sqrx_mont_sparse_256_epilogue + +.rva .LSEH_begin_fromx_mont_256 +.rva .LSEH_body_fromx_mont_256 +.rva .LSEH_info_fromx_mont_256_prologue + +.rva .LSEH_body_fromx_mont_256 +.rva .LSEH_epilogue_fromx_mont_256 +.rva .LSEH_info_fromx_mont_256_body + +.rva .LSEH_epilogue_fromx_mont_256 +.rva .LSEH_end_fromx_mont_256 +.rva .LSEH_info_fromx_mont_256_epilogue + +.rva .LSEH_begin_redcx_mont_256 +.rva .LSEH_body_redcx_mont_256 +.rva .LSEH_info_redcx_mont_256_prologue + +.rva .LSEH_body_redcx_mont_256 +.rva .LSEH_epilogue_redcx_mont_256 +.rva .LSEH_info_redcx_mont_256_body + +.rva .LSEH_epilogue_redcx_mont_256 +.rva .LSEH_end_redcx_mont_256 +.rva .LSEH_info_redcx_mont_256_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mulx_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_mulx_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_mulx_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_sparse_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 
+.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sqrx_mont_sparse_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sqrx_mont_sparse_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_fromx_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_fromx_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_fromx_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redcx_mont_256_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_redcx_mont_256_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_redcx_mont_256_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s b/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s new file mode 100644 index 00000000000..ce1354f46b4 --- /dev/null +++ b/crypto/blst_src/build/coff/mulx_mont_384-x86_64.s @@ -0,0 +1,3608 @@ +.text + + + + + + + +.def __subx_mod_384x384; .scl 3; .type 32; .endef +.p2align 5 +__subx_mod_384x384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 + + +.def __addx_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__addx_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 
+ adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + + +.def __subx_mod_384; .scl 3; .type 32; .endef +.p2align 5 +__subx_mod_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__subx_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 + +.globl mulx_mont_384x + +.def mulx_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +mulx_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_mont_384x: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 +mul_mont_384x$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $328,%rsp + +.LSEH_body_mulx_mont_384x: + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __addx_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __addx_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __subx_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __subx_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __subx_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call __redx_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulx_by_1_mont_384 + call __redx_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mulx_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_mont_384x: +.globl sqrx_mont_384x + +.def sqrx_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_384x: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +sqr_mont_384x$1: + pushq %rbp 
+ + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqrx_mont_384x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __addx_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __subx_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq %rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) + cmovcq %r11,%r12 + movq %r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqrx_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_384x: + +.globl mulx_382x + +.def mulx_382x; .scl 2; .type 32; .endef +.p2align 5 +mulx_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_382x: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +mul_382x$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_mulx_382x: + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __subx_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq 
-96(%rdi),%rdx + call __subx_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __subx_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_mulx_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_382x: +.globl sqrx_382x + +.def sqrx_382x; .scl 2; .type 32; .endef +.p2align 5 +sqrx_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_382x: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx +sqr_382x$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rsi + +.LSEH_body_sqrx_382x: + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __subx_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqrx_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_382x: +.globl mulx_384 + +.def mulx_384; .scl 2; .type 32; .endef +.p2align 5 +mulx_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_384: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx +mul_384$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +.LSEH_body_mulx_384: + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +.LSEH_epilogue_mulx_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_384: + +.def __mulx_384; .scl 3; .type 32; .endef +.p2align 5 +__mulx_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq 
%rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 + +.globl sqrx_384 + +.def sqrx_384; .scl 2; .type 32; .endef +.p2align 5 +sqrx_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_384: + + + movq %rcx,%rdi + movq %rdx,%rsi +sqr_384$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rdi + +.LSEH_body_sqrx_384: + + + call __sqrx_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sqrx_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_384: +.def __sqrx_384; .scl 3; .type 32; .endef +.p2align 5 +__sqrx_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + 
addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx + addq %rdi,%rcx + movq 8(%rsp),%rdi + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq %r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq %r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq %r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 + movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 0xf3,0xc3 + + + + +.globl redcx_mont_384 + +.def redcx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +redcx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_redcx_mont_384: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +redc_mont_384$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_redcx_mont_384: + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redx_tail_mont_384 + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_redcx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_redcx_mont_384: + + + + +.globl fromx_mont_384 + +.def fromx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +fromx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_fromx_mont_384: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +from_mont_384$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_fromx_mont_384: + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq %r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq 
%r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_fromx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_fromx_mont_384: +.def __mulx_by_1_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulx_by_1_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq %rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 + + +.def __redx_tail_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__redx_tail_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq 
%r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 + + +.globl sgn0x_pty_mont_384 + +.def sgn0x_pty_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sgn0x_pty_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0x_pty_mont_384: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx +sgn0_pty_mont_384$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0x_pty_mont_384: + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 56(%rsp),%rsp + +.LSEH_epilogue_sgn0x_pty_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0x_pty_mont_384: + +.globl sgn0x_pty_mont_384x + +.def sgn0x_pty_mont_384x; .scl 2; .type 32; .endef +.p2align 5 +sgn0x_pty_mont_384x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sgn0x_pty_mont_384x: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx +sgn0_pty_mont_384x$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $8,%rsp + +.LSEH_body_sgn0x_pty_mont_384x: + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 + + movq 16(%rsp),%r14 + + movq 24(%rsp),%r13 + + movq 32(%rsp),%r12 + + movq 40(%rsp),%rbx + + movq 48(%rsp),%rbp + + leaq 
56(%rsp),%rsp + +.LSEH_epilogue_sgn0x_pty_mont_384x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sgn0x_pty_mont_384x: +.globl mulx_mont_384 + +.def mulx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +mulx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_mulx_mont_384: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 +mul_mont_384$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -24(%rsp),%rsp + +.LSEH_body_mulx_mont_384: + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +.LSEH_epilogue_mulx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_mulx_mont_384: +.def __mulx_mont_384; .scl 3; .type 32; .endef +.p2align 5 +__mulx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq 
%rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq 
%rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq %r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + + .byte 0xf3,0xc3 + + +.globl sqrx_mont_384 + +.def sqrx_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_384: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +sqr_mont_384$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -24(%rsp),%rsp + +.LSEH_body_sqrx_mont_384: + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 + + movq 32(%rsp),%r14 + + movq 40(%rsp),%r13 + + movq 48(%rsp),%r12 + + movq 56(%rsp),%rbx + + movq 64(%rsp),%rbp + + leaq 72(%rsp),%rsp + +.LSEH_epilogue_sqrx_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_384: + +.globl sqrx_n_mul_mont_384 + +.def sqrx_n_mul_mont_384; .scl 2; .type 32; .endef +.p2align 5 +sqrx_n_mul_mont_384: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_n_mul_mont_384: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 48(%rsp),%r9 +sqr_n_mul_mont_384$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -40(%rsp),%rsp + +.LSEH_body_sqrx_n_mul_mont_384: + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + +.Loop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 + + movq 48(%rsp),%r14 + + movq 56(%rsp),%r13 + + movq 64(%rsp),%r12 + + movq 72(%rsp),%rbx + + movq 80(%rsp),%rbp + + leaq 88(%rsp),%rsp + +.LSEH_epilogue_sqrx_n_mul_mont_384: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_n_mul_mont_384: + +.globl sqrx_n_mul_mont_383 + +.def sqrx_n_mul_mont_383; .scl 2; .type 32; .endef +.p2align 5 +sqrx_n_mul_mont_383: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_n_mul_mont_383: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx + movq 40(%rsp),%r8 + movq 
48(%rsp),%r9 +sqr_n_mul_mont_383$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + leaq -40(%rsp),%rsp + +.LSEH_body_sqrx_n_mul_mont_383: + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + +.Loop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 + + movq 48(%rsp),%r14 + + movq 56(%rsp),%r13 + + movq 64(%rsp),%r12 + + movq 72(%rsp),%rbx + + movq 80(%rsp),%rbp + + leaq 88(%rsp),%rsp + +.LSEH_epilogue_sqrx_n_mul_mont_383: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_n_mul_mont_383: +.def __mulx_mont_383_nonred; .scl 3; .type 32; .endef +.p2align 5 +__mulx_mont_383_nonred: + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp + adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r10,%r9 + imulq 8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 
8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq $0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 
0xf3,0xc3 + + +.globl sqrx_mont_382x + +.def sqrx_mont_382x; .scl 2; .type 32; .endef +.p2align 5 +sqrx_mont_382x: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_sqrx_mont_382x: + + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + movq %r9,%rcx +sqr_mont_382x$1: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $136,%rsp + +.LSEH_body_sqrx_mont_382x: + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 + + movq 8(%r8),%r14 + + movq 16(%r8),%r13 + + movq 24(%r8),%r12 + + movq 32(%r8),%rbx + + movq 40(%r8),%rbp + + leaq 48(%r8),%rsp + +.LSEH_epilogue_sqrx_mont_382x: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_sqrx_mont_382x: +.section .pdata +.p2align 2 +.rva .LSEH_begin_mulx_mont_384x +.rva .LSEH_body_mulx_mont_384x +.rva .LSEH_info_mulx_mont_384x_prologue + +.rva .LSEH_body_mulx_mont_384x +.rva .LSEH_epilogue_mulx_mont_384x +.rva .LSEH_info_mulx_mont_384x_body + +.rva .LSEH_epilogue_mulx_mont_384x +.rva .LSEH_end_mulx_mont_384x 
+.rva .LSEH_info_mulx_mont_384x_epilogue + +.rva .LSEH_begin_sqrx_mont_384x +.rva .LSEH_body_sqrx_mont_384x +.rva .LSEH_info_sqrx_mont_384x_prologue + +.rva .LSEH_body_sqrx_mont_384x +.rva .LSEH_epilogue_sqrx_mont_384x +.rva .LSEH_info_sqrx_mont_384x_body + +.rva .LSEH_epilogue_sqrx_mont_384x +.rva .LSEH_end_sqrx_mont_384x +.rva .LSEH_info_sqrx_mont_384x_epilogue + +.rva .LSEH_begin_mulx_382x +.rva .LSEH_body_mulx_382x +.rva .LSEH_info_mulx_382x_prologue + +.rva .LSEH_body_mulx_382x +.rva .LSEH_epilogue_mulx_382x +.rva .LSEH_info_mulx_382x_body + +.rva .LSEH_epilogue_mulx_382x +.rva .LSEH_end_mulx_382x +.rva .LSEH_info_mulx_382x_epilogue + +.rva .LSEH_begin_sqrx_382x +.rva .LSEH_body_sqrx_382x +.rva .LSEH_info_sqrx_382x_prologue + +.rva .LSEH_body_sqrx_382x +.rva .LSEH_epilogue_sqrx_382x +.rva .LSEH_info_sqrx_382x_body + +.rva .LSEH_epilogue_sqrx_382x +.rva .LSEH_end_sqrx_382x +.rva .LSEH_info_sqrx_382x_epilogue + +.rva .LSEH_begin_mulx_384 +.rva .LSEH_body_mulx_384 +.rva .LSEH_info_mulx_384_prologue + +.rva .LSEH_body_mulx_384 +.rva .LSEH_epilogue_mulx_384 +.rva .LSEH_info_mulx_384_body + +.rva .LSEH_epilogue_mulx_384 +.rva .LSEH_end_mulx_384 +.rva .LSEH_info_mulx_384_epilogue + +.rva .LSEH_begin_sqrx_384 +.rva .LSEH_body_sqrx_384 +.rva .LSEH_info_sqrx_384_prologue + +.rva .LSEH_body_sqrx_384 +.rva .LSEH_epilogue_sqrx_384 +.rva .LSEH_info_sqrx_384_body + +.rva .LSEH_epilogue_sqrx_384 +.rva .LSEH_end_sqrx_384 +.rva .LSEH_info_sqrx_384_epilogue + +.rva .LSEH_begin_redcx_mont_384 +.rva .LSEH_body_redcx_mont_384 +.rva .LSEH_info_redcx_mont_384_prologue + +.rva .LSEH_body_redcx_mont_384 +.rva .LSEH_epilogue_redcx_mont_384 +.rva .LSEH_info_redcx_mont_384_body + +.rva .LSEH_epilogue_redcx_mont_384 +.rva .LSEH_end_redcx_mont_384 +.rva .LSEH_info_redcx_mont_384_epilogue + +.rva .LSEH_begin_fromx_mont_384 +.rva .LSEH_body_fromx_mont_384 +.rva .LSEH_info_fromx_mont_384_prologue + +.rva .LSEH_body_fromx_mont_384 +.rva .LSEH_epilogue_fromx_mont_384 +.rva .LSEH_info_fromx_mont_384_body + +.rva .LSEH_epilogue_fromx_mont_384 +.rva .LSEH_end_fromx_mont_384 +.rva .LSEH_info_fromx_mont_384_epilogue + +.rva .LSEH_begin_sgn0x_pty_mont_384 +.rva .LSEH_body_sgn0x_pty_mont_384 +.rva .LSEH_info_sgn0x_pty_mont_384_prologue + +.rva .LSEH_body_sgn0x_pty_mont_384 +.rva .LSEH_epilogue_sgn0x_pty_mont_384 +.rva .LSEH_info_sgn0x_pty_mont_384_body + +.rva .LSEH_epilogue_sgn0x_pty_mont_384 +.rva .LSEH_end_sgn0x_pty_mont_384 +.rva .LSEH_info_sgn0x_pty_mont_384_epilogue + +.rva .LSEH_begin_sgn0x_pty_mont_384x +.rva .LSEH_body_sgn0x_pty_mont_384x +.rva .LSEH_info_sgn0x_pty_mont_384x_prologue + +.rva .LSEH_body_sgn0x_pty_mont_384x +.rva .LSEH_epilogue_sgn0x_pty_mont_384x +.rva .LSEH_info_sgn0x_pty_mont_384x_body + +.rva .LSEH_epilogue_sgn0x_pty_mont_384x +.rva .LSEH_end_sgn0x_pty_mont_384x +.rva .LSEH_info_sgn0x_pty_mont_384x_epilogue + +.rva .LSEH_begin_mulx_mont_384 +.rva .LSEH_body_mulx_mont_384 +.rva .LSEH_info_mulx_mont_384_prologue + +.rva .LSEH_body_mulx_mont_384 +.rva .LSEH_epilogue_mulx_mont_384 +.rva .LSEH_info_mulx_mont_384_body + +.rva .LSEH_epilogue_mulx_mont_384 +.rva .LSEH_end_mulx_mont_384 +.rva .LSEH_info_mulx_mont_384_epilogue + +.rva .LSEH_begin_sqrx_mont_384 +.rva .LSEH_body_sqrx_mont_384 +.rva .LSEH_info_sqrx_mont_384_prologue + +.rva .LSEH_body_sqrx_mont_384 +.rva .LSEH_epilogue_sqrx_mont_384 +.rva .LSEH_info_sqrx_mont_384_body + +.rva .LSEH_epilogue_sqrx_mont_384 +.rva .LSEH_end_sqrx_mont_384 +.rva .LSEH_info_sqrx_mont_384_epilogue + +.rva .LSEH_begin_sqrx_n_mul_mont_384 +.rva 
.LSEH_body_sqrx_n_mul_mont_384 +.rva .LSEH_info_sqrx_n_mul_mont_384_prologue + +.rva .LSEH_body_sqrx_n_mul_mont_384 +.rva .LSEH_epilogue_sqrx_n_mul_mont_384 +.rva .LSEH_info_sqrx_n_mul_mont_384_body + +.rva .LSEH_epilogue_sqrx_n_mul_mont_384 +.rva .LSEH_end_sqrx_n_mul_mont_384 +.rva .LSEH_info_sqrx_n_mul_mont_384_epilogue + +.rva .LSEH_begin_sqrx_n_mul_mont_383 +.rva .LSEH_body_sqrx_n_mul_mont_383 +.rva .LSEH_info_sqrx_n_mul_mont_383_prologue + +.rva .LSEH_body_sqrx_n_mul_mont_383 +.rva .LSEH_epilogue_sqrx_n_mul_mont_383 +.rva .LSEH_info_sqrx_n_mul_mont_383_body + +.rva .LSEH_epilogue_sqrx_n_mul_mont_383 +.rva .LSEH_end_sqrx_n_mul_mont_383 +.rva .LSEH_info_sqrx_n_mul_mont_383_epilogue + +.rva .LSEH_begin_sqrx_mont_382x +.rva .LSEH_body_sqrx_mont_382x +.rva .LSEH_info_sqrx_mont_382x_prologue + +.rva .LSEH_body_sqrx_mont_382x +.rva .LSEH_epilogue_sqrx_mont_382x +.rva .LSEH_info_sqrx_mont_382x_body + +.rva .LSEH_epilogue_sqrx_mont_382x +.rva .LSEH_end_sqrx_mont_382x +.rva .LSEH_info_sqrx_mont_382x_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_mulx_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_mulx_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x29,0x00 +.byte 0x00,0xe4,0x2a,0x00 +.byte 0x00,0xd4,0x2b,0x00 +.byte 0x00,0xc4,0x2c,0x00 +.byte 0x00,0x34,0x2d,0x00 +.byte 0x00,0x54,0x2e,0x00 +.byte 0x00,0x74,0x30,0x00 +.byte 0x00,0x64,0x31,0x00 +.byte 0x00,0x01,0x2f,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_mulx_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sqrx_mont_384x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sqrx_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mulx_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_mulx_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_mulx_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sqrx_382x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sqrx_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_mulx_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 
+.byte 0,0 +.long 0,0 +.LSEH_info_mulx_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x00,0x00 +.byte 0x00,0xe4,0x01,0x00 +.byte 0x00,0xd4,0x02,0x00 +.byte 0x00,0xc4,0x03,0x00 +.byte 0x00,0x34,0x04,0x00 +.byte 0x00,0x54,0x05,0x00 +.byte 0x00,0x74,0x07,0x00 +.byte 0x00,0x64,0x08,0x00 +.byte 0x00,0x52 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_mulx_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sqrx_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sqrx_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_redcx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_redcx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_redcx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_fromx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_fromx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_fromx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0x_pty_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sgn0x_pty_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sgn0x_pty_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sgn0x_pty_mont_384x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sgn0x_pty_mont_384x_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x01,0x00 +.byte 0x00,0xe4,0x02,0x00 +.byte 0x00,0xd4,0x03,0x00 +.byte 0x00,0xc4,0x04,0x00 +.byte 0x00,0x34,0x05,0x00 +.byte 0x00,0x54,0x06,0x00 +.byte 0x00,0x74,0x08,0x00 +.byte 0x00,0x64,0x09,0x00 +.byte 0x00,0x62 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sgn0x_pty_mont_384x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + 
+.LSEH_info_mulx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_mulx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_mulx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sqrx_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x03,0x00 +.byte 0x00,0xe4,0x04,0x00 +.byte 0x00,0xd4,0x05,0x00 +.byte 0x00,0xc4,0x06,0x00 +.byte 0x00,0x34,0x07,0x00 +.byte 0x00,0x54,0x08,0x00 +.byte 0x00,0x74,0x0a,0x00 +.byte 0x00,0x64,0x0b,0x00 +.byte 0x00,0x82 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sqrx_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_n_mul_mont_384_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sqrx_n_mul_mont_384_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x05,0x00 +.byte 0x00,0xe4,0x06,0x00 +.byte 0x00,0xd4,0x07,0x00 +.byte 0x00,0xc4,0x08,0x00 +.byte 0x00,0x34,0x09,0x00 +.byte 0x00,0x54,0x0a,0x00 +.byte 0x00,0x74,0x0c,0x00 +.byte 0x00,0x64,0x0d,0x00 +.byte 0x00,0xa2 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sqrx_n_mul_mont_384_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_n_mul_mont_383_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sqrx_n_mul_mont_383_body: +.byte 1,0,17,0 +.byte 0x00,0xf4,0x05,0x00 +.byte 0x00,0xe4,0x06,0x00 +.byte 0x00,0xd4,0x07,0x00 +.byte 0x00,0xc4,0x08,0x00 +.byte 0x00,0x34,0x09,0x00 +.byte 0x00,0x54,0x0a,0x00 +.byte 0x00,0x74,0x0c,0x00 +.byte 0x00,0x64,0x0d,0x00 +.byte 0x00,0xa2 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sqrx_n_mul_mont_383_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_sqrx_mont_382x_prologue: +.byte 1,0,5,0x0b +.byte 0,0x74,1,0 +.byte 0,0x64,2,0 +.byte 0,0xb3 +.byte 0,0 +.long 0,0 +.LSEH_info_sqrx_mont_382x_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x11,0x00 +.byte 0x00,0xe4,0x12,0x00 +.byte 0x00,0xd4,0x13,0x00 +.byte 0x00,0xc4,0x14,0x00 +.byte 0x00,0x34,0x15,0x00 +.byte 0x00,0x54,0x16,0x00 +.byte 0x00,0x74,0x18,0x00 +.byte 0x00,0x64,0x19,0x00 +.byte 0x00,0x01,0x17,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_sqrx_mont_382x_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/sha256-armv8.S b/crypto/blst_src/build/coff/sha256-armv8.S new file mode 100644 index 00000000000..a4cd8090896 --- /dev/null +++ b/crypto/blst_src/build/coff/sha256-armv8.S @@ -0,0 +1,1093 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. +// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with rationale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details. + +.comm __blst_platform_cap,4 +.text + +.p2align 6 + +.LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.align 2 +.p2align 2 +.globl blst_sha256_block_armv8 +.def blst_sha256_block_armv8; +.type 32; +.endef +.p2align 6 +blst_sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,.LK256 + +.Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr 
v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +.globl blst_sha256_block_data_order +.def blst_sha256_block_data_order; +.type 32; +.endef +.p2align 4 +blst_sha256_block_data_order: + adrp x16,__blst_platform_cap + ldr w16,[x16,#:lo12:__blst_platform_cap] + tst w16,#1 + b.ne .Lv8_entry + + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr x16,.LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b .L_00_48 + +.p2align 4 +.L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli 
v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + 
add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne .L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add 
w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor 
w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret + +.globl blst_sha256_emit + +.def blst_sha256_emit; +.type 32; +.endef +.p2align 4 +blst_sha256_emit: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] + str w6,[x0,#16] + str w7,[x0,#24] + ret + + +.globl blst_sha256_bcopy + +.def blst_sha256_bcopy; +.type 32; +.endef +.p2align 4 +blst_sha256_bcopy: +.Loop_bcopy: + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,.Loop_bcopy + ret + + +.globl blst_sha256_hcopy + +.def blst_sha256_hcopy; +.type 32; +.endef +.p2align 4 +blst_sha256_hcopy: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret + diff --git a/crypto/blst_src/build/coff/sha256-portable-x86_64.s b/crypto/blst_src/build/coff/sha256-portable-x86_64.s new file mode 100644 index 00000000000..603e46c53d7 --- /dev/null +++ b/crypto/blst_src/build/coff/sha256-portable-x86_64.s @@ -0,0 +1,1792 @@ +.comm __blst_platform_cap,4 +.text + +.globl blst_sha256_block_data_order +.def blst_sha256_block_data_order; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_block_data_order: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 
+.LSEH_begin_blst_sha256_block_data_order: + + + pushq %rbp + + movq %rsp,%rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx +#ifdef __BLST_PORTABLE__ + testl $2,__blst_platform_cap(%rip) + jnz .Lblst_sha256_block_data_order$2 +#endif + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $64+24,%rsp + + +.LSEH_body_blst_sha256_block_data_order: + + leaq (%rsi,%rdx,4),%rdx + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop + +.p2align 4 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 0(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 4(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 8(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 12(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 16(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + 
xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 20(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 24(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 28(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 32(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 36(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 40(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl 
%ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 44(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 48(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 52(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 56(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 60(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + jmp .Lrounds_16_xx +.p2align 4 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 64(%rbp),%r12d + 
xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 68(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 72(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 76(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 80(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl 
%r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 84(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 88(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 92(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 96(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 40(%rsp),%r13d + movl 
28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 100(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 104(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 108(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 112(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi 
+ + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 116(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 120(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 124(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 64(%rbp),%rbp + cmpb $0x19,3(%rbp) + jnz .Lrounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + + leaq 64+24+48(%rsp),%r11 + + movq 64+24(%rsp),%r15 + movq -40(%r11),%r14 + movq -32(%r11),%r13 + movq -24(%r11),%r12 + movq -16(%r11),%rbx + movq -8(%r11),%rbp +.LSEH_epilogue_blst_sha256_block_data_order: + mov 8(%r11),%rdi + mov 16(%r11),%rsi + + leaq (%r11),%rsp + .byte 0xf3,0xc3 + +.LSEH_end_blst_sha256_block_data_order: + +#ifndef __BLST_PORTABLE__ +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 
0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_emit + +.def blst_sha256_emit; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_emit: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + bswapq %r8 + movq 24(%rdx),%r11 + bswapq %r9 + movl %r8d,4(%rcx) + bswapq %r10 + movl %r9d,12(%rcx) + bswapq %r11 + movl %r10d,20(%rcx) + shrq $32,%r8 + movl %r11d,28(%rcx) + shrq $32,%r9 + movl %r8d,0(%rcx) + shrq $32,%r10 + movl %r9d,8(%rcx) + shrq $32,%r11 + movl %r10d,16(%rcx) + movl %r11d,24(%rcx) + .byte 0xf3,0xc3 + + +.globl blst_sha256_bcopy + +.def blst_sha256_bcopy; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_bcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rdx,%rcx +.Loop_bcopy: + movzbl (%rdx),%eax + leaq 1(%rdx),%rdx + movb %al,-1(%rcx,%rdx,1) + decq %r8 + jnz .Loop_bcopy + .byte 0xf3,0xc3 + + +.globl blst_sha256_hcopy + +.def blst_sha256_hcopy; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_hcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq %r8,0(%rcx) + movq %r9,8(%rcx) + movq %r10,16(%rcx) + movq %r11,24(%rcx) + .byte 0xf3,0xc3 + +#endif +.section .pdata +.p2align 2 +.rva .LSEH_begin_blst_sha256_block_data_order +.rva .LSEH_body_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_prologue + +.rva .LSEH_body_blst_sha256_block_data_order +.rva .LSEH_epilogue_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_body + +.rva .LSEH_epilogue_blst_sha256_block_data_order +.rva .LSEH_end_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_blst_sha256_block_data_order_prologue: +.byte 1,4,6,0x05 +.byte 4,0x74,2,0 +.byte 4,0x64,3,0 +.byte 4,0x53 +.byte 1,0x50 +.long 0,0 +.LSEH_info_blst_sha256_block_data_order_body: +.byte 1,0,18,0 +.byte 0x00,0xf4,0x0b,0x00 +.byte 0x00,0xe4,0x0c,0x00 +.byte 0x00,0xd4,0x0d,0x00 +.byte 0x00,0xc4,0x0e,0x00 +.byte 0x00,0x34,0x0f,0x00 +.byte 0x00,0x54,0x10,0x00 +.byte 0x00,0x74,0x12,0x00 +.byte 0x00,0x64,0x13,0x00 +.byte 0x00,0x01,0x11,0x00 +.byte 0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_blst_sha256_block_data_order_epilogue: +.byte 1,0,5,11 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0xb3 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/coff/sha256-x86_64.s b/crypto/blst_src/build/coff/sha256-x86_64.s new file mode 100644 index 00000000000..d65df5d0d4d --- /dev/null +++ b/crypto/blst_src/build/coff/sha256-x86_64.s @@ -0,0 +1,1562 @@ +.comm __blst_platform_cap,4 +.text + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 
0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_block_data_order_shaext + +.def blst_sha256_block_data_order_shaext; .scl 2; .type 32; .endef +.p2align 6 +blst_sha256_block_data_order_shaext: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_blst_sha256_block_data_order_shaext: + + + pushq %rbp + + movq %rsp,%rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx +.Lblst_sha256_block_data_order$2: + subq $0x50,%rsp + + movaps %xmm6,-80(%rbp) + movaps %xmm7,-64(%rbp) + movaps %xmm8,-48(%rbp) + movaps %xmm9,-32(%rbp) + movaps %xmm10,-16(%rbp) + +.LSEH_body_blst_sha256_block_data_order_shaext: + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.p2align 4 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 
15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + movaps -80(%rbp),%xmm6 + movaps -64(%rbp),%xmm7 + movaps -48(%rbp),%xmm8 + movaps -32(%rbp),%xmm9 + movaps -16(%rbp),%xmm10 + movq %rbp,%rsp + + popq %rbp + +.LSEH_epilogue_blst_sha256_block_data_order_shaext: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_blst_sha256_block_data_order_shaext: +.globl blst_sha256_block_data_order + +.def blst_sha256_block_data_order; .scl 2; .type 32; .endef +.p2align 6 +blst_sha256_block_data_order: + .byte 0xf3,0x0f,0x1e,0xfa + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + movq %rsp,%r11 +.LSEH_begin_blst_sha256_block_data_order: + + + pushq %rbp + + movq %rsp,%rbp + + movq %rcx,%rdi + movq %rdx,%rsi + movq %r8,%rdx + testl $2,__blst_platform_cap(%rip) + jnz .Lblst_sha256_block_data_order$2 + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $88,%rsp + + leaq (%rsi,%rdx,4),%rdx + movq %rdi,-64(%rbp) + + movq %rdx,-48(%rbp) + movaps %xmm6,-128(%rbp) + movaps %xmm7,-112(%rbp) + movaps %xmm8,-96(%rbp) + movaps %xmm9,-80(%rbp) + +.LSEH_body_blst_sha256_block_data_order: + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 
20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.p2align 4 +.Lloop_ssse3: + movdqa K256+256(%rip),%xmm7 + movq %rsi,-56(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 + movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.p2align 4 +.Lssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + 
xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + 
xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d 
+ movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d 
+ addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl 
$6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq -64(%rbp),%rdi + movl %r14d,%eax + movq -56(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq -48(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + xorps %xmm0,%xmm0 + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movaps -128(%rbp),%xmm6 + movaps -112(%rbp),%xmm7 + movaps -96(%rbp),%xmm8 + movaps -80(%rbp),%xmm9 + movq -40(%rbp),%r15 + movq -32(%rbp),%r14 + movq -24(%rbp),%r13 + movq -16(%rbp),%r12 + movq -8(%rbp),%rbx + movq %rbp,%rsp + + popq %rbp + +.LSEH_epilogue_blst_sha256_block_data_order: + mov 8(%rsp),%rdi + mov 16(%rsp),%rsi + + .byte 0xf3,0xc3 + +.LSEH_end_blst_sha256_block_data_order: +.globl blst_sha256_emit + +.def blst_sha256_emit; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_emit: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + bswapq %r8 + movq 24(%rdx),%r11 + bswapq %r9 + movl %r8d,4(%rcx) + bswapq %r10 + movl %r9d,12(%rcx) + bswapq %r11 + movl %r10d,20(%rcx) + shrq $32,%r8 + movl %r11d,28(%rcx) + shrq $32,%r9 + movl %r8d,0(%rcx) + shrq $32,%r10 + movl %r9d,8(%rcx) + shrq $32,%r11 + movl %r10d,16(%rcx) + movl %r11d,24(%rcx) + .byte 0xf3,0xc3 + + +.globl blst_sha256_bcopy + +.def blst_sha256_bcopy; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_bcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rdx,%rcx +.Loop_bcopy: + movzbl (%rdx),%eax + leaq 1(%rdx),%rdx + movb %al,-1(%rcx,%rdx,1) + decq %r8 + jnz .Loop_bcopy + .byte 0xf3,0xc3 + + +.globl blst_sha256_hcopy + +.def blst_sha256_hcopy; .scl 2; .type 32; .endef +.p2align 4 +blst_sha256_hcopy: + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq %r8,0(%rcx) + movq %r9,8(%rcx) + movq %r10,16(%rcx) + movq %r11,24(%rcx) + .byte 0xf3,0xc3 + +.section .pdata +.p2align 2 +.rva .LSEH_begin_blst_sha256_block_data_order_shaext +.rva .LSEH_body_blst_sha256_block_data_order_shaext +.rva .LSEH_info_blst_sha256_block_data_order_shaext_prologue + +.rva .LSEH_body_blst_sha256_block_data_order_shaext +.rva .LSEH_epilogue_blst_sha256_block_data_order_shaext +.rva 
.LSEH_info_blst_sha256_block_data_order_shaext_body + +.rva .LSEH_epilogue_blst_sha256_block_data_order_shaext +.rva .LSEH_end_blst_sha256_block_data_order_shaext +.rva .LSEH_info_blst_sha256_block_data_order_shaext_epilogue + +.rva .LSEH_begin_blst_sha256_block_data_order +.rva .LSEH_body_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_prologue + +.rva .LSEH_body_blst_sha256_block_data_order +.rva .LSEH_epilogue_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_body + +.rva .LSEH_epilogue_blst_sha256_block_data_order +.rva .LSEH_end_blst_sha256_block_data_order +.rva .LSEH_info_blst_sha256_block_data_order_epilogue + +.section .xdata +.p2align 3 +.LSEH_info_blst_sha256_block_data_order_shaext_prologue: +.byte 1,4,6,0x05 +.byte 4,0x74,2,0 +.byte 4,0x64,3,0 +.byte 4,0x53 +.byte 1,0x50 +.long 0,0 +.LSEH_info_blst_sha256_block_data_order_shaext_body: +.byte 1,0,17,85 +.byte 0x00,0x68,0x00,0x00 +.byte 0x00,0x78,0x01,0x00 +.byte 0x00,0x88,0x02,0x00 +.byte 0x00,0x98,0x03,0x00 +.byte 0x00,0xa8,0x04,0x00 +.byte 0x00,0x74,0x0c,0x00 +.byte 0x00,0x64,0x0d,0x00 +.byte 0x00,0x53 +.byte 0x00,0x92 +.byte 0x00,0x50 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_blst_sha256_block_data_order_shaext_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + +.LSEH_info_blst_sha256_block_data_order_prologue: +.byte 1,4,6,0x05 +.byte 4,0x74,2,0 +.byte 4,0x64,3,0 +.byte 4,0x53 +.byte 1,0x50 +.long 0,0 +.LSEH_info_blst_sha256_block_data_order_body: +.byte 1,0,25,133 +.byte 0x00,0x68,0x00,0x00 +.byte 0x00,0x78,0x01,0x00 +.byte 0x00,0x88,0x02,0x00 +.byte 0x00,0x98,0x03,0x00 +.byte 0x00,0xf4,0x0b,0x00 +.byte 0x00,0xe4,0x0c,0x00 +.byte 0x00,0xd4,0x0d,0x00 +.byte 0x00,0xc4,0x0e,0x00 +.byte 0x00,0x34,0x0f,0x00 +.byte 0x00,0x74,0x12,0x00 +.byte 0x00,0x64,0x13,0x00 +.byte 0x00,0x53 +.byte 0x00,0xf2 +.byte 0x00,0x50 +.byte 0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0x00,0x00,0x00,0x00 +.LSEH_info_blst_sha256_block_data_order_epilogue: +.byte 1,0,4,0 +.byte 0x00,0x74,0x01,0x00 +.byte 0x00,0x64,0x02,0x00 +.byte 0x00,0x00,0x00,0x00 + diff --git a/crypto/blst_src/build/elf/add_mod_256-armv8.S b/crypto/blst_src/build/elf/add_mod_256-armv8.S new file mode 100644 index 00000000000..57476aaa1da --- /dev/null +++ b/crypto/blst_src/build/elf/add_mod_256-armv8.S @@ -0,0 +1,379 @@ +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,%function +.align 5 +add_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret +.size add_mod_256,.-add_mod_256 + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,%function +.align 5 +mul_by_3_mod_256: + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs 
x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,%function +.align 5 +lshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_lshift_mod_256: + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs x15,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x12,lo + csel x9,x9,x13,lo + csel x10,x10,x14,lo + csel x11,x11,x15,lo + + cbnz x2,.Loop_lshift_mod_256 + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret +.size lshift_mod_256,.-lshift_mod_256 + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,%function +.align 5 +rshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +.Loop_rshift: + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + csel x12,x12,x8,ne + csel x13,x13,x9,ne + csel x14,x14,x10,ne + csel x15,x15,x11,ne + csel x3,x3,xzr,ne + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,.Loop_rshift + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret +.size rshift_mod_256,.-rshift_mod_256 + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,%function +.align 5 +cneg_mod_256: + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x8,x8,x12,eq + csel x9,x9,x13,eq + csel x10,x10,x14,eq + stp x8,x9,[x0] + csel x11,x11,x15,eq + stp x10,x11,[x0,#16] + + ret +.size cneg_mod_256,.-cneg_mod_256 + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,%function +.align 5 +sub_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret +.size sub_mod_256,.-sub_mod_256 + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,%function +.align 5 +check_mod_256: + ldp x8,x9,[x0] + ldp x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,x1 + + ret +.size check_mod_256,.-check_mod_256 + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,%function +.align 5 +add_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs 
x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret +.size add_n_check_mod_256,.-add_n_check_mod_256 + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,%function +.align 5 +sub_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret +.size sub_n_check_mod_256,.-sub_n_check_mod_256 diff --git a/crypto/blst_src/build/elf/add_mod_256-x86_64.s b/crypto/blst_src/build/elf/add_mod_256-x86_64.s new file mode 100644 index 00000000000..2f41781959c --- /dev/null +++ b/crypto/blst_src/build/elf/add_mod_256-x86_64.s @@ -0,0 +1,572 @@ +.text + +.globl add_mod_256 +.hidden add_mod_256 +.type add_mod_256,@function +.align 32 +add_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_256,.-add_mod_256 + + +.globl mul_by_3_mod_256 +.hidden mul_by_3_mod_256 +.type mul_by_3_mod_256,@function +.align 32 +mul_by_3_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 +.cfi_restore %r12 + jmp .Loaded_a_add_mod_256 + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_256,.-mul_by_3_mod_256 + +.type __lshift_mod_256,@function +.align 32 +__lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq 
%r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __lshift_mod_256,.-__lshift_mod_256 + + +.globl lshift_mod_256 +.hidden lshift_mod_256 +.type lshift_mod_256,@function +.align 32 +lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz .Loop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size lshift_mod_256,.-lshift_mod_256 + + +.globl rshift_mod_256 +.hidden rshift_mod_256 +.type rshift_mod_256,@function +.align 32 +rshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +.Loop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 + movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz .Loop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size rshift_mod_256,.-rshift_mod_256 + + +.globl cneg_mod_256 +.hidden cneg_mod_256 +.type cneg_mod_256,@function +.align 32 +cneg_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq %r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size 
cneg_mod_256,.-cneg_mod_256 + + +.globl sub_mod_256 +.hidden sub_mod_256 +.type sub_mod_256,@function +.align 32 +sub_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_256,.-sub_mod_256 + + +.globl check_mod_256 +.hidden check_mod_256 +.type check_mod_256,@function +.align 32 +check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + movq 0(%rdi),%rax + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq %rsi,%rsi + + movq $1,%rdx + cmpq $0,%rax + cmovneq %rdx,%rax + andq %rsi,%rax + + .byte 0xf3,0xc3 +.cfi_endproc +.size check_mod_256,.-check_mod_256 + + +.globl add_n_check_mod_256 +.hidden add_n_check_mod_256 +.type add_n_check_mod_256,@function +.align 32 +add_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_n_check_mod_256,.-add_n_check_mod_256 + + +.globl sub_n_check_mod_256 +.hidden sub_n_check_mod_256 +.type sub_n_check_mod_256,@function +.align 32 +sub_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + 
orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_n_check_mod_256,.-sub_n_check_mod_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/add_mod_384-armv8.S b/crypto/blst_src/build/elf/add_mod_384-armv8.S new file mode 100644 index 00000000000..5c18d7fe892 --- /dev/null +++ b/crypto/blst_src/build/elf/add_mod_384-armv8.S @@ -0,0 +1,1000 @@ +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,%function +.align 5 +add_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +__add_mod_384_ab_are_loaded: + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,%function +.align 5 +add_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size add_mod_384x,.-add_mod_384x + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,%function +.align 5 +rshift_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_rshift_mod_384: + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,.Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,%function +.align 5 +__rshift_mod_384: + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,%function +.align 5 +div_by_2_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size div_by_2_mod_384,.-div_by_2_mod_384 + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,%function +.align 5 +lshift_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +.Loop_lshift_mod_384: + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,.Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,%function +.align 5 +__lshift_mod_384: + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret +.size __lshift_mod_384,.-__lshift_mod_384 + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,%function +.align 5 +mul_by_3_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,%function +.align 5 +mul_by_8_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,%function +.align 5 +mul_by_3_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,%function +.align 5 +mul_by_8_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + +.globl cneg_mod_384 +.hidden cneg_mod_384 +.type cneg_mod_384,%function +.align 5 +cneg_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x10,x10,x16,eq + csel x11,x11,x17,eq + csel x12,x12,x19,eq + csel x13,x13,x20,eq + stp x10,x11,[x0] + csel x14,x14,x21,eq + stp x12,x13,[x0,#16] + csel x15,x15,x22,eq + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size cneg_mod_384,.-cneg_mod_384 + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,%function +.align 5 +sub_mod_384: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,%function +.align 5 +sub_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sub_mod_384x,.-sub_mod_384x + +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,%function +.align 5 +mul_by_1_plus_i_mod_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x + +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,%function +.align 5 +sgn0_pty_mod_384: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,%function +.align 5 +sgn0_pty_mod_384x: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +.globl vec_select_32 +.hidden vec_select_32 +.type vec_select_32,%function +.align 5 +vec_select_32: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret +.size vec_select_32,.-vec_select_32 +.globl vec_select_48 +.hidden vec_select_48 +.type vec_select_48,%function +.align 5 +vec_select_48: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret +.size vec_select_48,.-vec_select_48 +.globl vec_select_96 +.hidden vec_select_96 +.type vec_select_96,%function +.align 5 +vec_select_96: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_96,.-vec_select_96 +.globl vec_select_192 +.hidden vec_select_192 +.type vec_select_192,%function +.align 5 +vec_select_192: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_192,.-vec_select_192 +.globl vec_select_144 +.hidden vec_select_144 +.type vec_select_144,%function +.align 5 +vec_select_144: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret +.size vec_select_144,.-vec_select_144 +.globl vec_select_288 +.hidden vec_select_288 +.type vec_select_288,%function +.align 5 +vec_select_288: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, 
[x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret +.size vec_select_288,.-vec_select_288 +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,%function +.align 5 +vec_prefetch: + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + prfm pldl1keep, [x0] + ret +.size vec_prefetch,.-vec_prefetch +.globl vec_is_zero_16x +.hidden vec_is_zero_16x +.type vec_is_zero_16x,%function +.align 5 +vec_is_zero_16x: + ld1 {v0.2d}, [x0], #16 + lsr x1, x1, #4 + sub x1, x1, #1 + cbz x1, .Loop_is_zero_done + +.Loop_is_zero: + ld1 {v1.2d}, [x0], #16 + orr v0.16b, v0.16b, v1.16b + sub x1, x1, #1 + cbnz x1, .Loop_is_zero + +.Loop_is_zero_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret +.size vec_is_zero_16x,.-vec_is_zero_16x +.globl vec_is_equal_16x +.hidden vec_is_equal_16x +.type vec_is_equal_16x,%function +.align 5 +vec_is_equal_16x: + ld1 {v0.2d}, [x0], #16 + ld1 {v1.2d}, [x1], #16 + lsr x2, x2, #4 + eor v0.16b, v0.16b, v1.16b + +.Loop_is_equal: + sub x2, x2, #1 + cbz x2, .Loop_is_equal_done + ld1 {v1.2d}, [x0], #16 + ld1 {v2.2d}, [x1], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b .Loop_is_equal + nop + +.Loop_is_equal_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret +.size vec_is_equal_16x,.-vec_is_equal_16x diff --git a/crypto/blst_src/build/elf/add_mod_384-x86_64.s b/crypto/blst_src/build/elf/add_mod_384-x86_64.s new file mode 100644 index 00000000000..39eee6d1752 --- /dev/null +++ b/crypto/blst_src/build/elf/add_mod_384-x86_64.s @@ -0,0 +1,1907 @@ +.text + +.globl add_mod_384 +.hidden add_mod_384 +.type add_mod_384,@function +.align 32 +add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384,.-add_mod_384 + +.type __add_mod_384,@function +.align 32 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384,.-__add_mod_384 + +.globl add_mod_384x +.hidden add_mod_384x +.type add_mod_384x,@function +.align 32 +add_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384x,.-add_mod_384x + + +.globl rshift_mod_384 +.hidden rshift_mod_384 +.type rshift_mod_384,@function +.align 32 +rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_rshift_mod_384: + 
call __rshift_mod_384 + decl %edx + jnz .Loop_rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size rshift_mod_384,.-rshift_mod_384 + +.type __rshift_mod_384,@function +.align 32 +__rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rsi + movq 0(%rcx),%r14 + andq %r8,%rsi + movq 8(%rcx),%r15 + negq %rsi + movq 16(%rcx),%rax + andq %rsi,%r14 + movq 24(%rcx),%rbx + andq %rsi,%r15 + movq 32(%rcx),%rbp + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rsi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rsi,%r12 + shrq $1,%rsi + shlq $63,%r8 + shlq $63,%r9 + orq %r14,%r8 + shlq $63,%r10 + orq %r15,%r9 + shlq $63,%r11 + orq %rax,%r10 + shlq $63,%r12 + orq %rbx,%r11 + shlq $63,%r13 + orq %rbp,%r12 + orq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __rshift_mod_384,.-__rshift_mod_384 + +.globl div_by_2_mod_384 +.hidden div_by_2_mod_384 +.type div_by_2_mod_384,@function +.align 32 +div_by_2_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq %rdx,%rcx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + call __rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size div_by_2_mod_384,.-div_by_2_mod_384 + + +.globl lshift_mod_384 +.hidden lshift_mod_384 +.type lshift_mod_384,@function +.align 32 +lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +.Loop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq 
%r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz .Loop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size lshift_mod_384,.-lshift_mod_384 + +.type __lshift_mod_384,@function +.align 32 +__lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __lshift_mod_384,.-__lshift_mod_384 + + +.globl mul_by_3_mod_384 +.hidden mul_by_3_mod_384 +.type mul_by_3_mod_384,@function +.align 32 +mul_by_3_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_384,.-mul_by_3_mod_384 + +.globl mul_by_8_mod_384 +.hidden mul_by_8_mod_384 +.type mul_by_8_mod_384,@function +.align 32 +mul_by_8_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq 
%r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_8_mod_384,.-mul_by_8_mod_384 + + +.globl mul_by_3_mod_384x +.hidden mul_by_3_mod_384x +.type mul_by_3_mod_384x,@function +.align 32 +mul_by_3_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_3_mod_384x,.-mul_by_3_mod_384x + +.globl mul_by_8_mod_384x +.hidden mul_by_8_mod_384x +.type mul_by_8_mod_384x,@function +.align 32 +mul_by_8_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_8_mod_384x,.-mul_by_8_mod_384x + + +.globl cneg_mod_384 +.hidden 
cneg_mod_384 +.type cneg_mod_384,@function +.align 32 +cneg_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdx +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq %rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq %r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq %r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size cneg_mod_384,.-cneg_mod_384 + + +.globl sub_mod_384 +.hidden sub_mod_384 +.type sub_mod_384,@function +.align 32 +sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384,.-sub_mod_384 + +.type __sub_mod_384,@function +.align 32 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size 
__sub_mod_384,.-__sub_mod_384 + +.globl sub_mod_384x +.hidden sub_mod_384x +.type sub_mod_384x,@function +.align 32 +sub_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384x,.-sub_mod_384x +.globl mul_by_1_plus_i_mod_384x +.hidden mul_by_1_plus_i_mod_384x +.type mul_by_1_plus_i_mod_384x,@function +.align 32 +mul_by_1_plus_i_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $56,%rsp +.cfi_adjust_cfa_offset 56 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq %r10,%rax + movq 16(%rsp),%r10 + adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 
+.cfi_restore %r15 + movq 56+8(%rsp),%r14 +.cfi_restore %r14 + movq 56+16(%rsp),%r13 +.cfi_restore %r13 + movq 56+24(%rsp),%r12 +.cfi_restore %r12 + movq 56+32(%rsp),%rbx +.cfi_restore %rbx + movq 56+40(%rsp),%rbp +.cfi_restore %rbp + leaq 56+48(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_by_1_plus_i_mod_384x,.-mul_by_1_plus_i_mod_384x +.globl sgn0_pty_mod_384 +.hidden sgn0_pty_mod_384 +.type sgn0_pty_mod_384,@function +.align 32 +sgn0_pty_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mod_384,.-sgn0_pty_mod_384 + +.globl sgn0_pty_mod_384x +.hidden sgn0_pty_mod_384x +.type sgn0_pty_mod_384x,@function +.align 32 +sgn0_pty_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + leaq 0(%rdi),%rax + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq %r8,0(%rsp) + notq %rdi + andq $1,%rbp + andq $2,%rdi + orq %rbp,%rdi + + movq 0(%rax),%r8 + movq 8(%rax),%r9 + movq 16(%rax),%r10 + movq 24(%rax),%r11 + movq 32(%rax),%rcx + movq 40(%rax),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq 0(%rsp),%rbx + + notq %rax + + testq %r8,%r8 + cmovzq %rdi,%rbp + + testq %rbx,%rbx + cmovnzq %rdi,%rax + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mod_384x,.-sgn0_pty_mod_384x +.globl vec_select_32 +.hidden vec_select_32 +.type vec_select_32,@function +.align 32 +vec_select_32: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 16(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 16(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 16(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-16(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-16(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-16(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,16-16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size 
vec_select_32,.-vec_select_32 +.globl vec_select_48 +.hidden vec_select_48 +.type vec_select_48,@function +.align 32 +vec_select_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 24(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 24(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 24(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-24(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-24(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-24(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-24(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-24(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-24(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,32-24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_48,.-vec_select_48 +.globl vec_select_96 +.hidden vec_select_96 +.type vec_select_96,@function +.align 32 +vec_select_96: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 48(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 48(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 48(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-48(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-48(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-48(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-48(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-48(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,80-48(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_96,.-vec_select_96 +.globl vec_select_192 +.hidden vec_select_192 +.type vec_select_192,@function +.align 32 +vec_select_192: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 96(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 96(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 96(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-96(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-96(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-96(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-96(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-96(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-96(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-96(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu 
%xmm2,112-96(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-96(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-96(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-96(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,176-96(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_192,.-vec_select_192 +.globl vec_select_144 +.hidden vec_select_144 +.type vec_select_144,@function +.align 32 +vec_select_144: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 72(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 72(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 72(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-72(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-72(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-72(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-72(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-72(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-72(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-72(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-72(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,128-72(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_144,.-vec_select_144 +.globl vec_select_288 +.hidden vec_select_288 +.type vec_select_288,@function +.align 32 +vec_select_288: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 144(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 144(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 144(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-144(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-144(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-144(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-144(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-144(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-144(%rdi) + pand %xmm4,%xmm0 + movdqu 
96+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-144(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-144(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-144(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-144(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-144(%rdi) + pand %xmm4,%xmm2 + movdqu 176+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 176+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,176-144(%rdi) + pand %xmm4,%xmm0 + movdqu 192+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 192+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,192-144(%rdi) + pand %xmm4,%xmm2 + movdqu 208+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 208+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,208-144(%rdi) + pand %xmm4,%xmm0 + movdqu 224+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 224+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,224-144(%rdi) + pand %xmm4,%xmm2 + movdqu 240+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 240+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,240-144(%rdi) + pand %xmm4,%xmm0 + movdqu 256+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 256+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,256-144(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,272-144(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_select_288,.-vec_select_288 +.globl vec_prefetch +.hidden vec_prefetch +.type vec_prefetch,@function +.align 32 +vec_prefetch: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + leaq -1(%rdi,%rsi,1),%rsi + movq $64,%rax + xorq %r8,%r8 + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + prefetchnta (%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_prefetch,.-vec_prefetch +.globl vec_is_zero_16x +.hidden vec_is_zero_16x +.type vec_is_zero_16x,@function +.align 32 +vec_is_zero_16x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%esi + movdqu (%rdi),%xmm0 + leaq 16(%rdi),%rdi + +.Loop_is_zero: + decl %esi + jz .Loop_is_zero_done + movdqu (%rdi),%xmm1 + leaq 16(%rdi),%rdi + por %xmm1,%xmm0 + jmp .Loop_is_zero + +.Loop_is_zero_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %esi + testq %rax,%rax + cmovnzl %esi,%eax + xorl $1,%eax + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_is_zero_16x,.-vec_is_zero_16x +.globl vec_is_equal_16x +.hidden vec_is_equal_16x +.type vec_is_equal_16x,@function +.align 32 +vec_is_equal_16x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%edx + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm1 + subq %rdi,%rsi + leaq 16(%rdi),%rdi + pxor %xmm1,%xmm0 + +.Loop_is_equal: + decl %edx + jz 
.Loop_is_equal_done + movdqu (%rdi),%xmm1 + movdqu (%rdi,%rsi,1),%xmm2 + leaq 16(%rdi),%rdi + pxor %xmm2,%xmm1 + por %xmm1,%xmm0 + jmp .Loop_is_equal + +.Loop_is_equal_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %edx + testq %rax,%rax + cmovnzl %edx,%eax + xorl $1,%eax + .byte 0xf3,0xc3 +.cfi_endproc +.size vec_is_equal_16x,.-vec_is_equal_16x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/add_mod_384x384-x86_64.s b/crypto/blst_src/build/elf/add_mod_384x384-x86_64.s new file mode 100644 index 00000000000..084f3d8262d --- /dev/null +++ b/crypto/blst_src/build/elf/add_mod_384x384-x86_64.s @@ -0,0 +1,252 @@ +.text + +.type __add_mod_384x384,@function +.align 32 +__add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq %r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq %rax,64(%rdi) + cmovcq %r13,%rsi + movq %rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __add_mod_384x384,.-__add_mod_384x384 + +.type __sub_mod_384x384,@function +.align 32 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.globl add_mod_384x384 +.hidden add_mod_384x384 +.type add_mod_384x384,@function +.align 32 +add_mod_384x384: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size add_mod_384x384,.-add_mod_384x384 + +.globl sub_mod_384x384 +.hidden sub_mod_384x384 +.type sub_mod_384x384,@function +.align 32 +sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sub_mod_384x384,.-sub_mod_384x384 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S new file mode 100644 index 00000000000..0c5ac5b882d --- /dev/null +++ b/crypto/blst_src/build/elf/ct_inverse_mod_256-armv8.S @@ -0,0 +1,785 @@ +.text + +.globl ct_inverse_mod_256 +.hidden ct_inverse_mod_256 +.type ct_inverse_mod_256, %function +.align 5 +ct_inverse_mod_256: + .inst 0xd503233f + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl .Lab_approximation_31_256_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extension + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extension + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst 
|a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc 
x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] 
last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 + .inst 0xd50323bf + ret +.size ct_inverse_mod_256,.-ct_inverse_mod_256 + +//////////////////////////////////////////////////////////////////////// +.type __smul_256x63, %function +.align 5 +__smul_256x63: + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + csel x22, x22, xzr, ne + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in __smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + csel x23, x23, xzr, ne + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret +.size __smul_256x63,.-__smul_256x63 + +.type __smul_512x63_tail, %function +.align 5 +__smul_512x63_tail: + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, 
[x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume |v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret +.size __smul_512x63_tail,.-__smul_512x63_tail + +.type __smul_256_n_shift_by_31, %function +.align 5 +__smul_256_n_shift_by_31: + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret +.size __smul_256_n_shift_by_31,.-__smul_256_n_shift_by_31 +.type __ab_approximation_31_256, %function +.align 4 +__ab_approximation_31_256: + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +.Lab_approximation_31_256_loaded: + orr x19, x7, x11 // check top-most limbs, ... + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x5, ne + orr x19, x7, x11 // and ones before top-most, ... + csel x10, x10, x9, ne + + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x4, ne + orr x19, x7, x11 // and one more, ... 
+ csel x10, x10, x8, ne + + clz x19, x19 + cmp x19, #64 + csel x19, x19, xzr, ne + csel x7, x7, x6, ne + csel x11, x11, x10, ne + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret +.size __ab_approximation_31_256,.-__ab_approximation_31_256 + +.type __inner_loop_31_256, %function +.align 4 +__inner_loop_31_256: + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +.Loop_31_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x15, x15, x13, hs // exchange |fg0| and |fg1| + csel x13, x13, x19, hs + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, .Loop_31_256 + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256, %function +.align 4 +__inner_loop_62_256: + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +.Loop_62_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov x20, x13 + csel x12, x12, x14, hs // exchange |f0| and |f1| + csel x14, x14, x19, hs + csel x13, x13, x15, hs // exchange |g0| and |g1| + csel x15, x15, x20, hs + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, .Loop_62_256 + + ret +.size __inner_loop_62_256,.-__inner_loop_62_256 diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s new file mode 100644 index 00000000000..0f0ca4923d7 --- /dev/null +++ b/crypto/blst_src/build/elf/ct_inverse_mod_256-x86_64.s @@ -0,0 +1,1186 @@ +.text + +.globl ct_inverse_mod_256 +.hidden ct_inverse_mod_256 +.type ct_inverse_mod_256,@function +.align 32 +ct_inverse_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1072,%rsp +.cfi_adjust_cfa_offset 1072 + + + leaq 48+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + movq 0(%rdx),%r12 + movq 8(%rdx),%r13 + movq 16(%rdx),%r14 + movq 24(%rdx),%r15 + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + + movq %r12,32(%rax) + movq %r13,40(%rax) + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rax,%rsi + + + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,64(%rdi) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,72(%rdi) + + + xorq $256,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + + movq 64(%rsi),%r8 + movq 104(%rsi),%r12 + movq %r8,%r9 + imulq 0(%rsp),%r8 + movq %r12,%r13 + imulq 8(%rsp),%r12 + addq %r12,%r8 + movq %r8,32(%rdi) + sarq $63,%r8 + movq %r8,40(%rdi) + movq %r8,48(%rdi) + movq %r8,56(%rdi) + movq %r8,64(%rdi) + leaq 64(%rsi),%rsi + + imulq %rdx,%r9 + imulq %rcx,%r13 + addq %r13,%r9 + movq %r9,72(%rdi) + sarq $63,%r9 + movq %r9,80(%rdi) + movq %r9,88(%rdi) + movq %r9,96(%rdi) + movq %r9,104(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi 
+ call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + sarq $63,%rbp + movq %rbp,40(%rdi) + movq %rbp,48(%rdi) + movq %rbp,56(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq 
%r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + + xorq $256+64,%rsi + movl $47,%edx + + movq 0(%rsi),%r8 + + movq 32(%rsi),%r10 + + call __inner_loop_62_256 + + + + + + + + leaq 64(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_512x63 + adcq %rbp,%rdx + + movq 40(%rsp),%rsi + movq %rdx,%rax + sarq $63,%rdx + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + adcq $0,%rax + + movq %rax,%rdx + negq %rax + orq %rax,%rdx + sarq $63,%rax + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + xorq %rax,%r8 + xorq %rcx,%rcx + xorq %rax,%r9 + subq %rax,%rcx + xorq %rax,%r10 + xorq %rax,%rdx + addq %rcx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq 
$0,%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 1072(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1072-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_inverse_mod_256,.-ct_inverse_mod_256 +.type __smulq_512x63,@function +.align 32 +__smulq_512x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %r9,8(%rdi) + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %r10,16(%rdi) + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %r11,24(%rdi) + + movq 40(%rsi),%r8 + movq 48(%rsi),%r9 + movq 56(%rsi),%r10 + movq 64(%rsi),%r11 + movq 72(%rsi),%r12 + movq 80(%rsi),%r13 + movq 88(%rsi),%r14 + movq 96(%rsi),%r15 + + movq %rcx,%rdx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rcx + addq %rax,%rcx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rcx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rcx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rcx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rcx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rcx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rcx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rcx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + imulq %rcx + addq %rax,%r15 + adcq $0,%rdx + + movq %rbp,%rbx + sarq $63,%rbp + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %rbp,%r14 + adcq %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_512x63,.-__smulq_512x63 + +.type __smulq_256x63,@function +.align 32 +__smulq_256x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq 
%rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %rcx,%rdx + movq 40+0(%rsi),%r12 + movq 40+8(%rsi),%r13 + movq 40+16(%rsi),%r14 + movq 40+24(%rsi),%r15 + movq 40+32(%rsi),%rcx + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rcx + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rcx + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + andq %rbx,%rcx + negq %rcx + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rbp,32(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_256x63,.-__smulq_256x63 +.type __smulq_256_n_shift_by_31,@function +.align 32 +__smulq_256_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,0(%rdi) + movq %rcx,8(%rdi) + movq %rdx,%rbp + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + + movq %rbp,%rbx + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rbx + addq %rax,%rbx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + andq %rbx,%rbp + negq %rbp + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r14 + movq 32+24(%rsi),%r15 + + movq %rcx,%rbx + sarq $63,%rcx + xorq %rax,%rax + subq %rcx,%rax + + xorq %rcx,%rbx + addq %rax,%rbx + + xorq %rcx,%r12 + xorq %rcx,%r13 + xorq %rcx,%r14 + xorq %rcx,%r15 + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + andq %rbx,%rcx + negq %rcx + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq 0(%rdi),%rdx + movq 8(%rdi),%rcx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%rbp,%r11 + + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + xorq %rbp,%rdx + xorq %rbp,%rcx + addq %rax,%rdx + addq %rax,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_256_n_shift_by_31,.-__smulq_256_n_shift_by_31 +.type __ab_approximation_31_256,@function +.align 32 +__ab_approximation_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 24(%rsi),%r9 + movq 56(%rsi),%r11 + movq 16(%rsi),%rbx + movq 48(%rsi),%rbp + movq 8(%rsi),%r8 + movq 40(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 
32(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + notq %rax + andq %rax,%r9 + andq %rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31_256 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_31_256,.-__ab_approximation_31_256 +.type __inner_loop_31_256,@function +.align 32 +__inner_loop_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31_256: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edx + jnz .Loop_31_256 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_31_256,.-__inner_loop_31_256 + +.type __inner_loop_62_256,@function +.align 32 +__inner_loop_62_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl %edx,%r15d + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq %rdx,%r13 + movq %rdx,%r14 + +.Loop_62_256: + xorq %rax,%rax + testq %r14,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq %r14,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%r15d + jnz .Loop_62_256 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62_256,.-__inner_loop_62_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S new file mode 100644 index 00000000000..99bb9def767 --- /dev/null +++ b/crypto/blst_src/build/elf/ct_inverse_mod_384-armv8.S @@ -0,0 +1,718 @@ +.text + +.globl ct_inverse_mod_383 +.hidden ct_inverse_mod_383 +.type ct_inverse_mod_383, %function +.align 5 +ct_inverse_mod_383: + .inst 0xd503233f + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl .Lab_approximation_62_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extension + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extension + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // 
corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src 
|a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + .inst 0xd50323bf + ret +.size ct_inverse_mod_383,.-ct_inverse_mod_383 + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
+.type __smul_383x63, %function +.align 5 +__smul_383x63: + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret +.size __smul_383x63,.-__smul_383x63 + +.type __smul_767x63_tail, %function +.align 5 +__smul_767x63_tail: + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret +.size __smul_767x63_tail,.-__smul_767x63_tail + +.type __smul_383_n_shift_by_62, %function +.align 5 +__smul_383_n_shift_by_62: + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + 
eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + adcs x6, x6, xzr + umulh x23, x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret +.size __smul_383_n_shift_by_62,.-__smul_383_n_shift_by_62 +.type __ab_approximation_62, %function +.align 4 +__ab_approximation_62: + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +.Lab_approximation_62_loaded: + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x22, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x22, x8, x14 // ... and ones before that ... 
+ csel x13, x13, x11, ne + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x22, x8, x14 + csel x13, x13, x10, ne + + clz x22, x22 + cmp x22, #64 + csel x22, x22, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret +.size __ab_approximation_62,.-__ab_approximation_62 +.type __inner_loop_62, %function +.align 4 +__inner_loop_62: + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +.Loop_62: + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + csel x9, x9, x3, hs // |b_| = |a_| + csel x14, x14, x8, hs + csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x8, x27, x25, hs + csel x15, x15, x17, hs // exchange |f0| and |f1| + csel x17, x17, x22, hs + csel x16, x16, x19, hs // exchange |g0| and |g1| + csel x19, x19, x23, hs + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, .Loop_62 + + ret +.size __inner_loop_62,.-__inner_loop_62 diff --git a/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S new file mode 100644 index 00000000000..07dd99a8af3 --- /dev/null +++ b/crypto/blst_src/build/elf/ct_is_square_mod_384-armv8.S @@ -0,0 +1,325 @@ +.text + +.globl ct_is_square_mod_384 +.hidden ct_is_square_mod_384 +.type ct_is_square_mod_384, %function +.align 5 +ct_is_square_mod_384: + .inst 0xd503233f + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the .Legendre symbol + mov x15, #24 // 24 is 768/30-1 + b .Loop_is_square + +.align 4 +.Loop_is_square: + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, .Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // and loaded + //ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + .inst 0xd50323bf + ret +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smul_384_n_shift_by_30, %function +.align 5 +__smul_384_n_shift_by_30: + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr x7, x8, x7, 
#30 + extr x8, x9, x8, #30 + + eor x3, x3, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret +.size __smul_384_n_shift_by_30,.-__smul_384_n_shift_by_30 +.type __ab_approximation_30, %function +.align 4 +__ab_approximation_30: + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x21, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x21, x8, x14 // ... and ones before that ... + csel x13, x13, x11, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x21, x8, x14 // and one more, ... + csel x13, x13, x10, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x3, ne + orr x21, x8, x14 + csel x13, x13, x9, ne + + clz x21, x21 + cmp x21, #64 + csel x21, x21, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret +.size __ab_approximation_30,.-__ab_approximation_30 + +.type __inner_loop_30, %function +.align 4 +__inner_loop_30: + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +.Loop_30: + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + csel x14, x14, x8, hs // |b_| = |a_| + csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x20, x20, x17, hs // exchange |fg0| and |fg1| + csel x17, x17, x21, hs + csel x2, x2, x25, hs + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, .Loop_30 + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret +.size __inner_loop_30,.-__inner_loop_30 +.type __inner_loop_48, %function +.align 4 +__inner_loop_48: +.Loop_48: + sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x3, x9 + sub x15, x15, #1 + and x21, x9, x24 + sub x22, x9, x3 // |b_|-|a_| + subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + csel x9, x9, x3, hs // |b_| = |a_| + csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x2, x2, x25, hs + add x23, x9, #2 + lsr x3, x3, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, .Loop_48 + + ret +.size 
__inner_loop_48,.-__inner_loop_48 diff --git a/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s new file mode 100644 index 00000000000..bf610fa7440 --- /dev/null +++ b/crypto/blst_src/build/elf/ct_is_square_mod_384-x86_64.s @@ -0,0 +1,480 @@ +.text + +.globl ct_is_square_mod_384 +.hidden ct_is_square_mod_384 +.type ct_is_square_mod_384,@function +.align 32 +ct_is_square_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $536,%rsp +.cfi_adjust_cfa_offset 536 + + + leaq 24+255(%rsp),%rax + andq $-256,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbx + movq 24(%rsi),%rcx + movq 32(%rsi),%rdx + movq 40(%rsi),%rdi + movq %rax,%rsi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rcx,72(%rax) + movq %rdx,80(%rax) + movq %rdi,88(%rax) + + xorq %rbp,%rbp + movl $24,%ecx + jmp .Loop_is_square + +.align 32 +.Loop_is_square: + movl %ecx,16(%rsp) + + call __ab_approximation_30 + movq %rax,0(%rsp) + movq %rbx,8(%rsp) + + movq $128+48,%rdi + xorq %rsi,%rdi + call __smulq_384_n_shift_by_30 + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq -48(%rdi),%rdi + call __smulq_384_n_shift_by_30 + + movl 16(%rsp),%ecx + xorq $128,%rsi + + andq 48(%rdi),%r14 + shrq $1,%r14 + addq %r14,%rbp + + subl $1,%ecx + jnz .Loop_is_square + + + + + movq 48(%rsi),%r9 + call __inner_loop_48 + + movq $1,%rax + andq %rbp,%rax + xorq $1,%rax + + leaq 536(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -536-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_is_square_mod_384,.-ct_is_square_mod_384 + +.type __smulq_384_n_shift_by_30,@function +.align 32 +__smulq_384_n_shift_by_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r14 + andq %rbx,%r14 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r14 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r14 + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq 
%r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r15 + andq %rbx,%r15 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r15 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r15 + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %r15,%r14 + + shrdq $30,%r9,%r8 + shrdq $30,%r10,%r9 + shrdq $30,%r11,%r10 + shrdq $30,%r12,%r11 + shrdq $30,%r13,%r12 + shrdq $30,%r14,%r13 + + sarq $63,%r14 + xorq %rbx,%rbx + subq %r14,%rbx + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_384_n_shift_by_30,.-__smulq_384_n_shift_by_30 +.type __ab_approximation_30,@function +.align 32 +__ab_approximation_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 88(%rsi),%rbx + movq 80(%rsi),%r15 + movq 72(%rsi),%r14 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r11,%r12 + movq 64(%rsi),%r11 + cmovzq %r14,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r10,%r12 + movq 56(%rsi),%r10 + cmovzq %r11,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r9,%r12 + movq 48(%rsi),%r9 + cmovzq %r10,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r8,%r12 + cmovzq %r9,%r15 + + movq %r13,%rax + orq %rbx,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r13 + cmovzq %r9,%rbx + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%r12,%r13 + shldq %cl,%r15,%rbx + + movq $0xFFFFFFFF00000000,%rax + movl %r8d,%r8d + movl %r9d,%r9d + andq %rax,%r13 + andq %rax,%rbx + orq %r13,%r8 + orq %rbx,%r9 + + jmp __inner_loop_30 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_30,.-__ab_approximation_30 +.type __inner_loop_30,@function +.align 32 +__inner_loop_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rbx + movq $0x800000007FFFFFFF,%rcx + leaq -1(%rbx),%r15 + movl $30,%edi + +.Loop_30: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbx,%r12 + movq %rcx,%r13 + movq %rbp,%r14 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rcx,%rbx + cmovbq %r12,%rcx + cmovbq %rax,%rbp + + subq %r9,%r8 + subq %rcx,%rbx + addq %r15,%rbx + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbx + cmovzq %r13,%rcx + cmovzq %r14,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rcx,%rcx + 
leaq (%rax,%rbp,1),%rbp + subq %r15,%rcx + + subl $1,%edi + jnz .Loop_30 + + shrq $32,%r15 + movl %ebx,%eax + shrq $32,%rbx + movl %ecx,%edx + shrq $32,%rcx + subq %r15,%rax + subq %r15,%rbx + subq %r15,%rdx + subq %r15,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_30,.-__inner_loop_30 + +.type __inner_loop_48,@function +.align 32 +__inner_loop_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl $48,%edi + +.Loop_48: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbp,%r12 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rax,%rbp + + subq %r9,%r8 + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rax,%rbp + + subl $1,%edi + jnz .Loop_48 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_48,.-__inner_loop_48 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..9cca518721f --- /dev/null +++ b/crypto/blst_src/build/elf/ctq_inverse_mod_384-x86_64.s @@ -0,0 +1,1201 @@ +.comm __blst_platform_cap,4 +.text + +.globl ct_inverse_mod_383 +.hidden ct_inverse_mod_383 +.type ct_inverse_mod_383,@function +.align 32 +ct_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz ct_inverse_mod_383$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq 
%r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + 
leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + xorq $256+96,%rsi + movl $62,%edi + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 48(%rsi),%r10 + movq 56(%rsi),%r11 + call __inner_loop_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + movq %r8,0(%rdi) + movq %r10,48(%rdi) + + + + leaq 96(%rsi),%rsi + leaq 96(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + + xorq $256+96,%rsi + movl $22,%edi + + movq 0(%rsi),%r8 + xorq %r9,%r9 + movq 48(%rsi),%r10 + xorq %r11,%r11 + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ct_inverse_mod_383,.-ct_inverse_mod_383 +.type 
__smulq_767x63,@function +.align 32 +__smulq_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + movq %r9,8(%rdi) + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + movq %r10,16(%rdi) + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %r11,24(%rdi) + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + movq %r12,32(%rdi) + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + movq %r13,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + movq %rdx,%rsi + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rsi + addq %rax,%rsi + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rbx + xorq %rdx,%rbp + xorq %rdx,%rcx + xorq %rdx,%rdi + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulq %rsi + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rsi + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rsi + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rsi + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rsi + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rsi + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %rdx,%rbx + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + mulq %rsi + addq %rax,%rbp + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rcx + mulq %rsi + addq %rax,%rcx + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%rdi + movq 8(%rsp),%rdx + imulq %rsi,%rax + movq 16(%rsp),%rsi + addq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_767x63,.-__smulq_767x63 +.type __smulq_383x63,@function +.align 32 +__smulq_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 
0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_383x63,.-__smulq_383x63 +.type __smulq_383_n_shift_by_62,@function +.align 32 +__smulq_383_n_shift_by_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq 48(%rsi),%rsi + movq %rdx,%r14 + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq 
%rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $62,%r9,%r8 + shrdq $62,%r10,%r9 + shrdq $62,%r11,%r10 + shrdq $62,%r12,%r11 + shrdq $62,%r13,%r12 + shrdq $62,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulq_383_n_shift_by_62,.-__smulq_383_n_shift_by_62 +.type __ab_approximation_62,@function +.align 32 +__ab_approximation_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 16(%rsi),%r8 + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 8(%rsi),%r8 + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 0(%rsi),%r8 + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + jmp __inner_loop_62 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_62,.-__ab_approximation_62 +.type __inner_loop_62,@function +.align 8 +.long 0 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + movq %rsi,8(%rsp) + +.Loop_62: + xorq %rax,%rax + xorq %rbx,%rbx + testq $1,%r8 + movq %r10,%rbp + movq %r11,%r14 + cmovnzq %r10,%rax + cmovnzq %r11,%rbx + subq %r8,%rbp + sbbq %r9,%r14 + movq %r8,%r15 + movq %r9,%rsi + subq %rax,%r8 + sbbq %rbx,%r9 + cmovcq %rbp,%r8 + cmovcq %r14,%r9 + cmovcq %r15,%r10 + cmovcq %rsi,%r11 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrdq $1,%r9,%r8 + shrq $1,%r9 + testq $1,%r15 + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_62 + + movq 8(%rsp),%rsi + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_62,.-__inner_loop_62 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git 
a/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..9f4d12babd4 --- /dev/null +++ b/crypto/blst_src/build/elf/ctx_inverse_mod_384-x86_64.s @@ -0,0 +1,1576 @@ +.text + +.globl ctx_inverse_mod_383 +.hidden ctx_inverse_mod_383 +.type ctx_inverse_mod_383,@function +.align 32 +ctx_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +ct_inverse_mod_383$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 
80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call 
__smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 
48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call 
__smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + + xorq $256+96,%rsi + movl $53,%edi + + movq 0(%rsi),%r8 + + movq 48(%rsi),%r10 + + call __tail_loop_53 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulx_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size ctx_inverse_mod_383,.-ctx_inverse_mod_383 +.type __smulx_767x63,@function +.align 32 +__smulx_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + movq %rcx,%rax + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + sarq $63,%rax + xorq %rsi,%rsi + subq %rax,%rsi + + xorq %rax,%rdx + addq %rsi,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %rax,%r13 + xorq %rax,%r14 + xorq %rax,%r15 + xorq %rax,%rbx + xorq %rax,%rbp + xorq %rax,%rcx + xorq %rax,%rdi + addq %rsi,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulxq %r8,%r8,%rax + mulxq %r9,%r9,%rsi + addq %rax,%r9 + mulxq %r10,%r10,%rax + adcq %rsi,%r10 + mulxq %r11,%r11,%rsi + adcq %rax,%r11 + mulxq %r12,%r12,%rax + adcq %rsi,%r12 + mulxq %r13,%r13,%rsi + adcq %rax,%r13 + mulxq %r14,%r14,%rax + adcq %rsi,%r14 + mulxq 
%r15,%r15,%rsi + adcq %rax,%r15 + mulxq %rbx,%rbx,%rax + adcq %rsi,%rbx + mulxq %rbp,%rbp,%rsi + adcq %rax,%rbp + mulxq %rcx,%rcx,%rax + adcq %rsi,%rcx + mulxq %rdi,%rdi,%rsi + movq 8(%rsp),%rdx + movq 16(%rsp),%rsi + adcq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_767x63,.-__smulx_767x63 +.type __smulx_383x63,@function +.align 32 +__smulx_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + movq %rcx,%rdx + adcq %rbp,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + adcq %rbp,%r13 + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_383x63,.-__smulx_383x63 +.type __smulx_383_n_shift_by_31,@function +.align 32 +__smulx_383_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + xorq %r14,%r14 + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq %rdx,%r14 + + movq 
%rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%rax + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%r12,%r11 + shrdq $31,%rax,%r12 + shrdq $31,%r14,%rax + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_383_n_shift_by_31,.-__smulx_383_n_shift_by_31 +.type __smulx_191_n_shift_by_31,@function +.align 32 +__smulx_191_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %r10,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r10 + addq %rbp,%r9 + adcq $0,%r10 + imulq %rdx + addq %rax,%r10 + adcq $0,%rdx + movq %rdx,%r14 + movq %rcx,%rdx + movq 48+0(%rsi),%r11 + movq 48+8(%rsi),%r12 + movq 48+16(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r11,%r11,%rbp + mulxq %r12,%r12,%r13 + addq %rbp,%r12 + adcq $0,%r13 + imulq %rdx + addq %rax,%r13 + adcq $0,%rdx + addq %r8,%r11 + adcq %r9,%r12 + adcq %r10,%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r12,%r11 + shrdq $31,%r13,%r12 + shrdq $31,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __smulx_191_n_shift_by_31,.-__smulx_191_n_shift_by_31 +.type __ab_approximation_31,@function +.align 32 +__ab_approximation_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 16(%rsi),%r8 + cmovzq %r10,%rbp + movq 64(%rsi),%r10 + + movq %r9,%rax + orq 
%r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 8(%rsi),%r8 + cmovzq %r10,%rbp + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + andnq %r9,%rax,%r9 + andnq %r11,%rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ab_approximation_31,.-__ab_approximation_31 +.type __inner_loop_31,@function +.align 32 +__inner_loop_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +.Loop_31: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edi + jnz .Loop_31 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __inner_loop_31,.-__inner_loop_31 + +.type __tail_loop_53,@function +.align 32 +__tail_loop_53: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + +.Loop_53: + xorq %rax,%rax + testq $1,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq $1,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz .Loop_53 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __tail_loop_53,.-__tail_loop_53 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/div3w-armv8.S b/crypto/blst_src/build/elf/div3w-armv8.S new file mode 100644 index 00000000000..37621bee415 --- /dev/null +++ b/crypto/blst_src/build/elf/div3w-armv8.S @@ -0,0 +1,88 @@ +.text + +.globl div_3_limbs +.type div_3_limbs,%function +.align 5 +div_3_limbs: + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +.Loop: + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csel x4,x4,x6,lo // select between R and R - D + extr x1,x2,x1,#1 // D >>= 1 + csel x5,x5,x7,lo + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,.Loop + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + speculative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret +.size div_3_limbs,.-div_3_limbs +.globl quot_rem_128 +.type quot_rem_128,%function +.align 5 
+quot_rem_128: + ldp x3,x4,[x1] + + mul x5,x3,x2 // divisor[0:1} * quotient + umulh x6,x3,x2 + mul x11, x4,x2 + umulh x7,x4,x2 + + ldp x8,x9,[x0] // load 3 limbs of the dividend + ldr x10,[x0,#16] + + adds x6,x6,x11 + adc x7,x7,xzr + + subs x8,x8,x5 // dividend - divisor * quotient + sbcs x9,x9,x6 + sbcs x10,x10,x7 + sbc x5,xzr,xzr // borrow -> mask + + add x2,x2,x5 // if borrowed, adjust the quotient ... + and x3,x3,x5 + and x4,x4,x5 + adds x8,x8,x3 // ... and add divisor + adc x9,x9,x4 + + stp x8,x9,[x0] // save 2 limbs of the remainder + str x2,[x0,#16] // and one limb of the quotient + + mov x0,x2 // return adjusted quotient + + ret +.size quot_rem_128,.-quot_rem_128 + +.globl quot_rem_64 +.type quot_rem_64,%function +.align 5 +quot_rem_64: + ldr x3,[x1] + ldr x8,[x0] // load 1 limb of the dividend + + mul x5,x3,x2 // divisor * quotient + + sub x8,x8,x5 // dividend - divisor * quotient + + stp x8,x2,[x0] // save remainder and quotient + + mov x0,x2 // return quotient + + ret +.size quot_rem_64,.-quot_rem_64 diff --git a/crypto/blst_src/build/elf/div3w-x86_64.s b/crypto/blst_src/build/elf/div3w-x86_64.s new file mode 100644 index 00000000000..5d9fd8a9139 --- /dev/null +++ b/crypto/blst_src/build/elf/div3w-x86_64.s @@ -0,0 +1,132 @@ +.text + +.globl div_3_limbs +.hidden div_3_limbs +.type div_3_limbs,@function +.align 32 +div_3_limbs: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq (%rdi),%r8 + movq 8(%rdi),%r9 + xorq %rax,%rax + movl $64,%ecx + +.Loop: + movq %r8,%r10 + subq %rsi,%r8 + movq %r9,%r11 + sbbq %rdx,%r9 + leaq 1(%rax,%rax,1),%rax + movq %rdx,%rdi + cmovcq %r10,%r8 + cmovcq %r11,%r9 + sbbq $0,%rax + shlq $63,%rdi + shrq $1,%rsi + shrq $1,%rdx + orq %rdi,%rsi + subl $1,%ecx + jnz .Loop + + leaq 1(%rax,%rax,1),%rcx + sarq $63,%rax + + subq %rsi,%r8 + sbbq %rdx,%r9 + sbbq $0,%rcx + + orq %rcx,%rax + + + .byte 0xf3,0xc3 +.cfi_endproc +.size div_3_limbs,.-div_3_limbs +.globl quot_rem_128 +.hidden quot_rem_128 +.type quot_rem_128,@function +.align 32 +quot_rem_128: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq %rdx,%rax + movq %rdx,%rcx + + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + adcq $0,%rdx + + movq 0(%rdi),%r10 + movq 8(%rdi),%r11 + movq 16(%rdi),%rax + + subq %r8,%r10 + sbbq %r9,%r11 + sbbq %rdx,%rax + sbbq %r8,%r8 + + addq %r8,%rcx + movq %r8,%r9 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + addq %r8,%r10 + adcq %r9,%r11 + + movq %r10,0(%rdi) + movq %r11,8(%rdi) + movq %rcx,16(%rdi) + + movq %rcx,%rax + + + .byte 0xf3,0xc3 +.cfi_endproc +.size quot_rem_128,.-quot_rem_128 + + + + + +.globl quot_rem_64 +.hidden quot_rem_64 +.type quot_rem_64,@function +.align 32 +quot_rem_64: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq %rdx,%rax + imulq 0(%rsi),%rdx + + movq 0(%rdi),%r10 + + subq %rdx,%r10 + + movq %r10,0(%rdi) + movq %rax,8(%rdi) + + + .byte 0xf3,0xc3 +.cfi_endproc +.size quot_rem_64,.-quot_rem_64 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/mul_mont_256-armv8.S b/crypto/blst_src/build/elf/mul_mont_256-armv8.S new file mode 100644 index 00000000000..8bb1197f464 --- /dev/null +++ b/crypto/blst_src/build/elf/mul_mont_256-armv8.S @@ -0,0 +1,464 @@ +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,%function +.align 5 +mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csel x19,x19,x14,lo + csel x20,x20,x15,lo + csel x21,x21,x16,lo + csel x22,x22,x17,lo + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret +.size mul_mont_sparse_256,.-mul_mont_sparse_256 +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,%function +.align 5 +sqr_mont_sparse_256: + .inst 0xd503233f + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + .inst 0xd50323bf + ret +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,%function +.align 5 +from_mont_256: + .inst 0xd503233f + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + .inst 0xd50323bf + ret +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,%function +.align 5 +redc_mont_256: + .inst 0xd503233f + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + .inst 0xd50323bf + ret +.size redc_mont_256,.-redc_mont_256 + +.type __mul_by_1_mont_256,%function +.align 5 +__mul_by_1_mont_256: + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret +.size __mul_by_1_mont_256,.-__mul_by_1_mont_256 diff --git a/crypto/blst_src/build/elf/mul_mont_384-armv8.S b/crypto/blst_src/build/elf/mul_mont_384-armv8.S new file mode 100644 index 00000000000..c048e816b85 --- /dev/null +++ b/crypto/blst_src/build/elf/mul_mont_384-armv8.S @@ -0,0 +1,2372 @@ +.text + +.globl add_mod_384x384 +.type add_mod_384x384,%function +.align 5 +add_mod_384x384: + .inst 0xd503233f + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + .inst 0xd50323bf + ret +.size add_mod_384x384,.-add_mod_384x384 + +.type __add_mod_384x384,%function +.align 5 +__add_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + stp x11,x12,[x0,#48] + csel x15,x15,x23,lo + stp x13,x14,[x0,#64] + csel x16,x16,x24,lo + stp x15,x16,[x0,#80] + + ret +.size __add_mod_384x384,.-__add_mod_384x384 + +.globl sub_mod_384x384 +.type sub_mod_384x384,%function +.align 5 +sub_mod_384x384: + .inst 0xd503233f + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + .inst 0xd50323bf + ret +.size sub_mod_384x384,.-sub_mod_384x384 + +.type __sub_mod_384x384,%function +.align 5 +__sub_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret +.size __sub_mod_384x384,.-__sub_mod_384x384 + +.type __add_mod_384,%function +.align 5 +__add_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + stp x11,x12,[x0] + csel x16,x16,x24,lo + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + 
ret +.size __add_mod_384,.-__add_mod_384 + +.type __sub_mod_384,%function +.align 5 +__sub_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret +.size __sub_mod_384,.-__sub_mod_384 + +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,%function +.align 5 +mul_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_mont_384x,.-mul_mont_384x + +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,%function +.align 5 +sqr_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x19,x11,x19,lo + csel x20,x12,x20,lo + csel x21,x13,x21,lo + ldp x11,x12,[sp] + csel x22,x14,x22,lo + ldr x17, [sp,#48] + csel x23,x15,x23,lo + ldp x13,x14,[sp,#16] + csel x24,x16,x24,lo + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,%function +.align 5 +mul_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_mont_384,.-mul_mont_384 + +.type __mul_mont_384,%function +.align 5 +__mul_mont_384: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh 
x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc 
x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csel x11,x19,x26,lo + csel x12,x20,x27,lo + csel x13,x21,x28,lo + csel x14,x22,x0,lo + csel x15,x23,x1,lo + csel x16,x24,x3,lo + ret +.size __mul_mont_384,.-__mul_mont_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,%function +.align 5 +sqr_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_384,.-sqr_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,%function +.align 5 +sqr_n_mul_mont_383: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +.Loop_sqr_383: + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,.Loop_sqr_383 + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +.type __sqr_384,%function +.align 5 +__sqr_384: + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret +.size __sqr_384,.-__sqr_384 +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,%function +.align 5 +sqr_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_384,.-sqr_384 + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,%function +.align 5 +redc_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size redc_mont_384,.-redc_mont_384 + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,%function +.align 5 +from_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size from_mont_384,.-from_mont_384 + +.type __mul_by_1_mont_384,%function +.align 5 +__mul_by_1_mont_384: + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc 
x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret +.size __mul_by_1_mont_384,.-__mul_by_1_mont_384 + +.type __redc_tail_mont_384,%function +.align 5 +__redc_tail_mont_384: + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret +.size __redc_tail_mont_384,.-__redc_tail_mont_384 + +.globl mul_384 +.hidden mul_384 +.type mul_384,%function +.align 5 +mul_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_384,.-mul_384 + +.type __mul_384,%function +.align 5 +__mul_384: + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret +.size __mul_384,.-__mul_384 + +.globl mul_382x +.hidden mul_382x +.type mul_382x,%function +.align 5 +mul_382x: + .inst 0xd503233f + stp 
x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size mul_382x,.-mul_382x + +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,%function +.align 5 +sqr_382x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_382x,.-sqr_382x + +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,%function +.align 5 +sqr_mont_382x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sqr_mont_382x,.-sqr_mont_382x + +.type __mul_mont_383_nonred,%function +.align 5 +__mul_mont_383_nonred: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 
+ umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul 
x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret +.size __mul_mont_383_nonred,.-__mul_mont_383_nonred + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,%function +.align 5 +sgn0_pty_mont_384: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,%function +.align 5 +sgn0_pty_mont_384x: + .inst 0xd503233f + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + .inst 0xd50323bf + ret +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x diff --git a/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s b/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s new file mode 100644 index 00000000000..10b1b56cb50 --- /dev/null +++ b/crypto/blst_src/build/elf/mulq_mont_256-x86_64.s @@ -0,0 +1,731 @@ +.comm __blst_platform_cap,4 +.text + +.globl mul_mont_sparse_256 +.hidden mul_mont_sparse_256 +.type mul_mont_sparse_256,@function +.align 32 +mul_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_sparse_256$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_sparse_256,.-mul_mont_sparse_256 + +.globl sqr_mont_sparse_256 +.hidden sqr_mont_sparse_256 +.type sqr_mont_sparse_256,@function +.align 32 +sqr_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_sparse_256$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq %rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_sparse_256,.-sqr_mont_sparse_256 +.type __mulq_mont_sparse_256,@function +.align 32 +__mulq_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + 
movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq $0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq 
%r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_sparse_256,.-__mulq_mont_sparse_256 +.globl from_mont_256 +.hidden from_mont_256 +.type from_mont_256,@function +.align 32 +from_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz from_mont_256$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size from_mont_256,.-from_mont_256 + +.globl redc_mont_256 +.hidden redc_mont_256 +.type redc_mont_256,@function +.align 32 +redc_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz redc_mont_256$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redc_mont_256,.-redc_mont_256 +.type __mulq_by_1_mont_256,@function +.align 32 +__mulq_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq 
%r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r9 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_by_1_mont_256,.-__mulq_by_1_mont_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s b/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s new file mode 100644 index 00000000000..903ba23b12c --- /dev/null +++ b/crypto/blst_src/build/elf/mulq_mont_384-x86_64.s @@ -0,0 +1,3681 @@ +.comm __blst_platform_cap,4 +.text + + + + + + + +.type __subq_mod_384x384,@function +.align 32 +__subq_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __subq_mod_384x384,.-__subq_mod_384x384 + +.type __addq_mod_384,@function +.align 32 +__addq_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 
8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __addq_mod_384,.-__addq_mod_384 + +.type __subq_mod_384,@function +.align 32 +__subq_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__subq_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __subq_mod_384,.-__subq_mod_384 +.globl mul_mont_384x +.hidden mul_mont_384x +.type mul_mont_384x,@function +.align 32 +mul_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_384x$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __addq_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __addq_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __subq_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __subq_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __subq_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp 
+.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_384x,.-mul_mont_384x +.globl sqr_mont_384x +.hidden sqr_mont_384x +.type sqr_mont_384x,@function +.align 32 +sqr_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_384x$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __addq_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __subq_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_384x,.-sqr_mont_384x + +.globl mul_382x +.hidden mul_382x +.type mul_382x,@function +.align 32 +mul_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_382x$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 
24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __subq_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __subq_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __subq_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_382x,.-mul_382x +.globl sqr_382x +.hidden sqr_382x +.type sqr_382x,@function +.align 32 +sqr_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_382x$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __subq_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp 
+.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_382x,.-sqr_382x +.globl mul_384 +.hidden mul_384 +.type mul_384,@function +.align 32 +mul_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_384$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rbx + call __mulq_384 + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_384,.-mul_384 + +.type __mulq_384,@function +.align 32 +__mulq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp + mulq 0(%rsi) + movq %rax,0(%rdi) + movq %rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq 
%rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_384,.-__mulq_384 +.globl sqr_384 +.hidden sqr_384 +.type sqr_384,@function +.align 32 +sqr_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_384$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sqrq_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_384,.-sqr_384 + +.type __sqrq_384,@function +.align 32 +__sqrq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + 
mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%rbx + + mulq %rax + xorq %r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq %r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sqrq_384,.-__sqrq_384 + +.globl sqr_mont_384 +.hidden sqr_mont_384 +.type sqr_mont_384,@function +.align 32 +sqr_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_384$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $120,%rsp +.cfi_adjust_cfa_offset 8*15 + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*21 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_384,.-sqr_mont_384 + + + +.globl redc_mont_384 +.hidden redc_mont_384 +.type redc_mont_384,@function +.align 32 +redc_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz redc_mont_384$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redc_mont_384,.-redc_mont_384 + + + + +.globl from_mont_384 +.hidden from_mont_384 +.type from_mont_384,@function +.align 32 +from_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz from_mont_384$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size from_mont_384,.-from_mont_384 +.type __mulq_by_1_mont_384,@function +.align 32 +__mulq_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq 
$0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_by_1_mont_384,.-__mulq_by_1_mont_384 + +.type __redq_tail_mont_384,@function +.align 32 +__redq_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq 
%rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __redq_tail_mont_384,.-__redq_tail_mont_384 + +.globl sgn0_pty_mont_384 +.hidden sgn0_pty_mont_384 +.type sgn0_pty_mont_384,@function +.align 32 +sgn0_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sgn0_pty_mont_384$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mont_384,.-sgn0_pty_mont_384 + +.globl sgn0_pty_mont_384x +.hidden sgn0_pty_mont_384x +.type sgn0_pty_mont_384x,@function +.align 32 +sgn0_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sgn0_pty_mont_384x$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 
24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0_pty_mont_384x,.-sgn0_pty_mont_384x +.globl mul_mont_384 +.hidden mul_mont_384 +.type mul_mont_384,@function +.align 32 +mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz mul_mont_384$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + + call __mulq_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -72 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mul_mont_384,.-mul_mont_384 +.type __mulq_mont_384,@function +.align 32 +__mulq_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq $0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp 
+ + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 
16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + + movq 16(%rsp),%rdi + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_384,.-__mulq_mont_384 +.globl sqr_n_mul_mont_384 +.hidden sqr_n_mul_mont_384 +.type sqr_n_mul_mont_384,@function +.align 32 +sqr_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_n_mul_mont_384$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz .Loop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_n_mul_mont_384,.-sqr_n_mul_mont_384 + +.globl sqr_n_mul_mont_383 +.hidden sqr_n_mul_mont_383 +.type sqr_n_mul_mont_383,@function +.align 32 +sqr_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_n_mul_mont_383$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +.Loop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz .Loop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_n_mul_mont_383,.-sqr_n_mul_mont_383 +.type __mulq_mont_383_nonred,@function +.align 32 +__mulq_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + 
+ mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq %rdx,%r15 + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r9 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax 
+ adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r11,%r10 + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + 
mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulq_mont_383_nonred,.-__mulq_mont_383_nonred +.globl sqr_mont_382x +.hidden sqr_mont_382x +.type sqr_mont_382x,@function +.align 32 +sqr_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,__blst_platform_cap(%rip) + jnz sqr_mont_382x$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi + + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + sbbq %rsi,%rsi + + andq %rsi,%r12 + andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp 
+.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqr_mont_382x,.-sqr_mont_382x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s b/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s new file mode 100644 index 00000000000..42e89134cff --- /dev/null +++ b/crypto/blst_src/build/elf/mulx_mont_256-x86_64.s @@ -0,0 +1,631 @@ +.text + +.globl mulx_mont_sparse_256 +.hidden mulx_mont_sparse_256 +.type mulx_mont_sparse_256,@function +.align 32 +mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +mul_mont_sparse_256$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_sparse_256,.-mulx_mont_sparse_256 + +.globl sqrx_mont_sparse_256 +.hidden sqrx_mont_sparse_256 +.type sqrx_mont_sparse_256,@function +.align 32 +sqrx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sqr_mont_sparse_256$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_sparse_256,.-sqrx_mont_sparse_256 +.type __mulx_mont_sparse_256,@function +.align 32 +__mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq %rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + 
adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq %r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 + imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq %r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_sparse_256,.-__mulx_mont_sparse_256 +.globl fromx_mont_256 +.hidden fromx_mont_256 +.type fromx_mont_256,@function +.align 32 +fromx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +from_mont_256$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq 
%rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size fromx_mont_256,.-fromx_mont_256 + +.globl redcx_mont_256 +.hidden redcx_mont_256 +.type redcx_mont_256,@function +.align 32 +redcx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +redc_mont_256$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redcx_mont_256,.-redcx_mont_256 +.type __mulx_by_1_mont_256,@function +.align 32 +__mulx_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq 
%rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_by_1_mont_256,.-__mulx_by_1_mont_256 + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s b/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s new file mode 100644 index 00000000000..5c67d918d22 --- /dev/null +++ b/crypto/blst_src/build/elf/mulx_mont_384-x86_64.s @@ -0,0 +1,2983 @@ +.text + + + + + + + +.type __subx_mod_384x384,@function +.align 32 +__subx_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __subx_mod_384x384,.-__subx_mod_384x384 + +.type __addx_mod_384,@function +.align 32 +__addx_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __addx_mod_384,.-__addx_mod_384 + +.type __subx_mod_384,@function +.align 32 +__subx_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__subx_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + 
sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __subx_mod_384,.-__subx_mod_384 +.globl mulx_mont_384x +.hidden mulx_mont_384x +.type mulx_mont_384x,@function +.align 32 +mulx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +mul_mont_384x$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __addx_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __addx_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __subx_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __subx_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __subx_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call __redx_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulx_by_1_mont_384 + call __redx_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_384x,.-mulx_mont_384x +.globl sqrx_mont_384x +.hidden sqrx_mont_384x +.type sqrx_mont_384x,@function +.align 32 +sqrx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sqr_mont_384x$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __addx_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __subx_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq 
%rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq %rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) + cmovcq %r11,%r12 + movq %r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_384x,.-sqrx_mont_384x + +.globl mulx_382x +.hidden mulx_382x +.type mulx_382x,@function +.align 32 +mulx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +mul_382x$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __subx_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __subx_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __subx_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp 
+.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_382x,.-mulx_382x +.globl sqrx_382x +.hidden sqrx_382x +.type sqrx_382x,@function +.align 32 +sqrx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sqr_382x$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __subx_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_382x,.-sqrx_382x +.globl mulx_384 +.hidden mulx_384 +.type mulx_384,@function +.align 32 +mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +mul_384$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_384,.-mulx_384 + +.type __mulx_384,@function +.align 32 +__mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + 
movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq %rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_384,.-__mulx_384 +.globl sqrx_384 +.hidden sqrx_384 +.type sqrx_384,@function +.align 32 +sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sqr_384$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + call __sqrx_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx 
+.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_384,.-sqrx_384 +.type __sqrx_384,@function +.align 32 +__sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx + addq %rdi,%rcx + movq 8(%rsp),%rdi + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq %r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq %r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq %r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 + movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __sqrx_384,.-__sqrx_384 + + + +.globl redcx_mont_384 +.hidden redcx_mont_384 +.type redcx_mont_384,@function +.align 32 +redcx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +redc_mont_384$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redx_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size redcx_mont_384,.-redcx_mont_384 + + + + +.globl fromx_mont_384 +.hidden fromx_mont_384 +.type fromx_mont_384,@function +.align 32 +fromx_mont_384: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + +from_mont_384$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq %r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size fromx_mont_384,.-fromx_mont_384 +.type __mulx_by_1_mont_384,@function +.align 32 +__mulx_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq %rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 8(%rbx),%rax,%rbp + adcxq 
%rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_by_1_mont_384,.-__mulx_by_1_mont_384 + +.type __redx_tail_mont_384,@function +.align 32 +__redx_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __redx_tail_mont_384,.-__redx_tail_mont_384 + +.globl sgn0x_pty_mont_384 +.hidden sgn0x_pty_mont_384 +.type sgn0x_pty_mont_384,@function +.align 32 +sgn0x_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sgn0_pty_mont_384$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0x_pty_mont_384,.-sgn0x_pty_mont_384 + +.globl sgn0x_pty_mont_384x +.hidden sgn0x_pty_mont_384x +.type sgn0x_pty_mont_384x,@function +.align 32 +sgn0x_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sgn0_pty_mont_384x$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sgn0x_pty_mont_384x,.-sgn0x_pty_mont_384x +.globl mulx_mont_384 +.hidden mulx_mont_384 +.type mulx_mont_384,@function +.align 32 +mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +mul_mont_384$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size mulx_mont_384,.-mulx_mont_384 +.type __mulx_mont_384,@function +.align 32 +__mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 
24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq 
%rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq %r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_384,.-__mulx_mont_384 +.globl sqrx_mont_384 +.hidden sqrx_mont_384 +.type sqrx_mont_384,@function +.align 32 +sqrx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sqr_mont_384$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 
40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_384,.-sqrx_mont_384 + +.globl sqrx_n_mul_mont_384 +.hidden sqrx_n_mul_mont_384 +.type sqrx_n_mul_mont_384,@function +.align 32 +sqrx_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sqr_n_mul_mont_384$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + +.Loop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_n_mul_mont_384,.-sqrx_n_mul_mont_384 + +.globl sqrx_n_mul_mont_383 +.hidden sqrx_n_mul_mont_383 +.type sqrx_n_mul_mont_383,@function +.align 32 +sqrx_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sqr_n_mul_mont_383$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + +.Loop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz .Loop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_n_mul_mont_383,.-sqrx_n_mul_mont_383 +.type __mulx_mont_383_nonred,@function +.align 32 +__mulx_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq 
%r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp + adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r10,%r9 + imulq 8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq 
%rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq $0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 0xf3,0xc3 +.cfi_endproc +.size __mulx_mont_383_nonred,.-__mulx_mont_383_nonred +.globl sqrx_mont_382x +.hidden sqrx_mont_382x +.type sqrx_mont_382x,@function +.align 32 +sqrx_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sqr_mont_382x$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 
80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc +.size sqrx_mont_382x,.-sqrx_mont_382x + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/sha256-armv8.S b/crypto/blst_src/build/elf/sha256-armv8.S new file mode 100644 index 00000000000..45c1162c467 --- /dev/null +++ b/crypto/blst_src/build/elf/sha256-armv8.S @@ -0,0 +1,1083 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. +// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with rationale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details. 
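The ARMv8 module introduced here exposes two entry points for the SHA-256 block compression over 64-byte blocks: blst_sha256_block_armv8 uses the SHA-256 crypto extensions, and blst_sha256_block_data_order is the NEON-assisted path that dispatches to the hardware routine at runtime when bit 0 of __blst_platform_cap is set. As a rough illustration only, the C sketch below shows how such a block routine is typically driven; the prototypes are assumptions inferred from the register usage visible in the assembly (x0 = 8-word state, x1 = input, x2 = block count) and from the blst_sha256_emit helper defined later in this file — these are internal blst symbols, not a documented public API — and the final message-padding block is deliberately omitted for brevity.

    /*
     * Hedged sketch, not part of this diff: compress one 64-byte block with the
     * routine added in sha256-armv8.S and serialize the state to a digest.
     * Prototypes are ASSUMED from the assembly's register usage; padding of the
     * final partial block (required for a real SHA-256) is intentionally omitted.
     */
    #include <stddef.h>
    #include <stdio.h>

    void blst_sha256_block_data_order(unsigned int *h, const void *inp, size_t blocks); /* assumed prototype */
    void blst_sha256_emit(unsigned char md[32], const unsigned int h[8]);               /* assumed prototype */

    int main(void)
    {
        /* Standard SHA-256 initial hash values (FIPS 180-4). */
        unsigned int h[8] = {
            0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
            0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
        };
        unsigned char block[64] = {0};   /* one all-zero 64-byte input block */
        unsigned char md[32];

        /* Compress a single block; a complete hash would also process a padded final block. */
        blst_sha256_block_data_order(h, block, 1);

        /* Serialize the 8-word state into a big-endian 32-byte digest. */
        blst_sha256_emit(md, h);

        for (int i = 0; i < 32; i++)
            printf("%02x", md[i]);
        printf("\n");
        return 0;
    }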
+ +.comm __blst_platform_cap,4 +.text + +.align 6 +.type .LK256,%object +.LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator +.size .LK256,.-.LK256 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.align 2 +.align 2 +.globl blst_sha256_block_armv8 +.type blst_sha256_block_armv8,%function +.align 6 +blst_sha256_block_armv8: +.Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,.LK256 + +.Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add 
v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret +.size blst_sha256_block_armv8,.-blst_sha256_block_armv8 +.globl blst_sha256_block_data_order +.type blst_sha256_block_data_order,%function +.align 4 +blst_sha256_block_data_order: + adrp x16,__blst_platform_cap + ldr w16,[x16,#:lo12:__blst_platform_cap] + tst w16,#1 + b.ne .Lv8_entry + + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr x16,.LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b .L_00_48 + +.align 4 +.L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli 
v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + 
add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne .L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add 
w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor 
w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne .L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,%function +.align 4 +blst_sha256_emit: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] + str w6,[x0,#16] + str w7,[x0,#24] + ret +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,%function +.align 4 +blst_sha256_bcopy: +.Loop_bcopy: + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,.Loop_bcopy + ret +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,%function +.align 4 +blst_sha256_hcopy: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret +.size blst_sha256_hcopy,.-blst_sha256_hcopy diff --git a/crypto/blst_src/build/elf/sha256-portable-x86_64.s b/crypto/blst_src/build/elf/sha256-portable-x86_64.s new file mode 100644 index 00000000000..2fd6a770917 --- /dev/null +++ b/crypto/blst_src/build/elf/sha256-portable-x86_64.s @@ -0,0 +1,1758 @@ +.comm __blst_platform_cap,4 +.text + +.globl 
blst_sha256_block_data_order +.type blst_sha256_block_data_order,@function +.align 16 +blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp +#ifdef __BLST_PORTABLE__ + testl $2,__blst_platform_cap(%rip) + jnz .Lblst_sha256_block_data_order$2 +#endif + pushq %rbx +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $64+24,%rsp + +.cfi_def_cfa %rsp,144 + + leaq (%rsi,%rdx,4),%rdx + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop + +.align 16 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 0(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 4(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 8(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 12(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl 
$11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 16(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 20(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 24(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 28(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 32(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 36(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 40(%rbp),%r12d + xorl 
%r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 44(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 48(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 52(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 56(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 60(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl 
%r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 64(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 68(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 72(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 76(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + 
xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 80(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 84(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 88(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 92(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 
96(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 100(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 104(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 108(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 112(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl 
%r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 116(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 120(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 124(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 64(%rbp),%rbp + cmpb $0x19,3(%rbp) + jnz .Lrounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + + leaq 64+24+48(%rsp),%r11 +.cfi_def_cfa %r11,8 + movq 64+24(%rsp),%r15 + movq -40(%r11),%r14 + movq -32(%r11),%r13 + movq -24(%r11),%r12 + movq -16(%r11),%rbx + movq -8(%r11),%rbp +.cfi_restore %r12 +.cfi_restore %r13 +.cfi_restore %r14 +.cfi_restore %r15 +.cfi_restore %rbp +.cfi_restore %rbx + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc +.size 
blst_sha256_block_data_order,.-blst_sha256_block_data_order + +#ifndef __BLST_PORTABLE__ +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,@function +.align 16 +blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,@function +.align 16 +blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +.Loop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz .Loop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,@function +.align 16 +blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_hcopy,.-blst_sha256_hcopy +#endif + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/elf/sha256-x86_64.s b/crypto/blst_src/build/elf/sha256-x86_64.s new file mode 100644 index 00000000000..940051aab16 --- /dev/null +++ b/crypto/blst_src/build/elf/sha256-x86_64.s @@ -0,0 +1,1455 @@ +.comm __blst_platform_cap,4 +.text + +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 
0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl blst_sha256_block_data_order_shaext +.hidden blst_sha256_block_data_order_shaext +.type blst_sha256_block_data_order_shaext,@function +.align 64 +blst_sha256_block_data_order_shaext: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp +.Lblst_sha256_block_data_order$2: + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 
+ pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) +.cfi_def_cfa_register %rsp + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order_shaext,.-blst_sha256_block_data_order_shaext +.globl blst_sha256_block_data_order +.hidden blst_sha256_block_data_order +.type blst_sha256_block_data_order,@function +.align 64 +blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + testl $2,__blst_platform_cap(%rip) + jnz .Lblst_sha256_block_data_order$2 + pushq %rbx +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $24,%rsp + + leaq (%rsi,%rdx,4),%rdx + movq %rdi,-64(%rbp) + + movq %rdx,-48(%rbp) + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa K256+256(%rip),%xmm7 + movq %rsi,-56(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 + movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl 
$5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa 
%xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl 
%edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl 
%eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d 
+ addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl 
$6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq -64(%rbp),%rdi + movl %r14d,%eax + movq -56(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq -48(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + xorps %xmm0,%xmm0 + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movq -40(%rbp),%r15 + movq -32(%rbp),%r14 + movq -24(%rbp),%r13 + movq -16(%rbp),%r12 + movq -8(%rbp),%rbx + movq %rbp,%rsp +.cfi_def_cfa_register %rsp + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp +.cfi_restore %r12 +.cfi_restore %r13 +.cfi_restore %r14 +.cfi_restore %r15 +.cfi_restore %rbx + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_block_data_order,.-blst_sha256_block_data_order +.globl blst_sha256_emit +.hidden blst_sha256_emit +.type blst_sha256_emit,@function +.align 16 +blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_emit,.-blst_sha256_emit + +.globl blst_sha256_bcopy +.hidden blst_sha256_bcopy +.type blst_sha256_bcopy,@function +.align 16 +blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +.Loop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz .Loop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_bcopy,.-blst_sha256_bcopy + +.globl blst_sha256_hcopy +.hidden blst_sha256_hcopy +.type blst_sha256_hcopy,@function +.align 16 +blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size blst_sha256_hcopy,.-blst_sha256_hcopy + +.section .note.GNU-stack,"",@progbits +.section .note.gnu.property,"a",@note + .long 4,2f-1f,5 + .byte 0x47,0x4E,0x55,0 +1: .long 0xc0000002,4,3 +.align 8 +2: diff --git a/crypto/blst_src/build/mach-o/add_mod_256-armv8.S b/crypto/blst_src/build/mach-o/add_mod_256-armv8.S new file mode 100644 index 00000000000..198d65aef69 --- /dev/null +++ b/crypto/blst_src/build/mach-o/add_mod_256-armv8.S @@ -0,0 +1,379 @@ +.text + +.globl _add_mod_256 +.private_extern _add_mod_256 + +.align 5 +_add_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret + + +.globl _mul_by_3_mod_256 +.private_extern _mul_by_3_mod_256 + +.align 5 
+_mul_by_3_mod_256: + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + stp x8,x9,[x0] + csel x11,x11,x2,lo + stp x10,x11,[x0,#16] + + ret + + +.globl _lshift_mod_256 +.private_extern _lshift_mod_256 + +.align 5 +_lshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +Loop_lshift_mod_256: + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs x15,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x12,lo + csel x9,x9,x13,lo + csel x10,x10,x14,lo + csel x11,x11,x15,lo + + cbnz x2,Loop_lshift_mod_256 + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + + +.globl _rshift_mod_256 +.private_extern _rshift_mod_256 + +.align 5 +_rshift_mod_256: + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +Loop_rshift: + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + csel x12,x12,x8,ne + csel x13,x13,x9,ne + csel x14,x14,x10,ne + csel x15,x15,x11,ne + csel x3,x3,xzr,ne + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,Loop_rshift + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + + +.globl _cneg_mod_256 +.private_extern _cneg_mod_256 + +.align 5 +_cneg_mod_256: + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x8,x8,x12,eq + csel x9,x9,x13,eq + csel x10,x10,x14,eq + stp x8,x9,[x0] + csel x11,x11,x15,eq + stp x10,x11,[x0,#16] + + ret + + +.globl _sub_mod_256 +.private_extern _sub_mod_256 + +.align 5 +_sub_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret + + +.globl _check_mod_256 +.private_extern _check_mod_256 + +.align 5 +_check_mod_256: + ldp x8,x9,[x0] + ldp x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + csel x0,x0,xzr,ne + and x0,x0,x1 + + ret + + +.globl _add_n_check_mod_256 +.private_extern _add_n_check_mod_256 + +.align 5 +_add_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + 
adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csel x8,x8,x16,lo + csel x9,x9,x17,lo + csel x10,x10,x1,lo + csel x11,x11,x2,lo + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret + + +.globl _sub_n_check_mod_256 +.private_extern _sub_n_check_mod_256 + +.align 5 +_sub_n_check_mod_256: + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + csel x0, x17, xzr, ne + + ret + diff --git a/crypto/blst_src/build/mach-o/add_mod_256-x86_64.s b/crypto/blst_src/build/mach-o/add_mod_256-x86_64.s new file mode 100644 index 00000000000..19e5ba9834f --- /dev/null +++ b/crypto/blst_src/build/mach-o/add_mod_256-x86_64.s @@ -0,0 +1,564 @@ +.text + +.globl _add_mod_256 +.private_extern _add_mod_256 + +.p2align 5 +_add_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +L$oaded_a_add_mod_256: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _mul_by_3_mod_256 +.private_extern _mul_by_3_mod_256 + +.p2align 5 +_mul_by_3_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rcx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rsi,%rdx + movq 24(%rsi),%r11 + + call __lshift_mod_256 + movq 0(%rsp),%r12 +.cfi_restore %r12 + jmp L$oaded_a_add_mod_256 + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + movq %r8,%rax + adcq %r10,%r10 + movq %r9,%rsi + adcq %r11,%r11 + sbbq %r12,%r12 + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + 
sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + cmovcq %rbx,%r10 + cmovcq %rbp,%r11 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _lshift_mod_256 +.private_extern _lshift_mod_256 + +.p2align 5 +_lshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +L$oop_lshift_mod_256: + call __lshift_mod_256 + decl %edx + jnz L$oop_lshift_mod_256 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _rshift_mod_256 +.private_extern _rshift_mod_256 + +.p2align 5 +_rshift_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rbp + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + +L$oop_rshift_mod_256: + movq %rbp,%r8 + andq $1,%rbp + movq 0(%rcx),%rax + negq %rbp + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + + andq %rbp,%rax + andq %rbp,%rsi + andq %rbp,%rbx + andq 24(%rcx),%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + adcq %rbx,%r10 + adcq %rbp,%r11 + sbbq %rax,%rax + + shrq $1,%r8 + movq %r9,%rbp + shrq $1,%r9 + movq %r10,%rbx + shrq $1,%r10 + movq %r11,%rsi + shrq $1,%r11 + + shlq $63,%rbp + shlq $63,%rbx + orq %r8,%rbp + shlq $63,%rsi + orq %rbx,%r9 + shlq $63,%rax + orq %rsi,%r10 + orq %rax,%r11 + + decl %edx + jnz L$oop_rshift_mod_256 + + movq %rbp,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _cneg_mod_256 +.private_extern _cneg_mod_256 + +.p2align 5 +_cneg_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq 0(%rsi),%r12 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %r12,%r8 + movq 24(%rsi),%r11 + orq %r9,%r12 + orq %r10,%r12 + orq %r11,%r12 + movq $-1,%rbp + + movq 0(%rcx),%rax + cmovnzq %rbp,%r12 + movq 8(%rcx),%rsi + movq 16(%rcx),%rbx + andq %r12,%rax + movq 24(%rcx),%rbp + andq %r12,%rsi + andq %r12,%rbx + andq %r12,%rbp + + subq %r8,%rax + sbbq %r9,%rsi + sbbq %r10,%rbx + sbbq %r11,%rbp + + orq %rdx,%rdx + + cmovzq %r8,%rax + cmovzq %r9,%rsi + movq %rax,0(%rdi) + cmovzq %r10,%rbx + movq %rsi,8(%rdi) + cmovzq %r11,%rbp + movq %rbx,16(%rdi) + movq %rbp,24(%rdi) + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _sub_mod_256 +.private_extern _sub_mod_256 + +.p2align 5 +_sub_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp 
+.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _check_mod_256 +.private_extern _check_mod_256 + +.p2align 5 +_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + movq 0(%rdi),%rax + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + + movq %rax,%r8 + orq %r9,%rax + orq %r10,%rax + orq %r11,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq %rsi,%rsi + + movq $1,%rdx + cmpq $0,%rax + cmovneq %rdx,%rax + andq %rsi,%rax + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _add_n_check_mod_256 +.private_extern _add_n_check_mod_256 + +.p2align 5 +_add_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + movq %r8,%rax + adcq 16(%rdx),%r10 + movq %r9,%rsi + adcq 24(%rdx),%r11 + sbbq %rdx,%rdx + + movq %r10,%rbx + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + sbbq 16(%rcx),%r10 + movq %r11,%rbp + sbbq 24(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %rax,%r8 + cmovcq %rsi,%r9 + movq %r8,0(%rdi) + cmovcq %rbx,%r10 + movq %r9,8(%rdi) + cmovcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _sub_n_check_mod_256 +.private_extern _sub_n_check_mod_256 + +.p2align 5 +_sub_n_check_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + movq 0(%rcx),%rax + sbbq 8(%rdx),%r9 + movq 8(%rcx),%rsi + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rbx + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbp + sbbq %rdx,%rdx + + andq %rdx,%rax + andq %rdx,%rsi + andq %rdx,%rbx + andq %rdx,%rbp + + addq %rax,%r8 + adcq %rsi,%r9 + movq %r8,0(%rdi) + adcq %rbx,%r10 + movq %r9,8(%rdi) + adcq %rbp,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + orq %r9,%r8 + orq %r11,%r10 + orq %r10,%r8 + movq $1,%rax + cmovzq %r8,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/add_mod_384-armv8.S b/crypto/blst_src/build/mach-o/add_mod_384-armv8.S new file mode 100644 index 00000000000..a62995f2bed --- /dev/null +++ b/crypto/blst_src/build/mach-o/add_mod_384-armv8.S @@ -0,0 +1,1000 @@ +.text + +.globl 
_add_mod_384 +.private_extern _add_mod_384 + +.align 5 +_add_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + + +.align 5 +__add_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +__add_mod_384_ab_are_loaded: + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret + + +.globl _add_mod_384x +.private_extern _add_mod_384x + +.align 5 +_add_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _rshift_mod_384 +.private_extern _rshift_mod_384 + +.align 5 +_rshift_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +Loop_rshift_mod_384: + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,Loop_rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + + +.align 5 +__rshift_mod_384: + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret + + +.globl _div_by_2_mod_384 +.private_extern _div_by_2_mod_384 + +.align 5 +_div_by_2_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _lshift_mod_384 +.private_extern _lshift_mod_384 + +.align 5 +_lshift_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +Loop_lshift_mod_384: + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,Loop_lshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + + +.align 5 +__lshift_mod_384: + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csel x10,x10,x16,lo + csel x11,x11,x17,lo + csel x12,x12,x19,lo + csel x13,x13,x20,lo + csel x14,x14,x21,lo + csel x15,x15,x22,lo + + ret + + +.globl _mul_by_3_mod_384 +.private_extern _mul_by_3_mod_384 + +.align 5 +_mul_by_3_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _mul_by_8_mod_384 +.private_extern _mul_by_8_mod_384 + +.align 5 +_mul_by_8_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _mul_by_3_mod_384x +.private_extern _mul_by_3_mod_384x + +.align 5 +_mul_by_3_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _mul_by_8_mod_384x +.private_extern _mul_by_8_mod_384x + +.align 5 +_mul_by_8_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _cneg_mod_384 +.private_extern _cneg_mod_384 + +.align 5 +_cneg_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetm x3,ne + ands x2,x2,x3 + + csel x10,x10,x16,eq + csel x11,x11,x17,eq + csel x12,x12,x19,eq + csel x13,x13,x20,eq + stp x10,x11,[x0] + csel x14,x14,x21,eq + stp x12,x13,[x0,#16] + csel x15,x15,x22,eq + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _sub_mod_384 +.private_extern _sub_mod_384 + +.align 5 +_sub_mod_384: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + + +.align 5 +__sub_mod_384: + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret + + +.globl _sub_mod_384x +.private_extern _sub_mod_384x + +.align 5 +_sub_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _mul_by_1_plus_i_mod_384x +.private_extern _mul_by_1_plus_i_mod_384x + +.align 5 +_mul_by_1_plus_i_mod_384x: +.long 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + + +.globl _sgn0_pty_mod_384 +.private_extern _sgn0_pty_mod_384 + +.align 5 +_sgn0_pty_mod_384: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret + + +.globl _sgn0_pty_mod_384x +.private_extern _sgn0_pty_mod_384x + +.align 5 +_sgn0_pty_mod_384x: + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret + +.globl _vec_select_32 +.private_extern _vec_select_32 + +.align 5 +_vec_select_32: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl _vec_select_48 +.private_extern _vec_select_48 + +.align 5 +_vec_select_48: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl _vec_select_96 +.private_extern _vec_select_96 + +.align 5 +_vec_select_96: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl _vec_select_192 +.private_extern _vec_select_192 + +.align 5 +_vec_select_192: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl _vec_select_144 +.private_extern _vec_select_144 + +.align 5 +_vec_select_144: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + +.globl _vec_select_288 +.private_extern _vec_select_288 + +.align 5 +_vec_select_288: + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, 
v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + +.globl _vec_prefetch +.private_extern _vec_prefetch + +.align 5 +_vec_prefetch: + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + csel x2, xzr, x2, hi + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + csel x0, x1, x0, hi + prfm pldl1keep, [x0] + ret + +.globl _vec_is_zero_16x +.private_extern _vec_is_zero_16x + +.align 5 +_vec_is_zero_16x: + ld1 {v0.2d}, [x0], #16 + lsr x1, x1, #4 + sub x1, x1, #1 + cbz x1, Loop_is_zero_done + +Loop_is_zero: + ld1 {v1.2d}, [x0], #16 + orr v0.16b, v0.16b, v1.16b + sub x1, x1, #1 + cbnz x1, Loop_is_zero + +Loop_is_zero_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret + +.globl _vec_is_equal_16x +.private_extern _vec_is_equal_16x + +.align 5 +_vec_is_equal_16x: + ld1 {v0.2d}, [x0], #16 + ld1 {v1.2d}, [x1], #16 + lsr x2, x2, #4 + eor v0.16b, v0.16b, v1.16b + +Loop_is_equal: + sub x2, x2, #1 + cbz x2, Loop_is_equal_done + ld1 {v1.2d}, [x0], #16 + ld1 {v2.2d}, [x1], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b Loop_is_equal + nop + +Loop_is_equal_done: + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + csel x0, x0, xzr, eq + ret + diff --git a/crypto/blst_src/build/mach-o/add_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/add_mod_384-x86_64.s new file mode 100644 index 00000000000..974978e3425 --- /dev/null +++ b/crypto/blst_src/build/mach-o/add_mod_384-x86_64.s @@ -0,0 +1,1899 @@ +.text + +.globl _add_mod_384 +.private_extern _add_mod_384 + +.p2align 5 +_add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx 
+.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__add_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__add_mod_384_a_is_loaded: + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _add_mod_384x +.private_extern _add_mod_384x + +.p2align 5 +_add_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __add_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __add_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _rshift_mod_384 +.private_extern _rshift_mod_384 + +.p2align 5 +_rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +L$oop_rshift_mod_384: + call __rshift_mod_384 + decl %edx + jnz L$oop_rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__rshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rsi + movq 0(%rcx),%r14 + andq %r8,%rsi + movq 8(%rcx),%r15 + negq %rsi + movq 16(%rcx),%rax + andq %rsi,%r14 + movq 
24(%rcx),%rbx + andq %rsi,%r15 + movq 32(%rcx),%rbp + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%rbx + adcq %r12,%rbp + adcq %r13,%rsi + sbbq %r13,%r13 + + shrq $1,%r14 + movq %r15,%r8 + shrq $1,%r15 + movq %rax,%r9 + shrq $1,%rax + movq %rbx,%r10 + shrq $1,%rbx + movq %rbp,%r11 + shrq $1,%rbp + movq %rsi,%r12 + shrq $1,%rsi + shlq $63,%r8 + shlq $63,%r9 + orq %r14,%r8 + shlq $63,%r10 + orq %r15,%r9 + shlq $63,%r11 + orq %rax,%r10 + shlq $63,%r12 + orq %rbx,%r11 + shlq $63,%r13 + orq %rbp,%r12 + orq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _div_by_2_mod_384 +.private_extern _div_by_2_mod_384 + +.p2align 5 +_div_by_2_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq %rdx,%rcx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + call __rshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _lshift_mod_384 +.private_extern _lshift_mod_384 + +.p2align 5 +_lshift_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +L$oop_lshift_mod_384: + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdi,%rdi + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdi + + movq (%rsp),%rdi + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + decl %edx + jnz L$oop_lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__lshift_mod_384: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + movq %r8,%r14 + adcq %r11,%r11 + movq %r9,%r15 + adcq %r12,%r12 + movq %r10,%rax + adcq %r13,%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + cmovcq %rbx,%r11 + cmovcq %rbp,%r12 + cmovcq %rsi,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _mul_by_3_mod_384 +.private_extern _mul_by_3_mod_384 + +.p2align 5 +_mul_by_3_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_by_8_mod_384 +.private_extern _mul_by_8_mod_384 + +.p2align 5 +_mul_by_8_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _mul_by_3_mod_384x +.private_extern _mul_by_3_mod_384x + +.p2align 5 +_mul_by_3_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call 
__lshift_mod_384 + + movq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq (%rsp),%rsi + leaq 48(%rdi),%rdi + + movq 48(%rsi),%r8 + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + movq 72(%rsi),%r11 + movq 80(%rsi),%r12 + movq 88(%rsi),%r13 + + call __lshift_mod_384 + + movq $48,%rdx + addq (%rsp),%rdx + call __add_mod_384_a_is_loaded + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_by_8_mod_384x +.private_extern _mul_by_8_mod_384x + +.p2align 5 +_mul_by_8_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq %rdx,%rcx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq (%rsp),%rsi + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + movq %r8,48+0(%rdi) + movq %r9,48+8(%rdi) + movq %r10,48+16(%rdi) + movq %r11,48+24(%rdi) + movq %r12,48+32(%rdi) + movq %r13,48+40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _cneg_mod_384 +.private_extern _cneg_mod_384 + +.p2align 5 +_cneg_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdx +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq %rdx,%r8 + movq 24(%rsi),%r11 + orq %r9,%rdx + movq 32(%rsi),%r12 + orq %r10,%rdx + movq 40(%rsi),%r13 + orq %r11,%rdx + movq $-1,%rsi + orq %r12,%rdx + orq %r13,%rdx + + movq 0(%rcx),%r14 + cmovnzq %rsi,%rdx + movq 8(%rcx),%r15 + movq 16(%rcx),%rax + andq %rdx,%r14 + movq 24(%rcx),%rbx + andq %rdx,%r15 + movq 32(%rcx),%rbp + andq %rdx,%rax + movq 40(%rcx),%rsi + andq %rdx,%rbx + movq 0(%rsp),%rcx + andq %rdx,%rbp + andq %rdx,%rsi + + subq %r8,%r14 + sbbq %r9,%r15 + sbbq %r10,%rax + sbbq %r11,%rbx + sbbq %r12,%rbp + sbbq %r13,%rsi + + orq %rcx,%rcx + + cmovzq %r8,%r14 + cmovzq %r9,%r15 + cmovzq %r10,%rax + movq %r14,0(%rdi) + cmovzq %r11,%rbx + movq %r15,8(%rdi) + cmovzq 
%r12,%rbp + movq %rax,16(%rdi) + cmovzq %r13,%rsi + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rsi,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.globl _sub_mod_384 +.private_extern _sub_mod_384 + +.p2align 5 +_sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sub_mod_384x +.private_extern _sub_mod_384x + +.p2align 5 +_sub_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 24 + + + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + leaq 48(%rsi),%rsi + leaq 48(%rdx),%rdx + leaq 48(%rdi),%rdi + call __sub_mod_384 + + movq 0(%rsp),%rsi + movq 8(%rsp),%rdx + leaq -48(%rdi),%rdi + call __sub_mod_384 + + movq 24+0(%rsp),%r15 +.cfi_restore %r15 + movq 24+8(%rsp),%r14 +.cfi_restore %r14 + movq 24+16(%rsp),%r13 +.cfi_restore %r13 + movq 24+24(%rsp),%r12 +.cfi_restore %r12 + movq 24+32(%rsp),%rbx +.cfi_restore %rbx + movq 24+40(%rsp),%rbp +.cfi_restore %rbp + leaq 24+48(%rsp),%rsp +.cfi_adjust_cfa_offset -24-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_by_1_plus_i_mod_384x +.private_extern _mul_by_1_plus_i_mod_384x + +.p2align 5 +_mul_by_1_plus_i_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + 
pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $56,%rsp +.cfi_adjust_cfa_offset 56 + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rbx + adcq 72(%rsi),%r11 + movq %r12,%rcx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + movq %rdi,48(%rsp) + sbbq %rdi,%rdi + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rbx + sbbq 80(%rsi),%rcx + sbbq 88(%rsi),%rbp + sbbq %rsi,%rsi + + movq %r8,0(%rsp) + movq 0(%rdx),%r8 + movq %r9,8(%rsp) + movq 8(%rdx),%r9 + movq %r10,16(%rsp) + movq 16(%rdx),%r10 + movq %r11,24(%rsp) + movq 24(%rdx),%r11 + movq %r12,32(%rsp) + andq %rsi,%r8 + movq 32(%rdx),%r12 + movq %r13,40(%rsp) + andq %rsi,%r9 + movq 40(%rdx),%r13 + andq %rsi,%r10 + andq %rsi,%r11 + andq %rsi,%r12 + andq %rsi,%r13 + movq 48(%rsp),%rsi + + addq %r8,%r14 + movq 0(%rsp),%r8 + adcq %r9,%r15 + movq 8(%rsp),%r9 + adcq %r10,%rax + movq 16(%rsp),%r10 + adcq %r11,%rbx + movq 24(%rsp),%r11 + adcq %r12,%rcx + movq 32(%rsp),%r12 + adcq %r13,%rbp + movq 40(%rsp),%r13 + + movq %r14,0(%rsi) + movq %r8,%r14 + movq %r15,8(%rsi) + movq %rax,16(%rsi) + movq %r9,%r15 + movq %rbx,24(%rsi) + movq %rcx,32(%rsi) + movq %r10,%rax + movq %rbp,40(%rsi) + + subq 0(%rdx),%r8 + movq %r11,%rbx + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + movq %r12,%rcx + sbbq 24(%rdx),%r11 + sbbq 32(%rdx),%r12 + movq %r13,%rbp + sbbq 40(%rdx),%r13 + sbbq $0,%rdi + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,48(%rsi) + cmovcq %rbx,%r11 + movq %r9,56(%rsi) + cmovcq %rcx,%r12 + movq %r10,64(%rsi) + cmovcq %rbp,%r13 + movq %r11,72(%rsi) + movq %r12,80(%rsi) + movq %r13,88(%rsi) + + movq 56+0(%rsp),%r15 +.cfi_restore %r15 + movq 56+8(%rsp),%r14 +.cfi_restore %r14 + movq 56+16(%rsp),%r13 +.cfi_restore %r13 + movq 56+24(%rsp),%r12 +.cfi_restore %r12 + movq 56+32(%rsp),%rbx +.cfi_restore %rbx + movq 56+40(%rsp),%rbp +.cfi_restore %rbp + leaq 56+48(%rsp),%rsp +.cfi_adjust_cfa_offset -56-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sgn0_pty_mod_384 +.private_extern _sgn0_pty_mod_384 + +.p2align 5 +_sgn0_pty_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%rcx + movq 40(%rdi),%rdx + + xorq %rax,%rax + movq %r8,%rdi + addq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + notq %rax + andq $1,%rdi + andq $2,%rax + orq %rdi,%rax + + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0_pty_mod_384x +.private_extern _sgn0_pty_mod_384x + +.p2align 5 +_sgn0_pty_mod_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq 48(%rdi),%r8 + movq 56(%rdi),%r9 + movq 64(%rdi),%r10 + movq 72(%rdi),%r11 + movq 80(%rdi),%rcx + movq 88(%rdi),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + 
leaq 0(%rdi),%rax + xorq %rdi,%rdi + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rdi + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rdi + + movq %r8,0(%rsp) + notq %rdi + andq $1,%rbp + andq $2,%rdi + orq %rbp,%rdi + + movq 0(%rax),%r8 + movq 8(%rax),%r9 + movq 16(%rax),%r10 + movq 24(%rax),%r11 + movq 32(%rax),%rcx + movq 40(%rax),%rdx + + movq %r8,%rbx + orq %r9,%r8 + orq %r10,%r8 + orq %r11,%r8 + orq %rcx,%r8 + orq %rdx,%r8 + + xorq %rax,%rax + movq %rbx,%rbp + addq %rbx,%rbx + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %rcx,%rcx + adcq %rdx,%rdx + adcq $0,%rax + + subq 0(%rsi),%rbx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + sbbq 24(%rsi),%r11 + sbbq 32(%rsi),%rcx + sbbq 40(%rsi),%rdx + sbbq $0,%rax + + movq 0(%rsp),%rbx + + notq %rax + + testq %r8,%r8 + cmovzq %rdi,%rbp + + testq %rbx,%rbx + cmovnzq %rdi,%rax + + andq $1,%rbp + andq $2,%rax + orq %rbp,%rax + + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_32 +.private_extern _vec_select_32 + +.p2align 5 +_vec_select_32: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 16(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 16(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 16(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-16(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-16(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-16(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,16-16(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_48 +.private_extern _vec_select_48 + +.p2align 5 +_vec_select_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 24(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 24(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 24(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-24(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-24(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-24(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-24(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-24(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-24(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,32-24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_96 +.private_extern _vec_select_96 + +.p2align 5 +_vec_select_96: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 48(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 48(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 48(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-48(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-48(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-48(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-48(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-48(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-48(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-48(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 
64+16-48(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-48(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,80-48(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_192 +.private_extern _vec_select_192 + +.p2align 5 +_vec_select_192: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 96(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 96(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 96(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-96(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-96(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-96(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-96(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-96(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-96(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-96(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-96(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-96(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-96(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-96(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-96(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-96(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-96(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-96(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,176-96(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_144 +.private_extern _vec_select_144 + +.p2align 5 +_vec_select_144: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 72(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 72(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 72(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-72(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-72(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-72(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-72(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-72(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-72(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-72(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-72(%rdx),%xmm3 + por %xmm1,%xmm0 + 
movdqu %xmm0,96-72(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-72(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-72(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-72(%rdi) + pand %xmm4,%xmm0 + pand %xmm5,%xmm1 + por %xmm1,%xmm0 + movdqu %xmm0,128-72(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_select_288 +.private_extern _vec_select_288 + +.p2align 5 +_vec_select_288: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movd %ecx,%xmm5 + pxor %xmm4,%xmm4 + pshufd $0,%xmm5,%xmm5 + movdqu (%rsi),%xmm0 + leaq 144(%rsi),%rsi + pcmpeqd %xmm4,%xmm5 + movdqu (%rdx),%xmm1 + leaq 144(%rdx),%rdx + pcmpeqd %xmm5,%xmm4 + leaq 144(%rdi),%rdi + pand %xmm4,%xmm0 + movdqu 0+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 0+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,0-144(%rdi) + pand %xmm4,%xmm2 + movdqu 16+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 16+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,16-144(%rdi) + pand %xmm4,%xmm0 + movdqu 32+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 32+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,32-144(%rdi) + pand %xmm4,%xmm2 + movdqu 48+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 48+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,48-144(%rdi) + pand %xmm4,%xmm0 + movdqu 64+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 64+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,64-144(%rdi) + pand %xmm4,%xmm2 + movdqu 80+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 80+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,80-144(%rdi) + pand %xmm4,%xmm0 + movdqu 96+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 96+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,96-144(%rdi) + pand %xmm4,%xmm2 + movdqu 112+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 112+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,112-144(%rdi) + pand %xmm4,%xmm0 + movdqu 128+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 128+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,128-144(%rdi) + pand %xmm4,%xmm2 + movdqu 144+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 144+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,144-144(%rdi) + pand %xmm4,%xmm0 + movdqu 160+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 160+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,160-144(%rdi) + pand %xmm4,%xmm2 + movdqu 176+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 176+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,176-144(%rdi) + pand %xmm4,%xmm0 + movdqu 192+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 192+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,192-144(%rdi) + pand %xmm4,%xmm2 + movdqu 208+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 208+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,208-144(%rdi) + pand %xmm4,%xmm0 + movdqu 224+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 224+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,224-144(%rdi) + pand %xmm4,%xmm2 + movdqu 240+16-144(%rsi),%xmm0 + pand %xmm5,%xmm3 + movdqu 240+16-144(%rdx),%xmm1 + por %xmm3,%xmm2 + movdqu %xmm2,240-144(%rdi) + pand %xmm4,%xmm0 + movdqu 256+16-144(%rsi),%xmm2 + pand %xmm5,%xmm1 + movdqu 256+16-144(%rdx),%xmm3 + por %xmm1,%xmm0 + movdqu %xmm0,256-144(%rdi) + pand %xmm4,%xmm2 + pand %xmm5,%xmm3 + por %xmm3,%xmm2 + movdqu %xmm2,272-144(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_prefetch +.private_extern _vec_prefetch + +.p2align 5 +_vec_prefetch: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + leaq -1(%rdi,%rsi,1),%rsi + movq $64,%rax + xorq %r8,%r8 + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq 
%r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + cmovaq %r8,%rax + prefetchnta (%rdi) + leaq (%rdi,%rax,1),%rdi + cmpq %rsi,%rdi + cmovaq %rsi,%rdi + prefetchnta (%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_is_zero_16x +.private_extern _vec_is_zero_16x + +.p2align 5 +_vec_is_zero_16x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%esi + movdqu (%rdi),%xmm0 + leaq 16(%rdi),%rdi + +L$oop_is_zero: + decl %esi + jz L$oop_is_zero_done + movdqu (%rdi),%xmm1 + leaq 16(%rdi),%rdi + por %xmm1,%xmm0 + jmp L$oop_is_zero + +L$oop_is_zero_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %esi + testq %rax,%rax + cmovnzl %esi,%eax + xorl $1,%eax + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _vec_is_equal_16x +.private_extern _vec_is_equal_16x + +.p2align 5 +_vec_is_equal_16x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + shrl $4,%edx + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm1 + subq %rdi,%rsi + leaq 16(%rdi),%rdi + pxor %xmm1,%xmm0 + +L$oop_is_equal: + decl %edx + jz L$oop_is_equal_done + movdqu (%rdi),%xmm1 + movdqu (%rdi,%rsi,1),%xmm2 + leaq 16(%rdi),%rdi + pxor %xmm2,%xmm1 + por %xmm1,%xmm0 + jmp L$oop_is_equal + +L$oop_is_equal_done: + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 +.byte 102,72,15,126,192 + incl %edx + testq %rax,%rax + cmovnzl %edx,%eax + xorl $1,%eax + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/add_mod_384x384-x86_64.s b/crypto/blst_src/build/mach-o/add_mod_384x384-x86_64.s new file mode 100644 index 00000000000..2dc58f81608 --- /dev/null +++ b/crypto/blst_src/build/mach-o/add_mod_384x384-x86_64.s @@ -0,0 +1,244 @@ +.text + + +.p2align 5 +__add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + addq 0(%rdx),%r8 + movq 56(%rsi),%r15 + adcq 8(%rdx),%r9 + movq 64(%rsi),%rax + adcq 16(%rdx),%r10 + movq 72(%rsi),%rbx + adcq 24(%rdx),%r11 + movq 80(%rsi),%rbp + adcq 32(%rdx),%r12 + movq 88(%rsi),%rsi + adcq 40(%rdx),%r13 + movq %r8,0(%rdi) + adcq 48(%rdx),%r14 + movq %r9,8(%rdi) + adcq 56(%rdx),%r15 + movq %r10,16(%rdi) + adcq 64(%rdx),%rax + movq %r12,32(%rdi) + movq %r14,%r8 + adcq 72(%rdx),%rbx + movq %r11,24(%rdi) + movq %r15,%r9 + adcq 80(%rdx),%rbp + movq %r13,40(%rdi) + movq %rax,%r10 + adcq 88(%rdx),%rsi + movq %rbx,%r11 + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %rbp,%r12 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%rbx + sbbq 32(%rcx),%rbp + movq %rsi,%r13 + sbbq 40(%rcx),%rsi + sbbq $0,%rdx + + cmovcq %r8,%r14 + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %r14,48(%rdi) + cmovcq %r11,%rbx + movq %r15,56(%rdi) + cmovcq %r12,%rbp + movq %rax,64(%rdi) + cmovcq %r13,%rsi + movq %rbx,72(%rdi) + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 
72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _add_mod_384x384 +.private_extern _add_mod_384x384 + +.p2align 5 +_add_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __add_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sub_mod_384x384 +.private_extern _sub_mod_384x384 + +.p2align 5 +_sub_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sub_mod_384x384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S new file mode 100644 index 00000000000..2fd4847a496 --- /dev/null +++ b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-armv8.S @@ -0,0 +1,785 @@ +.text + +.globl _ct_inverse_mod_256 +.private_extern _ct_inverse_mod_256 + +.align 5 +_ct_inverse_mod_256: +.long 3573752639 + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl Lab_approximation_31_256_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extension + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extension + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst 
|a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc 
x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] 
last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 +.long 3573752767 + ret + + +//////////////////////////////////////////////////////////////////////// + +.align 5 +__smul_256x63: + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + csel x22, x22, xzr, ne + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in __smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + csel x23, x23, xzr, ne + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret + + + +.align 5 +__smul_512x63_tail: + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, [x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume |v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr x25, x24, #63 + + 
eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret + + + +.align 5 +__smul_256_n_shift_by_31: + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret + + +.align 4 +__ab_approximation_31_256: + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +Lab_approximation_31_256_loaded: + orr x19, x7, x11 // check top-most limbs, ... + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x5, ne + orr x19, x7, x11 // and ones before top-most, ... + csel x10, x10, x9, ne + + cmp x19, #0 + csel x7, x7, x6, ne + csel x11, x11, x10, ne + csel x6, x6, x4, ne + orr x19, x7, x11 // and one more, ... 
+ csel x10, x10, x8, ne + + clz x19, x19 + cmp x19, #64 + csel x19, x19, xzr, ne + csel x7, x7, x6, ne + csel x11, x11, x10, ne + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret + + + +.align 4 +__inner_loop_31_256: + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +Loop_31_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x15, x15, x13, hs // exchange |fg0| and |fg1| + csel x13, x13, x19, hs + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, Loop_31_256 + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret + + + +.align 4 +__inner_loop_62_256: + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +Loop_62_256: + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + csel x11, x11, x7, hs // |b_| = |a_| + csel x7, x21, x20, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + mov x20, x13 + csel x12, x12, x14, hs // exchange |f0| and |f1| + csel x14, x14, x19, hs + csel x13, x13, x15, hs // exchange |g0| and |g1| + csel x15, x15, x20, hs + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, Loop_62_256 + + ret + diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s new file mode 100644 index 00000000000..bf0ad8986e7 --- /dev/null +++ b/crypto/blst_src/build/mach-o/ct_inverse_mod_256-x86_64.s @@ -0,0 +1,1178 @@ +.text + +.globl _ct_inverse_mod_256 +.private_extern _ct_inverse_mod_256 + +.p2align 5 +_ct_inverse_mod_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1072,%rsp +.cfi_adjust_cfa_offset 1072 + + + leaq 48+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + movq 0(%rdx),%r12 + movq 8(%rdx),%r13 + movq 16(%rdx),%r14 + movq 24(%rdx),%r15 + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + + movq %r12,32(%rax) + movq %r13,40(%rax) + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rax,%rsi + + + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,64(%rdi) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + movq %rdx,72(%rdi) + + + xorq $256,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + + + + movq 64(%rsi),%r8 + movq 104(%rsi),%r12 + movq %r8,%r9 + imulq 0(%rsp),%r8 + movq %r12,%r13 + imulq 8(%rsp),%r12 + addq %r12,%r8 + movq %r8,32(%rdi) + sarq $63,%r8 + movq %r8,40(%rdi) + movq %r8,48(%rdi) + movq %r8,56(%rdi) + movq %r8,64(%rdi) + leaq 64(%rsi),%rsi + + imulq %rdx,%r9 + imulq %rcx,%r13 + addq %r13,%r9 + movq %r9,72(%rdi) + sarq $63,%r9 + movq %r9,80(%rdi) + movq %r9,88(%rdi) + movq %r9,96(%rdi) + movq %r9,104(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + 
call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_256x63 + sarq $63,%rbp + movq %rbp,40(%rdi) + movq %rbp,48(%rdi) + movq %rbp,56(%rdi) + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call 
__smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + xorq $256+64,%rsi + movl $31,%edx + call __ab_approximation_31_256 + + + movq %r12,16(%rsp) + movq %r13,24(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,0(%rsp) + movq %rcx,8(%rsp) + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 32(%rdi),%rdi + call __smulq_256_n_shift_by_31 + movq %rdx,16(%rsp) + movq %rcx,24(%rsp) + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq 64(%rsi),%rsi + leaq 32(%rdi),%rdi + call __smulq_256x63 + + movq 16(%rsp),%rdx + movq 24(%rsp),%rcx + leaq 40(%rdi),%rdi + call __smulq_512x63 + + xorq $256+64,%rsi + movl $47,%edx + + movq 0(%rsi),%r8 + + movq 32(%rsi),%r10 + + call __inner_loop_62_256 + + + + + + + + leaq 64(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_512x63 + adcq %rbp,%rdx + + movq 40(%rsp),%rsi + movq %rdx,%rax + sarq $63,%rdx + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %rdx,%r15 + adcq $0,%rax + + movq %rax,%rdx + negq %rax + orq %rax,%rdx + sarq $63,%rax + + movq %rdx,%r8 + movq %rdx,%r9 + andq 0(%rsi),%r8 + movq %rdx,%r10 + andq 8(%rsi),%r9 + andq 16(%rsi),%r10 + andq 24(%rsi),%rdx + + xorq %rax,%r8 + xorq %rcx,%rcx + xorq %rax,%r9 + subq %rax,%rcx + xorq %rax,%r10 + xorq %rax,%rdx + addq %rcx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%rdx + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 
+ adcq %rdx,%r15 + + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 1072(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1072-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_512x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %r9,8(%rdi) + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %r10,16(%rdi) + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %r11,24(%rdi) + + movq 40(%rsi),%r8 + movq 48(%rsi),%r9 + movq 56(%rsi),%r10 + movq 64(%rsi),%r11 + movq 72(%rsi),%r12 + movq 80(%rsi),%r13 + movq 88(%rsi),%r14 + movq 96(%rsi),%r15 + + movq %rcx,%rdx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rcx + addq %rax,%rcx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rcx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rcx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rcx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rcx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rcx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rcx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rcx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + imulq %rcx + addq %rax,%r15 + adcq $0,%rdx + + movq %rbp,%rbx + sarq $63,%rbp + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq %rbx,%r12 + adcq %rbp,%r13 + adcq %rbp,%r14 + adcq %rbp,%r15 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__smulq_256x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%rbp + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%rbp + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%rbp + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + andq %rbx,%rbp + negq %rbp + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq %rcx,%rdx + movq 40+0(%rsi),%r12 + movq 40+8(%rsi),%r13 + 
movq 40+16(%rsi),%r14 + movq 40+24(%rsi),%r15 + movq 40+32(%rsi),%rcx + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rcx + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rcx + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + andq %rbx,%rcx + negq %rcx + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %rbp,32(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_256_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,0(%rdi) + movq %rcx,8(%rdi) + movq %rdx,%rbp + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + + movq %rbp,%rbx + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rbx + addq %rax,%rbx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + andq %rbx,%rbp + negq %rbp + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + adcq %rdx,%rbp + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r14 + movq 32+24(%rsi),%r15 + + movq %rcx,%rbx + sarq $63,%rcx + xorq %rax,%rax + subq %rcx,%rax + + xorq %rcx,%rbx + addq %rax,%rbx + + xorq %rcx,%r12 + xorq %rcx,%r13 + xorq %rcx,%r14 + xorq %rcx,%r15 + addq %r12,%rax + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + mulq %rbx + movq %rax,%r12 + movq %r13,%rax + andq %rbx,%rcx + negq %rcx + movq %rdx,%r13 + mulq %rbx + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rbx + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rbx + addq %rax,%r15 + adcq %rdx,%rcx + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rcx,%rbp + + movq 0(%rdi),%rdx + movq 8(%rdi),%rcx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%rbp,%r11 + + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + xorq %rbp,%rdx + xorq %rbp,%rcx + addq %rax,%rdx + addq %rax,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__ab_approximation_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 24(%rsi),%r9 + movq 56(%rsi),%r11 + movq 16(%rsi),%rbx + movq 48(%rsi),%rbp + movq 8(%rsi),%r8 + movq 40(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 32(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + notq %rax + andq 
%rax,%r9 + andq %rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31_256 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__inner_loop_31_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +L$oop_31_256: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edx + jnz L$oop_31_256 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__inner_loop_62_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl %edx,%r15d + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq %rdx,%r13 + movq %rdx,%r14 + +L$oop_62_256: + xorq %rax,%rax + testq %r14,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq %r14,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%r15d + jnz L$oop_62_256 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S b/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S new file mode 100644 index 00000000000..b9c3acde200 --- /dev/null +++ b/crypto/blst_src/build/mach-o/ct_inverse_mod_384-armv8.S @@ -0,0 +1,718 @@ +.text + +.globl _ct_inverse_mod_383 +.private_extern _ct_inverse_mod_383 + +.align 5 +_ct_inverse_mod_383: +.long 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... 
+ stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl Lab_approximation_62_loaded + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extension + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extension + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // 
corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src 
|a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 +.long 3573752767 + ret + + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
+ +.align 5 +__smul_383x63: + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret + + + +.align 5 +__smul_767x63_tail: + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret + + + +.align 5 +__smul_383_n_shift_by_62: + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + adcs x6, x6, xzr + umulh x23, 
x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret + + +.align 4 +__ab_approximation_62: + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +Lab_approximation_62_loaded: + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x22, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x22, x8, x14 // ... and ones before that ... 
+ csel x13, x13, x11, ne + + cmp x22, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x22, x8, x14 + csel x13, x13, x10, ne + + clz x22, x22 + cmp x22, #64 + csel x22, x22, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret + + +.align 4 +__inner_loop_62: + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +Loop_62: + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + csel x9, x9, x3, hs // |b_| = |a_| + csel x14, x14, x8, hs + csel x3, x26, x24, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x8, x27, x25, hs + csel x15, x15, x17, hs // exchange |f0| and |f1| + csel x17, x17, x22, hs + csel x16, x16, x19, hs // exchange |g0| and |g1| + csel x19, x19, x23, hs + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, Loop_62 + + ret + diff --git a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S new file mode 100644 index 00000000000..9fe0df88b59 --- /dev/null +++ b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-armv8.S @@ -0,0 +1,325 @@ +.text + +.globl _ct_is_square_mod_384 +.private_extern _ct_is_square_mod_384 + +.align 5 +_ct_is_square_mod_384: +.long 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the Legendre symbol + mov x15, #24 // 24 is 768/30-1 + b Loop_is_square + +.align 4 +Loop_is_square: + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, Loop_is_square + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // and loaded + //ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 +.long 3573752767 + ret + + + +.align 5 +__smul_384_n_shift_by_30: + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr x7, x8, x7, #30 + extr x8, x9, x8, #30 + + eor x3, x3, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor 
x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret + + +.align 4 +__ab_approximation_30: + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x6, ne + orr x21, x8, x14 // ... ones before top-most, ... + csel x13, x13, x12, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x5, ne + orr x21, x8, x14 // ... and ones before that ... + csel x13, x13, x11, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x4, ne + orr x21, x8, x14 // and one more, ... + csel x13, x13, x10, ne + + cmp x21, #0 + csel x8, x8, x7, ne + csel x14, x14, x13, ne + csel x7, x7, x3, ne + orr x21, x8, x14 + csel x13, x13, x9, ne + + clz x21, x21 + cmp x21, #64 + csel x21, x21, xzr, ne + csel x8, x8, x7, ne + csel x14, x14, x13, ne + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret + + + +.align 4 +__inner_loop_30: + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +Loop_30: + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + csel x14, x14, x8, hs // |b_| = |a_| + csel x8, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x20, x20, x17, hs // exchange |fg0| and |fg1| + csel x17, x17, x21, hs + csel x2, x2, x25, hs + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, Loop_30 + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret + + +.align 4 +__inner_loop_48: +Loop_48: + sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x3, x9 + sub x15, x15, #1 + and x21, x9, x24 + sub x22, x9, x3 // |b_|-|a_| + subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + csel x9, x9, x3, hs // |b_| = |a_| + csel x3, x23, x22, hs // borrow means |a_|<|b_|, replace with |b_|-|a_| + csel x2, x2, x25, hs + add x23, x9, #2 + lsr x3, x3, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, Loop_48 + + ret + diff --git a/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s new file mode 100644 index 00000000000..5faadb8dbff --- /dev/null +++ b/crypto/blst_src/build/mach-o/ct_is_square_mod_384-x86_64.s @@ -0,0 +1,472 @@ +.text + +.globl _ct_is_square_mod_384 +.private_extern _ct_is_square_mod_384 + +.p2align 5 
+_ct_is_square_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $536,%rsp +.cfi_adjust_cfa_offset 536 + + + leaq 24+255(%rsp),%rax + andq $-256,%rax + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbx + movq 24(%rsi),%rcx + movq 32(%rsi),%rdx + movq 40(%rsi),%rdi + movq %rax,%rsi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rcx,72(%rax) + movq %rdx,80(%rax) + movq %rdi,88(%rax) + + xorq %rbp,%rbp + movl $24,%ecx + jmp L$oop_is_square + +.p2align 5 +L$oop_is_square: + movl %ecx,16(%rsp) + + call __ab_approximation_30 + movq %rax,0(%rsp) + movq %rbx,8(%rsp) + + movq $128+48,%rdi + xorq %rsi,%rdi + call __smulq_384_n_shift_by_30 + + movq 0(%rsp),%rdx + movq 8(%rsp),%rcx + leaq -48(%rdi),%rdi + call __smulq_384_n_shift_by_30 + + movl 16(%rsp),%ecx + xorq $128,%rsi + + andq 48(%rdi),%r14 + shrq $1,%r14 + addq %r14,%rbp + + subl $1,%ecx + jnz L$oop_is_square + + + + + movq 48(%rsi),%r9 + call __inner_loop_48 + + movq $1,%rax + andq %rbp,%rax + xorq $1,%rax + + leaq 536(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -536-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__smulq_384_n_shift_by_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r14 + andq %rbx,%r14 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r14 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r14 + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbx + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbx + addq %rax,%rbx + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %rdx,%r15 + 
andq %rbx,%r15 + mulq %rbx + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbx + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbx + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbx + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbx + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + negq %r15 + mulq %rbx + addq %rax,%r13 + adcq %rdx,%r15 + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %r15,%r14 + + shrdq $30,%r9,%r8 + shrdq $30,%r10,%r9 + shrdq $30,%r11,%r10 + shrdq $30,%r12,%r11 + shrdq $30,%r13,%r12 + shrdq $30,%r14,%r13 + + sarq $63,%r14 + xorq %rbx,%rbx + subq %r14,%rbx + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbx,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__ab_approximation_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 88(%rsi),%rbx + movq 80(%rsi),%r15 + movq 72(%rsi),%r14 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r11,%r12 + movq 64(%rsi),%r11 + cmovzq %r14,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r10,%r12 + movq 56(%rsi),%r10 + cmovzq %r11,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r9,%r12 + movq 48(%rsi),%r9 + cmovzq %r10,%r15 + + movq %r13,%rax + orq %rbx,%rax + cmovzq %r12,%r13 + cmovzq %r15,%rbx + cmovzq %r8,%r12 + cmovzq %r9,%r15 + + movq %r13,%rax + orq %rbx,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r13 + cmovzq %r9,%rbx + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%r12,%r13 + shldq %cl,%r15,%rbx + + movq $0xFFFFFFFF00000000,%rax + movl %r8d,%r8d + movl %r9d,%r9d + andq %rax,%r13 + andq %rax,%rbx + orq %r13,%r8 + orq %rbx,%r9 + + jmp __inner_loop_30 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__inner_loop_30: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rbx + movq $0x800000007FFFFFFF,%rcx + leaq -1(%rbx),%r15 + movl $30,%edi + +L$oop_30: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbx,%r12 + movq %rcx,%r13 + movq %rbp,%r14 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rcx,%rbx + cmovbq %r12,%rcx + cmovbq %rax,%rbp + + subq %r9,%r8 + subq %rcx,%rbx + addq %r15,%rbx + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbx + cmovzq %r13,%rcx + cmovzq %r14,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq $2,%rax + addq %rcx,%rcx + leaq (%rax,%rbp,1),%rbp + subq %r15,%rcx + + subl $1,%edi + jnz L$oop_30 + + shrq $32,%r15 + movl %ebx,%eax + shrq $32,%rbx + movl %ecx,%edx + shrq $32,%rcx + subq %r15,%rax + subq %r15,%rbx + subq %r15,%rdx + subq %r15,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__inner_loop_48: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movl $48,%edi + +L$oop_48: + movq %r8,%rax + andq %r9,%rax + shrq $1,%rax + + cmpq %r9,%r8 + movq %r8,%r10 + movq %r9,%r11 + leaq (%rax,%rbp,1),%rax + movq %rbp,%r12 + cmovbq %r9,%r8 + cmovbq %r10,%r9 + cmovbq %rax,%rbp + + subq %r9,%r8 + + testq $1,%r10 + cmovzq %r10,%r8 + cmovzq %r11,%r9 + cmovzq %r12,%rbp + + leaq 2(%r9),%rax + shrq $1,%r8 + shrq 
$2,%rax + addq %rax,%rbp + + subl $1,%edi + jnz L$oop_48 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..eebe131d0cb --- /dev/null +++ b/crypto/blst_src/build/mach-o/ctq_inverse_mod_384-x86_64.s @@ -0,0 +1,1193 @@ +.comm ___blst_platform_cap,4 +.text + +.globl _ct_inverse_mod_383 +.private_extern _ct_inverse_mod_383 + +.p2align 5 +_ct_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz ct_inverse_mod_383$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call 
__ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + 
+ movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + xorq $256+96,%rsi + movl $62,%edi + call __ab_approximation_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_383_n_shift_by_62 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + xorq $256+96,%rsi + movl $62,%edi + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 48(%rsi),%r10 + movq 56(%rsi),%r11 + call __inner_loop_62 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + movq %r8,0(%rdi) + movq %r10,48(%rdi) + + + + leaq 96(%rsi),%rsi + leaq 96(%rdi),%rdi + call __smulq_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulq_767x63 + + + xorq $256+96,%rsi + movl $22,%edi + + movq 0(%rsi),%r8 + xorq %r9,%r9 + movq 48(%rsi),%r10 + xorq %r11,%r11 + call __inner_loop_62 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulq_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,0(%rdi) + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + movq %r9,8(%rdi) + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + movq %r10,16(%rdi) + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %r11,24(%rdi) + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + movq %r12,32(%rdi) + imulq %rbp + 
addq %rax,%r13 + adcq $0,%rdx + + movq %r13,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + movq %rdx,%rsi + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rsi + addq %rax,%rsi + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + xorq %rdx,%r14 + xorq %rdx,%r15 + xorq %rdx,%rbx + xorq %rdx,%rbp + xorq %rdx,%rcx + xorq %rdx,%rdi + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulq %rsi + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rsi + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rsi + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rsi + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rsi + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + mulq %rsi + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %rdx,%rbx + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + mulq %rsi + addq %rax,%rbp + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rcx + mulq %rsi + addq %rax,%rcx + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%rdi + movq 8(%rsp),%rdx + imulq %rsi,%rax + movq 16(%rsp),%rsi + addq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq 48(%rsi),%rsi + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + 
sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp,%rax + addq %rax,%r13 + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulq_383_n_shift_by_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq 48(%rsi),%rsi + movq %rdx,%r14 + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rdx + xorq %rax,%rax + subq %rdx,%rax + + xorq %rdx,%rbp + addq %rax,%rbp + + xorq %rdx,%r8 + xorq %rdx,%r9 + xorq %rdx,%r10 + xorq %rdx,%r11 + xorq %rdx,%r12 + xorq %rdx,%r13 + addq %r8,%rax + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulq %rbp + movq %rax,%r8 + movq %r9,%rax + movq %rdx,%r9 + mulq %rbp + addq %rax,%r9 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r10 + mulq %rbp + addq %rax,%r10 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r11 + mulq %rbp + addq %rax,%r11 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r12 + mulq %rbp + addq %rax,%r12 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r13 + imulq %rbp + addq %rax,%r13 + adcq $0,%rdx + + leaq -48(%rsi),%rsi + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $62,%r9,%r8 + shrdq $62,%r10,%r9 + shrdq $62,%r11,%r10 + shrdq $62,%r12,%r11 + shrdq $62,%r13,%r12 + shrdq $62,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + 
movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__ab_approximation_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 16(%rsi),%r8 + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 8(%rsi),%r8 + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + movq 0(%rsi),%r8 + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + jmp __inner_loop_62 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 3 +.long 0 +__inner_loop_62: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + movq %rsi,8(%rsp) + +L$oop_62: + xorq %rax,%rax + xorq %rbx,%rbx + testq $1,%r8 + movq %r10,%rbp + movq %r11,%r14 + cmovnzq %r10,%rax + cmovnzq %r11,%rbx + subq %r8,%rbp + sbbq %r9,%r14 + movq %r8,%r15 + movq %r9,%rsi + subq %rax,%r8 + sbbq %rbx,%r9 + cmovcq %rbp,%r8 + cmovcq %r14,%r9 + cmovcq %r15,%r10 + cmovcq %rsi,%r11 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrdq $1,%r9,%r8 + shrq $1,%r9 + testq $1,%r15 + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq %r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz L$oop_62 + + movq 8(%rsp),%rsi + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s b/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s new file mode 100644 index 00000000000..3f999075813 --- /dev/null +++ b/crypto/blst_src/build/mach-o/ctx_inverse_mod_384-x86_64.s @@ -0,0 +1,1568 @@ +.text + +.globl _ctx_inverse_mod_383 +.private_extern _ctx_inverse_mod_383 + +.p2align 5 +_ctx_inverse_mod_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +ct_inverse_mod_383$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $1112,%rsp +.cfi_adjust_cfa_offset 1112 + + + leaq 88+511(%rsp),%rax + andq $-512,%rax + movq %rdi,32(%rsp) + movq %rcx,40(%rsp) + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq 0(%rdx),%r14 + movq 8(%rdx),%r15 + movq 16(%rdx),%rbx + movq 24(%rdx),%rbp + movq 32(%rdx),%rsi + movq 40(%rdx),%rdi + + movq %r8,0(%rax) + movq %r9,8(%rax) + movq %r10,16(%rax) + movq %r11,24(%rax) + movq %r12,32(%rax) + movq %r13,40(%rax) + + movq %r14,48(%rax) + movq %r15,56(%rax) + movq %rbx,64(%rax) + movq %rbp,72(%rax) + movq %rsi,80(%rax) + movq %rax,%rsi + movq %rdi,88(%rax) + + + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + 
xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + movq %rdx,96(%rdi) + + + xorq $256,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + + + + movq 96(%rsi),%rax + movq 144(%rsi),%r11 + movq %rdx,%rbx + movq %rax,%r10 + imulq 56(%rsp) + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq 64(%rsp) + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + sarq $63,%r9 + movq %r9,64(%rdi) + movq %r9,72(%rdi) + movq %r9,80(%rdi) + movq %r9,88(%rdi) + leaq 96(%rsi),%rsi + + movq %r10,%rax + imulq %rbx + movq %rax,%r8 + movq %r11,%rax + movq %rdx,%r9 + imulq %rcx + addq %rax,%r8 + adcq %rdx,%r9 + movq %r8,96(%rdi) + movq %r9,104(%rdi) + sarq $63,%r9 + movq %r9,112(%rdi) + movq %r9,120(%rdi) + movq %r9,128(%rdi) + movq %r9,136(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq 
%rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383x63 + sarq $63,%r13 + movq %r13,48(%rdi) + movq %r13,56(%rdi) + movq %r13,64(%rdi) + movq %r13,72(%rdi) + movq %r13,80(%rdi) + movq %r13,88(%rdi) + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 
80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_383_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call 
__smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + xorq $256+96,%rsi + movl $31,%edi + call __ab_approximation_31 + + + movq %r12,72(%rsp) + movq %r13,80(%rsp) + + movq $256,%rdi + xorq %rsi,%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,56(%rsp) + movq %rcx,64(%rsp) + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_191_n_shift_by_31 + movq %rdx,72(%rsp) + movq %rcx,80(%rsp) + + movq 56(%rsp),%rdx + movq 64(%rsp),%rcx + leaq 96(%rsi),%rsi + leaq 48(%rdi),%rdi + call __smulx_383x63 + + movq 72(%rsp),%rdx + movq 80(%rsp),%rcx + leaq 48(%rdi),%rdi + call __smulx_767x63 + + xorq $256+96,%rsi + movl $53,%edi + + movq 0(%rsi),%r8 + + movq 48(%rsi),%r10 + + call __tail_loop_53 + + + + + + + + leaq 96(%rsi),%rsi + + + + + + movq %r12,%rdx + movq %r13,%rcx + movq 32(%rsp),%rdi + call __smulx_767x63 + + movq 40(%rsp),%rsi + movq %rax,%rdx + sarq $63,%rax + + movq %rax,%r8 + movq %rax,%r9 + movq %rax,%r10 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + movq %rax,%r11 + andq 16(%rsi),%r10 + andq 24(%rsi),%r11 + movq %rax,%r12 + andq 32(%rsi),%r12 + andq 40(%rsi),%rax + + addq %r8,%r14 + adcq %r9,%r15 + adcq %r10,%rbx + adcq %r11,%rbp + adcq %r12,%rcx + adcq %rax,%rdx + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %rbx,64(%rdi) + movq %rbp,72(%rdi) + movq %rcx,80(%rdi) + movq %rdx,88(%rdi) + + leaq 1112(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -1112-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulx_767x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 
+ movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + leaq 48(%rsi),%rsi + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq %rdx,48(%rdi) + sarq $63,%rdx + movq %rdx,56(%rdi) + movq %rcx,%rdx + movq %rcx,%rax + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + movq 56(%rsi),%r15 + movq 64(%rsi),%rbx + movq 72(%rsi),%rbp + movq 80(%rsi),%rcx + movq 88(%rsi),%rdi + + sarq $63,%rax + xorq %rsi,%rsi + subq %rax,%rsi + + xorq %rax,%rdx + addq %rsi,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %rax,%r13 + xorq %rax,%r14 + xorq %rax,%r15 + xorq %rax,%rbx + xorq %rax,%rbp + xorq %rax,%rcx + xorq %rax,%rdi + addq %rsi,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rbx + adcq $0,%rbp + adcq $0,%rcx + adcq $0,%rdi + + mulxq %r8,%r8,%rax + mulxq %r9,%r9,%rsi + addq %rax,%r9 + mulxq %r10,%r10,%rax + adcq %rsi,%r10 + mulxq %r11,%r11,%rsi + adcq %rax,%r11 + mulxq %r12,%r12,%rax + adcq %rsi,%r12 + mulxq %r13,%r13,%rsi + adcq %rax,%r13 + mulxq %r14,%r14,%rax + adcq %rsi,%r14 + mulxq %r15,%r15,%rsi + adcq %rax,%r15 + mulxq %rbx,%rbx,%rax + adcq %rsi,%rbx + mulxq %rbp,%rbp,%rsi + adcq %rax,%rbp + mulxq %rcx,%rcx,%rax + adcq %rsi,%rcx + mulxq %rdi,%rdi,%rsi + movq 8(%rsp),%rdx + movq 16(%rsp),%rsi + adcq %rdi,%rax + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq 32(%rdx),%r12 + adcq 40(%rdx),%r13 + adcq 48(%rdx),%r14 + movq 56(%rdx),%rdi + adcq %rdi,%r15 + adcq %rdi,%rbx + adcq %rdi,%rbp + adcq %rdi,%rcx + adcq %rdi,%rax + + movq %rdx,%rdi + + movq %r8,0(%rdx) + movq %r9,8(%rdx) + movq %r10,16(%rdx) + movq %r11,24(%rdx) + movq %r12,32(%rdx) + movq %r13,40(%rdx) + movq %r14,48(%rdx) + movq %r15,56(%rdx) + movq %rbx,64(%rdx) + movq %rbp,72(%rdx) + movq %rcx,80(%rdx) + movq %rax,88(%rdx) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulx_383x63: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + movq %rcx,%rdx + adcq %rbp,%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + 
movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rbp + sarq $63,%rbp + xorq %rax,%rax + subq %rbp,%rax + + xorq %rbp,%rdx + addq %rax,%rdx + + xorq %rbp,%r8 + xorq %rbp,%r9 + xorq %rbp,%r10 + xorq %rbp,%r11 + xorq %rbp,%r12 + xorq %rbp,%r13 + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%rax + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %rax,%r10 + mulxq %r11,%r11,%rax + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %rax,%r12 + mulxq %r13,%r13,%rax + adcq %rbp,%r13 + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulx_383_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + xorq %r14,%r14 + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + movq 0+24(%rsi),%r11 + movq 0+32(%rsi),%r12 + movq 0+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq %rdx,%r14 + + movq %rcx,%rdx + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + movq 48+0(%rsi),%r8 + movq 48+8(%rsi),%r9 + movq 48+16(%rsi),%r10 + movq 48+24(%rsi),%r11 + movq 48+32(%rsi),%r12 + movq 48+40(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r8 + xorq %rax,%r9 + xorq %rax,%r10 + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r13 + addq %rbp,%r9 + mulxq %r10,%r10,%rbp + adcq %r13,%r10 + mulxq %r11,%r11,%r13 + adcq %rbp,%r11 + mulxq %r12,%r12,%rbp + adcq %r13,%r12 + adcq $0,%rbp + imulq %rdx + addq %rbp,%rax + adcq $0,%rdx + + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%rax + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r9,%r8 + shrdq $31,%r10,%r9 + shrdq $31,%r11,%r10 + shrdq $31,%r12,%r11 + shrdq $31,%rax,%r12 + shrdq $31,%r14,%rax + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r8 + xorq %r14,%r9 + xorq %r14,%r10 + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%rax + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %rax,40(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__smulx_191_n_shift_by_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rdx,%rbx + movq 0+0(%rsi),%r8 + movq 0+8(%rsi),%r9 + movq 0+16(%rsi),%r10 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + 
xorq %rax,%r8 + xorq %rax,%r9 + xorq %r10,%rax + addq %rbp,%r8 + adcq $0,%r9 + adcq $0,%rax + + mulxq %r8,%r8,%rbp + mulxq %r9,%r9,%r10 + addq %rbp,%r9 + adcq $0,%r10 + imulq %rdx + addq %rax,%r10 + adcq $0,%rdx + movq %rdx,%r14 + movq %rcx,%rdx + movq 48+0(%rsi),%r11 + movq 48+8(%rsi),%r12 + movq 48+16(%rsi),%r13 + + movq %rdx,%rax + sarq $63,%rax + xorq %rbp,%rbp + subq %rax,%rbp + + xorq %rax,%rdx + addq %rbp,%rdx + + xorq %rax,%r11 + xorq %rax,%r12 + xorq %r13,%rax + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%rax + + mulxq %r11,%r11,%rbp + mulxq %r12,%r12,%r13 + addq %rbp,%r12 + adcq $0,%r13 + imulq %rdx + addq %rax,%r13 + adcq $0,%rdx + addq %r8,%r11 + adcq %r9,%r12 + adcq %r10,%r13 + adcq %rdx,%r14 + movq %rbx,%rdx + + shrdq $31,%r12,%r11 + shrdq $31,%r13,%r12 + shrdq $31,%r14,%r13 + + sarq $63,%r14 + xorq %rbp,%rbp + subq %r14,%rbp + + xorq %r14,%r11 + xorq %r14,%r12 + xorq %r14,%r13 + addq %rbp,%r11 + adcq $0,%r12 + adcq $0,%r13 + + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + + xorq %r14,%rdx + xorq %r14,%rcx + addq %rbp,%rdx + addq %rbp,%rcx + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__ab_approximation_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 40(%rsi),%r9 + movq 88(%rsi),%r11 + movq 32(%rsi),%rbx + movq 80(%rsi),%rbp + movq 24(%rsi),%r8 + movq 72(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 16(%rsi),%r8 + cmovzq %r10,%rbp + movq 64(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 8(%rsi),%r8 + cmovzq %r10,%rbp + movq 56(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + movq 0(%rsi),%r8 + cmovzq %r10,%rbp + movq 48(%rsi),%r10 + + movq %r9,%rax + orq %r11,%rax + cmovzq %rbx,%r9 + cmovzq %rbp,%r11 + cmovzq %r8,%rbx + cmovzq %r10,%rbp + + movq %r9,%rax + orq %r11,%rax + bsrq %rax,%rcx + leaq 1(%rcx),%rcx + cmovzq %r8,%r9 + cmovzq %r10,%r11 + cmovzq %rax,%rcx + negq %rcx + + + shldq %cl,%rbx,%r9 + shldq %cl,%rbp,%r11 + + movl $0x7FFFFFFF,%eax + andq %rax,%r8 + andq %rax,%r10 + andnq %r9,%rax,%r9 + andnq %r11,%rax,%r11 + orq %r9,%r8 + orq %r11,%r10 + + jmp __inner_loop_31 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__inner_loop_31: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $0x7FFFFFFF80000000,%rcx + movq $0x800000007FFFFFFF,%r13 + movq $0x7FFFFFFF7FFFFFFF,%r15 + +L$oop_31: + cmpq %r10,%r8 + movq %r8,%rax + movq %r10,%rbx + movq %rcx,%rbp + movq %r13,%r14 + cmovbq %r10,%r8 + cmovbq %rax,%r10 + cmovbq %r13,%rcx + cmovbq %rbp,%r13 + + subq %r10,%r8 + subq %r13,%rcx + addq %r15,%rcx + + testq $1,%rax + cmovzq %rax,%r8 + cmovzq %rbx,%r10 + cmovzq %rbp,%rcx + cmovzq %r14,%r13 + + shrq $1,%r8 + addq %r13,%r13 + subq %r15,%r13 + subl $1,%edi + jnz L$oop_31 + + shrq $32,%r15 + movl %ecx,%edx + movl %r13d,%r12d + shrq $32,%rcx + shrq $32,%r13 + subq %r15,%rdx + subq %r15,%rcx + subq %r15,%r12 + subq %r15,%r13 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__tail_loop_53: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq $1,%rdx + xorq %rcx,%rcx + xorq %r12,%r12 + movq $1,%r13 + +L$oop_53: + xorq %rax,%rax + testq $1,%r8 + movq %r10,%rbx + cmovnzq %r10,%rax + subq %r8,%rbx + movq %r8,%rbp + subq %rax,%r8 + cmovcq %rbx,%r8 + cmovcq %rbp,%r10 + movq %rdx,%rax + cmovcq %r12,%rdx + cmovcq %rax,%r12 + movq %rcx,%rbx + cmovcq %r13,%rcx + cmovcq %rbx,%r13 + xorq %rax,%rax + xorq %rbx,%rbx + shrq $1,%r8 + testq $1,%rbp + cmovnzq %r12,%rax + cmovnzq %r13,%rbx + addq 
%r12,%r12 + addq %r13,%r13 + subq %rax,%rdx + subq %rbx,%rcx + subl $1,%edi + jnz L$oop_53 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/div3w-armv8.S b/crypto/blst_src/build/mach-o/div3w-armv8.S new file mode 100644 index 00000000000..4b130080123 --- /dev/null +++ b/crypto/blst_src/build/mach-o/div3w-armv8.S @@ -0,0 +1,88 @@ +.text + +.globl _div_3_limbs + +.align 5 +_div_3_limbs: + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +Loop: + subs x6,x4,x1 // R - D + add x0,x0,x0 // Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csel x4,x4,x6,lo // select between R and R - D + extr x1,x2,x1,#1 // D >>= 1 + csel x5,x5,x7,lo + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,Loop + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + speculative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret + +.globl _quot_rem_128 + +.align 5 +_quot_rem_128: + ldp x3,x4,[x1] + + mul x5,x3,x2 // divisor[0:1} * quotient + umulh x6,x3,x2 + mul x11, x4,x2 + umulh x7,x4,x2 + + ldp x8,x9,[x0] // load 3 limbs of the dividend + ldr x10,[x0,#16] + + adds x6,x6,x11 + adc x7,x7,xzr + + subs x8,x8,x5 // dividend - divisor * quotient + sbcs x9,x9,x6 + sbcs x10,x10,x7 + sbc x5,xzr,xzr // borrow -> mask + + add x2,x2,x5 // if borrowed, adjust the quotient ... + and x3,x3,x5 + and x4,x4,x5 + adds x8,x8,x3 // ... and add divisor + adc x9,x9,x4 + + stp x8,x9,[x0] // save 2 limbs of the remainder + str x2,[x0,#16] // and one limb of the quotient + + mov x0,x2 // return adjusted quotient + + ret + + +.globl _quot_rem_64 + +.align 5 +_quot_rem_64: + ldr x3,[x1] + ldr x8,[x0] // load 1 limb of the dividend + + mul x5,x3,x2 // divisor * quotient + + sub x8,x8,x5 // dividend - divisor * quotient + + stp x8,x2,[x0] // save remainder and quotient + + mov x0,x2 // return quotient + + ret + diff --git a/crypto/blst_src/build/mach-o/div3w-x86_64.s b/crypto/blst_src/build/mach-o/div3w-x86_64.s new file mode 100644 index 00000000000..99a94d50a2b --- /dev/null +++ b/crypto/blst_src/build/mach-o/div3w-x86_64.s @@ -0,0 +1,124 @@ +.text + +.globl _div_3_limbs +.private_extern _div_3_limbs + +.p2align 5 +_div_3_limbs: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq (%rdi),%r8 + movq 8(%rdi),%r9 + xorq %rax,%rax + movl $64,%ecx + +L$oop: + movq %r8,%r10 + subq %rsi,%r8 + movq %r9,%r11 + sbbq %rdx,%r9 + leaq 1(%rax,%rax,1),%rax + movq %rdx,%rdi + cmovcq %r10,%r8 + cmovcq %r11,%r9 + sbbq $0,%rax + shlq $63,%rdi + shrq $1,%rsi + shrq $1,%rdx + orq %rdi,%rsi + subl $1,%ecx + jnz L$oop + + leaq 1(%rax,%rax,1),%rcx + sarq $63,%rax + + subq %rsi,%r8 + sbbq %rdx,%r9 + sbbq $0,%rcx + + orq %rcx,%rax + + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _quot_rem_128 +.private_extern _quot_rem_128 + +.p2align 5 +_quot_rem_128: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq %rdx,%rax + movq %rdx,%rcx + + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + adcq $0,%rdx + + movq 0(%rdi),%r10 + movq 8(%rdi),%r11 + movq 16(%rdi),%rax + + subq %r8,%r10 + sbbq %r9,%r11 + sbbq %rdx,%rax + sbbq %r8,%r8 + + addq %r8,%rcx + movq %r8,%r9 + andq 0(%rsi),%r8 + andq 8(%rsi),%r9 + addq %r8,%r10 + adcq %r9,%r11 + + movq %r10,0(%rdi) + movq %r11,8(%rdi) + movq %rcx,16(%rdi) + + movq %rcx,%rax + + + .byte 0xf3,0xc3 +.cfi_endproc + + + + + + +.globl _quot_rem_64 +.private_extern 
_quot_rem_64 + +.p2align 5 +_quot_rem_64: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + + movq %rdx,%rax + imulq 0(%rsi),%rdx + + movq 0(%rdi),%r10 + + subq %rdx,%r10 + + movq %r10,0(%rdi) + movq %rax,8(%rdi) + + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/mul_mont_256-armv8.S b/crypto/blst_src/build/mach-o/mul_mont_256-armv8.S new file mode 100644 index 00000000000..4f506b58b0f --- /dev/null +++ b/crypto/blst_src/build/mach-o/mul_mont_256-armv8.S @@ -0,0 +1,464 @@ +.text + +.globl _mul_mont_sparse_256 +.private_extern _mul_mont_sparse_256 + +.align 5 +_mul_mont_sparse_256: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs 
x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csel x19,x19,x14,lo + csel x20,x20,x15,lo + csel x21,x21,x16,lo + csel x22,x22,x17,lo + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret + +.globl _sqr_mont_sparse_256 +.private_extern _sqr_mont_sparse_256 + +.align 5 +_sqr_mont_sparse_256: +.long 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 +.long 3573752767 + ret + +.globl _from_mont_256 +.private_extern _from_mont_256 + +.align 5 +_from_mont_256: +.long 3573752639 + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 +.long 3573752767 + ret + + +.globl _redc_mont_256 +.private_extern _redc_mont_256 + +.align 5 +_redc_mont_256: +.long 3573752639 + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csel x10,x10,x14,lo + csel x11,x11,x15,lo + csel x12,x12,x16,lo + csel x13,x13,x17,lo + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 +.long 3573752767 + ret + + + +.align 5 +__mul_by_1_mont_256: + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret + diff --git a/crypto/blst_src/build/mach-o/mul_mont_384-armv8.S b/crypto/blst_src/build/mach-o/mul_mont_384-armv8.S new file mode 100644 index 00000000000..5aa2e9f3ae7 --- /dev/null +++ b/crypto/blst_src/build/mach-o/mul_mont_384-armv8.S @@ -0,0 +1,2372 @@ +.text + +.globl _add_mod_384x384 + +.align 5 +_add_mod_384x384: +.long 3573752639 + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 +.long 3573752767 + ret + + + +.align 5 +__add_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + stp x11,x12,[x0,#48] + csel x15,x15,x23,lo + stp x13,x14,[x0,#64] + csel x16,x16,x24,lo + stp x15,x16,[x0,#80] + + ret + + +.globl _sub_mod_384x384 + +.align 5 +_sub_mod_384x384: +.long 3573752639 + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 +.long 3573752767 + ret + + + +.align 5 +__sub_mod_384x384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret + + + +.align 5 +__add_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + stp x11,x12,[x0] + csel x16,x16,x24,lo + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + + +.align 5 +__sub_mod_384: + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc 
x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.globl _mul_mont_384x +.private_extern _mul_mont_384x + +.align 5 +_mul_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sqr_mont_384x +.private_extern _sqr_mont_384x + +.align 5 +_sqr_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x19,x11,x19,lo + csel x20,x12,x20,lo + csel x21,x13,x21,lo + ldp x11,x12,[sp] + csel x22,x14,x22,lo + ldr x17, [sp,#48] + csel x23,x15,x23,lo + ldp x13,x14,[sp,#16] + csel x24,x16,x24,lo + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _mul_mont_384 +.private_extern _mul_mont_384 + +.align 5 +_mul_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + + +.align 5 +__mul_mont_384: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + 
adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh 
x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csel x11,x19,x26,lo + csel x12,x20,x27,lo + csel x13,x21,x28,lo + csel x14,x22,x0,lo + csel x15,x23,x1,lo + csel x16,x24,x3,lo + ret + + +.globl _sqr_mont_384 +.private_extern _sqr_mont_384 + +.align 5 +_sqr_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sqr_n_mul_mont_383 +.private_extern _sqr_n_mul_mont_383 + +.align 5 +_sqr_n_mul_mont_383: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +Loop_sqr_383: + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,Loop_sqr_383 + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.align 5 +__sqr_384: + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret + +.globl _sqr_384 +.private_extern _sqr_384 + +.align 5 +_sqr_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _redc_mont_384 +.private_extern _redc_mont_384 + +.align 5 +_redc_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _from_mont_384 +.private_extern _from_mont_384 + +.align 5 +_from_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + + +.align 5 +__mul_by_1_mont_384: + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul 
x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret + + + +.align 5 +__redc_tail_mont_384: + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csel x11,x11,x19,lo + csel x12,x12,x20,lo + csel x13,x13,x21,lo + csel x14,x14,x22,lo + csel x15,x15,x23,lo + csel x16,x16,x24,lo + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + + +.globl _mul_384 +.private_extern _mul_384 + +.align 5 +_mul_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + + +.align 5 +__mul_384: + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret + + +.globl _mul_382x +.private_extern _mul_382x + +.align 5 +_mul_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // _mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // _mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // _mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sqr_382x +.private_extern _sqr_382x + +.align 5 +_sqr_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // _mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // _mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sqr_mont_382x +.private_extern _sqr_mont_382x + +.align 5 +_sqr_mont_382x: +.long 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // _mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // _mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + + +.align 5 +__mul_mont_383_nonred: + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul 
x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 
+ mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret + + +.globl _sgn0_pty_mont_384 +.private_extern _sgn0_pty_mont_384 + +.align 5 +_sgn0_pty_mont_384: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + + +.globl _sgn0_pty_mont_384x +.private_extern _sgn0_pty_mont_384x + +.align 5 +_sgn0_pty_mont_384x: +.long 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + csel x3,x0,x2,eq // a->re==0? prty(a->im) : prty(a->re) + + cmp x1,#0 + csel x1,x0,x2,ne // a->im!=0? 
sgn0(a->im) : sgn0(a->re) + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +.long 3573752767 + ret + diff --git a/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s b/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s new file mode 100644 index 00000000000..842c39225b6 --- /dev/null +++ b/crypto/blst_src/build/mach-o/mulq_mont_256-x86_64.s @@ -0,0 +1,723 @@ +.comm ___blst_platform_cap,4 +.text + +.globl _mul_mont_sparse_256 +.private_extern _mul_mont_sparse_256 + +.p2align 5 +_mul_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz mul_mont_sparse_256$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r13 + movq 8(%rsi),%r14 + movq 16(%rsi),%r12 + movq 24(%rsi),%rbp + movq %rdx,%rbx + + movq %rax,%r15 + mulq %r13 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqr_mont_sparse_256 +.private_extern _sqr_mont_sparse_256 + +.p2align 5 +_sqr_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_mont_sparse_256$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + movq 0(%rsi),%rax + movq %rcx,%r8 + movq 8(%rsi),%r14 + movq %rdx,%rcx + movq 16(%rsi),%r12 + leaq (%rsi),%rbx + movq 24(%rsi),%rbp + + movq %rax,%r15 + mulq %rax + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + call __mulq_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulq %r14 + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq 8(%rbx),%rax + adcq $0,%rdx + xorq %r14,%r14 + movq %rdx,%r13 + + movq %r9,%rdi + imulq %r8,%r9 + + + movq %rax,%r15 + mulq 0(%rsi) + addq %rax,%r10 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r11 + movq %r15,%rax + adcq 
$0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + xorq %r15,%r15 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r9,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rdi,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + addq %rdx,%r13 + adcq $0,%r14 + adcq $0,%r15 + movq %r10,%rdi + imulq %r8,%r10 + + + movq %rax,%r9 + mulq 0(%rsi) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + xorq %r9,%r9 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r10,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rdi,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r13 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + addq %rdx,%r14 + adcq $0,%r15 + adcq $0,%r9 + movq %r11,%rdi + imulq %r8,%r11 + + + movq %rax,%r10 + mulq 0(%rsi) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rsi) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r9 + xorq %r10,%r10 + + + mulq 0(%rcx) + addq %rax,%rdi + movq %r11,%rax + adcq %rdx,%rdi + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rdi,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + addq %rdx,%r15 + adcq $0,%r9 + adcq $0,%r10 + imulq %r8,%rax + movq 8(%rsp),%rsi + + + movq %rax,%r11 + mulq 0(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r12,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + movq %r14,%rbx + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rdx,%r9 + adcq $0,%r10 + + + + + movq %r15,%r12 + subq 0(%rcx),%r13 + sbbq 8(%rcx),%r14 + sbbq 16(%rcx),%r15 + movq %r9,%rbp + sbbq 24(%rcx),%r9 + sbbq $0,%r10 + + cmovcq %rax,%r13 + cmovcq %rbx,%r14 + cmovcq %r12,%r15 + movq %r13,0(%rsi) + cmovcq %rbp,%r9 + movq %r14,8(%rsi) + movq %r15,16(%rsi) + movq %r9,24(%rsi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _from_mont_256 +.private_extern _from_mont_256 + +.p2align 5 +_from_mont_256: +.cfi_startproc + .byte 
0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz from_mont_256$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + + + + + movq %r14,%r10 + movq %r15,%r11 + movq %r9,%r12 + + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + sbbq 24(%rbx),%r9 + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _redc_mont_256 +.private_extern _redc_mont_256 + +.p2align 5 +_redc_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz redc_mont_256$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_256 + + addq 32(%rsi),%r13 + adcq 40(%rsi),%r14 + movq %r13,%rax + adcq 48(%rsi),%r15 + movq %r14,%r10 + adcq 56(%rsi),%r9 + sbbq %rsi,%rsi + + + + + movq %r15,%r11 + subq 0(%rbx),%r13 + sbbq 8(%rbx),%r14 + sbbq 16(%rbx),%r15 + movq %r9,%r12 + sbbq 24(%rbx),%r9 + sbbq $0,%rsi + + cmovncq %r13,%rax + cmovncq %r14,%r10 + cmovncq %r15,%r11 + movq %rax,0(%rdi) + cmovncq %r9,%r12 + movq %r10,8(%rdi) + movq %r11,16(%rdi) + movq %r12,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + movq %rax,%r13 + imulq %rcx,%rax + movq %rax,%r9 + + mulq 0(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq %rdx,%r13 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r10 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 16(%rbx) + movq %r10,%r14 + imulq %rcx,%r10 + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r13,%r11 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r13,%r12 + adcq $0,%rdx + movq %rdx,%r13 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 
+ addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r9 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s b/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s new file mode 100644 index 00000000000..7052343d0ac --- /dev/null +++ b/crypto/blst_src/build/mach-o/mulq_mont_384-x86_64.s @@ -0,0 +1,3673 @@ +.comm ___blst_platform_cap,4 +.text + + + + + + + + +.p2align 5 +__subq_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__addq_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) + cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__subq_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + 
+ movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__subq_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_mont_384x +.private_extern _mul_mont_384x + +.p2align 5 +_mul_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz mul_mont_384x$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulq_384 + + + leaq 48(%rbx),%rbx + leaq 48(%rsi),%rsi + leaq 40+96(%rsp),%rdi + call __mulq_384 + + + movq 8(%rsp),%rcx + leaq -48(%rsi),%rdx + leaq 40+192+48(%rsp),%rdi + call __addq_mod_384 + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __addq_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulq_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __subq_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __subq_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __subq_mod_384x384 + + movq %rcx,%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_mont_384x +.private_extern _sqr_mont_384x + +.p2align 5 +_sqr_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_mont_384x$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,8(%rsp) + movq %rsi,16(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi 
+ call __addq_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __subq_mod_384 + + + movq 16(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + call __mulq_mont_384 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + movq %r14,%r12 + adcq %r9,%r9 + movq %r15,%r13 + adcq %r10,%r10 + movq %r8,%rax + adcq %r11,%r11 + movq %r9,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r10,%rbp + sbbq 16(%rcx),%r8 + sbbq 24(%rcx),%r9 + sbbq 32(%rcx),%r10 + movq %r11,%rsi + sbbq 40(%rcx),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r14 + cmovcq %r13,%r15 + cmovcq %rax,%r8 + movq %r14,48(%rdi) + cmovcq %rbx,%r9 + movq %r15,56(%rdi) + cmovcq %rbp,%r10 + movq %r8,64(%rdi) + cmovcq %rsi,%r11 + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mul_382x +.private_extern _mul_382x + +.p2align 5 +_mul_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz mul_382x$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulq_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulq_384 + + + leaq 48(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulq_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __subq_mod_384x384 + + + leaq 0(%rdi),%rsi + leaq -96(%rdi),%rdx + call __subq_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __subq_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + 
movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_382x +.private_extern _sqr_382x + +.p2align 5 +_sqr_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_382x$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __subq_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulq_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulq_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_384 +.private_extern _mul_384 + +.p2align 5 +_mul_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz mul_384$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + + + movq %rdx,%rbx + call __mulq_384 + + movq 0(%rsp),%r12 +.cfi_restore %r12 + movq 8(%rsp),%rbx +.cfi_restore %rbx + movq 16(%rsp),%rbp +.cfi_restore %rbp + leaq 24(%rsp),%rsp +.cfi_adjust_cfa_offset -24 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__mulq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rax + + movq %rax,%rbp + mulq 0(%rsi) + movq %rax,0(%rdi) + movq %rbp,%rax + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq 
%rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r11 + movq 8(%rbx),%rax + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,8(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,16(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,24(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,32(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%rcx + movq %rbp,%rax + adcq $0,%rdx + movq %rcx,40(%rdi) + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 
32(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %rax,%r12 + movq %rax,%rax + adcq $0,%rdx + addq %r12,%r11 + adcq $0,%rdx + movq %rdx,%r12 + movq %rcx,48(%rdi) + movq %r8,56(%rdi) + movq %r9,64(%rdi) + movq %r10,72(%rdi) + movq %r11,80(%rdi) + movq %r12,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_384 +.private_extern _sqr_384 + +.p2align 5 +_sqr_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_384$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + call __sqrq_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__sqrq_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r15 + movq 16(%rsi),%rcx + movq 24(%rsi),%rbx + + + movq %rax,%r14 + mulq %r15 + movq %rax,%r9 + movq %r14,%rax + movq 32(%rsi),%rbp + movq %rdx,%r10 + + mulq %rcx + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + movq 40(%rsi),%rsi + movq %rdx,%r11 + + mulq %rbx + addq %rax,%r11 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq %rbp + addq %rax,%r12 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq %rsi + addq %rax,%r13 + movq %r14,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq %rax + xorq %r8,%r8 + movq %rax,0(%rdi) + movq %r15,%rax + addq %r9,%r9 + adcq $0,%r8 + addq %rdx,%r9 + adcq $0,%r8 + movq %r9,8(%rdi) + + mulq %rcx + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbx + addq %rax,%r12 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rbp + addq %rax,%r13 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq %rsi + addq %rax,%r14 + movq %r15,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rax + xorq %r9,%r9 + addq %rax,%r8 + movq %rcx,%rax + addq %r10,%r10 + adcq %r11,%r11 + adcq $0,%r9 + addq %r8,%r10 + adcq %rdx,%r11 + adcq $0,%r9 + movq %r10,16(%rdi) + + mulq %rbx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + movq %r11,24(%rdi) + movq %rdx,%r8 + + mulq %rbp + addq %rax,%r14 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq %rsi + addq %rax,%r15 + movq %rcx,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + xorq %r11,%r11 + addq %rax,%r9 + movq %rbx,%rax + addq %r12,%r12 + adcq %r13,%r13 + adcq $0,%r11 + addq %r9,%r12 + adcq %rdx,%r13 + adcq $0,%r11 + movq %r12,32(%rdi) + + + mulq %rbp + addq %rax,%r15 + movq %rbx,%rax + adcq $0,%rdx + movq %r13,40(%rdi) + movq %rdx,%r8 + + mulq %rsi + addq %rax,%rcx + movq %rbx,%rax + adcq $0,%rdx + addq %r8,%rcx + adcq $0,%rdx + movq %rdx,%rbx + + mulq %rax + xorq %r12,%r12 + addq %rax,%r11 + movq %rbp,%rax + addq %r14,%r14 + adcq %r15,%r15 + adcq $0,%r12 + addq %r11,%r14 + adcq %rdx,%r15 + movq 
%r14,48(%rdi) + adcq $0,%r12 + movq %r15,56(%rdi) + + + mulq %rsi + addq %rax,%rbx + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + xorq %r13,%r13 + addq %rax,%r12 + movq %rsi,%rax + addq %rcx,%rcx + adcq %rbx,%rbx + adcq $0,%r13 + addq %r12,%rcx + adcq %rdx,%rbx + movq %rcx,64(%rdi) + adcq $0,%r13 + movq %rbx,72(%rdi) + + + mulq %rax + addq %r13,%rax + addq %rbp,%rbp + adcq $0,%rdx + addq %rbp,%rax + adcq $0,%rdx + movq %rax,80(%rdi) + movq %rdx,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqr_mont_384 +.private_extern _sqr_mont_384 + +.p2align 5 +_sqr_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_mont_384$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $120,%rsp +.cfi_adjust_cfa_offset 8*15 + + + movq %rcx,96(%rsp) + movq %rdx,104(%rsp) + movq %rdi,112(%rsp) + + movq %rsp,%rdi + call __sqrq_384 + + leaq 0(%rsp),%rsi + movq 96(%rsp),%rcx + movq 104(%rsp),%rbx + movq 112(%rsp),%rdi + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + leaq 120(%rsp),%r8 + movq 120(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*21 + + .byte 0xf3,0xc3 +.cfi_endproc + + + + +.globl _redc_mont_384 +.private_extern _redc_mont_384 + +.p2align 5 +_redc_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz redc_mont_384$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + + + +.globl _from_mont_384 +.private_extern _from_mont_384 + +.p2align 5 +_from_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz from_mont_384$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulq_by_1_mont_384 + + + + + + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 
0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r8 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r9 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r10 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %r9,%r15 + imulq %rcx,%r9 + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 32(%rbx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 40(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + addq %rax,%r15 + movq %r9,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %r10,%r8 + imulq %rcx,%r10 + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rbx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r8 + movq %r10,%rax + adcq %rdx,%r8 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rbx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %r11,%r9 + imulq %rcx,%r11 + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r8,%r15 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rbx) + addq %rax,%r9 + movq %r11,%rax + adcq %rdx,%r9 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %r12,%r10 + imulq %rcx,%r12 + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rbx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 
40(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r9,%r8 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %r13,%r11 + imulq %rcx,%r13 + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rbx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r9 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rbx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__redq_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0_pty_mont_384 +.private_extern _sgn0_pty_mont_384 + +.p2align 5 +_sgn0_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sgn0_pty_mont_384$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp 
+.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0_pty_mont_384x +.private_extern _sgn0_pty_mont_384x + +.p2align 5 +_sgn0_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sgn0_pty_mont_384x$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq %rdx,%rcx + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulq_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mul_mont_384 +.private_extern _mul_mont_384 + +.p2align 5 +_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz mul_mont_384$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $24,%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq 0(%rdx),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq %rdx,%rbx + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + + call __mulq_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -72 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rdi + mulq %r14 + movq %rax,%r8 + movq %rdi,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rdi,%rax + adcq 
$0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%rbp + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + xorq %r15,%r15 + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r8,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq %rdx,%r14 + adcq $0,%r15 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r9,%rbp + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r14 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r14 + movq %r9,%rax + adcq %rdx,%r15 + adcq $0,%r8 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r9,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq %rdx,%r15 + adcq $0,%r8 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r10 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r10,%rbp + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r15 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r15 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r10,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r11 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%rbp + + 
mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq %rdx,%r8 + adcq $0,%r9 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r11 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r11,%rbp + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r11,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r12 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq %rdx,%r9 + adcq $0,%r10 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r12 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r12,%rbp + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r9 + adcq $0,%rdx + xorq %r11,%r11 + addq %rax,%r9 + movq %r12,%rax + adcq %rdx,%r10 + adcq $0,%r11 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r12,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r13 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %rbp,%r8 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq %rdx,%r10 + adcq $0,%r11 + + movq %rax,%rdi + mulq 0(%rsi) + addq %rax,%r13 + movq %rdi,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + movq %r13,%rbp + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rdi,%rax + adcq $0,%rdx + addq %r12,%r8 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rdi,%rax + adcq $0,%rdx + addq 
%r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rsi) + addq %r12,%r10 + adcq $0,%rdx + xorq %r12,%r12 + addq %rax,%r10 + movq %r13,%rax + adcq %rdx,%r11 + adcq $0,%r12 + + mulq 0(%rcx) + addq %rax,%rbp + movq %r13,%rax + adcq %rdx,%rbp + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r14 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r15 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 24(%rcx) + addq %rbp,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %rbp,%r9 + adcq $0,%rdx + movq %rdx,%rbp + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %rbp,%r10 + adcq %rdx,%r11 + adcq $0,%r12 + + + + + movq 16(%rsp),%rdi + subq 0(%rcx),%r14 + movq %r15,%rdx + sbbq 8(%rcx),%r15 + movq %r8,%rbx + sbbq 16(%rcx),%r8 + movq %r9,%rsi + sbbq 24(%rcx),%r9 + movq %r10,%rbp + sbbq 32(%rcx),%r10 + movq %r11,%r13 + sbbq 40(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rdx,%r15 + cmovcq %rbx,%r8 + movq %r14,0(%rdi) + cmovcq %rsi,%r9 + movq %r15,8(%rdi) + cmovcq %rbp,%r10 + movq %r8,16(%rdi) + cmovcq %r13,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_n_mul_mont_384 +.private_extern _sqr_n_mul_mont_384 + +.p2align 5 +_sqr_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_n_mul_mont_384$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +L$oop_sqr_384: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + movd %xmm1,%edx + leaq 0(%rdi),%rsi + decl %edx + jnz L$oop_sqr_384 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqr_n_mul_mont_383 +.private_extern _sqr_n_mul_mont_383 + +.p2align 5 +_sqr_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_n_mul_mont_383$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 8*17 + + + movq %r8,0(%rsp) + movq %rdi,8(%rsp) + 
movq %rcx,16(%rsp) + leaq 32(%rsp),%rdi + movq %r9,24(%rsp) + movq (%r9),%xmm2 + +L$oop_sqr_383: + movd %edx,%xmm1 + + call __sqrq_384 + + leaq 0(%rdi),%rsi + movq 0(%rsp),%rcx + movq 16(%rsp),%rbx + call __mulq_by_1_mont_384 + + movd %xmm1,%edx + addq 48(%rsi),%r14 + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + leaq 0(%rdi),%rsi + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + decl %edx + jnz L$oop_sqr_383 + +.byte 102,72,15,126,208 + movq %rbx,%rcx + movq 24(%rsp),%rbx + + + + + + + movq %r8,%r12 + movq %r9,%r13 + + call __mulq_mont_384 + + leaq 136(%rsp),%r8 + movq 136(%rsp),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -8*23 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulq_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq %rax,%rbp + mulq %r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r12 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + movq %r8,%r15 + imulq 8(%rsp),%r8 + + mulq %r13 + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r13 + + mulq 40(%rsi) + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rcx) + addq %rax,%r15 + movq %r8,%rax + adcq %rdx,%r15 + + mulq 8(%rcx) + addq %rax,%r9 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r9 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rcx) + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rcx) + addq %r15,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rcx) + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rcx) + addq %rax,%r13 + movq 8(%rbx),%rax + adcq $0,%rdx + addq %r15,%r13 + adcq %rdx,%r14 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 8(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r10 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r11 + adcq $0,%rdx + movq %rdx,%r15 + + movq %r9,%r8 + imulq 8(%rsp),%r9 + + mulq 24(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 32(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 40(%rsi) + addq %r15,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rcx) + addq %rax,%r8 + movq %r9,%rax + adcq %rdx,%r8 + + mulq 8(%rcx) + addq %rax,%r10 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r10 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rcx) + addq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 24(%rcx) + addq %r8,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rcx) + addq %rax,%r13 + movq %r9,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rcx) + addq %rax,%r14 + movq 16(%rbx),%rax + adcq 
$0,%rdx + addq %r8,%r14 + adcq %rdx,%r15 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 8(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r11 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 16(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r12 + adcq $0,%rdx + movq %rdx,%r8 + + movq %r10,%r9 + imulq 8(%rsp),%r10 + + mulq 24(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r13 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 32(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r8,%r14 + adcq $0,%rdx + movq %rdx,%r8 + + mulq 40(%rsi) + addq %r8,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r8 + + mulq 0(%rcx) + addq %rax,%r9 + movq %r10,%rax + adcq %rdx,%r9 + + mulq 8(%rcx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r11 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rcx) + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 24(%rcx) + addq %r9,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rcx) + addq %rax,%r14 + movq %r10,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rcx) + addq %rax,%r15 + movq 24(%rbx),%rax + adcq $0,%rdx + addq %r9,%r15 + adcq %rdx,%r8 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r12 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 16(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r13 + adcq $0,%rdx + movq %rdx,%r9 + + movq %r11,%r10 + imulq 8(%rsp),%r11 + + mulq 24(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r14 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 32(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r9,%r15 + adcq $0,%rdx + movq %rdx,%r9 + + mulq 40(%rsi) + addq %r9,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r9 + + mulq 0(%rcx) + addq %rax,%r10 + movq %r11,%rax + adcq %rdx,%r10 + + mulq 8(%rcx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r12 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rcx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rcx) + addq %r10,%r14 + adcq $0,%rdx + addq %rax,%r14 + movq %r11,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rcx) + addq %rax,%r15 + movq %r11,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rcx) + addq %rax,%r8 + movq 32(%rbx),%rax + adcq $0,%rdx + addq %r10,%r8 + adcq %rdx,%r9 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 8(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + movq %r12,%r11 + imulq 8(%rsp),%r12 + + mulq 24(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 32(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r10,%r8 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 40(%rsi) + addq %r10,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rcx) + addq %rax,%r11 + movq %r12,%rax + adcq %rdx,%r11 + + mulq 8(%rcx) + addq %rax,%r13 + movq %r12,%rax + adcq 
$0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rcx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rcx) + addq %r11,%r15 + adcq $0,%rdx + addq %rax,%r15 + movq %r12,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rcx) + addq %rax,%r8 + movq %r12,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rcx) + addq %rax,%r9 + movq 40(%rbx),%rax + adcq $0,%rdx + addq %r11,%r9 + adcq %rdx,%r10 + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 8(%rsi) + addq %rax,%r14 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rsi) + addq %rax,%r15 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + movq %r13,%r12 + imulq 8(%rsp),%r13 + + mulq 24(%rsi) + addq %rax,%r8 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r8 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 32(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + addq %r11,%r9 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 40(%rsi) + addq %r11,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq 0(%rcx) + addq %rax,%r12 + movq %r13,%rax + adcq %rdx,%r12 + + mulq 8(%rcx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r14 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 16(%rcx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r15 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 24(%rcx) + addq %r12,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r13,%rax + adcq $0,%rdx + movq %rdx,%r12 + + mulq 32(%rcx) + addq %rax,%r9 + movq %r13,%rax + adcq $0,%rdx + addq %r12,%r9 + adcq $0,%rdx + movq %rdx,%r12 + + mulq 40(%rcx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r12,%r10 + adcq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqr_mont_382x +.private_extern _sqr_mont_382x + +.p2align 5 +_sqr_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +#ifdef __BLST_PORTABLE__ + testl $1,___blst_platform_cap(%rip) + jnz sqr_mont_382x$1 +#endif + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rsi,16(%rsp) + movq %rdi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rax + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq 
24(%rsp),%rdi + call __mulq_mont_383_nonred + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + + movq %r14,48(%rdi) + movq %r15,56(%rdi) + movq %r8,64(%rdi) + movq %r9,72(%rdi) + movq %r10,80(%rdi) + movq %r11,88(%rdi) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rax + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%r12 + movq 32+24(%rsp),%r13 + + call __mulq_mont_383_nonred + movq 32+96(%rsp),%rsi + movq 32+0(%rsp),%r12 + movq 32+8(%rsp),%r13 + andq %rsi,%r12 + movq 32+16(%rsp),%rax + andq %rsi,%r13 + movq 32+24(%rsp),%rbx + andq %rsi,%rax + movq 32+32(%rsp),%rbp + andq %rsi,%rbx + andq %rsi,%rbp + andq 32+40(%rsp),%rsi + + subq %r12,%r14 + movq 0(%rcx),%r12 + sbbq %r13,%r15 + movq 8(%rcx),%r13 + sbbq %rax,%r8 + movq 16(%rcx),%rax + sbbq %rbx,%r9 + movq 24(%rcx),%rbx + sbbq %rbp,%r10 + movq 32(%rcx),%rbp + sbbq %rsi,%r11 + sbbq %rsi,%rsi + + andq %rsi,%r12 + andq %rsi,%r13 + andq %rsi,%rax + andq %rsi,%rbx + andq %rsi,%rbp + andq 40(%rcx),%rsi + + addq %r12,%r14 + adcq %r13,%r15 + adcq %rax,%r8 + adcq %rbx,%r9 + adcq %rbp,%r10 + adcq %rsi,%r11 + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s b/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s new file mode 100644 index 00000000000..ae9a76b739c --- /dev/null +++ b/crypto/blst_src/build/mach-o/mulx_mont_256-x86_64.s @@ -0,0 +1,623 @@ +.text + +.globl _mulx_mont_sparse_256 +.private_extern _mulx_mont_sparse_256 + +.p2align 5 +_mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +mul_mont_sparse_256$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqrx_mont_sparse_256 +.private_extern _sqrx_mont_sparse_256 + +.p2align 5 +_sqrx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sqr_mont_sparse_256$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 
+.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + movq %rcx,%r8 + movq %rdx,%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rbp + movq 24(%rsi),%r9 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%rax,%r11 + call __mulx_mont_sparse_256 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_mont_sparse_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + mulxq %r15,%r15,%r12 + mulxq %rbp,%rbp,%r13 + addq %r15,%r11 + mulxq %r9,%r9,%r14 + movq 8(%rbx),%rdx + adcq %rbp,%r12 + adcq %r9,%r13 + adcq $0,%r14 + + movq %rax,%r10 + imulq %r8,%rax + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r11 + adcxq %r9,%r12 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r14 + adcxq %r15,%r9 + adoxq %r9,%r15 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r10 + adoxq %r11,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r12 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r12 + adoxq %r9,%r13 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 16(%rbx),%rdx + adcxq %rbp,%r13 + adoxq %r9,%r14 + adcxq %r10,%r14 + adoxq %r10,%r15 + adcxq %r10,%r15 + adoxq %r10,%r10 + adcq $0,%r10 + movq %rax,%r11 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r12 + adcxq %r9,%r13 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r15 + adcxq %r10,%r9 + adoxq %r9,%r10 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r11 + adoxq %r12,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r13 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r13 + adoxq %r9,%r14 + + mulxq 24+128(%rcx),%rbp,%r9 + movq 24(%rbx),%rdx + adcxq %rbp,%r14 + adoxq %r9,%r15 + adcxq %r11,%r15 + adoxq %r11,%r10 + adcxq %r11,%r10 + adoxq %r11,%r11 + adcq $0,%r11 + movq %rax,%r12 + imulq %r8,%rax + + + xorq %rbp,%rbp + mulxq 0+128(%rsi),%rbp,%r9 + adoxq %rbp,%r13 + adcxq %r9,%r14 + + mulxq 8+128(%rsi),%rbp,%r9 + adoxq %rbp,%r14 + adcxq %r9,%r15 + + mulxq 16+128(%rsi),%rbp,%r9 + adoxq %rbp,%r15 + adcxq %r9,%r10 + + mulxq 24+128(%rsi),%rbp,%r9 + movq %rax,%rdx + adoxq %rbp,%r10 + adcxq %r11,%r9 + adoxq %r9,%r11 + + + mulxq 0+128(%rcx),%rbp,%rax + adcxq %rbp,%r12 + adoxq %r13,%rax + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%rax + adoxq %r9,%r14 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %rax,%rdx + adcxq %rbp,%r15 + adoxq %r9,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + adoxq %r12,%r12 + adcq $0,%r12 + imulq %r8,%rdx + + + xorq %rbp,%rbp + mulxq 0+128(%rcx),%r13,%r9 + adcxq %rax,%r13 + adoxq %r9,%r14 + + mulxq 8+128(%rcx),%rbp,%r9 + adcxq %rbp,%r14 + adoxq %r9,%r15 + + mulxq 16+128(%rcx),%rbp,%r9 + adcxq %rbp,%r15 + adoxq %r9,%r10 + + mulxq 24+128(%rcx),%rbp,%r9 + movq %r14,%rdx + leaq 128(%rcx),%rcx + adcxq %rbp,%r10 + adoxq %r9,%r11 + movq %r15,%rax + adcxq %r13,%r11 + adoxq %r13,%r12 + adcq $0,%r12 + + + + + movq %r10,%rbp + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + sbbq 16(%rcx),%r10 + movq 
%r11,%r9 + sbbq 24(%rcx),%r11 + sbbq $0,%r12 + + cmovcq %rdx,%r14 + cmovcq %rax,%r15 + cmovcq %rbp,%r10 + movq %r14,0(%rdi) + cmovcq %r9,%r11 + movq %r15,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _fromx_mont_256 +.private_extern _fromx_mont_256 + +.p2align 5 +_fromx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +from_mont_256$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + + + + + movq %r15,%rdx + movq %r10,%r12 + movq %r11,%r13 + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + sbbq 24(%rbx),%r11 + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _redcx_mont_256 +.private_extern _redcx_mont_256 + +.p2align 5 +_redcx_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +redc_mont_256$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_256 + + addq 32(%rsi),%r14 + adcq 40(%rsi),%r15 + movq %r14,%rax + adcq 48(%rsi),%r10 + movq %r15,%rdx + adcq 56(%rsi),%r11 + sbbq %rsi,%rsi + + + + + movq %r10,%r12 + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r10 + movq %r11,%r13 + sbbq 24(%rbx),%r11 + sbbq $0,%rsi + + cmovncq %r14,%rax + cmovncq %r15,%rdx + cmovncq %r10,%r12 + movq %rax,0(%rdi) + cmovncq %r11,%r13 + movq %rdx,8(%rdi) + movq %r12,16(%rdi) + movq %r13,24(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_by_1_mont_256: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rax + movq 8(%rsi),%r11 + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + + movq %rax,%r14 + imulq %rcx,%rax + movq %rax,%r10 + + mulq 0(%rbx) + addq %rax,%r14 + movq %r10,%rax + adcq %rdx,%r14 + + mulq 8(%rbx) + addq %rax,%r11 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r11 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 16(%rbx) + movq %r11,%r15 + imulq %rcx,%r11 + addq %rax,%r12 + movq %r10,%rax + adcq $0,%rdx + addq %r14,%r12 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 24(%rbx) + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r14,%r13 + adcq $0,%rdx + movq %rdx,%r14 + + mulq 0(%rbx) + 
addq %rax,%r15 + movq %r11,%rax + adcq %rdx,%r15 + + mulq 8(%rbx) + addq %rax,%r12 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r12 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 16(%rbx) + movq %r12,%r10 + imulq %rcx,%r12 + addq %rax,%r13 + movq %r11,%rax + adcq $0,%rdx + addq %r15,%r13 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 24(%rbx) + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r15,%r14 + adcq $0,%rdx + movq %rdx,%r15 + + mulq 0(%rbx) + addq %rax,%r10 + movq %r12,%rax + adcq %rdx,%r10 + + mulq 8(%rbx) + addq %rax,%r13 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rbx) + movq %r13,%r11 + imulq %rcx,%r13 + addq %rax,%r14 + movq %r12,%rax + adcq $0,%rdx + addq %r10,%r14 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 24(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r10,%r15 + adcq $0,%rdx + movq %rdx,%r10 + + mulq 0(%rbx) + addq %rax,%r11 + movq %r13,%rax + adcq %rdx,%r11 + + mulq 8(%rbx) + addq %rax,%r14 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 16(%rbx) + addq %rax,%r15 + movq %r13,%rax + adcq $0,%rdx + addq %r11,%r15 + adcq $0,%rdx + movq %rdx,%r11 + + mulq 24(%rbx) + addq %rax,%r10 + movq %r14,%rax + adcq $0,%rdx + addq %r11,%r10 + adcq $0,%rdx + movq %rdx,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s b/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s new file mode 100644 index 00000000000..c5afeec8a51 --- /dev/null +++ b/crypto/blst_src/build/mach-o/mulx_mont_384-x86_64.s @@ -0,0 +1,2975 @@ +.text + + + + + + + + +.p2align 5 +__subx_mod_384x384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + movq 48(%rsi),%r14 + + subq 0(%rdx),%r8 + movq 56(%rsi),%r15 + sbbq 8(%rdx),%r9 + movq 64(%rsi),%rax + sbbq 16(%rdx),%r10 + movq 72(%rsi),%rbx + sbbq 24(%rdx),%r11 + movq 80(%rsi),%rbp + sbbq 32(%rdx),%r12 + movq 88(%rsi),%rsi + sbbq 40(%rdx),%r13 + movq %r8,0(%rdi) + sbbq 48(%rdx),%r14 + movq 0(%rcx),%r8 + movq %r9,8(%rdi) + sbbq 56(%rdx),%r15 + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + sbbq 64(%rdx),%rax + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + sbbq 72(%rdx),%rbx + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + sbbq 80(%rdx),%rbp + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + sbbq 88(%rdx),%rsi + movq 40(%rcx),%r13 + sbbq %rdx,%rdx + + andq %rdx,%r8 + andq %rdx,%r9 + andq %rdx,%r10 + andq %rdx,%r11 + andq %rdx,%r12 + andq %rdx,%r13 + + addq %r8,%r14 + adcq %r9,%r15 + movq %r14,48(%rdi) + adcq %r10,%rax + movq %r15,56(%rdi) + adcq %r11,%rbx + movq %rax,64(%rdi) + adcq %r12,%rbp + movq %rbx,72(%rdi) + adcq %r13,%rsi + movq %rbp,80(%rdi) + movq %rsi,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__addx_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + movq %r8,%r14 + adcq 24(%rdx),%r11 + movq %r9,%r15 + adcq 32(%rdx),%r12 + movq %r10,%rax + adcq 40(%rdx),%r13 + movq %r11,%rbx + sbbq %rdx,%rdx + + subq 0(%rcx),%r8 + sbbq 8(%rcx),%r9 + movq %r12,%rbp + sbbq 16(%rcx),%r10 + sbbq 24(%rcx),%r11 + sbbq 32(%rcx),%r12 + movq %r13,%rsi + sbbq 40(%rcx),%r13 + sbbq $0,%rdx + + cmovcq %r14,%r8 + cmovcq %r15,%r9 + cmovcq %rax,%r10 + movq %r8,0(%rdi) + cmovcq %rbx,%r11 + movq %r9,8(%rdi) + cmovcq %rbp,%r12 + movq %r10,16(%rdi) 
+ cmovcq %rsi,%r13 + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__subx_mod_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + +__subx_mod_384_a_is_loaded: + subq 0(%rdx),%r8 + movq 0(%rcx),%r14 + sbbq 8(%rdx),%r9 + movq 8(%rcx),%r15 + sbbq 16(%rdx),%r10 + movq 16(%rcx),%rax + sbbq 24(%rdx),%r11 + movq 24(%rcx),%rbx + sbbq 32(%rdx),%r12 + movq 32(%rcx),%rbp + sbbq 40(%rdx),%r13 + movq 40(%rcx),%rsi + sbbq %rdx,%rdx + + andq %rdx,%r14 + andq %rdx,%r15 + andq %rdx,%rax + andq %rdx,%rbx + andq %rdx,%rbp + andq %rdx,%rsi + + addq %r14,%r8 + adcq %r15,%r9 + movq %r8,0(%rdi) + adcq %rax,%r10 + movq %r9,8(%rdi) + adcq %rbx,%r11 + movq %r10,16(%rdi) + adcq %rbp,%r12 + movq %r11,24(%rdi) + adcq %rsi,%r13 + movq %r12,32(%rdi) + movq %r13,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mulx_mont_384x +.private_extern _mulx_mont_384x + +.p2align 5 +_mulx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +mul_mont_384x$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $328,%rsp +.cfi_adjust_cfa_offset 328 + + + movq %rdx,%rbx + movq %rdi,32(%rsp) + movq %rsi,24(%rsp) + movq %rdx,16(%rsp) + movq %rcx,8(%rsp) + movq %r8,0(%rsp) + + + + + leaq 40(%rsp),%rdi + call __mulx_384 + + + leaq 48(%rbx),%rbx + leaq 128+48(%rsi),%rsi + leaq 96(%rdi),%rdi + call __mulx_384 + + + movq 8(%rsp),%rcx + leaq (%rbx),%rsi + leaq -48(%rbx),%rdx + leaq 40+192+48(%rsp),%rdi + call __addx_mod_384 + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq -48(%rdi),%rdi + call __addx_mod_384 + + leaq (%rdi),%rbx + leaq 48(%rdi),%rsi + call __mulx_384 + + + leaq (%rdi),%rsi + leaq 40(%rsp),%rdx + movq 8(%rsp),%rcx + call __subx_mod_384x384 + + leaq (%rdi),%rsi + leaq -96(%rdi),%rdx + call __subx_mod_384x384 + + + leaq 40(%rsp),%rsi + leaq 40+96(%rsp),%rdx + leaq 40(%rsp),%rdi + call __subx_mod_384x384 + + leaq (%rcx),%rbx + + + leaq 40(%rsp),%rsi + movq 0(%rsp),%rcx + movq 32(%rsp),%rdi + call __mulx_by_1_mont_384 + call __redx_tail_mont_384 + + + leaq 40+192(%rsp),%rsi + movq 0(%rsp),%rcx + leaq 48(%rdi),%rdi + call __mulx_by_1_mont_384 + call __redx_tail_mont_384 + + leaq 328(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -328-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_mont_384x +.private_extern _sqrx_mont_384x + +.p2align 5 +_sqrx_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sqr_mont_384x$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + + movq 
%rdi,16(%rsp) + movq %rsi,24(%rsp) + + + leaq 48(%rsi),%rdx + leaq 32(%rsp),%rdi + call __addx_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rdx + leaq 32+48(%rsp),%rdi + call __subx_mod_384 + + + movq 24(%rsp),%rsi + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + movq %rdx,%r8 + adcq %r12,%r12 + movq %r15,%r9 + adcq %rdi,%rdi + movq %rax,%r10 + adcq %rbp,%rbp + movq %r12,%r11 + sbbq %rsi,%rsi + + subq 0(%rcx),%rdx + sbbq 8(%rcx),%r15 + movq %rdi,%r13 + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r12 + sbbq 32(%rcx),%rdi + movq %rbp,%r14 + sbbq 40(%rcx),%rbp + sbbq $0,%rsi + + cmovcq %r8,%rdx + cmovcq %r9,%r15 + cmovcq %r10,%rax + movq %rdx,48(%rbx) + cmovcq %r11,%r12 + movq %r15,56(%rbx) + cmovcq %r13,%rdi + movq %rax,64(%rbx) + cmovcq %r14,%rbp + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _mulx_382x +.private_extern _mulx_382x + +.p2align 5 +_mulx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +mul_382x$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + leaq 96(%rdi),%rdi + movq %rsi,0(%rsp) + movq %rdx,8(%rsp) + movq %rdi,16(%rsp) + movq %rcx,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + addq 48(%rsi),%r8 + adcq 56(%rsi),%r9 + adcq 64(%rsi),%r10 + adcq 72(%rsi),%r11 + adcq 80(%rsi),%r12 + adcq 88(%rsi),%r13 + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + movq 32(%rdx),%r12 + movq 40(%rdx),%r13 + + addq 48(%rdx),%r8 + adcq 56(%rdx),%r9 + adcq 64(%rdx),%r10 + adcq 72(%rdx),%r11 + adcq 80(%rdx),%r12 + adcq 88(%rdx),%r13 + + movq %r8,32+48(%rsp) + movq %r9,32+56(%rsp) + movq %r10,32+64(%rsp) + movq %r11,32+72(%rsp) + movq %r12,32+80(%rsp) + movq %r13,32+88(%rsp) + + + leaq 32+0(%rsp),%rsi + leaq 32+48(%rsp),%rbx + call __mulx_384 + + + movq 0(%rsp),%rsi + movq 8(%rsp),%rbx + leaq -96(%rdi),%rdi + call __mulx_384 + + + leaq 48+128(%rsi),%rsi + leaq 48(%rbx),%rbx + leaq 32(%rsp),%rdi + call __mulx_384 + + + movq 16(%rsp),%rsi + leaq 32(%rsp),%rdx + movq 24(%rsp),%rcx + movq %rsi,%rdi + call __subx_mod_384x384 + + + leaq 0(%rdi),%rsi + 
leaq -96(%rdi),%rdx + call __subx_mod_384x384 + + + leaq -96(%rdi),%rsi + leaq 32(%rsp),%rdx + leaq -96(%rdi),%rdi + call __subx_mod_384x384 + + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_382x +.private_extern _sqrx_382x + +.p2align 5 +_sqrx_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sqr_382x$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rcx + + + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%rbx + movq 32(%rsi),%rbp + movq 40(%rsi),%rdx + + movq %r14,%r8 + addq 48(%rsi),%r14 + movq %r15,%r9 + adcq 56(%rsi),%r15 + movq %rax,%r10 + adcq 64(%rsi),%rax + movq %rbx,%r11 + adcq 72(%rsi),%rbx + movq %rbp,%r12 + adcq 80(%rsi),%rbp + movq %rdx,%r13 + adcq 88(%rsi),%rdx + + movq %r14,0(%rdi) + movq %r15,8(%rdi) + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + movq %rbp,32(%rdi) + movq %rdx,40(%rdi) + + + leaq 48(%rsi),%rdx + leaq 48(%rdi),%rdi + call __subx_mod_384_a_is_loaded + + + leaq (%rdi),%rsi + leaq -48(%rdi),%rbx + leaq -48(%rdi),%rdi + call __mulx_384 + + + movq (%rsp),%rsi + leaq 48(%rsi),%rbx + leaq 96(%rdi),%rdi + call __mulx_384 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq 64(%rdi),%rax + movq 72(%rdi),%rbx + movq 80(%rdi),%rbp + addq %r8,%r8 + movq 88(%rdi),%rdx + adcq %r9,%r9 + movq %r8,0(%rdi) + adcq %r10,%r10 + movq %r9,8(%rdi) + adcq %r11,%r11 + movq %r10,16(%rdi) + adcq %r12,%r12 + movq %r11,24(%rdi) + adcq %r13,%r13 + movq %r12,32(%rdi) + adcq %r14,%r14 + movq %r13,40(%rdi) + adcq %r15,%r15 + movq %r14,48(%rdi) + adcq %rax,%rax + movq %r15,56(%rdi) + adcq %rbx,%rbx + movq %rax,64(%rdi) + adcq %rbp,%rbp + movq %rbx,72(%rdi) + adcq %rdx,%rdx + movq %rbp,80(%rdi) + movq %rdx,88(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -8*7 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mulx_384 +.private_extern _mulx_384 + +.p2align 5 +_mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +mul_384$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + movq %rdx,%rbx + call __mulx_384 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 
40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 + + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__mulx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rbx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + leaq -128(%rsi),%rsi + + mulxq %r14,%r9,%rcx + xorq %rbp,%rbp + + mulxq %r15,%r8,%rax + adcxq %rcx,%r8 + movq %r9,0(%rdi) + + mulxq %r10,%r9,%rcx + adcxq %rax,%r9 + + mulxq %r11,%r10,%rax + adcxq %rcx,%r10 + + mulxq %r12,%r11,%rcx + adcxq %rax,%r11 + + mulxq %r13,%r12,%r13 + movq 8(%rbx),%rdx + adcxq %rcx,%r12 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,8(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 16(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,16(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 24(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,24(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 32(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,32(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq 40(%rbx),%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + mulxq %r14,%rax,%rcx + adcxq %r8,%rax + adoxq %rcx,%r9 + movq %rax,40(%rdi) + + mulxq %r15,%r8,%rcx + adcxq %r9,%r8 + adoxq %rcx,%r10 + + mulxq 128+16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 + + mulxq 128+24(%rsi),%r10,%rcx + adcxq %r11,%r10 + adoxq %rcx,%r12 + + mulxq 128+32(%rsi),%r11,%rax + adcxq %r12,%r11 + adoxq %r13,%rax + + mulxq 128+40(%rsi),%r12,%r13 + movq %rax,%rdx + adcxq %rax,%r12 + adoxq %rbp,%r13 + adcxq %rbp,%r13 + movq %r8,48(%rdi) + movq %r9,56(%rdi) + movq %r10,64(%rdi) + movq %r11,72(%rdi) + movq %r12,80(%rdi) + movq %r13,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_384 +.private_extern _sqrx_384 + +.p2align 5 +_sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sqr_384$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 
+.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 + + + call __sqrx_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__sqrx_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%rcx + movq 32(%rsi),%rbx + + + mulxq %r14,%r8,%rdi + movq 40(%rsi),%rbp + mulxq %r15,%r9,%rax + addq %rdi,%r9 + mulxq %rcx,%r10,%rdi + adcq %rax,%r10 + mulxq %rbx,%r11,%rax + adcq %rdi,%r11 + mulxq %rbp,%r12,%r13 + movq %r14,%rdx + adcq %rax,%r12 + adcq $0,%r13 + + + xorq %r14,%r14 + mulxq %r15,%rdi,%rax + adcxq %rdi,%r10 + adoxq %rax,%r11 + + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r11 + adoxq %rax,%r12 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbp,%rdi,%rax + movq %r15,%rdx + adcxq %rdi,%r13 + adoxq %r14,%rax + adcxq %rax,%r14 + + + xorq %r15,%r15 + mulxq %rcx,%rdi,%rax + adcxq %rdi,%r12 + adoxq %rax,%r13 + + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r13 + adoxq %rax,%r14 + + mulxq %rbp,%rdi,%rax + movq %rcx,%rdx + adcxq %rdi,%r14 + adoxq %r15,%rax + adcxq %rax,%r15 + + + xorq %rcx,%rcx + mulxq %rbx,%rdi,%rax + adcxq %rdi,%r14 + adoxq %rax,%r15 + + mulxq %rbp,%rdi,%rax + movq %rbx,%rdx + adcxq %rdi,%r15 + adoxq %rcx,%rax + adcxq %rax,%rcx + + + mulxq %rbp,%rdi,%rbx + movq 0(%rsi),%rdx + addq %rdi,%rcx + movq 8(%rsp),%rdi + adcq $0,%rbx + + + xorq %rbp,%rbp + adcxq %r8,%r8 + adcxq %r9,%r9 + adcxq %r10,%r10 + adcxq %r11,%r11 + adcxq %r12,%r12 + + + mulxq %rdx,%rdx,%rax + movq %rdx,0(%rdi) + movq 8(%rsi),%rdx + adoxq %rax,%r8 + movq %r8,8(%rdi) + + mulxq %rdx,%r8,%rax + movq 16(%rsi),%rdx + adoxq %r8,%r9 + adoxq %rax,%r10 + movq %r9,16(%rdi) + movq %r10,24(%rdi) + + mulxq %rdx,%r8,%r9 + movq 24(%rsi),%rdx + adoxq %r8,%r11 + adoxq %r9,%r12 + adcxq %r13,%r13 + adcxq %r14,%r14 + movq %r11,32(%rdi) + movq %r12,40(%rdi) + + mulxq %rdx,%r8,%r9 + movq 32(%rsi),%rdx + adoxq %r8,%r13 + adoxq %r9,%r14 + adcxq %r15,%r15 + adcxq %rcx,%rcx + movq %r13,48(%rdi) + movq %r14,56(%rdi) + + mulxq %rdx,%r8,%r9 + movq 40(%rsi),%rdx + adoxq %r8,%r15 + adoxq %r9,%rcx + adcxq %rbx,%rbx + adcxq %rbp,%rbp + movq %r15,64(%rdi) + movq %rcx,72(%rdi) + + mulxq %rdx,%r8,%r9 + adoxq %r8,%rbx + adoxq %r9,%rbp + + movq %rbx,80(%rdi) + movq %rbp,88(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + + + +.globl _redcx_mont_384 +.private_extern _redcx_mont_384 + +.p2align 5 +_redcx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +redc_mont_384$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + call __redx_tail_mont_384 + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + + + + +.globl 
_fromx_mont_384 +.private_extern _fromx_mont_384 + +.p2align 5 +_fromx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +from_mont_384$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rdx,%rbx + call __mulx_by_1_mont_384 + + + + + movq %r14,%rax + movq %r15,%rcx + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_by_1_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq %rcx,%rdx + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + imulq %r8,%rdx + + + xorq %r14,%r14 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r13 + adoxq %r14,%rbp + adcxq %rbp,%r14 + imulq %r9,%rdx + + + xorq %r15,%r15 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r14 + adoxq %r15,%rbp + adcxq %rbp,%r15 + imulq %r10,%rdx + + + xorq %r8,%r8 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r10 + adoxq %rbp,%r11 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r15 + adoxq %r8,%rbp + adcxq %rbp,%r8 + imulq %r11,%rdx + + + xorq %r9,%r9 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r11 + adoxq %rbp,%r12 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r8 + adoxq %r9,%rbp + adcxq %rbp,%r9 + imulq %r12,%rdx + + + xorq %r10,%r10 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq 
8(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r9 + adoxq %r10,%rbp + adcxq %rbp,%r10 + imulq %r13,%rdx + + + xorq %r11,%r11 + mulxq 0(%rbx),%rax,%rbp + adcxq %rax,%r13 + adoxq %rbp,%r14 + + mulxq 8(%rbx),%rax,%rbp + adcxq %rax,%r14 + adoxq %rbp,%r15 + + mulxq 16(%rbx),%rax,%rbp + adcxq %rax,%r15 + adoxq %rbp,%r8 + + mulxq 24(%rbx),%rax,%rbp + adcxq %rax,%r8 + adoxq %rbp,%r9 + + mulxq 32(%rbx),%rax,%rbp + adcxq %rax,%r9 + adoxq %rbp,%r10 + + mulxq 40(%rbx),%rax,%rbp + movq %rcx,%rdx + adcxq %rax,%r10 + adoxq %r11,%rbp + adcxq %rbp,%r11 + .byte 0xf3,0xc3 +.cfi_endproc + + + +.p2align 5 +__redx_tail_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + addq 48(%rsi),%r14 + movq %r14,%rax + adcq 56(%rsi),%r15 + adcq 64(%rsi),%r8 + adcq 72(%rsi),%r9 + movq %r15,%rcx + adcq 80(%rsi),%r10 + adcq 88(%rsi),%r11 + sbbq %r12,%r12 + + + + + movq %r8,%rdx + movq %r9,%rbp + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + movq %r10,%r13 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + movq %r11,%rsi + sbbq 40(%rbx),%r11 + sbbq $0,%r12 + + cmovcq %rax,%r14 + cmovcq %rcx,%r15 + cmovcq %rdx,%r8 + movq %r14,0(%rdi) + cmovcq %rbp,%r9 + movq %r15,8(%rdi) + cmovcq %r13,%r10 + movq %r8,16(%rdi) + cmovcq %rsi,%r11 + movq %r9,24(%rdi) + movq %r10,32(%rdi) + movq %r11,40(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0x_pty_mont_384 +.private_extern _sgn0x_pty_mont_384 + +.p2align 5 +_sgn0x_pty_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sgn0_pty_mont_384$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 0(%rdi),%rsi + movq %rdx,%rcx + call __mulx_by_1_mont_384 + + xorq %rax,%rax + movq %r14,%r13 + addq %r14,%r14 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r14 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + notq %rax + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sgn0x_pty_mont_384x +.private_extern _sgn0x_pty_mont_384x + +.p2align 5 +_sgn0x_pty_mont_384x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sgn0_pty_mont_384x$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movq %rsi,%rbx + leaq 48(%rdi),%rsi + movq 
%rdx,%rcx + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + leaq 0(%rdi),%rsi + xorq %rdi,%rdi + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rdi + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rdi + + movq %r14,0(%rsp) + notq %rdi + andq $1,%r13 + andq $2,%rdi + orq %r13,%rdi + + call __mulx_by_1_mont_384 + + movq %r14,%r12 + orq %r15,%r14 + orq %r8,%r14 + orq %r9,%r14 + orq %r10,%r14 + orq %r11,%r14 + + xorq %rax,%rax + movq %r12,%r13 + addq %r12,%r12 + adcq %r15,%r15 + adcq %r8,%r8 + adcq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq $0,%rax + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r15 + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + sbbq 32(%rbx),%r10 + sbbq 40(%rbx),%r11 + sbbq $0,%rax + + movq 0(%rsp),%r12 + + notq %rax + + testq %r14,%r14 + cmovzq %rdi,%r13 + + testq %r12,%r12 + cmovnzq %rdi,%rax + + andq $1,%r13 + andq $2,%rax + orq %r13,%rax + + movq 8(%rsp),%r15 +.cfi_restore %r15 + movq 16(%rsp),%r14 +.cfi_restore %r14 + movq 24(%rsp),%r13 +.cfi_restore %r13 + movq 32(%rsp),%r12 +.cfi_restore %r12 + movq 40(%rsp),%rbx +.cfi_restore %rbx + movq 48(%rsp),%rbp +.cfi_restore %rbp + leaq 56(%rsp),%rsp +.cfi_adjust_cfa_offset -56 + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _mulx_mont_384 +.private_extern _mulx_mont_384 + +.p2align 5 +_mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +mul_mont_384$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + movq %r8,(%rsp) + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + xorq %r15,%r15 + + movq %r8,16(%rsp) + imulq 8(%rsp),%r8 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %rbp,%r15 + adoxq %rax,%r15 + adoxq %rax,%rax + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq 
%rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %r8,%r14 + adoxq %r8,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r9,16(%rsp) + imulq 8(%rsp),%r9 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rbp,%rax + adoxq %r8,%rax + adoxq %r8,%r8 + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r9,%r15 + adoxq %r9,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r10,16(%rsp) + imulq 8(%rsp),%r10 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %rbp,%r8 + adoxq %r9,%r8 + adoxq %r9,%r9 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r10,%rax + adoxq %r10,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r11,16(%rsp) + imulq 8(%rsp),%r11 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %rbp,%r9 + adoxq %r10,%r9 + adoxq %r10,%r10 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq %rbp,%r8 + 
adcxq %r11,%r8 + adoxq %r11,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + movq %r12,16(%rsp) + imulq 8(%rsp),%r12 + + + xorq %r11,%r11 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %rbp,%r10 + adoxq %r11,%r10 + adoxq %r11,%r11 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq 16(%rsp),%rdi + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r12,%r9 + adoxq %r12,%r10 + adcxq %r12,%r10 + adoxq %r12,%r11 + adcxq %r12,%r11 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + movq %r15,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + movq %rax,%rsi + + mulxq 40+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + movq %r14,%rdx + adcxq %r12,%r10 + adoxq %r12,%r11 + leaq 128(%rcx),%rcx + movq %r8,%r12 + adcq $0,%r11 + + + + + subq 0(%rcx),%r14 + sbbq 8(%rcx),%r15 + movq %r9,%rdi + sbbq 16(%rcx),%rax + sbbq 24(%rcx),%r8 + sbbq 32(%rcx),%r9 + movq %r10,%rbp + sbbq 40(%rcx),%r10 + sbbq $0,%r11 + + cmovncq %r14,%rdx + cmovcq %r13,%r15 + cmovcq %rsi,%rax + cmovncq %r8,%r12 + movq %rdx,0(%rbx) + cmovncq %r9,%rdi + movq %r15,8(%rbx) + cmovncq %r10,%rbp + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_mont_384 +.private_extern _sqrx_mont_384 + +.p2align 5 +_sqrx_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sqr_mont_384$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -24(%rsp),%rsp +.cfi_adjust_cfa_offset 8*3 + + + movq %rcx,%r8 + leaq -128(%rdx),%rcx + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + leaq (%rsi),%rbx + movq %r8,(%rsp) + leaq -128(%rsi),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset -8*9 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqrx_n_mul_mont_384 +.private_extern _sqrx_n_mul_mont_384 + +.p2align 5 +_sqrx_n_mul_mont_384: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + 
+sqr_n_mul_mont_384$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + +L$oop_sqrx_384: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + leaq -128(%rcx),%rcx + + mulxq %rdx,%r8,%r9 + call __mulx_mont_384 + + movd %xmm1,%r10d + decl %r10d + jnz L$oop_sqrx_384 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _sqrx_n_mul_mont_383 +.private_extern _sqrx_n_mul_mont_383 + +.p2align 5 +_sqrx_n_mul_mont_383: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sqr_n_mul_mont_383$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -40(%rsp),%rsp +.cfi_adjust_cfa_offset 8*5 + + + movq %rdx,%r10 + movq 0(%rsi),%rdx + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq %rsi,%rbx + movq 24(%rsi),%r12 + movq %rdi,16(%rsp) + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + + movq %r8,(%rsp) + movq %r9,24(%rsp) + movq 0(%r9),%xmm2 + leaq -128(%rcx),%rcx + +L$oop_sqrx_383: + movd %r10d,%xmm1 + leaq -128(%rbx),%rsi + + mulxq %rdx,%r8,%r9 + call __mulx_mont_383_nonred + + movd %xmm1,%r10d + decl %r10d + jnz L$oop_sqrx_383 + + movq %rdx,%r14 +.byte 102,72,15,126,210 + leaq -128(%rbx),%rsi + movq 24(%rsp),%rbx + + mulxq %r14,%r8,%r9 + call __mulx_mont_384 + + movq 40(%rsp),%r15 +.cfi_restore %r15 + movq 48(%rsp),%r14 +.cfi_restore %r14 + movq 56(%rsp),%r13 +.cfi_restore %r13 + movq 64(%rsp),%r12 +.cfi_restore %r12 + movq 72(%rsp),%rbx +.cfi_restore %rbx + movq 80(%rsp),%rbp +.cfi_restore %rbp + leaq 88(%rsp),%rsp +.cfi_adjust_cfa_offset -8*11 + + .byte 0xf3,0xc3 +.cfi_endproc + + +.p2align 5 +__mulx_mont_383_nonred: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + mulxq %r15,%r14,%r10 + mulxq %rax,%r15,%r11 + addq %r14,%r9 + mulxq %r12,%rax,%r12 + adcq %r15,%r10 + mulxq %rdi,%rdi,%r13 + adcq %rax,%r11 + mulxq %rbp,%rbp,%r14 + movq 8(%rbx),%rdx + adcq %rdi,%r12 + adcq %rbp,%r13 + adcq $0,%r14 + movq %r8,%rax + imulq 8(%rsp),%r8 + + + xorq %r15,%r15 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r9 + adcxq %rbp,%r10 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq 
%rbp,%r14 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r8,%rdx + adoxq %rdi,%r14 + adcxq %r15,%rbp + adoxq %rbp,%r15 + + + xorq %r8,%r8 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r9 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 16(%rbx),%rdx + adcxq %rdi,%r13 + adoxq %rbp,%r14 + adcxq %rax,%r14 + adoxq %rax,%r15 + adcxq %rax,%r15 + movq %r9,%r8 + imulq 8(%rsp),%r9 + + + xorq %rax,%rax + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r10 + adcxq %rbp,%r11 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r9,%rdx + adoxq %rdi,%r15 + adcxq %rax,%rbp + adoxq %rbp,%rax + + + xorq %r9,%r9 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r10 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 24(%rbx),%rdx + adcxq %rdi,%r14 + adoxq %rbp,%r15 + adcxq %r8,%r15 + adoxq %r8,%rax + adcxq %r8,%rax + movq %r10,%r9 + imulq 8(%rsp),%r10 + + + xorq %r8,%r8 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r11 + adcxq %rbp,%r12 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r10,%rdx + adoxq %rdi,%rax + adcxq %r8,%rbp + adoxq %rbp,%r8 + + + xorq %r10,%r10 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r9 + adoxq %rbp,%r11 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 40+128(%rcx),%rdi,%rbp + movq 32(%rbx),%rdx + adcxq %rdi,%r15 + adoxq %rbp,%rax + adcxq %r9,%rax + adoxq %r9,%r8 + adcxq %r9,%r8 + movq %r11,%r10 + imulq 8(%rsp),%r11 + + + xorq %r9,%r9 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r12 + adcxq %rbp,%r13 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r11,%rdx + adoxq %rdi,%r8 + adcxq %r9,%rbp + adoxq %rbp,%r9 + + + xorq %r11,%r11 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r10 + adoxq %rbp,%r12 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 40+128(%rcx),%rdi,%rbp + movq 40(%rbx),%rdx + adcxq %rdi,%rax + adoxq 
%rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcxq %r10,%r9 + movq %r12,%r11 + imulq 8(%rsp),%r12 + + + xorq %r10,%r10 + mulxq 0+128(%rsi),%rdi,%rbp + adoxq %rdi,%r13 + adcxq %rbp,%r14 + + mulxq 8+128(%rsi),%rdi,%rbp + adoxq %rdi,%r14 + adcxq %rbp,%r15 + + mulxq 16+128(%rsi),%rdi,%rbp + adoxq %rdi,%r15 + adcxq %rbp,%rax + + mulxq 24+128(%rsi),%rdi,%rbp + adoxq %rdi,%rax + adcxq %rbp,%r8 + + mulxq 32+128(%rsi),%rdi,%rbp + adoxq %rdi,%r8 + adcxq %rbp,%r9 + + mulxq 40+128(%rsi),%rdi,%rbp + movq %r12,%rdx + adoxq %rdi,%r9 + adcxq %r10,%rbp + adoxq %rbp,%r10 + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r11 + adoxq %rbp,%r13 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r13,%rdx + adcxq %rdi,%r8 + adoxq %rbp,%r9 + adcxq %r11,%r9 + adoxq %r11,%r10 + adcxq %r11,%r10 + imulq 8(%rsp),%rdx + movq 24(%rsp),%rbx + + + xorq %r12,%r12 + mulxq 0+128(%rcx),%rdi,%rbp + adcxq %rdi,%r13 + adoxq %rbp,%r14 + + mulxq 8+128(%rcx),%rdi,%rbp + adcxq %rdi,%r14 + adoxq %rbp,%r15 + + mulxq 16+128(%rcx),%rdi,%rbp + adcxq %rdi,%r15 + adoxq %rbp,%rax + + mulxq 24+128(%rcx),%rdi,%rbp + adcxq %rdi,%rax + adoxq %rbp,%r8 + + mulxq 32+128(%rcx),%rdi,%rbp + adcxq %rdi,%r8 + adoxq %rbp,%r9 + + mulxq 40+128(%rcx),%rdi,%rbp + movq %r14,%rdx + adcxq %rdi,%r9 + adoxq %rbp,%r10 + adcq $0,%r10 + movq %r8,%r12 + + movq %r14,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r9,%rdi + movq %r8,24(%rbx) + movq %r9,32(%rbx) + movq %r10,40(%rbx) + movq %r10,%rbp + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _sqrx_mont_382x +.private_extern _sqrx_mont_382x + +.p2align 5 +_sqrx_mont_382x: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + +sqr_mont_382x$1: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $136,%rsp +.cfi_adjust_cfa_offset 136 + + + movq %rcx,0(%rsp) + movq %rdx,%rcx + movq %rdi,16(%rsp) + movq %rsi,24(%rsp) + + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq 32(%rsi),%r12 + movq 40(%rsi),%r13 + + movq %r8,%r14 + addq 48(%rsi),%r8 + movq %r9,%r15 + adcq 56(%rsi),%r9 + movq %r10,%rax + adcq 64(%rsi),%r10 + movq %r11,%rdx + adcq 72(%rsi),%r11 + movq %r12,%rbx + adcq 80(%rsi),%r12 + movq %r13,%rbp + adcq 88(%rsi),%r13 + + subq 48(%rsi),%r14 + sbbq 56(%rsi),%r15 + sbbq 64(%rsi),%rax + sbbq 72(%rsi),%rdx + sbbq 80(%rsi),%rbx + sbbq 88(%rsi),%rbp + sbbq %rdi,%rdi + + movq %r8,32+0(%rsp) + movq %r9,32+8(%rsp) + movq %r10,32+16(%rsp) + movq %r11,32+24(%rsp) + movq %r12,32+32(%rsp) + movq %r13,32+40(%rsp) + + movq %r14,32+48(%rsp) + movq %r15,32+56(%rsp) + movq %rax,32+64(%rsp) + movq %rdx,32+72(%rsp) + movq %rbx,32+80(%rsp) + movq %rbp,32+88(%rsp) + movq %rdi,32+96(%rsp) + + + + leaq 48(%rsi),%rbx + + movq 48(%rsi),%rdx + movq 0(%rsi),%r14 + movq 8(%rsi),%r15 + movq 16(%rsi),%rax + movq 24(%rsi),%r12 + movq 32(%rsi),%rdi + movq 40(%rsi),%rbp + leaq -128(%rsi),%rsi + leaq -128(%rcx),%rcx + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + addq %rdx,%rdx + adcq %r15,%r15 + adcq %rax,%rax + 
adcq %r12,%r12 + adcq %rdi,%rdi + adcq %rbp,%rbp + + movq %rdx,48(%rbx) + movq %r15,56(%rbx) + movq %rax,64(%rbx) + movq %r12,72(%rbx) + movq %rdi,80(%rbx) + movq %rbp,88(%rbx) + + leaq 32-128(%rsp),%rsi + leaq 32+48(%rsp),%rbx + + movq 32+48(%rsp),%rdx + movq 32+0(%rsp),%r14 + movq 32+8(%rsp),%r15 + movq 32+16(%rsp),%rax + movq 32+24(%rsp),%r12 + movq 32+32(%rsp),%rdi + movq 32+40(%rsp),%rbp + + + + mulxq %r14,%r8,%r9 + call __mulx_mont_383_nonred + movq 32+96(%rsp),%r14 + leaq 128(%rcx),%rcx + movq 32+0(%rsp),%r8 + andq %r14,%r8 + movq 32+8(%rsp),%r9 + andq %r14,%r9 + movq 32+16(%rsp),%r10 + andq %r14,%r10 + movq 32+24(%rsp),%r11 + andq %r14,%r11 + movq 32+32(%rsp),%r13 + andq %r14,%r13 + andq 32+40(%rsp),%r14 + + subq %r8,%rdx + movq 0(%rcx),%r8 + sbbq %r9,%r15 + movq 8(%rcx),%r9 + sbbq %r10,%rax + movq 16(%rcx),%r10 + sbbq %r11,%r12 + movq 24(%rcx),%r11 + sbbq %r13,%rdi + movq 32(%rcx),%r13 + sbbq %r14,%rbp + sbbq %r14,%r14 + + andq %r14,%r8 + andq %r14,%r9 + andq %r14,%r10 + andq %r14,%r11 + andq %r14,%r13 + andq 40(%rcx),%r14 + + addq %r8,%rdx + adcq %r9,%r15 + adcq %r10,%rax + adcq %r11,%r12 + adcq %r13,%rdi + adcq %r14,%rbp + + movq %rdx,0(%rbx) + movq %r15,8(%rbx) + movq %rax,16(%rbx) + movq %r12,24(%rbx) + movq %rdi,32(%rbx) + movq %rbp,40(%rbx) + leaq 136(%rsp),%r8 + movq 0(%r8),%r15 +.cfi_restore %r15 + movq 8(%r8),%r14 +.cfi_restore %r14 + movq 16(%r8),%r13 +.cfi_restore %r13 + movq 24(%r8),%r12 +.cfi_restore %r12 + movq 32(%r8),%rbx +.cfi_restore %rbx + movq 40(%r8),%rbp +.cfi_restore %rbp + leaq 48(%r8),%rsp +.cfi_adjust_cfa_offset -136-8*6 + + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/mach-o/sha256-armv8.S b/crypto/blst_src/build/mach-o/sha256-armv8.S new file mode 100644 index 00000000000..3f3c1266dcd --- /dev/null +++ b/crypto/blst_src/build/mach-o/sha256-armv8.S @@ -0,0 +1,1083 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. +// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with rationale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details. 
+ +.comm ___blst_platform_cap,4 +.text + +.align 6 + +LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.align 2 +.align 2 +.globl _blst_sha256_block_armv8 + +.align 6 +_blst_sha256_block_armv8: +Lv8_entry: + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,LK256 + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b 
+.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s +.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s +.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s +.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s +.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +.globl _blst_sha256_block_data_order + +.align 4 +_blst_sha256_block_data_order: + adrp x16,___blst_platform_cap@PAGE + ldr w16,[x16,___blst_platform_cap@PAGEOFF] + tst w16,#1 + b.ne Lv8_entry + + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr x16,LK256 + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b L_00_48 + +.align 4 +L_00_48: + ext v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli 
v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + 
add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne L_00_48 + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + csel x17, x17, xzr, eq + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add 
w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor 
w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + b.ne L_00_48 + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret + +.globl _blst_sha256_emit +.private_extern _blst_sha256_emit + +.align 4 +_blst_sha256_emit: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] + str w6,[x0,#16] + str w7,[x0,#24] + ret + + +.globl _blst_sha256_bcopy +.private_extern _blst_sha256_bcopy + +.align 4 +_blst_sha256_bcopy: +Loop_bcopy: + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,Loop_bcopy + ret + + +.globl _blst_sha256_hcopy +.private_extern _blst_sha256_hcopy + +.align 4 +_blst_sha256_hcopy: + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret + diff --git a/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s b/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s new file mode 100644 index 00000000000..9f0a4f84ff0 --- /dev/null +++ b/crypto/blst_src/build/mach-o/sha256-portable-x86_64.s @@ -0,0 +1,1750 @@ +.comm ___blst_platform_cap,4 +.text + +.globl _blst_sha256_block_data_order + +.p2align 4 +_blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp +#ifdef __BLST_PORTABLE__ + testl 
$2,___blst_platform_cap(%rip) + jnz L$blst_sha256_block_data_order$2 +#endif + pushq %rbx +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $64+24,%rsp + +.cfi_def_cfa %rsp,144 + + leaq (%rsi,%rdx,4),%rdx + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp L$loop + +.p2align 4 +L$loop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 0(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 4(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 8(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 12(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 16(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 
20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 20(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 24(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 28(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 32(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 36(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 40(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi 
+ + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 44(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 48(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 52(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 56(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 60(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + jmp L$rounds_16_xx +.p2align 4 +L$rounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 64(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl 
%r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 68(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 72(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 76(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 80(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 
24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 84(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 88(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 92(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl 96(%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl 
%edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl 100(%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl 104(%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl 108(%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl 112(%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl 
$10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl 116(%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl 120(%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl 124(%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + leaq 64(%rbp),%rbp + cmpb $0x19,3(%rbp) + jnz L$rounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop + + leaq 64+24+48(%rsp),%r11 +.cfi_def_cfa %r11,8 + movq 64+24(%rsp),%r15 + movq -40(%r11),%r14 + movq -32(%r11),%r13 + movq -24(%r11),%r12 + movq -16(%r11),%rbx + movq -8(%r11),%rbp +.cfi_restore %r12 +.cfi_restore %r13 +.cfi_restore %r14 +.cfi_restore %r15 +.cfi_restore %rbp +.cfi_restore %rbx + leaq (%r11),%rsp + .byte 0xf3,0xc3 +.cfi_endproc + + +#ifndef __BLST_PORTABLE__ +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 
0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl _blst_sha256_emit +.private_extern _blst_sha256_emit + +.p2align 4 +_blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _blst_sha256_bcopy +.private_extern _blst_sha256_bcopy + +.p2align 4 +_blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +L$oop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz L$oop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _blst_sha256_hcopy +.private_extern _blst_sha256_hcopy + +.p2align 4 +_blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + +#endif diff --git a/crypto/blst_src/build/mach-o/sha256-x86_64.s b/crypto/blst_src/build/mach-o/sha256-x86_64.s new file mode 100644 index 00000000000..cff024eed4f --- /dev/null +++ b/crypto/blst_src/build/mach-o/sha256-x86_64.s @@ -0,0 +1,1447 @@ +.comm ___blst_platform_cap,4 +.text + +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0 +.globl _blst_sha256_block_data_order_shaext +.private_extern _blst_sha256_block_data_order_shaext + +.p2align 6 
+_blst_sha256_block_data_order_shaext: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp +L$blst_sha256_block_data_order$2: + + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 256-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + jmp L$oop_shaext + +.p2align 4 +L$oop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 +.byte 102,15,56,0,223 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 102,15,56,0,231 + movdqa %xmm2,%xmm10 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 +.byte 15,56,203,202 + + movdqa 16-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 102,15,56,0,239 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi +.byte 15,56,204,220 +.byte 15,56,203,202 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 102,15,56,0,247 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + + movdqa 48-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 64-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 80-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 96-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 112-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 15,56,204,243 +.byte 15,56,203,202 + movdqa 144-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 + nop + paddd %xmm7,%xmm6 +.byte 15,56,204,220 +.byte 15,56,203,202 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,205,245 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 +.byte 102,15,58,15,253,4 + nop + paddd %xmm7,%xmm3 +.byte 15,56,204,229 +.byte 15,56,203,202 + movdqa 176-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 +.byte 15,56,205,222 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,254,4 + nop + paddd %xmm7,%xmm4 +.byte 15,56,204,238 +.byte 15,56,203,202 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 +.byte 15,56,205,227 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 +.byte 102,15,58,15,251,4 + nop + paddd %xmm7,%xmm5 +.byte 
15,56,204,243 +.byte 15,56,203,202 + movdqa 208-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 +.byte 15,56,205,236 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 +.byte 102,15,58,15,252,4 +.byte 15,56,203,202 + paddd %xmm7,%xmm6 + + movdqa 224-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 +.byte 15,56,205,245 + movdqa %xmm8,%xmm7 +.byte 15,56,203,202 + + movdqa 240-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop +.byte 15,56,203,202 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz L$oop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,215,8 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) +.cfi_def_cfa_register %rsp + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _blst_sha256_block_data_order +.private_extern _blst_sha256_block_data_order + +.p2align 6 +_blst_sha256_block_data_order: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + testl $2,___blst_platform_cap(%rip) + jnz L$blst_sha256_block_data_order$2 + pushq %rbx +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $24,%rsp + + leaq (%rsi,%rdx,4),%rdx + movq %rdi,-64(%rbp) + + movq %rdx,-48(%rbp) + + + leaq -64(%rsp),%rsp + movl 0(%rdi),%eax + andq $-64,%rsp + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp L$loop_ssse3 +.p2align 4 +L$loop_ssse3: + movdqa K256+256(%rip),%xmm7 + movq %rsi,-56(%rbp) + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 +.byte 102,15,56,0,199 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rsi +.byte 102,15,56,0,207 + movdqa 0(%rsi),%xmm4 + movdqa 16(%rsi),%xmm5 +.byte 102,15,56,0,215 + paddd %xmm0,%xmm4 + movdqa 32(%rsi),%xmm6 +.byte 102,15,56,0,223 + movdqa 48(%rsi),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp L$ssse3_00_47 + +.p2align 4 +L$ssse3_00_47: + subq $-64,%rsi + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,224,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,250,4 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl 
%eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 102,15,58,15,225,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,251,4 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld 
$10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 16(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d +.byte 102,15,58,15,226,4 + andl %r8d,%r12d + xorl %r8d,%r13d +.byte 102,15,58,15,248,4 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 32(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d +.byte 
102,15,58,15,227,4 + andl %eax,%r12d + xorl %eax,%r13d +.byte 102,15,58,15,249,4 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 48(%rsi),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,67(%rsi) + jne L$ssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl 
$5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + 
movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq -64(%rbp),%rdi + movl %r14d,%eax + movq -56(%rbp),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + leaq 64(%rsi),%rsi + cmpq -48(%rbp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop_ssse3 + + xorps %xmm0,%xmm0 + movaps %xmm0,0(%rsp) + movaps %xmm0,16(%rsp) + movaps %xmm0,32(%rsp) + movaps %xmm0,48(%rsp) + movq -40(%rbp),%r15 + movq -32(%rbp),%r14 + movq -24(%rbp),%r13 + movq -16(%rbp),%r12 + movq -8(%rbp),%rbx + movq %rbp,%rsp +.cfi_def_cfa_register %rsp + popq %rbp 
+.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp +.cfi_restore %r12 +.cfi_restore %r13 +.cfi_restore %r14 +.cfi_restore %r15 +.cfi_restore %rbx + .byte 0xf3,0xc3 +.cfi_endproc + +.globl _blst_sha256_emit +.private_extern _blst_sha256_emit + +.p2align 4 +_blst_sha256_emit: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + bswapq %r8 + movq 24(%rsi),%r11 + bswapq %r9 + movl %r8d,4(%rdi) + bswapq %r10 + movl %r9d,12(%rdi) + bswapq %r11 + movl %r10d,20(%rdi) + shrq $32,%r8 + movl %r11d,28(%rdi) + shrq $32,%r9 + movl %r8d,0(%rdi) + shrq $32,%r10 + movl %r9d,8(%rdi) + shrq $32,%r11 + movl %r10d,16(%rdi) + movl %r11d,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _blst_sha256_bcopy +.private_extern _blst_sha256_bcopy + +.p2align 4 +_blst_sha256_bcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + subq %rsi,%rdi +L$oop_bcopy: + movzbl (%rsi),%eax + leaq 1(%rsi),%rsi + movb %al,-1(%rdi,%rsi,1) + decq %rdx + jnz L$oop_bcopy + .byte 0xf3,0xc3 +.cfi_endproc + + +.globl _blst_sha256_hcopy +.private_extern _blst_sha256_hcopy + +.p2align 4 +_blst_sha256_hcopy: +.cfi_startproc + .byte 0xf3,0x0f,0x1e,0xfa + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc + diff --git a/crypto/blst_src/build/refresh.sh b/crypto/blst_src/build/refresh.sh new file mode 100755 index 00000000000..56b0b279c69 --- /dev/null +++ b/crypto/blst_src/build/refresh.sh @@ -0,0 +1,48 @@ +#!/bin/sh + +HERE=`dirname $0` +cd "${HERE}" + +PERL=${PERL:-perl} + +for pl in ../src/asm/*-x86_64.pl; do + s=`basename $pl .pl`.asm + expr $s : '.*portable' > /dev/null || (set -x; ${PERL} $pl masm > win64/$s) + s=`basename $pl .pl`.s + (set -x; ${PERL} $pl elf > elf/$s) + (set -x; ${PERL} $pl mingw64 > coff/$s) + (set -x; ${PERL} $pl macosx > mach-o/$s) +done + +for pl in ../src/asm/*-armv8.pl; do + s=`basename $pl .pl`.asm + (set -x; ${PERL} $pl win64 > win64/$s) + s=`basename $pl .pl`.S + (set -x; ${PERL} $pl linux64 > elf/$s) + (set -x; ${PERL} $pl coff64 > coff/$s) + (set -x; ${PERL} $pl ios64 > mach-o/$s) +done + +( cd ../bindings; + echo "LIBRARY blst" + echo + echo "EXPORTS" + cc -E blst.h | \ + ${PERL} -ne '{ (/(blst_[\w]+)\s*\(/ || /(BLS12_[\w]+);/) && print "\t$1\n" }' + echo +) > win64/blst.def + +if which bindgen > /dev/null 2>&1; then + ( cd ../bindings; set -x; + bindgen --opaque-type blst_pairing \ + --opaque-type blst_uniq \ + --with-derive-default \ + --with-derive-eq \ + --rustified-enum BLST.\* \ + blst.h -- -D__BLST_RUST_BINDGEN__ \ + | ${PERL} ../build/bindings_trim.pl > rust/src/bindings.rs + ) +else + echo "Install Rust bindgen with 'cargo install bindgen-cli'" 1>&2 + exit 1 +fi diff --git a/crypto/blst_src/build/win64/add_mod_256-armv8.asm b/crypto/blst_src/build/win64/add_mod_256-armv8.asm new file mode 100644 index 00000000000..8d6975185a6 --- /dev/null +++ b/crypto/blst_src/build/win64/add_mod_256-armv8.asm @@ -0,0 +1,380 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + + EXPORT |add_mod_256|[FUNC] + ALIGN 32 +|add_mod_256| PROC + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + adds x8,x8,x12 + ldp x14,x15,[x2,#16] + adcs x9,x9,x13 + ldp x4,x5,[x3] + adcs x10,x10,x14 + ldp x6,x7,[x3,#16] + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x16 + csello x9,x9,x17 + csello x10,x10,x1 + stp x8,x9,[x0] + csello x11,x11,x2 + stp 
x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |mul_by_3_mod_256|[FUNC] + ALIGN 32 +|mul_by_3_mod_256| PROC + ldp x12,x13,[x1] + ldp x14,x15,[x1,#16] + + adds x8,x12,x12 + ldp x4,x5,[x2] + adcs x9,x13,x13 + ldp x6,x7,[x2,#16] + adcs x10,x14,x14 + adcs x11,x15,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x16 + csello x9,x9,x17 + csello x10,x10,x1 + csello x11,x11,x2 + + adds x8,x8,x12 + adcs x9,x9,x13 + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x16 + csello x9,x9,x17 + csello x10,x10,x1 + stp x8,x9,[x0] + csello x11,x11,x2 + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |lshift_mod_256|[FUNC] + ALIGN 32 +|lshift_mod_256| PROC + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +|$Loop_lshift_mod_256| + adds x8,x8,x8 + sub x2,x2,#1 + adcs x9,x9,x9 + adcs x10,x10,x10 + adcs x11,x11,x11 + adc x3,xzr,xzr + + subs x12,x8,x4 + sbcs x13,x9,x5 + sbcs x14,x10,x6 + sbcs x15,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x12 + csello x9,x9,x13 + csello x10,x10,x14 + csello x11,x11,x15 + + cbnz x2,|$Loop_lshift_mod_256| + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |rshift_mod_256|[FUNC] + ALIGN 32 +|rshift_mod_256| PROC + ldp x8,x9,[x1] + ldp x10,x11,[x1,#16] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + +|$Loop_rshift| + adds x12,x8,x4 + sub x2,x2,#1 + adcs x13,x9,x5 + adcs x14,x10,x6 + adcs x15,x11,x7 + adc x3,xzr,xzr + tst x8,#1 + + cselne x12,x12,x8 + cselne x13,x13,x9 + cselne x14,x14,x10 + cselne x15,x15,x11 + cselne x3,x3,xzr + + extr x8,x13,x12,#1 + extr x9,x14,x13,#1 + extr x10,x15,x14,#1 + extr x11,x3,x15,#1 + + cbnz x2,|$Loop_rshift| + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |cneg_mod_256|[FUNC] + ALIGN 32 +|cneg_mod_256| PROC + ldp x8,x9,[x1] + ldp x4,x5,[x3] + + ldp x10,x11,[x1,#16] + subs x12,x4,x8 + ldp x6,x7,[x3,#16] + orr x4,x8,x9 + sbcs x13,x5,x9 + orr x5,x10,x11 + sbcs x14,x6,x10 + orr x3,x4,x5 + sbc x15,x7,x11 + + cmp x3,#0 + csetmne x3 + ands x2,x2,x3 + + cseleq x8,x8,x12 + cseleq x9,x9,x13 + cseleq x10,x10,x14 + stp x8,x9,[x0] + cseleq x11,x11,x15 + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |sub_mod_256|[FUNC] + ALIGN 32 +|sub_mod_256| PROC + ldp x8,x9,[x1] + ldp x12,x13,[x2] + + ldp x10,x11,[x1,#16] + subs x8,x8,x12 + ldp x14,x15,[x2,#16] + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + stp x8,x9,[x0] + adc x11,x11,x7 + stp x10,x11,[x0,#16] + + ret + ENDP + + + + EXPORT |check_mod_256|[FUNC] + ALIGN 32 +|check_mod_256| PROC + ldp x8,x9,[x0] + ldp x10,x11,[x0,#16] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + subs xzr,x8,x4 + sbcs xzr,x9,x5 + orr x8,x8,x9 + sbcs xzr,x10,x6 + orr x8,x8,x10 + sbcs xzr,x11,x7 + orr x8,x8,x11 + sbc x1,xzr,xzr + + cmp x8,#0 + mov x0,#1 + cselne x0,x0,xzr + and x0,x0,x1 + + ret + ENDP + + + + EXPORT |add_n_check_mod_256|[FUNC] + ALIGN 32 +|add_n_check_mod_256| PROC + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + adds x8,x8,x12 + ldp x4,x5,[x3] + 
adcs x9,x9,x13 + ldp x6,x7,[x3,#16] + adcs x10,x10,x14 + adcs x11,x11,x15 + adc x3,xzr,xzr + + subs x16,x8,x4 + sbcs x17,x9,x5 + sbcs x1,x10,x6 + sbcs x2,x11,x7 + sbcs xzr,x3,xzr + + csello x8,x8,x16 + csello x9,x9,x17 + csello x10,x10,x1 + csello x11,x11,x2 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + cselne x0,x17,xzr + + ret + ENDP + + + + EXPORT |sub_n_check_mod_256|[FUNC] + ALIGN 32 +|sub_n_check_mod_256| PROC + ldp x8,x9,[x1] + ldp x12,x13,[x2] + ldp x10,x11,[x1,#16] + ldp x14,x15,[x2,#16] + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x12,x12 + rev x9,x9 + rev x13,x13 + rev x10,x10 + rev x14,x14 + rev x11,x11 + rev x15,x15 +#endif + + subs x8,x8,x12 + sbcs x9,x9,x13 + ldp x4,x5,[x3] + sbcs x10,x10,x14 + ldp x6,x7,[x3,#16] + sbcs x11,x11,x15 + sbc x3,xzr,xzr + + and x4,x4,x3 + and x5,x5,x3 + adds x8,x8,x4 + and x6,x6,x3 + adcs x9,x9,x5 + and x7,x7,x3 + adcs x10,x10,x6 + adc x11,x11,x7 + + orr x16, x8, x9 + orr x17, x10, x11 + orr x16, x16, x17 + +#ifdef __AARCH64EB__ + rev x8,x8 + rev x9,x9 + rev x10,x10 + rev x11,x11 +#endif + + stp x8,x9,[x0] + stp x10,x11,[x0,#16] + + mov x17, #1 + cmp x16, #0 + cselne x0,x17,xzr + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/add_mod_256-x86_64.asm b/crypto/blst_src/build/win64/add_mod_256-x86_64.asm new file mode 100644 index 00000000000..d5308b8f809 --- /dev/null +++ b/crypto/blst_src/build/win64/add_mod_256-x86_64.asm @@ -0,0 +1,939 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC add_mod_256 + + +ALIGN 32 +add_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_256:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + sub rsp,8 + +$L$SEH_body_add_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + +$L$oaded_a_add_mod_256:: + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + mov rax,r8 + adc r10,QWORD PTR[16+rdx] + mov rsi,r9 + adc r11,QWORD PTR[24+rdx] + sbb rdx,rdx + + mov rbx,r10 + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov rbp,r11 + sbb r11,QWORD PTR[24+rcx] + sbb rdx,0 + + cmovc r8,rax + cmovc r9,rsi + mov QWORD PTR[rdi],r8 + cmovc r10,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_add_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_256:: +add_mod_256 ENDP + + +PUBLIC mul_by_3_mod_256 + + +ALIGN 32 +mul_by_3_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_3_mod_256:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + push rbx + + push r12 + +$L$SEH_body_mul_by_3_mod_256:: + + + mov rcx,rdx + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov rdx,rsi + mov r11,QWORD PTR[24+rsi] + + call __lshift_mod_256 + mov r12,QWORD PTR[rsp] + + jmp $L$oaded_a_add_mod_256 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_mul_by_3_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD 
PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_3_mod_256:: +mul_by_3_mod_256 ENDP + + +ALIGN 32 +__lshift_mod_256 PROC PRIVATE + DB 243,15,30,250 + + add r8,r8 + adc r9,r9 + mov rax,r8 + adc r10,r10 + mov rsi,r9 + adc r11,r11 + sbb r12,r12 + + mov rbx,r10 + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov rbp,r11 + sbb r11,QWORD PTR[24+rcx] + sbb r12,0 + + cmovc r8,rax + cmovc r9,rsi + cmovc r10,rbx + cmovc r11,rbp + + DB 0F3h,0C3h ;repret +__lshift_mod_256 ENDP + + +PUBLIC lshift_mod_256 + + +ALIGN 32 +lshift_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_lshift_mod_256:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + push r12 + +$L$SEH_body_lshift_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + +$L$oop_lshift_mod_256:: + call __lshift_mod_256 + dec edx + jnz $L$oop_lshift_mod_256 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov r12,QWORD PTR[rsp] + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_lshift_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_lshift_mod_256:: +lshift_mod_256 ENDP + + +PUBLIC rshift_mod_256 + + +ALIGN 32 +rshift_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_rshift_mod_256:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + sub rsp,8 + +$L$SEH_body_rshift_mod_256:: + + + mov rbp,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + +$L$oop_rshift_mod_256:: + mov r8,rbp + and rbp,1 + mov rax,QWORD PTR[rcx] + neg rbp + mov rsi,QWORD PTR[8+rcx] + mov rbx,QWORD PTR[16+rcx] + + and rax,rbp + and rsi,rbp + and rbx,rbp + and rbp,QWORD PTR[24+rcx] + + add r8,rax + adc r9,rsi + adc r10,rbx + adc r11,rbp + sbb rax,rax + + shr r8,1 + mov rbp,r9 + shr r9,1 + mov rbx,r10 + shr r10,1 + mov rsi,r11 + shr r11,1 + + shl rbp,63 + shl rbx,63 + or rbp,r8 + shl rsi,63 + or r9,rbx + shl rax,63 + or r10,rsi + or r11,rax + + dec edx + jnz $L$oop_rshift_mod_256 + + mov QWORD PTR[rdi],rbp + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_rshift_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_rshift_mod_256:: +rshift_mod_256 ENDP + + +PUBLIC cneg_mod_256 + + +ALIGN 32 +cneg_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_cneg_mod_256:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + push r12 + +$L$SEH_body_cneg_mod_256:: + + + mov r12,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r8,r12 + mov r11,QWORD PTR[24+rsi] + or r12,r9 + or r12,r10 + or r12,r11 + mov rbp,-1 + + mov rax,QWORD PTR[rcx] + cmovnz r12,rbp + mov rsi,QWORD PTR[8+rcx] + mov rbx,QWORD PTR[16+rcx] + and rax,r12 + mov rbp,QWORD PTR[24+rcx] + and rsi,r12 + and rbx,r12 + and rbp,r12 + + sub rax,r8 + sbb rsi,r9 + sbb rbx,r10 + sbb rbp,r11 + + or rdx,rdx + + cmovz 
rax,r8 + cmovz rsi,r9 + mov QWORD PTR[rdi],rax + cmovz rbx,r10 + mov QWORD PTR[8+rdi],rsi + cmovz rbp,r11 + mov QWORD PTR[16+rdi],rbx + mov QWORD PTR[24+rdi],rbp + + mov r12,QWORD PTR[rsp] + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_cneg_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_cneg_mod_256:: +cneg_mod_256 ENDP + + +PUBLIC sub_mod_256 + + +ALIGN 32 +sub_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_256:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + sub rsp,8 + +$L$SEH_body_sub_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + sub r8,QWORD PTR[rdx] + mov rax,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov rsi,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[24+rcx] + sbb rdx,rdx + + and rax,rdx + and rsi,rdx + and rbx,rdx + and rbp,rdx + + add r8,rax + adc r9,rsi + mov QWORD PTR[rdi],r8 + adc r10,rbx + mov QWORD PTR[8+rdi],r9 + adc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_sub_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_256:: +sub_mod_256 ENDP + + +PUBLIC check_mod_256 + + +ALIGN 32 +check_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_check_mod_256:: + + + mov rdi,rcx + mov rsi,rdx + mov rax,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + + mov r8,rax + or rax,r9 + or rax,r10 + or rax,r11 + + sub r8,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rsi,rsi + + mov rdx,1 + cmp rax,0 + cmovne rax,rdx + and rax,rsi +$L$SEH_epilogue_check_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_check_mod_256:: +check_mod_256 ENDP + + +PUBLIC add_n_check_mod_256 + + +ALIGN 32 +add_n_check_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_n_check_mod_256:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + sub rsp,8 + +$L$SEH_body_add_n_check_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + mov rax,r8 + adc r10,QWORD PTR[16+rdx] + mov rsi,r9 + adc r11,QWORD PTR[24+rdx] + sbb rdx,rdx + + mov rbx,r10 + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov rbp,r11 + sbb r11,QWORD PTR[24+rcx] + sbb rdx,0 + + cmovc r8,rax + cmovc r9,rsi + mov QWORD PTR[rdi],r8 + cmovc r10,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + or r8,r9 + or r10,r11 + or r8,r10 + mov rax,1 + cmovz rax,r8 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_add_n_check_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 
0F3h,0C3h ;repret + +$L$SEH_end_add_n_check_mod_256:: +add_n_check_mod_256 ENDP + + +PUBLIC sub_n_check_mod_256 + + +ALIGN 32 +sub_n_check_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_n_check_mod_256:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + sub rsp,8 + +$L$SEH_body_sub_n_check_mod_256:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + sub r8,QWORD PTR[rdx] + mov rax,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov rsi,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[24+rcx] + sbb rdx,rdx + + and rax,rdx + and rsi,rdx + and rbx,rdx + and rbp,rdx + + add r8,rax + adc r9,rsi + mov QWORD PTR[rdi],r8 + adc r10,rbx + mov QWORD PTR[8+rdi],r9 + adc r11,rbp + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + or r8,r9 + or r10,r11 + or r8,r10 + mov rax,1 + cmovz rax,r8 + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_sub_n_check_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_n_check_mod_256:: +sub_n_check_mod_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_add_mod_256 + DD imagerel $L$SEH_body_add_mod_256 + DD imagerel $L$SEH_info_add_mod_256_prologue + + DD imagerel $L$SEH_body_add_mod_256 + DD imagerel $L$SEH_epilogue_add_mod_256 + DD imagerel $L$SEH_info_add_mod_256_body + + DD imagerel $L$SEH_epilogue_add_mod_256 + DD imagerel $L$SEH_end_add_mod_256 + DD imagerel $L$SEH_info_add_mod_256_epilogue + + DD imagerel $L$SEH_begin_mul_by_3_mod_256 + DD imagerel $L$SEH_body_mul_by_3_mod_256 + DD imagerel $L$SEH_info_mul_by_3_mod_256_prologue + + DD imagerel $L$SEH_body_mul_by_3_mod_256 + DD imagerel $L$SEH_epilogue_mul_by_3_mod_256 + DD imagerel $L$SEH_info_mul_by_3_mod_256_body + + DD imagerel $L$SEH_epilogue_mul_by_3_mod_256 + DD imagerel $L$SEH_end_mul_by_3_mod_256 + DD imagerel $L$SEH_info_mul_by_3_mod_256_epilogue + + DD imagerel $L$SEH_begin_lshift_mod_256 + DD imagerel $L$SEH_body_lshift_mod_256 + DD imagerel $L$SEH_info_lshift_mod_256_prologue + + DD imagerel $L$SEH_body_lshift_mod_256 + DD imagerel $L$SEH_epilogue_lshift_mod_256 + DD imagerel $L$SEH_info_lshift_mod_256_body + + DD imagerel $L$SEH_epilogue_lshift_mod_256 + DD imagerel $L$SEH_end_lshift_mod_256 + DD imagerel $L$SEH_info_lshift_mod_256_epilogue + + DD imagerel $L$SEH_begin_rshift_mod_256 + DD imagerel $L$SEH_body_rshift_mod_256 + DD imagerel $L$SEH_info_rshift_mod_256_prologue + + DD imagerel $L$SEH_body_rshift_mod_256 + DD imagerel $L$SEH_epilogue_rshift_mod_256 + DD imagerel $L$SEH_info_rshift_mod_256_body + + DD imagerel $L$SEH_epilogue_rshift_mod_256 + DD imagerel $L$SEH_end_rshift_mod_256 + DD imagerel $L$SEH_info_rshift_mod_256_epilogue + + DD imagerel $L$SEH_begin_cneg_mod_256 + DD imagerel $L$SEH_body_cneg_mod_256 + DD imagerel $L$SEH_info_cneg_mod_256_prologue + + DD imagerel $L$SEH_body_cneg_mod_256 + DD imagerel $L$SEH_epilogue_cneg_mod_256 + DD imagerel $L$SEH_info_cneg_mod_256_body + + DD imagerel $L$SEH_epilogue_cneg_mod_256 + DD imagerel $L$SEH_end_cneg_mod_256 + DD imagerel $L$SEH_info_cneg_mod_256_epilogue + + DD imagerel $L$SEH_begin_sub_mod_256 + DD imagerel $L$SEH_body_sub_mod_256 + DD imagerel $L$SEH_info_sub_mod_256_prologue + + DD imagerel 
$L$SEH_body_sub_mod_256 + DD imagerel $L$SEH_epilogue_sub_mod_256 + DD imagerel $L$SEH_info_sub_mod_256_body + + DD imagerel $L$SEH_epilogue_sub_mod_256 + DD imagerel $L$SEH_end_sub_mod_256 + DD imagerel $L$SEH_info_sub_mod_256_epilogue + + DD imagerel $L$SEH_epilogue_check_mod_256 + DD imagerel $L$SEH_end_check_mod_256 + DD imagerel $L$SEH_info_check_mod_256_epilogue + + DD imagerel $L$SEH_begin_add_n_check_mod_256 + DD imagerel $L$SEH_body_add_n_check_mod_256 + DD imagerel $L$SEH_info_add_n_check_mod_256_prologue + + DD imagerel $L$SEH_body_add_n_check_mod_256 + DD imagerel $L$SEH_epilogue_add_n_check_mod_256 + DD imagerel $L$SEH_info_add_n_check_mod_256_body + + DD imagerel $L$SEH_epilogue_add_n_check_mod_256 + DD imagerel $L$SEH_end_add_n_check_mod_256 + DD imagerel $L$SEH_info_add_n_check_mod_256_epilogue + + DD imagerel $L$SEH_begin_sub_n_check_mod_256 + DD imagerel $L$SEH_body_sub_n_check_mod_256 + DD imagerel $L$SEH_info_sub_n_check_mod_256_prologue + + DD imagerel $L$SEH_body_sub_n_check_mod_256 + DD imagerel $L$SEH_epilogue_sub_n_check_mod_256 + DD imagerel $L$SEH_info_sub_n_check_mod_256_body + + DD imagerel $L$SEH_epilogue_sub_n_check_mod_256 + DD imagerel $L$SEH_end_sub_n_check_mod_256 + DD imagerel $L$SEH_info_sub_n_check_mod_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_add_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_add_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_add_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_3_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_mul_by_3_mod_256_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_mul_by_3_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_lshift_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_lshift_mod_256_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_lshift_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_rshift_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_rshift_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_rshift_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_cneg_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_cneg_mod_256_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_cneg_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 
000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sub_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sub_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_check_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_add_n_check_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_add_n_check_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_add_n_check_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_n_check_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sub_n_check_mod_256_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sub_n_check_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/add_mod_384-armv8.asm b/crypto/blst_src/build/win64/add_mod_384-armv8.asm new file mode 100644 index 00000000000..4bf703a6da0 --- /dev/null +++ b/crypto/blst_src/build/win64/add_mod_384-armv8.asm @@ -0,0 +1,1001 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + + EXPORT |add_mod_384|[FUNC] + ALIGN 32 +|add_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__add_mod_384| PROC + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + +|__add_mod_384_ab_are_loaded| + adds x10,x10,x16 + adcs x11,x11,x17 + adcs x12,x12,x19 + adcs x13,x13,x20 + adcs x14,x14,x21 + adcs x15,x15,x22 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csello x10,x10,x16 + csello x11,x11,x17 + csello x12,x12,x19 + csello x13,x13,x20 + csello x14,x14,x21 + csello x15,x15,x22 + + ret + ENDP + + + + EXPORT |add_mod_384x|[FUNC] + ALIGN 32 +|add_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __add_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __add_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |rshift_mod_384|[FUNC] + ALIGN 32 +|rshift_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +|$Loop_rshift_mod_384| + sub x2,x2,#1 + bl __rshift_mod_384 + cbnz x2,|$Loop_rshift_mod_384| + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__rshift_mod_384| PROC + sbfx x22,x10,#0,#1 + and x16,x22,x4 + and x17,x22,x5 + adds x10,x10,x16 + and x19,x22,x6 + adcs x11,x11,x17 + and x20,x22,x7 + adcs x12,x12,x19 + and x21,x22,x8 + adcs x13,x13,x20 + and x22,x22,x9 + adcs x14,x14,x21 + extr x10,x11,x10,#1 // a[0:5] >>= 1 + adcs x15,x15,x22 + extr x11,x12,x11,#1 + adc x22,xzr,xzr + extr x12,x13,x12,#1 + extr x13,x14,x13,#1 + extr x14,x15,x14,#1 + extr x15,x22,x15,#1 + ret + ENDP + + + + EXPORT |div_by_2_mod_384|[FUNC] + ALIGN 32 +|div_by_2_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __rshift_mod_384 + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |lshift_mod_384|[FUNC] + ALIGN 32 +|lshift_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + +|$Loop_lshift_mod_384| + sub x2,x2,#1 + bl __lshift_mod_384 + cbnz x2,|$Loop_lshift_mod_384| + + ldr x30,[sp,#8] + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__lshift_mod_384| PROC + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x16,x10,x4 + sbcs x17,x11,x5 + sbcs x19,x12,x6 + sbcs x20,x13,x7 + sbcs x21,x14,x8 + sbcs x22,x15,x9 + sbcs xzr,x3,xzr + + csello x10,x10,x16 + csello x11,x11,x17 + csello x12,x12,x19 + csello x13,x13,x20 + csello x14,x14,x21 + csello x15,x15,x22 + + ret + ENDP + + + + EXPORT |mul_by_3_mod_384|[FUNC] + ALIGN 32 +|mul_by_3_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_by_8_mod_384|[FUNC] + ALIGN 32 +|mul_by_8_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_by_3_mod_384x|[FUNC] + ALIGN 32 +|mul_by_3_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + + bl __add_mod_384_ab_are_loaded + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + + ldp x16,x17,[x1,#48] + ldp x19,x20,[x1,#64] + ldp x21,x22,[x1,#80] + + bl __add_mod_384_ab_are_loaded + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_by_8_mod_384x|[FUNC] + ALIGN 32 +|mul_by_8_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + ldp x14,x15,[x1,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __lshift_mod_384 + bl __lshift_mod_384 + bl __lshift_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |cneg_mod_384|[FUNC] + ALIGN 32 +|cneg_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x10,x11,[x1] + ldp x4,x5,[x3] + ldp x12,x13,[x1,#16] + ldp x6,x7,[x3,#16] + + subs x16,x4,x10 + ldp x14,x15,[x1,#32] + ldp x8,x9,[x3,#32] + orr x3,x10,x11 + sbcs x17,x5,x11 + orr x3,x3,x12 + sbcs x19,x6,x12 + orr x3,x3,x13 + sbcs x20,x7,x13 + orr x3,x3,x14 + sbcs x21,x8,x14 + orr x3,x3,x15 + sbc x22,x9,x15 + + cmp x3,#0 + csetmne x3 + ands x2,x2,x3 + + cseleq x10,x10,x16 + cseleq x11,x11,x17 + cseleq x12,x12,x19 + cseleq x13,x13,x20 + stp x10,x11,[x0] + cseleq x14,x14,x21 + stp x12,x13,[x0,#16] + cseleq x15,x15,x22 + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sub_mod_384|[FUNC] + ALIGN 32 +|sub_mod_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + stp x14,x15,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__sub_mod_384| PROC + ldp x10,x11,[x1] + ldp x16,x17,[x2] + ldp x12,x13,[x1,#16] + ldp x19,x20,[x2,#16] + ldp x14,x15,[x1,#32] + ldp x21,x22,[x2,#32] + + subs x10,x10,x16 + sbcs x11,x11,x17 + sbcs x12,x12,x19 + sbcs x13,x13,x20 + sbcs x14,x14,x21 + sbcs x15,x15,x22 + sbc x3,xzr,xzr + + and x16,x4,x3 + and x17,x5,x3 + adds x10,x10,x16 + and x19,x6,x3 + adcs x11,x11,x17 + and x20,x7,x3 + adcs x12,x12,x19 + and x21,x8,x3 + adcs x13,x13,x20 + and x22,x9,x3 + adcs x14,x14,x21 + adc x15,x15,x22 + + ret + ENDP + + + + EXPORT |sub_mod_384x|[FUNC] + ALIGN 32 +|sub_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x3] + ldp x6,x7,[x3,#16] + ldp x8,x9,[x3,#32] + + bl __sub_mod_384 + + stp x10,x11,[x0] + add x1,x1,#48 + stp x12,x13,[x0,#16] + add x2,x2,#48 + stp x14,x15,[x0,#32] + + bl __sub_mod_384 + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_by_1_plus_i_mod_384x|[FUNC] + ALIGN 32 +|mul_by_1_plus_i_mod_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x4,x5,[x2] + ldp x6,x7,[x2,#16] + ldp x8,x9,[x2,#32] + add x2,x1,#48 + + bl __sub_mod_384 // a->re - a->im + + ldp x16,x17,[x1] + ldp x19,x20,[x1,#16] + ldp x21,x22,[x1,#32] + stp x10,x11,[x0] + ldp x10,x11,[x1,#48] + stp x12,x13,[x0,#16] + ldp x12,x13,[x1,#64] + stp x14,x15,[x0,#32] + ldp x14,x15,[x1,#80] + + bl __add_mod_384_ab_are_loaded // a->re + a->im + ldr x30,[sp,#8] + + stp x10,x11,[x0,#48] + stp x12,x13,[x0,#64] + stp x14,x15,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sgn0_pty_mod_384|[FUNC] + ALIGN 32 +|sgn0_pty_mod_384| PROC + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x0,x10,#1 + adds x10,x10,x10 + adcs x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x3,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x3,x3,xzr + + mvn x3,x3 + and x3,x3,#2 + orr x0,x0,x3 + + ret + ENDP + + + + EXPORT |sgn0_pty_mod_384x|[FUNC] + ALIGN 32 +|sgn0_pty_mod_384x| PROC + ldp x10,x11,[x0] + ldp x12,x13,[x0,#16] + ldp x14,x15,[x0,#32] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + ldp x8,x9,[x1,#32] + + and x2,x10,#1 + orr x3,x10,x11 + adds x10,x10,x10 + orr x3,x3,x12 + adcs x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + ldp x10,x11,[x0,#48] + ldp x12,x13,[x0,#64] + ldp x14,x15,[x0,#80] + + mvn x16,x16 + and x16,x16,#2 + orr x2,x2,x16 + + and x0,x10,#1 + orr x1,x10,x11 + adds x10,x10,x10 + orr x1,x1,x12 + adcs x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + adcs x15,x15,x15 + adc x16,xzr,xzr + + subs x10,x10,x4 + sbcs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbc x16,x16,xzr + + mvn x16,x16 + and x16,x16,#2 + orr x0,x0,x16 + + cmp x3,#0 + cseleq x3,x0,x2 + + cmp x1,#0 + cselne x1,x0,x2 + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ret + ENDP + + + EXPORT |vec_select_32|[FUNC] + ALIGN 32 +|vec_select_32| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_48|[FUNC] + ALIGN 32 +|vec_select_48| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_96|[FUNC] + ALIGN 32 +|vec_select_96| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + 
ret + ENDP + + + EXPORT |vec_select_192|[FUNC] + ALIGN 32 +|vec_select_192| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_144|[FUNC] + ALIGN 32 +|vec_select_144| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + bit v1.16b, v4.16b, v6.16b + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0] + ret + ENDP + + + EXPORT |vec_select_288|[FUNC] + ALIGN 32 +|vec_select_288| PROC + dup v6.2d, x3 + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + cmeq v6.2d, v6.2d, #0 + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + ld1 {v0.2d, v1.2d, v2.2d}, [x1],#48 + bit v17.16b, v20.16b, v6.16b + ld1 {v3.2d, v4.2d, v5.2d}, [x2],#48 + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0],#48 + bit v0.16b, v3.16b, v6.16b + ld1 {v16.2d, v17.2d, v18.2d}, [x1],#48 + bit v1.16b, v4.16b, v6.16b + ld1 {v19.2d, v20.2d, v21.2d}, [x2],#48 + bit v2.16b, v5.16b, v6.16b + st1 {v0.2d, v1.2d, v2.2d}, [x0],#48 + bit v16.16b, v19.16b, v6.16b + bit v17.16b, v20.16b, v6.16b + bit v18.16b, v21.16b, v6.16b + st1 {v16.2d, v17.2d, v18.2d}, [x0] + ret + ENDP + + + EXPORT |vec_prefetch|[FUNC] + ALIGN 32 +|vec_prefetch| PROC + add x1, x1, x0 + sub x1, x1, #1 + mov x2, #64 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm 
pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + cselhi x2,xzr,x2 + prfm pldl1keep, [x0] + add x0, x0, x2 + cmp x0, x1 + cselhi x0,x1,x0 + prfm pldl1keep, [x0] + ret + ENDP + + + EXPORT |vec_is_zero_16x|[FUNC] + ALIGN 32 +|vec_is_zero_16x| PROC + ld1 {v0.2d}, [x0], #16 + lsr x1, x1, #4 + sub x1, x1, #1 + cbz x1, |$Loop_is_zero_done| + +|$Loop_is_zero| + ld1 {v1.2d}, [x0], #16 + orr v0.16b, v0.16b, v1.16b + sub x1, x1, #1 + cbnz x1, |$Loop_is_zero| + +|$Loop_is_zero_done| + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + cseleq x0,x0,xzr + ret + ENDP + + + EXPORT |vec_is_equal_16x|[FUNC] + ALIGN 32 +|vec_is_equal_16x| PROC + ld1 {v0.2d}, [x0], #16 + ld1 {v1.2d}, [x1], #16 + lsr x2, x2, #4 + eor v0.16b, v0.16b, v1.16b + +|$Loop_is_equal| + sub x2, x2, #1 + cbz x2, |$Loop_is_equal_done| + ld1 {v1.2d}, [x0], #16 + ld1 {v2.2d}, [x1], #16 + eor v1.16b, v1.16b, v2.16b + orr v0.16b, v0.16b, v1.16b + b |$Loop_is_equal| + nop + +|$Loop_is_equal_done| + dup v1.2d, v0.d[1] + orr v0.16b, v0.16b, v1.16b + mov x1, v0.d[0] + mov x0, #1 + cmp x1, #0 + cseleq x0,x0,xzr + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/add_mod_384-x86_64.asm b/crypto/blst_src/build/win64/add_mod_384-x86_64.asm new file mode 100644 index 00000000000..560e02ee105 --- /dev/null +++ b/crypto/blst_src/build/win64/add_mod_384-x86_64.asm @@ -0,0 +1,2531 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC add_mod_384 + + +ALIGN 32 +add_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_384:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_add_mod_384:: + + + call __add_mod_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_add_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_384:: +add_mod_384 ENDP + + +ALIGN 32 +__add_mod_384 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +__add_mod_384_a_is_loaded:: + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + mov r14,r8 + adc r11,QWORD PTR[24+rdx] + mov r15,r9 + adc r12,QWORD PTR[32+rdx] + mov rax,r10 + adc r13,QWORD PTR[40+rdx] + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[rdi],r8 + cmovc r11,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r12,rbp + mov QWORD PTR[16+rdi],r10 + cmovc r13,rsi + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__add_mod_384 ENDP + +PUBLIC add_mod_384x + + +ALIGN 32 +add_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_384x:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + push r12 + + push r13 + + push r14 + + 
push r15 + + sub rsp,24 + +$L$SEH_body_add_mod_384x:: + + + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + lea rsi,QWORD PTR[48+rsi] + lea rdx,QWORD PTR[48+rdx] + lea rdi,QWORD PTR[48+rdi] + call __add_mod_384 + + mov rsi,QWORD PTR[rsp] + mov rdx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-48))+rdi] + call __add_mod_384 + + mov r15,QWORD PTR[((24+0))+rsp] + + mov r14,QWORD PTR[((24+8))+rsp] + + mov r13,QWORD PTR[((24+16))+rsp] + + mov r12,QWORD PTR[((24+24))+rsp] + + mov rbx,QWORD PTR[((24+32))+rsp] + + mov rbp,QWORD PTR[((24+40))+rsp] + + lea rsp,QWORD PTR[((24+48))+rsp] + +$L$SEH_epilogue_add_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_384x:: +add_mod_384x ENDP + + +PUBLIC rshift_mod_384 + + +ALIGN 32 +rshift_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_rshift_mod_384:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_rshift_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +$L$oop_rshift_mod_384:: + call __rshift_mod_384 + dec edx + jnz $L$oop_rshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_rshift_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_rshift_mod_384:: +rshift_mod_384 ENDP + + +ALIGN 32 +__rshift_mod_384 PROC PRIVATE + DB 243,15,30,250 + + mov rsi,1 + mov r14,QWORD PTR[rcx] + and rsi,r8 + mov r15,QWORD PTR[8+rcx] + neg rsi + mov rax,QWORD PTR[16+rcx] + and r14,rsi + mov rbx,QWORD PTR[24+rcx] + and r15,rsi + mov rbp,QWORD PTR[32+rcx] + and rax,rsi + and rbx,rsi + and rbp,rsi + and rsi,QWORD PTR[40+rcx] + + add r14,r8 + adc r15,r9 + adc rax,r10 + adc rbx,r11 + adc rbp,r12 + adc rsi,r13 + sbb r13,r13 + + shr r14,1 + mov r8,r15 + shr r15,1 + mov r9,rax + shr rax,1 + mov r10,rbx + shr rbx,1 + mov r11,rbp + shr rbp,1 + mov r12,rsi + shr rsi,1 + shl r8,63 + shl r9,63 + or r8,r14 + shl r10,63 + or r9,r15 + shl r11,63 + or r10,rax + shl r12,63 + or r11,rbx + shl r13,63 + or r12,rbp + or r13,rsi + + DB 0F3h,0C3h ;repret +__rshift_mod_384 ENDP + +PUBLIC div_by_2_mod_384 + + +ALIGN 32 +div_by_2_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_div_by_2_mod_384:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_div_by_2_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov rcx,rdx + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + call __rshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD 
PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_div_by_2_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_div_by_2_mod_384:: +div_by_2_mod_384 ENDP + + +PUBLIC lshift_mod_384 + + +ALIGN 32 +lshift_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_lshift_mod_384:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_lshift_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +$L$oop_lshift_mod_384:: + add r8,r8 + adc r9,r9 + adc r10,r10 + mov r14,r8 + adc r11,r11 + mov r15,r9 + adc r12,r12 + mov rax,r10 + adc r13,r13 + mov rbx,r11 + sbb rdi,rdi + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdi,0 + + mov rdi,QWORD PTR[rsp] + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + cmovc r11,rbx + cmovc r12,rbp + cmovc r13,rsi + + dec edx + jnz $L$oop_lshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_lshift_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_lshift_mod_384:: +lshift_mod_384 ENDP + + +ALIGN 32 +__lshift_mod_384 PROC PRIVATE + DB 243,15,30,250 + + add r8,r8 + adc r9,r9 + adc r10,r10 + mov r14,r8 + adc r11,r11 + mov r15,r9 + adc r12,r12 + mov rax,r10 + adc r13,r13 + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + cmovc r11,rbx + cmovc r12,rbp + cmovc r13,rsi + + DB 0F3h,0C3h ;repret +__lshift_mod_384 ENDP + + +PUBLIC mul_by_3_mod_384 + + +ALIGN 32 +mul_by_3_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_3_mod_384:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_3_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + + mov rdx,QWORD PTR[rsp] + call __add_mod_384_a_is_loaded + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_3_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_3_mod_384:: 
+mul_by_3_mod_384 ENDP + +PUBLIC mul_by_8_mod_384 + + +ALIGN 32 +mul_by_8_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_8_mod_384:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_mul_by_8_mod_384:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_8_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_8_mod_384:: +mul_by_8_mod_384 ENDP + + +PUBLIC mul_by_3_mod_384x + + +ALIGN 32 +mul_by_3_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_3_mod_384x:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_3_mod_384x:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + + mov rdx,QWORD PTR[rsp] + call __add_mod_384_a_is_loaded + + mov rsi,QWORD PTR[rsp] + lea rdi,QWORD PTR[48+rdi] + + mov r8,QWORD PTR[48+rsi] + mov r9,QWORD PTR[56+rsi] + mov r10,QWORD PTR[64+rsi] + mov r11,QWORD PTR[72+rsi] + mov r12,QWORD PTR[80+rsi] + mov r13,QWORD PTR[88+rsi] + + call __lshift_mod_384 + + mov rdx,8*6 + add rdx,QWORD PTR[rsp] + call __add_mod_384_a_is_loaded + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_3_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_3_mod_384x:: +mul_by_3_mod_384x ENDP + +PUBLIC mul_by_8_mod_384x + + +ALIGN 32 +mul_by_8_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_8_mod_384x:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_mul_by_8_mod_384x:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov rcx,rdx + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov rsi,QWORD PTR[rsp] + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + mov r8,QWORD PTR[((48+0))+rsi] + mov r9,QWORD PTR[((48+8))+rsi] + mov r10,QWORD PTR[((48+16))+rsi] + mov r11,QWORD PTR[((48+24))+rsi] + 
mov r12,QWORD PTR[((48+32))+rsi] + mov r13,QWORD PTR[((48+40))+rsi] + + call __lshift_mod_384 + call __lshift_mod_384 + call __lshift_mod_384 + + mov QWORD PTR[((48+0))+rdi],r8 + mov QWORD PTR[((48+8))+rdi],r9 + mov QWORD PTR[((48+16))+rdi],r10 + mov QWORD PTR[((48+24))+rdi],r11 + mov QWORD PTR[((48+32))+rdi],r12 + mov QWORD PTR[((48+40))+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_by_8_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_8_mod_384x:: +mul_by_8_mod_384x ENDP + + +PUBLIC cneg_mod_384 + + +ALIGN 32 +cneg_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_cneg_mod_384:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdx + +$L$SEH_body_cneg_mod_384:: + + + mov rdx,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r8,rdx + mov r11,QWORD PTR[24+rsi] + or rdx,r9 + mov r12,QWORD PTR[32+rsi] + or rdx,r10 + mov r13,QWORD PTR[40+rsi] + or rdx,r11 + mov rsi,-1 + or rdx,r12 + or rdx,r13 + + mov r14,QWORD PTR[rcx] + cmovnz rdx,rsi + mov r15,QWORD PTR[8+rcx] + mov rax,QWORD PTR[16+rcx] + and r14,rdx + mov rbx,QWORD PTR[24+rcx] + and r15,rdx + mov rbp,QWORD PTR[32+rcx] + and rax,rdx + mov rsi,QWORD PTR[40+rcx] + and rbx,rdx + mov rcx,QWORD PTR[rsp] + and rbp,rdx + and rsi,rdx + + sub r14,r8 + sbb r15,r9 + sbb rax,r10 + sbb rbx,r11 + sbb rbp,r12 + sbb rsi,r13 + + or rcx,rcx + + cmovz r14,r8 + cmovz r15,r9 + cmovz rax,r10 + mov QWORD PTR[rdi],r14 + cmovz rbx,r11 + mov QWORD PTR[8+rdi],r15 + cmovz rbp,r12 + mov QWORD PTR[16+rdi],rax + cmovz rsi,r13 + mov QWORD PTR[24+rdi],rbx + mov QWORD PTR[32+rdi],rbp + mov QWORD PTR[40+rdi],rsi + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_cneg_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_cneg_mod_384:: +cneg_mod_384 ENDP + + +PUBLIC sub_mod_384 + + +ALIGN 32 +sub_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_384:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sub_mod_384:: + + + call __sub_mod_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sub_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_384:: +sub_mod_384 ENDP + + +ALIGN 32 +__sub_mod_384 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + sub r8,QWORD PTR[rdx] + mov r14,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov r15,QWORD PTR[8+rcx] + sbb r10,QWORD 
PTR[16+rdx] + mov rax,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbx,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,QWORD PTR[32+rcx] + sbb r13,QWORD PTR[40+rdx] + mov rsi,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r14,rdx + and r15,rdx + and rax,rdx + and rbx,rdx + and rbp,rdx + and rsi,rdx + + add r8,r14 + adc r9,r15 + mov QWORD PTR[rdi],r8 + adc r10,rax + mov QWORD PTR[8+rdi],r9 + adc r11,rbx + mov QWORD PTR[16+rdi],r10 + adc r12,rbp + mov QWORD PTR[24+rdi],r11 + adc r13,rsi + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__sub_mod_384 ENDP + +PUBLIC sub_mod_384x + + +ALIGN 32 +sub_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_384x:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,24 + +$L$SEH_body_sub_mod_384x:: + + + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + lea rsi,QWORD PTR[48+rsi] + lea rdx,QWORD PTR[48+rdx] + lea rdi,QWORD PTR[48+rdi] + call __sub_mod_384 + + mov rsi,QWORD PTR[rsp] + mov rdx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-48))+rdi] + call __sub_mod_384 + + mov r15,QWORD PTR[((24+0))+rsp] + + mov r14,QWORD PTR[((24+8))+rsp] + + mov r13,QWORD PTR[((24+16))+rsp] + + mov r12,QWORD PTR[((24+24))+rsp] + + mov rbx,QWORD PTR[((24+32))+rsp] + + mov rbp,QWORD PTR[((24+40))+rsp] + + lea rsp,QWORD PTR[((24+48))+rsp] + +$L$SEH_epilogue_sub_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_384x:: +sub_mod_384x ENDP +PUBLIC mul_by_1_plus_i_mod_384x + + +ALIGN 32 +mul_by_1_plus_i_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_by_1_plus_i_mod_384x:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,56 + +$L$SEH_body_mul_by_1_plus_i_mod_384x:: + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,r8 + add r8,QWORD PTR[48+rsi] + mov r15,r9 + adc r9,QWORD PTR[56+rsi] + mov rax,r10 + adc r10,QWORD PTR[64+rsi] + mov rbx,r11 + adc r11,QWORD PTR[72+rsi] + mov rcx,r12 + adc r12,QWORD PTR[80+rsi] + mov rbp,r13 + adc r13,QWORD PTR[88+rsi] + mov QWORD PTR[48+rsp],rdi + sbb rdi,rdi + + sub r14,QWORD PTR[48+rsi] + sbb r15,QWORD PTR[56+rsi] + sbb rax,QWORD PTR[64+rsi] + sbb rbx,QWORD PTR[72+rsi] + sbb rcx,QWORD PTR[80+rsi] + sbb rbp,QWORD PTR[88+rsi] + sbb rsi,rsi + + mov QWORD PTR[rsp],r8 + mov r8,QWORD PTR[rdx] + mov QWORD PTR[8+rsp],r9 + mov r9,QWORD PTR[8+rdx] + mov QWORD PTR[16+rsp],r10 + mov r10,QWORD PTR[16+rdx] + mov QWORD PTR[24+rsp],r11 + mov r11,QWORD PTR[24+rdx] + mov QWORD PTR[32+rsp],r12 + and r8,rsi + mov r12,QWORD PTR[32+rdx] + mov QWORD PTR[40+rsp],r13 + and r9,rsi + mov r13,QWORD PTR[40+rdx] + and r10,rsi + and r11,rsi + and r12,rsi + and r13,rsi + mov rsi,QWORD PTR[48+rsp] + + add r14,r8 + mov r8,QWORD PTR[rsp] + adc r15,r9 + mov r9,QWORD PTR[8+rsp] + adc rax,r10 + mov r10,QWORD PTR[16+rsp] + adc rbx,r11 + mov r11,QWORD PTR[24+rsp] + adc rcx,r12 + mov r12,QWORD PTR[32+rsp] + adc rbp,r13 + mov r13,QWORD PTR[40+rsp] + + mov QWORD PTR[rsi],r14 + mov r14,r8 + mov QWORD PTR[8+rsi],r15 + mov QWORD PTR[16+rsi],rax + mov r15,r9 + mov QWORD PTR[24+rsi],rbx + mov 
QWORD PTR[32+rsi],rcx + mov rax,r10 + mov QWORD PTR[40+rsi],rbp + + sub r8,QWORD PTR[rdx] + mov rbx,r11 + sbb r9,QWORD PTR[8+rdx] + sbb r10,QWORD PTR[16+rdx] + mov rcx,r12 + sbb r11,QWORD PTR[24+rdx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,r13 + sbb r13,QWORD PTR[40+rdx] + sbb rdi,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[48+rsi],r8 + cmovc r11,rbx + mov QWORD PTR[56+rsi],r9 + cmovc r12,rcx + mov QWORD PTR[64+rsi],r10 + cmovc r13,rbp + mov QWORD PTR[72+rsi],r11 + mov QWORD PTR[80+rsi],r12 + mov QWORD PTR[88+rsi],r13 + + mov r15,QWORD PTR[((56+0))+rsp] + + mov r14,QWORD PTR[((56+8))+rsp] + + mov r13,QWORD PTR[((56+16))+rsp] + + mov r12,QWORD PTR[((56+24))+rsp] + + mov rbx,QWORD PTR[((56+32))+rsp] + + mov rbp,QWORD PTR[((56+40))+rsp] + + lea rsp,QWORD PTR[((56+48))+rsp] + +$L$SEH_epilogue_mul_by_1_plus_i_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_by_1_plus_i_mod_384x:: +mul_by_1_plus_i_mod_384x ENDP +PUBLIC sgn0_pty_mod_384 + + +ALIGN 32 +sgn0_pty_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mod_384:: + + + mov rdi,rcx + mov rsi,rdx +$L$SEH_body_sgn0_pty_mod_384:: + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov rcx,QWORD PTR[32+rdi] + mov rdx,QWORD PTR[40+rdi] + + xor rax,rax + mov rdi,r8 + add r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rcx,rcx + adc rdx,rdx + adc rax,0 + + sub r8,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rcx,QWORD PTR[32+rsi] + sbb rdx,QWORD PTR[40+rsi] + sbb rax,0 + + not rax + and rdi,1 + and rax,2 + or rax,rdi + +$L$SEH_epilogue_sgn0_pty_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mod_384:: +sgn0_pty_mod_384 ENDP + +PUBLIC sgn0_pty_mod_384x + + +ALIGN 32 +sgn0_pty_mod_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mod_384x:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + push rbx + + sub rsp,8 + +$L$SEH_body_sgn0_pty_mod_384x:: + + + mov r8,QWORD PTR[48+rdi] + mov r9,QWORD PTR[56+rdi] + mov r10,QWORD PTR[64+rdi] + mov r11,QWORD PTR[72+rdi] + mov rcx,QWORD PTR[80+rdi] + mov rdx,QWORD PTR[88+rdi] + + mov rbx,r8 + or r8,r9 + or r8,r10 + or r8,r11 + or r8,rcx + or r8,rdx + + lea rax,QWORD PTR[rdi] + xor rdi,rdi + mov rbp,rbx + add rbx,rbx + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rcx,rcx + adc rdx,rdx + adc rdi,0 + + sub rbx,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rcx,QWORD PTR[32+rsi] + sbb rdx,QWORD PTR[40+rsi] + sbb rdi,0 + + mov QWORD PTR[rsp],r8 + not rdi + and rbp,1 + and rdi,2 + or rdi,rbp + + mov r8,QWORD PTR[rax] + mov r9,QWORD PTR[8+rax] + mov r10,QWORD PTR[16+rax] + mov r11,QWORD PTR[24+rax] + mov rcx,QWORD PTR[32+rax] + mov rdx,QWORD PTR[40+rax] + + mov rbx,r8 + or r8,r9 + or r8,r10 + or r8,r11 + or r8,rcx + or r8,rdx + + xor rax,rax + mov rbp,rbx + add rbx,rbx + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rcx,rcx + adc rdx,rdx + adc rax,0 + + sub rbx,QWORD PTR[rsi] + sbb r9,QWORD PTR[8+rsi] + sbb r10,QWORD PTR[16+rsi] + sbb r11,QWORD PTR[24+rsi] + sbb rcx,QWORD PTR[32+rsi] + sbb rdx,QWORD PTR[40+rsi] + sbb rax,0 + + mov rbx,QWORD PTR[rsp] + + not rax + + test r8,r8 + cmovz rbp,rdi + 
+ test rbx,rbx + cmovnz rax,rdi + + and rbp,1 + and rax,2 + or rax,rbp + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_sgn0_pty_mod_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mod_384x:: +sgn0_pty_mod_384x ENDP +PUBLIC vec_select_32 + + +ALIGN 32 +vec_select_32 PROC PUBLIC + DB 243,15,30,250 + + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[16+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[16+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[16+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-16))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-16))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-16)+rcx],xmm0 + pand xmm2,xmm4 + pand xmm3,xmm5 + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-16)+rcx],xmm2 + DB 0F3h,0C3h ;repret +vec_select_32 ENDP +PUBLIC vec_select_48 + + +ALIGN 32 +vec_select_48 PROC PUBLIC + DB 243,15,30,250 + + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[24+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[24+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[24+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-24))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-24))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-24)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-24))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-24))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-24)+rcx],xmm2 + pand xmm0,xmm4 + pand xmm1,xmm5 + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-24)+rcx],xmm0 + DB 0F3h,0C3h ;repret +vec_select_48 ENDP +PUBLIC vec_select_96 + + +ALIGN 32 +vec_select_96 PROC PUBLIC + DB 243,15,30,250 + + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[48+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[48+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[48+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-48))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-48))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-48)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-48))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-48))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-48)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((32+16-48))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((32+16-48))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-48)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((48+16-48))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((48+16-48))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(48-48)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((64+16-48))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((64+16-48))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(64-48)+rcx],xmm0 + pand xmm2,xmm4 + pand xmm3,xmm5 + por xmm2,xmm3 + movdqu XMMWORD PTR[(80-48)+rcx],xmm2 + DB 0F3h,0C3h ;repret +vec_select_96 ENDP +PUBLIC vec_select_192 + + +ALIGN 32 +vec_select_192 PROC PUBLIC + DB 243,15,30,250 + + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[96+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[96+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[96+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-96))+r8] + 
por xmm0,xmm1 + movdqu XMMWORD PTR[(0-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((32+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((32+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((48+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((48+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(48-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((64+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((64+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(64-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((80+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((80+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(80-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((96+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((96+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(96-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((112+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((112+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(112-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((128+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((128+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(128-96)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((144+16-96))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((144+16-96))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(144-96)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((160+16-96))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((160+16-96))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(160-96)+rcx],xmm0 + pand xmm2,xmm4 + pand xmm3,xmm5 + por xmm2,xmm3 + movdqu XMMWORD PTR[(176-96)+rcx],xmm2 + DB 0F3h,0C3h ;repret +vec_select_192 ENDP +PUBLIC vec_select_144 + + +ALIGN 32 +vec_select_144 PROC PUBLIC + DB 243,15,30,250 + + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[72+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[72+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[72+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-72))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-72))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-72)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-72))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-72))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-72)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((32+16-72))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((32+16-72))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-72)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((48+16-72))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((48+16-72))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(48-72)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((64+16-72))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((64+16-72))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(64-72)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((80+16-72))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((80+16-72))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(80-72)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((96+16-72))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((96+16-72))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(96-72)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD 
PTR[((112+16-72))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((112+16-72))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(112-72)+rcx],xmm2 + pand xmm0,xmm4 + pand xmm1,xmm5 + por xmm0,xmm1 + movdqu XMMWORD PTR[(128-72)+rcx],xmm0 + DB 0F3h,0C3h ;repret +vec_select_144 ENDP +PUBLIC vec_select_288 + + +ALIGN 32 +vec_select_288 PROC PUBLIC + DB 243,15,30,250 + + movd xmm5,r9d + pxor xmm4,xmm4 + pshufd xmm5,xmm5,0 + movdqu xmm0,XMMWORD PTR[rdx] + lea rdx,QWORD PTR[144+rdx] + pcmpeqd xmm5,xmm4 + movdqu xmm1,XMMWORD PTR[r8] + lea r8,QWORD PTR[144+r8] + pcmpeqd xmm4,xmm5 + lea rcx,QWORD PTR[144+rcx] + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((0+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((0+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(0-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((16+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((16+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(16-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((32+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((32+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(32-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((48+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((48+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(48-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((64+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((64+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(64-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((80+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((80+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(80-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((96+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((96+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(96-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((112+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((112+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(112-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((128+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((128+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(128-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((144+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((144+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(144-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((160+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((160+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(160-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((176+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((176+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(176-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((192+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((192+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(192-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((208+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((208+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(208-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((224+16-144))+rdx] + pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((224+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(224-144)+rcx],xmm0 + pand xmm2,xmm4 + movdqu xmm0,XMMWORD PTR[((240+16-144))+rdx] + pand xmm3,xmm5 + movdqu xmm1,XMMWORD PTR[((240+16-144))+r8] + por xmm2,xmm3 + movdqu XMMWORD PTR[(240-144)+rcx],xmm2 + pand xmm0,xmm4 + movdqu xmm2,XMMWORD PTR[((256+16-144))+rdx] + 
pand xmm1,xmm5 + movdqu xmm3,XMMWORD PTR[((256+16-144))+r8] + por xmm0,xmm1 + movdqu XMMWORD PTR[(256-144)+rcx],xmm0 + pand xmm2,xmm4 + pand xmm3,xmm5 + por xmm2,xmm3 + movdqu XMMWORD PTR[(272-144)+rcx],xmm2 + DB 0F3h,0C3h ;repret +vec_select_288 ENDP +PUBLIC vec_prefetch + + +ALIGN 32 +vec_prefetch PROC PUBLIC + DB 243,15,30,250 + + lea rdx,QWORD PTR[((-1))+rdx*1+rcx] + mov rax,64 + xor r8,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + cmova rax,r8 + prefetchnta [rcx] + lea rcx,QWORD PTR[rax*1+rcx] + cmp rcx,rdx + cmova rcx,rdx + prefetchnta [rcx] + DB 0F3h,0C3h ;repret +vec_prefetch ENDP +PUBLIC vec_is_zero_16x + + +ALIGN 32 +vec_is_zero_16x PROC PUBLIC + DB 243,15,30,250 + + shr edx,4 + movdqu xmm0,XMMWORD PTR[rcx] + lea rcx,QWORD PTR[16+rcx] + +$L$oop_is_zero:: + dec edx + jz $L$oop_is_zero_done + movdqu xmm1,XMMWORD PTR[rcx] + lea rcx,QWORD PTR[16+rcx] + por xmm0,xmm1 + jmp $L$oop_is_zero + +$L$oop_is_zero_done:: + pshufd xmm1,xmm0,04eh + por xmm0,xmm1 +DB 102,72,15,126,192 + inc edx + test rax,rax + cmovnz eax,edx + xor eax,1 + DB 0F3h,0C3h ;repret +vec_is_zero_16x ENDP +PUBLIC vec_is_equal_16x + + +ALIGN 32 +vec_is_equal_16x PROC PUBLIC + DB 243,15,30,250 + + shr r8d,4 + movdqu xmm0,XMMWORD PTR[rcx] + movdqu xmm1,XMMWORD PTR[rdx] + sub rdx,rcx + lea rcx,QWORD PTR[16+rcx] + pxor xmm0,xmm1 + +$L$oop_is_equal:: + dec r8d + jz $L$oop_is_equal_done + movdqu xmm1,XMMWORD PTR[rcx] + movdqu xmm2,XMMWORD PTR[rdx*1+rcx] + lea rcx,QWORD PTR[16+rcx] + pxor xmm1,xmm2 + por xmm0,xmm1 + jmp $L$oop_is_equal + +$L$oop_is_equal_done:: + pshufd xmm1,xmm0,04eh + por xmm0,xmm1 +DB 102,72,15,126,192 + inc r8d + test rax,rax + cmovnz eax,r8d + xor eax,1 + DB 0F3h,0C3h ;repret +vec_is_equal_16x ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_add_mod_384 + DD imagerel $L$SEH_body_add_mod_384 + DD imagerel $L$SEH_info_add_mod_384_prologue + + DD imagerel $L$SEH_body_add_mod_384 + DD imagerel $L$SEH_epilogue_add_mod_384 + DD imagerel $L$SEH_info_add_mod_384_body + + DD imagerel $L$SEH_epilogue_add_mod_384 + DD imagerel $L$SEH_end_add_mod_384 + DD imagerel $L$SEH_info_add_mod_384_epilogue + + DD imagerel $L$SEH_begin_add_mod_384x + DD imagerel $L$SEH_body_add_mod_384x + DD imagerel $L$SEH_info_add_mod_384x_prologue + + DD imagerel $L$SEH_body_add_mod_384x + DD imagerel $L$SEH_epilogue_add_mod_384x + DD imagerel $L$SEH_info_add_mod_384x_body + + DD imagerel $L$SEH_epilogue_add_mod_384x + DD imagerel $L$SEH_end_add_mod_384x + DD imagerel $L$SEH_info_add_mod_384x_epilogue + + DD imagerel $L$SEH_begin_rshift_mod_384 + DD imagerel $L$SEH_body_rshift_mod_384 + DD imagerel $L$SEH_info_rshift_mod_384_prologue + + DD imagerel $L$SEH_body_rshift_mod_384 + DD imagerel $L$SEH_epilogue_rshift_mod_384 + DD imagerel $L$SEH_info_rshift_mod_384_body + + DD imagerel $L$SEH_epilogue_rshift_mod_384 + DD imagerel $L$SEH_end_rshift_mod_384 + DD imagerel $L$SEH_info_rshift_mod_384_epilogue + + DD imagerel $L$SEH_begin_div_by_2_mod_384 + DD imagerel $L$SEH_body_div_by_2_mod_384 + DD imagerel $L$SEH_info_div_by_2_mod_384_prologue + + DD imagerel $L$SEH_body_div_by_2_mod_384 + DD 
imagerel $L$SEH_epilogue_div_by_2_mod_384 + DD imagerel $L$SEH_info_div_by_2_mod_384_body + + DD imagerel $L$SEH_epilogue_div_by_2_mod_384 + DD imagerel $L$SEH_end_div_by_2_mod_384 + DD imagerel $L$SEH_info_div_by_2_mod_384_epilogue + + DD imagerel $L$SEH_begin_lshift_mod_384 + DD imagerel $L$SEH_body_lshift_mod_384 + DD imagerel $L$SEH_info_lshift_mod_384_prologue + + DD imagerel $L$SEH_body_lshift_mod_384 + DD imagerel $L$SEH_epilogue_lshift_mod_384 + DD imagerel $L$SEH_info_lshift_mod_384_body + + DD imagerel $L$SEH_epilogue_lshift_mod_384 + DD imagerel $L$SEH_end_lshift_mod_384 + DD imagerel $L$SEH_info_lshift_mod_384_epilogue + + DD imagerel $L$SEH_begin_mul_by_3_mod_384 + DD imagerel $L$SEH_body_mul_by_3_mod_384 + DD imagerel $L$SEH_info_mul_by_3_mod_384_prologue + + DD imagerel $L$SEH_body_mul_by_3_mod_384 + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384 + DD imagerel $L$SEH_info_mul_by_3_mod_384_body + + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384 + DD imagerel $L$SEH_end_mul_by_3_mod_384 + DD imagerel $L$SEH_info_mul_by_3_mod_384_epilogue + + DD imagerel $L$SEH_begin_mul_by_8_mod_384 + DD imagerel $L$SEH_body_mul_by_8_mod_384 + DD imagerel $L$SEH_info_mul_by_8_mod_384_prologue + + DD imagerel $L$SEH_body_mul_by_8_mod_384 + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384 + DD imagerel $L$SEH_info_mul_by_8_mod_384_body + + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384 + DD imagerel $L$SEH_end_mul_by_8_mod_384 + DD imagerel $L$SEH_info_mul_by_8_mod_384_epilogue + + DD imagerel $L$SEH_begin_mul_by_3_mod_384x + DD imagerel $L$SEH_body_mul_by_3_mod_384x + DD imagerel $L$SEH_info_mul_by_3_mod_384x_prologue + + DD imagerel $L$SEH_body_mul_by_3_mod_384x + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384x + DD imagerel $L$SEH_info_mul_by_3_mod_384x_body + + DD imagerel $L$SEH_epilogue_mul_by_3_mod_384x + DD imagerel $L$SEH_end_mul_by_3_mod_384x + DD imagerel $L$SEH_info_mul_by_3_mod_384x_epilogue + + DD imagerel $L$SEH_begin_mul_by_8_mod_384x + DD imagerel $L$SEH_body_mul_by_8_mod_384x + DD imagerel $L$SEH_info_mul_by_8_mod_384x_prologue + + DD imagerel $L$SEH_body_mul_by_8_mod_384x + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384x + DD imagerel $L$SEH_info_mul_by_8_mod_384x_body + + DD imagerel $L$SEH_epilogue_mul_by_8_mod_384x + DD imagerel $L$SEH_end_mul_by_8_mod_384x + DD imagerel $L$SEH_info_mul_by_8_mod_384x_epilogue + + DD imagerel $L$SEH_begin_cneg_mod_384 + DD imagerel $L$SEH_body_cneg_mod_384 + DD imagerel $L$SEH_info_cneg_mod_384_prologue + + DD imagerel $L$SEH_body_cneg_mod_384 + DD imagerel $L$SEH_epilogue_cneg_mod_384 + DD imagerel $L$SEH_info_cneg_mod_384_body + + DD imagerel $L$SEH_epilogue_cneg_mod_384 + DD imagerel $L$SEH_end_cneg_mod_384 + DD imagerel $L$SEH_info_cneg_mod_384_epilogue + + DD imagerel $L$SEH_begin_sub_mod_384 + DD imagerel $L$SEH_body_sub_mod_384 + DD imagerel $L$SEH_info_sub_mod_384_prologue + + DD imagerel $L$SEH_body_sub_mod_384 + DD imagerel $L$SEH_epilogue_sub_mod_384 + DD imagerel $L$SEH_info_sub_mod_384_body + + DD imagerel $L$SEH_epilogue_sub_mod_384 + DD imagerel $L$SEH_end_sub_mod_384 + DD imagerel $L$SEH_info_sub_mod_384_epilogue + + DD imagerel $L$SEH_begin_sub_mod_384x + DD imagerel $L$SEH_body_sub_mod_384x + DD imagerel $L$SEH_info_sub_mod_384x_prologue + + DD imagerel $L$SEH_body_sub_mod_384x + DD imagerel $L$SEH_epilogue_sub_mod_384x + DD imagerel $L$SEH_info_sub_mod_384x_body + + DD imagerel $L$SEH_epilogue_sub_mod_384x + DD imagerel $L$SEH_end_sub_mod_384x + DD imagerel $L$SEH_info_sub_mod_384x_epilogue + + DD imagerel 
$L$SEH_begin_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_body_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_prologue + + DD imagerel $L$SEH_body_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_epilogue_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_body + + DD imagerel $L$SEH_epilogue_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_end_mul_by_1_plus_i_mod_384x + DD imagerel $L$SEH_info_mul_by_1_plus_i_mod_384x_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mod_384 + DD imagerel $L$SEH_body_sgn0_pty_mod_384 + DD imagerel $L$SEH_info_sgn0_pty_mod_384_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mod_384 + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384 + DD imagerel $L$SEH_info_sgn0_pty_mod_384_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384 + DD imagerel $L$SEH_end_sgn0_pty_mod_384 + DD imagerel $L$SEH_info_sgn0_pty_mod_384_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mod_384x + DD imagerel $L$SEH_body_sgn0_pty_mod_384x + DD imagerel $L$SEH_info_sgn0_pty_mod_384x_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mod_384x + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384x + DD imagerel $L$SEH_info_sgn0_pty_mod_384x_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mod_384x + DD imagerel $L$SEH_end_sgn0_pty_mod_384x + DD imagerel $L$SEH_info_sgn0_pty_mod_384x_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_add_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_add_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_add_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_add_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_add_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_add_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_rshift_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_rshift_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_rshift_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_div_by_2_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_div_by_2_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_div_by_2_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 
000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_lshift_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_lshift_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_lshift_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_3_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_mul_by_3_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_mul_by_3_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_8_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_mul_by_8_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_mul_by_8_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_3_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_mul_by_3_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_mul_by_3_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_8_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_mul_by_8_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_mul_by_8_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_cneg_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_cneg_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_cneg_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sub_mod_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 
000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sub_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sub_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sub_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_by_1_plus_i_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_mul_by_1_plus_i_mod_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,007h,000h +DB 000h,0e4h,008h,000h +DB 000h,0d4h,009h,000h +DB 000h,0c4h,00ah,000h +DB 000h,034h,00bh,000h +DB 000h,054h,00ch,000h +DB 000h,074h,00eh,000h +DB 000h,064h,00fh,000h +DB 000h,0c2h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_mul_by_1_plus_i_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sgn0_pty_mod_384_body:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sgn0_pty_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mod_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sgn0_pty_mod_384x_body:: +DB 1,0,9,0 +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sgn0_pty_mod_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm b/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm new file mode 100644 index 00000000000..59b51a910ce --- /dev/null +++ b/crypto/blst_src/build/win64/add_mod_384x384-x86_64.asm @@ -0,0 +1,338 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + + +ALIGN 32 +__add_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + add r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + adc r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + adc r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + adc r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + adc r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + adc r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + adc r14,QWORD PTR[48+rdx] + mov QWORD PTR[8+rdi],r9 + adc r15,QWORD PTR[56+rdx] + mov QWORD PTR[16+rdi],r10 + adc rax,QWORD PTR[64+rdx] + mov QWORD PTR[32+rdi],r12 + mov r8,r14 + adc rbx,QWORD PTR[72+rdx] + mov QWORD PTR[24+rdi],r11 + mov r9,r15 + adc rbp,QWORD PTR[80+rdx] + mov QWORD PTR[40+rdi],r13 + mov r10,rax + adc 
rsi,QWORD PTR[88+rdx] + mov r11,rbx + sbb rdx,rdx + + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov r12,rbp + sbb rax,QWORD PTR[16+rcx] + sbb rbx,QWORD PTR[24+rcx] + sbb rbp,QWORD PTR[32+rcx] + mov r13,rsi + sbb rsi,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r14,r8 + cmovc r15,r9 + cmovc rax,r10 + mov QWORD PTR[48+rdi],r14 + cmovc rbx,r11 + mov QWORD PTR[56+rdi],r15 + cmovc rbp,r12 + mov QWORD PTR[64+rdi],rax + cmovc rsi,r13 + mov QWORD PTR[72+rdi],rbx + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__add_mod_384x384 ENDP + + +ALIGN 32 +__sub_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + sub r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + sbb r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + sbb r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + sbb r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + sbb r14,QWORD PTR[48+rdx] + mov r8,QWORD PTR[rcx] + mov QWORD PTR[8+rdi],r9 + sbb r15,QWORD PTR[56+rdx] + mov r9,QWORD PTR[8+rcx] + mov QWORD PTR[16+rdi],r10 + sbb rax,QWORD PTR[64+rdx] + mov r10,QWORD PTR[16+rcx] + mov QWORD PTR[24+rdi],r11 + sbb rbx,QWORD PTR[72+rdx] + mov r11,QWORD PTR[24+rcx] + mov QWORD PTR[32+rdi],r12 + sbb rbp,QWORD PTR[80+rdx] + mov r12,QWORD PTR[32+rcx] + mov QWORD PTR[40+rdi],r13 + sbb rsi,QWORD PTR[88+rdx] + mov r13,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r8,rdx + and r9,rdx + and r10,rdx + and r11,rdx + and r12,rdx + and r13,rdx + + add r14,r8 + adc r15,r9 + mov QWORD PTR[48+rdi],r14 + adc rax,r10 + mov QWORD PTR[56+rdi],r15 + adc rbx,r11 + mov QWORD PTR[64+rdi],rax + adc rbp,r12 + mov QWORD PTR[72+rdi],rbx + adc rsi,r13 + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__sub_mod_384x384 ENDP + +PUBLIC add_mod_384x384 + + +ALIGN 32 +add_mod_384x384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_add_mod_384x384:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_add_mod_384x384:: + + + call __add_mod_384x384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_add_mod_384x384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_add_mod_384x384:: +add_mod_384x384 ENDP + +PUBLIC sub_mod_384x384 + + +ALIGN 32 +sub_mod_384x384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sub_mod_384x384:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sub_mod_384x384:: + + + call __sub_mod_384x384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sub_mod_384x384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD 
PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sub_mod_384x384:: +sub_mod_384x384 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_add_mod_384x384 + DD imagerel $L$SEH_body_add_mod_384x384 + DD imagerel $L$SEH_info_add_mod_384x384_prologue + + DD imagerel $L$SEH_body_add_mod_384x384 + DD imagerel $L$SEH_epilogue_add_mod_384x384 + DD imagerel $L$SEH_info_add_mod_384x384_body + + DD imagerel $L$SEH_epilogue_add_mod_384x384 + DD imagerel $L$SEH_end_add_mod_384x384 + DD imagerel $L$SEH_info_add_mod_384x384_epilogue + + DD imagerel $L$SEH_begin_sub_mod_384x384 + DD imagerel $L$SEH_body_sub_mod_384x384 + DD imagerel $L$SEH_info_sub_mod_384x384_prologue + + DD imagerel $L$SEH_body_sub_mod_384x384 + DD imagerel $L$SEH_epilogue_sub_mod_384x384 + DD imagerel $L$SEH_info_sub_mod_384x384_body + + DD imagerel $L$SEH_epilogue_sub_mod_384x384 + DD imagerel $L$SEH_end_sub_mod_384x384 + DD imagerel $L$SEH_info_sub_mod_384x384_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_add_mod_384x384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_add_mod_384x384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_add_mod_384x384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sub_mod_384x384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sub_mod_384x384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sub_mod_384x384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/blst.def b/crypto/blst_src/build/win64/blst.def new file mode 100644 index 00000000000..dda95336a93 --- /dev/null +++ b/crypto/blst_src/build/win64/blst.def @@ -0,0 +1,221 @@ +LIBRARY blst + +EXPORTS + blst_scalar_from_uint32 + blst_uint32_from_scalar + blst_scalar_from_uint64 + blst_uint64_from_scalar + blst_scalar_from_bendian + blst_bendian_from_scalar + blst_scalar_from_lendian + blst_lendian_from_scalar + blst_scalar_fr_check + blst_sk_check + blst_sk_add_n_check + blst_sk_sub_n_check + blst_sk_mul_n_check + blst_sk_inverse + blst_scalar_from_le_bytes + blst_scalar_from_be_bytes + blst_fr_add + blst_fr_sub + blst_fr_mul_by_3 + blst_fr_lshift + blst_fr_rshift + blst_fr_mul + blst_fr_sqr + blst_fr_cneg + blst_fr_eucl_inverse + blst_fr_inverse + blst_fr_from_uint64 + blst_uint64_from_fr + blst_fr_from_scalar + blst_scalar_from_fr + blst_fp_add + blst_fp_sub + blst_fp_mul_by_3 + blst_fp_mul_by_8 + blst_fp_lshift + blst_fp_mul + blst_fp_sqr + blst_fp_cneg + blst_fp_eucl_inverse + blst_fp_inverse + blst_fp_sqrt + blst_fp_from_uint32 + blst_uint32_from_fp + blst_fp_from_uint64 + blst_uint64_from_fp + blst_fp_from_bendian + blst_bendian_from_fp + blst_fp_from_lendian + blst_lendian_from_fp + blst_fp2_add + blst_fp2_sub + blst_fp2_mul_by_3 + blst_fp2_mul_by_8 + blst_fp2_lshift + blst_fp2_mul + blst_fp2_sqr + blst_fp2_cneg + 
blst_fp2_eucl_inverse + blst_fp2_inverse + blst_fp2_sqrt + blst_fp12_sqr + blst_fp12_cyclotomic_sqr + blst_fp12_mul + blst_fp12_mul_by_xy00z0 + blst_fp12_conjugate + blst_fp12_inverse + blst_fp12_frobenius_map + blst_fp12_is_equal + blst_fp12_is_one + blst_fp12_in_group + blst_fp12_one + blst_p1_add + blst_p1_add_or_double + blst_p1_add_affine + blst_p1_add_or_double_affine + blst_p1_double + blst_p1_mult + blst_p1_cneg + blst_p1_to_affine + blst_p1_from_affine + blst_p1_on_curve + blst_p1_in_g1 + blst_p1_is_equal + blst_p1_is_inf + blst_p1_generator + blst_p1_affine_on_curve + blst_p1_affine_in_g1 + blst_p1_affine_is_equal + blst_p1_affine_is_inf + blst_p1_affine_generator + blst_p2_add + blst_p2_add_or_double + blst_p2_add_affine + blst_p2_add_or_double_affine + blst_p2_double + blst_p2_mult + blst_p2_cneg + blst_p2_to_affine + blst_p2_from_affine + blst_p2_on_curve + blst_p2_in_g2 + blst_p2_is_equal + blst_p2_is_inf + blst_p2_generator + blst_p2_affine_on_curve + blst_p2_affine_in_g2 + blst_p2_affine_is_equal + blst_p2_affine_is_inf + blst_p2_affine_generator + blst_p1s_to_affine + blst_p1s_add + blst_p1s_mult_wbits_precompute_sizeof + blst_p1s_mult_wbits_precompute + blst_p1s_mult_wbits_scratch_sizeof + blst_p1s_mult_wbits + blst_p1s_mult_pippenger_scratch_sizeof + blst_p1s_mult_pippenger + blst_p1s_tile_pippenger + blst_p2s_to_affine + blst_p2s_add + blst_p2s_mult_wbits_precompute_sizeof + blst_p2s_mult_wbits_precompute + blst_p2s_mult_wbits_scratch_sizeof + blst_p2s_mult_wbits + blst_p2s_mult_pippenger_scratch_sizeof + blst_p2s_mult_pippenger + blst_p2s_tile_pippenger + blst_map_to_g1 + blst_map_to_g2 + blst_encode_to_g1 + blst_hash_to_g1 + blst_encode_to_g2 + blst_hash_to_g2 + blst_p1_serialize + blst_p1_compress + blst_p1_affine_serialize + blst_p1_affine_compress + blst_p1_uncompress + blst_p1_deserialize + blst_p2_serialize + blst_p2_compress + blst_p2_affine_serialize + blst_p2_affine_compress + blst_p2_uncompress + blst_p2_deserialize + blst_keygen + blst_sk_to_pk_in_g1 + blst_sign_pk_in_g1 + blst_sk_to_pk_in_g2 + blst_sign_pk_in_g2 + blst_miller_loop + blst_miller_loop_n + blst_final_exp + blst_precompute_lines + blst_miller_loop_lines + blst_fp12_finalverify + blst_pairing_sizeof + blst_pairing_init + blst_pairing_get_dst + blst_pairing_commit + blst_pairing_aggregate_pk_in_g2 + blst_pairing_chk_n_aggr_pk_in_g2 + blst_pairing_mul_n_aggregate_pk_in_g2 + blst_pairing_chk_n_mul_n_aggr_pk_in_g2 + blst_pairing_aggregate_pk_in_g1 + blst_pairing_chk_n_aggr_pk_in_g1 + blst_pairing_mul_n_aggregate_pk_in_g1 + blst_pairing_chk_n_mul_n_aggr_pk_in_g1 + blst_pairing_merge + blst_pairing_finalverify + blst_aggregate_in_g1 + blst_aggregate_in_g2 + blst_aggregated_in_g1 + blst_aggregated_in_g2 + blst_core_verify_pk_in_g1 + blst_core_verify_pk_in_g2 + BLS12_381_G1 + BLS12_381_NEG_G1 + BLS12_381_G2 + BLS12_381_NEG_G2 + blst_fr_ct_bfly + blst_fr_gs_bfly + blst_fr_to + blst_fr_from + blst_fp_to + blst_fp_from + blst_fp_is_square + blst_fp2_is_square + blst_p1_from_jacobian + blst_p2_from_jacobian + blst_sk_to_pk2_in_g1 + blst_sign_pk2_in_g1 + blst_sk_to_pk2_in_g2 + blst_sign_pk2_in_g2 + blst_uniq_sizeof + blst_uniq_init + blst_uniq_test + blst_expand_message_xmd + blst_p1_unchecked_mult + blst_p2_unchecked_mult + blst_pairing_raw_aggregate + blst_pairing_as_fp12 + blst_bendian_from_fp12 + blst_keygen_v3 + blst_keygen_v4_5 + blst_keygen_v5 + blst_derive_master_eip2333 + blst_derive_child_eip2333 + blst_scalar_from_hexascii + blst_fr_from_hexascii + blst_fp_from_hexascii + blst_p1_sizeof + 
blst_p1_affine_sizeof + blst_p2_sizeof + blst_p2_affine_sizeof + blst_fp12_sizeof + blst_sha256 + diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm b/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm new file mode 100644 index 00000000000..a4467904612 --- /dev/null +++ b/crypto/blst_src/build/win64/ct_inverse_mod_256-armv8.asm @@ -0,0 +1,786 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + + EXPORT |ct_inverse_mod_256|[FUNC] + ALIGN 32 +|ct_inverse_mod_256| PROC + DCDU 3573752639 + stp x29, x30, [sp,#-80]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + sub sp, sp, #1040 + + ldp x4, x5, [x1,#8*0] + ldp x6, x7, [x1,#8*2] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... + str x0, [sp] + + ldp x8, x9, [x2,#8*0] + ldp x10, x11, [x2,#8*2] + + stp x4, x5, [x1,#8*0] // copy input to |a| + stp x6, x7, [x1,#8*2] + stp x8, x9, [x1,#8*4] // copy modulus to |b| + stp x10, x11, [x1,#8*6] + + ////////////////////////////////////////// first iteration + bl |$Lab_approximation_31_256_loaded| + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + str x12,[x0,#8*8] // initialize |u| with |f0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to dst |b| + bl __smul_256_n_shift_by_31 + str x12, [x0,#8*9] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + ldr x8, [x1,#8*8] // |u| + ldr x9, [x1,#8*13] // |v| + madd x4, x16, x8, xzr // |u|*|f0| + madd x4, x17, x9, x4 // |v|*|g0| + str x4, [x0,#8*4] + asr x5, x4, #63 // sign extension + stp x5, x5, [x0,#8*5] + stp x5, x5, [x0,#8*7] + + madd x4, x12, x8, xzr // |u|*|f1| + madd x4, x13, x9, x4 // |v|*|g1| + str x4, [x0,#8*9] + asr x5, x4, #63 // sign extension + stp x5, x5, [x0,#8*10] + stp x5, x5, [x0,#8*12] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, 
[x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + adc x22, x22, x23 + stp x22, x22, [x0,#8*4] + stp x22, x22, [x0,#8*6] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + 
mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + bl __ab_approximation_31_256 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_256_n_shift_by_31 + mov x16, x12 // corrected |f0| + mov x17, x13 // corrected |g0| + + mov x12, x14 // |f1| + mov x13, x15 // |g1| + add x0, x0, #8*4 // pointer to destination |b| + bl __smul_256_n_shift_by_31 + + add x0, x0, #8*4 // pointer to destination |u| + bl __smul_256x63 + adc x22, x22, x23 + str x22, [x0,#8*4] + + mov x16, x12 // corrected |f1| + mov x17, x13 // corrected |g1| + add x0, x0, #8*5 // pointer to destination |v| + bl __smul_256x63 + bl __smul_512x63_tail + ////////////////////////////////////////// two[!] 
last iterations + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #47 // 31 + 512 % 31 + //bl __ab_approximation_62_256 // |a| and |b| are exact, + ldr x7, [x1,#8*0] // just load + ldr x11, [x1,#8*4] + bl __inner_loop_62_256 + + mov x16, x14 + mov x17, x15 + ldr x0, [sp] // original out_ptr + bl __smul_256x63 + bl __smul_512x63_tail + ldr x30, [x29,#8] + + smulh x20, x7, x17 // figure out top-most limb + ldp x8, x9, [x3,#8*0] + adc x23, x23, x25 + ldp x10, x11, [x3,#8*2] + + add x20, x20, x23 // x20 is 1, 0 or -1 + asr x19, x20, #63 // sign as mask + + and x23, x8, x19 // add mod<<256 conditionally + and x24, x9, x19 + adds x4, x4, x23 + and x25, x10, x19 + adcs x5, x5, x24 + and x26, x11, x19 + adcs x6, x6, x25 + adcs x7, x22, x26 + adc x20, x20, xzr // x20 is 1, 0 or -1 + + neg x19, x20 + orr x20, x20, x19 // excess bit or sign as mask + asr x19, x19, #63 // excess bit as mask + + and x8, x8, x20 // mask |mod| + and x9, x9, x20 + and x10, x10, x20 + and x11, x11, x20 + + eor x8, x8, x19 // conditionally negate |mod| + eor x9, x9, x19 + adds x8, x8, x19, lsr#63 + eor x10, x10, x19 + adcs x9, x9, xzr + eor x11, x11, x19 + adcs x10, x10, xzr + adc x11, x11, xzr + + adds x4, x4, x8 // final adjustment for |mod|<<256 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*4] + adc x7, x7, x11 + stp x6, x7, [x0,#8*6] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldr x29, [sp],#80 + DCDU 3573752767 + ret + ENDP + +//////////////////////////////////////////////////////////////////////// + + ALIGN 32 +|__smul_256x63| PROC + ldp x4, x5, [x1,#8*0+64] // load |u| (or |v|) + asr x14, x16, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x6, x7, [x1,#8*2+64] + eor x16, x16, x14 // conditionally negate |f_| (or |g_|) + ldr x22, [x1,#8*4+64] + + eor x4, x4, x14 // conditionally negate |u| (or |v|) + sub x16, x16, x14 + eor x5, x5, x14 + adds x4, x4, x14, lsr#63 + eor x6, x6, x14 + adcs x5, x5, xzr + eor x7, x7, x14 + adcs x6, x6, xzr + eor x22, x22, x14 + umulh x19, x4, x16 + adcs x7, x7, xzr + umulh x20, x5, x16 + adcs x22, x22, xzr + umulh x21, x6, x16 + mul x4, x4, x16 + cmp x16, #0 + mul x5, x5, x16 + cselne x22,x22,xzr + mul x6, x6, x16 + adds x5, x5, x19 + mul x24, x7, x16 + adcs x6, x6, x20 + adcs x24, x24, x21 + adc x26, xzr, xzr + ldp x8, x9, [x1,#8*0+104] // load |u| (or |v|) + asr x14, x17, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x10, x11, [x1,#8*2+104] + eor x17, x17, x14 // conditionally negate |f_| (or |g_|) + ldr x23, [x1,#8*4+104] + + eor x8, x8, x14 // conditionally negate |u| (or |v|) + sub x17, x17, x14 + eor x9, x9, x14 + adds x8, x8, x14, lsr#63 + eor x10, x10, x14 + adcs x9, x9, xzr + eor x11, x11, x14 + adcs x10, x10, xzr + eor x23, x23, x14 + umulh x19, x8, x17 + adcs x11, x11, xzr + umulh x20, x9, x17 + adcs x23, x23, xzr + umulh x21, x10, x17 + adc x15, xzr, xzr // used in __smul_512x63_tail + mul x8, x8, x17 + cmp x17, #0 + mul x9, x9, x17 + cselne x23,x23,xzr + mul x10, x10, x17 + adds x9, x9, x19 + mul x25, x11, x17 + adcs x10, x10, x20 + adcs x25, x25, x21 + adc x26, x26, xzr + + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + stp x4, x5, [x0,#8*0] + adcs x24, x24, x25 + stp x6, x24, [x0,#8*2] + + ret + ENDP + + + ALIGN 32 +|__smul_512x63_tail| PROC + umulh x24, x7, x16 + ldp x5, x6, [x1,#8*18] // load rest of |v| + adc x26, x26, xzr + ldr x7, [x1,#8*20] + and x22, x22, x16 + + umulh x11, x11, x17 // resume |v|*|g1| chain + + sub x24, x24, x22 // tie up |u|*|f1| chain + asr 
x25, x24, #63 + + eor x5, x5, x14 // conditionally negate rest of |v| + eor x6, x6, x14 + adds x5, x5, x15 + eor x7, x7, x14 + adcs x6, x6, xzr + umulh x19, x23, x17 + adc x7, x7, xzr + umulh x20, x5, x17 + add x11, x11, x26 + umulh x21, x6, x17 + + mul x4, x23, x17 + mul x5, x5, x17 + adds x4, x4, x11 + mul x6, x6, x17 + adcs x5, x5, x19 + mul x22, x7, x17 + adcs x6, x6, x20 + adcs x22, x22, x21 + adc x23, xzr, xzr // used in the final step + + adds x4, x4, x24 + adcs x5, x5, x25 + adcs x6, x6, x25 + stp x4, x5, [x0,#8*4] + adcs x22, x22, x25 // carry is used in the final step + stp x6, x22, [x0,#8*6] + + ret + ENDP + + + ALIGN 32 +|__smul_256_n_shift_by_31| PROC + ldp x4, x5, [x1,#8*0+0] // load |a| (or |b|) + asr x24, x12, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x6, x7, [x1,#8*2+0] + eor x25, x12, x24 // conditionally negate |f0| (or |g0|) + + eor x4, x4, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x5, x5, x24 + adds x4, x4, x24, lsr#63 + eor x6, x6, x24 + adcs x5, x5, xzr + eor x7, x7, x24 + umulh x19, x4, x25 + adcs x6, x6, xzr + umulh x20, x5, x25 + adc x7, x7, xzr + umulh x21, x6, x25 + and x24, x24, x25 + umulh x22, x7, x25 + neg x24, x24 + + mul x4, x4, x25 + mul x5, x5, x25 + mul x6, x6, x25 + adds x5, x5, x19 + mul x7, x7, x25 + adcs x6, x6, x20 + adcs x7, x7, x21 + adc x22, x22, x24 + ldp x8, x9, [x1,#8*0+32] // load |a| (or |b|) + asr x24, x13, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x10, x11, [x1,#8*2+32] + eor x25, x13, x24 // conditionally negate |f0| (or |g0|) + + eor x8, x8, x24 // conditionally negate |a| (or |b|) + sub x25, x25, x24 + eor x9, x9, x24 + adds x8, x8, x24, lsr#63 + eor x10, x10, x24 + adcs x9, x9, xzr + eor x11, x11, x24 + umulh x19, x8, x25 + adcs x10, x10, xzr + umulh x20, x9, x25 + adc x11, x11, xzr + umulh x21, x10, x25 + and x24, x24, x25 + umulh x23, x11, x25 + neg x24, x24 + + mul x8, x8, x25 + mul x9, x9, x25 + mul x10, x10, x25 + adds x9, x9, x19 + mul x11, x11, x25 + adcs x10, x10, x20 + adcs x11, x11, x21 + adc x23, x23, x24 + adds x4, x4, x8 + adcs x5, x5, x9 + adcs x6, x6, x10 + adcs x7, x7, x11 + adc x8, x22, x23 + + extr x4, x5, x4, #31 + extr x5, x6, x5, #31 + extr x6, x7, x6, #31 + asr x23, x8, #63 // result's sign as mask + extr x7, x8, x7, #31 + + eor x4, x4, x23 // ensure the result is positive + eor x5, x5, x23 + adds x4, x4, x23, lsr#63 + eor x6, x6, x23 + adcs x5, x5, xzr + eor x7, x7, x23 + adcs x6, x6, xzr + stp x4, x5, [x0,#8*0] + adc x7, x7, xzr + stp x6, x7, [x0,#8*2] + + eor x12, x12, x23 // adjust |f/g| accordingly + eor x13, x13, x23 + sub x12, x12, x23 + sub x13, x13, x23 + + ret + ENDP + + ALIGN 16 +|__ab_approximation_31_256| PROC + ldp x6, x7, [x1,#8*2] + ldp x10, x11, [x1,#8*6] + ldp x4, x5, [x1,#8*0] + ldp x8, x9, [x1,#8*4] + +|$Lab_approximation_31_256_loaded| + orr x19, x7, x11 // check top-most limbs, ... + cmp x19, #0 + cselne x7,x7,x6 + cselne x11,x11,x10 + cselne x6,x6,x5 + orr x19, x7, x11 // and ones before top-most, ... + cselne x10,x10,x9 + + cmp x19, #0 + cselne x7,x7,x6 + cselne x11,x11,x10 + cselne x6,x6,x4 + orr x19, x7, x11 // and one more, ... 
+ cselne x10,x10,x8 + + clz x19, x19 + cmp x19, #64 + cselne x19,x19,xzr + cselne x7,x7,x6 + cselne x11,x11,x10 + neg x20, x19 + + lslv x7, x7, x19 // align high limbs to the left + lslv x11, x11, x19 + lsrv x6, x6, x20 + lsrv x10, x10, x20 + and x6, x6, x20, asr#6 + and x10, x10, x20, asr#6 + orr x7, x7, x6 + orr x11, x11, x10 + + bfxil x7, x4, #0, #31 + bfxil x11, x8, #0, #31 + + b __inner_loop_31_256 + ret + ENDP + + + ALIGN 16 +|__inner_loop_31_256| PROC + mov x2, #31 + mov x13, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x15, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x23,#0x7FFFFFFF7FFFFFFF + +|$Loop_31_256| + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x15 + cselhs x11,x11,x7 + cselhs x7,x21,x20 + cselhs x15,x15,x13 + cselhs x13,x13,x19 + lsr x7, x7, #1 + and x19, x15, x22 + and x20, x23, x22 + sub x13, x13, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x15, x15, x15 // |f1|<<=1 + add x13, x13, x20 + sub x15, x15, x23 + cbnz x2, |$Loop_31_256| + + mov x23, #0x7FFFFFFF + ubfx x12, x13, #0, #32 + ubfx x13, x13, #32, #32 + ubfx x14, x15, #0, #32 + ubfx x15, x15, #32, #32 + sub x12, x12, x23 // remove bias + sub x13, x13, x23 + sub x14, x14, x23 + sub x15, x15, x23 + + ret + ENDP + + + ALIGN 16 +|__inner_loop_62_256| PROC + mov x12, #1 // |f0|=1 + mov x13, #0 // |g0|=0 + mov x14, #0 // |f1|=0 + mov x15, #1 // |g1|=1 + +|$Loop_62_256| + sbfx x22, x7, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + and x19, x11, x22 + sub x20, x11, x7 // |b_|-|a_| + subs x21, x7, x19 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x19, x12 + cselhs x11,x11,x7 + cselhs x7,x21,x20 + mov x20, x13 + cselhs x12,x12,x14 + cselhs x14,x14,x19 + cselhs x13,x13,x15 + cselhs x15,x15,x20 + lsr x7, x7, #1 + and x19, x14, x22 + and x20, x15, x22 + add x14, x14, x14 // |f1|<<=1 + add x15, x15, x15 // |g1|<<=1 + sub x12, x12, x19 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x13, x13, x20 // |g0|-=|g1| (or |g0-=0| ...) 
+ cbnz x2, |$Loop_62_256| + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm b/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm new file mode 100644 index 00000000000..5cd09a1d8f2 --- /dev/null +++ b/crypto/blst_src/build/win64/ct_inverse_mod_256-x86_64.asm @@ -0,0 +1,1220 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC ct_inverse_mod_256 + + +ALIGN 32 +ct_inverse_mod_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_ct_inverse_mod_256:: + + + push rbp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,1072 + +$L$SEH_body_ct_inverse_mod_256:: + + + lea rax,QWORD PTR[((48+511))+rsp] + and rax,-512 + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[40+rsp],rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + + mov r12,QWORD PTR[rdx] + mov r13,QWORD PTR[8+rdx] + mov r14,QWORD PTR[16+rdx] + mov r15,QWORD PTR[24+rdx] + + mov QWORD PTR[rax],r8 + mov QWORD PTR[8+rax],r9 + mov QWORD PTR[16+rax],r10 + mov QWORD PTR[24+rax],r11 + + mov QWORD PTR[32+rax],r12 + mov QWORD PTR[40+rax],r13 + mov QWORD PTR[48+rax],r14 + mov QWORD PTR[56+rax],r15 + mov rsi,rax + + + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + + + mov QWORD PTR[64+rdi],rdx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + + + mov QWORD PTR[72+rdi],rdx + + + xor rsi,256 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + + + + mov r8,QWORD PTR[64+rsi] + mov r12,QWORD PTR[104+rsi] + mov r9,r8 + imul r8,QWORD PTR[rsp] + mov r13,r12 + imul r12,QWORD PTR[8+rsp] + add r8,r12 + mov QWORD PTR[32+rdi],r8 + sar r8,63 + mov QWORD PTR[40+rdi],r8 + mov QWORD PTR[48+rdi],r8 + mov QWORD PTR[56+rdi],r8 + mov QWORD PTR[64+rdi],r8 + lea rsi,QWORD PTR[64+rsi] + + imul r9,rdx + imul r13,rcx + add r9,r13 + mov QWORD PTR[72+rdi],r9 + sar r9,63 + mov QWORD PTR[80+rdi],r9 + mov QWORD PTR[88+rdi],r9 + mov QWORD PTR[96+rdi],r9 + mov QWORD PTR[104+rdi],r9 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD 
PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD 
PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_256x63 + sar rbp,63 + mov QWORD PTR[40+rdi],rbp + mov QWORD PTR[48+rdi],rbp + mov QWORD PTR[56+rdi],rbp + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + xor rsi,256+8*8 + mov edx,31 + call __ab_approximation_31_256 + + + mov QWORD PTR[16+rsp],r12 + mov QWORD PTR[24+rsp],r13 + 
+ mov rdi,256 + xor rdi,rsi + call __smulq_256_n_shift_by_31 + mov QWORD PTR[rsp],rdx + mov QWORD PTR[8+rsp],rcx + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256_n_shift_by_31 + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[24+rsp],rcx + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[64+rsi] + lea rdi,QWORD PTR[32+rdi] + call __smulq_256x63 + + mov rdx,QWORD PTR[16+rsp] + mov rcx,QWORD PTR[24+rsp] + lea rdi,QWORD PTR[40+rdi] + call __smulq_512x63 + + xor rsi,256+8*8 + mov edx,47 + + mov r8,QWORD PTR[rsi] + + mov r10,QWORD PTR[32+rsi] + + call __inner_loop_62_256 + + + + + + + + lea rsi,QWORD PTR[64+rsi] + + + + + + mov rdx,r12 + mov rcx,r13 + mov rdi,QWORD PTR[32+rsp] + call __smulq_512x63 + adc rdx,rbp + + mov rsi,QWORD PTR[40+rsp] + mov rax,rdx + sar rdx,63 + + mov r8,rdx + mov r9,rdx + and r8,QWORD PTR[rsi] + mov r10,rdx + and r9,QWORD PTR[8+rsi] + and r10,QWORD PTR[16+rsi] + and rdx,QWORD PTR[24+rsi] + + add r12,r8 + adc r13,r9 + adc r14,r10 + adc r15,rdx + adc rax,0 + + mov rdx,rax + neg rax + or rdx,rax + sar rax,63 + + mov r8,rdx + mov r9,rdx + and r8,QWORD PTR[rsi] + mov r10,rdx + and r9,QWORD PTR[8+rsi] + and r10,QWORD PTR[16+rsi] + and rdx,QWORD PTR[24+rsi] + + xor r8,rax + xor rcx,rcx + xor r9,rax + sub rcx,rax + xor r10,rax + xor rdx,rax + add r8,rcx + adc r9,0 + adc r10,0 + adc rdx,0 + + add r12,r8 + adc r13,r9 + adc r14,r10 + adc r15,rdx + + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + + lea r8,QWORD PTR[1072+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_ct_inverse_mod_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_ct_inverse_mod_256:: +ct_inverse_mod_256 ENDP + +ALIGN 32 +__smulq_512x63 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov rbp,QWORD PTR[32+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor rbp,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc rbp,0 + + mul rbx + mov QWORD PTR[rdi],rax + mov rax,r9 + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov QWORD PTR[8+rdi],r9 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov QWORD PTR[16+rdi],r10 + mov r11,rdx + and rbp,rbx + neg rbp + mul rbx + add r11,rax + adc rbp,rdx + mov QWORD PTR[24+rdi],r11 + + mov r8,QWORD PTR[40+rsi] + mov r9,QWORD PTR[48+rsi] + mov r10,QWORD PTR[56+rsi] + mov r11,QWORD PTR[64+rsi] + mov r12,QWORD PTR[72+rsi] + mov r13,QWORD PTR[80+rsi] + mov r14,QWORD PTR[88+rsi] + mov r15,QWORD PTR[96+rsi] + + mov rdx,rcx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rcx,rdx + add rcx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + xor r14,rdx + xor r15,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + + mul rcx + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rcx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rcx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rcx + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rcx + add r12,rax + mov rax,r13 + adc rdx,0 + mov 
r13,rdx + mul rcx + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + mul rcx + add r14,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + imul rcx + add r15,rax + adc rdx,0 + + mov rbx,rbp + sar rbp,63 + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,rbx + adc r13,rbp + adc r14,rbp + adc r15,rbp + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + + DB 0F3h,0C3h ;repret +__smulq_512x63 ENDP + + +ALIGN 32 +__smulq_256x63 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + mov r11,QWORD PTR[((0+24))+rsi] + mov rbp,QWORD PTR[((0+32))+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor rbp,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc rbp,0 + + mul rbx + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + and rbp,rbx + neg rbp + mul rbx + add r11,rax + adc rbp,rdx + mov rdx,rcx + mov r12,QWORD PTR[((40+0))+rsi] + mov r13,QWORD PTR[((40+8))+rsi] + mov r14,QWORD PTR[((40+16))+rsi] + mov r15,QWORD PTR[((40+24))+rsi] + mov rcx,QWORD PTR[((40+32))+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r12,rdx + xor r13,rdx + xor r14,rdx + xor r15,rdx + xor rcx,rdx + add rax,r12 + adc r13,0 + adc r14,0 + adc r15,0 + adc rcx,0 + + mul rbx + mov r12,rax + mov rax,r13 + mov r13,rdx + mul rbx + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + mul rbx + add r14,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + and rcx,rbx + neg rcx + mul rbx + add r15,rax + adc rcx,rdx + add r8,r12 + adc r9,r13 + adc r10,r14 + adc r11,r15 + adc rbp,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],rbp + + DB 0F3h,0C3h ;repret +__smulq_256x63 ENDP + +ALIGN 32 +__smulq_256_n_shift_by_31 PROC PRIVATE + DB 243,15,30,250 + + mov QWORD PTR[rdi],rdx + mov QWORD PTR[8+rdi],rcx + mov rbp,rdx + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + mov r11,QWORD PTR[((0+24))+rsi] + + mov rbx,rbp + sar rbp,63 + xor rax,rax + sub rax,rbp + + xor rbx,rbp + add rbx,rax + + xor r8,rbp + xor r9,rbp + xor r10,rbp + xor r11,rbp + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + + mul rbx + mov r8,rax + mov rax,r9 + and rbp,rbx + neg rbp + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbx + add r11,rax + adc rbp,rdx + mov r12,QWORD PTR[((32+0))+rsi] + mov r13,QWORD PTR[((32+8))+rsi] + mov r14,QWORD PTR[((32+16))+rsi] + mov r15,QWORD PTR[((32+24))+rsi] + + mov rbx,rcx + sar rcx,63 + xor rax,rax + sub rax,rcx + + xor rbx,rcx + add rbx,rax + + xor r12,rcx + xor r13,rcx + xor r14,rcx + xor r15,rcx + add rax,r12 + adc r13,0 + adc r14,0 + adc r15,0 + + mul rbx + mov r12,rax + mov rax,r13 + and rcx,rbx + neg rcx + mov r13,rdx + mul rbx + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + mul rbx + add r14,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + mul rbx + add r15,rax + adc rcx,rdx + add r8,r12 + adc r9,r13 + adc r10,r14 + adc r11,r15 
+ adc rbp,rcx + + mov rdx,QWORD PTR[rdi] + mov rcx,QWORD PTR[8+rdi] + + shrd r8,r9,31 + shrd r9,r10,31 + shrd r10,r11,31 + shrd r11,rbp,31 + + sar rbp,63 + xor rax,rax + sub rax,rbp + + xor r8,rbp + xor r9,rbp + xor r10,rbp + xor r11,rbp + add r8,rax + adc r9,0 + adc r10,0 + adc r11,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + xor rdx,rbp + xor rcx,rbp + add rdx,rax + add rcx,rax + + DB 0F3h,0C3h ;repret +__smulq_256_n_shift_by_31 ENDP + +ALIGN 32 +__ab_approximation_31_256 PROC PRIVATE + DB 243,15,30,250 + + mov r9,QWORD PTR[24+rsi] + mov r11,QWORD PTR[56+rsi] + mov rbx,QWORD PTR[16+rsi] + mov rbp,QWORD PTR[48+rsi] + mov r8,QWORD PTR[8+rsi] + mov r10,QWORD PTR[40+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + mov r8,QWORD PTR[rsi] + cmovz rbp,r10 + mov r10,QWORD PTR[32+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + + mov rax,r9 + or rax,r11 + bsr rcx,rax + lea rcx,QWORD PTR[1+rcx] + cmovz r9,r8 + cmovz r11,r10 + cmovz rcx,rax + neg rcx + + + shld r9,rbx,cl + shld r11,rbp,cl + + mov eax,07FFFFFFFh + and r8,rax + and r10,rax + not rax + and r9,rax + and r11,rax + or r8,r9 + or r10,r11 + + jmp __inner_loop_31_256 + + DB 0F3h,0C3h ;repret +__ab_approximation_31_256 ENDP + +ALIGN 32 +__inner_loop_31_256 PROC PRIVATE + DB 243,15,30,250 + + mov rcx,07FFFFFFF80000000h + mov r13,0800000007FFFFFFFh + mov r15,07FFFFFFF7FFFFFFFh + +$L$oop_31_256:: + cmp r8,r10 + mov rax,r8 + mov rbx,r10 + mov rbp,rcx + mov r14,r13 + cmovb r8,r10 + cmovb r10,rax + cmovb rcx,r13 + cmovb r13,rbp + + sub r8,r10 + sub rcx,r13 + add rcx,r15 + + test rax,1 + cmovz r8,rax + cmovz r10,rbx + cmovz rcx,rbp + cmovz r13,r14 + + shr r8,1 + add r13,r13 + sub r13,r15 + sub edx,1 + jnz $L$oop_31_256 + + shr r15,32 + mov edx,ecx + mov r12d,r13d + shr rcx,32 + shr r13,32 + sub rdx,r15 + sub rcx,r15 + sub r12,r15 + sub r13,r15 + + DB 0F3h,0C3h ;repret +__inner_loop_31_256 ENDP + + +ALIGN 32 +__inner_loop_62_256 PROC PRIVATE + DB 243,15,30,250 + + mov r15d,edx + mov rdx,1 + xor rcx,rcx + xor r12,r12 + mov r13,rdx + mov r14,rdx + +$L$oop_62_256:: + xor rax,rax + test r8,r14 + mov rbx,r10 + cmovnz rax,r10 + sub rbx,r8 + mov rbp,r8 + sub r8,rax + cmovc r8,rbx + cmovc r10,rbp + mov rax,rdx + cmovc rdx,r12 + cmovc r12,rax + mov rbx,rcx + cmovc rcx,r13 + cmovc r13,rbx + xor rax,rax + xor rbx,rbx + shr r8,1 + test rbp,r14 + cmovnz rax,r12 + cmovnz rbx,r13 + add r12,r12 + add r13,r13 + sub rdx,rax + sub rcx,rbx + sub r15d,1 + jnz $L$oop_62_256 + + DB 0F3h,0C3h ;repret +__inner_loop_62_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_ct_inverse_mod_256 + DD imagerel $L$SEH_body_ct_inverse_mod_256 + DD imagerel $L$SEH_info_ct_inverse_mod_256_prologue + + DD imagerel $L$SEH_body_ct_inverse_mod_256 + DD imagerel $L$SEH_epilogue_ct_inverse_mod_256 + DD imagerel $L$SEH_info_ct_inverse_mod_256_body + + DD imagerel $L$SEH_epilogue_ct_inverse_mod_256 + DD imagerel $L$SEH_end_ct_inverse_mod_256 + DD imagerel $L$SEH_info_ct_inverse_mod_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_ct_inverse_mod_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_ct_inverse_mod_256_body:: +DB 1,0,18,0 +DB 000h,0f4h,086h,000h +DB 000h,0e4h,087h,000h +DB 000h,0d4h,088h,000h +DB 000h,0c4h,089h,000h +DB 000h,034h,08ah,000h +DB 000h,054h,08bh,000h +DB 000h,074h,08dh,000h +DB 000h,064h,08eh,000h +DB 
000h,001h,08ch,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_ct_inverse_mod_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm b/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm new file mode 100644 index 00000000000..311ce7638ce --- /dev/null +++ b/crypto/blst_src/build/win64/ct_inverse_mod_384-armv8.asm @@ -0,0 +1,719 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + + EXPORT |ct_inverse_mod_383|[FUNC] + ALIGN 32 +|ct_inverse_mod_383| PROC + DCDU 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #1040 + + ldp x22, x4, [x1,#8*0] + ldp x5, x6, [x1,#8*2] + ldp x7, x8, [x1,#8*4] + + add x1, sp, #16+511 // find closest 512-byte-aligned spot + and x1, x1, #-512 // in the frame... + stp x0, x3, [sp] + + ldp x9, x10, [x2,#8*0] + ldp x11, x12, [x2,#8*2] + ldp x13, x14, [x2,#8*4] + + stp x22, x4, [x1,#8*0] // copy input to |a| + stp x5, x6, [x1,#8*2] + stp x7, x8, [x1,#8*4] + stp x9, x10, [x1,#8*6] // copy modulus to |b| + stp x11, x12, [x1,#8*8] + stp x13, x14, [x1,#8*10] + + ////////////////////////////////////////// first iteration + mov x2, #62 + bl |$Lab_approximation_62_loaded| + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + str x15,[x0,#8*12] // initialize |u| with |f0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to dst |b| + bl __smul_383_n_shift_by_62 + str x15, [x0,#8*12] // initialize |v| with |f1| + + ////////////////////////////////////////// second iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + ldr x7, [x1,#8*12] // |u| + ldr x8, [x1,#8*18] // |v| + mul x3, x20, x7 // |u|*|f0| + smulh x4, x20, x7 + mul x5, x21, x8 // |v|*|g0| + smulh x6, x21, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*6] + asr x5, x4, #63 // sign extension + stp x5, x5, [x0,#8*8] + stp x5, x5, [x0,#8*10] + + mul x3, x15, x7 // |u|*|f1| + smulh x4, x15, x7 + mul x5, x16, x8 // |v|*|g1| + smulh x6, x16, x8 + adds x3, x3, x5 + adc x4, x4, x6 + stp x3, x4, [x0,#8*12] + asr x5, x4, #63 // sign extension + stp x5, x5, [x0,#8*14] + stp x5, x5, [x0,#8*16] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to 
destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + asr x27, x27, #63 // sign extension + stp x27, x27, [x0,#8*6] + stp x27, x27, [x0,#8*8] + stp x27, x27, [x0,#8*10] + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add 
x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + bl __ab_approximation_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + bl __smul_383_n_shift_by_62 + mov x20, x15 // corrected |f0| + mov x21, x16 // corrected |g0| + + mov x15, x17 // |f1| + mov x16, x19 // |g1| + add x0, x0, #8*6 // pointer to destination |b| + bl __smul_383_n_shift_by_62 + + add x0, x0, #8*6 // pointer to destination |u| + bl __smul_383x63 + + mov x20, x15 // corrected |f1| + mov x21, x16 // corrected |g1| + add x0, x0, #8*6 // pointer to destination |v| + bl __smul_383x63 + bl __smul_767x63_tail + ////////////////////////////////////////// iteration before last + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldp x3, x8, [x1,#8*0] // just load + ldp x9, x14, [x1,#8*6] + bl __inner_loop_62 + + eor x0, x1, #256 // pointer to dst |a|b|u|v| + str x3, [x0,#8*0] + str x9, [x0,#8*6] + + mov x20, x15 // exact |f0| + mov x21, x16 // exact |g0| + mov x15, x17 + mov x16, x19 + add x0, x0, #8*12 // pointer to dst |u| + bl __smul_383x63 + + mov x20, x15 // exact |f1| + mov x21, x16 // exact |g1| + add x0, x0, #8*6 // pointer to dst |v| + bl __smul_383x63 + bl __smul_767x63_tail + + ////////////////////////////////////////// last iteration + eor x1, x1, #256 // flip-flop src |a|b|u|v| + mov x2, #22 // 766 % 62 + //bl __ab_approximation_62 // |a| and |b| are exact, + ldr x3, [x1,#8*0] // just load + eor x8, x8, x8 + ldr x9, [x1,#8*6] + eor x14, x14, x14 + bl __inner_loop_62 + + mov x20, x17 + mov x21, x19 + ldp x0, x15, [sp] // original out_ptr and n_ptr + bl __smul_383x63 + bl __smul_767x63_tail + ldr x30, [x29,#8] + + asr x22, x8, #63 // sign as mask + ldp x9, x10, [x15,#8*0] + ldp x11, x12, [x15,#8*2] + ldp x13, x14, [x15,#8*4] + + and x9, x9, x22 // add mod<<384 conditionally + and x10, x10, x22 + adds x3, x3, x9 + and x11, x11, x22 + adcs x4, x4, x10 + and x12, x12, x22 + adcs x5, x5, x11 + and x13, x13, x22 + adcs x6, x6, x12 + and x14, x14, x22 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*8] + adc x8, x8, x14 + stp x7, x8, [x0,#8*10] + + add sp, sp, #1040 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + DCDU 3573752767 + ret + ENDP + +//////////////////////////////////////////////////////////////////////// +// see corresponding commentary in ctx_inverse_mod_384-x86_64... 
+ + ALIGN 32 +|__smul_383x63| PROC + ldp x3, x4, [x1,#8*0+96] // load |u| (or |v|) + asr x17, x20, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x5, x6, [x1,#8*2+96] + eor x20, x20, x17 // conditionally negate |f_| (or |g_|) + ldp x7, x8, [x1,#8*4+96] + + eor x3, x3, x17 // conditionally negate |u| (or |v|) + sub x20, x20, x17 + eor x4, x4, x17 + adds x3, x3, x17, lsr#63 + eor x5, x5, x17 + adcs x4, x4, xzr + eor x6, x6, x17 + adcs x5, x5, xzr + eor x7, x7, x17 + adcs x6, x6, xzr + umulh x22, x3, x20 + eor x8, x8, x17 + umulh x23, x4, x20 + adcs x7, x7, xzr + umulh x24, x5, x20 + adcs x8, x8, xzr + umulh x25, x6, x20 + umulh x26, x7, x20 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x22 + mul x6, x6, x20 + adcs x5, x5, x23 + mul x7, x7, x20 + adcs x6, x6, x24 + mul x27,x8, x20 + adcs x7, x7, x25 + adcs x27,x27,x26 + adc x2, xzr, xzr + ldp x9, x10, [x1,#8*0+144] // load |u| (or |v|) + asr x17, x21, #63 // |f_|'s sign as mask (or |g_|'s) + ldp x11, x12, [x1,#8*2+144] + eor x21, x21, x17 // conditionally negate |f_| (or |g_|) + ldp x13, x14, [x1,#8*4+144] + + eor x9, x9, x17 // conditionally negate |u| (or |v|) + sub x21, x21, x17 + eor x10, x10, x17 + adds x9, x9, x17, lsr#63 + eor x11, x11, x17 + adcs x10, x10, xzr + eor x12, x12, x17 + adcs x11, x11, xzr + eor x13, x13, x17 + adcs x12, x12, xzr + umulh x22, x9, x21 + eor x14, x14, x17 + umulh x23, x10, x21 + adcs x13, x13, xzr + umulh x24, x11, x21 + adcs x14, x14, xzr + umulh x25, x12, x21 + adc x19, xzr, xzr // used in __smul_767x63_tail + umulh x26, x13, x21 + mul x9, x9, x21 + mul x10, x10, x21 + mul x11, x11, x21 + adds x10, x10, x22 + mul x12, x12, x21 + adcs x11, x11, x23 + mul x13, x13, x21 + adcs x12, x12, x24 + mul x28,x14, x21 + adcs x13, x13, x25 + adcs x28,x28,x26 + adc x2, x2, xzr + + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + stp x3, x4, [x0,#8*0] + adcs x7, x7, x13 + stp x5, x6, [x0,#8*2] + adcs x27, x27, x28 + stp x7, x27, [x0,#8*4] + adc x28, x2, xzr // used in __smul_767x63_tail + + ret + ENDP + + + ALIGN 32 +|__smul_767x63_tail| PROC + smulh x27, x8, x20 + ldp x3, x4, [x1,#8*24] // load rest of |v| + umulh x14,x14, x21 + ldp x5, x6, [x1,#8*26] + ldp x7, x8, [x1,#8*28] + + eor x3, x3, x17 // conditionally negate rest of |v| + eor x4, x4, x17 + eor x5, x5, x17 + adds x3, x3, x19 + eor x6, x6, x17 + adcs x4, x4, xzr + eor x7, x7, x17 + adcs x5, x5, xzr + eor x8, x8, x17 + adcs x6, x6, xzr + umulh x22, x3, x21 + adcs x7, x7, xzr + umulh x23, x4, x21 + adc x8, x8, xzr + + umulh x24, x5, x21 + add x14, x14, x28 + umulh x25, x6, x21 + asr x28, x27, #63 + umulh x26, x7, x21 + mul x3, x3, x21 + mul x4, x4, x21 + mul x5, x5, x21 + adds x3, x3, x14 + mul x6, x6, x21 + adcs x4, x4, x22 + mul x7, x7, x21 + adcs x5, x5, x23 + mul x8, x8, x21 + adcs x6, x6, x24 + adcs x7, x7, x25 + adc x8, x8, x26 + + adds x3, x3, x27 + adcs x4, x4, x28 + adcs x5, x5, x28 + adcs x6, x6, x28 + stp x3, x4, [x0,#8*6] + adcs x7, x7, x28 + stp x5, x6, [x0,#8*8] + adc x8, x8, x28 + stp x7, x8, [x0,#8*10] + + ret + ENDP + + + ALIGN 32 +|__smul_383_n_shift_by_62| PROC + ldp x3, x4, [x1,#8*0+0] // load |a| (or |b|) + asr x28, x15, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x5, x6, [x1,#8*2+0] + eor x2, x15, x28 // conditionally negate |f0| (or |g0|) + ldp x7, x8, [x1,#8*4+0] + + eor x3, x3, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + umulh x22, x3, x2 + 
adcs x6, x6, xzr + umulh x23, x4, x2 + eor x8, x8, x28 + umulh x24, x5, x2 + adcs x7, x7, xzr + umulh x25, x6, x2 + adc x8, x8, xzr + + umulh x26, x7, x2 + smulh x27, x8, x2 + mul x3, x3, x2 + mul x4, x4, x2 + mul x5, x5, x2 + adds x4, x4, x22 + mul x6, x6, x2 + adcs x5, x5, x23 + mul x7, x7, x2 + adcs x6, x6, x24 + mul x8, x8, x2 + adcs x7, x7, x25 + adcs x8, x8 ,x26 + adc x27, x27, xzr + ldp x9, x10, [x1,#8*0+48] // load |a| (or |b|) + asr x28, x16, #63 // |f0|'s sign as mask (or |g0|'s) + ldp x11, x12, [x1,#8*2+48] + eor x2, x16, x28 // conditionally negate |f0| (or |g0|) + ldp x13, x14, [x1,#8*4+48] + + eor x9, x9, x28 // conditionally negate |a| (or |b|) + sub x2, x2, x28 + eor x10, x10, x28 + adds x9, x9, x28, lsr#63 + eor x11, x11, x28 + adcs x10, x10, xzr + eor x12, x12, x28 + adcs x11, x11, xzr + eor x13, x13, x28 + umulh x22, x9, x2 + adcs x12, x12, xzr + umulh x23, x10, x2 + eor x14, x14, x28 + umulh x24, x11, x2 + adcs x13, x13, xzr + umulh x25, x12, x2 + adc x14, x14, xzr + + umulh x26, x13, x2 + smulh x28, x14, x2 + mul x9, x9, x2 + mul x10, x10, x2 + mul x11, x11, x2 + adds x10, x10, x22 + mul x12, x12, x2 + adcs x11, x11, x23 + mul x13, x13, x2 + adcs x12, x12, x24 + mul x14, x14, x2 + adcs x13, x13, x25 + adcs x14, x14 ,x26 + adc x28, x28, xzr + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x27, x28 + + extr x3, x4, x3, #62 + extr x4, x5, x4, #62 + extr x5, x6, x5, #62 + asr x28, x9, #63 + extr x6, x7, x6, #62 + extr x7, x8, x7, #62 + extr x8, x9, x8, #62 + + eor x3, x3, x28 + eor x4, x4, x28 + adds x3, x3, x28, lsr#63 + eor x5, x5, x28 + adcs x4, x4, xzr + eor x6, x6, x28 + adcs x5, x5, xzr + eor x7, x7, x28 + adcs x6, x6, xzr + eor x8, x8, x28 + stp x3, x4, [x0,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x0,#8*2] + adc x8, x8, xzr + stp x7, x8, [x0,#8*4] + + eor x15, x15, x28 + eor x16, x16, x28 + sub x15, x15, x28 + sub x16, x16, x28 + + ret + ENDP + + ALIGN 16 +|__ab_approximation_62| PROC + ldp x7, x8, [x1,#8*4] + ldp x13, x14, [x1,#8*10] + ldp x5, x6, [x1,#8*2] + ldp x11, x12, [x1,#8*8] + +|$Lab_approximation_62_loaded| + orr x22, x8, x14 // check top-most limbs, ... + cmp x22, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x6 + orr x22, x8, x14 // ... ones before top-most, ... + cselne x13,x13,x12 + + ldp x3, x4, [x1,#8*0] + ldp x9, x10, [x1,#8*6] + + cmp x22, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x5 + orr x22, x8, x14 // ... and ones before that ... 
+ cselne x13,x13,x11 + + cmp x22, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x4 + orr x22, x8, x14 + cselne x13,x13,x10 + + clz x22, x22 + cmp x22, #64 + cselne x22,x22,xzr + cselne x8,x8,x7 + cselne x14,x14,x13 + neg x23, x22 + + lslv x8, x8, x22 // align high limbs to the left + lslv x14, x14, x22 + lsrv x7, x7, x23 + lsrv x13, x13, x23 + and x7, x7, x23, asr#6 + and x13, x13, x23, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + b __inner_loop_62 + ret + ENDP + + ALIGN 16 +|__inner_loop_62| PROC + mov x15, #1 // |f0|=1 + mov x16, #0 // |g0|=0 + mov x17, #0 // |f1|=0 + mov x19, #1 // |g1|=1 + +|$Loop_62| + sbfx x28, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + sub x2, x2, #1 + subs x24, x9, x3 // |b_|-|a_| + and x22, x9, x28 + sbc x25, x14, x8 + and x23, x14, x28 + subs x26, x3, x22 // |a_|-|b_| (or |a_|-0 if |a_| was even) + mov x22, x15 + sbcs x27, x8, x23 + mov x23, x16 + cselhs x9,x9,x3 + cselhs x14,x14,x8 + cselhs x3,x26,x24 + cselhs x8,x27,x25 + cselhs x15,x15,x17 + cselhs x17,x17,x22 + cselhs x16,x16,x19 + cselhs x19,x19,x23 + extr x3, x8, x3, #1 + lsr x8, x8, #1 + and x22, x17, x28 + and x23, x19, x28 + add x17, x17, x17 // |f1|<<=1 + add x19, x19, x19 // |g1|<<=1 + sub x15, x15, x22 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + sub x16, x16, x23 // |g0|-=|g1| (or |g0-=0| ...) + cbnz x2, |$Loop_62| + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm b/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm new file mode 100644 index 00000000000..e2454897b33 --- /dev/null +++ b/crypto/blst_src/build/win64/ct_is_square_mod_384-armv8.asm @@ -0,0 +1,326 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + + EXPORT |ct_is_square_mod_384|[FUNC] + ALIGN 32 +|ct_is_square_mod_384| PROC + DCDU 3573752639 + stp x29, x30, [sp,#-128]! + add x29, sp, #0 + stp x19, x20, [sp,#16] + stp x21, x22, [sp,#32] + stp x23, x24, [sp,#48] + stp x25, x26, [sp,#64] + stp x27, x28, [sp,#80] + sub sp, sp, #512 + + ldp x3, x4, [x0,#8*0] // load input + ldp x5, x6, [x0,#8*2] + ldp x7, x8, [x0,#8*4] + + add x0, sp, #255 // find closest 256-byte-aligned spot + and x0, x0, #-256 // in the frame... 
+ + ldp x9, x10, [x1,#8*0] // load modulus + ldp x11, x12, [x1,#8*2] + ldp x13, x14, [x1,#8*4] + + stp x3, x4, [x0,#8*6] // copy input to |a| + stp x5, x6, [x0,#8*8] + stp x7, x8, [x0,#8*10] + stp x9, x10, [x0,#8*0] // copy modulus to |b| + stp x11, x12, [x0,#8*2] + stp x13, x14, [x0,#8*4] + + eor x2, x2, x2 // init the |$Legendre| symbol + mov x15, #24 // 24 is 768/30-1 + b |$Loop_is_square| + + ALIGN 16 +|$Loop_is_square| + bl __ab_approximation_30 + sub x15, x15, #1 + + eor x1, x0, #128 // pointer to dst |b| + bl __smul_384_n_shift_by_30 + + mov x19, x16 // |f0| + mov x20, x17 // |g0| + add x1, x1, #8*6 // pointer to dst |a| + bl __smul_384_n_shift_by_30 + + ldp x9, x10, [x1,#-8*6] + eor x0, x0, #128 // flip-flop src |a|b| + and x27, x27, x9 // if |a| was negative, + add x2, x2, x27, lsr#1 // adjust |L| + + cbnz x15, |$Loop_is_square| + + ////////////////////////////////////////// last iteration + //bl __ab_approximation_30 // |a| and |b| are exact, + //ldr x8, [x0,#8*6] // and loaded + //ldr x14, [x0,#8*0] + mov x15, #48 // 48 is 768%30 + 30 + bl __inner_loop_48 + ldr x30, [x29,#8] + + and x0, x2, #1 + eor x0, x0, #1 + + add sp, sp, #512 + ldp x19, x20, [x29,#16] + ldp x21, x22, [x29,#32] + ldp x23, x24, [x29,#48] + ldp x25, x26, [x29,#64] + ldp x27, x28, [x29,#80] + ldr x29, [sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__smul_384_n_shift_by_30| PROC + ldp x3, x4, [x0,#8*0+0] // load |b| (or |a|) + asr x27, x20, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x5, x6, [x0,#8*2+0] + eor x20, x20, x27 // conditionally negate |g1| (or |f1|) + ldp x7, x8, [x0,#8*4+0] + + eor x3, x3, x27 // conditionally negate |b| (or |a|) + sub x20, x20, x27 + eor x4, x4, x27 + adds x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + umulh x21, x3, x20 + adcs x6, x6, xzr + umulh x22, x4, x20 + eor x8, x8, x27 + umulh x23, x5, x20 + adcs x7, x7, xzr + umulh x24, x6, x20 + adc x8, x8, xzr + + umulh x25, x7, x20 + and x28, x20, x27 + umulh x26, x8, x20 + neg x28, x28 + mul x3, x3, x20 + mul x4, x4, x20 + mul x5, x5, x20 + adds x4, x4, x21 + mul x6, x6, x20 + adcs x5, x5, x22 + mul x7, x7, x20 + adcs x6, x6, x23 + mul x8, x8, x20 + adcs x7, x7, x24 + adcs x8, x8 ,x25 + adc x26, x26, x28 + ldp x9, x10, [x0,#8*0+48] // load |b| (or |a|) + asr x27, x19, #63 // |g1|'s sign as mask (or |f1|'s) + ldp x11, x12, [x0,#8*2+48] + eor x19, x19, x27 // conditionally negate |g1| (or |f1|) + ldp x13, x14, [x0,#8*4+48] + + eor x9, x9, x27 // conditionally negate |b| (or |a|) + sub x19, x19, x27 + eor x10, x10, x27 + adds x9, x9, x27, lsr#63 + eor x11, x11, x27 + adcs x10, x10, xzr + eor x12, x12, x27 + adcs x11, x11, xzr + eor x13, x13, x27 + umulh x21, x9, x19 + adcs x12, x12, xzr + umulh x22, x10, x19 + eor x14, x14, x27 + umulh x23, x11, x19 + adcs x13, x13, xzr + umulh x24, x12, x19 + adc x14, x14, xzr + + umulh x25, x13, x19 + and x28, x19, x27 + umulh x27, x14, x19 + neg x28, x28 + mul x9, x9, x19 + mul x10, x10, x19 + mul x11, x11, x19 + adds x10, x10, x21 + mul x12, x12, x19 + adcs x11, x11, x22 + mul x13, x13, x19 + adcs x12, x12, x23 + mul x14, x14, x19 + adcs x13, x13, x24 + adcs x14, x14 ,x25 + adc x27, x27, x28 + adds x3, x3, x9 + adcs x4, x4, x10 + adcs x5, x5, x11 + adcs x6, x6, x12 + adcs x7, x7, x13 + adcs x8, x8, x14 + adc x9, x26, x27 + + extr x3, x4, x3, #30 + extr x4, x5, x4, #30 + extr x5, x6, x5, #30 + asr x27, x9, #63 + extr x6, x7, x6, #30 + extr x7, x8, x7, #30 + extr x8, x9, x8, #30 + + eor x3, x3, x27 + eor x4, x4, x27 + adds 
x3, x3, x27, lsr#63 + eor x5, x5, x27 + adcs x4, x4, xzr + eor x6, x6, x27 + adcs x5, x5, xzr + eor x7, x7, x27 + adcs x6, x6, xzr + eor x8, x8, x27 + stp x3, x4, [x1,#8*0] + adcs x7, x7, xzr + stp x5, x6, [x1,#8*2] + adc x8, x8, xzr + stp x7, x8, [x1,#8*4] + + ret + ENDP + + ALIGN 16 +|__ab_approximation_30| PROC + ldp x13, x14, [x0,#8*4] // |a| is still in registers + ldp x11, x12, [x0,#8*2] + + orr x21, x8, x14 // check top-most limbs, ... + cmp x21, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x6 + orr x21, x8, x14 // ... ones before top-most, ... + cselne x13,x13,x12 + + cmp x21, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x5 + orr x21, x8, x14 // ... and ones before that ... + cselne x13,x13,x11 + + cmp x21, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x4 + orr x21, x8, x14 // and one more, ... + cselne x13,x13,x10 + + cmp x21, #0 + cselne x8,x8,x7 + cselne x14,x14,x13 + cselne x7,x7,x3 + orr x21, x8, x14 + cselne x13,x13,x9 + + clz x21, x21 + cmp x21, #64 + cselne x21,x21,xzr + cselne x8,x8,x7 + cselne x14,x14,x13 + neg x22, x21 + + lslv x8, x8, x21 // align high limbs to the left + lslv x14, x14, x21 + lsrv x7, x7, x22 + lsrv x13, x13, x22 + and x7, x7, x22, asr#6 + and x13, x13, x22, asr#6 + orr x8, x8, x7 + orr x14, x14, x13 + + bfxil x8, x3, #0, #32 + bfxil x14, x9, #0, #32 + + b __inner_loop_30 + ret + ENDP + + + ALIGN 16 +|__inner_loop_30| PROC + mov x28, #30 + mov x17, #0x7FFFFFFF80000000 // |f0|=1, |g0|=0 + mov x20, #0x800000007FFFFFFF // |f1|=0, |g1|=1 + mov x27,#0x7FFFFFFF7FFFFFFF + +|$Loop_30| + sbfx x24, x8, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x8, x14 + sub x28, x28, #1 + and x21, x14, x24 + + sub x22, x14, x8 // |b_|-|a_| + subs x23, x8, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 // L + (a_ & b_) >> 1 + mov x21, x20 + cselhs x14,x14,x8 + cselhs x8,x23,x22 + cselhs x20,x20,x17 + cselhs x17,x17,x21 + cselhs x2,x2,x25 + lsr x8, x8, #1 + and x21, x20, x24 + and x22, x27, x24 + add x23, x14, #2 + sub x17, x17, x21 // |f0|-=|f1| (or |f0-=0| if |a_| was even) + add x20, x20, x20 // |f1|<<=1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + add x17, x17, x22 + sub x20, x20, x27 + + cbnz x28, |$Loop_30| + + mov x27, #0x7FFFFFFF + ubfx x16, x17, #0, #32 + ubfx x17, x17, #32, #32 + ubfx x19, x20, #0, #32 + ubfx x20, x20, #32, #32 + sub x16, x16, x27 // remove the bias + sub x17, x17, x27 + sub x19, x19, x27 + sub x20, x20, x27 + + ret + ENDP + + ALIGN 16 +|__inner_loop_48| PROC +|$Loop_48| + sbfx x24, x3, #0, #1 // if |a_| is odd, then we'll be subtracting + and x25, x3, x9 + sub x15, x15, #1 + and x21, x9, x24 + sub x22, x9, x3 // |b_|-|a_| + subs x23, x3, x21 // |a_|-|b_| (or |a_|-0 if |a_| was even) + add x25, x2, x25, lsr#1 + cselhs x9,x9,x3 + cselhs x3,x23,x22 + cselhs x2,x2,x25 + add x23, x9, #2 + lsr x3, x3, #1 + add x2, x2, x23, lsr#2 // "negate" |L| if |b|%8 is 3 or 5 + + cbnz x15, |$Loop_48| + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm new file mode 100644 index 00000000000..be00f479efb --- /dev/null +++ b/crypto/blst_src/build/win64/ct_is_square_mod_384-x86_64.asm @@ -0,0 +1,516 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC ct_is_square_mod_384 + + +ALIGN 32 +ct_is_square_mod_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_ct_is_square_mod_384:: + + + push 
rbp + + mov rdi,rcx + mov rsi,rdx + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,536 + +$L$SEH_body_ct_is_square_mod_384:: + + + lea rax,QWORD PTR[((24+255))+rsp] + and rax,-256 + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov r12,QWORD PTR[32+rdi] + mov r13,QWORD PTR[40+rdi] + + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rbx,QWORD PTR[16+rsi] + mov rcx,QWORD PTR[24+rsi] + mov rdx,QWORD PTR[32+rsi] + mov rdi,QWORD PTR[40+rsi] + mov rsi,rax + + mov QWORD PTR[rax],r8 + mov QWORD PTR[8+rax],r9 + mov QWORD PTR[16+rax],r10 + mov QWORD PTR[24+rax],r11 + mov QWORD PTR[32+rax],r12 + mov QWORD PTR[40+rax],r13 + + mov QWORD PTR[48+rax],r14 + mov QWORD PTR[56+rax],r15 + mov QWORD PTR[64+rax],rbx + mov QWORD PTR[72+rax],rcx + mov QWORD PTR[80+rax],rdx + mov QWORD PTR[88+rax],rdi + + xor rbp,rbp + mov ecx,24 + jmp $L$oop_is_square + +ALIGN 32 +$L$oop_is_square:: + mov DWORD PTR[16+rsp],ecx + + call __ab_approximation_30 + mov QWORD PTR[rsp],rax + mov QWORD PTR[8+rsp],rbx + + mov rdi,128+8*6 + xor rdi,rsi + call __smulq_384_n_shift_by_30 + + mov rdx,QWORD PTR[rsp] + mov rcx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-48))+rdi] + call __smulq_384_n_shift_by_30 + + mov ecx,DWORD PTR[16+rsp] + xor rsi,128 + + and r14,QWORD PTR[48+rdi] + shr r14,1 + add rbp,r14 + + sub ecx,1 + jnz $L$oop_is_square + + + + + mov r9,QWORD PTR[48+rsi] + call __inner_loop_48 + + mov rax,1 + and rax,rbp + xor rax,1 + + lea r8,QWORD PTR[536+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_ct_is_square_mod_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_ct_is_square_mod_384:: +ct_is_square_mod_384 ENDP + + +ALIGN 32 +__smulq_384_n_shift_by_30 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mov r14,rdx + and r14,rbx + mul rbx + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbx + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbx + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbx + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + neg r14 + mul rbx + add r13,rax + adc r14,rdx + lea rsi,QWORD PTR[48+rsi] + mov rdx,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbx,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbx,rdx + add rbx,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mov r15,rdx + and r15,rbx + mul rbx + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbx + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbx 
+ add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbx + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbx + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + neg r15 + mul rbx + add r13,rax + adc r15,rdx + lea rsi,QWORD PTR[((-48))+rsi] + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc r13,QWORD PTR[40+rdi] + adc r14,r15 + + shrd r8,r9,30 + shrd r9,r10,30 + shrd r10,r11,30 + shrd r11,r12,30 + shrd r12,r13,30 + shrd r13,r14,30 + + sar r14,63 + xor rbx,rbx + sub rbx,r14 + + xor r8,r14 + xor r9,r14 + xor r10,r14 + xor r11,r14 + xor r12,r14 + xor r13,r14 + add r8,rbx + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__smulq_384_n_shift_by_30 ENDP + +ALIGN 32 +__ab_approximation_30 PROC PRIVATE + DB 243,15,30,250 + + mov rbx,QWORD PTR[88+rsi] + mov r15,QWORD PTR[80+rsi] + mov r14,QWORD PTR[72+rsi] + + mov rax,r13 + or rax,rbx + cmovz r13,r12 + cmovz rbx,r15 + cmovz r12,r11 + mov r11,QWORD PTR[64+rsi] + cmovz r15,r14 + + mov rax,r13 + or rax,rbx + cmovz r13,r12 + cmovz rbx,r15 + cmovz r12,r10 + mov r10,QWORD PTR[56+rsi] + cmovz r15,r11 + + mov rax,r13 + or rax,rbx + cmovz r13,r12 + cmovz rbx,r15 + cmovz r12,r9 + mov r9,QWORD PTR[48+rsi] + cmovz r15,r10 + + mov rax,r13 + or rax,rbx + cmovz r13,r12 + cmovz rbx,r15 + cmovz r12,r8 + cmovz r15,r9 + + mov rax,r13 + or rax,rbx + bsr rcx,rax + lea rcx,QWORD PTR[1+rcx] + cmovz r13,r8 + cmovz rbx,r9 + cmovz rcx,rax + neg rcx + + + shld r13,r12,cl + shld rbx,r15,cl + + mov rax,0FFFFFFFF00000000h + mov r8d,r8d + mov r9d,r9d + and r13,rax + and rbx,rax + or r8,r13 + or r9,rbx + + jmp __inner_loop_30 + + DB 0F3h,0C3h ;repret +__ab_approximation_30 ENDP + +ALIGN 32 +__inner_loop_30 PROC PRIVATE + DB 243,15,30,250 + + mov rbx,07FFFFFFF80000000h + mov rcx,0800000007FFFFFFFh + lea r15,QWORD PTR[((-1))+rbx] + mov edi,30 + +$L$oop_30:: + mov rax,r8 + and rax,r9 + shr rax,1 + + cmp r8,r9 + mov r10,r8 + mov r11,r9 + lea rax,QWORD PTR[rbp*1+rax] + mov r12,rbx + mov r13,rcx + mov r14,rbp + cmovb r8,r9 + cmovb r9,r10 + cmovb rbx,rcx + cmovb rcx,r12 + cmovb rbp,rax + + sub r8,r9 + sub rbx,rcx + add rbx,r15 + + test r10,1 + cmovz r8,r10 + cmovz r9,r11 + cmovz rbx,r12 + cmovz rcx,r13 + cmovz rbp,r14 + + lea rax,QWORD PTR[2+r9] + shr r8,1 + shr rax,2 + add rcx,rcx + lea rbp,QWORD PTR[rbp*1+rax] + sub rcx,r15 + + sub edi,1 + jnz $L$oop_30 + + shr r15,32 + mov eax,ebx + shr rbx,32 + mov edx,ecx + shr rcx,32 + sub rax,r15 + sub rbx,r15 + sub rdx,r15 + sub rcx,r15 + + DB 0F3h,0C3h ;repret +__inner_loop_30 ENDP + + +ALIGN 32 +__inner_loop_48 PROC PRIVATE + DB 243,15,30,250 + + mov edi,48 + +$L$oop_48:: + mov rax,r8 + and rax,r9 + shr rax,1 + + cmp r8,r9 + mov r10,r8 + mov r11,r9 + lea rax,QWORD PTR[rbp*1+rax] + mov r12,rbp + cmovb r8,r9 + cmovb r9,r10 + cmovb rbp,rax + + sub r8,r9 + + test r10,1 + cmovz r8,r10 + cmovz r9,r11 + cmovz rbp,r12 + + lea rax,QWORD PTR[2+r9] + shr r8,1 + shr rax,2 + add rbp,rax + + sub edi,1 + jnz $L$oop_48 + + DB 0F3h,0C3h ;repret +__inner_loop_48 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_ct_is_square_mod_384 + DD imagerel $L$SEH_body_ct_is_square_mod_384 + DD imagerel $L$SEH_info_ct_is_square_mod_384_prologue + + DD imagerel $L$SEH_body_ct_is_square_mod_384 + DD imagerel 
$L$SEH_epilogue_ct_is_square_mod_384 + DD imagerel $L$SEH_info_ct_is_square_mod_384_body + + DD imagerel $L$SEH_epilogue_ct_is_square_mod_384 + DD imagerel $L$SEH_end_ct_is_square_mod_384 + DD imagerel $L$SEH_info_ct_is_square_mod_384_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_ct_is_square_mod_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_ct_is_square_mod_384_body:: +DB 1,0,18,0 +DB 000h,0f4h,043h,000h +DB 000h,0e4h,044h,000h +DB 000h,0d4h,045h,000h +DB 000h,0c4h,046h,000h +DB 000h,034h,047h,000h +DB 000h,054h,048h,000h +DB 000h,074h,04ah,000h +DB 000h,064h,04bh,000h +DB 000h,001h,049h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_ct_is_square_mod_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm new file mode 100644 index 00000000000..89fbe5d0666 --- /dev/null +++ b/crypto/blst_src/build/win64/ctq_inverse_mod_384-x86_64.asm @@ -0,0 +1,1240 @@ +OPTION DOTNAME +EXTERN ct_inverse_mod_383$1:NEAR +_DATA SEGMENT +COMM __blst_platform_cap:DWORD:1 +_DATA ENDS +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC ct_inverse_mod_383 + + +ALIGN 32 +ct_inverse_mod_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_ct_inverse_mod_383:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz ct_inverse_mod_383$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,1112 + +$L$SEH_body_ct_inverse_mod_383:: + + + lea rax,QWORD PTR[((88+511))+rsp] + and rax,-512 + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[40+rsp],rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,QWORD PTR[rdx] + mov r15,QWORD PTR[8+rdx] + mov rbx,QWORD PTR[16+rdx] + mov rbp,QWORD PTR[24+rdx] + mov rsi,QWORD PTR[32+rdx] + mov rdi,QWORD PTR[40+rdx] + + mov QWORD PTR[rax],r8 + mov QWORD PTR[8+rax],r9 + mov QWORD PTR[16+rax],r10 + mov QWORD PTR[24+rax],r11 + mov QWORD PTR[32+rax],r12 + mov QWORD PTR[40+rax],r13 + + mov QWORD PTR[48+rax],r14 + mov QWORD PTR[56+rax],r15 + mov QWORD PTR[64+rax],rbx + mov QWORD PTR[72+rax],rbp + mov QWORD PTR[80+rax],rsi + mov rsi,rax + mov QWORD PTR[88+rax],rdi + + + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + + + mov QWORD PTR[96+rdi],rdx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + + + mov QWORD PTR[96+rdi],rdx + + + xor rsi,256 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + + + + mov rax,QWORD PTR[96+rsi] + mov r11,QWORD PTR[144+rsi] + mov rbx,rdx + mov r10,rax + imul QWORD PTR[56+rsp] + mov r8,rax + mov rax,r11 + mov r9,rdx + imul QWORD PTR[64+rsp] + add r8,rax + adc r9,rdx + mov QWORD PTR[48+rdi],r8 + 
mov QWORD PTR[56+rdi],r9 + sar r9,63 + mov QWORD PTR[64+rdi],r9 + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r9 + mov QWORD PTR[88+rdi],r9 + lea rsi,QWORD PTR[96+rsi] + + mov rax,r10 + imul rbx + mov r8,rax + mov rax,r11 + mov r9,rdx + imul rcx + add r8,rax + adc r9,rdx + mov QWORD PTR[96+rdi],r8 + mov QWORD PTR[104+rdi],r9 + sar r9,63 + mov QWORD PTR[112+rdi],r9 + mov QWORD PTR[120+rdi],r9 + mov QWORD PTR[128+rdi],r9 + mov QWORD PTR[136+rdi],r9 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + sar r13,63 + mov QWORD PTR[48+rdi],r13 + mov QWORD PTR[56+rdi],r13 + mov QWORD PTR[64+rdi],r13 + mov QWORD PTR[72+rdi],r13 + mov QWORD PTR[80+rdi],r13 + mov QWORD PTR[88+rdi],r13 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov 
rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + xor rsi,256+8*12 + mov edi,62 + call __ab_approximation_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulq_383_n_shift_by_62 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383_n_shift_by_62 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + + xor rsi,256+8*12 + mov edi,62 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[48+rsi] + mov r11,QWORD PTR[56+rsi] + call __inner_loop_62 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + mov QWORD PTR[rdi],r8 + mov QWORD PTR[48+rdi],r10 + + + + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[96+rdi] + call __smulq_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulq_767x63 + + + xor rsi,256+8*12 + mov edi,22 
+ + mov r8,QWORD PTR[rsi] + xor r9,r9 + mov r10,QWORD PTR[48+rsi] + xor r11,r11 + call __inner_loop_62 + + + + + + + + lea rsi,QWORD PTR[96+rsi] + + + + + + mov rdx,r12 + mov rcx,r13 + mov rdi,QWORD PTR[32+rsp] + call __smulq_767x63 + + mov rsi,QWORD PTR[40+rsp] + mov rdx,rax + sar rax,63 + + mov r8,rax + mov r9,rax + mov r10,rax + and r8,QWORD PTR[rsi] + and r9,QWORD PTR[8+rsi] + mov r11,rax + and r10,QWORD PTR[16+rsi] + and r11,QWORD PTR[24+rsi] + mov r12,rax + and r12,QWORD PTR[32+rsi] + and rax,QWORD PTR[40+rsi] + + add r14,r8 + adc r15,r9 + adc rbx,r10 + adc rbp,r11 + adc rcx,r12 + adc rdx,rax + + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + mov QWORD PTR[64+rdi],rbx + mov QWORD PTR[72+rdi],rbp + mov QWORD PTR[80+rdi],rcx + mov QWORD PTR[88+rdi],rdx + + lea r8,QWORD PTR[1112+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_ct_inverse_mod_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_ct_inverse_mod_383:: +ct_inverse_mod_383 ENDP + +ALIGN 32 +__smulq_767x63 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rsi + lea rsi,QWORD PTR[48+rsi] + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov QWORD PTR[rdi],rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mov QWORD PTR[8+rdi],r9 + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mov QWORD PTR[16+rdi],r10 + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mov QWORD PTR[24+rdi],r11 + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + mov QWORD PTR[32+rdi],r12 + imul rbp + add r13,rax + adc rdx,0 + + mov QWORD PTR[40+rdi],r13 + mov QWORD PTR[48+rdi],rdx + sar rdx,63 + mov QWORD PTR[56+rdi],rdx + mov rdx,rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + mov r15,QWORD PTR[56+rsi] + mov rbx,QWORD PTR[64+rsi] + mov rbp,QWORD PTR[72+rsi] + mov rcx,QWORD PTR[80+rsi] + mov rdi,QWORD PTR[88+rsi] + + mov rsi,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rsi,rdx + add rsi,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + xor r14,rdx + xor r15,rdx + xor rbx,rdx + xor rbp,rdx + xor rcx,rdx + xor rdi,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rbx,0 + adc rbp,0 + adc rcx,0 + adc rdi,0 + + mul rsi + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rsi + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rsi + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rsi + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rsi + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + mul rsi + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + mul rsi + add r14,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + mul rsi + add r15,rax + mov rax,rbx + adc rdx,0 + mov 
rbx,rdx + mul rsi + add rbx,rax + mov rax,rbp + adc rdx,0 + mov rbp,rdx + mul rsi + add rbp,rax + mov rax,rcx + adc rdx,0 + mov rcx,rdx + mul rsi + add rcx,rax + mov rax,rdi + adc rdx,0 + mov rdi,rdx + mov rdx,QWORD PTR[8+rsp] + imul rax,rsi + mov rsi,QWORD PTR[16+rsp] + add rax,rdi + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + adc r11,QWORD PTR[24+rdx] + adc r12,QWORD PTR[32+rdx] + adc r13,QWORD PTR[40+rdx] + adc r14,QWORD PTR[48+rdx] + mov rdi,QWORD PTR[56+rdx] + adc r15,rdi + adc rbx,rdi + adc rbp,rdi + adc rcx,rdi + adc rax,rdi + + mov rdi,rdx + + mov QWORD PTR[rdx],r8 + mov QWORD PTR[8+rdx],r9 + mov QWORD PTR[16+rdx],r10 + mov QWORD PTR[24+rdx],r11 + mov QWORD PTR[32+rdx],r12 + mov QWORD PTR[40+rdx],r13 + mov QWORD PTR[48+rdx],r14 + mov QWORD PTR[56+rdx],r15 + mov QWORD PTR[64+rdx],rbx + mov QWORD PTR[72+rdx],rbp + mov QWORD PTR[80+rdx],rcx + mov QWORD PTR[88+rdx],rax + + DB 0F3h,0C3h ;repret +__smulq_767x63 ENDP + +ALIGN 32 +__smulq_383x63 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + imul rax,rbp + add r13,rax + + lea rsi,QWORD PTR[48+rsi] + mov rdx,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + imul rax,rbp + add r13,rax + + lea rsi,QWORD PTR[((-48))+rsi] + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc r13,QWORD PTR[40+rdi] + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__smulq_383x63 ENDP + +ALIGN 32 +__smulq_383_n_shift_by_62 PROC PRIVATE + DB 243,15,30,250 + + mov rbx,rdx + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add 
rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + imul rbp + add r13,rax + adc rdx,0 + + lea rsi,QWORD PTR[48+rsi] + mov r14,rdx + mov rdx,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rbp,rdx + sar rdx,63 + xor rax,rax + sub rax,rdx + + xor rbp,rdx + add rbp,rax + + xor r8,rdx + xor r9,rdx + xor r10,rdx + xor r11,rdx + xor r12,rdx + xor r13,rdx + add rax,r8 + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mul rbp + mov r8,rax + mov rax,r9 + mov r9,rdx + mul rbp + add r9,rax + mov rax,r10 + adc rdx,0 + mov r10,rdx + mul rbp + add r10,rax + mov rax,r11 + adc rdx,0 + mov r11,rdx + mul rbp + add r11,rax + mov rax,r12 + adc rdx,0 + mov r12,rdx + mul rbp + add r12,rax + mov rax,r13 + adc rdx,0 + mov r13,rdx + imul rbp + add r13,rax + adc rdx,0 + + lea rsi,QWORD PTR[((-48))+rsi] + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc r13,QWORD PTR[40+rdi] + adc r14,rdx + mov rdx,rbx + + shrd r8,r9,62 + shrd r9,r10,62 + shrd r10,r11,62 + shrd r11,r12,62 + shrd r12,r13,62 + shrd r13,r14,62 + + sar r14,63 + xor rbp,rbp + sub rbp,r14 + + xor r8,r14 + xor r9,r14 + xor r10,r14 + xor r11,r14 + xor r12,r14 + xor r13,r14 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + xor rdx,r14 + xor rcx,r14 + add rdx,rbp + add rcx,rbp + + DB 0F3h,0C3h ;repret +__smulq_383_n_shift_by_62 ENDP + +ALIGN 32 +__ab_approximation_62 PROC PRIVATE + DB 243,15,30,250 + + mov r9,QWORD PTR[40+rsi] + mov r11,QWORD PTR[88+rsi] + mov rbx,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[80+rsi] + mov r8,QWORD PTR[24+rsi] + mov r10,QWORD PTR[72+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + mov r8,QWORD PTR[16+rsi] + mov r10,QWORD PTR[64+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + mov r8,QWORD PTR[8+rsi] + mov r10,QWORD PTR[56+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + mov r8,QWORD PTR[rsi] + mov r10,QWORD PTR[48+rsi] + + mov rax,r9 + or rax,r11 + bsr rcx,rax + lea rcx,QWORD PTR[1+rcx] + cmovz r9,rbx + cmovz r11,rbp + cmovz rcx,rax + neg rcx + + + shld r9,rbx,cl + shld r11,rbp,cl + + jmp __inner_loop_62 + + DB 0F3h,0C3h ;repret +__ab_approximation_62 ENDP + +ALIGN 8 + DD 0 +__inner_loop_62 PROC PRIVATE + DB 243,15,30,250 + + mov rdx,1 + xor rcx,rcx + xor r12,r12 + mov r13,1 + mov QWORD PTR[8+rsp],rsi + +$L$oop_62:: + xor rax,rax + xor rbx,rbx + test r8,1 + mov rbp,r10 + mov r14,r11 + cmovnz rax,r10 + cmovnz rbx,r11 + sub rbp,r8 + sbb r14,r9 + mov r15,r8 + mov rsi,r9 + sub r8,rax + sbb r9,rbx + cmovc r8,rbp + cmovc r9,r14 + cmovc r10,r15 + cmovc r11,rsi + mov rax,rdx + cmovc rdx,r12 + cmovc r12,rax + 
mov rbx,rcx + cmovc rcx,r13 + cmovc r13,rbx + xor rax,rax + xor rbx,rbx + shrd r8,r9,1 + shr r9,1 + test r15,1 + cmovnz rax,r12 + cmovnz rbx,r13 + add r12,r12 + add r13,r13 + sub rdx,rax + sub rcx,rbx + sub edi,1 + jnz $L$oop_62 + + mov rsi,QWORD PTR[8+rsp] + DB 0F3h,0C3h ;repret +__inner_loop_62 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_ct_inverse_mod_383 + DD imagerel $L$SEH_body_ct_inverse_mod_383 + DD imagerel $L$SEH_info_ct_inverse_mod_383_prologue + + DD imagerel $L$SEH_body_ct_inverse_mod_383 + DD imagerel $L$SEH_epilogue_ct_inverse_mod_383 + DD imagerel $L$SEH_info_ct_inverse_mod_383_body + + DD imagerel $L$SEH_epilogue_ct_inverse_mod_383 + DD imagerel $L$SEH_end_ct_inverse_mod_383 + DD imagerel $L$SEH_info_ct_inverse_mod_383_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_ct_inverse_mod_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_ct_inverse_mod_383_body:: +DB 1,0,18,0 +DB 000h,0f4h,08bh,000h +DB 000h,0e4h,08ch,000h +DB 000h,0d4h,08dh,000h +DB 000h,0c4h,08eh,000h +DB 000h,034h,08fh,000h +DB 000h,054h,090h,000h +DB 000h,074h,092h,000h +DB 000h,064h,093h,000h +DB 000h,001h,091h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_ct_inverse_mod_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm b/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm new file mode 100644 index 00000000000..024da69a645 --- /dev/null +++ b/crypto/blst_src/build/win64/ctx_inverse_mod_384-x86_64.asm @@ -0,0 +1,1609 @@ +OPTION DOTNAME +PUBLIC ct_inverse_mod_383$1 +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC ctx_inverse_mod_383 + + +ALIGN 32 +ctx_inverse_mod_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_ctx_inverse_mod_383:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +ct_inverse_mod_383$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,1112 + +$L$SEH_body_ctx_inverse_mod_383:: + + + lea rax,QWORD PTR[((88+511))+rsp] + and rax,-512 + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[40+rsp],rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,QWORD PTR[rdx] + mov r15,QWORD PTR[8+rdx] + mov rbx,QWORD PTR[16+rdx] + mov rbp,QWORD PTR[24+rdx] + mov rsi,QWORD PTR[32+rdx] + mov rdi,QWORD PTR[40+rdx] + + mov QWORD PTR[rax],r8 + mov QWORD PTR[8+rax],r9 + mov QWORD PTR[16+rax],r10 + mov QWORD PTR[24+rax],r11 + mov QWORD PTR[32+rax],r12 + mov QWORD PTR[40+rax],r13 + + mov QWORD PTR[48+rax],r14 + mov QWORD PTR[56+rax],r15 + mov QWORD PTR[64+rax],rbx + mov QWORD PTR[72+rax],rbp + mov QWORD PTR[80+rax],rsi + mov rsi,rax + mov QWORD PTR[88+rax],rdi + + + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + + + mov QWORD PTR[96+rdi],rdx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + + + mov QWORD PTR[96+rdi],rdx + + + xor rsi,256 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + 
mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + + + + mov rax,QWORD PTR[96+rsi] + mov r11,QWORD PTR[144+rsi] + mov rbx,rdx + mov r10,rax + imul QWORD PTR[56+rsp] + mov r8,rax + mov rax,r11 + mov r9,rdx + imul QWORD PTR[64+rsp] + add r8,rax + adc r9,rdx + mov QWORD PTR[48+rdi],r8 + mov QWORD PTR[56+rdi],r9 + sar r9,63 + mov QWORD PTR[64+rdi],r9 + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r9 + mov QWORD PTR[88+rdi],r9 + lea rsi,QWORD PTR[96+rsi] + + mov rax,r10 + imul rbx + mov r8,rax + mov rax,r11 + mov r9,rdx + imul rcx + add r8,rax + adc r9,rdx + mov QWORD PTR[96+rdi],r8 + mov QWORD PTR[104+rdi],r9 + sar r9,63 + mov QWORD PTR[112+rdi],r9 + mov QWORD PTR[120+rdi],r9 + mov QWORD PTR[128+rdi],r9 + mov QWORD PTR[136+rdi],r9 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call 
__ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] 
+ lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + sar r13,63 + mov QWORD PTR[48+rdi],r13 + mov QWORD PTR[56+rdi],r13 + mov QWORD PTR[64+rdi],r13 + mov QWORD PTR[72+rdi],r13 + mov QWORD PTR[80+rdi],r13 + mov QWORD PTR[88+rdi],r13 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD 
PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_383_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_191_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_191_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_191_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_191_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_191_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_191_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD 
PTR[48+rdi] + call __smulx_767x63 + xor rsi,256+8*12 + mov edi,31 + call __ab_approximation_31 + + + mov QWORD PTR[72+rsp],r12 + mov QWORD PTR[80+rsp],r13 + + mov rdi,256 + xor rdi,rsi + call __smulx_191_n_shift_by_31 + mov QWORD PTR[56+rsp],rdx + mov QWORD PTR[64+rsp],rcx + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_191_n_shift_by_31 + mov QWORD PTR[72+rsp],rdx + mov QWORD PTR[80+rsp],rcx + + mov rdx,QWORD PTR[56+rsp] + mov rcx,QWORD PTR[64+rsp] + lea rsi,QWORD PTR[96+rsi] + lea rdi,QWORD PTR[48+rdi] + call __smulx_383x63 + + mov rdx,QWORD PTR[72+rsp] + mov rcx,QWORD PTR[80+rsp] + lea rdi,QWORD PTR[48+rdi] + call __smulx_767x63 + + xor rsi,256+8*12 + mov edi,53 + + mov r8,QWORD PTR[rsi] + + mov r10,QWORD PTR[48+rsi] + + call __tail_loop_53 + + + + + + + + lea rsi,QWORD PTR[96+rsi] + + + + + + mov rdx,r12 + mov rcx,r13 + mov rdi,QWORD PTR[32+rsp] + call __smulx_767x63 + + mov rsi,QWORD PTR[40+rsp] + mov rdx,rax + sar rax,63 + + mov r8,rax + mov r9,rax + mov r10,rax + and r8,QWORD PTR[rsi] + and r9,QWORD PTR[8+rsi] + mov r11,rax + and r10,QWORD PTR[16+rsi] + and r11,QWORD PTR[24+rsi] + mov r12,rax + and r12,QWORD PTR[32+rsi] + and rax,QWORD PTR[40+rsi] + + add r14,r8 + adc r15,r9 + adc rbx,r10 + adc rbp,r11 + adc rcx,r12 + adc rdx,rax + + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + mov QWORD PTR[64+rdi],rbx + mov QWORD PTR[72+rdi],rbp + mov QWORD PTR[80+rdi],rcx + mov QWORD PTR[88+rdi],rdx + + lea r8,QWORD PTR[1112+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_ctx_inverse_mod_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_ctx_inverse_mod_383:: +ctx_inverse_mod_383 ENDP + +ALIGN 32 +__smulx_767x63 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rsi + lea rsi,QWORD PTR[48+rsi] + + xor rdx,rax + add rdx,rbp + + xor r8,rax + xor r9,rax + xor r10,rax + xor r11,rax + xor r12,rax + xor rax,r13 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc rax,0 + + mulx rbp,r8,r8 + mulx r13,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,r13 + mulx r13,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,r13 + adc rbp,0 + imul rdx + add rax,rbp + adc rdx,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],rax + mov QWORD PTR[48+rdi],rdx + sar rdx,63 + mov QWORD PTR[56+rdi],rdx + mov rdx,rcx + mov rax,rcx + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + mov r15,QWORD PTR[56+rsi] + mov rbx,QWORD PTR[64+rsi] + mov rbp,QWORD PTR[72+rsi] + mov rcx,QWORD PTR[80+rsi] + mov rdi,QWORD PTR[88+rsi] + + sar rax,63 + xor rsi,rsi + sub rsi,rax + + xor rdx,rax + add rdx,rsi + + xor r8,rax + xor r9,rax + xor r10,rax + xor r11,rax + xor r12,rax + xor r13,rax + xor r14,rax + xor r15,rax + xor rbx,rax + xor rbp,rax + xor rcx,rax + xor rdi,rax + add r8,rsi + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + 
adc r13,0 + adc r14,0 + adc r15,0 + adc rbx,0 + adc rbp,0 + adc rcx,0 + adc rdi,0 + + mulx rax,r8,r8 + mulx rsi,r9,r9 + add r9,rax + mulx rax,r10,r10 + adc r10,rsi + mulx rsi,r11,r11 + adc r11,rax + mulx rax,r12,r12 + adc r12,rsi + mulx rsi,r13,r13 + adc r13,rax + mulx rax,r14,r14 + adc r14,rsi + mulx rsi,r15,r15 + adc r15,rax + mulx rax,rbx,rbx + adc rbx,rsi + mulx rsi,rbp,rbp + adc rbp,rax + mulx rax,rcx,rcx + adc rcx,rsi + mulx rsi,rdi,rdi + mov rdx,QWORD PTR[8+rsp] + mov rsi,QWORD PTR[16+rsp] + adc rax,rdi + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + adc r11,QWORD PTR[24+rdx] + adc r12,QWORD PTR[32+rdx] + adc r13,QWORD PTR[40+rdx] + adc r14,QWORD PTR[48+rdx] + mov rdi,QWORD PTR[56+rdx] + adc r15,rdi + adc rbx,rdi + adc rbp,rdi + adc rcx,rdi + adc rax,rdi + + mov rdi,rdx + + mov QWORD PTR[rdx],r8 + mov QWORD PTR[8+rdx],r9 + mov QWORD PTR[16+rdx],r10 + mov QWORD PTR[24+rdx],r11 + mov QWORD PTR[32+rdx],r12 + mov QWORD PTR[40+rdx],r13 + mov QWORD PTR[48+rdx],r14 + mov QWORD PTR[56+rdx],r15 + mov QWORD PTR[64+rdx],rbx + mov QWORD PTR[72+rdx],rbp + mov QWORD PTR[80+rdx],rcx + mov QWORD PTR[88+rdx],rax + + DB 0F3h,0C3h ;repret +__smulx_767x63 ENDP + +ALIGN 32 +__smulx_383x63 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + mov r11,QWORD PTR[((0+24))+rsi] + mov r12,QWORD PTR[((0+32))+rsi] + mov r13,QWORD PTR[((0+40))+rsi] + + mov rbp,rdx + sar rbp,63 + xor rax,rax + sub rax,rbp + + xor rdx,rbp + add rdx,rax + + xor r8,rbp + xor r9,rbp + xor r10,rbp + xor r11,rbp + xor r12,rbp + xor r13,rbp + add r8,rax + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mulx rbp,r8,r8 + mulx rax,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,rax + mulx rax,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,rax + mulx rax,r13,r13 + mov rdx,rcx + adc r13,rbp + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + mov r8,QWORD PTR[((48+0))+rsi] + mov r9,QWORD PTR[((48+8))+rsi] + mov r10,QWORD PTR[((48+16))+rsi] + mov r11,QWORD PTR[((48+24))+rsi] + mov r12,QWORD PTR[((48+32))+rsi] + mov r13,QWORD PTR[((48+40))+rsi] + + mov rbp,rdx + sar rbp,63 + xor rax,rax + sub rax,rbp + + xor rdx,rbp + add rdx,rax + + xor r8,rbp + xor r9,rbp + xor r10,rbp + xor r11,rbp + xor r12,rbp + xor r13,rbp + add r8,rax + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + + mulx rbp,r8,r8 + mulx rax,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,rax + mulx rax,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,rax + mulx rax,r13,r13 + adc r13,rbp + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc r13,QWORD PTR[40+rdi] + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__smulx_383x63 ENDP + +ALIGN 32 +__smulx_383_n_shift_by_31 PROC PRIVATE + DB 243,15,30,250 + + mov rbx,rdx + xor r14,r14 + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + mov r11,QWORD PTR[((0+24))+rsi] + mov r12,QWORD PTR[((0+32))+rsi] + mov r13,QWORD PTR[((0+40))+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + xor rdx,rax + add rdx,rbp + + xor r8,rax + xor r9,rax + xor r10,rax + xor r11,rax + xor r12,rax + xor rax,r13 + add r8,rbp + 
adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc rax,0 + + mulx rbp,r8,r8 + mulx r13,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,r13 + mulx r13,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,r13 + adc rbp,0 + imul rdx + add rax,rbp + adc r14,rdx + + mov rdx,rcx + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],rax + mov r8,QWORD PTR[((48+0))+rsi] + mov r9,QWORD PTR[((48+8))+rsi] + mov r10,QWORD PTR[((48+16))+rsi] + mov r11,QWORD PTR[((48+24))+rsi] + mov r12,QWORD PTR[((48+32))+rsi] + mov r13,QWORD PTR[((48+40))+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + xor rdx,rax + add rdx,rbp + + xor r8,rax + xor r9,rax + xor r10,rax + xor r11,rax + xor r12,rax + xor rax,r13 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc rax,0 + + mulx rbp,r8,r8 + mulx r13,r9,r9 + add r9,rbp + mulx rbp,r10,r10 + adc r10,r13 + mulx r13,r11,r11 + adc r11,rbp + mulx rbp,r12,r12 + adc r12,r13 + adc rbp,0 + imul rdx + add rax,rbp + adc rdx,0 + + add r8,QWORD PTR[rdi] + adc r9,QWORD PTR[8+rdi] + adc r10,QWORD PTR[16+rdi] + adc r11,QWORD PTR[24+rdi] + adc r12,QWORD PTR[32+rdi] + adc rax,QWORD PTR[40+rdi] + adc r14,rdx + mov rdx,rbx + + shrd r8,r9,31 + shrd r9,r10,31 + shrd r10,r11,31 + shrd r11,r12,31 + shrd r12,rax,31 + shrd rax,r14,31 + + sar r14,63 + xor rbp,rbp + sub rbp,r14 + + xor r8,r14 + xor r9,r14 + xor r10,r14 + xor r11,r14 + xor r12,r14 + xor rax,r14 + add r8,rbp + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc rax,0 + + mov QWORD PTR[rdi],r8 + mov QWORD PTR[8+rdi],r9 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],rax + + xor rdx,r14 + xor rcx,r14 + add rdx,rbp + add rcx,rbp + + DB 0F3h,0C3h ;repret +__smulx_383_n_shift_by_31 ENDP + +ALIGN 32 +__smulx_191_n_shift_by_31 PROC PRIVATE + DB 243,15,30,250 + + mov rbx,rdx + mov r8,QWORD PTR[((0+0))+rsi] + mov r9,QWORD PTR[((0+8))+rsi] + mov r10,QWORD PTR[((0+16))+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + xor rdx,rax + add rdx,rbp + + xor r8,rax + xor r9,rax + xor rax,r10 + add r8,rbp + adc r9,0 + adc rax,0 + + mulx rbp,r8,r8 + mulx r10,r9,r9 + add r9,rbp + adc r10,0 + imul rdx + add r10,rax + adc rdx,0 + mov r14,rdx + mov rdx,rcx + mov r11,QWORD PTR[((48+0))+rsi] + mov r12,QWORD PTR[((48+8))+rsi] + mov r13,QWORD PTR[((48+16))+rsi] + + mov rax,rdx + sar rax,63 + xor rbp,rbp + sub rbp,rax + + xor rdx,rax + add rdx,rbp + + xor r11,rax + xor r12,rax + xor rax,r13 + add r11,rbp + adc r12,0 + adc rax,0 + + mulx rbp,r11,r11 + mulx r13,r12,r12 + add r12,rbp + adc r13,0 + imul rdx + add r13,rax + adc rdx,0 + add r11,r8 + adc r12,r9 + adc r13,r10 + adc r14,rdx + mov rdx,rbx + + shrd r11,r12,31 + shrd r12,r13,31 + shrd r13,r14,31 + + sar r14,63 + xor rbp,rbp + sub rbp,r14 + + xor r11,r14 + xor r12,r14 + xor r13,r14 + add r11,rbp + adc r12,0 + adc r13,0 + + mov QWORD PTR[rdi],r11 + mov QWORD PTR[8+rdi],r12 + mov QWORD PTR[16+rdi],r13 + + xor rdx,r14 + xor rcx,r14 + add rdx,rbp + add rcx,rbp + + DB 0F3h,0C3h ;repret +__smulx_191_n_shift_by_31 ENDP + +ALIGN 32 +__ab_approximation_31 PROC PRIVATE + DB 243,15,30,250 + + mov r9,QWORD PTR[40+rsi] + mov r11,QWORD PTR[88+rsi] + mov rbx,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[80+rsi] + mov r8,QWORD PTR[24+rsi] + mov r10,QWORD PTR[72+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + mov r8,QWORD PTR[16+rsi] + cmovz rbp,r10 + mov r10,QWORD PTR[64+rsi] + + mov rax,r9 + 
or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + mov r8,QWORD PTR[8+rsi] + cmovz rbp,r10 + mov r10,QWORD PTR[56+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + mov r8,QWORD PTR[rsi] + cmovz rbp,r10 + mov r10,QWORD PTR[48+rsi] + + mov rax,r9 + or rax,r11 + cmovz r9,rbx + cmovz r11,rbp + cmovz rbx,r8 + cmovz rbp,r10 + + mov rax,r9 + or rax,r11 + bsr rcx,rax + lea rcx,QWORD PTR[1+rcx] + cmovz r9,r8 + cmovz r11,r10 + cmovz rcx,rax + neg rcx + + + shld r9,rbx,cl + shld r11,rbp,cl + + mov eax,07FFFFFFFh + and r8,rax + and r10,rax + andn r9,rax,r9 + andn r11,rax,r11 + or r8,r9 + or r10,r11 + + jmp __inner_loop_31 + + DB 0F3h,0C3h ;repret +__ab_approximation_31 ENDP + +ALIGN 32 +__inner_loop_31 PROC PRIVATE + DB 243,15,30,250 + + mov rcx,07FFFFFFF80000000h + mov r13,0800000007FFFFFFFh + mov r15,07FFFFFFF7FFFFFFFh + +$L$oop_31:: + cmp r8,r10 + mov rax,r8 + mov rbx,r10 + mov rbp,rcx + mov r14,r13 + cmovb r8,r10 + cmovb r10,rax + cmovb rcx,r13 + cmovb r13,rbp + + sub r8,r10 + sub rcx,r13 + add rcx,r15 + + test rax,1 + cmovz r8,rax + cmovz r10,rbx + cmovz rcx,rbp + cmovz r13,r14 + + shr r8,1 + add r13,r13 + sub r13,r15 + sub edi,1 + jnz $L$oop_31 + + shr r15,32 + mov edx,ecx + mov r12d,r13d + shr rcx,32 + shr r13,32 + sub rdx,r15 + sub rcx,r15 + sub r12,r15 + sub r13,r15 + + DB 0F3h,0C3h ;repret +__inner_loop_31 ENDP + + +ALIGN 32 +__tail_loop_53 PROC PRIVATE + DB 243,15,30,250 + + mov rdx,1 + xor rcx,rcx + xor r12,r12 + mov r13,1 + +$L$oop_53:: + xor rax,rax + test r8,1 + mov rbx,r10 + cmovnz rax,r10 + sub rbx,r8 + mov rbp,r8 + sub r8,rax + cmovc r8,rbx + cmovc r10,rbp + mov rax,rdx + cmovc rdx,r12 + cmovc r12,rax + mov rbx,rcx + cmovc rcx,r13 + cmovc r13,rbx + xor rax,rax + xor rbx,rbx + shr r8,1 + test rbp,1 + cmovnz rax,r12 + cmovnz rbx,r13 + add r12,r12 + add r13,r13 + sub rdx,rax + sub rcx,rbx + sub edi,1 + jnz $L$oop_53 + + DB 0F3h,0C3h ;repret +__tail_loop_53 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_ctx_inverse_mod_383 + DD imagerel $L$SEH_body_ctx_inverse_mod_383 + DD imagerel $L$SEH_info_ctx_inverse_mod_383_prologue + + DD imagerel $L$SEH_body_ctx_inverse_mod_383 + DD imagerel $L$SEH_epilogue_ctx_inverse_mod_383 + DD imagerel $L$SEH_info_ctx_inverse_mod_383_body + + DD imagerel $L$SEH_epilogue_ctx_inverse_mod_383 + DD imagerel $L$SEH_end_ctx_inverse_mod_383 + DD imagerel $L$SEH_info_ctx_inverse_mod_383_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_ctx_inverse_mod_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_ctx_inverse_mod_383_body:: +DB 1,0,18,0 +DB 000h,0f4h,08bh,000h +DB 000h,0e4h,08ch,000h +DB 000h,0d4h,08dh,000h +DB 000h,0c4h,08eh,000h +DB 000h,034h,08fh,000h +DB 000h,054h,090h,000h +DB 000h,074h,092h,000h +DB 000h,064h,093h,000h +DB 000h,001h,091h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_ctx_inverse_mod_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/div3w-armv8.asm b/crypto/blst_src/build/win64/div3w-armv8.asm new file mode 100644 index 00000000000..aec90679eea --- /dev/null +++ b/crypto/blst_src/build/win64/div3w-armv8.asm @@ -0,0 +1,89 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |div_3_limbs|[FUNC] + ALIGN 32 +|div_3_limbs| PROC + ldp x4,x5,[x0] // load R + eor x0,x0,x0 // Q = 0 + mov x3,#64 // loop counter + nop + +|$Loop| + subs x6,x4,x1 // R - D + add x0,x0,x0 
// Q <<= 1 + sbcs x7,x5,x2 + add x0,x0,#1 // Q + speculative bit + csello x4,x4,x6 + extr x1,x2,x1,#1 // D >>= 1 + csello x5,x5,x7 + lsr x2,x2,#1 + sbc x0,x0,xzr // subtract speculative bit + sub x3,x3,#1 + cbnz x3,|$Loop| + + asr x3,x0,#63 // top bit -> mask + add x0,x0,x0 // Q <<= 1 + subs x6,x4,x1 // R - D + add x0,x0,#1 // Q + speculative bit + sbcs x7,x5,x2 + sbc x0,x0,xzr // subtract speculative bit + + orr x0,x0,x3 // all ones if overflow + + ret + ENDP + + EXPORT |quot_rem_128|[FUNC] + ALIGN 32 +|quot_rem_128| PROC + ldp x3,x4,[x1] + + mul x5,x3,x2 // divisor[0:1} * quotient + umulh x6,x3,x2 + mul x11, x4,x2 + umulh x7,x4,x2 + + ldp x8,x9,[x0] // load 3 limbs of the dividend + ldr x10,[x0,#16] + + adds x6,x6,x11 + adc x7,x7,xzr + + subs x8,x8,x5 // dividend - divisor * quotient + sbcs x9,x9,x6 + sbcs x10,x10,x7 + sbc x5,xzr,xzr // borrow -> mask + + add x2,x2,x5 // if borrowed, adjust the quotient ... + and x3,x3,x5 + and x4,x4,x5 + adds x8,x8,x3 // ... and add divisor + adc x9,x9,x4 + + stp x8,x9,[x0] // save 2 limbs of the remainder + str x2,[x0,#16] // and one limb of the quotient + + mov x0,x2 // return adjusted quotient + + ret + ENDP + + + EXPORT |quot_rem_64|[FUNC] + ALIGN 32 +|quot_rem_64| PROC + ldr x3,[x1] + ldr x8,[x0] // load 1 limb of the dividend + + mul x5,x3,x2 // divisor * quotient + + sub x8,x8,x5 // dividend - divisor * quotient + + stp x8,x2,[x0] // save remainder and quotient + + mov x0,x2 // return quotient + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/div3w-x86_64.asm b/crypto/blst_src/build/win64/div3w-x86_64.asm new file mode 100644 index 00000000000..805c5b1fcb0 --- /dev/null +++ b/crypto/blst_src/build/win64/div3w-x86_64.asm @@ -0,0 +1,257 @@ +OPTION DOTNAME +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC div_3_limbs + + +ALIGN 32 +div_3_limbs PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_div_3_limbs:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 +$L$SEH_body_div_3_limbs:: + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + xor rax,rax + mov ecx,64 + +$L$oop:: + mov r10,r8 + sub r8,rsi + mov r11,r9 + sbb r9,rdx + lea rax,QWORD PTR[1+rax*1+rax] + mov rdi,rdx + cmovc r8,r10 + cmovc r9,r11 + sbb rax,0 + shl rdi,63 + shr rsi,1 + shr rdx,1 + or rsi,rdi + sub ecx,1 + jnz $L$oop + + lea rcx,QWORD PTR[1+rax*1+rax] + sar rax,63 + + sub r8,rsi + sbb r9,rdx + sbb rcx,0 + + or rax,rcx + +$L$SEH_epilogue_div_3_limbs:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_div_3_limbs:: +div_3_limbs ENDP +PUBLIC quot_rem_128 + + +ALIGN 32 +quot_rem_128 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_quot_rem_128:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 +$L$SEH_body_quot_rem_128:: + + mov rax,rdx + mov rcx,rdx + + mul QWORD PTR[rsi] + mov r8,rax + mov rax,rcx + mov r9,rdx + + mul QWORD PTR[8+rsi] + add r9,rax + adc rdx,0 + + mov r10,QWORD PTR[rdi] + mov r11,QWORD PTR[8+rdi] + mov rax,QWORD PTR[16+rdi] + + sub r10,r8 + sbb r11,r9 + sbb rax,rdx + sbb r8,r8 + + add rcx,r8 + mov r9,r8 + and r8,QWORD PTR[rsi] + and r9,QWORD PTR[8+rsi] + add r10,r8 + adc r11,r9 + + mov QWORD PTR[rdi],r10 + mov QWORD PTR[8+rdi],r11 + mov QWORD PTR[16+rdi],rcx + + mov rax,rcx + +$L$SEH_epilogue_quot_rem_128:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_quot_rem_128:: +quot_rem_128 ENDP + + + + 
+ +PUBLIC quot_rem_64 + + +ALIGN 32 +quot_rem_64 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_quot_rem_64:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 +$L$SEH_body_quot_rem_64:: + + mov rax,rdx + imul rdx,QWORD PTR[rsi] + + mov r10,QWORD PTR[rdi] + + sub r10,rdx + + mov QWORD PTR[rdi],r10 + mov QWORD PTR[8+rdi],rax + +$L$SEH_epilogue_quot_rem_64:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_quot_rem_64:: +quot_rem_64 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_div_3_limbs + DD imagerel $L$SEH_body_div_3_limbs + DD imagerel $L$SEH_info_div_3_limbs_prologue + + DD imagerel $L$SEH_body_div_3_limbs + DD imagerel $L$SEH_epilogue_div_3_limbs + DD imagerel $L$SEH_info_div_3_limbs_body + + DD imagerel $L$SEH_epilogue_div_3_limbs + DD imagerel $L$SEH_end_div_3_limbs + DD imagerel $L$SEH_info_div_3_limbs_epilogue + + DD imagerel $L$SEH_begin_quot_rem_128 + DD imagerel $L$SEH_body_quot_rem_128 + DD imagerel $L$SEH_info_quot_rem_128_prologue + + DD imagerel $L$SEH_body_quot_rem_128 + DD imagerel $L$SEH_epilogue_quot_rem_128 + DD imagerel $L$SEH_info_quot_rem_128_body + + DD imagerel $L$SEH_epilogue_quot_rem_128 + DD imagerel $L$SEH_end_quot_rem_128 + DD imagerel $L$SEH_info_quot_rem_128_epilogue + + DD imagerel $L$SEH_begin_quot_rem_64 + DD imagerel $L$SEH_body_quot_rem_64 + DD imagerel $L$SEH_info_quot_rem_64_prologue + + DD imagerel $L$SEH_body_quot_rem_64 + DD imagerel $L$SEH_epilogue_quot_rem_64 + DD imagerel $L$SEH_info_quot_rem_64_body + + DD imagerel $L$SEH_epilogue_quot_rem_64 + DD imagerel $L$SEH_end_quot_rem_64 + DD imagerel $L$SEH_info_quot_rem_64_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_div_3_limbs_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_div_3_limbs_body:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h +$L$SEH_info_div_3_limbs_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_quot_rem_128_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_quot_rem_128_body:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h +$L$SEH_info_quot_rem_128_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_quot_rem_64_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_quot_rem_64_body:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h +$L$SEH_info_quot_rem_64_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/dll.c b/crypto/blst_src/build/win64/dll.c new file mode 100644 index 00000000000..a70d0c98a23 --- /dev/null +++ b/crypto/blst_src/build/win64/dll.c @@ -0,0 +1,32 @@ +#include + +#if defined(_MSC_VER) +/* + * Even though we don't have memcpy/memset anywhere, MSVC compiler + * generates calls to them as it recognizes corresponding patterns. 
+ */ +void *memcpy(unsigned char *dst, const unsigned char *src, size_t n) +{ + void *ret = dst; + + while(n--) + *dst++ = *src++; + + return ret; +} + +void *memset(unsigned char *dst, int c, size_t n) +{ + void *ret = dst; + + while(n--) + *dst++ = (unsigned char)c; + + return ret; +} +#elif defined(__GNUC__) +# pragma GCC diagnostic ignored "-Wunused-parameter" +#endif + +BOOL WINAPI DllMain(HINSTANCE hinstDLL, DWORD fdwReason, LPVOID lpvReserved) +{ return TRUE; } diff --git a/crypto/blst_src/build/win64/mul_mont_256-armv8.asm b/crypto/blst_src/build/win64/mul_mont_256-armv8.asm new file mode 100644 index 00000000000..bb2dfe043c7 --- /dev/null +++ b/crypto/blst_src/build/win64/mul_mont_256-armv8.asm @@ -0,0 +1,465 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + + EXPORT |mul_mont_sparse_256|[FUNC] + ALIGN 32 +|mul_mont_sparse_256| PROC + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x10,x11,[x1] + ldr x9, [x2] + ldp x12,x13,[x1,#16] + + mul x19,x10,x9 + ldp x5,x6,[x3] + mul x20,x11,x9 + ldp x7,x8,[x3,#16] + mul x21,x12,x9 + mul x22,x13,x9 + + umulh x14,x10,x9 + umulh x15,x11,x9 + mul x3,x4,x19 + umulh x16,x12,x9 + umulh x17,x13,x9 + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,xzr, x17 + mul x17,x8,x3 + ldr x9,[x2,8*1] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*2] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + ldr x9,[x2,8*3] + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + mul x14,x10,x9 + adcs x20,x21,x15 + mul x15,x11,x9 + adcs x21,x22,x16 + mul x16,x12,x9 + adcs x22,x23,x17 + mul x17,x13,x9 + adc x23,xzr,xzr + + adds x19,x19,x14 + umulh x14,x10,x9 + adcs x20,x20,x15 + umulh x15,x11,x9 + adcs x21,x21,x16 + mul x3,x4,x19 + umulh x16,x12,x9 + adcs x22,x22,x17 + umulh x17,x13,x9 + adc x23,x23,xzr + + adds x20,x20,x14 + //mul x14,x5,x3 + adcs x21,x21,x15 + mul x15,x6,x3 + adcs x22,x22,x16 + mul x16,x7,x3 + adc x23,x23,x17 + mul x17,x8,x3 + subs xzr,x19,#1 //adds x19,x19,x14 + umulh x14,x5,x3 + adcs x20,x20,x15 + umulh x15,x6,x3 + adcs 
x21,x21,x16 + umulh x16,x7,x3 + adcs x22,x22,x17 + umulh x17,x8,x3 + adc x23,x23,xzr + + adds x19,x20,x14 + adcs x20,x21,x15 + adcs x21,x22,x16 + adcs x22,x23,x17 + adc x23,xzr,xzr + + subs x14,x19,x5 + sbcs x15,x20,x6 + sbcs x16,x21,x7 + sbcs x17,x22,x8 + sbcs xzr, x23,xzr + + csello x19,x19,x14 + csello x20,x20,x15 + csello x21,x21,x16 + csello x22,x22,x17 + + stp x19,x20,[x0] + stp x21,x22,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret + ENDP + + + EXPORT |sqr_mont_sparse_256|[FUNC] + ALIGN 32 +|sqr_mont_sparse_256| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + mov x4,x3 + + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is x10 + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x11,x6,x5 // a[1]*a[0] + umulh x15,x6,x5 + mul x12,x7,x5 // a[2]*a[0] + umulh x16,x7,x5 + mul x13,x8,x5 // a[3]*a[0] + umulh x19,x8,x5 + + adds x12,x12,x15 // accumulate high parts of multiplication + mul x14,x7,x6 // a[2]*a[1] + umulh x15,x7,x6 + adcs x13,x13,x16 + mul x16,x8,x6 // a[3]*a[1] + umulh x17,x8,x6 + adc x19,x19,xzr // can't overflow + + mul x20,x8,x7 // a[3]*a[2] + umulh x21,x8,x7 + + adds x15,x15,x16 // accumulate high parts of multiplication + mul x10,x5,x5 // a[0]*a[0] + adc x16,x17,xzr // can't overflow + + adds x13,x13,x14 // accumulate low parts of multiplication + umulh x5,x5,x5 + adcs x19,x19,x15 + mul x15,x6,x6 // a[1]*a[1] + adcs x20,x20,x16 + umulh x6,x6,x6 + adc x21,x21,xzr // can't overflow + + adds x11,x11,x11 // acc[1-6]*=2 + mul x16,x7,x7 // a[2]*a[2] + adcs x12,x12,x12 + umulh x7,x7,x7 + adcs x13,x13,x13 + mul x17,x8,x8 // a[3]*a[3] + adcs x19,x19,x19 + umulh x8,x8,x8 + adcs x20,x20,x20 + adcs x21,x21,x21 + adc x22,xzr,xzr + + adds x11,x11,x5 // +a[i]*a[i] + adcs x12,x12,x15 + adcs x13,x13,x6 + adcs x19,x19,x16 + adcs x20,x20,x7 + adcs x21,x21,x17 + adc x22,x22,x8 + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + adds x10,x10,x19 // accumulate upper half + adcs x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adc x19,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x19,xzr + + csello x10,x10,x14 + csello x11,x11,x15 + csello x12,x12,x16 + csello x13,x13,x17 + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldr x29,[sp],#48 + DCDU 3573752767 + ret + ENDP + + + EXPORT |from_mont_256|[FUNC] + ALIGN 32 +|from_mont_256| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + + csello x10,x10,x14 + csello x11,x11,x15 + csello x12,x12,x16 + csello x13,x13,x17 + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |redc_mont_256|[FUNC] + ALIGN 32 +|redc_mont_256| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x4,x3 + ldp x10,x11,[x1] + ldp x12,x13,[x1,#16] + + bl __mul_by_1_mont_256 + ldr x30,[x29,#8] + + ldp x14,x15,[x1,#32] + ldp x16,x17,[x1,#48] + + adds x10,x10,x14 + adcs x11,x11,x15 + adcs x12,x12,x16 + adcs x13,x13,x17 + adc x9,xzr,xzr + + subs x14,x10,x5 + sbcs x15,x11,x6 + sbcs x16,x12,x7 + sbcs x17,x13,x8 + sbcs xzr, x9,xzr + + csello x10,x10,x14 + csello x11,x11,x15 + csello x12,x12,x16 + csello x13,x13,x17 + + stp x10,x11,[x0] + stp x12,x13,[x0,#16] + + ldr x29,[sp],#16 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_by_1_mont_256| PROC + mul x3,x4,x10 + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + mul x3,x4,x10 + adc x13,x9,x17 + //mul x14,x5,x3 + mul x15,x6,x3 + mul x16,x7,x3 + mul x17,x8,x3 + subs xzr,x10,#1 //adds x10,x10,x14 + umulh x14,x5,x3 + adcs x11,x11,x15 + umulh x15,x6,x3 + adcs x12,x12,x16 + umulh x16,x7,x3 + adcs x13,x13,x17 + umulh x17,x8,x3 + adc x9,xzr,xzr + + adds x10,x11,x14 + adcs x11,x12,x15 + adcs x12,x13,x16 + adc x13,x9,x17 + + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/mul_mont_384-armv8.asm b/crypto/blst_src/build/win64/mul_mont_384-armv8.asm new file mode 100644 index 00000000000..a309dfa4121 --- /dev/null +++ b/crypto/blst_src/build/win64/mul_mont_384-armv8.asm @@ -0,0 +1,2373 @@ + AREA |.text|,CODE,ALIGN=8,ARM64 + + + EXPORT |add_mod_384x384|[FUNC] + ALIGN 32 +|add_mod_384x384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __add_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__add_mod_384x384| PROC + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + stp x11, x12, [x0] + adcs x15,x15,x23 + ldp x11, x12, [x1,#48] + adcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + adcs x11,x11,x19 + stp x15, x16, [x0,#32] + adcs x12,x12,x20 + ldp x15, x16, [x1,#80] + adcs x13,x13,x21 + ldp x23,x24,[x2,#80] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csello x11,x11,x19 + csello x12,x12,x20 + csello x13,x13,x21 + csello x14,x14,x22 + stp x11,x12,[x0,#48] + csello x15,x15,x23 + stp x13,x14,[x0,#64] + csello x16,x16,x24 + stp x15,x16,[x0,#80] + + ret + ENDP + + + EXPORT |sub_mod_384x384|[FUNC] + ALIGN 32 +|sub_mod_384x384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__sub_mod_384x384| PROC + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs x14,x14,x22 + stp x11, x12, [x0] + sbcs x15,x15,x23 + ldp x11, x12, [x1,#48] + sbcs x16,x16,x24 + + ldp x19,x20,[x2,#48] + stp x13, x14, [x0,#16] + ldp x13, x14, [x1,#64] + ldp x21,x22,[x2,#64] + + sbcs x11,x11,x19 + stp x15, x16, [x0,#32] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#80] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#80] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + ret + ENDP + + + ALIGN 32 +|__add_mod_384| PROC + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + adds x11,x11,x19 + ldp x21,x22,[x2,#16] + adcs x12,x12,x20 + ldp x15, x16, [x1,#32] + adcs x13,x13,x21 + ldp x23,x24,[x2,#32] + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x17,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x17,xzr + + csello x11,x11,x19 + csello x12,x12,x20 + csello x13,x13,x21 + csello x14,x14,x22 + csello x15,x15,x23 + stp x11,x12,[x0] + csello x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + ENDP + + + ALIGN 32 +|__sub_mod_384| PROC + ldp x11, x12, [x1] + ldp x19,x20,[x2] + ldp x13, x14, [x1,#16] + subs x11,x11,x19 + ldp x21,x22,[x2,#16] + sbcs x12,x12,x20 + ldp x15, x16, [x1,#32] + sbcs x13,x13,x21 + ldp x23,x24,[x2,#32] + sbcs 
x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x17,xzr,xzr + + and x19,x5,x17 + and x20,x6,x17 + adds x11,x11,x19 + and x21,x7,x17 + adcs x12,x12,x20 + and x22,x8,x17 + adcs x13,x13,x21 + and x23,x9,x17 + adcs x14,x14,x22 + and x24,x10,x17 + adcs x15,x15,x23 + stp x11,x12,[x0] + adc x16,x16,x24 + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + ENDP + + + + EXPORT |mul_mont_384x|[FUNC] + ALIGN 32 +|mul_mont_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#288 // space for 3 768-bit vectors + + mov x26,x0 // save r_ptr + mov x27,x1 // save b_ptr + mov x28,x2 // save b_ptr + + sub x0,sp,#0 // mul_384(t0, a->re, b->re) + bl __mul_384 + + add x1,x1,#48 // mul_384(t1, a->im, b->im) + add x2,x2,#48 + add x0,sp,#96 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + sub x2,x1,#48 + add x0,sp,#240 + bl __add_mod_384 + + add x1,x28,#0 + add x2,x28,#48 + add x0,sp,#192 // t2 + bl __add_mod_384 + + add x1,x0,#0 + add x2,x0,#48 + bl __mul_384 // mul_384(t2, a->re+a->im, b->re+b->im) + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,x0 + add x2,sp,#0 + bl __sub_mod_384x384 + + add x2,sp,#96 + bl __sub_mod_384x384 // t2 = t2-t0-t1 + + add x1,sp,#0 + add x2,sp,#96 + add x0,sp,#0 + bl __sub_mod_384x384 // t0 = t0-t1 + + add x1,sp,#0 // ret->re = redc(t0) + add x0,x26,#0 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + + add x1,sp,#192 // ret->im = redc(t2) + add x0,x0,#48 + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#288 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sqr_mont_384x|[FUNC] + ALIGN 32 +|sqr_mont_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 2 384-bit vectors + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + add x0,sp,#0 + bl __add_mod_384 // t0 = a->re + a->im + + add x0,sp,#48 + bl __sub_mod_384 // t1 = a->re - a->im + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __mul_mont_384 // mul_mont_384(ret->im, a->re, a->im) + + adds x11,x11,x11 // add with itself + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csello x19,x11,x19 + csello x20,x12,x20 + csello x21,x13,x21 + ldp x11,x12,[sp] + csello x22,x14,x22 + ldr x17, [sp,#48] + csello x23,x15,x23 + ldp x13,x14,[sp,#16] + csello x24,x16,x24 + ldp x15,x16,[sp,#32] + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + add x2,sp,#48 + bl __mul_mont_384 // mul_mont_384(ret->re, t0, t1) + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |mul_mont_384|[FUNC] + ALIGN 32 +|mul_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_mont_384| PROC + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + mov x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*1] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs 
x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*2] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*3] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*4] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh 
x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + adc x4,x17,xzr + ldr x17,[x2,8*5] + + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,x4,xzr + ldr x4,[x29,#96] + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adcs x25,x25,xzr + adc x17,xzr,xzr + + adds x20,x20,x26 + // mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adcs x25,x25,x3 + mul x3,x10,x4 + adc x17,x17,xzr + subs xzr,x19,#1 // adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adcs x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + adc x17,x17,xzr + + adds x19,x20,x26 + adcs x20,x21,x27 + adcs x21,x22,x28 + adcs x22,x23,x0 + adcs x23,x24,x1 + adcs x24,x25,x3 + adc x25,x17,xzr + + subs x26,x19,x5 + sbcs x27,x20,x6 + sbcs x28,x21,x7 + sbcs x0,x22,x8 + sbcs x1,x23,x9 + sbcs x3,x24,x10 + sbcs xzr, x25,xzr + + csello x11,x19,x26 + csello x12,x20,x27 + csello x13,x21,x28 + csello x14,x22,x0 + csello x15,x23,x1 + csello x16,x24,x3 + ret + ENDP + + + + EXPORT |sqr_mont_384|[FUNC] + ALIGN 32 +|sqr_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for 768-bit vector + mov x4,x3 // adjust for missing b_ptr + + mov x3,x0 // save r_ptr + mov x0,sp + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + mov x1,sp + mov x0,x3 // restore r_ptr + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sqr_n_mul_mont_383|[FUNC] + ALIGN 32 +|sqr_n_mul_mont_383| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x4,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#96 // space for 768-bit vector + mov x17,x5 // save b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + mov x0,sp +|$Loop_sqr_383| + bl __sqr_384 + sub x2,x2,#1 // counter + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + mov x1,sp + bl __mul_by_1_mont_384 + + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // just accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + cbnz x2,|$Loop_sqr_383| + + mov x2,x17 + ldr x17,[x17] + bl __mul_mont_384 + ldr x30,[x29,#8] + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + ALIGN 32 +|__sqr_384| PROC + mul x19,x12,x11 + mul x20,x13,x11 + mul x21,x14,x11 + mul x22,x15,x11 + mul x23,x16,x11 + + umulh x6,x12,x11 + umulh x7,x13,x11 + umulh x8,x14,x11 + umulh x9,x15,x11 + adds x20,x20,x6 + umulh x10,x16,x11 + adcs x21,x21,x7 + mul x7,x13,x12 + adcs x22,x22,x8 + mul x8,x14,x12 + adcs x23,x23,x9 + mul x9,x15,x12 + adc x24,xzr, x10 + mul x10,x16,x12 + + adds x21,x21,x7 + umulh x7,x13,x12 + adcs x22,x22,x8 + umulh x8,x14,x12 + adcs x23,x23,x9 + umulh x9,x15,x12 + adcs x24,x24,x10 + umulh x10,x16,x12 + adc x25,xzr,xzr + + mul x5,x11,x11 + adds x22,x22,x7 + umulh x11, x11,x11 + adcs x23,x23,x8 + mul x8,x14,x13 + adcs x24,x24,x9 + mul x9,x15,x13 + adc x25,x25,x10 + mul x10,x16,x13 + + adds x23,x23,x8 + umulh x8,x14,x13 + adcs x24,x24,x9 + umulh x9,x15,x13 + adcs x25,x25,x10 + umulh x10,x16,x13 + adc x26,xzr,xzr + + mul x6,x12,x12 + adds x24,x24,x8 + umulh x12, x12,x12 + adcs x25,x25,x9 + mul x9,x15,x14 + adc x26,x26,x10 + mul x10,x16,x14 + + adds x25,x25,x9 + umulh x9,x15,x14 + adcs x26,x26,x10 + umulh x10,x16,x14 + adc x27,xzr,xzr + mul x7,x13,x13 + adds x26,x26,x9 + umulh x13, x13,x13 + adc x27,x27,x10 + mul x8,x14,x14 + + mul x10,x16,x15 + umulh x14, x14,x14 + adds x27,x27,x10 + umulh x10,x16,x15 + mul x9,x15,x15 + adc x28,x10,xzr + + adds x19,x19,x19 + adcs x20,x20,x20 + adcs x21,x21,x21 + adcs x22,x22,x22 + adcs x23,x23,x23 + adcs x24,x24,x24 + adcs x25,x25,x25 + adcs x26,x26,x26 + umulh x15, x15,x15 + adcs x27,x27,x27 + mul x10,x16,x16 + adcs x28,x28,x28 + umulh x16, x16,x16 + adc x1,xzr,xzr + + adds x19,x19,x11 + adcs x20,x20,x6 + adcs x21,x21,x12 + adcs x22,x22,x7 + adcs x23,x23,x13 + adcs x24,x24,x8 + adcs x25,x25,x14 + stp x5,x19,[x0] + adcs x26,x26,x9 + stp x20,x21,[x0,#16] + adcs x27,x27,x15 + stp x22,x23,[x0,#32] + adcs x28,x28,x10 + stp x24,x25,[x0,#48] + adc x16,x16,x1 + stp x26,x27,[x0,#64] + stp x28,x16,[x0,#80] + + ret + ENDP + + + EXPORT |sqr_384|[FUNC] + ALIGN 32 +|sqr_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + bl __sqr_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |redc_mont_384|[FUNC] + ALIGN 32 +|redc_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + bl __redc_tail_mont_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |from_mont_384|[FUNC] + ALIGN 32 +|from_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + mov x4,x3 // adjust for missing b_ptr + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + + csello x11,x11,x19 + csello x12,x12,x20 + csello x13,x13,x21 + csello x14,x14,x22 + csello x15,x15,x23 + csello x16,x16,x24 + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_by_1_mont_384| PROC + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + mul x26,x4,x11 + ldp x15,x16,[x1,#32] + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul 
x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + mul x26,x4,x11 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + // mul x19,x5,x26 + mul x20,x6,x26 + mul x21,x7,x26 + mul x22,x8,x26 + mul x23,x9,x26 + mul x24,x10,x26 + subs xzr,x11,#1 // adds x19,x19,x11 + umulh x11,x5,x26 + adcs x20,x20,x12 + umulh x12,x6,x26 + adcs x21,x21,x13 + umulh x13,x7,x26 + adcs x22,x22,x14 + umulh x14,x8,x26 + adcs x23,x23,x15 + umulh x15,x9,x26 + adcs x24,x24,x16 + umulh x16,x10,x26 + adc x25,xzr,xzr + adds x11,x11,x20 + adcs x12,x12,x21 + adcs x13,x13,x22 + adcs x14,x14,x23 + adcs x15,x15,x24 + adc x16,x16,x25 + + ret + ENDP + + + ALIGN 32 +|__redc_tail_mont_384| PROC + ldp x19,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x11,x11,x19 // accumulate upper half + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adcs x16,x16,x24 + adc x25,xzr,xzr + + subs x19,x11,x5 + sbcs x20,x12,x6 + sbcs x21,x13,x7 + sbcs x22,x14,x8 + sbcs x23,x15,x9 + sbcs x24,x16,x10 + sbcs xzr,x25,xzr + + csello x11,x11,x19 + csello x12,x12,x20 + csello x13,x13,x21 + csello x14,x14,x22 + csello x15,x15,x23 + csello x16,x16,x24 + + stp x11,x12,[x0] + stp x13,x14,[x0,#16] + stp x15,x16,[x0,#32] + + ret + ENDP + + + + EXPORT |mul_384|[FUNC] + ALIGN 32 +|mul_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + bl __mul_384 + ldr x30,[x29,#8] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_384| PROC + ldp x11,x12,[x1] + ldr x17, [x2] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + + umulh x5,x11,x17 + umulh x6,x12,x17 + umulh x7,x13,x17 + umulh x8,x14,x17 + umulh x9,x15,x17 + umulh x10,x16,x17 + ldr x17,[x2,8*1] + + str x19,[x0] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,xzr, x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(1+1)] + adc x25,xzr,xzr + + str x19,[x0,8*1] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(2+1)] + adc x25,xzr,xzr + + str x19,[x0,8*2] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(3+1)] + adc x25,xzr,xzr + + str x19,[x0,8*3] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + ldr x17,[x2,#8*(4+1)] + adc x25,xzr,xzr + + str x19,[x0,8*4] + adds x19,x20,x5 + mul x5,x11,x17 + adcs x20,x21,x6 + mul x6,x12,x17 + adcs x21,x22,x7 + mul x7,x13,x17 + adcs x22,x23,x8 + mul x8,x14,x17 + adcs x23,x24,x9 + mul x9,x15,x17 + adc x24,x25,x10 + mul x10,x16,x17 + adds x19,x19,x5 + umulh x5,x11,x17 + adcs x20,x20,x6 + umulh x6,x12,x17 + adcs x21,x21,x7 + umulh x7,x13,x17 + adcs x22,x22,x8 + umulh x8,x14,x17 + adcs x23,x23,x9 + umulh x9,x15,x17 + adcs x24,x24,x10 + umulh x10,x16,x17 + adc x25,xzr,xzr + + str x19,[x0,8*5] + adds x19,x20,x5 + adcs x20,x21,x6 + adcs x21,x22,x7 + adcs x22,x23,x8 + adcs x23,x24,x9 + adc x24,x25,x10 + + stp x19,x20,[x0,#48] + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ret + ENDP + + + + EXPORT |mul_382x|[FUNC] + ALIGN 32 +|mul_382x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#96 // space for two 384-bit vectors + + ldp x11,x12,[x1] + mov x26,x0 // save r_ptr + ldp x19,x20,[x1,#48] + mov x27,x1 // save a_ptr + ldp x13,x14,[x1,#16] + mov x28,x2 // save b_ptr + ldp x21,x22,[x1,#64] + ldp x15,x16,[x1,#32] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x23,x24,[x1,#80] + adcs x6,x12,x20 + ldp x11,x12,[x2] + adcs x7,x13,x21 + ldp x19,x20,[x2,#48] + adcs x8,x14,x22 + ldp x13,x14,[x2,#16] + adcs x9,x15,x23 + ldp x21,x22,[x2,#64] + adc x10,x16,x24 + ldp x15,x16,[x2,#32] + + stp x5,x6,[sp] + adds x5,x11,x19 // t1 = b->re + b->im + ldp x23,x24,[x2,#80] + adcs x6,x12,x20 + stp x7,x8,[sp,#16] + adcs x7,x13,x21 + adcs x8,x14,x22 + stp x9,x10,[sp,#32] + adcs x9,x15,x23 + stp x5,x6,[sp,#48] + adc x10,x16,x24 + stp x7,x8,[sp,#64] + stp x9,x10,[sp,#80] + + bl __mul_384 // mul_384(ret->re, a->re, b->re) + + add x1,sp,#0 // mul_384(ret->im, t0, t1) + add x2,sp,#48 + add x0,x26,#96 + bl __mul_384 + + add x1,x27,#48 // mul_384(tx, a->im, b->im) + add x2,x28,#48 + add x0,sp,#0 + bl __mul_384 + + ldp x5,x6,[x3] + ldp x7,x8,[x3,#16] + ldp x9,x10,[x3,#32] + + add x1,x26,#96 // ret->im -= tx + add x2,sp,#0 + add x0,x26,#96 + bl __sub_mod_384x384 + + add x2,x26,#0 // ret->im -= ret->re + bl __sub_mod_384x384 + + add x1,x26,#0 // ret->re -= tx + add x2,sp,#0 + add x0,x26,#0 + bl __sub_mod_384x384 + ldr x30,[x29,#8] + + add sp,sp,#96 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sqr_382x|[FUNC] + ALIGN 32 +|sqr_382x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp x11,x12,[x1] + ldp x19,x20,[x1,#48] + ldp x13,x14,[x1,#16] + adds x5,x11,x19 // t0 = a->re + a->im + ldp x21,x22,[x1,#64] + adcs x6,x12,x20 + ldp x15,x16,[x1,#32] + adcs x7,x13,x21 + ldp x23,x24,[x1,#80] + adcs x8,x14,x22 + stp x5,x6,[x0] + adcs x9,x15,x23 + ldp x5,x6,[x2] + adc x10,x16,x24 + stp x7,x8,[x0,#16] + + subs x11,x11,x19 // t1 = a->re - a->im + ldp x7,x8,[x2,#16] + sbcs x12,x12,x20 + stp x9,x10,[x0,#32] + sbcs x13,x13,x21 + ldp x9,x10,[x2,#32] + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + adds x11,x11,x19 + and x21,x7,x25 + adcs x12,x12,x20 + and x22,x8,x25 + adcs x13,x13,x21 + and x23,x9,x25 + adcs x14,x14,x22 + and x24,x10,x25 + adcs x15,x15,x23 + stp x11,x12,[x0,#48] + adc x16,x16,x24 + stp x13,x14,[x0,#64] + stp x15,x16,[x0,#80] + + mov x4,x1 // save a_ptr + add x1,x0,#0 // mul_384(ret->re, t0, t1) + add x2,x0,#48 + bl __mul_384 + + add x1,x4,#0 // mul_384(ret->im, a->re, a->im) + add x2,x4,#48 + add x0,x0,#96 + bl __mul_384 + ldr x30,[x29,#8] + + ldp x11,x12,[x0] + ldp x13,x14,[x0,#16] + adds x11,x11,x11 // add with itself + ldp x15,x16,[x0,#32] + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adcs x19,x19,x19 + adcs x20,x20,x20 + stp x11,x12,[x0] + adcs x21,x21,x21 + stp x13,x14,[x0,#16] + adcs x22,x22,x22 + stp x15,x16,[x0,#32] + adcs x23,x23,x23 + stp x19,x20,[x0,#48] + adc x24,x24,x24 + stp x21,x22,[x0,#64] + stp x23,x24,[x0,#80] + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 
3573752767 + ret + ENDP + + + + EXPORT |sqr_mont_382x|[FUNC] + ALIGN 32 +|sqr_mont_382x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x3,x0,[sp,#96] // __mul_mont_384 wants them there + sub sp,sp,#112 // space for two 384-bit vectors + word + mov x4,x3 // adjust for missing b_ptr + + ldp x11,x12,[x1] + ldp x13,x14,[x1,#16] + ldp x15,x16,[x1,#32] + + ldp x17,x20,[x1,#48] + ldp x21,x22,[x1,#64] + ldp x23,x24,[x1,#80] + + adds x5,x11,x17 // t0 = a->re + a->im + adcs x6,x12,x20 + adcs x7,x13,x21 + adcs x8,x14,x22 + adcs x9,x15,x23 + adc x10,x16,x24 + + subs x19,x11,x17 // t1 = a->re - a->im + sbcs x20,x12,x20 + sbcs x21,x13,x21 + sbcs x22,x14,x22 + sbcs x23,x15,x23 + sbcs x24,x16,x24 + sbc x25,xzr,xzr // borrow flag as mask + + stp x5,x6,[sp] + stp x7,x8,[sp,#16] + stp x9,x10,[sp,#32] + stp x19,x20,[sp,#48] + stp x21,x22,[sp,#64] + stp x23,x24,[sp,#80] + str x25,[sp,#96] + + ldp x5,x6,[x2] + ldp x7,x8,[x2,#16] + ldp x9,x10,[x2,#32] + + add x2,x1,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, a->re, a->im) + + adds x19,x11,x11 // add with itself + adcs x20,x12,x12 + adcs x21,x13,x13 + adcs x22,x14,x14 + adcs x23,x15,x15 + adc x24,x16,x16 + + stp x19,x20,[x2,#48] + stp x21,x22,[x2,#64] + stp x23,x24,[x2,#80] + + ldp x11,x12,[sp] + ldr x17,[sp,#48] + ldp x13,x14,[sp,#16] + ldp x15,x16,[sp,#32] + + add x2,sp,#48 + bl __mul_mont_383_nonred // mul_mont_384(ret->im, t0, t1) + ldr x30,[x29,#8] + + ldr x25,[sp,#96] // account for sign from a->re - a->im + ldp x19,x20,[sp] + ldp x21,x22,[sp,#16] + ldp x23,x24,[sp,#32] + + and x19,x19,x25 + and x20,x20,x25 + and x21,x21,x25 + and x22,x22,x25 + and x23,x23,x25 + and x24,x24,x25 + + subs x11,x11,x19 + sbcs x12,x12,x20 + sbcs x13,x13,x21 + sbcs x14,x14,x22 + sbcs x15,x15,x23 + sbcs x16,x16,x24 + sbc x25,xzr,xzr + + and x19,x5,x25 + and x20,x6,x25 + and x21,x7,x25 + and x22,x8,x25 + and x23,x9,x25 + and x24,x10,x25 + + adds x11,x11,x19 + adcs x12,x12,x20 + adcs x13,x13,x21 + adcs x14,x14,x22 + adcs x15,x15,x23 + adc x16,x16,x24 + + stp x11,x12,[x2] + stp x13,x14,[x2,#16] + stp x15,x16,[x2,#32] + + add sp,sp,#112 + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + ALIGN 32 +|__mul_mont_383_nonred| PROC + mul x19,x11,x17 + mul x20,x12,x17 + mul x21,x13,x17 + mul x22,x14,x17 + mul x23,x15,x17 + mul x24,x16,x17 + mul x4,x4,x19 + + umulh x26,x11,x17 + umulh x27,x12,x17 + umulh x28,x13,x17 + umulh x0,x14,x17 + umulh x1,x15,x17 + umulh x3,x16,x17 + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,xzr, x3 + mul x3,x10,x4 + ldr x17,[x2,8*1] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs 
x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*2] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*3] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*4] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + ldr x17,[x2,8*5] + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + + ldr x4,[x29,#96] + adds 
x19,x20,x26 + mul x26,x11,x17 + adcs x20,x21,x27 + mul x27,x12,x17 + adcs x21,x22,x28 + mul x28,x13,x17 + adcs x22,x23,x0 + mul x0,x14,x17 + adcs x23,x24,x1 + mul x1,x15,x17 + adcs x24,x25,x3 + mul x3,x16,x17 + adc x25,xzr,xzr + + adds x19,x19,x26 + umulh x26,x11,x17 + adcs x20,x20,x27 + umulh x27,x12,x17 + adcs x21,x21,x28 + mul x4,x4,x19 + umulh x28,x13,x17 + adcs x22,x22,x0 + umulh x0,x14,x17 + adcs x23,x23,x1 + umulh x1,x15,x17 + adcs x24,x24,x3 + umulh x3,x16,x17 + adc x25,x25,xzr + + adds x20,x20,x26 + mul x26,x5,x4 + adcs x21,x21,x27 + mul x27,x6,x4 + adcs x22,x22,x28 + mul x28,x7,x4 + adcs x23,x23,x0 + mul x0,x8,x4 + adcs x24,x24,x1 + mul x1,x9,x4 + adc x25,x25,x3 + mul x3,x10,x4 + adds x19,x19,x26 + umulh x26,x5,x4 + adcs x20,x20,x27 + umulh x27,x6,x4 + adcs x21,x21,x28 + umulh x28,x7,x4 + adcs x22,x22,x0 + umulh x0,x8,x4 + adcs x23,x23,x1 + umulh x1,x9,x4 + adcs x24,x24,x3 + umulh x3,x10,x4 + adc x25,x25,xzr + ldp x4,x2,[x29,#96] // pull r_ptr + + adds x11,x20,x26 + adcs x12,x21,x27 + adcs x13,x22,x28 + adcs x14,x23,x0 + adcs x15,x24,x1 + adcs x16,x25,x3 + + ret + ENDP + + + + EXPORT |sgn0_pty_mont_384|[FUNC] + ALIGN 32 +|sgn0_pty_mont_384| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + adds x11,x11,x11 + adcs x12,x12,x12 + adcs x13,x13,x13 + adcs x14,x14,x14 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + + + + EXPORT |sgn0_pty_mont_384x|[FUNC] + ALIGN 32 +|sgn0_pty_mont_384x| PROC + DCDU 3573752639 + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + mov x4,x2 + ldp x5,x6,[x1] + ldp x7,x8,[x1,#16] + ldp x9,x10,[x1,#32] + mov x1,x0 + + bl __mul_by_1_mont_384 + add x1,x1,#48 + + and x2,x11,#1 + orr x3,x11,x12 + adds x11,x11,x11 + orr x3,x3,x13 + adcs x12,x12,x12 + orr x3,x3,x14 + adcs x13,x13,x13 + orr x3,x3,x15 + adcs x14,x14,x14 + orr x3,x3,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x2,x2,x17 + + bl __mul_by_1_mont_384 + ldr x30,[x29,#8] + + and x0,x11,#1 + orr x1,x11,x12 + adds x11,x11,x11 + orr x1,x1,x13 + adcs x12,x12,x12 + orr x1,x1,x14 + adcs x13,x13,x13 + orr x1,x1,x15 + adcs x14,x14,x14 + orr x1,x1,x16 + adcs x15,x15,x15 + adcs x16,x16,x16 + adc x17,xzr,xzr + + subs x11,x11,x5 + sbcs x12,x12,x6 + sbcs x13,x13,x7 + sbcs x14,x14,x8 + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbc x17,x17,xzr + + mvn x17,x17 + and x17,x17,#2 + orr x0,x0,x17 + + cmp x3,#0 + cseleq x3,x0,x2 + + cmp x1,#0 + cselne x1,x0,x2 + + and x3,x3,#1 + and x1,x1,#2 + orr x0,x1,x3 // pack sign and parity + + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + DCDU 3573752767 + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm b/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm new file mode 100644 index 00000000000..6aedca7cdaf --- /dev/null +++ b/crypto/blst_src/build/win64/mulq_mont_256-x86_64.asm @@ -0,0 +1,913 @@ +OPTION DOTNAME +EXTERN mul_mont_sparse_256$1:NEAR +EXTERN sqr_mont_sparse_256$1:NEAR +EXTERN from_mont_256$1:NEAR +EXTERN redc_mont_256$1:NEAR +_DATA SEGMENT +COMM __blst_platform_cap:DWORD:1 +_DATA ENDS +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC mul_mont_sparse_256 + + +ALIGN 32 +mul_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_mont_sparse_256:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz mul_mont_sparse_256$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_mul_mont_sparse_256:: + + + mov rax,QWORD PTR[rdx] + mov r13,QWORD PTR[rsi] + mov r14,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov rbp,QWORD PTR[24+rsi] + mov rbx,rdx + + mov r15,rax + mul r13 + mov r9,rax + mov rax,r15 + mov r10,rdx + call __mulq_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mul_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_mont_sparse_256:: +mul_mont_sparse_256 ENDP + +PUBLIC sqr_mont_sparse_256 + + +ALIGN 32 +sqr_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_sparse_256:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_mont_sparse_256$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + 
push r14 + + push r15 + + push rdi + +$L$SEH_body_sqr_mont_sparse_256:: + + + mov rax,QWORD PTR[rsi] + mov r8,rcx + mov r14,QWORD PTR[8+rsi] + mov rcx,rdx + mov r12,QWORD PTR[16+rsi] + lea rbx,QWORD PTR[rsi] + mov rbp,QWORD PTR[24+rsi] + + mov r15,rax + mul rax + mov r9,rax + mov rax,r15 + mov r10,rdx + call __mulq_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqr_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_sparse_256:: +sqr_mont_sparse_256 ENDP + +ALIGN 32 +__mulq_mont_sparse_256 PROC PRIVATE + DB 243,15,30,250 + + mul r14 + add r10,rax + mov rax,r15 + adc rdx,0 + mov r11,rdx + + mul r12 + add r11,rax + mov rax,r15 + adc rdx,0 + mov r12,rdx + + mul rbp + add r12,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + xor r14,r14 + mov r13,rdx + + mov rdi,r9 + imul r9,r8 + + + mov r15,rax + mul QWORD PTR[rsi] + add r10,rax + mov rax,r15 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[8+rsi] + add r11,rax + mov rax,r15 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rsi] + add r12,rax + mov rax,r15 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rsi] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,rbp + adc r14,rdx + xor r15,r15 + + + mul QWORD PTR[rcx] + add rdi,rax + mov rax,r9 + adc rdi,rdx + + mul QWORD PTR[8+rcx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,rdi + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r12,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r12,rbp + adc rdx,0 + add r13,rdx + adc r14,0 + adc r15,0 + mov rdi,r10 + imul r10,r8 + + + mov r9,rax + mul QWORD PTR[rsi] + add r11,rax + mov rax,r9 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[8+rsi] + add r12,rax + mov rax,r9 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rsi] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rsi] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,rbp + adc r15,rdx + xor r9,r9 + + + mul QWORD PTR[rcx] + add rdi,rax + mov rax,r10 + adc rdi,rdx + + mul QWORD PTR[8+rcx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,rdi + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r13,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r13,rbp + adc rdx,0 + add r14,rdx + adc r15,0 + adc r9,0 + mov rdi,r11 + imul r11,r8 + + + mov r10,rax + mul QWORD PTR[rsi] + add r12,rax + mov rax,r10 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[8+rsi] + add r13,rax + mov rax,r10 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rsi] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rsi] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,rbp + adc r9,rdx + xor r10,r10 + + + mul QWORD PTR[rcx] + add rdi,rax + mov rax,r11 + adc rdi,rdx + + mul QWORD PTR[8+rcx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,rdi + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,rbp + adc 
rdx,0 + add r15,rdx + adc r9,0 + adc r10,0 + imul rax,r8 + mov rsi,QWORD PTR[8+rsp] + + + mov r11,rax + mul QWORD PTR[rcx] + add r12,rax + mov rax,r11 + adc r12,rdx + + mul QWORD PTR[8+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r12 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r14,rax + mov rax,r11 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + mov rbx,r14 + add r15,rbp + adc rdx,0 + add r15,rax + mov rax,r13 + adc rdx,0 + add r9,rdx + adc r10,0 + + + + + mov r12,r15 + sub r13,QWORD PTR[rcx] + sbb r14,QWORD PTR[8+rcx] + sbb r15,QWORD PTR[16+rcx] + mov rbp,r9 + sbb r9,QWORD PTR[24+rcx] + sbb r10,0 + + cmovc r13,rax + cmovc r14,rbx + cmovc r15,r12 + mov QWORD PTR[rsi],r13 + cmovc r9,rbp + mov QWORD PTR[8+rsi],r14 + mov QWORD PTR[16+rsi],r15 + mov QWORD PTR[24+rsi],r9 + + DB 0F3h,0C3h ;repret + +__mulq_mont_sparse_256 ENDP +PUBLIC from_mont_256 + + +ALIGN 32 +from_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_from_mont_256:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz from_mont_256$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_from_mont_256:: + + + mov rbx,rdx + call __mulq_by_1_mont_256 + + + + + + mov r10,r14 + mov r11,r15 + mov r12,r9 + + sub r13,QWORD PTR[rbx] + sbb r14,QWORD PTR[8+rbx] + sbb r15,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + + cmovnc rax,r13 + cmovnc r10,r14 + cmovnc r11,r15 + mov QWORD PTR[rdi],rax + cmovnc r12,r9 + mov QWORD PTR[8+rdi],r10 + mov QWORD PTR[16+rdi],r11 + mov QWORD PTR[24+rdi],r12 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_from_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_from_mont_256:: +from_mont_256 ENDP + +PUBLIC redc_mont_256 + + +ALIGN 32 +redc_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redc_mont_256:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz redc_mont_256$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redc_mont_256:: + + + mov rbx,rdx + call __mulq_by_1_mont_256 + + add r13,QWORD PTR[32+rsi] + adc r14,QWORD PTR[40+rsi] + mov rax,r13 + adc r15,QWORD PTR[48+rsi] + mov r10,r14 + adc r9,QWORD PTR[56+rsi] + sbb rsi,rsi + + + + + mov r11,r15 + sub r13,QWORD PTR[rbx] + sbb r14,QWORD PTR[8+rbx] + sbb r15,QWORD PTR[16+rbx] + mov r12,r9 + sbb r9,QWORD PTR[24+rbx] + sbb rsi,0 + + cmovnc rax,r13 + cmovnc r10,r14 + cmovnc r11,r15 + mov QWORD PTR[rdi],rax + cmovnc r12,r9 + mov QWORD PTR[8+rdi],r10 + mov QWORD PTR[16+rdi],r11 + mov QWORD PTR[24+rdi],r12 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redc_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redc_mont_256:: +redc_mont_256 ENDP + +ALIGN 32 
+__mulq_by_1_mont_256 PROC PRIVATE + DB 243,15,30,250 + + mov rax,QWORD PTR[rsi] + mov r10,QWORD PTR[8+rsi] + mov r11,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + + mov r13,rax + imul rax,rcx + mov r9,rax + + mul QWORD PTR[rbx] + add r13,rax + mov rax,r9 + adc r13,rdx + + mul QWORD PTR[8+rbx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,r13 + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[16+rbx] + mov r14,r10 + imul r10,rcx + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,r13 + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[24+rbx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r13 + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[rbx] + add r14,rax + mov rax,r10 + adc r14,rdx + + mul QWORD PTR[8+rbx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[16+rbx] + mov r15,r11 + imul r11,rcx + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[24+rbx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rbx] + add r15,rax + mov rax,r11 + adc r15,rdx + + mul QWORD PTR[8+rbx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rbx] + mov r9,r12 + imul r12,rcx + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rbx] + add r9,rax + mov rax,r12 + adc r9,rdx + + mul QWORD PTR[8+rbx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r9 + adc rdx,0 + mov r9,rdx + DB 0F3h,0C3h ;repret +__mulq_by_1_mont_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mul_mont_sparse_256 + DD imagerel $L$SEH_body_mul_mont_sparse_256 + DD imagerel $L$SEH_info_mul_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_mul_mont_sparse_256 + DD imagerel $L$SEH_epilogue_mul_mont_sparse_256 + DD imagerel $L$SEH_info_mul_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_mul_mont_sparse_256 + DD imagerel $L$SEH_end_mul_mont_sparse_256 + DD imagerel $L$SEH_info_mul_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_sparse_256 + DD imagerel $L$SEH_body_sqr_mont_sparse_256 + DD imagerel $L$SEH_info_sqr_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_sqr_mont_sparse_256 + DD imagerel $L$SEH_epilogue_sqr_mont_sparse_256 + DD imagerel $L$SEH_info_sqr_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_sqr_mont_sparse_256 + DD imagerel $L$SEH_end_sqr_mont_sparse_256 + DD imagerel $L$SEH_info_sqr_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_from_mont_256 + DD imagerel $L$SEH_body_from_mont_256 + DD imagerel $L$SEH_info_from_mont_256_prologue + + DD imagerel $L$SEH_body_from_mont_256 + DD imagerel $L$SEH_epilogue_from_mont_256 + DD imagerel $L$SEH_info_from_mont_256_body + + DD imagerel $L$SEH_epilogue_from_mont_256 + DD imagerel $L$SEH_end_from_mont_256 + DD imagerel $L$SEH_info_from_mont_256_epilogue + + DD imagerel $L$SEH_begin_redc_mont_256 + DD imagerel $L$SEH_body_redc_mont_256 + DD imagerel $L$SEH_info_redc_mont_256_prologue + + DD imagerel $L$SEH_body_redc_mont_256 + DD imagerel $L$SEH_epilogue_redc_mont_256 + DD imagerel $L$SEH_info_redc_mont_256_body + + DD imagerel $L$SEH_epilogue_redc_mont_256 + DD imagerel 
$L$SEH_end_redc_mont_256 + DD imagerel $L$SEH_info_redc_mont_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mul_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_mul_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_mul_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sqr_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sqr_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_from_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_from_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_from_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redc_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_redc_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_redc_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm b/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm new file mode 100644 index 00000000000..8563815917e --- /dev/null +++ b/crypto/blst_src/build/win64/mulq_mont_384-x86_64.asm @@ -0,0 +1,4341 @@ +OPTION DOTNAME +EXTERN mul_mont_384x$1:NEAR +EXTERN sqr_mont_384x$1:NEAR +EXTERN mul_382x$1:NEAR +EXTERN sqr_382x$1:NEAR +EXTERN mul_384$1:NEAR +EXTERN sqr_384$1:NEAR +EXTERN redc_mont_384$1:NEAR +EXTERN from_mont_384$1:NEAR +EXTERN sgn0_pty_mont_384$1:NEAR +EXTERN sgn0_pty_mont_384x$1:NEAR +EXTERN mul_mont_384$1:NEAR +EXTERN sqr_mont_384$1:NEAR +EXTERN sqr_n_mul_mont_384$1:NEAR +EXTERN sqr_n_mul_mont_383$1:NEAR +EXTERN sqr_mont_382x$1:NEAR +_DATA SEGMENT +COMM __blst_platform_cap:DWORD:1 +_DATA ENDS +.text$ SEGMENT ALIGN(256) 'CODE' + + + + + + + + +ALIGN 32 +__subq_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + sub r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + sbb r9,QWORD PTR[8+rdx] + mov rax,QWORD 
PTR[64+rsi] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + sbb r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + sbb r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + sbb r14,QWORD PTR[48+rdx] + mov r8,QWORD PTR[rcx] + mov QWORD PTR[8+rdi],r9 + sbb r15,QWORD PTR[56+rdx] + mov r9,QWORD PTR[8+rcx] + mov QWORD PTR[16+rdi],r10 + sbb rax,QWORD PTR[64+rdx] + mov r10,QWORD PTR[16+rcx] + mov QWORD PTR[24+rdi],r11 + sbb rbx,QWORD PTR[72+rdx] + mov r11,QWORD PTR[24+rcx] + mov QWORD PTR[32+rdi],r12 + sbb rbp,QWORD PTR[80+rdx] + mov r12,QWORD PTR[32+rcx] + mov QWORD PTR[40+rdi],r13 + sbb rsi,QWORD PTR[88+rdx] + mov r13,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r8,rdx + and r9,rdx + and r10,rdx + and r11,rdx + and r12,rdx + and r13,rdx + + add r14,r8 + adc r15,r9 + mov QWORD PTR[48+rdi],r14 + adc rax,r10 + mov QWORD PTR[56+rdi],r15 + adc rbx,r11 + mov QWORD PTR[64+rdi],rax + adc rbp,r12 + mov QWORD PTR[72+rdi],rbx + adc rsi,r13 + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__subq_mod_384x384 ENDP + + +ALIGN 32 +__addq_mod_384 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + mov r14,r8 + adc r11,QWORD PTR[24+rdx] + mov r15,r9 + adc r12,QWORD PTR[32+rdx] + mov rax,r10 + adc r13,QWORD PTR[40+rdx] + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[rdi],r8 + cmovc r11,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r12,rbp + mov QWORD PTR[16+rdi],r10 + cmovc r13,rsi + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__addq_mod_384 ENDP + + +ALIGN 32 +__subq_mod_384 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +__subq_mod_384_a_is_loaded:: + sub r8,QWORD PTR[rdx] + mov r14,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rax,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbx,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,QWORD PTR[32+rcx] + sbb r13,QWORD PTR[40+rdx] + mov rsi,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r14,rdx + and r15,rdx + and rax,rdx + and rbx,rdx + and rbp,rdx + and rsi,rdx + + add r8,r14 + adc r9,r15 + mov QWORD PTR[rdi],r8 + adc r10,rax + mov QWORD PTR[8+rdi],r9 + adc r11,rbx + mov QWORD PTR[16+rdi],r10 + adc r12,rbp + mov QWORD PTR[24+rdi],r11 + adc r13,rsi + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__subq_mod_384 ENDP +PUBLIC mul_mont_384x + + +ALIGN 32 +mul_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_mont_384x:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz mul_mont_384x$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,328 + +$L$SEH_body_mul_mont_384x:: + + + mov 
rbx,rdx + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[24+rsp],rsi + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[8+rsp],rcx + mov QWORD PTR[rsp],r8 + + + + + lea rdi,QWORD PTR[40+rsp] + call __mulq_384 + + + lea rbx,QWORD PTR[48+rbx] + lea rsi,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((40+96))+rsp] + call __mulq_384 + + + mov rcx,QWORD PTR[8+rsp] + lea rdx,QWORD PTR[((-48))+rsi] + lea rdi,QWORD PTR[((40+192+48))+rsp] + call __addq_mod_384 + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((-48))+rdi] + call __addq_mod_384 + + lea rbx,QWORD PTR[rdi] + lea rsi,QWORD PTR[48+rdi] + call __mulq_384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[8+rsp] + call __subq_mod_384x384 + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __subq_mod_384x384 + + + lea rsi,QWORD PTR[40+rsp] + lea rdx,QWORD PTR[((40+96))+rsp] + lea rdi,QWORD PTR[40+rsp] + call __subq_mod_384x384 + + mov rbx,rcx + + + lea rsi,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[rsp] + mov rdi,QWORD PTR[32+rsp] + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + + lea rsi,QWORD PTR[((40+192))+rsp] + mov rcx,QWORD PTR[rsp] + lea rdi,QWORD PTR[48+rdi] + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + lea r8,QWORD PTR[328+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mul_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_mont_384x:: +mul_mont_384x ENDP +PUBLIC sqr_mont_384x + + +ALIGN 32 +sqr_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_384x:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_mont_384x$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqr_mont_384x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rsi + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[32+rsp] + call __addq_mod_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((32+48))+rsp] + call __subq_mod_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rbx,QWORD PTR[48+rsi] + + mov rax,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + + call __mulq_mont_384 + add r14,r14 + adc r15,r15 + adc r8,r8 + mov r12,r14 + adc r9,r9 + mov r13,r15 + adc r10,r10 + mov rax,r8 + adc r11,r11 + mov rbx,r9 + sbb rdx,rdx + + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov rbp,r10 + sbb r8,QWORD PTR[16+rcx] + sbb r9,QWORD PTR[24+rcx] + sbb r10,QWORD PTR[32+rcx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r14,r12 + cmovc r15,r13 + cmovc r8,rax + mov QWORD PTR[48+rdi],r14 + cmovc r9,rbx + mov QWORD PTR[56+rdi],r15 + cmovc r10,rbp + mov QWORD PTR[64+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r10 + mov QWORD PTR[88+rdi],r11 + + lea rsi,QWORD PTR[32+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rax,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov r12,QWORD PTR[((32+16))+rsp] + mov r13,QWORD PTR[((32+24))+rsp] + + call __mulq_mont_384 + + lea r8,QWORD PTR[136+rsp] + 
mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_384x:: +sqr_mont_384x ENDP + +PUBLIC mul_382x + + +ALIGN 32 +mul_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_382x:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz mul_382x$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_mul_382x:: + + + lea rdi,QWORD PTR[96+rdi] + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rcx + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[48+rsi] + adc r9,QWORD PTR[56+rsi] + adc r10,QWORD PTR[64+rsi] + adc r11,QWORD PTR[72+rsi] + adc r12,QWORD PTR[80+rsi] + adc r13,QWORD PTR[88+rsi] + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + mov r11,QWORD PTR[24+rdx] + mov r12,QWORD PTR[32+rdx] + mov r13,QWORD PTR[40+rdx] + + add r8,QWORD PTR[48+rdx] + adc r9,QWORD PTR[56+rdx] + adc r10,QWORD PTR[64+rdx] + adc r11,QWORD PTR[72+rdx] + adc r12,QWORD PTR[80+rdx] + adc r13,QWORD PTR[88+rdx] + + mov QWORD PTR[((32+48))+rsp],r8 + mov QWORD PTR[((32+56))+rsp],r9 + mov QWORD PTR[((32+64))+rsp],r10 + mov QWORD PTR[((32+72))+rsp],r11 + mov QWORD PTR[((32+80))+rsp],r12 + mov QWORD PTR[((32+88))+rsp],r13 + + + lea rsi,QWORD PTR[((32+0))+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + call __mulq_384 + + + mov rsi,QWORD PTR[rsp] + mov rbx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __mulq_384 + + + lea rsi,QWORD PTR[48+rsi] + lea rbx,QWORD PTR[48+rbx] + lea rdi,QWORD PTR[32+rsp] + call __mulq_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[32+rsp] + mov rcx,QWORD PTR[24+rsp] + mov rdi,rsi + call __subq_mod_384x384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __subq_mod_384x384 + + + lea rsi,QWORD PTR[((-96))+rdi] + lea rdx,QWORD PTR[32+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __subq_mod_384x384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mul_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_382x:: +mul_382x ENDP +PUBLIC sqr_382x + + +ALIGN 32 +sqr_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_382x:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_382x$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_sqr_382x:: + + + mov rcx,rdx + + + mov 
r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,QWORD PTR[24+rsi] + mov rbp,QWORD PTR[32+rsi] + mov rdx,QWORD PTR[40+rsi] + + mov r8,r14 + add r14,QWORD PTR[48+rsi] + mov r9,r15 + adc r15,QWORD PTR[56+rsi] + mov r10,rax + adc rax,QWORD PTR[64+rsi] + mov r11,rbx + adc rbx,QWORD PTR[72+rsi] + mov r12,rbp + adc rbp,QWORD PTR[80+rsi] + mov r13,rdx + adc rdx,QWORD PTR[88+rsi] + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],rax + mov QWORD PTR[24+rdi],rbx + mov QWORD PTR[32+rdi],rbp + mov QWORD PTR[40+rdi],rdx + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[48+rdi] + call __subq_mod_384_a_is_loaded + + + lea rsi,QWORD PTR[rdi] + lea rbx,QWORD PTR[((-48))+rdi] + lea rdi,QWORD PTR[((-48))+rdi] + call __mulq_384 + + + mov rsi,QWORD PTR[rsp] + lea rbx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[96+rdi] + call __mulq_384 + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov r12,QWORD PTR[32+rdi] + mov r13,QWORD PTR[40+rdi] + mov r14,QWORD PTR[48+rdi] + mov r15,QWORD PTR[56+rdi] + mov rax,QWORD PTR[64+rdi] + mov rbx,QWORD PTR[72+rdi] + mov rbp,QWORD PTR[80+rdi] + add r8,r8 + mov rdx,QWORD PTR[88+rdi] + adc r9,r9 + mov QWORD PTR[rdi],r8 + adc r10,r10 + mov QWORD PTR[8+rdi],r9 + adc r11,r11 + mov QWORD PTR[16+rdi],r10 + adc r12,r12 + mov QWORD PTR[24+rdi],r11 + adc r13,r13 + mov QWORD PTR[32+rdi],r12 + adc r14,r14 + mov QWORD PTR[40+rdi],r13 + adc r15,r15 + mov QWORD PTR[48+rdi],r14 + adc rax,rax + mov QWORD PTR[56+rdi],r15 + adc rbx,rbx + mov QWORD PTR[64+rdi],rax + adc rbp,rbp + mov QWORD PTR[72+rdi],rbx + adc rdx,rdx + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rdx + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqr_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_382x:: +sqr_382x ENDP +PUBLIC mul_384 + + +ALIGN 32 +mul_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_384:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz mul_384$1 +endif + push rbp + + push rbx + + push r12 + +$L$SEH_body_mul_384:: + + + mov rbx,rdx + call __mulq_384 + + mov r12,QWORD PTR[rsp] + + mov rbx,QWORD PTR[8+rsp] + + mov rbp,QWORD PTR[16+rsp] + + lea rsp,QWORD PTR[24+rsp] + +$L$SEH_epilogue_mul_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_384:: +mul_384 ENDP + + +ALIGN 32 +__mulq_384 PROC PRIVATE + DB 243,15,30,250 + + mov rax,QWORD PTR[rbx] + + mov rbp,rax + mul QWORD PTR[rsi] + mov QWORD PTR[rdi],rax + mov rax,rbp + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r11,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[8+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + 
adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[16+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[24+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[32+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[32+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,QWORD PTR[40+rbx] + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov rbp,rax + mul QWORD PTR[rsi] + add rcx,rax + mov rax,rbp + adc rdx,0 + mov QWORD PTR[40+rdi],rcx + mov rcx,rdx + + mul QWORD PTR[8+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r12,rax + mov rax,rax + adc rdx,0 + add r11,r12 + adc rdx,0 + mov r12,rdx + mov QWORD PTR[48+rdi],rcx + mov QWORD PTR[56+rdi],r8 + mov QWORD PTR[64+rdi],r9 + mov QWORD PTR[72+rdi],r10 + mov QWORD PTR[80+rdi],r11 + mov QWORD PTR[88+rdi],r12 + + DB 0F3h,0C3h ;repret +__mulq_384 ENDP +PUBLIC sqr_384 + + +ALIGN 32 +sqr_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_384:: + + + mov rdi,rcx + mov rsi,rdx +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_384$1 
+endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sqr_384:: + + + call __sqrq_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqr_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_384:: +sqr_384 ENDP + + +ALIGN 32 +__sqrq_384 PROC PRIVATE + DB 243,15,30,250 + + mov rax,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rcx,QWORD PTR[16+rsi] + mov rbx,QWORD PTR[24+rsi] + + + mov r14,rax + mul r15 + mov r9,rax + mov rax,r14 + mov rbp,QWORD PTR[32+rsi] + mov r10,rdx + + mul rcx + add r10,rax + mov rax,r14 + adc rdx,0 + mov rsi,QWORD PTR[40+rsi] + mov r11,rdx + + mul rbx + add r11,rax + mov rax,r14 + adc rdx,0 + mov r12,rdx + + mul rbp + add r12,rax + mov rax,r14 + adc rdx,0 + mov r13,rdx + + mul rsi + add r13,rax + mov rax,r14 + adc rdx,0 + mov r14,rdx + + mul rax + xor r8,r8 + mov QWORD PTR[rdi],rax + mov rax,r15 + add r9,r9 + adc r8,0 + add r9,rdx + adc r8,0 + mov QWORD PTR[8+rdi],r9 + + mul rcx + add r11,rax + mov rax,r15 + adc rdx,0 + mov r9,rdx + + mul rbx + add r12,rax + mov rax,r15 + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul rbp + add r13,rax + mov rax,r15 + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul rsi + add r14,rax + mov rax,r15 + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r15,rdx + + mul rax + xor r9,r9 + add r8,rax + mov rax,rcx + add r10,r10 + adc r11,r11 + adc r9,0 + add r10,r8 + adc r11,rdx + adc r9,0 + mov QWORD PTR[16+rdi],r10 + + mul rbx + add r13,rax + mov rax,rcx + adc rdx,0 + mov QWORD PTR[24+rdi],r11 + mov r8,rdx + + mul rbp + add r14,rax + mov rax,rcx + adc rdx,0 + add r14,r8 + adc rdx,0 + mov r8,rdx + + mul rsi + add r15,rax + mov rax,rcx + adc rdx,0 + add r15,r8 + adc rdx,0 + mov rcx,rdx + + mul rax + xor r11,r11 + add r9,rax + mov rax,rbx + add r12,r12 + adc r13,r13 + adc r11,0 + add r12,r9 + adc r13,rdx + adc r11,0 + mov QWORD PTR[32+rdi],r12 + + + mul rbp + add r15,rax + mov rax,rbx + adc rdx,0 + mov QWORD PTR[40+rdi],r13 + mov r8,rdx + + mul rsi + add rcx,rax + mov rax,rbx + adc rdx,0 + add rcx,r8 + adc rdx,0 + mov rbx,rdx + + mul rax + xor r12,r12 + add r11,rax + mov rax,rbp + add r14,r14 + adc r15,r15 + adc r12,0 + add r14,r11 + adc r15,rdx + mov QWORD PTR[48+rdi],r14 + adc r12,0 + mov QWORD PTR[56+rdi],r15 + + + mul rsi + add rbx,rax + mov rax,rbp + adc rdx,0 + mov rbp,rdx + + mul rax + xor r13,r13 + add r12,rax + mov rax,rsi + add rcx,rcx + adc rbx,rbx + adc r13,0 + add rcx,r12 + adc rbx,rdx + mov QWORD PTR[64+rdi],rcx + adc r13,0 + mov QWORD PTR[72+rdi],rbx + + + mul rax + add rax,r13 + add rbp,rbp + adc rdx,0 + add rax,rbp + adc rdx,0 + mov QWORD PTR[80+rdi],rax + mov QWORD PTR[88+rdi],rdx + + DB 0F3h,0C3h ;repret +__sqrq_384 ENDP + +PUBLIC sqr_mont_384 + + +ALIGN 32 +sqr_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_384:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_mont_384$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*15 + +$L$SEH_body_sqr_mont_384:: + + + mov QWORD PTR[96+rsp],rcx + mov QWORD PTR[104+rsp],rdx + mov QWORD PTR[112+rsp],rdi + + mov rdi,rsp + call __sqrq_384 + + lea 
rsi,QWORD PTR[rsp] + mov rcx,QWORD PTR[96+rsp] + mov rbx,QWORD PTR[104+rsp] + mov rdi,QWORD PTR[112+rsp] + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + lea r8,QWORD PTR[120+rsp] + mov r15,QWORD PTR[120+rsp] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_384:: +sqr_mont_384 ENDP + + + +PUBLIC redc_mont_384 + + +ALIGN 32 +redc_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redc_mont_384:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz redc_mont_384$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redc_mont_384:: + + + mov rbx,rdx + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redc_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redc_mont_384:: +redc_mont_384 ENDP + + + + +PUBLIC from_mont_384 + + +ALIGN 32 +from_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_from_mont_384:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz from_mont_384$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_from_mont_384:: + + + mov rbx,rdx + call __mulq_by_1_mont_384 + + + + + + mov rcx,r15 + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_from_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_from_mont_384:: +from_mont_384 ENDP + +ALIGN 32 +__mulq_by_1_mont_384 PROC PRIVATE + DB 243,15,30,250 + + mov rax,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,rax + imul rax,rcx + mov r8,rax + + mul QWORD PTR[rbx] + add r14,rax + mov rax,r8 + adc r14,rdx + + mul QWORD PTR[8+rbx] + add r9,rax + mov rax,r8 + adc rdx,0 + add r9,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[16+rbx] + add r10,rax + mov rax,r8 + adc rdx,0 + add r10,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[24+rbx] + add r11,rax + mov rax,r8 + adc rdx,0 + mov r15,r9 + 
imul r9,rcx + add r11,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[32+rbx] + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[40+rbx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rbx] + add r15,rax + mov rax,r9 + adc r15,rdx + + mul QWORD PTR[8+rbx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rbx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rbx] + add r12,rax + mov rax,r9 + adc rdx,0 + mov r8,r10 + imul r10,rcx + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[32+rbx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[40+rbx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rbx] + add r8,rax + mov rax,r10 + adc r8,rdx + + mul QWORD PTR[8+rbx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rbx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[24+rbx] + add r13,rax + mov rax,r10 + adc rdx,0 + mov r9,r11 + imul r11,rcx + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rbx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rbx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[rbx] + add r9,rax + mov rax,r11 + adc r9,rdx + + mul QWORD PTR[8+rbx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rbx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rbx] + add r14,rax + mov rax,r11 + adc rdx,0 + mov r10,r12 + imul r12,rcx + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rbx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rbx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[rbx] + add r10,rax + mov rax,r12 + adc r10,rdx + + mul QWORD PTR[8+rbx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[24+rbx] + add r15,rax + mov rax,r12 + adc rdx,0 + mov r11,r13 + imul r13,rcx + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rbx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rbx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[rbx] + add r11,rax + mov rax,r13 + adc r11,rdx + + mul QWORD PTR[8+rbx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[24+rbx] + add r8,rax + mov rax,r13 + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rbx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rbx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + DB 0F3h,0C3h ;repret +__mulq_by_1_mont_384 ENDP + + +ALIGN 32 +__redq_tail_mont_384 PROC PRIVATE + DB 243,15,30,250 + + add r14,QWORD PTR[48+rsi] + mov rax,r14 + adc r15,QWORD PTR[56+rsi] + adc r8,QWORD PTR[64+rsi] + adc r9,QWORD PTR[72+rsi] + 
mov rcx,r15 + adc r10,QWORD PTR[80+rsi] + adc r11,QWORD PTR[88+rsi] + sbb r12,r12 + + + + + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + sbb r12,0 + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + DB 0F3h,0C3h ;repret +__redq_tail_mont_384 ENDP + +PUBLIC sgn0_pty_mont_384 + + +ALIGN 32 +sgn0_pty_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mont_384:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sgn0_pty_mont_384$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0_pty_mont_384:: + + + mov rbx,rsi + lea rsi,QWORD PTR[rdi] + mov rcx,rdx + call __mulq_by_1_mont_384 + + xor rax,rax + mov r13,r14 + add r14,r14 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + not rax + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0_pty_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mont_384:: +sgn0_pty_mont_384 ENDP + +PUBLIC sgn0_pty_mont_384x + + +ALIGN 32 +sgn0_pty_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0_pty_mont_384x:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sgn0_pty_mont_384x$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0_pty_mont_384x:: + + + mov rbx,rsi + lea rsi,QWORD PTR[48+rdi] + mov rcx,rdx + call __mulq_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + lea rsi,QWORD PTR[rdi] + xor rdi,rdi + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rdi,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rdi,0 + + mov QWORD PTR[rsp],r14 + not rdi + and r13,1 + and rdi,2 + or rdi,r13 + + call __mulq_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + xor rax,rax + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + mov r12,QWORD PTR[rsp] + + not rax + + test r14,r14 + cmovz r13,rdi + + test r12,r12 + cmovnz rax,rdi + + and r13,1 + and rax,2 + or rax,r13 + + mov 
r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0_pty_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0_pty_mont_384x:: +sgn0_pty_mont_384x ENDP +PUBLIC mul_mont_384 + + +ALIGN 32 +mul_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mul_mont_384:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz mul_mont_384$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*3 + +$L$SEH_body_mul_mont_384:: + + + mov rax,QWORD PTR[rdx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + mov rbx,rdx + mov QWORD PTR[rsp],r8 + mov QWORD PTR[8+rsp],rdi + + call __mulq_mont_384 + + mov r15,QWORD PTR[24+rsp] + + mov r14,QWORD PTR[32+rsp] + + mov r13,QWORD PTR[40+rsp] + + mov r12,QWORD PTR[48+rsp] + + mov rbx,QWORD PTR[56+rsp] + + mov rbp,QWORD PTR[64+rsp] + + lea rsp,QWORD PTR[72+rsp] + +$L$SEH_epilogue_mul_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mul_mont_384:: +mul_mont_384 ENDP + +ALIGN 32 +__mulq_mont_384 PROC PRIVATE + DB 243,15,30,250 + + mov rdi,rax + mul r14 + mov r8,rax + mov rax,rdi + mov r9,rdx + + mul r15 + add r9,rax + mov rax,rdi + adc rdx,0 + mov r10,rdx + + mul r12 + add r10,rax + mov rax,rdi + adc rdx,0 + mov r11,rdx + + mov rbp,r8 + imul r8,QWORD PTR[8+rsp] + + mul r13 + add r11,rax + mov rax,rdi + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[40+rsi] + add r13,rax + mov rax,r8 + adc rdx,0 + xor r15,r15 + mov r14,rdx + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r8 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r9,rax + mov rax,r8 + adc rdx,0 + add r9,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r10,rax + mov rax,r8 + adc rdx,0 + add r10,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,r8 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r13,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + add r13,rbp + adc r14,rdx + adc r15,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r9,rax + mov rax,rdi + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[8+rsi] + add r10,rax + mov rax,rdi + adc rdx,0 + add r10,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r11,rax + mov rax,rdi + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mov rbp,r9 + imul r9,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + add r12,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rsi] + add r14,r8 + adc rdx,0 + xor r8,r8 + add r14,rax + mov rax,r9 + adc r15,rdx + adc r8,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r9 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,rbp + 
adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,r9 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r14,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r14,rbp + adc r15,rdx + adc r8,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r10,rax + mov rax,rdi + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[8+rsi] + add r11,rax + mov rax,rdi + adc rdx,0 + add r11,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mov rbp,r10 + imul r10,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rsi] + add r15,r9 + adc rdx,0 + xor r9,r9 + add r15,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r10 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r13,rbp + adc rdx,0 + add r13,rax + mov rax,r10 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r15,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r15,rbp + adc r8,rdx + adc r9,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r11,rax + mov rax,rdi + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[8+rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + add r12,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mov rbp,r11 + imul r11,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r15,rax + mov rax,rdi + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rsi] + add r8,r10 + adc rdx,0 + xor r10,r10 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r11 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r14,rbp + adc rdx,0 + add r14,rax + mov rax,r11 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r8,rax + mov rax,QWORD PTR[32+rbx] + adc rdx,0 + add r8,rbp + adc r9,rdx + adc r10,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r12,rax + mov rax,rdi + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[8+rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + add r13,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mov rbp,r12 + imul r12,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r15,rax + mov rax,rdi + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rsi] + add r8,rax + mov rax,rdi + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r9,r11 + adc rdx,0 + xor r11,r11 + add r9,rax + mov rax,r12 + 
adc r10,rdx + adc r11,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r12 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r15,rbp + adc rdx,0 + add r15,rax + mov rax,r12 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r9,rax + mov rax,QWORD PTR[40+rbx] + adc rdx,0 + add r9,rbp + adc r10,rdx + adc r11,0 + + mov rdi,rax + mul QWORD PTR[rsi] + add r13,rax + mov rax,rdi + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[8+rsi] + add r14,rax + mov rax,rdi + adc rdx,0 + add r14,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[16+rsi] + add r15,rax + mov rax,rdi + adc rdx,0 + add r15,r12 + adc rdx,0 + mov r12,rdx + + mov rbp,r13 + imul r13,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r8,rax + mov rax,rdi + adc rdx,0 + add r8,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rsi] + add r9,rax + mov rax,rdi + adc rdx,0 + add r9,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[40+rsi] + add r10,r12 + adc rdx,0 + xor r12,r12 + add r10,rax + mov rax,r13 + adc r11,rdx + adc r12,0 + + mul QWORD PTR[rcx] + add rbp,rax + mov rax,r13 + adc rbp,rdx + + mul QWORD PTR[8+rcx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[16+rcx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[24+rcx] + add r8,rbp + adc rdx,0 + add r8,rax + mov rax,r13 + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[32+rcx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,rbp + adc rdx,0 + mov rbp,rdx + + mul QWORD PTR[40+rcx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,rbp + adc r11,rdx + adc r12,0 + + + + + mov rdi,QWORD PTR[16+rsp] + sub r14,QWORD PTR[rcx] + mov rdx,r15 + sbb r15,QWORD PTR[8+rcx] + mov rbx,r8 + sbb r8,QWORD PTR[16+rcx] + mov rsi,r9 + sbb r9,QWORD PTR[24+rcx] + mov rbp,r10 + sbb r10,QWORD PTR[32+rcx] + mov r13,r11 + sbb r11,QWORD PTR[40+rcx] + sbb r12,0 + + cmovc r14,rax + cmovc r15,rdx + cmovc r8,rbx + mov QWORD PTR[rdi],r14 + cmovc r9,rsi + mov QWORD PTR[8+rdi],r15 + cmovc r10,rbp + mov QWORD PTR[16+rdi],r8 + cmovc r11,r13 + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + DB 0F3h,0C3h ;repret +__mulq_mont_384 ENDP +PUBLIC sqr_n_mul_mont_384 + + +ALIGN 32 +sqr_n_mul_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_n_mul_mont_384:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_n_mul_mont_384$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*17 + +$L$SEH_body_sqr_n_mul_mont_384:: + + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rcx + lea rdi,QWORD PTR[32+rsp] + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + +$L$oop_sqr_384:: + movd xmm1,edx + + call __sqrq_384 + + lea rsi,QWORD PTR[rdi] + mov rcx,QWORD PTR[rsp] + mov rbx,QWORD PTR[16+rsp] + call __mulq_by_1_mont_384 + call __redq_tail_mont_384 + + movd edx,xmm1 + lea rsi,QWORD PTR[rdi] + dec edx + jnz $L$oop_sqr_384 + +DB 102,72,15,126,208 + mov rcx,rbx + mov rbx,QWORD PTR[24+rsp] + + + + + + + mov r12,r8 + mov 
r13,r9 + + call __mulq_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[136+rsp] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_n_mul_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_n_mul_mont_384:: +sqr_n_mul_mont_384 ENDP + +PUBLIC sqr_n_mul_mont_383 + + +ALIGN 32 +sqr_n_mul_mont_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_n_mul_mont_383:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_n_mul_mont_383$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8*17 + +$L$SEH_body_sqr_n_mul_mont_383:: + + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[8+rsp],rdi + mov QWORD PTR[16+rsp],rcx + lea rdi,QWORD PTR[32+rsp] + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + +$L$oop_sqr_383:: + movd xmm1,edx + + call __sqrq_384 + + lea rsi,QWORD PTR[rdi] + mov rcx,QWORD PTR[rsp] + mov rbx,QWORD PTR[16+rsp] + call __mulq_by_1_mont_384 + + movd edx,xmm1 + add r14,QWORD PTR[48+rsi] + adc r15,QWORD PTR[56+rsi] + adc r8,QWORD PTR[64+rsi] + adc r9,QWORD PTR[72+rsi] + adc r10,QWORD PTR[80+rsi] + adc r11,QWORD PTR[88+rsi] + lea rsi,QWORD PTR[rdi] + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],r8 + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + dec edx + jnz $L$oop_sqr_383 + +DB 102,72,15,126,208 + mov rcx,rbx + mov rbx,QWORD PTR[24+rsp] + + + + + + + mov r12,r8 + mov r13,r9 + + call __mulq_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[136+rsp] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_n_mul_mont_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_n_mul_mont_383:: +sqr_n_mul_mont_383 ENDP + +ALIGN 32 +__mulq_mont_383_nonred PROC PRIVATE + DB 243,15,30,250 + + mov rbp,rax + mul r14 + mov r8,rax + mov rax,rbp + mov r9,rdx + + mul r15 + add r9,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul r12 + add r10,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mov r15,r8 + imul r8,QWORD PTR[8+rsp] + + mul r13 + add r11,rax + mov rax,rbp + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + mov r13,rdx + + mul QWORD PTR[40+rsi] + add r13,rax + mov rax,r8 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rcx] + add r15,rax + mov rax,r8 + adc r15,rdx + + mul QWORD PTR[8+rcx] + add r9,rax + mov rax,r8 + adc rdx,0 + add r9,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rcx] + add r10,rax + mov rax,r8 + adc rdx,0 + add r10,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rcx] + add r11,r15 + adc rdx,0 + add r11,rax + mov rax,r8 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[32+rcx] + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[40+rcx] + add r13,rax + mov rax,QWORD PTR[8+rbx] + adc rdx,0 + add r13,r15 + adc r14,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[8+rsi] + add r10,rax 
+ mov rax,rbp + adc rdx,0 + add r10,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r11,r15 + adc rdx,0 + mov r15,rdx + + mov r8,r9 + imul r9,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[32+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[40+rsi] + add r14,r15 + adc rdx,0 + add r14,rax + mov rax,r9 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rcx] + add r8,rax + mov rax,r9 + adc r8,rdx + + mul QWORD PTR[8+rcx] + add r10,rax + mov rax,r9 + adc rdx,0 + add r10,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rcx] + add r11,rax + mov rax,r9 + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[24+rcx] + add r12,r8 + adc rdx,0 + add r12,rax + mov rax,r9 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rcx] + add r13,rax + mov rax,r9 + adc rdx,0 + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rcx] + add r14,rax + mov rax,QWORD PTR[16+rbx] + adc rdx,0 + add r14,r8 + adc r15,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[8+rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + add r11,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[16+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + add r12,r8 + adc rdx,0 + mov r8,rdx + + mov r9,r10 + imul r10,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[32+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r8 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[40+rsi] + add r15,r8 + adc rdx,0 + add r15,rax + mov rax,r10 + adc rdx,0 + mov r8,rdx + + mul QWORD PTR[rcx] + add r9,rax + mov rax,r10 + adc r9,rdx + + mul QWORD PTR[8+rcx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rcx] + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[24+rcx] + add r13,r9 + adc rdx,0 + add r13,rax + mov rax,r10 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rcx] + add r14,rax + mov rax,r10 + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rcx] + add r15,rax + mov rax,QWORD PTR[24+rbx] + adc rdx,0 + add r15,r9 + adc r8,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[8+rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + add r12,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[16+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r9 + adc rdx,0 + mov r9,rdx + + mov r10,r11 + imul r11,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[32+rsi] + add r15,rax + mov rax,rbp + adc rdx,0 + add r15,r9 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[40+rsi] + add r8,r9 + adc rdx,0 + add r8,rax + mov rax,r11 + adc rdx,0 + mov r9,rdx + + mul QWORD PTR[rcx] + add r10,rax + mov rax,r11 + adc r10,rdx + + mul QWORD PTR[8+rcx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rcx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[24+rcx] + add r14,r10 + adc rdx,0 + add r14,rax + mov rax,r11 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rcx] + add r15,rax + mov rax,r11 + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rcx] + add r8,rax + mov rax,QWORD PTR[32+rbx] + 
adc rdx,0 + add r8,r10 + adc r9,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r12,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[8+rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mov r11,r12 + imul r12,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r15,rax + mov rax,rbp + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[32+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add r8,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[40+rsi] + add r9,r10 + adc rdx,0 + add r9,rax + mov rax,r12 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[rcx] + add r11,rax + mov rax,r12 + adc r11,rdx + + mul QWORD PTR[8+rcx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rcx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[24+rcx] + add r15,r11 + adc rdx,0 + add r15,rax + mov rax,r12 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rcx] + add r8,rax + mov rax,r12 + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rcx] + add r9,rax + mov rax,QWORD PTR[40+rbx] + adc rdx,0 + add r9,r11 + adc r10,rdx + + mov rbp,rax + mul QWORD PTR[rsi] + add r13,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[8+rsi] + add r14,rax + mov rax,rbp + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rsi] + add r15,rax + mov rax,rbp + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mov r12,r13 + imul r13,QWORD PTR[8+rsp] + + mul QWORD PTR[24+rsi] + add r8,rax + mov rax,rbp + adc rdx,0 + add r8,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[32+rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + add r9,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[40+rsi] + add r10,r11 + adc rdx,0 + add r10,rax + mov rax,r13 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[rcx] + add r12,rax + mov rax,r13 + adc r12,rdx + + mul QWORD PTR[8+rcx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[16+rcx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[24+rcx] + add r8,r12 + adc rdx,0 + add r8,rax + mov rax,r13 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[32+rcx] + add r9,rax + mov rax,r13 + adc rdx,0 + add r9,r12 + adc rdx,0 + mov r12,rdx + + mul QWORD PTR[40+rcx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,r12 + adc r11,rdx + DB 0F3h,0C3h ;repret +__mulq_mont_383_nonred ENDP +PUBLIC sqr_mont_382x + + +ALIGN 32 +sqr_mont_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqr_mont_382x:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +ifdef __BLST_PORTABLE__ + test DWORD PTR[__blst_platform_cap],1 + jnz sqr_mont_382x$1 +endif + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqr_mont_382x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + mov QWORD PTR[16+rsp],rsi + mov QWORD PTR[24+rsp],rdi + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,r8 + add r8,QWORD PTR[48+rsi] + mov r15,r9 + adc r9,QWORD PTR[56+rsi] + mov rax,r10 + adc r10,QWORD PTR[64+rsi] + mov rdx,r11 + adc r11,QWORD PTR[72+rsi] + mov rbx,r12 + adc r12,QWORD PTR[80+rsi] + mov rbp,r13 + adc r13,QWORD 
PTR[88+rsi] + + sub r14,QWORD PTR[48+rsi] + sbb r15,QWORD PTR[56+rsi] + sbb rax,QWORD PTR[64+rsi] + sbb rdx,QWORD PTR[72+rsi] + sbb rbx,QWORD PTR[80+rsi] + sbb rbp,QWORD PTR[88+rsi] + sbb rdi,rdi + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + mov QWORD PTR[((32+48))+rsp],r14 + mov QWORD PTR[((32+56))+rsp],r15 + mov QWORD PTR[((32+64))+rsp],rax + mov QWORD PTR[((32+72))+rsp],rdx + mov QWORD PTR[((32+80))+rsp],rbx + mov QWORD PTR[((32+88))+rsp],rbp + mov QWORD PTR[((32+96))+rsp],rdi + + + + lea rbx,QWORD PTR[48+rsi] + + mov rax,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + + mov rdi,QWORD PTR[24+rsp] + call __mulq_mont_383_nonred + add r14,r14 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + + mov QWORD PTR[48+rdi],r14 + mov QWORD PTR[56+rdi],r15 + mov QWORD PTR[64+rdi],r8 + mov QWORD PTR[72+rdi],r9 + mov QWORD PTR[80+rdi],r10 + mov QWORD PTR[88+rdi],r11 + + lea rsi,QWORD PTR[32+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rax,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov r12,QWORD PTR[((32+16))+rsp] + mov r13,QWORD PTR[((32+24))+rsp] + + call __mulq_mont_383_nonred + mov rsi,QWORD PTR[((32+96))+rsp] + mov r12,QWORD PTR[((32+0))+rsp] + mov r13,QWORD PTR[((32+8))+rsp] + and r12,rsi + mov rax,QWORD PTR[((32+16))+rsp] + and r13,rsi + mov rbx,QWORD PTR[((32+24))+rsp] + and rax,rsi + mov rbp,QWORD PTR[((32+32))+rsp] + and rbx,rsi + and rbp,rsi + and rsi,QWORD PTR[((32+40))+rsp] + + sub r14,r12 + mov r12,QWORD PTR[rcx] + sbb r15,r13 + mov r13,QWORD PTR[8+rcx] + sbb r8,rax + mov rax,QWORD PTR[16+rcx] + sbb r9,rbx + mov rbx,QWORD PTR[24+rcx] + sbb r10,rbp + mov rbp,QWORD PTR[32+rcx] + sbb r11,rsi + sbb rsi,rsi + + and r12,rsi + and r13,rsi + and rax,rsi + and rbx,rsi + and rbp,rsi + and rsi,QWORD PTR[40+rcx] + + add r14,r12 + adc r15,r13 + adc r8,rax + adc r9,rbx + adc r10,rbp + adc r11,rsi + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],r8 + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqr_mont_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqr_mont_382x:: +sqr_mont_382x ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mul_mont_384x + DD imagerel $L$SEH_body_mul_mont_384x + DD imagerel $L$SEH_info_mul_mont_384x_prologue + + DD imagerel $L$SEH_body_mul_mont_384x + DD imagerel $L$SEH_epilogue_mul_mont_384x + DD imagerel $L$SEH_info_mul_mont_384x_body + + DD imagerel $L$SEH_epilogue_mul_mont_384x + DD imagerel $L$SEH_end_mul_mont_384x + DD imagerel $L$SEH_info_mul_mont_384x_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_384x + DD imagerel $L$SEH_body_sqr_mont_384x + DD imagerel $L$SEH_info_sqr_mont_384x_prologue + + DD imagerel $L$SEH_body_sqr_mont_384x + DD imagerel $L$SEH_epilogue_sqr_mont_384x + DD imagerel $L$SEH_info_sqr_mont_384x_body + + DD imagerel $L$SEH_epilogue_sqr_mont_384x + DD imagerel $L$SEH_end_sqr_mont_384x + DD imagerel $L$SEH_info_sqr_mont_384x_epilogue + 
+ DD imagerel $L$SEH_begin_mul_382x + DD imagerel $L$SEH_body_mul_382x + DD imagerel $L$SEH_info_mul_382x_prologue + + DD imagerel $L$SEH_body_mul_382x + DD imagerel $L$SEH_epilogue_mul_382x + DD imagerel $L$SEH_info_mul_382x_body + + DD imagerel $L$SEH_epilogue_mul_382x + DD imagerel $L$SEH_end_mul_382x + DD imagerel $L$SEH_info_mul_382x_epilogue + + DD imagerel $L$SEH_begin_sqr_382x + DD imagerel $L$SEH_body_sqr_382x + DD imagerel $L$SEH_info_sqr_382x_prologue + + DD imagerel $L$SEH_body_sqr_382x + DD imagerel $L$SEH_epilogue_sqr_382x + DD imagerel $L$SEH_info_sqr_382x_body + + DD imagerel $L$SEH_epilogue_sqr_382x + DD imagerel $L$SEH_end_sqr_382x + DD imagerel $L$SEH_info_sqr_382x_epilogue + + DD imagerel $L$SEH_begin_mul_384 + DD imagerel $L$SEH_body_mul_384 + DD imagerel $L$SEH_info_mul_384_prologue + + DD imagerel $L$SEH_body_mul_384 + DD imagerel $L$SEH_epilogue_mul_384 + DD imagerel $L$SEH_info_mul_384_body + + DD imagerel $L$SEH_epilogue_mul_384 + DD imagerel $L$SEH_end_mul_384 + DD imagerel $L$SEH_info_mul_384_epilogue + + DD imagerel $L$SEH_begin_sqr_384 + DD imagerel $L$SEH_body_sqr_384 + DD imagerel $L$SEH_info_sqr_384_prologue + + DD imagerel $L$SEH_body_sqr_384 + DD imagerel $L$SEH_epilogue_sqr_384 + DD imagerel $L$SEH_info_sqr_384_body + + DD imagerel $L$SEH_epilogue_sqr_384 + DD imagerel $L$SEH_end_sqr_384 + DD imagerel $L$SEH_info_sqr_384_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_384 + DD imagerel $L$SEH_body_sqr_mont_384 + DD imagerel $L$SEH_info_sqr_mont_384_prologue + + DD imagerel $L$SEH_body_sqr_mont_384 + DD imagerel $L$SEH_epilogue_sqr_mont_384 + DD imagerel $L$SEH_info_sqr_mont_384_body + + DD imagerel $L$SEH_epilogue_sqr_mont_384 + DD imagerel $L$SEH_end_sqr_mont_384 + DD imagerel $L$SEH_info_sqr_mont_384_epilogue + + DD imagerel $L$SEH_begin_redc_mont_384 + DD imagerel $L$SEH_body_redc_mont_384 + DD imagerel $L$SEH_info_redc_mont_384_prologue + + DD imagerel $L$SEH_body_redc_mont_384 + DD imagerel $L$SEH_epilogue_redc_mont_384 + DD imagerel $L$SEH_info_redc_mont_384_body + + DD imagerel $L$SEH_epilogue_redc_mont_384 + DD imagerel $L$SEH_end_redc_mont_384 + DD imagerel $L$SEH_info_redc_mont_384_epilogue + + DD imagerel $L$SEH_begin_from_mont_384 + DD imagerel $L$SEH_body_from_mont_384 + DD imagerel $L$SEH_info_from_mont_384_prologue + + DD imagerel $L$SEH_body_from_mont_384 + DD imagerel $L$SEH_epilogue_from_mont_384 + DD imagerel $L$SEH_info_from_mont_384_body + + DD imagerel $L$SEH_epilogue_from_mont_384 + DD imagerel $L$SEH_end_from_mont_384 + DD imagerel $L$SEH_info_from_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mont_384 + DD imagerel $L$SEH_body_sgn0_pty_mont_384 + DD imagerel $L$SEH_info_sgn0_pty_mont_384_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mont_384 + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384 + DD imagerel $L$SEH_info_sgn0_pty_mont_384_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384 + DD imagerel $L$SEH_end_sgn0_pty_mont_384 + DD imagerel $L$SEH_info_sgn0_pty_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0_pty_mont_384x + DD imagerel $L$SEH_body_sgn0_pty_mont_384x + DD imagerel $L$SEH_info_sgn0_pty_mont_384x_prologue + + DD imagerel $L$SEH_body_sgn0_pty_mont_384x + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384x + DD imagerel $L$SEH_info_sgn0_pty_mont_384x_body + + DD imagerel $L$SEH_epilogue_sgn0_pty_mont_384x + DD imagerel $L$SEH_end_sgn0_pty_mont_384x + DD imagerel $L$SEH_info_sgn0_pty_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mul_mont_384 + DD imagerel $L$SEH_body_mul_mont_384 + DD imagerel 
$L$SEH_info_mul_mont_384_prologue + + DD imagerel $L$SEH_body_mul_mont_384 + DD imagerel $L$SEH_epilogue_mul_mont_384 + DD imagerel $L$SEH_info_mul_mont_384_body + + DD imagerel $L$SEH_epilogue_mul_mont_384 + DD imagerel $L$SEH_end_mul_mont_384 + DD imagerel $L$SEH_info_mul_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqr_n_mul_mont_384 + DD imagerel $L$SEH_body_sqr_n_mul_mont_384 + DD imagerel $L$SEH_info_sqr_n_mul_mont_384_prologue + + DD imagerel $L$SEH_body_sqr_n_mul_mont_384 + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_384 + DD imagerel $L$SEH_info_sqr_n_mul_mont_384_body + + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_384 + DD imagerel $L$SEH_end_sqr_n_mul_mont_384 + DD imagerel $L$SEH_info_sqr_n_mul_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqr_n_mul_mont_383 + DD imagerel $L$SEH_body_sqr_n_mul_mont_383 + DD imagerel $L$SEH_info_sqr_n_mul_mont_383_prologue + + DD imagerel $L$SEH_body_sqr_n_mul_mont_383 + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_383 + DD imagerel $L$SEH_info_sqr_n_mul_mont_383_body + + DD imagerel $L$SEH_epilogue_sqr_n_mul_mont_383 + DD imagerel $L$SEH_end_sqr_n_mul_mont_383 + DD imagerel $L$SEH_info_sqr_n_mul_mont_383_epilogue + + DD imagerel $L$SEH_begin_sqr_mont_382x + DD imagerel $L$SEH_body_sqr_mont_382x + DD imagerel $L$SEH_info_sqr_mont_382x_prologue + + DD imagerel $L$SEH_body_sqr_mont_382x + DD imagerel $L$SEH_epilogue_sqr_mont_382x + DD imagerel $L$SEH_info_sqr_mont_382x_body + + DD imagerel $L$SEH_epilogue_sqr_mont_382x + DD imagerel $L$SEH_end_sqr_mont_382x + DD imagerel $L$SEH_info_sqr_mont_382x_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mul_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_mul_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,029h,000h +DB 000h,0e4h,02ah,000h +DB 000h,0d4h,02bh,000h +DB 000h,0c4h,02ch,000h +DB 000h,034h,02dh,000h +DB 000h,054h,02eh,000h +DB 000h,074h,030h,000h +DB 000h,064h,031h,000h +DB 000h,001h,02fh,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_mul_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sqr_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sqr_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_mul_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_mul_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sqr_382x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 
000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sqr_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_mul_384_body:: +DB 1,0,11,0 +DB 000h,0c4h,000h,000h +DB 000h,034h,001h,000h +DB 000h,054h,002h,000h +DB 000h,074h,004h,000h +DB 000h,064h,005h,000h +DB 000h,022h +DB 000h,000h,000h,000h,000h,000h +$L$SEH_info_mul_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sqr_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sqr_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sqr_mont_384_body:: +DB 1,0,18,0 +DB 000h,0f4h,00fh,000h +DB 000h,0e4h,010h,000h +DB 000h,0d4h,011h,000h +DB 000h,0c4h,012h,000h +DB 000h,034h,013h,000h +DB 000h,054h,014h,000h +DB 000h,074h,016h,000h +DB 000h,064h,017h,000h +DB 000h,001h,015h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sqr_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redc_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_redc_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_redc_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_from_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_from_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_from_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sgn0_pty_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sgn0_pty_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0_pty_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sgn0_pty_mont_384x_body:: +DB 
1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sgn0_pty_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mul_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_mul_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_mul_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_n_mul_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sqr_n_mul_mont_384_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sqr_n_mul_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_n_mul_mont_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sqr_n_mul_mont_383_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sqr_n_mul_mont_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqr_mont_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sqr_mont_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sqr_mont_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm b/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm new file mode 100644 index 00000000000..21d18a8b40b --- /dev/null +++ b/crypto/blst_src/build/win64/mulx_mont_256-x86_64.asm @@ -0,0 +1,810 @@ +OPTION DOTNAME +PUBLIC mul_mont_sparse_256$1 +PUBLIC sqr_mont_sparse_256$1 +PUBLIC from_mont_256$1 +PUBLIC redc_mont_256$1 +.text$ SEGMENT ALIGN(256) 'CODE' + +PUBLIC mulx_mont_sparse_256 + + +ALIGN 32 +mulx_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_mont_sparse_256:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] +mul_mont_sparse_256$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_mulx_mont_sparse_256:: + + + mov rbx,rdx + mov rdx,QWORD 
PTR[rdx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rbp,QWORD PTR[16+rsi] + mov r9,QWORD PTR[24+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r11,rax,r14 + call __mulx_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_mulx_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_mont_sparse_256:: +mulx_mont_sparse_256 ENDP + +PUBLIC sqrx_mont_sparse_256 + + +ALIGN 32 +sqrx_mont_sparse_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_sparse_256:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +sqr_mont_sparse_256$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sqrx_mont_sparse_256:: + + + mov rbx,rsi + mov r8,rcx + mov rcx,rdx + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rbp,QWORD PTR[16+rsi] + mov r9,QWORD PTR[24+rsi] + lea rsi,QWORD PTR[((-128))+rbx] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r11,rax,rdx + call __mulx_mont_sparse_256 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqrx_mont_sparse_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_sparse_256:: +sqrx_mont_sparse_256 ENDP + +ALIGN 32 +__mulx_mont_sparse_256 PROC PRIVATE + DB 243,15,30,250 + + mulx r12,r15,r15 + mulx r13,rbp,rbp + add r11,r15 + mulx r14,r9,r9 + mov rdx,QWORD PTR[8+rbx] + adc r12,rbp + adc r13,r9 + adc r14,0 + + mov r10,rax + imul rax,r8 + + + xor r15,r15 + mulx r9,rbp,QWORD PTR[((0+128))+rsi] + adox r11,rbp + adcx r12,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rsi] + adox r12,rbp + adcx r13,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rsi] + adox r13,rbp + adcx r14,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rsi] + mov rdx,rax + adox r14,rbp + adcx r9,r15 + adox r15,r9 + + + mulx rax,rbp,QWORD PTR[((0+128))+rcx] + adcx r10,rbp + adox rax,r11 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx rax,rbp + adox r12,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r12,rbp + adox r13,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,QWORD PTR[16+rbx] + adcx r13,rbp + adox r14,r9 + adcx r14,r10 + adox r15,r10 + adcx r15,r10 + adox r10,r10 + adc r10,0 + mov r11,rax + imul rax,r8 + + + xor rbp,rbp + mulx r9,rbp,QWORD PTR[((0+128))+rsi] + adox r12,rbp + adcx r13,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rsi] + adox r13,rbp + adcx r14,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rsi] + adox r14,rbp + adcx r15,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rsi] + mov rdx,rax + adox r15,rbp + adcx r9,r10 + adox r10,r9 + + + mulx rax,rbp,QWORD PTR[((0+128))+rcx] + adcx r11,rbp + adox rax,r12 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx rax,rbp + adox r13,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r13,rbp + adox r14,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,QWORD PTR[24+rbx] + adcx r14,rbp + adox r15,r9 + adcx r15,r11 + adox r10,r11 + adcx r10,r11 + adox r11,r11 + adc r11,0 + mov r12,rax + imul rax,r8 + + + xor rbp,rbp + mulx r9,rbp,QWORD PTR[((0+128))+rsi] + adox r13,rbp + adcx 
r14,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rsi] + adox r14,rbp + adcx r15,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rsi] + adox r15,rbp + adcx r10,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rsi] + mov rdx,rax + adox r10,rbp + adcx r9,r11 + adox r11,r9 + + + mulx rax,rbp,QWORD PTR[((0+128))+rcx] + adcx r12,rbp + adox rax,r13 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx rax,rbp + adox r14,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r14,rbp + adox r15,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,rax + adcx r15,rbp + adox r10,r9 + adcx r10,r12 + adox r11,r12 + adcx r11,r12 + adox r12,r12 + adc r12,0 + imul rdx,r8 + + + xor rbp,rbp + mulx r9,r13,QWORD PTR[((0+128))+rcx] + adcx r13,rax + adox r14,r9 + + mulx r9,rbp,QWORD PTR[((8+128))+rcx] + adcx r14,rbp + adox r15,r9 + + mulx r9,rbp,QWORD PTR[((16+128))+rcx] + adcx r15,rbp + adox r10,r9 + + mulx r9,rbp,QWORD PTR[((24+128))+rcx] + mov rdx,r14 + lea rcx,QWORD PTR[128+rcx] + adcx r10,rbp + adox r11,r9 + mov rax,r15 + adcx r11,r13 + adox r12,r13 + adc r12,0 + + + + + mov rbp,r10 + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rcx] + mov r9,r11 + sbb r11,QWORD PTR[24+rcx] + sbb r12,0 + + cmovc r14,rdx + cmovc r15,rax + cmovc r10,rbp + mov QWORD PTR[rdi],r14 + cmovc r11,r9 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],r10 + mov QWORD PTR[24+rdi],r11 + + DB 0F3h,0C3h ;repret +__mulx_mont_sparse_256 ENDP +PUBLIC fromx_mont_256 + + +ALIGN 32 +fromx_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_fromx_mont_256:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +from_mont_256$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_fromx_mont_256:: + + + mov rbx,rdx + call __mulx_by_1_mont_256 + + + + + + mov rdx,r15 + mov r12,r10 + mov r13,r11 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r10,QWORD PTR[16+rbx] + sbb r11,QWORD PTR[24+rbx] + + cmovnc rax,r14 + cmovnc rdx,r15 + cmovnc r12,r10 + mov QWORD PTR[rdi],rax + cmovnc r13,r11 + mov QWORD PTR[8+rdi],rdx + mov QWORD PTR[16+rdi],r12 + mov QWORD PTR[24+rdi],r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_fromx_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_fromx_mont_256:: +fromx_mont_256 ENDP + +PUBLIC redcx_mont_256 + + +ALIGN 32 +redcx_mont_256 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redcx_mont_256:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +redc_mont_256$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redcx_mont_256:: + + + mov rbx,rdx + call __mulx_by_1_mont_256 + + add r14,QWORD PTR[32+rsi] + adc r15,QWORD PTR[40+rsi] + mov rax,r14 + adc r10,QWORD PTR[48+rsi] + mov rdx,r15 + adc r11,QWORD PTR[56+rsi] + sbb rsi,rsi + + + + + mov r12,r10 + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r10,QWORD PTR[16+rbx] + mov r13,r11 + sbb r11,QWORD PTR[24+rbx] + sbb rsi,0 + + cmovnc rax,r14 + cmovnc rdx,r15 + cmovnc r12,r10 + mov QWORD PTR[rdi],rax + cmovnc r13,r11 + mov QWORD PTR[8+rdi],rdx + mov QWORD PTR[16+rdi],r12 + mov QWORD PTR[24+rdi],r13 + + mov r15,QWORD 
PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redcx_mont_256:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redcx_mont_256:: +redcx_mont_256 ENDP + +ALIGN 32 +__mulx_by_1_mont_256 PROC PRIVATE + DB 243,15,30,250 + + mov rax,QWORD PTR[rsi] + mov r11,QWORD PTR[8+rsi] + mov r12,QWORD PTR[16+rsi] + mov r13,QWORD PTR[24+rsi] + + mov r14,rax + imul rax,rcx + mov r10,rax + + mul QWORD PTR[rbx] + add r14,rax + mov rax,r10 + adc r14,rdx + + mul QWORD PTR[8+rbx] + add r11,rax + mov rax,r10 + adc rdx,0 + add r11,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[16+rbx] + mov r15,r11 + imul r11,rcx + add r12,rax + mov rax,r10 + adc rdx,0 + add r12,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[24+rbx] + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r14 + adc rdx,0 + mov r14,rdx + + mul QWORD PTR[rbx] + add r15,rax + mov rax,r11 + adc r15,rdx + + mul QWORD PTR[8+rbx] + add r12,rax + mov rax,r11 + adc rdx,0 + add r12,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[16+rbx] + mov r10,r12 + imul r12,rcx + add r13,rax + mov rax,r11 + adc rdx,0 + add r13,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[24+rbx] + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r15 + adc rdx,0 + mov r15,rdx + + mul QWORD PTR[rbx] + add r10,rax + mov rax,r12 + adc r10,rdx + + mul QWORD PTR[8+rbx] + add r13,rax + mov rax,r12 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[16+rbx] + mov r11,r13 + imul r13,rcx + add r14,rax + mov rax,r12 + adc rdx,0 + add r14,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[24+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r10 + adc rdx,0 + mov r10,rdx + + mul QWORD PTR[rbx] + add r11,rax + mov rax,r13 + adc r11,rdx + + mul QWORD PTR[8+rbx] + add r14,rax + mov rax,r13 + adc rdx,0 + add r14,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[16+rbx] + add r15,rax + mov rax,r13 + adc rdx,0 + add r15,r11 + adc rdx,0 + mov r11,rdx + + mul QWORD PTR[24+rbx] + add r10,rax + mov rax,r14 + adc rdx,0 + add r10,r11 + adc rdx,0 + mov r11,rdx + DB 0F3h,0C3h ;repret +__mulx_by_1_mont_256 ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mulx_mont_sparse_256 + DD imagerel $L$SEH_body_mulx_mont_sparse_256 + DD imagerel $L$SEH_info_mulx_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_mulx_mont_sparse_256 + DD imagerel $L$SEH_epilogue_mulx_mont_sparse_256 + DD imagerel $L$SEH_info_mulx_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_mulx_mont_sparse_256 + DD imagerel $L$SEH_end_mulx_mont_sparse_256 + DD imagerel $L$SEH_info_mulx_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_sparse_256 + DD imagerel $L$SEH_body_sqrx_mont_sparse_256 + DD imagerel $L$SEH_info_sqrx_mont_sparse_256_prologue + + DD imagerel $L$SEH_body_sqrx_mont_sparse_256 + DD imagerel $L$SEH_epilogue_sqrx_mont_sparse_256 + DD imagerel $L$SEH_info_sqrx_mont_sparse_256_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_sparse_256 + DD imagerel $L$SEH_end_sqrx_mont_sparse_256 + DD imagerel $L$SEH_info_sqrx_mont_sparse_256_epilogue + + DD imagerel $L$SEH_begin_fromx_mont_256 + DD imagerel $L$SEH_body_fromx_mont_256 + DD imagerel $L$SEH_info_fromx_mont_256_prologue + + DD imagerel $L$SEH_body_fromx_mont_256 + DD imagerel $L$SEH_epilogue_fromx_mont_256 + DD imagerel $L$SEH_info_fromx_mont_256_body + + DD imagerel $L$SEH_epilogue_fromx_mont_256 + 
DD imagerel $L$SEH_end_fromx_mont_256 + DD imagerel $L$SEH_info_fromx_mont_256_epilogue + + DD imagerel $L$SEH_begin_redcx_mont_256 + DD imagerel $L$SEH_body_redcx_mont_256 + DD imagerel $L$SEH_info_redcx_mont_256_prologue + + DD imagerel $L$SEH_body_redcx_mont_256 + DD imagerel $L$SEH_epilogue_redcx_mont_256 + DD imagerel $L$SEH_info_redcx_mont_256_body + + DD imagerel $L$SEH_epilogue_redcx_mont_256 + DD imagerel $L$SEH_end_redcx_mont_256 + DD imagerel $L$SEH_info_redcx_mont_256_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mulx_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_mulx_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_mulx_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_sparse_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sqrx_mont_sparse_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sqrx_mont_sparse_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_fromx_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_fromx_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_fromx_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redcx_mont_256_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_redcx_mont_256_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_redcx_mont_256_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm b/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm new file mode 100644 index 00000000000..4dc41b04098 --- /dev/null +++ b/crypto/blst_src/build/win64/mulx_mont_384-x86_64.asm @@ -0,0 +1,3644 @@ +OPTION DOTNAME +PUBLIC mul_mont_384x$1 +PUBLIC sqr_mont_384x$1 +PUBLIC mul_382x$1 +PUBLIC sqr_382x$1 +PUBLIC mul_384$1 +PUBLIC sqr_384$1 +PUBLIC redc_mont_384$1 +PUBLIC from_mont_384$1 +PUBLIC sgn0_pty_mont_384$1 +PUBLIC sgn0_pty_mont_384x$1 +PUBLIC mul_mont_384$1 +PUBLIC sqr_mont_384$1 +PUBLIC sqr_n_mul_mont_384$1 +PUBLIC sqr_n_mul_mont_383$1 +PUBLIC sqr_mont_382x$1 +.text$ SEGMENT ALIGN(256) 'CODE' + + + + + + + + +ALIGN 32 +__subx_mod_384x384 PROC PRIVATE + DB 243,15,30,250 + 
+ mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + mov r14,QWORD PTR[48+rsi] + + sub r8,QWORD PTR[rdx] + mov r15,QWORD PTR[56+rsi] + sbb r9,QWORD PTR[8+rdx] + mov rax,QWORD PTR[64+rsi] + sbb r10,QWORD PTR[16+rdx] + mov rbx,QWORD PTR[72+rsi] + sbb r11,QWORD PTR[24+rdx] + mov rbp,QWORD PTR[80+rsi] + sbb r12,QWORD PTR[32+rdx] + mov rsi,QWORD PTR[88+rsi] + sbb r13,QWORD PTR[40+rdx] + mov QWORD PTR[rdi],r8 + sbb r14,QWORD PTR[48+rdx] + mov r8,QWORD PTR[rcx] + mov QWORD PTR[8+rdi],r9 + sbb r15,QWORD PTR[56+rdx] + mov r9,QWORD PTR[8+rcx] + mov QWORD PTR[16+rdi],r10 + sbb rax,QWORD PTR[64+rdx] + mov r10,QWORD PTR[16+rcx] + mov QWORD PTR[24+rdi],r11 + sbb rbx,QWORD PTR[72+rdx] + mov r11,QWORD PTR[24+rcx] + mov QWORD PTR[32+rdi],r12 + sbb rbp,QWORD PTR[80+rdx] + mov r12,QWORD PTR[32+rcx] + mov QWORD PTR[40+rdi],r13 + sbb rsi,QWORD PTR[88+rdx] + mov r13,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r8,rdx + and r9,rdx + and r10,rdx + and r11,rdx + and r12,rdx + and r13,rdx + + add r14,r8 + adc r15,r9 + mov QWORD PTR[48+rdi],r14 + adc rax,r10 + mov QWORD PTR[56+rdi],r15 + adc rbx,r11 + mov QWORD PTR[64+rdi],rax + adc rbp,r12 + mov QWORD PTR[72+rdi],rbx + adc rsi,r13 + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rsi + + DB 0F3h,0C3h ;repret +__subx_mod_384x384 ENDP + + +ALIGN 32 +__addx_mod_384 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[rdx] + adc r9,QWORD PTR[8+rdx] + adc r10,QWORD PTR[16+rdx] + mov r14,r8 + adc r11,QWORD PTR[24+rdx] + mov r15,r9 + adc r12,QWORD PTR[32+rdx] + mov rax,r10 + adc r13,QWORD PTR[40+rdx] + mov rbx,r11 + sbb rdx,rdx + + sub r8,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rcx] + mov rbp,r12 + sbb r10,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rcx] + mov rsi,r13 + sbb r13,QWORD PTR[40+rcx] + sbb rdx,0 + + cmovc r8,r14 + cmovc r9,r15 + cmovc r10,rax + mov QWORD PTR[rdi],r8 + cmovc r11,rbx + mov QWORD PTR[8+rdi],r9 + cmovc r12,rbp + mov QWORD PTR[16+rdi],r10 + cmovc r13,rsi + mov QWORD PTR[24+rdi],r11 + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__addx_mod_384 ENDP + + +ALIGN 32 +__subx_mod_384 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + +__subx_mod_384_a_is_loaded:: + sub r8,QWORD PTR[rdx] + mov r14,QWORD PTR[rcx] + sbb r9,QWORD PTR[8+rdx] + mov r15,QWORD PTR[8+rcx] + sbb r10,QWORD PTR[16+rdx] + mov rax,QWORD PTR[16+rcx] + sbb r11,QWORD PTR[24+rdx] + mov rbx,QWORD PTR[24+rcx] + sbb r12,QWORD PTR[32+rdx] + mov rbp,QWORD PTR[32+rcx] + sbb r13,QWORD PTR[40+rdx] + mov rsi,QWORD PTR[40+rcx] + sbb rdx,rdx + + and r14,rdx + and r15,rdx + and rax,rdx + and rbx,rdx + and rbp,rdx + and rsi,rdx + + add r8,r14 + adc r9,r15 + mov QWORD PTR[rdi],r8 + adc r10,rax + mov QWORD PTR[8+rdi],r9 + adc r11,rbx + mov QWORD PTR[16+rdi],r10 + adc r12,rbp + mov QWORD PTR[24+rdi],r11 + adc r13,rsi + mov QWORD PTR[32+rdi],r12 + mov QWORD PTR[40+rdi],r13 + + DB 0F3h,0C3h ;repret +__subx_mod_384 ENDP +PUBLIC mulx_mont_384x + + +ALIGN 32 +mulx_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_mont_384x:: + + + mov rdi,rcx + mov 
rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] +mul_mont_384x$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,328 + +$L$SEH_body_mulx_mont_384x:: + + + mov rbx,rdx + mov QWORD PTR[32+rsp],rdi + mov QWORD PTR[24+rsp],rsi + mov QWORD PTR[16+rsp],rdx + mov QWORD PTR[8+rsp],rcx + mov QWORD PTR[rsp],r8 + + + + + lea rdi,QWORD PTR[40+rsp] + call __mulx_384 + + + lea rbx,QWORD PTR[48+rbx] + lea rsi,QWORD PTR[((128+48))+rsi] + lea rdi,QWORD PTR[96+rdi] + call __mulx_384 + + + mov rcx,QWORD PTR[8+rsp] + lea rsi,QWORD PTR[rbx] + lea rdx,QWORD PTR[((-48))+rbx] + lea rdi,QWORD PTR[((40+192+48))+rsp] + call __addx_mod_384 + + mov rsi,QWORD PTR[24+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((-48))+rdi] + call __addx_mod_384 + + lea rbx,QWORD PTR[rdi] + lea rsi,QWORD PTR[48+rdi] + call __mulx_384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[8+rsp] + call __subx_mod_384x384 + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __subx_mod_384x384 + + + lea rsi,QWORD PTR[40+rsp] + lea rdx,QWORD PTR[((40+96))+rsp] + lea rdi,QWORD PTR[40+rsp] + call __subx_mod_384x384 + + lea rbx,QWORD PTR[rcx] + + + lea rsi,QWORD PTR[40+rsp] + mov rcx,QWORD PTR[rsp] + mov rdi,QWORD PTR[32+rsp] + call __mulx_by_1_mont_384 + call __redx_tail_mont_384 + + + lea rsi,QWORD PTR[((40+192))+rsp] + mov rcx,QWORD PTR[rsp] + lea rdi,QWORD PTR[48+rdi] + call __mulx_by_1_mont_384 + call __redx_tail_mont_384 + + lea r8,QWORD PTR[328+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mulx_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_mont_384x:: +mulx_mont_384x ENDP +PUBLIC sqrx_mont_384x + + +ALIGN 32 +sqrx_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_384x:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +sqr_mont_384x$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqrx_mont_384x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rsi + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[32+rsp] + call __addx_mod_384 + + + mov rsi,QWORD PTR[24+rsp] + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[((32+48))+rsp] + call __subx_mod_384 + + + mov rsi,QWORD PTR[24+rsp] + lea rbx,QWORD PTR[48+rsi] + + mov rdx,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_384 + add rdx,rdx + adc r15,r15 + adc rax,rax + mov r8,rdx + adc r12,r12 + mov r9,r15 + adc rdi,rdi + mov r10,rax + adc rbp,rbp + mov r11,r12 + sbb rsi,rsi + + sub rdx,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov r13,rdi + sbb rax,QWORD PTR[16+rcx] + sbb r12,QWORD PTR[24+rcx] + sbb rdi,QWORD PTR[32+rcx] + mov r14,rbp + sbb rbp,QWORD PTR[40+rcx] + sbb rsi,0 + + cmovc rdx,r8 + cmovc r15,r9 + cmovc rax,r10 + mov QWORD PTR[48+rbx],rdx + cmovc r12,r11 + mov QWORD PTR[56+rbx],r15 + cmovc rdi,r13 + mov QWORD PTR[64+rbx],rax + cmovc rbp,r14 + mov QWORD PTR[72+rbx],r12 + mov QWORD 
PTR[80+rbx],rdi + mov QWORD PTR[88+rbx],rbp + + lea rsi,QWORD PTR[32+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rdx,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov rax,QWORD PTR[((32+16))+rsp] + mov r12,QWORD PTR[((32+24))+rsp] + mov rdi,QWORD PTR[((32+32))+rsp] + mov rbp,QWORD PTR[((32+40))+rsp] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqrx_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_384x:: +sqrx_mont_384x ENDP + +PUBLIC mulx_382x + + +ALIGN 32 +mulx_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_382x:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +mul_382x$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_mulx_382x:: + + + lea rdi,QWORD PTR[96+rdi] + mov QWORD PTR[rsp],rsi + mov QWORD PTR[8+rsp],rdx + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rcx + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + add r8,QWORD PTR[48+rsi] + adc r9,QWORD PTR[56+rsi] + adc r10,QWORD PTR[64+rsi] + adc r11,QWORD PTR[72+rsi] + adc r12,QWORD PTR[80+rsi] + adc r13,QWORD PTR[88+rsi] + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + mov r11,QWORD PTR[24+rdx] + mov r12,QWORD PTR[32+rdx] + mov r13,QWORD PTR[40+rdx] + + add r8,QWORD PTR[48+rdx] + adc r9,QWORD PTR[56+rdx] + adc r10,QWORD PTR[64+rdx] + adc r11,QWORD PTR[72+rdx] + adc r12,QWORD PTR[80+rdx] + adc r13,QWORD PTR[88+rdx] + + mov QWORD PTR[((32+48))+rsp],r8 + mov QWORD PTR[((32+56))+rsp],r9 + mov QWORD PTR[((32+64))+rsp],r10 + mov QWORD PTR[((32+72))+rsp],r11 + mov QWORD PTR[((32+80))+rsp],r12 + mov QWORD PTR[((32+88))+rsp],r13 + + + lea rsi,QWORD PTR[((32+0))+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + call __mulx_384 + + + mov rsi,QWORD PTR[rsp] + mov rbx,QWORD PTR[8+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __mulx_384 + + + lea rsi,QWORD PTR[((48+128))+rsi] + lea rbx,QWORD PTR[48+rbx] + lea rdi,QWORD PTR[32+rsp] + call __mulx_384 + + + mov rsi,QWORD PTR[16+rsp] + lea rdx,QWORD PTR[32+rsp] + mov rcx,QWORD PTR[24+rsp] + mov rdi,rsi + call __subx_mod_384x384 + + + lea rsi,QWORD PTR[rdi] + lea rdx,QWORD PTR[((-96))+rdi] + call __subx_mod_384x384 + + + lea rsi,QWORD PTR[((-96))+rdi] + lea rdx,QWORD PTR[32+rsp] + lea rdi,QWORD PTR[((-96))+rdi] + call __subx_mod_384x384 + + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_mulx_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_382x:: +mulx_382x ENDP +PUBLIC sqrx_382x + + 
+ALIGN 32 +sqrx_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_382x:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 +sqr_382x$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rsi + +$L$SEH_body_sqrx_382x:: + + + mov rcx,rdx + + + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,QWORD PTR[24+rsi] + mov rbp,QWORD PTR[32+rsi] + mov rdx,QWORD PTR[40+rsi] + + mov r8,r14 + add r14,QWORD PTR[48+rsi] + mov r9,r15 + adc r15,QWORD PTR[56+rsi] + mov r10,rax + adc rax,QWORD PTR[64+rsi] + mov r11,rbx + adc rbx,QWORD PTR[72+rsi] + mov r12,rbp + adc rbp,QWORD PTR[80+rsi] + mov r13,rdx + adc rdx,QWORD PTR[88+rsi] + + mov QWORD PTR[rdi],r14 + mov QWORD PTR[8+rdi],r15 + mov QWORD PTR[16+rdi],rax + mov QWORD PTR[24+rdi],rbx + mov QWORD PTR[32+rdi],rbp + mov QWORD PTR[40+rdi],rdx + + + lea rdx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[48+rdi] + call __subx_mod_384_a_is_loaded + + + lea rsi,QWORD PTR[rdi] + lea rbx,QWORD PTR[((-48))+rdi] + lea rdi,QWORD PTR[((-48))+rdi] + call __mulx_384 + + + mov rsi,QWORD PTR[rsp] + lea rbx,QWORD PTR[48+rsi] + lea rdi,QWORD PTR[96+rdi] + call __mulx_384 + + mov r8,QWORD PTR[rdi] + mov r9,QWORD PTR[8+rdi] + mov r10,QWORD PTR[16+rdi] + mov r11,QWORD PTR[24+rdi] + mov r12,QWORD PTR[32+rdi] + mov r13,QWORD PTR[40+rdi] + mov r14,QWORD PTR[48+rdi] + mov r15,QWORD PTR[56+rdi] + mov rax,QWORD PTR[64+rdi] + mov rbx,QWORD PTR[72+rdi] + mov rbp,QWORD PTR[80+rdi] + add r8,r8 + mov rdx,QWORD PTR[88+rdi] + adc r9,r9 + mov QWORD PTR[rdi],r8 + adc r10,r10 + mov QWORD PTR[8+rdi],r9 + adc r11,r11 + mov QWORD PTR[16+rdi],r10 + adc r12,r12 + mov QWORD PTR[24+rdi],r11 + adc r13,r13 + mov QWORD PTR[32+rdi],r12 + adc r14,r14 + mov QWORD PTR[40+rdi],r13 + adc r15,r15 + mov QWORD PTR[48+rdi],r14 + adc rax,rax + mov QWORD PTR[56+rdi],r15 + adc rbx,rbx + mov QWORD PTR[64+rdi],rax + adc rbp,rbp + mov QWORD PTR[72+rdi],rbx + adc rdx,rdx + mov QWORD PTR[80+rdi],rbp + mov QWORD PTR[88+rdi],rdx + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqrx_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_382x:: +sqrx_382x ENDP +PUBLIC mulx_384 + + +ALIGN 32 +mulx_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_384:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 +mul_384$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$SEH_body_mulx_384:: + + + mov rbx,rdx + call __mulx_384 + + mov r15,QWORD PTR[rsp] + + mov r14,QWORD PTR[8+rsp] + + mov r13,QWORD PTR[16+rsp] + + mov r12,QWORD PTR[24+rsp] + + mov rbx,QWORD PTR[32+rsp] + + mov rbp,QWORD PTR[40+rsp] + + lea rsp,QWORD PTR[48+rsp] + +$L$SEH_epilogue_mulx_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_384:: +mulx_384 ENDP + + +ALIGN 32 +__mulx_384 PROC PRIVATE + DB 243,15,30,250 + + mov rdx,QWORD PTR[rbx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + + mulx rcx,r9,r14 + xor rbp,rbp + + mulx rax,r8,r15 + adcx 
r8,rcx + mov QWORD PTR[rdi],r9 + + mulx rcx,r9,r10 + adcx r9,rax + + mulx rax,r10,r11 + adcx r10,rcx + + mulx rcx,r11,r12 + adcx r11,rax + + mulx r13,r12,r13 + mov rdx,QWORD PTR[8+rbx] + adcx r12,rcx + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[8+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[16+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[16+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[24+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[24+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[32+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[32+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,QWORD PTR[40+rbx] + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mulx rcx,rax,r14 + adcx rax,r8 + adox r9,rcx + mov QWORD PTR[40+rdi],rax + + mulx rcx,r8,r15 + adcx r8,r9 + adox r10,rcx + + mulx rax,r9,QWORD PTR[((128+16))+rsi] + adcx r9,r10 + adox r11,rax + + mulx rcx,r10,QWORD PTR[((128+24))+rsi] + adcx r10,r11 + adox r12,rcx + + mulx rax,r11,QWORD PTR[((128+32))+rsi] + adcx r11,r12 + adox rax,r13 + + mulx r13,r12,QWORD PTR[((128+40))+rsi] + mov rdx,rax + adcx r12,rax + adox r13,rbp + adcx r13,rbp + mov QWORD PTR[48+rdi],r8 + mov QWORD PTR[56+rdi],r9 + mov QWORD PTR[64+rdi],r10 + mov QWORD PTR[72+rdi],r11 + mov QWORD PTR[80+rdi],r12 + mov QWORD PTR[88+rdi],r13 + + DB 0F3h,0C3h ;repret +__mulx_384 ENDP +PUBLIC sqrx_384 + + +ALIGN 32 +sqrx_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_384:: + + + mov rdi,rcx + mov rsi,rdx +sqr_384$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + push rdi + +$L$SEH_body_sqrx_384:: + + + call __sqrx_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sqrx_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_384:: +sqrx_384 ENDP + +ALIGN 32 +__sqrx_384 PROC PRIVATE + DB 243,15,30,250 + + mov rdx,QWORD PTR[rsi] + mov r14,QWORD PTR[8+rsi] + mov r15,QWORD 
PTR[16+rsi] + mov rcx,QWORD PTR[24+rsi] + mov rbx,QWORD PTR[32+rsi] + + + mulx rdi,r8,r14 + mov rbp,QWORD PTR[40+rsi] + mulx rax,r9,r15 + add r9,rdi + mulx rdi,r10,rcx + adc r10,rax + mulx rax,r11,rbx + adc r11,rdi + mulx r13,r12,rbp + mov rdx,r14 + adc r12,rax + adc r13,0 + + + xor r14,r14 + mulx rax,rdi,r15 + adcx r10,rdi + adox r11,rax + + mulx rax,rdi,rcx + adcx r11,rdi + adox r12,rax + + mulx rax,rdi,rbx + adcx r12,rdi + adox r13,rax + + mulx rax,rdi,rbp + mov rdx,r15 + adcx r13,rdi + adox rax,r14 + adcx r14,rax + + + xor r15,r15 + mulx rax,rdi,rcx + adcx r12,rdi + adox r13,rax + + mulx rax,rdi,rbx + adcx r13,rdi + adox r14,rax + + mulx rax,rdi,rbp + mov rdx,rcx + adcx r14,rdi + adox rax,r15 + adcx r15,rax + + + xor rcx,rcx + mulx rax,rdi,rbx + adcx r14,rdi + adox r15,rax + + mulx rax,rdi,rbp + mov rdx,rbx + adcx r15,rdi + adox rax,rcx + adcx rcx,rax + + + mulx rbx,rdi,rbp + mov rdx,QWORD PTR[rsi] + add rcx,rdi + mov rdi,QWORD PTR[8+rsp] + adc rbx,0 + + + xor rbp,rbp + adcx r8,r8 + adcx r9,r9 + adcx r10,r10 + adcx r11,r11 + adcx r12,r12 + + + mulx rax,rdx,rdx + mov QWORD PTR[rdi],rdx + mov rdx,QWORD PTR[8+rsi] + adox r8,rax + mov QWORD PTR[8+rdi],r8 + + mulx rax,r8,rdx + mov rdx,QWORD PTR[16+rsi] + adox r9,r8 + adox r10,rax + mov QWORD PTR[16+rdi],r9 + mov QWORD PTR[24+rdi],r10 + + mulx r9,r8,rdx + mov rdx,QWORD PTR[24+rsi] + adox r11,r8 + adox r12,r9 + adcx r13,r13 + adcx r14,r14 + mov QWORD PTR[32+rdi],r11 + mov QWORD PTR[40+rdi],r12 + + mulx r9,r8,rdx + mov rdx,QWORD PTR[32+rsi] + adox r13,r8 + adox r14,r9 + adcx r15,r15 + adcx rcx,rcx + mov QWORD PTR[48+rdi],r13 + mov QWORD PTR[56+rdi],r14 + + mulx r9,r8,rdx + mov rdx,QWORD PTR[40+rsi] + adox r15,r8 + adox rcx,r9 + adcx rbx,rbx + adcx rbp,rbp + mov QWORD PTR[64+rdi],r15 + mov QWORD PTR[72+rdi],rcx + + mulx r9,r8,rdx + adox rbx,r8 + adox rbp,r9 + + mov QWORD PTR[80+rdi],rbx + mov QWORD PTR[88+rdi],rbp + + DB 0F3h,0C3h ;repret +__sqrx_384 ENDP + + + +PUBLIC redcx_mont_384 + + +ALIGN 32 +redcx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_redcx_mont_384:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +redc_mont_384$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_redcx_mont_384:: + + + mov rbx,rdx + call __mulx_by_1_mont_384 + call __redx_tail_mont_384 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_redcx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_redcx_mont_384:: +redcx_mont_384 ENDP + + + + +PUBLIC fromx_mont_384 + + +ALIGN 32 +fromx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_fromx_mont_384:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +from_mont_384$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_fromx_mont_384:: + + + mov rbx,rdx + call __mulx_by_1_mont_384 + + + + + mov rax,r14 + mov rcx,r15 + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + + cmovc r14,rax + cmovc r15,rcx + cmovc 
r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_fromx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_fromx_mont_384:: +fromx_mont_384 ENDP + +ALIGN 32 +__mulx_by_1_mont_384 PROC PRIVATE + DB 243,15,30,250 + + mov r8,QWORD PTR[rsi] + mov rdx,rcx + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + imul rdx,r8 + + + xor r14,r14 + mulx rbp,rax,QWORD PTR[rbx] + adcx r8,rax + adox r9,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r9,rax + adox r10,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r10,rax + adox r11,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r13,rax + adox rbp,r14 + adcx r14,rbp + imul rdx,r9 + + + xor r15,r15 + mulx rbp,rax,QWORD PTR[rbx] + adcx r9,rax + adox r10,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r10,rax + adox r11,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r14,rax + adox rbp,r15 + adcx r15,rbp + imul rdx,r10 + + + xor r8,r8 + mulx rbp,rax,QWORD PTR[rbx] + adcx r10,rax + adox r11,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r15,rax + adox rbp,r8 + adcx r8,rbp + imul rdx,r11 + + + xor r9,r9 + mulx rbp,rax,QWORD PTR[rbx] + adcx r11,rax + adox r12,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r15,rax + adox r8,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r8,rax + adox rbp,r9 + adcx r9,rbp + imul rdx,r12 + + + xor r10,r10 + mulx rbp,rax,QWORD PTR[rbx] + adcx r12,rax + adox r13,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r15,rax + adox r8,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r8,rax + adox r9,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r9,rax + adox rbp,r10 + adcx r10,rbp + imul rdx,r13 + + + xor r11,r11 + mulx rbp,rax,QWORD PTR[rbx] + adcx r13,rax + adox r14,rbp + + mulx rbp,rax,QWORD PTR[8+rbx] + adcx r14,rax + adox r15,rbp + + mulx rbp,rax,QWORD PTR[16+rbx] + adcx r15,rax + adox r8,rbp + + mulx rbp,rax,QWORD PTR[24+rbx] + adcx r8,rax + adox r9,rbp + + mulx rbp,rax,QWORD PTR[32+rbx] + adcx r9,rax + adox r10,rbp + + mulx rbp,rax,QWORD PTR[40+rbx] + mov rdx,rcx + adcx r10,rax + adox rbp,r11 + adcx r11,rbp + DB 0F3h,0C3h ;repret +__mulx_by_1_mont_384 ENDP 
+ + +ALIGN 32 +__redx_tail_mont_384 PROC PRIVATE + DB 243,15,30,250 + + add r14,QWORD PTR[48+rsi] + mov rax,r14 + adc r15,QWORD PTR[56+rsi] + adc r8,QWORD PTR[64+rsi] + adc r9,QWORD PTR[72+rsi] + mov rcx,r15 + adc r10,QWORD PTR[80+rsi] + adc r11,QWORD PTR[88+rsi] + sbb r12,r12 + + + + + mov rdx,r8 + mov rbp,r9 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + mov r13,r10 + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + mov rsi,r11 + sbb r11,QWORD PTR[40+rbx] + sbb r12,0 + + cmovc r14,rax + cmovc r15,rcx + cmovc r8,rdx + mov QWORD PTR[rdi],r14 + cmovc r9,rbp + mov QWORD PTR[8+rdi],r15 + cmovc r10,r13 + mov QWORD PTR[16+rdi],r8 + cmovc r11,rsi + mov QWORD PTR[24+rdi],r9 + mov QWORD PTR[32+rdi],r10 + mov QWORD PTR[40+rdi],r11 + + DB 0F3h,0C3h ;repret +__redx_tail_mont_384 ENDP + +PUBLIC sgn0x_pty_mont_384 + + +ALIGN 32 +sgn0x_pty_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0x_pty_mont_384:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 +sgn0_pty_mont_384$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0x_pty_mont_384:: + + + mov rbx,rsi + lea rsi,QWORD PTR[rdi] + mov rcx,rdx + call __mulx_by_1_mont_384 + + xor rax,rax + mov r13,r14 + add r14,r14 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r14,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + not rax + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0x_pty_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0x_pty_mont_384:: +sgn0x_pty_mont_384 ENDP + +PUBLIC sgn0x_pty_mont_384x + + +ALIGN 32 +sgn0x_pty_mont_384x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sgn0x_pty_mont_384x:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 +sgn0_pty_mont_384x$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,8 + +$L$SEH_body_sgn0x_pty_mont_384x:: + + + mov rbx,rsi + lea rsi,QWORD PTR[48+rdi] + mov rcx,rdx + call __mulx_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + lea rsi,QWORD PTR[rdi] + xor rdi,rdi + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rdi,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rdi,0 + + mov QWORD PTR[rsp],r14 + not rdi + and r13,1 + and rdi,2 + or rdi,r13 + + call __mulx_by_1_mont_384 + + mov r12,r14 + or r14,r15 + or r14,r8 + or r14,r9 + or r14,r10 + or r14,r11 + + xor rax,rax + mov r13,r12 + add r12,r12 + adc r15,r15 + adc r8,r8 + adc r9,r9 + adc r10,r10 + adc r11,r11 + adc rax,0 + + sub r12,QWORD PTR[rbx] + sbb r15,QWORD PTR[8+rbx] + sbb r8,QWORD PTR[16+rbx] + sbb r9,QWORD PTR[24+rbx] + sbb r10,QWORD PTR[32+rbx] + sbb r11,QWORD PTR[40+rbx] + sbb rax,0 + + mov r12,QWORD PTR[rsp] + + not rax + + test r14,r14 + cmovz r13,rdi + + test r12,r12 + cmovnz 
rax,rdi + + and r13,1 + and rax,2 + or rax,r13 + + mov r15,QWORD PTR[8+rsp] + + mov r14,QWORD PTR[16+rsp] + + mov r13,QWORD PTR[24+rsp] + + mov r12,QWORD PTR[32+rsp] + + mov rbx,QWORD PTR[40+rsp] + + mov rbp,QWORD PTR[48+rsp] + + lea rsp,QWORD PTR[56+rsp] + +$L$SEH_epilogue_sgn0x_pty_mont_384x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sgn0x_pty_mont_384x:: +sgn0x_pty_mont_384x ENDP +PUBLIC mulx_mont_384 + + +ALIGN 32 +mulx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_mulx_mont_384:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] +mul_mont_384$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-24))+rsp] + +$L$SEH_body_mulx_mont_384:: + + + mov rbx,rdx + mov rdx,QWORD PTR[rdx] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov QWORD PTR[16+rsp],rdi + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + mov QWORD PTR[rsp],r8 + + mulx r9,r8,r14 + call __mulx_mont_384 + + mov r15,QWORD PTR[24+rsp] + + mov r14,QWORD PTR[32+rsp] + + mov r13,QWORD PTR[40+rsp] + + mov r12,QWORD PTR[48+rsp] + + mov rbx,QWORD PTR[56+rsp] + + mov rbp,QWORD PTR[64+rsp] + + lea rsp,QWORD PTR[72+rsp] + +$L$SEH_epilogue_mulx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_mulx_mont_384:: +mulx_mont_384 ENDP + +ALIGN 32 +__mulx_mont_384 PROC PRIVATE + DB 243,15,30,250 + + + mulx r10,r14,r15 + mulx r11,r15,rax + add r9,r14 + mulx r12,rax,r12 + adc r10,r15 + mulx r13,rdi,rdi + adc r11,rax + mulx r14,rbp,rbp + mov rdx,QWORD PTR[8+rbx] + adc r12,rdi + adc r13,rbp + adc r14,0 + xor r15,r15 + + mov QWORD PTR[16+rsp],r8 + imul r8,QWORD PTR[8+rsp] + + + xor rax,rax + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r9,rdi + adcx r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r8 + adox r14,rdi + adcx r15,rbp + adox r15,rax + adox rax,rax + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r9,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r9,rdi + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[16+rbx] + adcx r13,rdi + adox r14,rbp + adcx r14,r8 + adox r15,r8 + adcx r15,r8 + adox rax,r8 + adcx rax,r8 + mov QWORD PTR[16+rsp],r9 + imul r9,QWORD PTR[8+rsp] + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r9 + adox r15,rdi + 
adcx rax,rbp + adox rax,r8 + adox r8,r8 + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[24+rbx] + adcx r14,rdi + adox r15,rbp + adcx r15,r9 + adox rax,r9 + adcx rax,r9 + adox r8,r9 + adcx r8,r9 + mov QWORD PTR[16+rsp],r10 + imul r10,QWORD PTR[8+rsp] + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r10 + adox rax,rdi + adcx r8,rbp + adox r8,r9 + adox r9,r9 + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[32+rbx] + adcx r15,rdi + adox rax,rbp + adcx rax,r10 + adox r8,r10 + adcx r8,r10 + adox r9,r10 + adcx r9,r10 + mov QWORD PTR[16+rsp],r11 + imul r11,QWORD PTR[8+rsp] + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r11 + adox r8,rdi + adcx r9,rbp + adox r9,r10 + adox r10,r10 + + + xor r11,r11 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[40+rbx] + adcx rax,rdi + adox r8,rbp + adcx r8,r11 + adox r9,r11 + adcx r9,r11 + adox r10,r11 + adcx r10,r11 + mov QWORD PTR[16+rsp],r12 + imul r12,QWORD PTR[8+rsp] + + + xor r11,r11 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r8,rdi + adcx r9,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r12 + adox r9,rdi + adcx r10,rbp + adox r10,r11 + adox r11,r11 + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rdi,QWORD PTR[16+rsp] + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx 
rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx rax,rdi + adox r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,r13 + adcx r8,rdi + adox r9,rbp + adcx r9,r12 + adox r10,r12 + adcx r10,r12 + adox r11,r12 + adcx r11,r12 + imul rdx,QWORD PTR[8+rsp] + mov rbx,QWORD PTR[24+rsp] + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx rax,rdi + adox r8,rbp + mov r13,r15 + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r8,rdi + adox r9,rbp + mov rsi,rax + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + adcx r9,rdi + adox r10,rbp + mov rdx,r14 + adcx r10,r12 + adox r11,r12 + lea rcx,QWORD PTR[128+rcx] + mov r12,r8 + adc r11,0 + + + + + sub r14,QWORD PTR[rcx] + sbb r15,QWORD PTR[8+rcx] + mov rdi,r9 + sbb rax,QWORD PTR[16+rcx] + sbb r8,QWORD PTR[24+rcx] + sbb r9,QWORD PTR[32+rcx] + mov rbp,r10 + sbb r10,QWORD PTR[40+rcx] + sbb r11,0 + + cmovnc rdx,r14 + cmovc r15,r13 + cmovc rax,rsi + cmovnc r12,r8 + mov QWORD PTR[rbx],rdx + cmovnc rdi,r9 + mov QWORD PTR[8+rbx],r15 + cmovnc rbp,r10 + mov QWORD PTR[16+rbx],rax + mov QWORD PTR[24+rbx],r12 + mov QWORD PTR[32+rbx],rdi + mov QWORD PTR[40+rbx],rbp + + DB 0F3h,0C3h ;repret + +__mulx_mont_384 ENDP +PUBLIC sqrx_mont_384 + + +ALIGN 32 +sqrx_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_384:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +sqr_mont_384$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-24))+rsp] + +$L$SEH_body_sqrx_mont_384:: + + + mov r8,rcx + lea rcx,QWORD PTR[((-128))+rdx] + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov QWORD PTR[16+rsp],rdi + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + + lea rbx,QWORD PTR[rsi] + mov QWORD PTR[rsp],r8 + lea rsi,QWORD PTR[((-128))+rsi] + + mulx r9,r8,rdx + call __mulx_mont_384 + + mov r15,QWORD PTR[24+rsp] + + mov r14,QWORD PTR[32+rsp] + + mov r13,QWORD PTR[40+rsp] + + mov r12,QWORD PTR[48+rsp] + + mov rbx,QWORD PTR[56+rsp] + + mov rbp,QWORD PTR[64+rsp] + + lea rsp,QWORD PTR[72+rsp] + +$L$SEH_epilogue_sqrx_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_384:: +sqrx_mont_384 ENDP + +PUBLIC sqrx_n_mul_mont_384 + + +ALIGN 32 +sqrx_n_mul_mont_384 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_n_mul_mont_384:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] +sqr_n_mul_mont_384$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-40))+rsp] + +$L$SEH_body_sqrx_n_mul_mont_384:: + + + mov r10,rdx + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,rsi + mov r12,QWORD PTR[24+rsi] + mov QWORD PTR[16+rsp],rdi + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + +$L$oop_sqrx_384:: + movd xmm1,r10d + lea rsi,QWORD 
PTR[((-128))+rbx] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,rdx + call __mulx_mont_384 + + movd r10d,xmm1 + dec r10d + jnz $L$oop_sqrx_384 + + mov r14,rdx +DB 102,72,15,126,210 + lea rsi,QWORD PTR[((-128))+rbx] + mov rbx,QWORD PTR[24+rsp] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_384 + + mov r15,QWORD PTR[40+rsp] + + mov r14,QWORD PTR[48+rsp] + + mov r13,QWORD PTR[56+rsp] + + mov r12,QWORD PTR[64+rsp] + + mov rbx,QWORD PTR[72+rsp] + + mov rbp,QWORD PTR[80+rsp] + + lea rsp,QWORD PTR[88+rsp] + +$L$SEH_epilogue_sqrx_n_mul_mont_384:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_n_mul_mont_384:: +sqrx_n_mul_mont_384 ENDP + +PUBLIC sqrx_n_mul_mont_383 + + +ALIGN 32 +sqrx_n_mul_mont_383 PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_n_mul_mont_383:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD PTR[40+rsp] + mov r9,QWORD PTR[48+rsp] +sqr_n_mul_mont_383$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,QWORD PTR[((-40))+rsp] + +$L$SEH_body_sqrx_n_mul_mont_383:: + + + mov r10,rdx + mov rdx,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov rbx,rsi + mov r12,QWORD PTR[24+rsi] + mov QWORD PTR[16+rsp],rdi + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + + mov QWORD PTR[rsp],r8 + mov QWORD PTR[24+rsp],r9 + movq xmm2,QWORD PTR[r9] + lea rcx,QWORD PTR[((-128))+rcx] + +$L$oop_sqrx_383:: + movd xmm1,r10d + lea rsi,QWORD PTR[((-128))+rbx] + + mulx r9,r8,rdx + call __mulx_mont_383_nonred + + movd r10d,xmm1 + dec r10d + jnz $L$oop_sqrx_383 + + mov r14,rdx +DB 102,72,15,126,210 + lea rsi,QWORD PTR[((-128))+rbx] + mov rbx,QWORD PTR[24+rsp] + + mulx r9,r8,r14 + call __mulx_mont_384 + + mov r15,QWORD PTR[40+rsp] + + mov r14,QWORD PTR[48+rsp] + + mov r13,QWORD PTR[56+rsp] + + mov r12,QWORD PTR[64+rsp] + + mov rbx,QWORD PTR[72+rsp] + + mov rbp,QWORD PTR[80+rsp] + + lea rsp,QWORD PTR[88+rsp] + +$L$SEH_epilogue_sqrx_n_mul_mont_383:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_n_mul_mont_383:: +sqrx_n_mul_mont_383 ENDP + +ALIGN 32 +__mulx_mont_383_nonred PROC PRIVATE + DB 243,15,30,250 + + + mulx r10,r14,r15 + mulx r11,r15,rax + add r9,r14 + mulx r12,rax,r12 + adc r10,r15 + mulx r13,rdi,rdi + adc r11,rax + mulx r14,rbp,rbp + mov rdx,QWORD PTR[8+rbx] + adc r12,rdi + adc r13,rbp + adc r14,0 + mov rax,r8 + imul r8,QWORD PTR[8+rsp] + + + xor r15,r15 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r9,rdi + adcx r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r8 + adox r14,rdi + adcx rbp,r15 + adox r15,rbp + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx rax,rdi + adox r9,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r9,rdi + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[16+rbx] + adcx r13,rdi + adox 
r14,rbp + adcx r14,rax + adox r15,rax + adcx r15,rax + mov r8,r9 + imul r9,QWORD PTR[8+rsp] + + + xor rax,rax + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r10,rdi + adcx r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r9 + adox r15,rdi + adcx rbp,rax + adox rax,rbp + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r8,rdi + adox r10,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r10,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[24+rbx] + adcx r14,rdi + adox r15,rbp + adcx r15,r8 + adox rax,r8 + adcx rax,r8 + mov r9,r10 + imul r10,QWORD PTR[8+rsp] + + + xor r8,r8 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r11,rdi + adcx r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r10 + adox rax,rdi + adcx rbp,r8 + adox r8,rbp + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r9,rdi + adox r11,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r11,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[32+rbx] + adcx r15,rdi + adox rax,rbp + adcx rax,r9 + adox r8,r9 + adcx r8,r9 + mov r10,r11 + imul r11,QWORD PTR[8+rsp] + + + xor r9,r9 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r12,rdi + adcx r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r11 + adox r8,rdi + adcx rbp,r9 + adox r9,rbp + + + xor r11,r11 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r10,rdi + adox r12,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r12,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,QWORD PTR[40+rbx] + adcx rax,rdi + adox r8,rbp + adcx r8,r10 + adox r9,r10 + adcx r9,r10 + mov r11,r12 + imul r12,QWORD PTR[8+rsp] + + + xor r10,r10 + mulx rbp,rdi,QWORD PTR[((0+128))+rsi] + adox r13,rdi + adcx r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rsi] + adox r14,rdi + adcx r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rsi] + adox r15,rdi + adcx rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rsi] + adox rax,rdi + adcx r8,rbp + + mulx rbp,rdi,QWORD 
PTR[((32+128))+rsi] + adox r8,rdi + adcx r9,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rsi] + mov rdx,r12 + adox r9,rdi + adcx rbp,r10 + adox r10,rbp + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r11,rdi + adox r13,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx rax,rdi + adox r8,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,r13 + adcx r8,rdi + adox r9,rbp + adcx r9,r11 + adox r10,r11 + adcx r10,r11 + imul rdx,QWORD PTR[8+rsp] + mov rbx,QWORD PTR[24+rsp] + + + xor r12,r12 + mulx rbp,rdi,QWORD PTR[((0+128))+rcx] + adcx r13,rdi + adox r14,rbp + + mulx rbp,rdi,QWORD PTR[((8+128))+rcx] + adcx r14,rdi + adox r15,rbp + + mulx rbp,rdi,QWORD PTR[((16+128))+rcx] + adcx r15,rdi + adox rax,rbp + + mulx rbp,rdi,QWORD PTR[((24+128))+rcx] + adcx rax,rdi + adox r8,rbp + + mulx rbp,rdi,QWORD PTR[((32+128))+rcx] + adcx r8,rdi + adox r9,rbp + + mulx rbp,rdi,QWORD PTR[((40+128))+rcx] + mov rdx,r14 + adcx r9,rdi + adox r10,rbp + adc r10,0 + mov r12,r8 + + mov QWORD PTR[rbx],r14 + mov QWORD PTR[8+rbx],r15 + mov QWORD PTR[16+rbx],rax + mov rdi,r9 + mov QWORD PTR[24+rbx],r8 + mov QWORD PTR[32+rbx],r9 + mov QWORD PTR[40+rbx],r10 + mov rbp,r10 + + DB 0F3h,0C3h ;repret + +__mulx_mont_383_nonred ENDP +PUBLIC sqrx_mont_382x + + +ALIGN 32 +sqrx_mont_382x PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_sqrx_mont_382x:: + + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 +sqr_mont_382x$1:: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,136 + +$L$SEH_body_sqrx_mont_382x:: + + + mov QWORD PTR[rsp],rcx + mov rcx,rdx + mov QWORD PTR[16+rsp],rdi + mov QWORD PTR[24+rsp],rsi + + + mov r8,QWORD PTR[rsi] + mov r9,QWORD PTR[8+rsi] + mov r10,QWORD PTR[16+rsi] + mov r11,QWORD PTR[24+rsi] + mov r12,QWORD PTR[32+rsi] + mov r13,QWORD PTR[40+rsi] + + mov r14,r8 + add r8,QWORD PTR[48+rsi] + mov r15,r9 + adc r9,QWORD PTR[56+rsi] + mov rax,r10 + adc r10,QWORD PTR[64+rsi] + mov rdx,r11 + adc r11,QWORD PTR[72+rsi] + mov rbx,r12 + adc r12,QWORD PTR[80+rsi] + mov rbp,r13 + adc r13,QWORD PTR[88+rsi] + + sub r14,QWORD PTR[48+rsi] + sbb r15,QWORD PTR[56+rsi] + sbb rax,QWORD PTR[64+rsi] + sbb rdx,QWORD PTR[72+rsi] + sbb rbx,QWORD PTR[80+rsi] + sbb rbp,QWORD PTR[88+rsi] + sbb rdi,rdi + + mov QWORD PTR[((32+0))+rsp],r8 + mov QWORD PTR[((32+8))+rsp],r9 + mov QWORD PTR[((32+16))+rsp],r10 + mov QWORD PTR[((32+24))+rsp],r11 + mov QWORD PTR[((32+32))+rsp],r12 + mov QWORD PTR[((32+40))+rsp],r13 + + mov QWORD PTR[((32+48))+rsp],r14 + mov QWORD PTR[((32+56))+rsp],r15 + mov QWORD PTR[((32+64))+rsp],rax + mov QWORD PTR[((32+72))+rsp],rdx + mov QWORD PTR[((32+80))+rsp],rbx + mov QWORD PTR[((32+88))+rsp],rbp + mov QWORD PTR[((32+96))+rsp],rdi + + + + lea rbx,QWORD PTR[48+rsi] + + mov rdx,QWORD PTR[48+rsi] + mov r14,QWORD PTR[rsi] + mov r15,QWORD PTR[8+rsi] + mov rax,QWORD PTR[16+rsi] + mov r12,QWORD PTR[24+rsi] + mov rdi,QWORD PTR[32+rsi] + mov rbp,QWORD PTR[40+rsi] + lea rsi,QWORD PTR[((-128))+rsi] + lea rcx,QWORD PTR[((-128))+rcx] + + mulx r9,r8,r14 + call __mulx_mont_383_nonred + add rdx,rdx + adc r15,r15 + adc rax,rax + adc r12,r12 + adc rdi,rdi + adc rbp,rbp + + mov QWORD PTR[48+rbx],rdx + mov QWORD PTR[56+rbx],r15 + mov QWORD PTR[64+rbx],rax + mov QWORD PTR[72+rbx],r12 + mov 
QWORD PTR[80+rbx],rdi + mov QWORD PTR[88+rbx],rbp + + lea rsi,QWORD PTR[((32-128))+rsp] + lea rbx,QWORD PTR[((32+48))+rsp] + + mov rdx,QWORD PTR[((32+48))+rsp] + mov r14,QWORD PTR[((32+0))+rsp] + mov r15,QWORD PTR[((32+8))+rsp] + mov rax,QWORD PTR[((32+16))+rsp] + mov r12,QWORD PTR[((32+24))+rsp] + mov rdi,QWORD PTR[((32+32))+rsp] + mov rbp,QWORD PTR[((32+40))+rsp] + + + + mulx r9,r8,r14 + call __mulx_mont_383_nonred + mov r14,QWORD PTR[((32+96))+rsp] + lea rcx,QWORD PTR[128+rcx] + mov r8,QWORD PTR[((32+0))+rsp] + and r8,r14 + mov r9,QWORD PTR[((32+8))+rsp] + and r9,r14 + mov r10,QWORD PTR[((32+16))+rsp] + and r10,r14 + mov r11,QWORD PTR[((32+24))+rsp] + and r11,r14 + mov r13,QWORD PTR[((32+32))+rsp] + and r13,r14 + and r14,QWORD PTR[((32+40))+rsp] + + sub rdx,r8 + mov r8,QWORD PTR[rcx] + sbb r15,r9 + mov r9,QWORD PTR[8+rcx] + sbb rax,r10 + mov r10,QWORD PTR[16+rcx] + sbb r12,r11 + mov r11,QWORD PTR[24+rcx] + sbb rdi,r13 + mov r13,QWORD PTR[32+rcx] + sbb rbp,r14 + sbb r14,r14 + + and r8,r14 + and r9,r14 + and r10,r14 + and r11,r14 + and r13,r14 + and r14,QWORD PTR[40+rcx] + + add rdx,r8 + adc r15,r9 + adc rax,r10 + adc r12,r11 + adc rdi,r13 + adc rbp,r14 + + mov QWORD PTR[rbx],rdx + mov QWORD PTR[8+rbx],r15 + mov QWORD PTR[16+rbx],rax + mov QWORD PTR[24+rbx],r12 + mov QWORD PTR[32+rbx],rdi + mov QWORD PTR[40+rbx],rbp + lea r8,QWORD PTR[136+rsp] + mov r15,QWORD PTR[r8] + + mov r14,QWORD PTR[8+r8] + + mov r13,QWORD PTR[16+r8] + + mov r12,QWORD PTR[24+r8] + + mov rbx,QWORD PTR[32+r8] + + mov rbp,QWORD PTR[40+r8] + + lea rsp,QWORD PTR[48+r8] + +$L$SEH_epilogue_sqrx_mont_382x:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_sqrx_mont_382x:: +sqrx_mont_382x ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_mulx_mont_384x + DD imagerel $L$SEH_body_mulx_mont_384x + DD imagerel $L$SEH_info_mulx_mont_384x_prologue + + DD imagerel $L$SEH_body_mulx_mont_384x + DD imagerel $L$SEH_epilogue_mulx_mont_384x + DD imagerel $L$SEH_info_mulx_mont_384x_body + + DD imagerel $L$SEH_epilogue_mulx_mont_384x + DD imagerel $L$SEH_end_mulx_mont_384x + DD imagerel $L$SEH_info_mulx_mont_384x_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_384x + DD imagerel $L$SEH_body_sqrx_mont_384x + DD imagerel $L$SEH_info_sqrx_mont_384x_prologue + + DD imagerel $L$SEH_body_sqrx_mont_384x + DD imagerel $L$SEH_epilogue_sqrx_mont_384x + DD imagerel $L$SEH_info_sqrx_mont_384x_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_384x + DD imagerel $L$SEH_end_sqrx_mont_384x + DD imagerel $L$SEH_info_sqrx_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mulx_382x + DD imagerel $L$SEH_body_mulx_382x + DD imagerel $L$SEH_info_mulx_382x_prologue + + DD imagerel $L$SEH_body_mulx_382x + DD imagerel $L$SEH_epilogue_mulx_382x + DD imagerel $L$SEH_info_mulx_382x_body + + DD imagerel $L$SEH_epilogue_mulx_382x + DD imagerel $L$SEH_end_mulx_382x + DD imagerel $L$SEH_info_mulx_382x_epilogue + + DD imagerel $L$SEH_begin_sqrx_382x + DD imagerel $L$SEH_body_sqrx_382x + DD imagerel $L$SEH_info_sqrx_382x_prologue + + DD imagerel $L$SEH_body_sqrx_382x + DD imagerel $L$SEH_epilogue_sqrx_382x + DD imagerel $L$SEH_info_sqrx_382x_body + + DD imagerel $L$SEH_epilogue_sqrx_382x + DD imagerel $L$SEH_end_sqrx_382x + DD imagerel $L$SEH_info_sqrx_382x_epilogue + + DD imagerel $L$SEH_begin_mulx_384 + DD imagerel $L$SEH_body_mulx_384 + DD imagerel $L$SEH_info_mulx_384_prologue + + DD imagerel $L$SEH_body_mulx_384 + DD imagerel $L$SEH_epilogue_mulx_384 + DD imagerel 
$L$SEH_info_mulx_384_body + + DD imagerel $L$SEH_epilogue_mulx_384 + DD imagerel $L$SEH_end_mulx_384 + DD imagerel $L$SEH_info_mulx_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_384 + DD imagerel $L$SEH_body_sqrx_384 + DD imagerel $L$SEH_info_sqrx_384_prologue + + DD imagerel $L$SEH_body_sqrx_384 + DD imagerel $L$SEH_epilogue_sqrx_384 + DD imagerel $L$SEH_info_sqrx_384_body + + DD imagerel $L$SEH_epilogue_sqrx_384 + DD imagerel $L$SEH_end_sqrx_384 + DD imagerel $L$SEH_info_sqrx_384_epilogue + + DD imagerel $L$SEH_begin_redcx_mont_384 + DD imagerel $L$SEH_body_redcx_mont_384 + DD imagerel $L$SEH_info_redcx_mont_384_prologue + + DD imagerel $L$SEH_body_redcx_mont_384 + DD imagerel $L$SEH_epilogue_redcx_mont_384 + DD imagerel $L$SEH_info_redcx_mont_384_body + + DD imagerel $L$SEH_epilogue_redcx_mont_384 + DD imagerel $L$SEH_end_redcx_mont_384 + DD imagerel $L$SEH_info_redcx_mont_384_epilogue + + DD imagerel $L$SEH_begin_fromx_mont_384 + DD imagerel $L$SEH_body_fromx_mont_384 + DD imagerel $L$SEH_info_fromx_mont_384_prologue + + DD imagerel $L$SEH_body_fromx_mont_384 + DD imagerel $L$SEH_epilogue_fromx_mont_384 + DD imagerel $L$SEH_info_fromx_mont_384_body + + DD imagerel $L$SEH_epilogue_fromx_mont_384 + DD imagerel $L$SEH_end_fromx_mont_384 + DD imagerel $L$SEH_info_fromx_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0x_pty_mont_384 + DD imagerel $L$SEH_body_sgn0x_pty_mont_384 + DD imagerel $L$SEH_info_sgn0x_pty_mont_384_prologue + + DD imagerel $L$SEH_body_sgn0x_pty_mont_384 + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384 + DD imagerel $L$SEH_info_sgn0x_pty_mont_384_body + + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384 + DD imagerel $L$SEH_end_sgn0x_pty_mont_384 + DD imagerel $L$SEH_info_sgn0x_pty_mont_384_epilogue + + DD imagerel $L$SEH_begin_sgn0x_pty_mont_384x + DD imagerel $L$SEH_body_sgn0x_pty_mont_384x + DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_prologue + + DD imagerel $L$SEH_body_sgn0x_pty_mont_384x + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384x + DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_body + + DD imagerel $L$SEH_epilogue_sgn0x_pty_mont_384x + DD imagerel $L$SEH_end_sgn0x_pty_mont_384x + DD imagerel $L$SEH_info_sgn0x_pty_mont_384x_epilogue + + DD imagerel $L$SEH_begin_mulx_mont_384 + DD imagerel $L$SEH_body_mulx_mont_384 + DD imagerel $L$SEH_info_mulx_mont_384_prologue + + DD imagerel $L$SEH_body_mulx_mont_384 + DD imagerel $L$SEH_epilogue_mulx_mont_384 + DD imagerel $L$SEH_info_mulx_mont_384_body + + DD imagerel $L$SEH_epilogue_mulx_mont_384 + DD imagerel $L$SEH_end_mulx_mont_384 + DD imagerel $L$SEH_info_mulx_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_384 + DD imagerel $L$SEH_body_sqrx_mont_384 + DD imagerel $L$SEH_info_sqrx_mont_384_prologue + + DD imagerel $L$SEH_body_sqrx_mont_384 + DD imagerel $L$SEH_epilogue_sqrx_mont_384 + DD imagerel $L$SEH_info_sqrx_mont_384_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_384 + DD imagerel $L$SEH_end_sqrx_mont_384 + DD imagerel $L$SEH_info_sqrx_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_body_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_prologue + + DD imagerel $L$SEH_body_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_body + + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_end_sqrx_n_mul_mont_384 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_384_epilogue + + DD imagerel $L$SEH_begin_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_body_sqrx_n_mul_mont_383 + DD 
imagerel $L$SEH_info_sqrx_n_mul_mont_383_prologue + + DD imagerel $L$SEH_body_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_body + + DD imagerel $L$SEH_epilogue_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_end_sqrx_n_mul_mont_383 + DD imagerel $L$SEH_info_sqrx_n_mul_mont_383_epilogue + + DD imagerel $L$SEH_begin_sqrx_mont_382x + DD imagerel $L$SEH_body_sqrx_mont_382x + DD imagerel $L$SEH_info_sqrx_mont_382x_prologue + + DD imagerel $L$SEH_body_sqrx_mont_382x + DD imagerel $L$SEH_epilogue_sqrx_mont_382x + DD imagerel $L$SEH_info_sqrx_mont_382x_body + + DD imagerel $L$SEH_epilogue_sqrx_mont_382x + DD imagerel $L$SEH_end_sqrx_mont_382x + DD imagerel $L$SEH_info_sqrx_mont_382x_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_mulx_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_mulx_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,029h,000h +DB 000h,0e4h,02ah,000h +DB 000h,0d4h,02bh,000h +DB 000h,0c4h,02ch,000h +DB 000h,034h,02dh,000h +DB 000h,054h,02eh,000h +DB 000h,074h,030h,000h +DB 000h,064h,031h,000h +DB 000h,001h,02fh,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_mulx_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sqrx_mont_384x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sqrx_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mulx_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_mulx_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_mulx_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sqrx_382x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sqrx_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mulx_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_mulx_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,000h,000h +DB 000h,0e4h,001h,000h +DB 000h,0d4h,002h,000h +DB 000h,0c4h,003h,000h +DB 000h,034h,004h,000h +DB 000h,054h,005h,000h +DB 000h,074h,007h,000h +DB 000h,064h,008h,000h +DB 000h,052h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_mulx_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_384_prologue:: +DB 
1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sqrx_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sqrx_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_redcx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_redcx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_redcx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_fromx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_fromx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_fromx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0x_pty_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sgn0x_pty_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sgn0x_pty_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sgn0x_pty_mont_384x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sgn0x_pty_mont_384x_body:: +DB 1,0,17,0 +DB 000h,0f4h,001h,000h +DB 000h,0e4h,002h,000h +DB 000h,0d4h,003h,000h +DB 000h,0c4h,004h,000h +DB 000h,034h,005h,000h +DB 000h,054h,006h,000h +DB 000h,074h,008h,000h +DB 000h,064h,009h,000h +DB 000h,062h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sgn0x_pty_mont_384x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_mulx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_mulx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_mulx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sqrx_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,003h,000h +DB 000h,0e4h,004h,000h +DB 000h,0d4h,005h,000h +DB 000h,0c4h,006h,000h +DB 000h,034h,007h,000h +DB 
000h,054h,008h,000h +DB 000h,074h,00ah,000h +DB 000h,064h,00bh,000h +DB 000h,082h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sqrx_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_n_mul_mont_384_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sqrx_n_mul_mont_384_body:: +DB 1,0,17,0 +DB 000h,0f4h,005h,000h +DB 000h,0e4h,006h,000h +DB 000h,0d4h,007h,000h +DB 000h,0c4h,008h,000h +DB 000h,034h,009h,000h +DB 000h,054h,00ah,000h +DB 000h,074h,00ch,000h +DB 000h,064h,00dh,000h +DB 000h,0a2h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sqrx_n_mul_mont_384_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_n_mul_mont_383_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sqrx_n_mul_mont_383_body:: +DB 1,0,17,0 +DB 000h,0f4h,005h,000h +DB 000h,0e4h,006h,000h +DB 000h,0d4h,007h,000h +DB 000h,0c4h,008h,000h +DB 000h,034h,009h,000h +DB 000h,054h,00ah,000h +DB 000h,074h,00ch,000h +DB 000h,064h,00dh,000h +DB 000h,0a2h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sqrx_n_mul_mont_383_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_sqrx_mont_382x_prologue:: +DB 1,0,5,00bh +DB 0,074h,1,0 +DB 0,064h,2,0 +DB 0,0b3h +DB 0,0 + DD 0,0 +$L$SEH_info_sqrx_mont_382x_body:: +DB 1,0,18,0 +DB 000h,0f4h,011h,000h +DB 000h,0e4h,012h,000h +DB 000h,0d4h,013h,000h +DB 000h,0c4h,014h,000h +DB 000h,034h,015h,000h +DB 000h,054h,016h,000h +DB 000h,074h,018h,000h +DB 000h,064h,019h,000h +DB 000h,001h,017h,000h +DB 000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_sqrx_mont_382x_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/build/win64/sha256-armv8.asm b/crypto/blst_src/build/win64/sha256-armv8.asm new file mode 100644 index 00000000000..31e74219c19 --- /dev/null +++ b/crypto/blst_src/build/win64/sha256-armv8.asm @@ -0,0 +1,1084 @@ +// +// Copyright Supranational LLC +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// ==================================================================== +// Written by Andy Polyakov, @dot-asm, initially for the OpenSSL +// project. +// ==================================================================== +// +// sha256_block procedure for ARMv8. +// +// This module is stripped of scalar code paths, with rationale that all +// known processors are NEON-capable. +// +// See original module at CRYPTOGAMS for further details. 
+ + COMMON |__blst_platform_cap|,4 + AREA |.text|,CODE,ALIGN=8,ARM64 + + ALIGN 64 + +|$LK256| + DCDU 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + DCDU 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + DCDU 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + DCDU 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + DCDU 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + DCDU 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + DCDU 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + DCDU 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + DCDU 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + DCDU 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + DCDU 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + DCDU 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + DCDU 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + DCDU 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + DCDU 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + DCDU 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + DCDU 0 //terminator + + DCB "SHA256 block transform for ARMv8, CRYPTOGAMS by @dot-asm",0 + ALIGN 4 + ALIGN 4 + + EXPORT |blst_sha256_block_armv8|[FUNC] + ALIGN 64 +|blst_sha256_block_armv8| PROC +|$Lv8_entry| + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adr x3,|$LK256| + +|$Loop_hw| + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 
//sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + DCDU 0x5e2828a4 //sha256su0 v4.16b,v5.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + DCDU 0x5e2828c5 //sha256su0 v5.16b,v6.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s + DCDU 0x5e2828e6 //sha256su0 v6.16b,v7.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + DCDU 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s + DCDU 0x5e282887 //sha256su0 v7.16b,v4.16b + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + DCDU 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s + DCDU 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b + DCDU 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s + DCDU 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,|$Loop_hw| + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + ENDP + + EXPORT |blst_sha256_block_data_order|[FUNC] + ALIGN 16 +|blst_sha256_block_data_order| PROC + adrp x16,__blst_platform_cap + ldr w16,[x16,__blst_platform_cap] + tst w16,#1 + bne |$Lv8_entry| + + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + sub sp,sp,#16*4 + + adr x16,|$LK256| + add x2,x1,x2,lsl#6 // len to point at the end of inp + + ld1 {v0.16b},[x1], #16 + ld1 {v1.16b},[x1], #16 + ld1 {v2.16b},[x1], #16 + ld1 {v3.16b},[x1], #16 + ld1 {v4.4s},[x16], #16 + ld1 {v5.4s},[x16], #16 + ld1 {v6.4s},[x16], #16 + ld1 {v7.4s},[x16], #16 + rev32 v0.16b,v0.16b // yes, even on + rev32 v1.16b,v1.16b // big-endian + rev32 v2.16b,v2.16b + rev32 v3.16b,v3.16b + mov x17,sp + add v4.4s,v4.4s,v0.4s + add v5.4s,v5.4s,v1.4s + add v6.4s,v6.4s,v2.4s + st1 {v4.4s,v5.4s},[x17], #32 + add v7.4s,v7.4s,v3.4s + st1 {v6.4s,v7.4s},[x17] + sub x17,x17,#32 + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#8] + ldp w7,w8,[x0,#16] + ldp w9,w10,[x0,#24] + ldr w12,[sp,#0] + mov w13,wzr + eor w14,w4,w5 + mov w15,wzr + b |$L_00_48| + + ALIGN 16 +|$L_00_48| + ext8 v4.16b,v0.16b,v1.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext8 v7.16b,v2.16b,v3.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v3.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v0.4s,v0.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v0.4s,v0.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v0.4s,v0.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v0.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v0.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v0.4s,#15 + add w8,w8,w12 + ushr v17.4s,v0.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v0.4s,#13 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + add v0.4s,v0.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v0.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext8 v4.16b,v1.16b,v2.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext8 v7.16b,v3.16b,v0.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v0.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v1.4s,v1.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 
+ sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v1.4s,v1.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v1.4s,v1.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v1.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v1.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v1.4s,#15 + add w4,w4,w12 + ushr v17.4s,v1.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v1.4s,#13 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v1.4s,v1.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v1.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + ext8 v4.16b,v2.16b,v3.16b,#4 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + bic w15,w9,w7 + ext8 v7.16b,v0.16b,v1.16b,#4 + eor w11,w7,w7,ror#5 + add w3,w3,w13 + mov d19,v1.d[1] + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w3,w3,ror#11 + ushr v5.4s,v4.4s,#3 + add w10,w10,w12 + add v2.4s,v2.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + ushr v7.4s,v4.4s,#18 + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w6,w6,w10 + sli v7.4s,v4.4s,#14 + eor w14,w14,w4 + ushr v16.4s,v19.4s,#17 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + eor v5.16b,v5.16b,v7.16b + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + sli v16.4s,v19.4s,#15 + add w10,w10,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + ushr v7.4s,v19.4s,#19 + add w9,w9,w12 + ror w11,w11,#6 + add v2.4s,v2.4s,v5.4s + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + sli v7.4s,v19.4s,#13 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + eor v17.16b,v17.16b,v7.16b + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + add v2.4s,v2.4s,v17.4s + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + ushr v18.4s,v2.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v2.4s,#10 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + sli v18.4s,v2.4s,#15 + add w8,w8,w12 + ushr v17.4s,v2.4s,#19 + ror w11,w11,#6 + eor w13,w9,w10 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w9,ror#20 + add w8,w8,w11 + sli v17.4s,v2.4s,#13 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w4,w4,w8 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w10 + eor v17.16b,v17.16b,v17.16b + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + mov v17.d[1],v19.d[0] + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add 
w8,w8,w14 + add v2.4s,v2.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add v4.4s,v4.4s,v2.4s + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + ext8 v4.16b,v3.16b,v0.16b,#4 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + bic w15,w5,w3 + ext8 v7.16b,v1.16b,v2.16b,#4 + eor w11,w3,w3,ror#5 + add w7,w7,w13 + mov d19,v2.d[1] + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + ushr v6.4s,v4.4s,#7 + eor w15,w7,w7,ror#11 + ushr v5.4s,v4.4s,#3 + add w6,w6,w12 + add v3.4s,v3.4s,v7.4s + ror w11,w11,#6 + sli v6.4s,v4.4s,#25 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + ushr v7.4s,v4.4s,#18 + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + eor v5.16b,v5.16b,v6.16b + ror w15,w15,#2 + add w10,w10,w6 + sli v7.4s,v4.4s,#14 + eor w14,w14,w8 + ushr v16.4s,v19.4s,#17 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + eor v5.16b,v5.16b,v7.16b + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + sli v16.4s,v19.4s,#15 + add w6,w6,w14 + orr w12,w12,w15 + ushr v17.4s,v19.4s,#10 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + ushr v7.4s,v19.4s,#19 + add w5,w5,w12 + ror w11,w11,#6 + add v3.4s,v3.4s,v5.4s + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + sli v7.4s,v19.4s,#13 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + eor v17.16b,v17.16b,v16.16b + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + eor v17.16b,v17.16b,v7.16b + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + add v3.4s,v3.4s,v17.4s + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + ushr v18.4s,v3.4s,#17 + orr w12,w12,w15 + ushr v19.4s,v3.4s,#10 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + sli v18.4s,v3.4s,#15 + add w4,w4,w12 + ushr v17.4s,v3.4s,#19 + ror w11,w11,#6 + eor w13,w5,w6 + eor v19.16b,v19.16b,v18.16b + eor w15,w15,w5,ror#20 + add w4,w4,w11 + sli v17.4s,v3.4s,#13 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + ld1 {v4.4s},[x16], #16 + add w8,w8,w4 + eor v19.16b,v19.16b,v17.16b + eor w14,w14,w6 + eor v17.16b,v17.16b,v17.16b + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + mov v17.d[1],v19.d[0] + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + add v3.4s,v3.4s,v17.4s + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add v4.4s,v4.4s,v3.4s + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[x16] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + cmp w12,#0 // check for K256 terminator + ldr w12,[sp,#0] + sub x17,x17,#64 + bne |$L_00_48| + + sub x16,x16,#256 // rewind x16 + cmp x1,x2 + mov x17, #64 + cseleq x17,x17,xzr + sub x1,x1,x17 // avoid SEGV + mov x17,sp + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v0.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v0.16b,v0.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v0.4s + add w10,w10,w11 + ldr w12,[sp,#4] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#8] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 
+ add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#12] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#16] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v1.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v1.16b,v1.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v1.4s + add w6,w6,w11 + ldr w12,[sp,#20] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#24] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#28] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + ldr w12,[sp,#32] + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w10,w10,w12 + add w3,w3,w15 + and w12,w8,w7 + ld1 {v2.16b},[x1],#16 + bic w15,w9,w7 + eor w11,w7,w7,ror#5 + ld1 {v4.4s},[x16],#16 + add w3,w3,w13 + orr w12,w12,w15 + eor w11,w11,w7,ror#19 + eor w15,w3,w3,ror#11 + rev32 v2.16b,v2.16b + add w10,w10,w12 + ror w11,w11,#6 + eor w13,w3,w4 + eor w15,w15,w3,ror#20 + add v4.4s,v4.4s,v2.4s + add w10,w10,w11 + ldr w12,[sp,#36] + and w14,w14,w13 + ror w15,w15,#2 + add w6,w6,w10 + eor w14,w14,w4 + add w9,w9,w12 + add w10,w10,w15 + and w12,w7,w6 + bic w15,w8,w6 + eor w11,w6,w6,ror#5 + add w10,w10,w14 + orr w12,w12,w15 + eor w11,w11,w6,ror#19 + eor w15,w10,w10,ror#11 + add w9,w9,w12 + ror w11,w11,#6 + eor w14,w10,w3 + eor w15,w15,w10,ror#20 + add w9,w9,w11 + ldr w12,[sp,#40] + and w13,w13,w14 + ror w15,w15,#2 + add w5,w5,w9 + eor w13,w13,w3 + add w8,w8,w12 + add w9,w9,w15 + and w12,w6,w5 + bic w15,w7,w5 + eor w11,w5,w5,ror#5 + add w9,w9,w13 + orr w12,w12,w15 + eor w11,w11,w5,ror#19 + eor w15,w9,w9,ror#11 + add w8,w8,w12 + ror w11,w11,#6 + eor w13,w9,w10 + eor w15,w15,w9,ror#20 + add w8,w8,w11 + ldr w12,[sp,#44] + and w14,w14,w13 + ror w15,w15,#2 + add w4,w4,w8 + eor w14,w14,w10 + add w7,w7,w12 + add w8,w8,w15 + and w12,w5,w4 + bic w15,w6,w4 + eor w11,w4,w4,ror#5 + add w8,w8,w14 + orr w12,w12,w15 + eor w11,w11,w4,ror#19 + eor w15,w8,w8,ror#11 + add w7,w7,w12 + ror w11,w11,#6 + eor w14,w8,w9 + eor 
w15,w15,w8,ror#20 + add w7,w7,w11 + ldr w12,[sp,#48] + and w13,w13,w14 + ror w15,w15,#2 + add w3,w3,w7 + eor w13,w13,w9 + st1 {v4.4s},[x17], #16 + add w6,w6,w12 + add w7,w7,w15 + and w12,w4,w3 + ld1 {v3.16b},[x1],#16 + bic w15,w5,w3 + eor w11,w3,w3,ror#5 + ld1 {v4.4s},[x16],#16 + add w7,w7,w13 + orr w12,w12,w15 + eor w11,w11,w3,ror#19 + eor w15,w7,w7,ror#11 + rev32 v3.16b,v3.16b + add w6,w6,w12 + ror w11,w11,#6 + eor w13,w7,w8 + eor w15,w15,w7,ror#20 + add v4.4s,v4.4s,v3.4s + add w6,w6,w11 + ldr w12,[sp,#52] + and w14,w14,w13 + ror w15,w15,#2 + add w10,w10,w6 + eor w14,w14,w8 + add w5,w5,w12 + add w6,w6,w15 + and w12,w3,w10 + bic w15,w4,w10 + eor w11,w10,w10,ror#5 + add w6,w6,w14 + orr w12,w12,w15 + eor w11,w11,w10,ror#19 + eor w15,w6,w6,ror#11 + add w5,w5,w12 + ror w11,w11,#6 + eor w14,w6,w7 + eor w15,w15,w6,ror#20 + add w5,w5,w11 + ldr w12,[sp,#56] + and w13,w13,w14 + ror w15,w15,#2 + add w9,w9,w5 + eor w13,w13,w7 + add w4,w4,w12 + add w5,w5,w15 + and w12,w10,w9 + bic w15,w3,w9 + eor w11,w9,w9,ror#5 + add w5,w5,w13 + orr w12,w12,w15 + eor w11,w11,w9,ror#19 + eor w15,w5,w5,ror#11 + add w4,w4,w12 + ror w11,w11,#6 + eor w13,w5,w6 + eor w15,w15,w5,ror#20 + add w4,w4,w11 + ldr w12,[sp,#60] + and w14,w14,w13 + ror w15,w15,#2 + add w8,w8,w4 + eor w14,w14,w6 + add w3,w3,w12 + add w4,w4,w15 + and w12,w9,w8 + bic w15,w10,w8 + eor w11,w8,w8,ror#5 + add w4,w4,w14 + orr w12,w12,w15 + eor w11,w11,w8,ror#19 + eor w15,w4,w4,ror#11 + add w3,w3,w12 + ror w11,w11,#6 + eor w14,w4,w5 + eor w15,w15,w4,ror#20 + add w3,w3,w11 + and w13,w13,w14 + ror w15,w15,#2 + add w7,w7,w3 + eor w13,w13,w5 + st1 {v4.4s},[x17], #16 + add w3,w3,w15 // h+=Sigma0(a) from the past + ldp w11,w12,[x0,#0] + add w3,w3,w13 // h+=Maj(a,b,c) from the past + ldp w13,w14,[x0,#8] + add w3,w3,w11 // accumulate + add w4,w4,w12 + ldp w11,w12,[x0,#16] + add w5,w5,w13 + add w6,w6,w14 + ldp w13,w14,[x0,#24] + add w7,w7,w11 + add w8,w8,w12 + ldr w12,[sp,#0] + stp w3,w4,[x0,#0] + add w9,w9,w13 + mov w13,wzr + stp w5,w6,[x0,#8] + add w10,w10,w14 + stp w7,w8,[x0,#16] + eor w14,w4,w5 + stp w9,w10,[x0,#24] + mov w15,wzr + mov x17,sp + bne |$L_00_48| + + ldr x29,[x29] + add sp,sp,#16*4+16 + ret + ENDP + + + EXPORT |blst_sha256_emit|[FUNC] + ALIGN 16 +|blst_sha256_emit| PROC + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] +#ifndef __AARCH64EB__ + rev x4,x4 + rev x5,x5 + rev x6,x6 + rev x7,x7 +#endif + str w4,[x0,#4] + lsr x4,x4,#32 + str w5,[x0,#12] + lsr x5,x5,#32 + str w6,[x0,#20] + lsr x6,x6,#32 + str w7,[x0,#28] + lsr x7,x7,#32 + str w4,[x0,#0] + str w5,[x0,#8] + str w6,[x0,#16] + str w7,[x0,#24] + ret + ENDP + + + + EXPORT |blst_sha256_bcopy|[FUNC] + ALIGN 16 +|blst_sha256_bcopy| PROC +|$Loop_bcopy| + ldrb w3,[x1],#1 + sub x2,x2,#1 + strb w3,[x0],#1 + cbnz x2,|$Loop_bcopy| + ret + ENDP + + + + EXPORT |blst_sha256_hcopy|[FUNC] + ALIGN 16 +|blst_sha256_hcopy| PROC + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + ret + ENDP + END diff --git a/crypto/blst_src/build/win64/sha256-x86_64.asm b/crypto/blst_src/build/win64/sha256-x86_64.asm new file mode 100644 index 00000000000..a502a75ecaf --- /dev/null +++ b/crypto/blst_src/build/win64/sha256-x86_64.asm @@ -0,0 +1,1575 @@ +OPTION DOTNAME +_DATA SEGMENT +COMM __blst_platform_cap:DWORD:1 +_DATA ENDS +.text$ SEGMENT ALIGN(256) 'CODE' + +ALIGN 64 + +K256:: + DD 0428a2f98h,071374491h,0b5c0fbcfh,0e9b5dba5h + DD 03956c25bh,059f111f1h,0923f82a4h,0ab1c5ed5h + DD 0d807aa98h,012835b01h,0243185beh,0550c7dc3h + DD 072be5d74h,080deb1feh,09bdc06a7h,0c19bf174h + DD 
0e49b69c1h,0efbe4786h,00fc19dc6h,0240ca1cch + DD 02de92c6fh,04a7484aah,05cb0a9dch,076f988dah + DD 0983e5152h,0a831c66dh,0b00327c8h,0bf597fc7h + DD 0c6e00bf3h,0d5a79147h,006ca6351h,014292967h + DD 027b70a85h,02e1b2138h,04d2c6dfch,053380d13h + DD 0650a7354h,0766a0abbh,081c2c92eh,092722c85h + DD 0a2bfe8a1h,0a81a664bh,0c24b8b70h,0c76c51a3h + DD 0d192e819h,0d6990624h,0f40e3585h,0106aa070h + DD 019a4c116h,01e376c08h,02748774ch,034b0bcb5h + DD 0391c0cb3h,04ed8aa4ah,05b9cca4fh,0682e6ff3h + DD 0748f82eeh,078a5636fh,084c87814h,08cc70208h + DD 090befffah,0a4506cebh,0bef9a3f7h,0c67178f2h + + DD 000010203h,004050607h,008090a0bh,00c0d0e0fh + DD 003020100h,00b0a0908h,0ffffffffh,0ffffffffh + DD 0ffffffffh,0ffffffffh,003020100h,00b0a0908h +DB 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 +DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 +DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +DB 32,64,100,111,116,45,97,115,109,0 +PUBLIC blst_sha256_block_data_order_shaext + + +ALIGN 64 +blst_sha256_block_data_order_shaext PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_blst_sha256_block_data_order_shaext:: + + + push rbp + + mov rbp,rsp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 +$L$blst_sha256_block_data_order$2:: + sub rsp,050h + + movaps XMMWORD PTR[(-80)+rbp],xmm6 + movaps XMMWORD PTR[(-64)+rbp],xmm7 + movaps XMMWORD PTR[(-48)+rbp],xmm8 + movaps XMMWORD PTR[(-32)+rbp],xmm9 + movaps XMMWORD PTR[(-16)+rbp],xmm10 + +$L$SEH_body_blst_sha256_block_data_order_shaext:: + + lea rcx,QWORD PTR[((K256+128))] + movdqu xmm1,XMMWORD PTR[rdi] + movdqu xmm2,XMMWORD PTR[16+rdi] + movdqa xmm7,XMMWORD PTR[((256-128))+rcx] + + pshufd xmm0,xmm1,01bh + pshufd xmm1,xmm1,0b1h + pshufd xmm2,xmm2,01bh + movdqa xmm8,xmm7 +DB 102,15,58,15,202,8 + punpcklqdq xmm2,xmm0 + jmp $L$oop_shaext + +ALIGN 16 +$L$oop_shaext:: + movdqu xmm3,XMMWORD PTR[rsi] + movdqu xmm4,XMMWORD PTR[16+rsi] + movdqu xmm5,XMMWORD PTR[32+rsi] +DB 102,15,56,0,223 + movdqu xmm6,XMMWORD PTR[48+rsi] + + movdqa xmm0,XMMWORD PTR[((0-128))+rcx] + paddd xmm0,xmm3 +DB 102,15,56,0,231 + movdqa xmm10,xmm2 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + nop + movdqa xmm9,xmm1 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((16-128))+rcx] + paddd xmm0,xmm4 +DB 102,15,56,0,239 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + lea rsi,QWORD PTR[64+rsi] +DB 15,56,204,220 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((32-128))+rcx] + paddd xmm0,xmm5 +DB 102,15,56,0,247 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 +DB 15,56,204,229 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((48-128))+rcx] + paddd xmm0,xmm6 +DB 15,56,205,222 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 +DB 15,56,204,238 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((64-128))+rcx] + paddd xmm0,xmm3 +DB 15,56,205,227 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 +DB 15,56,204,243 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((80-128))+rcx] + paddd xmm0,xmm4 +DB 15,56,205,236 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 + nop + paddd xmm6,xmm7 +DB 15,56,204,220 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((96-128))+rcx] + paddd xmm0,xmm5 +DB 15,56,205,245 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 +DB 15,56,204,229 +DB 15,56,203,202 + 
movdqa xmm0,XMMWORD PTR[((112-128))+rcx] + paddd xmm0,xmm6 +DB 15,56,205,222 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 +DB 15,56,204,238 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((128-128))+rcx] + paddd xmm0,xmm3 +DB 15,56,205,227 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 +DB 15,56,204,243 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((144-128))+rcx] + paddd xmm0,xmm4 +DB 15,56,205,236 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 + nop + paddd xmm6,xmm7 +DB 15,56,204,220 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((160-128))+rcx] + paddd xmm0,xmm5 +DB 15,56,205,245 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm6 +DB 102,15,58,15,253,4 + nop + paddd xmm3,xmm7 +DB 15,56,204,229 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((176-128))+rcx] + paddd xmm0,xmm6 +DB 15,56,205,222 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm3 +DB 102,15,58,15,254,4 + nop + paddd xmm4,xmm7 +DB 15,56,204,238 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((192-128))+rcx] + paddd xmm0,xmm3 +DB 15,56,205,227 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm4 +DB 102,15,58,15,251,4 + nop + paddd xmm5,xmm7 +DB 15,56,204,243 +DB 15,56,203,202 + movdqa xmm0,XMMWORD PTR[((208-128))+rcx] + paddd xmm0,xmm4 +DB 15,56,205,236 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + movdqa xmm7,xmm5 +DB 102,15,58,15,252,4 +DB 15,56,203,202 + paddd xmm6,xmm7 + + movdqa xmm0,XMMWORD PTR[((224-128))+rcx] + paddd xmm0,xmm5 +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh +DB 15,56,205,245 + movdqa xmm7,xmm8 +DB 15,56,203,202 + + movdqa xmm0,XMMWORD PTR[((240-128))+rcx] + paddd xmm0,xmm6 + nop +DB 15,56,203,209 + pshufd xmm0,xmm0,00eh + dec rdx + nop +DB 15,56,203,202 + + paddd xmm2,xmm10 + paddd xmm1,xmm9 + jnz $L$oop_shaext + + pshufd xmm2,xmm2,0b1h + pshufd xmm7,xmm1,01bh + pshufd xmm1,xmm1,0b1h + punpckhqdq xmm1,xmm2 +DB 102,15,58,15,215,8 + + movdqu XMMWORD PTR[rdi],xmm1 + movdqu XMMWORD PTR[16+rdi],xmm2 + movaps xmm6,XMMWORD PTR[((-80))+rbp] + movaps xmm7,XMMWORD PTR[((-64))+rbp] + movaps xmm8,XMMWORD PTR[((-48))+rbp] + movaps xmm9,XMMWORD PTR[((-32))+rbp] + movaps xmm10,XMMWORD PTR[((-16))+rbp] + mov rsp,rbp + + pop rbp + +$L$SEH_epilogue_blst_sha256_block_data_order_shaext:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_blst_sha256_block_data_order_shaext:: +blst_sha256_block_data_order_shaext ENDP +PUBLIC blst_sha256_block_data_order + + +ALIGN 64 +blst_sha256_block_data_order PROC PUBLIC + DB 243,15,30,250 + mov QWORD PTR[8+rsp],rdi ;WIN64 prologue + mov QWORD PTR[16+rsp],rsi + mov r11,rsp +$L$SEH_begin_blst_sha256_block_data_order:: + + + push rbp + + mov rbp,rsp + + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + test DWORD PTR[__blst_platform_cap],2 + jnz $L$blst_sha256_block_data_order$2 + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,88 + + lea rdx,QWORD PTR[rdx*4+rsi] + mov QWORD PTR[((-64))+rbp],rdi + + mov QWORD PTR[((-48))+rbp],rdx + movaps XMMWORD PTR[(-128)+rbp],xmm6 + movaps XMMWORD PTR[(-112)+rbp],xmm7 + movaps XMMWORD PTR[(-96)+rbp],xmm8 + movaps XMMWORD PTR[(-80)+rbp],xmm9 + +$L$SEH_body_blst_sha256_block_data_order:: + + + lea rsp,QWORD PTR[((-64))+rsp] + mov eax,DWORD PTR[rdi] + and rsp,-64 + mov ebx,DWORD PTR[4+rdi] + mov ecx,DWORD PTR[8+rdi] + mov edx,DWORD PTR[12+rdi] + mov r8d,DWORD PTR[16+rdi] + mov r9d,DWORD PTR[20+rdi] + 
mov r10d,DWORD PTR[24+rdi] + mov r11d,DWORD PTR[28+rdi] + + + jmp $L$loop_ssse3 +ALIGN 16 +$L$loop_ssse3:: + movdqa xmm7,XMMWORD PTR[((K256+256))] + mov QWORD PTR[((-56))+rbp],rsi + movdqu xmm0,XMMWORD PTR[rsi] + movdqu xmm1,XMMWORD PTR[16+rsi] + movdqu xmm2,XMMWORD PTR[32+rsi] +DB 102,15,56,0,199 + movdqu xmm3,XMMWORD PTR[48+rsi] + lea rsi,QWORD PTR[K256] +DB 102,15,56,0,207 + movdqa xmm4,XMMWORD PTR[rsi] + movdqa xmm5,XMMWORD PTR[16+rsi] +DB 102,15,56,0,215 + paddd xmm4,xmm0 + movdqa xmm6,XMMWORD PTR[32+rsi] +DB 102,15,56,0,223 + movdqa xmm7,XMMWORD PTR[48+rsi] + paddd xmm5,xmm1 + paddd xmm6,xmm2 + paddd xmm7,xmm3 + movdqa XMMWORD PTR[rsp],xmm4 + mov r14d,eax + movdqa XMMWORD PTR[16+rsp],xmm5 + mov edi,ebx + movdqa XMMWORD PTR[32+rsp],xmm6 + xor edi,ecx + movdqa XMMWORD PTR[48+rsp],xmm7 + mov r13d,r8d + jmp $L$ssse3_00_47 + +ALIGN 16 +$L$ssse3_00_47:: + sub rsi,-64 + ror r13d,14 + movdqa xmm4,xmm1 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm3 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax +DB 102,15,58,15,224,4 + and r12d,r8d + xor r13d,r8d +DB 102,15,58,15,250,4 + add r11d,DWORD PTR[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm0,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm3,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add r10d,DWORD PTR[4+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm0,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD PTR[8+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm0,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm0,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[12+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD PTR[rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm0,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm0 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD PTR[rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm2 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm0 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d +DB 102,15,58,15,225,4 + and r12d,eax + xor r13d,eax +DB 102,15,58,15,251,4 + add edx,DWORD PTR[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + 
movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm1,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm0,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD PTR[20+rsp] + mov edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm1,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD PTR[24+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm1,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm1,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[28+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add eax,r12d + movdqa xmm6,XMMWORD PTR[16+rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm1,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm1 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD PTR[16+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm3 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm1 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax +DB 102,15,58,15,226,4 + and r12d,r8d + xor r13d,r8d +DB 102,15,58,15,248,4 + add r11d,DWORD PTR[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm2,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm1,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add r10d,DWORD PTR[36+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm2,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD PTR[40+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm2,xmm7 + and edi,r15d + xor r14d,r10d + add 
r9d,r13d + pshufd xmm7,xmm2,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[44+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD PTR[32+rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm2,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm2 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD PTR[32+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm0 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm2 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d +DB 102,15,58,15,227,4 + and r12d,eax + xor r13d,eax +DB 102,15,58,15,249,4 + add edx,DWORD PTR[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm3,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm2,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD PTR[52+rsp] + mov edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm3,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD PTR[56+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm3,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm3,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[60+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add eax,r12d + movdqa xmm6,XMMWORD PTR[48+rsi] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm3,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm3 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD PTR[48+rsp],xmm6 + cmp BYTE PTR[67+rsi],0 + jne $L$ssse3_00_47 + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD PTR[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov 
r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD PTR[4+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD PTR[8+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[12+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD PTR[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD PTR[20+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD PTR[24+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[28+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD PTR[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD PTR[36+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + 
mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD PTR[40+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD PTR[44+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD PTR[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD PTR[52+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD PTR[56+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD PTR[60+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD PTR[((-64))+rbp] + mov eax,r14d + mov rsi,QWORD PTR[((-56))+rbp] + + add eax,DWORD PTR[rdi] + add ebx,DWORD PTR[4+rdi] + add ecx,DWORD PTR[8+rdi] + add edx,DWORD PTR[12+rdi] + add r8d,DWORD PTR[16+rdi] + add r9d,DWORD PTR[20+rdi] + add r10d,DWORD PTR[24+rdi] + add r11d,DWORD PTR[28+rdi] + + lea rsi,QWORD PTR[64+rsi] + cmp rsi,QWORD PTR[((-48))+rbp] + + mov DWORD PTR[rdi],eax + mov DWORD PTR[4+rdi],ebx + mov DWORD PTR[8+rdi],ecx + mov DWORD PTR[12+rdi],edx + mov DWORD PTR[16+rdi],r8d + mov DWORD PTR[20+rdi],r9d + mov DWORD PTR[24+rdi],r10d + mov DWORD PTR[28+rdi],r11d + jb $L$loop_ssse3 + + xorps xmm0,xmm0 + movaps XMMWORD PTR[rsp],xmm0 + movaps XMMWORD PTR[16+rsp],xmm0 + movaps XMMWORD PTR[32+rsp],xmm0 + movaps XMMWORD PTR[48+rsp],xmm0 + movaps xmm6,XMMWORD PTR[((-128))+rbp] + movaps xmm7,XMMWORD PTR[((-112))+rbp] + movaps xmm8,XMMWORD PTR[((-96))+rbp] + movaps xmm9,XMMWORD PTR[((-80))+rbp] + mov r15,QWORD PTR[((-40))+rbp] + mov r14,QWORD PTR[((-32))+rbp] + mov r13,QWORD PTR[((-24))+rbp] + mov r12,QWORD PTR[((-16))+rbp] + mov rbx,QWORD PTR[((-8))+rbp] + mov rsp,rbp + + pop rbp + +$L$SEH_epilogue_blst_sha256_block_data_order:: + mov rdi,QWORD PTR[8+rsp] ;WIN64 epilogue + mov rsi,QWORD 
PTR[16+rsp] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_blst_sha256_block_data_order:: +blst_sha256_block_data_order ENDP +PUBLIC blst_sha256_emit + + +ALIGN 16 +blst_sha256_emit PROC PUBLIC + DB 243,15,30,250 + + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + bswap r8 + mov r11,QWORD PTR[24+rdx] + bswap r9 + mov DWORD PTR[4+rcx],r8d + bswap r10 + mov DWORD PTR[12+rcx],r9d + bswap r11 + mov DWORD PTR[20+rcx],r10d + shr r8,32 + mov DWORD PTR[28+rcx],r11d + shr r9,32 + mov DWORD PTR[rcx],r8d + shr r10,32 + mov DWORD PTR[8+rcx],r9d + shr r11,32 + mov DWORD PTR[16+rcx],r10d + mov DWORD PTR[24+rcx],r11d + DB 0F3h,0C3h ;repret +blst_sha256_emit ENDP + +PUBLIC blst_sha256_bcopy + + +ALIGN 16 +blst_sha256_bcopy PROC PUBLIC + DB 243,15,30,250 + + sub rcx,rdx +$L$oop_bcopy:: + movzx eax,BYTE PTR[rdx] + lea rdx,QWORD PTR[1+rdx] + mov BYTE PTR[((-1))+rdx*1+rcx],al + dec r8 + jnz $L$oop_bcopy + DB 0F3h,0C3h ;repret +blst_sha256_bcopy ENDP + +PUBLIC blst_sha256_hcopy + + +ALIGN 16 +blst_sha256_hcopy PROC PUBLIC + DB 243,15,30,250 + + mov r8,QWORD PTR[rdx] + mov r9,QWORD PTR[8+rdx] + mov r10,QWORD PTR[16+rdx] + mov r11,QWORD PTR[24+rdx] + mov QWORD PTR[rcx],r8 + mov QWORD PTR[8+rcx],r9 + mov QWORD PTR[16+rcx],r10 + mov QWORD PTR[24+rcx],r11 + DB 0F3h,0C3h ;repret +blst_sha256_hcopy ENDP +.text$ ENDS +.pdata SEGMENT READONLY ALIGN(4) +ALIGN 4 + DD imagerel $L$SEH_begin_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_body_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_prologue + + DD imagerel $L$SEH_body_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_body + + DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_end_blst_sha256_block_data_order_shaext + DD imagerel $L$SEH_info_blst_sha256_block_data_order_shaext_epilogue + + DD imagerel $L$SEH_begin_blst_sha256_block_data_order + DD imagerel $L$SEH_body_blst_sha256_block_data_order + DD imagerel $L$SEH_info_blst_sha256_block_data_order_prologue + + DD imagerel $L$SEH_body_blst_sha256_block_data_order + DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order + DD imagerel $L$SEH_info_blst_sha256_block_data_order_body + + DD imagerel $L$SEH_epilogue_blst_sha256_block_data_order + DD imagerel $L$SEH_end_blst_sha256_block_data_order + DD imagerel $L$SEH_info_blst_sha256_block_data_order_epilogue + +.pdata ENDS +.xdata SEGMENT READONLY ALIGN(8) +ALIGN 8 +$L$SEH_info_blst_sha256_block_data_order_shaext_prologue:: +DB 1,4,6,005h +DB 4,074h,2,0 +DB 4,064h,3,0 +DB 4,053h +DB 1,050h + DD 0,0 +$L$SEH_info_blst_sha256_block_data_order_shaext_body:: +DB 1,0,17,85 +DB 000h,068h,000h,000h +DB 000h,078h,001h,000h +DB 000h,088h,002h,000h +DB 000h,098h,003h,000h +DB 000h,0a8h,004h,000h +DB 000h,074h,00ch,000h +DB 000h,064h,00dh,000h +DB 000h,053h +DB 000h,092h +DB 000h,050h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_blst_sha256_block_data_order_shaext_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + +$L$SEH_info_blst_sha256_block_data_order_prologue:: +DB 1,4,6,005h +DB 4,074h,2,0 +DB 4,064h,3,0 +DB 4,053h +DB 1,050h + DD 0,0 +$L$SEH_info_blst_sha256_block_data_order_body:: +DB 1,0,25,133 +DB 000h,068h,000h,000h +DB 000h,078h,001h,000h +DB 000h,088h,002h,000h +DB 000h,098h,003h,000h +DB 000h,0f4h,00bh,000h +DB 000h,0e4h,00ch,000h +DB 000h,0d4h,00dh,000h +DB 
000h,0c4h,00eh,000h +DB 000h,034h,00fh,000h +DB 000h,074h,012h,000h +DB 000h,064h,013h,000h +DB 000h,053h +DB 000h,0f2h +DB 000h,050h +DB 000h,000h,000h,000h,000h,000h +DB 000h,000h,000h,000h +$L$SEH_info_blst_sha256_block_data_order_epilogue:: +DB 1,0,4,0 +DB 000h,074h,001h,000h +DB 000h,064h,002h,000h +DB 000h,000h,000h,000h + + +.xdata ENDS +END diff --git a/crypto/blst_src/bulk_addition.c b/crypto/blst_src/bulk_addition.c new file mode 100644 index 00000000000..4d36f405b64 --- /dev/null +++ b/crypto/blst_src/bulk_addition.c @@ -0,0 +1,176 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" +#include "point.h" + +/* + * This implementation uses explicit addition formula: + * + * λ = (Y₂-Y₁)/(X₂-X₁) + * X₃ = λ²-(X₁+X₂) + * Y₃ = λ⋅(X₁-X₃)-Y₁ + * + * But since we don't know if we'll have to add point to itself, we need + * to eventually resort to corresponding doubling formula: + * + * λ = 3X₁²/2Y₁ + * X₃ = λ²-2X₁ + * Y₃ = λ⋅(X₁-X₃)-Y₁ + * + * The formulae use prohibitively expensive inversion, but whenever we + * have a lot of affine points to accumulate, we can amortize the cost + * by applying Montgomery's batch inversion approach. As a result, + * asymptotic[!] per-point cost for addition is as small as 5M+1S. For + * comparison, ptype##_dadd_affine takes 8M+5S. In practice, all things + * considered, the improvement coefficient varies from 60% to 85% + * depending on platform and curve. + * + * THIS IMPLEMENTATION IS *NOT* CONSTANT-TIME. [But if there is an + * application that requires constant time-ness, speak up!] + */ + +/* + * Calculate λ's numerator and denominator. + * + * input: A x1 y1 - + * B x2 y2 - + * output: + * if A!=B: A x1 y1 (x2-x1)*mul_acc + * B x2+x1 y2-y1 (x2-x1) + * + * if A==B: A x y 2y*mul_acc + * B 2x 3*x^2 2y + * + * if A==-B: A 0 0 1*mul_acc + * B 0 3*x^2 0 + */ +#define HEAD(ptype, bits, field, one) \ +static void ptype##_head(ptype AB[2], const vec##bits mul_acc) \ +{ \ + ptype *A = AB, *B = AB+1; \ + limb_t inf = vec_is_zero(A, sizeof(ptype##_affine)) | \ + vec_is_zero(B, sizeof(ptype##_affine)); \ + static const vec##bits zero = { 0 }; \ +\ + sub_##field(B->Z, B->X, A->X); /* X2-X1 */ \ + add_##field(B->X, B->X, A->X); /* X2+X1 */ \ + add_##field(A->Z, B->Y, A->Y); /* Y2+Y1 */ \ + sub_##field(B->Y, B->Y, A->Y); /* Y2-Y1 */ \ + if (vec_is_zero(B->Z, sizeof(B->Z))) { /* X2==X1 */ \ + inf = vec_is_zero(A->Z, sizeof(A->Z)); \ + vec_select(B->X, A->Z, B->X, sizeof(B->X), inf); \ + sqr_##field(B->Y, A->X); \ + mul_by_3_##field(B->Y, B->Y); /* 3*X1^2 */ \ + vec_copy(B->Z, A->Z, sizeof(B->Z)); /* 2*Y1 */ \ + } /* B->Y is numenator */ \ + /* B->Z is denominator */ \ + vec_select(A->X, B->X, A->X, sizeof(A->X), inf); \ + vec_select(A->Y, A->Z, A->Y, sizeof(A->Y), inf); \ + vec_select(A->Z, one, B->Z, sizeof(A->Z), inf); \ + vec_select(B->Z, zero, B->Z, sizeof(B->Z), inf); \ + if (mul_acc != NULL) \ + mul_##field(A->Z, A->Z, mul_acc); /* chain multiplication */\ +} + +/* + * Calculate λ and resulting coordinates. 
+ * + * input: A x1 y1 - + * B x2+x1 nominator - + * lambda 1/denominator + * output: D x3=(nom/den)^2-(x2+x1) y3=(nom/den)(x1-x3)-y1 + */ +#define TAIL(ptype, bits, field, one) \ +static void ptype##_tail(ptype *D, ptype AB[2], vec##bits lambda) \ +{ \ + ptype *A = AB, *B = AB+1; \ + vec##bits llambda; \ + limb_t inf = vec_is_zero(B->Z, sizeof(B->Z)); \ +\ + mul_##field(lambda, lambda, B->Y); /* λ = (Y2-Y1)/(X2-X1) */ \ + /* alt. 3*X1^2/2*Y1 */ \ + sqr_##field(llambda, lambda); \ + sub_##field(D->X, llambda, B->X); /* X3 = λ^2-X1-X2 */ \ +\ + sub_##field(D->Y, A->X, D->X); \ + mul_##field(D->Y, D->Y, lambda); \ + sub_##field(D->Y, D->Y, A->Y); /* Y3 = λ*(X1-X3)-Y1 */ \ +\ + vec_select(D->X, A->X, D->X, 2*sizeof(D->X), inf); \ + vec_select(B->Z, one, B->Z, sizeof(B->Z), inf); \ +} + +/* + * |points[]| is volatile buffer with |X|s and |Y|s initially holding + * input affine coordinates, and with |Z|s being used as additional + * temporary storage [unrelated to Jacobian coordinates]. |sum| is + * in-/output, initialize to infinity accordingly. + */ +#define ADDITION_BTREE(prefix, ptype, bits, field, one) \ +HEAD(ptype, bits, field, one) \ +TAIL(ptype, bits, field, one) \ +static void ptype##s_accumulate(ptype *sum, ptype points[], size_t n) \ +{ \ + ptype *dst; \ + void *mul_acc; \ + size_t i; \ +\ + while (n >= 16) { \ + if (n & 1) \ + ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \ + n /= 2; \ + for (mul_acc = NULL, i = n; i--; mul_acc = points->Z, points += 2) \ + ptype##_head(points, mul_acc); \ +\ + reciprocal_##field(points[-2].Z, points[-2].Z); /* 1/∏ Zi */ \ +\ + for (dst = points, i = n; --i;) { \ + dst--; points -= 2; \ + mul_##field(points[-2].Z, points[0].Z, points[-2].Z); \ + ptype##_tail(dst, points, points[-2].Z); \ + mul_##field(points[-2].Z, points[0].Z, points[1].Z); \ + } \ + dst--; points -= 2; \ + ptype##_tail(dst, points, points[0].Z); \ + points = dst; \ + } \ + while (n--) \ + ptype##_dadd_affine(sum, sum, (const ptype##_affine *)points++); \ +} \ +\ +void prefix##s_add(ptype *sum, const ptype##_affine *const points[], \ + size_t npoints) \ +{ \ + const size_t stride = SCRATCH_LIMIT / sizeof(ptype); \ + ptype *scratch = alloca((npoints > stride ? stride : npoints) * \ + sizeof(ptype)); \ + const ptype##_affine *point = NULL; \ +\ + vec_zero(sum, sizeof(*sum)); \ + while (npoints) { \ + size_t i, j = npoints > stride ? stride : npoints; \ + for (i=0; i> (8 * (n % sizeof(limb_t)))); + } +} + +static inline void limbs_from_le_bytes(limb_t *restrict ret, + const unsigned char *in, size_t n) +{ + limb_t limb = 0; + + while(n--) { + limb <<= 8; + limb |= in[n]; + /* + * 'if (n % sizeof(limb_t) == 0)' is omitted because it's cheaper + * to perform redundant stores than to pay penalty for + * mispredicted branch. Besides, some compilers unroll the + * loop and remove redundant stores to 'restrict'-ed storage... 
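As a concrete check of the little-endian byte/limb packing above (including the redundant-store idiom this comment describes), here is a stand-alone round trip with limb_t pinned to uint64_t; the toy_* names are illustrative, not blst API:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    static void toy_limbs_from_le(uint64_t *ret, const unsigned char *in, size_t n)
    {
        uint64_t limb = 0;
        while (n--) {
            limb <<= 8;
            limb |= in[n];
            ret[n / 8] = limb;     /* redundant stores instead of a branch */
        }
    }

    static void toy_le_from_limbs(unsigned char *out, const uint64_t *in, size_t n)
    {
        for (size_t i = 0; i < n; i++)
            out[i] = (unsigned char)(in[i / 8] >> (8 * (i % 8)));
    }

    int main(void)
    {
        unsigned char msg[16], back[16];
        uint64_t limbs[2];

        for (int i = 0; i < 16; i++)
            msg[i] = (unsigned char)(i + 1);
        toy_limbs_from_le(limbs, msg, sizeof(msg));
        toy_le_from_limbs(back, limbs, sizeof(back));
        assert(memcmp(msg, back, sizeof(msg)) == 0);
        return 0;
    }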
+ */ + ret[n / sizeof(limb_t)] = limb; + } +} + +static inline void le_bytes_from_limbs(unsigned char *out, const limb_t *in, + size_t n) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + limb_t limb; + size_t i, j, r; + + if ((uptr_t)out == (uptr_t)in && is_endian.little) + return; + + r = n % sizeof(limb_t); + n /= sizeof(limb_t); + + for(i = 0; i < n; i++) { + for (limb = in[i], j = 0; j < sizeof(limb_t); j++, limb >>= 8) + *out++ = (unsigned char)limb; + } + if (r) { + for (limb = in[i], j = 0; j < r; j++, limb >>= 8) + *out++ = (unsigned char)limb; + } +} + +static inline char hex_from_nibble(unsigned char nibble) +{ + int mask = (9 - (nibble &= 0xf)) >> 31; + return (char)(nibble + ((('a'-10) & mask) | ('0' & ~mask))); +} + +static unsigned char nibble_from_hex(char c) +{ + int mask, ret; + + mask = (('a'-c-1) & (c-1-'f')) >> 31; + ret = (10 + c - 'a') & mask; + mask = (('A'-c-1) & (c-1-'F')) >> 31; + ret |= (10 + c - 'A') & mask; + mask = (('0'-c-1) & (c-1-'9')) >> 31; + ret |= (c - '0') & mask; + mask = ((ret-1) & ~mask) >> 31; + ret |= 16 & mask; + + return (unsigned char)ret; +} + +static void bytes_from_hexascii(unsigned char *ret, size_t sz, const char *hex) +{ + size_t len; + unsigned char b = 0; + + if (hex[0]=='0' && (hex[1]=='x' || hex[1]=='X')) + hex += 2; + + for (len = 0; len<2*sz && nibble_from_hex(hex[len])<16; len++) ; + + bytes_zero(ret, sz); + + while(len--) { + b <<= 4; + b |= nibble_from_hex(*hex++); + if (len % 2 == 0) + ret[len / 2] = b; + } +} + +static void limbs_from_hexascii(limb_t *ret, size_t sz, const char *hex) +{ + size_t len; + limb_t limb = 0; + + if (hex[0]=='0' && (hex[1]=='x' || hex[1]=='X')) + hex += 2; + + for (len = 0; len<2*sz && nibble_from_hex(hex[len])<16; len++) ; + + vec_zero(ret, sz); + + while(len--) { + limb <<= 4; + limb |= nibble_from_hex(*hex++); + if (len % (2*sizeof(limb_t)) == 0) + ret[len / (2*sizeof(limb_t))] = limb; + } +} + +#endif diff --git a/crypto/blst_src/client_min_pk.c b/crypto/blst_src/client_min_pk.c new file mode 100644 index 00000000000..0fcf563f502 --- /dev/null +++ b/crypto/blst_src/client_min_pk.c @@ -0,0 +1,17 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "e2.c" +#include "hash_to_field.c" +#include "map_to_g2.c" +#include "e1.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/crypto/blst_src/client_min_sig.c b/crypto/blst_src/client_min_sig.c new file mode 100644 index 00000000000..8e4663daede --- /dev/null +++ b/crypto/blst_src/client_min_sig.c @@ -0,0 +1,17 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "keygen.c" +#include "e1.c" +#include "hash_to_field.c" +#include "map_to_g1.c" +#include "e2.c" +#include "exp.c" +#include "sqrt.c" +#include "recip.c" +#include "consts.c" +#include "vect.c" +#include "exports.c" diff --git a/crypto/blst_src/consts.c b/crypto/blst_src/consts.c new file mode 100644 index 00000000000..021c878a258 --- /dev/null +++ b/crypto/blst_src/consts.c @@ -0,0 +1,36 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
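The nibble_from_hex decoder above is branchless: each range test is turned into an all-ones or all-zero mask by sign extension, and non-hex input deliberately decodes to 16 so the hexascii parsers can treat it as a terminator. A quick self-check against an ordinary branching decoder (toy code; like the original it assumes arithmetic right shift of negative ints):

    #include <assert.h>

    static unsigned char toy_nibble_from_hex(char c)
    {
        int mask, ret;

        mask = (('a'-c-1) & (c-1-'f')) >> 31;
        ret  = (10 + c - 'a') & mask;
        mask = (('A'-c-1) & (c-1-'F')) >> 31;
        ret |= (10 + c - 'A') & mask;
        mask = (('0'-c-1) & (c-1-'9')) >> 31;
        ret |= (c - '0') & mask;
        mask = ((ret-1) & ~mask) >> 31;    /* no range matched: force 16 */
        ret |= 16 & mask;

        return (unsigned char)ret;
    }

    static int reference_nibble(char c)
    {
        if (c >= '0' && c <= '9') return c - '0';
        if (c >= 'a' && c <= 'f') return c - 'a' + 10;
        if (c >= 'A' && c <= 'F') return c - 'A' + 10;
        return 16;
    }

    int main(void)
    {
        for (int c = 1; c < 128; c++)
            assert(toy_nibble_from_hex((char)c) == reference_nibble((char)c));
        return 0;
    }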
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" + +/* z = -0xd201000000010000 */ +const vec384 BLS12_381_P = { /* (z-1)^2 * (z^4 - z^2 + 1)/3 + z */ + TO_LIMB_T(0xb9feffffffffaaab), TO_LIMB_T(0x1eabfffeb153ffff), + TO_LIMB_T(0x6730d2a0f6b0f624), TO_LIMB_T(0x64774b84f38512bf), + TO_LIMB_T(0x4b1ba7b6434bacd7), TO_LIMB_T(0x1a0111ea397fe69a) +}; +const limb_t BLS12_381_p0 = (limb_t)0x89f3fffcfffcfffd; /* -1/P */ + +const radix384 BLS12_381_Rx = { /* (1<<384)%P, "radix", one-in-Montgomery */ + { { ONE_MONT_P }, + { 0 } } +}; + +const vec384 BLS12_381_RR = { /* (1<<768)%P, "radix"^2, to-Montgomery */ + TO_LIMB_T(0xf4df1f341c341746), TO_LIMB_T(0x0a76e6a609d104f1), + TO_LIMB_T(0x8de5476c4c95b6d5), TO_LIMB_T(0x67eb88a9939d83c0), + TO_LIMB_T(0x9a793e85b519952d), TO_LIMB_T(0x11988fe592cae3aa) +}; + +const vec256 BLS12_381_r = { /* z^4 - z^2 + 1, group order */ + TO_LIMB_T(0xffffffff00000001), TO_LIMB_T(0x53bda402fffe5bfe), + TO_LIMB_T(0x3339d80809a1d805), TO_LIMB_T(0x73eda753299d7d48) +}; + +const vec256 BLS12_381_rRR = { /* (1<<512)%r, "radix"^2, to-Montgomery */ + TO_LIMB_T(0xc999e990f3f29c6d), TO_LIMB_T(0x2b6cedcb87925c23), + TO_LIMB_T(0x05d314967254398f), TO_LIMB_T(0x0748d9d99f59ff11) +}; diff --git a/crypto/blst_src/consts.h b/crypto/blst_src/consts.h new file mode 100644 index 00000000000..cb391b817df --- /dev/null +++ b/crypto/blst_src/consts.h @@ -0,0 +1,30 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_CONST_H__ +#define __BLS12_381_ASM_CONST_H__ +#include "vect.h" + +extern const vec384 BLS12_381_P; +extern const limb_t BLS12_381_p0; +static const limb_t p0 = (limb_t)0x89f3fffcfffcfffd; /* -1/P */ +typedef union { vec384 p12[12]; vec384x p2; vec384 p; } radix384; +extern const radix384 BLS12_381_Rx; /* (1<<384)%P, "radix", one-in-Montgomery */ +extern const vec384 BLS12_381_RR; /* (1<<768)%P, "radix"^2, to-Montgomery */ + +#define ONE_MONT_P TO_LIMB_T(0x760900000002fffd), \ + TO_LIMB_T(0xebf4000bc40c0002), \ + TO_LIMB_T(0x5f48985753c758ba), \ + TO_LIMB_T(0x77ce585370525745), \ + TO_LIMB_T(0x5c071a97a256ec6d), \ + TO_LIMB_T(0x15f65ec3fa80e493) + +#define ZERO_384 (BLS12_381_Rx.p2[1]) + +extern const vec256 BLS12_381_r; /* order */ +static const limb_t r0 = (limb_t)0xfffffffeffffffff; /* -1/r */ +extern const vec256 BLS12_381_rRR; /* (1<<512)%r, "radix"^2, to-Montgomery */ + +#endif diff --git a/crypto/blst_src/cpuid.c b/crypto/blst_src/cpuid.c new file mode 100644 index 00000000000..43b9229d341 --- /dev/null +++ b/crypto/blst_src/cpuid.c @@ -0,0 +1,85 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
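The constants just defined are the standard Montgomery ingredients: BLS12_381_p0 is -1/P modulo 2^64 (the reduction multiplier), BLS12_381_Rx holds R mod P (one in Montgomery form) and BLS12_381_RR holds R^2 mod P (multiply by it to enter the Montgomery domain). A single-limb sketch of how the three interact, over a toy 61-bit prime with R = 2^64 (neg_inv64 and mont_mul are illustrative names, not blst functions; unsigned __int128 assumes GCC/Clang):

    #include <assert.h>
    #include <stdint.h>

    #define TOY_N ((uint64_t)0x1FFFFFFFFFFFFFFF)   /* 2^61 - 1, odd prime */

    static uint64_t neg_inv64(uint64_t n)           /* -1/n mod 2^64, n odd */
    {
        uint64_t x = n;                             /* inverse mod 2^3 */
        for (int i = 0; i < 5; i++)
            x *= 2 - n * x;                         /* Newton step doubles the precision */
        return (uint64_t)0 - x;
    }

    /* Montgomery product a*b/2^64 mod n, a single-limb analogue of the
     * 384-bit Montgomery multiplication used throughout these files. */
    static uint64_t mont_mul(uint64_t a, uint64_t b, uint64_t n, uint64_t n0)
    {
        unsigned __int128 t = (unsigned __int128)a * b;
        uint64_t m = (uint64_t)t * n0;              /* m = -t/n mod 2^64        */
        t += (unsigned __int128)m * n;              /* low 64 bits are now zero */
        uint64_t r = (uint64_t)(t >> 64);
        return r >= n ? r - n : r;
    }

    int main(void)
    {
        uint64_t n0  = neg_inv64(TOY_N);                                  /* plays the role of p0 */
        uint64_t one = (uint64_t)(((unsigned __int128)1 << 64) % TOY_N);  /* R   mod n: "Rx" */
        uint64_t rr  = (uint64_t)(((unsigned __int128)one * one) % TOY_N);/* R^2 mod n: "RR" */

        uint64_t a = 123456789, b = 987654321;
        uint64_t am = mont_mul(a, rr, TOY_N, n0);   /* to Montgomery:   a*R        */
        uint64_t bm = mont_mul(b, rr, TOY_N, n0);   /* to Montgomery:   b*R        */
        uint64_t cm = mont_mul(am, bm, TOY_N, n0);  /* product stays in the domain */
        uint64_t c  = mont_mul(cm, 1, TOY_N, n0);   /* from Montgomery: a*b mod n  */

        assert(c == (uint64_t)(((unsigned __int128)a * b) % TOY_N));
        return 0;
    }

Multiplying by RR enters the domain and multiplying by 1 leaves it; that is exactly the role BLS12_381_RR and the Montgomery-form one play for the 384-bit field, with the multi-limb version of the product supplied elsewhere in this source tree.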
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#if (defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_C)) && !defined(_WIN32) +__attribute__((visibility("hidden"))) +#endif +int __blst_platform_cap = 0; + +#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) + +# if defined(__GNUC__) || defined(__clang__) || defined(__SUNPRO_C) +static void __cpuidex(int info[4], int func, int sub) +{ + int eax, ebx, ecx, edx; + + __asm__("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx) + : "a"(func), "c"(sub)); + + info[0] = eax; + info[1] = ebx; + info[2] = ecx; + info[3] = edx; +} +# else +# include +# endif + +# if defined(__GNUC__) || defined(__clang__) +__attribute__((constructor)) +# endif +static int __blst_cpuid(void) +{ + int info[4], cap = 0; + + __cpuidex(info, 0, 0); + if (info[0] > 6) { + __cpuidex(info, 7, 0); + cap |= (info[1]>>19) & 1; /* ADX */ + cap |= (info[1]>>28) & 2; /* SHA */ + } + + __blst_platform_cap = cap; + + return 0; +} + +# if defined(_MSC_VER) && !defined(__clang__) +# pragma section(".CRT$XCU",read) +__declspec(allocate(".CRT$XCU")) static int (*p)(void) = __blst_cpuid; +# elif defined(__SUNPRO_C) +# pragma init(__blst_cpuid) +# endif + +#elif defined(__aarch64__) || defined(__aarch64) + +# if defined(__linux__) && (defined(__GNUC__) || defined(__clang__)) +extern unsigned long getauxval(unsigned long type) __attribute__ ((weak)); + +__attribute__((constructor)) +static int __blst_cpuid(void) +{ + int cap = 0; + + if (getauxval) { + unsigned long hwcap_ce = getauxval(16); + cap = (hwcap_ce>>6) & 1; /* SHA256 */ + } + + __blst_platform_cap = cap; + + return 0; +} +# elif defined(__APPLE__) && (defined(__GNUC__) || defined(__clang__)) +__attribute__((constructor)) +static int __blst_cpuid() +{ + __blst_platform_cap = 1; /* SHA256 */ + return 0; +} +# endif + +#endif diff --git a/crypto/blst_src/e1.c b/crypto/blst_src/e1.c new file mode 100644 index 00000000000..f8a7be7bc14 --- /dev/null +++ b/crypto/blst_src/e1.c @@ -0,0 +1,564 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
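__blst_cpuid above packs two CPUID feature bits (ADX and the SHA extensions, both from leaf 7, sub-leaf 0) into __blst_platform_cap at load time. The same query can be issued through the compiler-provided <cpuid.h> helper; a stand-alone sketch, assuming GCC or Clang on x86-64 (not part of blst):

    #include <stdio.h>
    #include <cpuid.h>      /* GCC/Clang wrapper around the CPUID instruction */

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* Leaf 7, sub-leaf 0: EBX bit 19 reports ADX, EBX bit 29 reports SHA,
         * the same two capabilities __blst_cpuid records. */
        if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
            printf("ADX: %s\n", (ebx >> 19) & 1 ? "yes" : "no");
            printf("SHA: %s\n", (ebx >> 29) & 1 ? "yes" : "no");
        }
        return 0;
    }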
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" +#include "errors.h" + +/* + * y^2 = x^3 + B + */ +static const vec384 B_E1 = { /* (4 << 384) % P */ + TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) +}; + +const POINTonE1 BLS12_381_G1 = { /* generator point [in Montgomery] */ + /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905 + * a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */ + { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5), + TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747), + TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) }, + /* (0x08b3f481e3aaa0f1a09e30ed741d8ae4fcf5e095d5d00af6 + * 00db18cb2c04b3edd03cc744a2888ae40caa232946c5e7e1 << 384) % P */ + { TO_LIMB_T(0xbaac93d50ce72271), TO_LIMB_T(0x8c22631a7918fd8e), + TO_LIMB_T(0xdd595f13570725ce), TO_LIMB_T(0x51ac582950405194), + TO_LIMB_T(0x0e1c8c3fad0059c0), TO_LIMB_T(0x0bbc3efc5008a26a) }, + { ONE_MONT_P } +}; + +const POINTonE1 BLS12_381_NEG_G1 = { /* negative generator [in Montgomery] */ + /* (0x17f1d3a73197d7942695638c4fa9ac0fc3688c4f9774b905 + * a14e3a3f171bac586c55e83ff97a1aeffb3af00adb22c6bb << 384) % P */ + { TO_LIMB_T(0x5cb38790fd530c16), TO_LIMB_T(0x7817fc679976fff5), + TO_LIMB_T(0x154f95c7143ba1c1), TO_LIMB_T(0xf0ae6acdf3d0e747), + TO_LIMB_T(0xedce6ecc21dbf440), TO_LIMB_T(0x120177419e0bfb75) }, + /* (0x114d1d6855d545a8aa7d76c8cf2e21f267816aef1db507c9 + * 6655b9d5caac42364e6f38ba0ecb751bad54dcd6b939c2ca << 384) % P */ + { TO_LIMB_T(0xff526c2af318883a), TO_LIMB_T(0x92899ce4383b0270), + TO_LIMB_T(0x89d7738d9fa9d055), TO_LIMB_T(0x12caf35ba344c12a), + TO_LIMB_T(0x3cff1b76964b5317), TO_LIMB_T(0x0e44d2ede9774430) }, + { ONE_MONT_P } +}; + +static inline void mul_by_b_onE1(vec384 out, const vec384 in) +{ lshift_fp(out, in, 2); } + +static inline void mul_by_4b_onE1(vec384 out, const vec384 in) +{ lshift_fp(out, in, 4); } + +static void POINTonE1_cneg(POINTonE1 *p, bool_t cbit) +{ cneg_fp(p->Y, p->Y, cbit); } + +void blst_p1_cneg(POINTonE1 *a, int cbit) +{ POINTonE1_cneg(a, is_zero(cbit) ^ 1); } + +static void POINTonE1_from_Jacobian(POINTonE1 *out, const POINTonE1 *in) +{ + vec384 Z, ZZ; + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + reciprocal_fp(Z, in->Z); /* 1/Z */ + + sqr_fp(ZZ, Z); + mul_fp(out->X, in->X, ZZ); /* X = X/Z^2 */ + + mul_fp(ZZ, ZZ, Z); + mul_fp(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, in->Z, BLS12_381_G1.Z, + sizeof(BLS12_381_G1.Z), inf); /* Z = inf ? 
0 : 1 */ +} + +void blst_p1_from_jacobian(POINTonE1 *out, const POINTonE1 *a) +{ POINTonE1_from_Jacobian(out, a); } + +static void POINTonE1_to_affine(POINTonE1_affine *out, const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + vec_copy(out, in, sizeof(*out)); +} + +void blst_p1_to_affine(POINTonE1_affine *out, const POINTonE1 *a) +{ POINTonE1_to_affine(out, a); } + +void blst_p1_from_affine(POINTonE1 *out, const POINTonE1_affine *a) +{ + vec_copy(out, a, sizeof(*a)); + vec_select(out->Z, a->X, BLS12_381_Rx.p, sizeof(out->Z), + vec_is_zero(a, sizeof(*a))); +} + +static bool_t POINTonE1_affine_on_curve(const POINTonE1_affine *p) +{ + vec384 XXX, YY; + + sqr_fp(XXX, p->X); + mul_fp(XXX, XXX, p->X); /* X^3 */ + add_fp(XXX, XXX, B_E1); /* X^3 + B */ + + sqr_fp(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)); +} + +int blst_p1_affine_on_curve(const POINTonE1_affine *p) +{ return (int)(POINTonE1_affine_on_curve(p) | vec_is_zero(p, sizeof(*p))); } + +static bool_t POINTonE1_on_curve(const POINTonE1 *p) +{ + vec384 XXX, YY, BZ6; + limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); + + sqr_fp(BZ6, p->Z); + mul_fp(BZ6, BZ6, p->Z); + sqr_fp(BZ6, BZ6); /* Z^6 */ + mul_by_b_onE1(BZ6, BZ6); /* B*Z^6 */ + + sqr_fp(XXX, p->X); + mul_fp(XXX, XXX, p->X); /* X^3 */ + add_fp(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ + + sqr_fp(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; +} + +int blst_p1_on_curve(const POINTonE1 *p) +{ return (int)POINTonE1_on_curve(p); } + +static limb_t POINTonE1_affine_Serialize_BE(unsigned char out[96], + const POINTonE1_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X); + be_bytes_from_limbs(out, temp, sizeof(temp)); + + from_fp(temp, in->Y); + be_bytes_from_limbs(out + 48, temp, sizeof(temp)); + + return sgn0_pty_mod_384(temp, BLS12_381_P); +} + +void blst_p1_affine_serialize(unsigned char out[96], + const POINTonE1_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 96); + out[0] = 0x40; /* infinity bit */ + } else { + (void)POINTonE1_affine_Serialize_BE(out, in); + } +} + +static limb_t POINTonE1_Serialize_BE(unsigned char out[96], + const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE1_affine_Serialize_BE(out, (const POINTonE1_affine *)in); +} + +static void POINTonE1_Serialize(unsigned char out[96], const POINTonE1 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 96); + out[0] = 0x40; /* infinity bit */ + } else { + (void)POINTonE1_Serialize_BE(out, in); + } +} + +void blst_p1_serialize(unsigned char out[96], const POINTonE1 *in) +{ POINTonE1_Serialize(out, in); } + +static limb_t POINTonE1_affine_Compress_BE(unsigned char out[48], + const POINTonE1_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X); + be_bytes_from_limbs(out, temp, sizeof(temp)); + + return sgn0_pty_mont_384(in->Y, BLS12_381_P, p0); +} + +void blst_p1_affine_compress(unsigned char out[48], const POINTonE1_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 48); + out[0] = 0xc0; /* compressed and infinity bits */ + } else { + limb_t sign = POINTonE1_affine_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE1_Compress_BE(unsigned char out[48], + const POINTonE1 *in) +{ + POINTonE1 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, 
sizeof(in->Z))) { + POINTonE1_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE1_affine_Compress_BE(out, (const POINTonE1_affine *)in); +} + +void blst_p1_compress(unsigned char out[48], const POINTonE1 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 48); + out[0] = 0xc0; /* compressed and infinity bits */ + } else { + limb_t sign = POINTonE1_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE1_Uncompress_BE(POINTonE1_affine *out, + const unsigned char in[48]) +{ + POINTonE1_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X, in, sizeof(ret.X)); + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X, sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + mul_fp(ret.X, ret.X, BLS12_381_RR); + + sqr_fp(ret.Y, ret.X); + mul_fp(ret.Y, ret.Y, ret.X); + add_fp(ret.Y, ret.Y, B_E1); /* X^3 + B */ + if (!sqrt_fp(ret.Y, ret.Y)) + return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return sgn0_pty_mont_384(out->Y, BLS12_381_P, p0); +} + +static BLST_ERROR POINTonE1_Uncompress_Z(POINTonE1_affine *out, + const unsigned char in[48]) +{ + unsigned char in0 = in[0]; + limb_t sgn0_pty; + + if ((in0 & 0x80) == 0) /* compressed bit */ + return BLST_BAD_ENCODING; + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 47)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } else { + return BLST_BAD_ENCODING; + } + } + + sgn0_pty = POINTonE1_Uncompress_BE(out, in); + + if (sgn0_pty > 3) + return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ + + sgn0_pty >>= 1; /* skip over parity bit */ + sgn0_pty ^= (in0 & 0x20) >> 5; + cneg_fp(out->Y, out->Y, sgn0_pty); + + /* (0,±2) is not in group, but application might want to ignore? */ + return vec_is_zero(out->X, sizeof(out->X)) ? BLST_POINT_NOT_IN_GROUP + : BLST_SUCCESS; +} + +BLST_ERROR blst_p1_uncompress(POINTonE1_affine *out, const unsigned char in[48]) +{ return POINTonE1_Uncompress_Z(out, in); } + +static BLST_ERROR POINTonE1_Deserialize_BE(POINTonE1_affine *out, + const unsigned char in[96]) +{ + POINTonE1_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X, in, sizeof(ret.X)); + limbs_from_be_bytes(ret.Y, in + 48, sizeof(ret.Y)); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[sizeof(ret.X)/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X, sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y, ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.Y, sizeof(temp))) + return BLST_BAD_ENCODING; + + mul_fp(ret.X, ret.X, BLS12_381_RR); + mul_fp(ret.Y, ret.Y, BLS12_381_RR); + + if (!POINTonE1_affine_on_curve(&ret)) + return BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + /* (0,±2) is not in group, but application might want to ignore? */ + return vec_is_zero(out->X, sizeof(out->X)) ? 
BLST_POINT_NOT_IN_GROUP + : BLST_SUCCESS; +} + +static BLST_ERROR POINTonE1_Deserialize_Z(POINTonE1_affine *out, + const unsigned char in[96]) +{ + unsigned char in0 = in[0]; + + if ((in0 & 0xe0) == 0) + return POINTonE1_Deserialize_BE(out, in); + + if (in0 & 0x80) /* compressed bit */ + return POINTonE1_Uncompress_Z(out, in); + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + } + + return BLST_BAD_ENCODING; +} + +BLST_ERROR blst_p1_deserialize(POINTonE1_affine *out, + const unsigned char in[96]) +{ return POINTonE1_Deserialize_Z(out, in); } + +#include "ec_ops.h" +POINT_DADD_IMPL(POINTonE1, 384, fp) +POINT_DADD_AFFINE_IMPL_A0(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINT_ADD_IMPL(POINTonE1, 384, fp) +POINT_ADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINT_DOUBLE_IMPL_A0(POINTonE1, 384, fp) +POINT_IS_EQUAL_IMPL(POINTonE1, 384, fp) + +void blst_p1_add(POINTonE1 *out, const POINTonE1 *a, const POINTonE1 *b) +{ POINTonE1_add(out, a, b); } + +void blst_p1_add_or_double(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1 *b) +{ POINTonE1_dadd(out, a, b, NULL); } + +void blst_p1_add_affine(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1_affine *b) +{ POINTonE1_add_affine(out, a, b); } + +void blst_p1_add_or_double_affine(POINTonE1 *out, const POINTonE1 *a, + const POINTonE1_affine *b) +{ POINTonE1_dadd_affine(out, a, b); } + +void blst_p1_double(POINTonE1 *out, const POINTonE1 *a) +{ POINTonE1_double(out, a); } + +int blst_p1_is_equal(const POINTonE1 *a, const POINTonE1 *b) +{ return (int)POINTonE1_is_equal(a, b); } + +#include "ec_mult.h" +POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 4) +POINT_MULT_SCALAR_WX_IMPL(POINTonE1, 5) + +#ifdef __BLST_PRIVATE_TESTMODE__ +POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE1) + +DECLARE_PRIVATE_POINTXZ(POINTonE1, 384) +POINT_LADDER_PRE_IMPL(POINTonE1, 384, fp) +POINT_LADDER_STEP_IMPL_A0(POINTonE1, 384, fp, onE1) +POINT_LADDER_POST_IMPL_A0(POINTonE1, 384, fp, onE1) +POINT_MULT_SCALAR_LADDER_IMPL(POINTonE1) +#endif + +static const vec384 beta = { /* such that beta^3 - 1 = 0 */ + /* -1/2 * (1 + sqrt(-3)) = ((P-2)^(P-2)) * (1 + (P-3)^((P+1)/4)) */ + /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 + 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaac << 384) % P */ + TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) +}; + +static void sigma(POINTonE1 *out, const POINTonE1 *in) +{ + vec_copy(out->X, in->X, 2*sizeof(out->X)); + mul_fp(out->Z, in->Z, beta); +} + +/* Gallant-Lambert-Vanstone, ~45% faster than POINTonE1_mult_w5 */ +static void POINTonE1_mult_glv(POINTonE1 *out, const POINTonE1 *in, + const pow256 SK) +{ + union { vec256 l; pow256 s; } val; + + /* SK/z^2 [in constant time] */ + + limbs_from_le_bytes(val.l, SK, 32); + div_by_zz(val.l); + le_bytes_from_limbs(val.s, val.l, 32); + + { + const byte *scalars[2] = { val.s+16, val.s }; + POINTonE1 table[2][1<<(5-1)]; /* 4.5KB */ + size_t i; + + POINTonE1_precompute_w5(table[0], in); + for (i = 0; i < 1<<(5-1); i++) { + mul_fp(table[1][i].X, table[0][i].X, beta); + cneg_fp(table[1][i].Y, table[0][i].Y, 1); + vec_copy(table[1][i].Z, table[0][i].Z, sizeof(table[1][i].Z)); + } + + POINTonE1s_mult_w5(out, NULL, 2, scalars, 128, table); + POINTonE1_cneg(out, 1); + mul_fp(out->Z, out->Z, beta); + mul_fp(out->Z, out->Z, beta); + } + + vec_zero(val.l, sizeof(val)); /* 
scrub the copy of SK */ +} + +static void POINTonE1_sign(POINTonE1 *out, const POINTonE1 *in, const pow256 SK) +{ + vec384 Z, ZZ; + limb_t inf; + + POINTonE1_mult_glv(out, in, SK); + + /* convert to affine to remove possible bias in out->Z */ + inf = vec_is_zero(out->Z, sizeof(out->Z)); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + flt_reciprocal_fp(Z, out->Z); /* 1/Z */ +#else + reciprocal_fp(Z, out->Z); /* 1/Z */ +#endif + + sqr_fp(ZZ, Z); + mul_fp(out->X, out->X, ZZ); /* X = X/Z^2 */ + + mul_fp(ZZ, ZZ, Z); + mul_fp(out->Y, out->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, out->Z, BLS12_381_G1.Z, sizeof(BLS12_381_G1.Z), + inf); /* Z = inf ? 0 : 1 */ +} + +void blst_sk_to_pk_in_g1(POINTonE1 *out, const pow256 SK) +{ POINTonE1_sign(out, &BLS12_381_G1, SK); } + +void blst_sign_pk_in_g2(POINTonE1 *out, const POINTonE1 *msg, const pow256 SK) +{ POINTonE1_sign(out, msg, SK); } + +void blst_sk_to_pk2_in_g1(unsigned char out[96], POINTonE1_affine *PK, + const pow256 SK) +{ + POINTonE1 P[1]; + + POINTonE1_sign(P, &BLS12_381_G1, SK); + if (PK != NULL) + vec_copy(PK, P, sizeof(*PK)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_sign_pk2_in_g2(unsigned char out[96], POINTonE1_affine *sig, + const POINTonE1 *hash, const pow256 SK) +{ + POINTonE1 P[1]; + + POINTonE1_sign(P, hash, SK); + if (sig != NULL) + vec_copy(sig, P, sizeof(*sig)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE1_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_p1_mult(POINTonE1 *out, const POINTonE1 *a, + const byte *scalar, size_t nbits) +{ + if (nbits < 176) { + if (nbits) + POINTonE1_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); + } else if (nbits <= 256) { + union { vec256 l; pow256 s; } val; + size_t i, j, top, mask = (size_t)0 - 1; + + /* this is not about constant-time-ness, but branch optimization */ + for (top = (nbits + 7)/8, i=0, j=0; i> (8*sizeof(top)-1)); + j += 1 & mask; + } + + if (check_mod_256(val.s, BLS12_381_r)) /* z^4 is the formal limit */ + POINTonE1_mult_glv(out, a, val.s); + else /* should never be the case, added for formal completeness */ + POINTonE1_mult_w5(out, a, scalar, nbits); + + vec_zero(val.l, sizeof(val)); + } else { /* should never be the case, added for formal completeness */ + POINTonE1_mult_w5(out, a, scalar, nbits); + } +} + +void blst_p1_unchecked_mult(POINTonE1 *out, const POINTonE1 *a, + const byte *scalar, size_t nbits) +{ + if (nbits) + POINTonE1_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); +} + +int blst_p1_affine_is_equal(const POINTonE1_affine *a, + const POINTonE1_affine *b) +{ return (int)vec_is_equal(a, b, sizeof(*a)); } + +int blst_p1_is_inf(const POINTonE1 *p) +{ return (int)vec_is_zero(p->Z, sizeof(p->Z)); } + +const POINTonE1 *blst_p1_generator(void) +{ return &BLS12_381_G1; } + +int blst_p1_affine_is_inf(const POINTonE1_affine *p) +{ return (int)vec_is_zero(p, sizeof(*p)); } + +const POINTonE1_affine *blst_p1_affine_generator(void) +{ return (const POINTonE1_affine *)&BLS12_381_G1; } + +size_t blst_p1_sizeof(void) +{ return sizeof(POINTonE1); } + +size_t blst_p1_affine_sizeof(void) +{ return sizeof(POINTonE1_affine); } diff --git a/crypto/blst_src/e2.c b/crypto/blst_src/e2.c new file mode 100644 index 00000000000..77f8064bce2 --- /dev/null +++ b/crypto/blst_src/e2.c @@ -0,0 
+1,638 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" +#include "errors.h" + +/* + * y^2 = x^3 + B + */ +static const vec384x B_E2 = { /* 4 + 4*i */ + { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) }, + { TO_LIMB_T(0xaa270000000cfff3), TO_LIMB_T(0x53cc0032fc34000a), + TO_LIMB_T(0x478fe97a6b0a807f), TO_LIMB_T(0xb1d37ebee6ba24d7), + TO_LIMB_T(0x8ec9733bbf78ab2f), TO_LIMB_T(0x09d645513d83de7e) } +}; + +const POINTonE2 BLS12_381_G2 = { /* generator point [in Montgomery] */ +{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 + b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ + { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), + TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), + TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, + /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a + b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ + { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), + TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), + TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } +}, +{ /* (0x0ce5d527727d6e118cc9cdc6da2e351aadfd9baa8cbdd3a7 + 6d429a695160d12c923ac9cc3baca289e193548608b82801 << 384) % P */ + { TO_LIMB_T(0x4c730af860494c4a), TO_LIMB_T(0x597cfa1f5e369c5a), + TO_LIMB_T(0xe7e6856caa0a635a), TO_LIMB_T(0xbbefb5e96e0d495f), + TO_LIMB_T(0x07d3a975f0ef25a2), TO_LIMB_T(0x0083fd8e7e80dae5) }, + /* (0x0606c4a02ea734cc32acd2b02bc28b99cb3e287e85a763af + 267492ab572e99ab3f370d275cec1da1aaa9075ff05f79be << 384) % P */ + { TO_LIMB_T(0xadc0fc92df64b05d), TO_LIMB_T(0x18aa270a2b1461dc), + TO_LIMB_T(0x86adac6a3be4eba0), TO_LIMB_T(0x79495c4ec93da33a), + TO_LIMB_T(0xe7175850a43ccaed), TO_LIMB_T(0x0b2bc2a163de1bf2) }, +}, +{ { ONE_MONT_P }, { 0 } } +}; + +const POINTonE2 BLS12_381_NEG_G2 = { /* negative generator [in Montgomery] */ +{ /* (0x024aa2b2f08f0a91260805272dc51051c6e47ad4fa403b02 + b4510b647ae3d1770bac0326a805bbefd48056c8c121bdb8 << 384) % P */ + { TO_LIMB_T(0xf5f28fa202940a10), TO_LIMB_T(0xb3f5fb2687b4961a), + TO_LIMB_T(0xa1a893b53e2ae580), TO_LIMB_T(0x9894999d1a3caee9), + TO_LIMB_T(0x6f67b7631863366b), TO_LIMB_T(0x058191924350bcd7) }, + /* (0x13e02b6052719f607dacd3a088274f65596bd0d09920b61a + b5da61bbdc7f5049334cf11213945d57e5ac7d055d042b7e << 384) % P */ + { TO_LIMB_T(0xa5a9c0759e23f606), TO_LIMB_T(0xaaa0c59dbccd60c3), + TO_LIMB_T(0x3bb17e18e2867806), TO_LIMB_T(0x1b1ab6cc8541b367), + TO_LIMB_T(0xc2b6ed0ef2158547), TO_LIMB_T(0x11922a097360edf3) } +}, +{ /* (0x0d1b3cc2c7027888be51d9ef691d77bcb679afda66c73f17 + f9ee3837a55024f78c71363275a75d75d86bab79f74782aa << 384) % P */ + { TO_LIMB_T(0x6d8bf5079fb65e61), TO_LIMB_T(0xc52f05df531d63a5), + TO_LIMB_T(0x7f4a4d344ca692c9), TO_LIMB_T(0xa887959b8577c95f), + TO_LIMB_T(0x4347fe40525c8734), TO_LIMB_T(0x197d145bbaff0bb5) }, + /* (0x13fa4d4a0ad8b1ce186ed5061789213d993923066dddaf10 + 40bc3ff59f825c78df74f2d75467e25e0f55f8a00fa030ed << 384) % P */ + { TO_LIMB_T(0x0c3e036d209afa4e), TO_LIMB_T(0x0601d8f4863f9e23), + TO_LIMB_T(0xe0832636bacc0a84), TO_LIMB_T(0xeb2def362a476f84), + TO_LIMB_T(0x64044f659f0ee1e9), TO_LIMB_T(0x0ed54f48d5a1caa7) } +}, +{ { ONE_MONT_P }, { 0 } } +}; + +static void mul_by_b_onE2(vec384x out, const vec384x in) +{ + 
sub_fp(out[0], in[0], in[1]); + add_fp(out[1], in[0], in[1]); + lshift_fp(out[0], out[0], 2); + lshift_fp(out[1], out[1], 2); +} + +static void mul_by_4b_onE2(vec384x out, const vec384x in) +{ + sub_fp(out[0], in[0], in[1]); + add_fp(out[1], in[0], in[1]); + lshift_fp(out[0], out[0], 4); + lshift_fp(out[1], out[1], 4); +} + +static void POINTonE2_cneg(POINTonE2 *p, bool_t cbit) +{ cneg_fp2(p->Y, p->Y, cbit); } + +void blst_p2_cneg(POINTonE2 *a, int cbit) +{ POINTonE2_cneg(a, is_zero(cbit) ^ 1); } + +static void POINTonE2_from_Jacobian(POINTonE2 *out, const POINTonE2 *in) +{ + vec384x Z, ZZ; + limb_t inf = vec_is_zero(in->Z, sizeof(in->Z)); + + reciprocal_fp2(Z, in->Z); /* 1/Z */ + + sqr_fp2(ZZ, Z); + mul_fp2(out->X, in->X, ZZ); /* X = X/Z^2 */ + + mul_fp2(ZZ, ZZ, Z); + mul_fp2(out->Y, in->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, in->Z, BLS12_381_G2.Z, + sizeof(BLS12_381_G2.Z), inf); /* Z = inf ? 0 : 1 */ +} + +void blst_p2_from_jacobian(POINTonE2 *out, const POINTonE2 *a) +{ POINTonE2_from_Jacobian(out, a); } + +static void POINTonE2_to_affine(POINTonE2_affine *out, const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + vec_copy(out, in, sizeof(*out)); +} + +void blst_p2_to_affine(POINTonE2_affine *out, const POINTonE2 *a) +{ POINTonE2_to_affine(out, a); } + +void blst_p2_from_affine(POINTonE2 *out, const POINTonE2_affine *a) +{ + vec_copy(out, a, sizeof(*a)); + vec_select(out->Z, a->X, BLS12_381_Rx.p2, sizeof(out->Z), + vec_is_zero(a, sizeof(*a))); +} + +static bool_t POINTonE2_affine_on_curve(const POINTonE2_affine *p) +{ + vec384x XXX, YY; + + sqr_fp2(XXX, p->X); + mul_fp2(XXX, XXX, p->X); /* X^3 */ + add_fp2(XXX, XXX, B_E2); /* X^3 + B */ + + sqr_fp2(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)); +} + +int blst_p2_affine_on_curve(const POINTonE2_affine *p) +{ return (int)(POINTonE2_affine_on_curve(p) | vec_is_zero(p, sizeof(*p))); } + +static bool_t POINTonE2_on_curve(const POINTonE2 *p) +{ + vec384x XXX, YY, BZ6; + limb_t inf = vec_is_zero(p->Z, sizeof(p->Z)); + + sqr_fp2(BZ6, p->Z); + mul_fp2(BZ6, BZ6, p->Z); + sqr_fp2(XXX, BZ6); /* Z^6 */ + mul_by_b_onE2(BZ6, XXX); /* B*Z^6 */ + + sqr_fp2(XXX, p->X); + mul_fp2(XXX, XXX, p->X); /* X^3 */ + add_fp2(XXX, XXX, BZ6); /* X^3 + B*Z^6 */ + + sqr_fp2(YY, p->Y); /* Y^2 */ + + return vec_is_equal(XXX, YY, sizeof(XXX)) | inf; +} + +int blst_p2_on_curve(const POINTonE2 *p) +{ return (int)POINTonE2_on_curve(p); } + +static limb_t POINTonE2_affine_Serialize_BE(unsigned char out[192], + const POINTonE2_affine *in) +{ + vec384x temp; + + from_fp(temp[1], in->X[1]); + be_bytes_from_limbs(out, temp[1], sizeof(temp[1])); + from_fp(temp[0], in->X[0]); + be_bytes_from_limbs(out + 48, temp[0], sizeof(temp[0])); + + from_fp(temp[1], in->Y[1]); + be_bytes_from_limbs(out + 96, temp[1], sizeof(temp[1])); + from_fp(temp[0], in->Y[0]); + be_bytes_from_limbs(out + 144, temp[0], sizeof(temp[0])); + + return sgn0_pty_mod_384x(temp, BLS12_381_P); +} + +void blst_p2_affine_serialize(unsigned char out[192], + const POINTonE2_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 192); + out[0] = 0x40; /* infinity bit */ + } else { + (void)POINTonE2_affine_Serialize_BE(out, in); + } +} + +static limb_t POINTonE2_Serialize_BE(unsigned char out[192], + const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p2, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + + return 
POINTonE2_affine_Serialize_BE(out, (const POINTonE2_affine *)in); +} + +static void POINTonE2_Serialize(unsigned char out[192], const POINTonE2 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 192); + out[0] = 0x40; /* infinity bit */ + } else { + (void)POINTonE2_Serialize_BE(out, in); + } +} + +void blst_p2_serialize(unsigned char out[192], const POINTonE2 *in) +{ POINTonE2_Serialize(out, in); } + +static limb_t POINTonE2_affine_Compress_BE(unsigned char out[96], + const POINTonE2_affine *in) +{ + vec384 temp; + + from_fp(temp, in->X[1]); + be_bytes_from_limbs(out, temp, sizeof(temp)); + from_fp(temp, in->X[0]); + be_bytes_from_limbs(out + 48, temp, sizeof(temp)); + + return sgn0_pty_mont_384x(in->Y, BLS12_381_P, p0); +} + +void blst_p2_affine_compress(unsigned char out[96], const POINTonE2_affine *in) +{ + if (vec_is_zero(in->X, 2*sizeof(in->X))) { + bytes_zero(out, 96); + out[0] = 0xc0; /* compressed and infinity bits */ + } else { + limb_t sign = POINTonE2_affine_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE2_Compress_BE(unsigned char out[96], + const POINTonE2 *in) +{ + POINTonE2 p; + + if (!vec_is_equal(in->Z, BLS12_381_Rx.p, sizeof(in->Z))) { + POINTonE2_from_Jacobian(&p, in); + in = &p; + } + + return POINTonE2_affine_Compress_BE(out, (const POINTonE2_affine *)in); +} + +void blst_p2_compress(unsigned char out[96], const POINTonE2 *in) +{ + if (vec_is_zero(in->Z, sizeof(in->Z))) { + bytes_zero(out, 96); + out[0] = 0xc0; /* compressed and infinity bits */ + } else { + limb_t sign = POINTonE2_Compress_BE(out, in); + out[0] |= (unsigned char)(0x80 | ((sign & 2) << 4)); + } +} + +static limb_t POINTonE2_Uncompress_BE(POINTonE2_affine *out, + const unsigned char in[96]) +{ + POINTonE2_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1])); + limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0])); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[1], sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + + add_fp(temp, ret.X[0], ZERO_384); /* less than modulus? 
*/ + if (!vec_is_equal(temp, ret.X[0], sizeof(temp))) + return (limb_t)0 - BLST_BAD_ENCODING; + + mul_fp(ret.X[0], ret.X[0], BLS12_381_RR); + mul_fp(ret.X[1], ret.X[1], BLS12_381_RR); + + sqr_fp2(ret.Y, ret.X); + mul_fp2(ret.Y, ret.Y, ret.X); + add_fp2(ret.Y, ret.Y, B_E2); /* X^3 + B */ + if (!sqrt_fp2(ret.Y, ret.Y)) + return (limb_t)0 - BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return sgn0_pty_mont_384x(out->Y, BLS12_381_P, p0); +} + +static BLST_ERROR POINTonE2_Uncompress_Z(POINTonE2_affine *out, + const unsigned char in[96]) +{ + unsigned char in0 = in[0]; + limb_t sgn0_pty; + + if ((in0 & 0x80) == 0) /* compressed bit */ + return BLST_BAD_ENCODING; + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 95)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } else { + return BLST_BAD_ENCODING; + } + } + + sgn0_pty = POINTonE2_Uncompress_BE(out, in); + + if (sgn0_pty > 3) + return (BLST_ERROR)(0 - sgn0_pty); /* POINT_NOT_ON_CURVE */ + + sgn0_pty >>= 1; /* skip over parity bit */ + sgn0_pty ^= (in0 & 0x20) >> 5; + cneg_fp2(out->Y, out->Y, sgn0_pty); + + return BLST_SUCCESS; +} + +BLST_ERROR blst_p2_uncompress(POINTonE2_affine *out, const unsigned char in[96]) +{ return POINTonE2_Uncompress_Z(out, in); } + +static BLST_ERROR POINTonE2_Deserialize_BE(POINTonE2_affine *out, + const unsigned char in[192]) +{ + POINTonE2_affine ret; + vec384 temp; + + limbs_from_be_bytes(ret.X[1], in, sizeof(ret.X[1])); + limbs_from_be_bytes(ret.X[0], in + 48, sizeof(ret.X[0])); + limbs_from_be_bytes(ret.Y[1], in + 96, sizeof(ret.Y[1])); + limbs_from_be_bytes(ret.Y[0], in + 144, sizeof(ret.Y[0])); + + /* clear top 3 bits in case caller was conveying some information there */ + ret.X[1][sizeof(ret.X[1])/sizeof(limb_t)-1] &= ((limb_t)0-1) >> 3; + add_fp(temp, ret.X[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[1], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.X[0], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.X[0], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y[1], ZERO_384); /* less than modulus? */ + if (!vec_is_equal(temp, ret.Y[1], sizeof(temp))) + return BLST_BAD_ENCODING; + + add_fp(temp, ret.Y[0], ZERO_384); /* less than modulus? 
*/ + if (!vec_is_equal(temp, ret.Y[0], sizeof(temp))) + return BLST_BAD_ENCODING; + + mul_fp(ret.X[0], ret.X[0], BLS12_381_RR); + mul_fp(ret.X[1], ret.X[1], BLS12_381_RR); + mul_fp(ret.Y[0], ret.Y[0], BLS12_381_RR); + mul_fp(ret.Y[1], ret.Y[1], BLS12_381_RR); + + if (!POINTonE2_affine_on_curve(&ret)) + return BLST_POINT_NOT_ON_CURVE; + + vec_copy(out, &ret, sizeof(ret)); + + return BLST_SUCCESS; +} + +static BLST_ERROR POINTonE2_Deserialize_Z(POINTonE2_affine *out, + const unsigned char in[192]) +{ + unsigned char in0 = in[0]; + + if ((in0 & 0xe0) == 0) + return POINTonE2_Deserialize_BE(out, in); + + if (in0 & 0x80) /* compressed bit */ + return POINTonE2_Uncompress_Z(out, in); + + if (in0 & 0x40) { /* infinity bit */ + if (byte_is_zero(in0 & 0x3f) & bytes_are_zero(in+1, 191)) { + vec_zero(out, sizeof(*out)); + return BLST_SUCCESS; + } + } + + return BLST_BAD_ENCODING; +} + +BLST_ERROR blst_p2_deserialize(POINTonE2_affine *out, + const unsigned char in[192]) +{ return POINTonE2_Deserialize_Z(out, in); } + +#include "ec_ops.h" +POINT_DADD_IMPL(POINTonE2, 384x, fp2) +POINT_DADD_AFFINE_IMPL_A0(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINT_ADD_IMPL(POINTonE2, 384x, fp2) +POINT_ADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINT_DOUBLE_IMPL_A0(POINTonE2, 384x, fp2) +POINT_IS_EQUAL_IMPL(POINTonE2, 384x, fp2) + +void blst_p2_add(POINTonE2 *out, const POINTonE2 *a, const POINTonE2 *b) +{ POINTonE2_add(out, a, b); } + +void blst_p2_add_or_double(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2 *b) +{ POINTonE2_dadd(out, a, b, NULL); } + +void blst_p2_add_affine(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2_affine *b) +{ POINTonE2_add_affine(out, a, b); } + +void blst_p2_add_or_double_affine(POINTonE2 *out, const POINTonE2 *a, + const POINTonE2_affine *b) +{ POINTonE2_dadd_affine(out, a, b); } + +void blst_p2_double(POINTonE2 *out, const POINTonE2 *a) +{ POINTonE2_double(out, a); } + +int blst_p2_is_equal(const POINTonE2 *a, const POINTonE2 *b) +{ return (int)POINTonE2_is_equal(a, b); } + +#include "ec_mult.h" +POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 4) +POINT_MULT_SCALAR_WX_IMPL(POINTonE2, 5) + +#ifdef __BLST_PRIVATE_TESTMODE__ +POINT_AFFINE_MULT_SCALAR_IMPL(POINTonE2) + +DECLARE_PRIVATE_POINTXZ(POINTonE2, 384x) +POINT_LADDER_PRE_IMPL(POINTonE2, 384x, fp2) +POINT_LADDER_STEP_IMPL_A0(POINTonE2, 384x, fp2, onE2) +POINT_LADDER_POST_IMPL_A0(POINTonE2, 384x, fp2, onE2) +POINT_MULT_SCALAR_LADDER_IMPL(POINTonE2) +#endif + +static void psi(POINTonE2 *out, const POINTonE2 *in) +{ + static const vec384x frobenius_x = { /* 1/(1 + i)^((P-1)/3) */ + { 0 }, + { /* (0x1a0111ea397fe699ec02408663d4de85aa0d857d89759ad4 + 897d29650fb85f9b409427eb4f49fffd8bfd00000000aaad << 384) % P */ + TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), + TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), + TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) } + }; + static const vec384x frobenius_y = { /* 1/(1 + i)^((P-1)/2) */ + { /* (0x135203e60180a68ee2e9c448d77a2cd91c3dedd930b1cf60 + ef396489f61eb45e304466cf3e67fa0af1ee7b04121bdea2 << 384) % P */ + TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { /* (0x06af0e0437ff400b6831e36d6bd17ffe48395dabc2d3435e + 77f76e17009241c5ee67992f72ec05f4c81084fbede3cc09 << 384) % P */ + TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), 
TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }, + }; + + vec_copy(out, in, sizeof(*out)); + cneg_fp(out->X[1], out->X[1], 1); mul_fp2(out->X, out->X, frobenius_x); + cneg_fp(out->Y[1], out->Y[1], 1); mul_fp2(out->Y, out->Y, frobenius_y); + cneg_fp(out->Z[1], out->Z[1], 1); +} + +/* Galbraith-Lin-Scott, ~67% faster than POINTonE2_mul_w5 */ +static void POINTonE2_mult_gls(POINTonE2 *out, const POINTonE2 *in, + const pow256 SK) +{ + union { vec256 l; pow256 s; } val; + + /* break down SK to "digits" with |z| as radix [in constant time] */ + + limbs_from_le_bytes(val.l, SK, 32); + div_by_zz(val.l); + div_by_z(val.l); + div_by_z(val.l + NLIMBS(256)/2); + le_bytes_from_limbs(val.s, val.l, 32); + + { + const byte *scalars[2] = { val.s, NULL }; + POINTonE2 table[4][1<<(5-1)]; /* 18KB */ + size_t i; + + POINTonE2_precompute_w5(table[0], in); + for (i = 0; i < 1<<(5-1); i++) { + psi(&table[1][i], &table[0][i]); + psi(&table[2][i], &table[1][i]); + psi(&table[3][i], &table[2][i]); + POINTonE2_cneg(&table[1][i], 1); /* account for z being negative */ + POINTonE2_cneg(&table[3][i], 1); + } + + POINTonE2s_mult_w5(out, NULL, 4, scalars, 64, table); + } + + vec_zero(val.l, sizeof(val)); /* scrub the copy of SK */ +} + +static void POINTonE2_sign(POINTonE2 *out, const POINTonE2 *in, const pow256 SK) +{ + vec384x Z, ZZ; + limb_t inf; + + POINTonE2_mult_gls(out, in, SK); + + /* convert to affine to remove possible bias in out->Z */ + inf = vec_is_zero(out->Z, sizeof(out->Z)); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + flt_reciprocal_fp2(Z, out->Z); /* 1/Z */ +#else + reciprocal_fp2(Z, out->Z); /* 1/Z */ +#endif + + sqr_fp2(ZZ, Z); + mul_fp2(out->X, out->X, ZZ); /* X = X/Z^2 */ + + mul_fp2(ZZ, ZZ, Z); + mul_fp2(out->Y, out->Y, ZZ); /* Y = Y/Z^3 */ + + vec_select(out->Z, out->Z, BLS12_381_G2.Z, sizeof(BLS12_381_G2.Z), + inf); /* Z = inf ? 
0 : 1 */ +} + +void blst_sk_to_pk_in_g2(POINTonE2 *out, const pow256 SK) +{ POINTonE2_sign(out, &BLS12_381_G2, SK); } + +void blst_sign_pk_in_g1(POINTonE2 *out, const POINTonE2 *msg, const pow256 SK) +{ POINTonE2_sign(out, msg, SK); } + +void blst_sk_to_pk2_in_g2(unsigned char out[192], POINTonE2_affine *PK, + const pow256 SK) +{ + POINTonE2 P[1]; + + POINTonE2_sign(P, &BLS12_381_G2, SK); + if (PK != NULL) + vec_copy(PK, P, sizeof(*PK)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_sign_pk2_in_g1(unsigned char out[192], POINTonE2_affine *sig, + const POINTonE2 *hash, const pow256 SK) +{ + POINTonE2 P[1]; + + POINTonE2_sign(P, hash, SK); + if (sig != NULL) + vec_copy(sig, P, sizeof(*sig)); + if (out != NULL) { + limb_t sgn0_pty = POINTonE2_Serialize_BE(out, P); + out[0] |= (sgn0_pty & 2) << 4; /* pre-decorate */ + out[0] |= vec_is_zero(P->Z, sizeof(P->Z)) << 6; + } +} + +void blst_p2_mult(POINTonE2 *out, const POINTonE2 *a, + const byte *scalar, size_t nbits) +{ + if (nbits < 144) { + if (nbits) + POINTonE2_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); + } else if (nbits <= 256) { + union { vec256 l; pow256 s; } val; + size_t i, j, top, mask = (size_t)0 - 1; + + /* this is not about constant-time-ness, but branch optimization */ + for (top = (nbits + 7)/8, i=0, j=0; i> (8*sizeof(top)-1)); + j += 1 & mask; + } + + if (check_mod_256(val.s, BLS12_381_r)) /* z^4 is the formal limit */ + POINTonE2_mult_gls(out, a, val.s); + else /* should never be the case, added for formal completeness */ + POINTonE2_mult_w5(out, a, scalar, nbits); + + vec_zero(val.l, sizeof(val)); + } else { /* should never be the case, added for formal completeness */ + POINTonE2_mult_w5(out, a, scalar, nbits); + } +} + +void blst_p2_unchecked_mult(POINTonE2 *out, const POINTonE2 *a, + const byte *scalar, size_t nbits) +{ + if (nbits) + POINTonE2_mult_w4(out, a, scalar, nbits); + else + vec_zero(out, sizeof(*out)); +} + +int blst_p2_affine_is_equal(const POINTonE2_affine *a, + const POINTonE2_affine *b) +{ return (int)vec_is_equal(a, b, sizeof(*a)); } + +int blst_p2_is_inf(const POINTonE2 *p) +{ return (int)vec_is_zero(p->Z, sizeof(p->Z)); } + +const POINTonE2 *blst_p2_generator(void) +{ return &BLS12_381_G2; } + +int blst_p2_affine_is_inf(const POINTonE2_affine *p) +{ return (int)vec_is_zero(p, sizeof(*p)); } + +const POINTonE2_affine *blst_p2_affine_generator(void) +{ return (const POINTonE2_affine *)&BLS12_381_G2; } + +size_t blst_p2_sizeof(void) +{ return sizeof(POINTonE2); } + +size_t blst_p2_affine_sizeof(void) +{ return sizeof(POINTonE2_affine); } diff --git a/crypto/blst_src/ec_mult.h b/crypto/blst_src/ec_mult.h new file mode 100644 index 00000000000..3c23489570c --- /dev/null +++ b/crypto/blst_src/ec_mult.h @@ -0,0 +1,290 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_EC_MULT_H__ +#define __BLS12_381_ASM_EC_MULT_H__ + +#include "point.h" + +/* Works up to 9 bits */ +static limb_t get_wval(const byte *d, size_t off, size_t bits) +{ + size_t top = off + bits - 1; + limb_t ret; + + ret = ((limb_t)d[top / 8] << 8) | d[off / 8]; + + return ret >> (off%8); +} + +/* Works up to 25 bits. 
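get_wval above fetches a window of scalar bits that may straddle a byte boundary; the caller then masks the result down to the window width. A bit-at-a-time reference extractor makes the indexing explicit (toy code; ref_wval is an illustrative name, not blst API):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Value of |bits| bits of the little-endian byte string d, starting at
     * bit |off| (bit 0 is the least significant bit of d[0]). */
    static uint64_t ref_wval(const unsigned char *d, size_t off, size_t bits)
    {
        uint64_t v = 0;
        for (size_t i = 0; i < bits; i++) {
            size_t bit = off + i;
            v |= (uint64_t)((d[bit / 8] >> (bit % 8)) & 1) << i;
        }
        return v;
    }

    int main(void)
    {
        const unsigned char scalar[4] = { 0xB4, 0xC3, 0xD2, 0xE1 };  /* LE value 0xE1D2C3B4 */

        /* A 5-bit window starting at bit 6 straddles d[0] and d[1]:
         * bits 6..10 of ...1100001110110100 are 01110, i.e. 14. */
        assert(ref_wval(scalar, 6, 5) == 14);
        return 0;
    }

Masking get_wval(scalar, 6, 5) down to 5 bits yields the same 14; get_wval_limb below does the same job for windows of up to 25 bits.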
*/ +static limb_t get_wval_limb(const byte *d, size_t off, size_t bits) +{ + size_t i, top = (off + bits - 1)/8; + limb_t ret, mask = (limb_t)0 - 1; + + d += off/8; + top -= off/8-1; + + /* this is not about constant-time-ness, but branch optimization */ + for (ret=0, i=0; i<4;) { + ret |= (*d & mask) << (8*i); + mask = (limb_t)0 - ((++i - top) >> (8*sizeof(top)-1)); + d += 1 & mask; + } + + return ret >> (off%8); +} + +/* + * Window value encoding that utilizes the fact that -P is trivially + * calculated, which allows to halve the size of pre-computed table, + * is attributed to A. D. Booth, hence the name of the subroutines... + */ +static limb_t booth_encode(limb_t wval, size_t sz) +{ + limb_t mask = 0 - (wval >> sz); /* "sign" bit -> mask */ + launder(mask); + + wval = (wval + 1) >> 1; + wval = (wval ^ mask) - mask; + + /* &0x1f, but <=0x10, is index in table, rest is extended "sign" bit */ + return wval; +} + +/* + * Key feature of these constant-time subroutines is that they tolerate + * zeros in most significant bit positions of the scalar[s], or in other + * words, zero-padded scalar values. This means that one can and should + * pass order's bit-length, which is customarily publicly known, instead + * of the factual scalars' bit-lengths. This is facilitated by point + * addition subroutines implemented to handle points at infinity, which + * are encoded as Z==0. [Doubling algorithms handle such points at + * infinity "naturally," since resulting Z is product of original Z.] + */ +#define POINT_MULT_SCALAR_WX_IMPL(ptype, SZ) \ +static void ptype##_gather_booth_w##SZ(ptype *restrict p, \ + const ptype table[1<<(SZ-1)], \ + limb_t booth_idx) \ +{ \ + size_t i; \ + bool_t booth_sign = (booth_idx >> SZ) & 1; \ +\ + booth_idx &= (1< 0) \ + wval = get_wval(scalar, bits - 1, window + 1) & wmask; \ + else \ + wval = (scalar[0] << 1) & wmask; \ +\ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(ret, table[0], wval); \ +\ + i = 1; \ + while (bits > 0) { \ + for (; i < npoints; i++) { \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = get_wval(scalar, bits - 1, window + 1) & wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(temp, table[i], wval); \ + ptype##_dadd(ret, ret, temp, NULL); \ + } \ +\ + for (j = 0; j < SZ; j++) \ + ptype##_double(ret, ret); \ +\ + window = SZ; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + bits -= window; \ + i = 0; scalar_s = scalars; \ + } \ +\ + for (; i < npoints; i++) { \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = (scalar[0] << 1) & wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(temp, table[i], wval); \ + ptype##_dadd(ret, ret, temp, NULL); \ + } \ +} \ +\ +static void ptype##_mult_w##SZ(ptype *ret, const ptype *point, \ + const byte *scalar, size_t bits) \ +{ \ + limb_t wmask, wval; \ + size_t j, window; \ + ptype temp[1]; \ + ptype table[1<<(SZ-1)]; \ +\ + ptype##_precompute_w##SZ(table, point); \ +\ + /* top excess bits modulo target window size */ \ + window = bits % SZ; /* yes, it may be zero */ \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ +\ + bits -= window; \ + wval = bits ? get_wval(scalar, bits - 1, window + 1) \ + : (limb_t)scalar[0] << 1; \ + wval &= wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(ret, table, wval); \ +\ + while (bits > 0) { \ + for (j = 0; j < SZ; j++) \ + ptype##_double(ret, ret); \ +\ + window = SZ; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + bits -= window; \ +\ + wval = bits ? 
get_wval(scalar, bits - 1, window + 1) \ + : (limb_t)scalar[0] << 1; \ + wval &= wmask; \ + wval = booth_encode(wval, SZ); \ + ptype##_gather_booth_w##SZ(temp, table, wval); \ + if (bits > 0) ptype##_add(ret, ret, temp); \ + else ptype##_dadd(ret, ret, temp, NULL); \ + } \ +} + +#if 0 +/* ~50%, or ~2x[!] slower than w5... */ +#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \ +static void ptype##_mult_ladder(ptype *ret, const ptype *p, \ + const byte *scalar, size_t bits) \ +{ \ + ptype sum[1]; \ + bool_t bit, pbit = 0; \ +\ + vec_copy(sum, p, sizeof(ptype)); \ + vec_zero(ret, sizeof(ptype)); /* infinity */ \ +\ + while (bits--) { \ + bit = is_bit_set(scalar, bits); \ + bit ^= pbit; \ + ptype##_cswap(ret, sum, bit); \ + ptype##_add(sum, sum, ret); \ + ptype##_double(ret, ret); \ + pbit ^= bit; \ + } \ + ptype##_cswap(ret, sum, pbit); \ +} +#else +/* >40% better performance than above, [and ~30% slower than w5]... */ +#define POINT_MULT_SCALAR_LADDER_IMPL(ptype) \ +static void ptype##_mult_ladder(ptype *out, const ptype *p, \ + const byte *scalar, size_t bits) \ +{ \ + ptype##xz sum[1]; \ + ptype##xz pxz[1]; \ + ptype##xz ret[1]; \ + bool_t bit, pbit = 0; \ +\ + ptype##xz_ladder_pre(pxz, p); \ + vec_copy(sum, pxz, sizeof(ptype##xz)); \ + vec_zero(ret, sizeof(ptype##xz)); /* infinity */ \ +\ + while (bits--) { \ + bit = is_bit_set(scalar, bits); \ + bit ^= pbit; \ + ptype##xz_cswap(ret, sum, bit); \ + ptype##xz_ladder_step(ret, sum, pxz); \ + pbit ^= bit; \ + } \ + ptype##xz_cswap(ret, sum, pbit); \ + ptype##xz_ladder_post(out, ret, sum, pxz, p->Y); \ +} +#endif + +/* + * Sole reason for existence of this implementation is that addition + * with affine point renders a share of multiplications redundant by + * virtue of Z==1. And since pre-defined generator point can be and + * customarily is instantiated affine, it would be hardly appropriate + * to pass on this opportunity. Though while it's faster than the + * generic ladder implementation, by ~25%, it's not faster than XZ one + * above, <15% slower. Just in case, it's faster than generic ladder + * even if one accounts for prior conversion to affine coordinates, + * so that choice [for resource-constrained case] is actually between + * this plus said conversion and XZ ladder... + * + * To summarize, if ptype##_mult_w5 executed in one unit of time, then + * - naive ptype##_mult_ladder would execute in ~2; + * - XZ version above - in ~1.4; + * - ptype##_affine_mult_ladder below - in ~1.65; + * - [small-footprint ptype##_to_affine would run in ~0.18]. + * + * Caveat lector, |p_affine|*(order+2) produces wrong result, because + * addition doesn't handle doubling. Indeed, P*(order+1) is P and it + * fails to add with itself producing infinity in last addition. But + * as long as |scalar| is reduced modulo order, as it should be, it's + * not a problem... 
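The cswap-based ladders above share one control-flow pattern: per scalar bit, a conditional swap followed by a fixed "add" and "double", with the swap driven by the XOR of consecutive bits so that the sequence of operations does not depend on the secret scalar. The pattern is easier to see with modular exponentiation standing in for the group law; a toy sketch (61-bit field, illustrative names, unsigned __int128 assumes GCC/Clang), in which leading zero bits of the scalar are harmless, matching the zero-padding tolerance described earlier in this header:

    #include <assert.h>
    #include <stdint.h>

    #define TOY_P ((uint64_t)0x1FFFFFFFFFFFFFFF)   /* 2^61 - 1 */

    static uint64_t mulmod(uint64_t a, uint64_t b)
    {   return (uint64_t)(((unsigned __int128)a * b) % TOY_P);   }

    static void cswap(uint64_t *a, uint64_t *b, uint64_t bit)
    {
        uint64_t mask = (uint64_t)0 - bit, t = (*a ^ *b) & mask;
        *a ^= t; *b ^= t;
    }

    static uint64_t ladder_pow(uint64_t g, uint64_t k, unsigned nbits)
    {
        uint64_t ret = 1, sum = g, bit, pbit = 0;   /* ret = identity, sum = g */

        while (nbits--) {
            bit = (k >> nbits) & 1;
            bit ^= pbit;                    /* swap only when the bit changes */
            cswap(&ret, &sum, bit);
            sum = mulmod(sum, ret);         /* the "add"    step */
            ret = mulmod(ret, ret);         /* the "double" step */
            pbit ^= bit;
        }
        cswap(&ret, &sum, pbit);            /* undo a pending swap */
        return ret;                         /* g^k mod p */
    }

    int main(void)
    {
        uint64_t g = 5, want = 1;
        for (int i = 0; i < 29; i++)
            want = mulmod(want, g);         /* 5^29 the slow way */
        assert(ladder_pow(g, 29, 8) == want);
        return 0;
    }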
+ */ +#define POINT_AFFINE_MULT_SCALAR_IMPL(ptype) \ +static void ptype##_affine_mult_ladder(ptype *ret, \ + const ptype##_affine *p_affine, \ + const byte *scalar, size_t bits) \ +{ \ + ptype sum[1]; \ + bool_t bit; \ +\ + vec_zero(ret, sizeof(ptype)); /* infinity */ \ +\ + while (bits--) { \ + ptype##_double(ret, ret); \ + ptype##_add_affine(sum, ret, p_affine); \ + bit = (scalar[bits / LIMB_T_BITS] >> (bits % LIMB_T_BITS)) & 1; \ + ptype##_ccopy(ret, sum, bit); \ + } \ +} +#endif diff --git a/crypto/blst_src/ec_ops.h b/crypto/blst_src/ec_ops.h new file mode 100644 index 00000000000..0d531f816e2 --- /dev/null +++ b/crypto/blst_src/ec_ops.h @@ -0,0 +1,787 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_384_ASM_EC_OPS_H__ +#define __BLS12_384_ASM_EC_OPS_H__ +/* + * Addition that can handle doubling [as well as points at infinity, + * which are encoded as Z==0] in constant time. It naturally comes at + * cost, but this subroutine should be called only when independent + * points are processed, which is considered reasonable compromise. + * For example, ptype##s_mult_w5 calls it, but since *major* gain is + * result of pure doublings being effectively divided by amount of + * points, slightly slower addition can be tolerated. But what is the + * additional cost more specifically? Best addition result is 11M+5S, + * while this routine takes 13M+5S (+1M+1S if a4!=0), as per + * + * -------------+------------- + * addition | doubling + * -------------+------------- + * U1 = X1*Z2^2 | U1 = X1 + * U2 = X2*Z1^2 | + * S1 = Y1*Z2^3 | S1 = Y1 + * S2 = Y2*Z1^3 | + * zz = Z1*Z2 | zz = Z1 + * H = U2-U1 | H' = 2*Y1 + * R = S2-S1 | R' = 3*X1^2[+a*Z1^4] + * sx = U1+U2 | sx = X1+X1 + * -------------+------------- + * H!=0 || R!=0 | H==0 && R==0 + * + * X3 = R^2-H^2*sx + * Y3 = R*(H^2*U1-X3)-H^3*S1 + * Z3 = H*zz + * + * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is + * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. 
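The table above is about folding the doubling case into the addition path. In plain affine terms the case split is just a choice of λ: (Y2-Y1)/(X2-X1) for distinct points, 3*X1^2/(2*Y1) when they coincide, followed by the shared X3 = λ^2-(X1+X2), Y3 = λ*(X1-X3)-Y1 tail. A deliberately branching toy version over y^2 = x^3 + 4 mod 11 (the same curve shape as E1, on a tiny field; add_or_double and finv are illustrative names) makes the two branches concrete, whereas the macro below performs the same selection branchlessly with vec_select:

    #include <assert.h>
    #include <stdint.h>

    #define Q 11
    typedef struct { int64_t x, y; int inf; } pt;

    static int64_t fmod_q(int64_t a) { return ((a % Q) + Q) % Q; }

    static int64_t finv(int64_t a)                   /* 1/a mod Q, Q prime */
    {
        int64_t r = 1, e = Q - 2;
        for (a = fmod_q(a); e; e >>= 1, a = fmod_q(a * a))
            if (e & 1) r = fmod_q(r * a);
        return r;
    }

    static pt add_or_double(pt p1, pt p2)
    {
        pt out = { 0, 0, 1 };                        /* infinity */
        int64_t lam, x3;

        if (p1.inf) return p2;
        if (p2.inf) return p1;
        if (p1.x == p2.x && fmod_q(p1.y + p2.y) == 0)
            return out;                              /* P + (-P) = infinity */
        if (p1.x == p2.x)                            /* doubling: λ = 3*X1^2 / 2*Y1   */
            lam = fmod_q(3 * p1.x * p1.x * finv(2 * p1.y));
        else                                         /* addition: λ = (Y2-Y1)/(X2-X1) */
            lam = fmod_q((p2.y - p1.y) * finv(p2.x - p1.x));

        x3    = fmod_q(lam * lam - p1.x - p2.x);     /* X3 = λ^2 - (X1+X2)  */
        out.x = x3;
        out.y = fmod_q(lam * (p1.x - x3) - p1.y);    /* Y3 = λ*(X1-X3) - Y1 */
        out.inf = 0;
        return out;
    }

    int main(void)
    {
        pt g  = { 1, 4, 0 };                 /* (1,4) lies on y^2 = x^3 + 4 mod 11 */
        pt g2 = add_or_double(g, g);         /* exercises the doubling branch */
        pt g3 = add_or_double(g2, g);        /* exercises the addition branch */

        assert(g2.x == 10 && g2.y == 5);
        assert(g3.x ==  3 && g3.y == 8);
        return 0;
    }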
+ */ +#define POINT_DADD_IMPL(ptype, bits, field) \ +static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \ + const vec##bits a4) \ +{ \ + ptype p3; /* starts as (U1, S1, zz) from addition side */\ + struct { vec##bits H, R, sx; } add, dbl; \ + bool_t p1inf, p2inf, is_dbl; \ +\ + add_##field(dbl.sx, p1->X, p1->X); /* sx = X1+X1 */\ + sqr_##field(dbl.R, p1->X); /* X1^2 */\ + mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X1^2 */\ + add_##field(dbl.H, p1->Y, p1->Y); /* H = 2*Y1 */\ +\ + p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ + sqr_##field(p3.X, p2->Z); /* Z2^2 */\ + mul_##field(p3.Z, p1->Z, p2->Z); /* Z1*Z2 */\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(add.H, p1->Z); /* Z1^2 */\ +\ + if (a4 != NULL) { \ + sqr_##field(p3.Y, add.H); /* Z1^4, [borrow p3.Y] */\ + mul_##field(p3.Y, p3.Y, a4); \ + add_##field(dbl.R, dbl.R, p3.Y);/* R = 3*X1^2+a*Z1^4 */\ + } \ +\ + mul_##field(p3.Y, p1->Y, p2->Z); \ + mul_##field(p3.Y, p3.Y, p3.X); /* S1 = Y1*Z2^3 */\ + mul_##field(add.R, p2->Y, p1->Z); \ + mul_##field(add.R, add.R, add.H); /* S2 = Y2*Z1^3 */\ + sub_##field(add.R, add.R, p3.Y); /* R = S2-S1 */\ +\ + mul_##field(p3.X, p3.X, p1->X); /* U1 = X1*Z2^2 */\ + mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ +\ + add_##field(add.sx, add.H, p3.X); /* sx = U1+U2 */\ + sub_##field(add.H, add.H, p3.X); /* H = U2-U1 */\ +\ + /* make the choice between addition and doubling */\ + is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ + vec_select(&p3, p1, &p3, sizeof(p3), is_dbl); \ + vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ + /* |p3| and |add| hold all inputs now, |p3| will hold output */\ +\ + mul_##field(p3.Z, p3.Z, add.H); /* Z3 = H*Z1*Z2 */\ +\ + sqr_##field(dbl.H, add.H); /* H^2 */\ + mul_##field(dbl.R, dbl.H, add.H); /* H^3 */\ + mul_##field(dbl.R, dbl.R, p3.Y); /* H^3*S1 */\ + mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ +\ + mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ + sqr_##field(p3.X, add.R); /* R^2 */\ + sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ + mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ + sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ +\ + vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ + vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ +} + +/* + * Addition with affine point that can handle doubling [as well as + * points at infinity, with |p1| being encoded as Z==0 and |p2| as + * X,Y==0] in constant time. But at what additional cost? Best + * addition result is 7M+4S, while this routine takes 8M+5S, as per + * + * -------------+------------- + * addition | doubling + * -------------+------------- + * U1 = X1 | U1 = X2 + * U2 = X2*Z1^2 | + * S1 = Y1 | S1 = Y2 + * S2 = Y2*Z1^3 | + * H = U2-X1 | H' = 2*Y2 + * R = S2-Y1 | R' = 3*X2^2[+a] + * sx = X1+U2 | sx = X2+X2 + * zz = H*Z1 | zz = H' + * -------------+------------- + * H!=0 || R!=0 | H==0 && R==0 + * + * X3 = R^2-H^2*sx + * Y3 = R*(H^2*U1-X3)-H^3*S1 + * Z3 = zz + * + * As for R!=0 condition in context of H==0, a.k.a. P-P. The result is + * infinity by virtue of Z3 = (U2-U1)*zz = H*zz = 0*zz == 0. 
+ */ +#define POINT_DADD_AFFINE_IMPL_A0(ptype, bits, field, one) \ +static void ptype##_dadd_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype p3; /* starts as (,, H*Z1) from addition side */\ + struct { vec##bits H, R, sx; } add, dbl; \ + bool_t p1inf, p2inf, is_dbl; \ +\ + p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ + add_##field(dbl.sx, p2->X, p2->X); /* sx = X2+X2 */\ + sqr_##field(dbl.R, p2->X); /* X2^2 */\ + mul_by_3_##field(dbl.R, dbl.R); /* R = 3*X2^2 */\ + add_##field(dbl.H, p2->Y, p2->Y); /* H = 2*Y2 */\ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(add.H, p1->Z); /* Z1^2 */\ + mul_##field(add.R, add.H, p1->Z); /* Z1^3 */\ + mul_##field(add.R, add.R, p2->Y); /* S2 = Y2*Z1^3 */\ + sub_##field(add.R, add.R, p1->Y); /* R = S2-Y1 */\ +\ + mul_##field(add.H, add.H, p2->X); /* U2 = X2*Z1^2 */\ +\ + add_##field(add.sx, add.H, p1->X); /* sx = X1+U2 */\ + sub_##field(add.H, add.H, p1->X); /* H = U2-X1 */\ +\ + mul_##field(p3.Z, add.H, p1->Z); /* Z3 = H*Z1 */\ +\ + /* make the choice between addition and doubling */ \ + is_dbl = vec_is_zero(add.H, 2*sizeof(add.H)); \ + vec_select(p3.X, p2, p1, 2*sizeof(p3.X), is_dbl); \ + vec_select(p3.Z, dbl.H, p3.Z, sizeof(p3.Z), is_dbl);\ + vec_select(&add, &dbl, &add, sizeof(add), is_dbl); \ + /* |p3| and |add| hold all inputs now, |p3| will hold output */\ +\ + sqr_##field(dbl.H, add.H); /* H^2 */\ + mul_##field(dbl.R, dbl.H, add.H); /* H^3 */\ + mul_##field(dbl.R, dbl.R, p3.Y); /* H^3*S1 */\ + mul_##field(p3.Y, dbl.H, p3.X); /* H^2*U1 */\ +\ + mul_##field(dbl.H, dbl.H, add.sx); /* H^2*sx */\ + sqr_##field(p3.X, add.R); /* R^2 */\ + sub_##field(p3.X, p3.X, dbl.H); /* X3 = R^2-H^2*sx */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* H^2*U1-X3 */\ + mul_##field(p3.Y, p3.Y, add.R); /* R*(H^2*U1-X3) */\ + sub_##field(p3.Y, p3.Y, dbl.R); /* Y3 = R*(H^2*U1-X3)-H^3*S1 */\ +\ + vec_select(p3.X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ + vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \ + vec_select(out, p1, &p3, sizeof(ptype), p2inf); \ +} + +/* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-add-2007-bl + * with twist to handle either input at infinity, which are encoded as Z==0. 
+ */ +#define POINT_ADD_IMPL(ptype, bits, field) \ +static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2) \ +{ \ + ptype p3; \ + vec##bits Z1Z1, Z2Z2, U1, S1, H, I, J; \ + bool_t p1inf, p2inf; \ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ +\ + mul_##field(p3.Z, Z1Z1, p1->Z); /* Z1*Z1Z1 */\ + mul_##field(p3.Z, p3.Z, p2->Y); /* S2 = Y2*Z1*Z1Z1 */\ +\ + p2inf = vec_is_zero(p2->Z, sizeof(p2->Z)); \ + sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ +\ + mul_##field(S1, Z2Z2, p2->Z); /* Z2*Z2Z2 */\ + mul_##field(S1, S1, p1->Y); /* S1 = Y1*Z2*Z2Z2 */\ +\ + sub_##field(p3.Z, p3.Z, S1); /* S2-S1 */\ + add_##field(p3.Z, p3.Z, p3.Z); /* r = 2*(S2-S1) */\ +\ + mul_##field(U1, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ + mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ +\ + sub_##field(H, H, U1); /* H = U2-U1 */\ +\ + add_##field(I, H, H); /* 2*H */\ + sqr_##field(I, I); /* I = (2*H)^2 */\ +\ + mul_##field(J, H, I); /* J = H*I */\ + mul_##field(S1, S1, J); /* S1*J */\ +\ + mul_##field(p3.Y, U1, I); /* V = U1*I */\ +\ + sqr_##field(p3.X, p3.Z); /* r^2 */\ + sub_##field(p3.X, p3.X, J); /* r^2-J */\ + sub_##field(p3.X, p3.X, p3.Y); \ + sub_##field(p3.X, p3.X, p3.Y); /* X3 = r^2-J-2*V */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* V-X3 */\ + mul_##field(p3.Y, p3.Y, p3.Z); /* r*(V-X3) */\ + sub_##field(p3.Y, p3.Y, S1); \ + sub_##field(p3.Y, p3.Y, S1); /* Y3 = r*(V-X3)-2*S1*J */\ +\ + add_##field(p3.Z, p1->Z, p2->Z); /* Z1+Z2 */\ + sqr_##field(p3.Z, p3.Z); /* (Z1+Z2)^2 */\ + sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+Z2)^2-Z1Z1 */\ + sub_##field(p3.Z, p3.Z, Z2Z2); /* (Z1+Z2)^2-Z1Z1-Z2Z2 */\ + mul_##field(p3.Z, p3.Z, H); /* Z3 = ((Z1+Z2)^2-Z1Z1-Z2Z2)*H */\ +\ + vec_select(&p3, p1, &p3, sizeof(ptype), p2inf); \ + vec_select(out, p2, &p3, sizeof(ptype), p1inf); \ +} + +/* + * https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#addition-madd-2007-bl + * with twist to handle either input at infinity, with |p1| encoded as Z==0, + * and |p2| as X==Y==0. 
+ */ +#define POINT_ADD_AFFINE_IMPL(ptype, bits, field, one) \ +static void ptype##_add_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype p3; \ + vec##bits Z1Z1, H, HH, I, J; \ + bool_t p1inf, p2inf; \ +\ + p1inf = vec_is_zero(p1->Z, sizeof(p1->Z)); \ +\ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ +\ + mul_##field(p3.Z, Z1Z1, p1->Z); /* Z1*Z1Z1 */\ + mul_##field(p3.Z, p3.Z, p2->Y); /* S2 = Y2*Z1*Z1Z1 */\ +\ + p2inf = vec_is_zero(p2->X, 2*sizeof(p2->X)); \ +\ + mul_##field(H, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ + sub_##field(H, H, p1->X); /* H = U2-X1 */\ +\ + sqr_##field(HH, H); /* HH = H^2 */\ + add_##field(I, HH, HH); \ + add_##field(I, I, I); /* I = 4*HH */\ +\ + mul_##field(p3.Y, p1->X, I); /* V = X1*I */\ + mul_##field(J, H, I); /* J = H*I */\ + mul_##field(I, J, p1->Y); /* Y1*J */\ +\ + sub_##field(p3.Z, p3.Z, p1->Y); /* S2-Y1 */\ + add_##field(p3.Z, p3.Z, p3.Z); /* r = 2*(S2-Y1) */\ +\ + sqr_##field(p3.X, p3.Z); /* r^2 */\ + sub_##field(p3.X, p3.X, J); /* r^2-J */\ + sub_##field(p3.X, p3.X, p3.Y); \ + sub_##field(p3.X, p3.X, p3.Y); /* X3 = r^2-J-2*V */\ +\ + sub_##field(p3.Y, p3.Y, p3.X); /* V-X3 */\ + mul_##field(p3.Y, p3.Y, p3.Z); /* r*(V-X3) */\ + sub_##field(p3.Y, p3.Y, I); \ + sub_##field(p3.Y, p3.Y, I); /* Y3 = r*(V-X3)-2*Y1*J */\ +\ + add_##field(p3.Z, p1->Z, H); /* Z1+H */\ + sqr_##field(p3.Z, p3.Z); /* (Z1+H)^2 */\ + sub_##field(p3.Z, p3.Z, Z1Z1); /* (Z1+H)^2-Z1Z1 */\ + sub_##field(p3.Z, p3.Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */\ +\ + vec_select(p3.Z, one, p3.Z, sizeof(p3.Z), p1inf); \ + vec_select(p3.X, p2, p3.X, 2*sizeof(p3.X), p1inf); \ + vec_select(out, p1, &p3, sizeof(ptype), p2inf); \ +} + +/* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-l + */ +#define POINT_DOUBLE_IMPL_A0(ptype, bits, field) \ +static void ptype##_double(ptype *p3, const ptype *p1) \ +{ \ + vec##bits A, B, C; \ +\ + sqr_##field(A, p1->X); /* A = X1^2 */\ + sqr_##field(B, p1->Y); /* B = Y1^2 */\ + sqr_##field(C, B); /* C = B^2 */\ +\ + add_##field(B, B, p1->X); /* X1+B */\ + sqr_##field(B, B); /* (X1+B)^2 */\ + sub_##field(B, B, A); /* (X1+B)^2-A */\ + sub_##field(B, B, C); /* (X1+B)^2-A-C */\ + add_##field(B, B, B); /* D = 2*((X1+B)^2-A-C) */\ +\ + mul_by_3_##field(A, A); /* E = 3*A */\ +\ + sqr_##field(p3->X, A); /* F = E^2 */\ + sub_##field(p3->X, p3->X, B); \ + sub_##field(p3->X, p3->X, B); /* X3 = F-2*D */\ +\ + add_##field(p3->Z, p1->Z, p1->Z); /* 2*Z1 */\ + mul_##field(p3->Z, p3->Z, p1->Y); /* Z3 = 2*Z1*Y1 */\ +\ + mul_by_8_##field(C, C); /* 8*C */\ + sub_##field(p3->Y, B, p3->X); /* D-X3 */\ + mul_##field(p3->Y, p3->Y, A); /* E*(D-X3) */\ + sub_##field(p3->Y, p3->Y, C); /* Y3 = E*(D-X3)-8*C */\ +} + +#define POINT_LADDER_PRE_IMPL(ptype, bits, field) \ +static void ptype##xz_ladder_pre(ptype##xz *pxz, const ptype *p) \ +{ \ + mul_##field(pxz->X, p->X, p->Z); /* X2 = X1*Z1 */\ + sqr_##field(pxz->Z, p->Z); \ + mul_##field(pxz->Z, pxz->Z, p->Z); /* Z2 = Z1^3 */\ +} + +/* + * https://hyperelliptic.org/EFD/g1p/auto-shortw-xz.html#ladder-ladd-2002-it-3 + * with twist to handle either input at infinity, which are encoded as Z==0. + * Just in case, order of doubling and addition is reverse in comparison to + * hyperelliptic.org entry. This was done to minimize temporary storage. + * + * XZ1 is |p|, XZ2&XZ4 are in&out |r|, XZ3&XZ5 are in&out |s|. 
+ */ +#define POINT_LADDER_STEP_IMPL_A0(ptype, bits, field, suffix4b) \ +static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \ + const ptype##xz *p) \ +{ \ + ptype##xz p5; \ + vec##bits A, B, C, D, XX, ZZ; \ + bool_t r_inf, s_inf; \ + /* s += r */\ + mul_##field(A, r->X, s->X); /* A = X2*X3 */\ + mul_##field(B, r->Z, s->Z); /* B = Z2*Z3 */\ + mul_##field(C, r->X, s->Z); /* C = X2*Z3 */\ + mul_##field(D, r->Z, s->X); /* D = X3*Z2 */\ +\ + sqr_##field(A, A); /* (A[-a*B])^2 */\ + add_##field(p5.X, C, D); /* C+D */\ + mul_##field(p5.X, p5.X, B); /* B*(C+D) */\ + mul_by_4b_##suffix4b(B, p5.X); /* b4*B*(C+D) */\ + sub_##field(p5.X, A, B); /* (A[-a*B])^2-b4*B*(C+D) */\ + mul_##field(p5.X, p5.X, p->Z); /* X5 = Z1*((A[-a*B])^2-b4*B*(C+D)) */\ +\ + sub_##field(p5.Z, C, D); /* C-D */\ + sqr_##field(p5.Z, p5.Z); /* (C-D)^2 */\ + mul_##field(p5.Z, p5.Z, p->X); /* Z5 = X1*(C-D)^2 */\ +\ + r_inf = vec_is_zero(r->Z, sizeof(r->Z)); \ + s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ +\ + vec_select(&p5, r, &p5, sizeof(ptype##xz), s_inf); \ + vec_select(s, s, &p5, sizeof(ptype##xz), r_inf); \ + /* r *= 2 */\ + sqr_##field(XX, r->X); /* XX = X2^2 */\ + sqr_##field(ZZ, r->Z); /* ZZ = Z2^2 */\ +\ + add_##field(r->Z, r->X, r->Z); /* X2+Z2 */\ + sqr_##field(r->Z, r->Z); /* (X2+Z2)^2 */\ + sub_##field(r->Z, r->Z, XX); /* (X2+Z2)^2-XX */\ + sub_##field(r->Z, r->Z, ZZ); /* E = (X2+Z2)^2-XX-ZZ */\ +\ + sqr_##field(A, XX); /* (XX[-a*ZZ])^2 */\ + mul_##field(B, r->Z, ZZ); /* E*ZZ */\ + mul_by_4b_##suffix4b(C, B); /* b4*E*ZZ */\ + sub_##field(r->X, A, C); /* X4 = (XX[-a*ZZ])^2-b4*E*ZZ */\ +\ + sqr_##field(ZZ, ZZ); /* ZZ^2 */\ + mul_by_4b_##suffix4b(B, ZZ); /* b4*ZZ^2 */\ + mul_##field(r->Z, r->Z, XX); /* E*(XX[+a*ZZ]) */\ + add_##field(r->Z, r->Z, r->Z); /* 2*E*(XX[+a*ZZ]) */\ + add_##field(r->Z, r->Z, B); /* Z4 = 2*E*(XX[+a*ZZ])+b4*ZZ^2 */\ +} + +/* + * Recover the |r|'s y-coordinate using Eq. (8) from Brier-Joye, + * "Weierstraß Elliptic Curves and Side-Channel Attacks", with XZ twist + * and conversion to Jacobian coordinates from /.../ecp_smpl.c, + * and with twist to recover from |s| at infinity [which occurs when + * multiplying by (order-1)]. + * + * X4 = 2*Y1*X2*Z3*Z1*Z2 + * Y4 = 2*b*Z3*(Z1*Z2)^2 + Z3*(a*Z1*Z2+X1*X2)*(X1*Z2+X2*Z1) - X3*(X1*Z2-X2*Z1)^2 + * Z4 = 2*Y1*Z3*Z2^2*Z1 + * + * Z3x2 = 2*Z3 + * Y1Z3x2 = Y1*Z3x2 + * Z1Z2 = Z1*Z2 + * X1Z2 = X1*Z2 + * X2Z1 = X2*Z1 + * X4 = Y1Z3x2*X2*Z1Z2 + * A = b*Z3x2*(Z1Z2)^2 + * B = Z3*(a*Z1Z2+X1*X2)*(X1Z2+X2Z1) + * C = X3*(X1Z2-X2Z1)^2 + * Y4 = A+B-C + * Z4 = Y1Z3x2*Z1Z2*Z2 + * + * XZ1 is |p|, XZ2 is |r|, XZ3 is |s|, 'a' is 0. 
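The xz ladder step above advances a Montgomery ladder: per scalar bit it performs one differential addition (s += r) and one doubling (r *= 2) on x-only XZ representations, with the y-coordinate recovered only once at the end. The structure is easiest to see in its multiplicative analogue, modular exponentiation; the sketch below (toy name mont_ladder_pow, small modulus, not blst API) keeps the pair (r0, r1) with r1 == r0*base at every step. The exponent-bit branch is left explicit for readability; a hardened variant would replace it with a constant-time conditional swap.

#include <stdint.h>
#include <stdio.h>

/* Montgomery ladder for modular exponentiation: the multiplicative analogue
 * of the XZ point ladder.  The pair (r0, r1) keeps the invariant
 * r1 == r0*base (mod mod); every bit performs one "add" (multiply) and one
 * "double" (square), regardless of the bit's value. */
static uint64_t mont_ladder_pow(uint64_t base, uint64_t exp, uint64_t mod)
{
    uint64_t r0 = 1, r1 = base % mod;
    int i;

    for (i = 63; i >= 0; i--) {
        if ((exp >> i) & 1) {       /* a hardened version would swap instead */
            r0 = (r0 * r1) % mod;
            r1 = (r1 * r1) % mod;
        } else {
            r1 = (r0 * r1) % mod;
            r0 = (r0 * r0) % mod;
        }
    }
    return r0;
}

int main(void)
{
    /* 5^13 mod 1000003 */
    printf("%llu\n", (unsigned long long)mont_ladder_pow(5, 13, 1000003));
    return 0;
}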
+ */ +#define POINT_LADDER_POST_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##xz_ladder_post(ptype *p4, \ + const ptype##xz *r, const ptype##xz *s, \ + const ptype##xz *p, const vec##bits Y1) \ +{ \ + vec##bits Z3x2, Y1Z3x2, Z1Z2, X1Z2, X2Z1, A, B, C; \ + bool_t s_inf; \ +\ + add_##field(Z3x2, s->Z, s->Z); /* Z3x2 = 2*Z3 */\ + mul_##field(Y1Z3x2, Y1, Z3x2); /* Y1Z3x2 = Y1*Z3x2 */\ + mul_##field(Z1Z2, p->Z, r->Z); /* Z1Z2 = Z1*Z2 */\ + mul_##field(X1Z2, p->X, r->Z); /* X1Z2 = X1*Z2 */\ + mul_##field(X2Z1, r->X, p->Z); /* X2Z1 = X2*Z1 */\ +\ + mul_##field(p4->X, Y1Z3x2, r->X); /* Y1Z3x2*X2 */\ + mul_##field(p4->X, p4->X, Z1Z2); /* X4 = Y1Z3x2*X2*Z1Z2 */\ +\ + sqr_##field(A, Z1Z2); /* (Z1Z2)^2 */\ + mul_##field(B, A, Z3x2); /* Z3x2*(Z1Z2)^2 */\ + mul_by_b_##suffixb(A, B); /* A = b*Z3x2*(Z1Z2)^2 */\ +\ + mul_##field(B, p->X, r->X); /* [a*Z1Z2+]X1*X2 */\ + mul_##field(B, B, s->Z); /* Z3*([a*Z1Z2+]X1*X2) */\ + add_##field(C, X1Z2, X2Z1); /* X1Z2+X2Z1 */\ + mul_##field(B, B, C); /* B = Z3*([a*Z2Z1+]X1*X2)*(X1Z2+X2Z1) */\ +\ + sub_##field(C, X1Z2, X2Z1); /* X1Z2-X2Z1 */\ + sqr_##field(C, C); /* (X1Z2-X2Z1)^2 */\ + mul_##field(C, C, s->X); /* C = X3*(X1Z2-X2Z1)^2 */\ +\ + add_##field(A, A, B); /* A+B */\ + sub_##field(A, A, C); /* Y4 = A+B-C */\ +\ + mul_##field(p4->Z, Z1Z2, r->Z); /* Z1Z2*Z2 */\ + mul_##field(p4->Z, p4->Z, Y1Z3x2); /* Y1Z3x2*Z1Z2*Z2 */\ +\ + s_inf = vec_is_zero(s->Z, sizeof(s->Z)); \ + vec_select(p4->X, p->X, p4->X, sizeof(p4->X), s_inf); \ + vec_select(p4->Y, Y1, A, sizeof(p4->Y), s_inf); \ + vec_select(p4->Z, p->Z, p4->Z, sizeof(p4->Z), s_inf); \ + ptype##_cneg(p4, s_inf); \ + /* to Jacobian */\ + mul_##field(p4->X, p4->X, p4->Z); /* X4 = X4*Z4 */\ + sqr_##field(B, p4->Z); \ + mul_##field(p4->Y, p4->Y, B); /* Y4 = Y4*Z4^2 */\ +} + +#define POINT_IS_EQUAL_IMPL(ptype, bits, field) \ +static limb_t ptype##_is_equal(const ptype *p1, const ptype *p2) \ +{ \ + vec##bits Z1Z1, Z2Z2; \ + ptype##_affine a1, a2; \ + bool_t is_inf1 = vec_is_zero(p1->Z, sizeof(p1->Z)); \ + bool_t is_inf2 = vec_is_zero(p2->Z, sizeof(p2->Z)); \ +\ + sqr_##field(Z1Z1, p1->Z); /* Z1Z1 = Z1^2 */\ + sqr_##field(Z2Z2, p2->Z); /* Z2Z2 = Z2^2 */\ +\ + mul_##field(a1.X, p1->X, Z2Z2); /* U1 = X1*Z2Z2 */\ + mul_##field(a2.X, p2->X, Z1Z1); /* U2 = X2*Z1Z1 */\ +\ + mul_##field(a1.Y, p1->Y, p2->Z); /* Y1*Z2 */\ + mul_##field(a2.Y, p2->Y, p1->Z); /* Y2*Z1 */\ +\ + mul_##field(a1.Y, a1.Y, Z2Z2); /* S1 = Y1*Z2*Z2Z2 */\ + mul_##field(a2.Y, a2.Y, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */\ +\ + return vec_is_equal(&a1, &a2, sizeof(a1)) & (is_inf1 ^ is_inf2 ^ 1); \ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 7 with a twist to handle + * |p3| pointing at either |p1| or |p2|. This is resolved by adding |t5| + * and replacing few first references to |X3| in the formula, up to step + * 21, with it. 12M[+27A], doubling and infinity are handled by the + * formula itself. Infinity is to be encoded as [0, !0, 0]. + */ +#define POINT_PROJ_DADD_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_dadd(ptype##proj *p3, const ptype##proj *p1, \ + const ptype##proj *p2) \ +{ \ + vec##bits t0, t1, t2, t3, t4, t5; \ +\ + mul_##field(t0, p1->X, p2->X); /* 1. t0 = X1*X2 */\ + mul_##field(t1, p1->Y, p2->Y); /* 2. t1 = Y1*Y2 */\ + mul_##field(t2, p1->Z, p2->Z); /* 3. t2 = Z1*Z2 */\ + add_##field(t3, p1->X, p1->Y); /* 4. t3 = X1+Y1 */\ + add_##field(t4, p2->X, p2->Y); /* 5. t4 = X2+Y2 */\ + mul_##field(t3, t3, t4); /* 6. t3 = t3*t4 */\ + add_##field(t4, t0, t1); /* 7. t4 = t0+t1 */\ + sub_##field(t3, t3, t4); /* 8. 
t3 = t3-t4 */\ + add_##field(t4, p1->Y, p1->Z); /* 9. t4 = Y1+Z1 */\ + add_##field(t5, p2->Y, p2->Z); /* 10. t5 = Y2+Z2 */\ + mul_##field(t4, t4, t5); /* 11. t4 = t4*t5 */\ + add_##field(t5, t1, t2); /* 12. t5 = t1+t2 */\ + sub_##field(t4, t4, t5); /* 13. t4 = t4-t5 */\ + add_##field(t5, p1->X, p1->Z); /* 14. t5 = X1+Z1 */\ + add_##field(p3->Y, p2->X, p2->Z); /* 15. Y3 = X2+Z2 */\ + mul_##field(t5, t5, p3->Y); /* 16. t5 = t5*Y3 */\ + add_##field(p3->Y, t0, t2); /* 17. Y3 = t0+t2 */\ + sub_##field(p3->Y, t5, p3->Y); /* 18. Y3 = t5-Y3 */\ + mul_by_3_##field(t0, t0); /* 19-20. t0 = 3*t0 */\ + mul_by_3_##field(t5, t2); /* 21. t5 = 3*t2 */\ + mul_by_b_##suffixb(t2, t5); /* 21. t2 = b*t5 */\ + add_##field(p3->Z, t1, t2); /* 22. Z3 = t1+t2 */\ + sub_##field(t1, t1, t2); /* 23. t1 = t1-t2 */\ + mul_by_3_##field(t5, p3->Y); /* 24. t5 = 3*Y3 */\ + mul_by_b_##suffixb(p3->Y, t5); /* 24. Y3 = b*t5 */\ + mul_##field(p3->X, t4, p3->Y); /* 25. X3 = t4*Y3 */\ + mul_##field(t2, t3, t1); /* 26. t2 = t3*t1 */\ + sub_##field(p3->X, t2, p3->X); /* 27. X3 = t2-X3 */\ + mul_##field(p3->Y, p3->Y, t0); /* 28. Y3 = Y3*t0 */\ + mul_##field(t1, t1, p3->Z); /* 29. t1 = t1*Z3 */\ + add_##field(p3->Y, t1, p3->Y); /* 30. Y3 = t1+Y3 */\ + mul_##field(t0, t0, t3); /* 31. t0 = t0*t3 */\ + mul_##field(p3->Z, p3->Z, t4); /* 32. Z3 = Z3*t4 */\ + add_##field(p3->Z, p3->Z, t0); /* 33. Z3 = Z3+t0 */\ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 8 with a twist to handle + * |p2| being infinity encoded as [0, 0]. 11M[+21A]. + */ +#define POINT_PROJ_DADD_AFFINE_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_dadd_affine(ptype##proj *out, const ptype##proj *p1, \ + const ptype##_affine *p2) \ +{ \ + ptype##proj p3[1]; \ + vec##bits t0, t1, t2, t3, t4; \ + limb_t p2inf = vec_is_zero(p2, sizeof(*p2)); \ +\ + mul_##field(t0, p1->X, p2->X); /* 1. t0 = X1*X2 */\ + mul_##field(t1, p1->Y, p2->Y); /* 2. t1 = Y1*Y2 */\ + add_##field(t3, p1->X, p1->Y); /* 3. t3 = X1+Y1 */\ + add_##field(t4, p2->X, p2->Y); /* 4. t4 = X2+Y2 */\ + mul_##field(t3, t3, t4); /* 5. t3 = t3*t4 */\ + add_##field(t4, t0, t1); /* 6. t4 = t0+t1 */\ + sub_##field(t3, t3, t4); /* 7. t3 = t3-t4 */\ + mul_##field(t4, p2->Y, p1->Z); /* 8. t4 = Y2*Z1 */\ + add_##field(t4, t4, p1->Y); /* 9. t4 = t4+Y1 */\ + mul_##field(p3->Y, p2->X, p1->Z); /* 10. Y3 = X2*Z1 */\ + add_##field(p3->Y, p3->Y, p1->X); /* 11. Y3 = Y3+X1 */\ + mul_by_3_##field(t0, t0); /* 12-13. t0 = 3*t0 */\ + mul_by_b_##suffixb(t2, p1->Z); /* 14. t2 = b*Z1 */\ + mul_by_3_##field(t2, t2); /* 14. t2 = 3*t2 */\ + add_##field(p3->Z, t1, t2); /* 15. Z3 = t1+t2 */\ + sub_##field(t1, t1, t2); /* 16. t1 = t1-t2 */\ + mul_by_b_##suffixb(t2, p3->Y); /* 17. t2 = b*Y3 */\ + mul_by_3_##field(p3->Y, t2); /* 17. Y3 = 3*t2 */\ + mul_##field(p3->X, t4, p3->Y); /* 18. X3 = t4*Y3 */\ + mul_##field(t2, t3, t1); /* 19. t2 = t3*t1 */\ + sub_##field(p3->X, t2, p3->X); /* 20. X3 = t2-X3 */\ + mul_##field(p3->Y, p3->Y, t0); /* 21. Y3 = Y3*t0 */\ + mul_##field(t1, t1, p3->Z); /* 22. t1 = t1*Z3 */\ + add_##field(p3->Y, t1, p3->Y); /* 23. Y3 = t1+Y3 */\ + mul_##field(t0, t0, t3); /* 24. t0 = t0*t3 */\ + mul_##field(p3->Z, p3->Z, t4); /* 25. Z3 = Z3*t4 */\ + add_##field(p3->Z, p3->Z, t0); /* 26. Z3 = Z3+t0 */\ +\ + vec_select(out, p1, p3, sizeof(*out), p2inf); \ +} + +/* + * https://eprint.iacr.org/2015/1060, algorithm 9 with a twist to handle + * |p3| pointing at |p1|. This is resolved by adding |t3| to hold X*Y + * and reordering operations to bring references to |p1| forward. + * 6M+2S[+13A]. 
+ */ +#define POINT_PROJ_DOUBLE_IMPL_A0(ptype, bits, field, suffixb) \ +static void ptype##proj_double(ptype##proj *p3, const ptype##proj *p1) \ +{ \ + vec##bits t0, t1, t2, t3; \ +\ + sqr_##field(t0, p1->Y); /* 1. t0 = Y*Y */\ + mul_##field(t1, p1->Y, p1->Z); /* 5. t1 = Y*Z */\ + sqr_##field(t2, p1->Z); /* 6. t2 = Z*Z */\ + mul_##field(t3, p1->X, p1->Y); /* 16. t3 = X*Y */\ + lshift_##field(p3->Z, t0, 3); /* 2-4. Z3 = 8*t0 */\ + mul_by_b_##suffixb(p3->X, t2); /* 7. t2 = b*t2 */\ + mul_by_3_##field(t2, p3->X); /* 7. t2 = 3*t2 */\ + mul_##field(p3->X, t2, p3->Z); /* 8. X3 = t2*Z3 */\ + add_##field(p3->Y, t0, t2); /* 9. Y3 = t0+t2 */\ + mul_##field(p3->Z, t1, p3->Z); /* 10. Z3 = t1*Z3 */\ + mul_by_3_##field(t2, t2); /* 11-12. t2 = 3*t2 */\ + sub_##field(t0, t0, t2); /* 13. t0 = t0-t2 */\ + mul_##field(p3->Y, t0, p3->Y); /* 14. Y3 = t0*Y3 */\ + add_##field(p3->Y, p3->X, p3->Y); /* 15. Y3 = X3+Y3 */\ + mul_##field(p3->X, t0, t3); /* 17. X3 = t0*t3 */\ + add_##field(p3->X, p3->X, p3->X); /* 18. X3 = X3+X3 */\ +} + +#define POINT_PROJ_TO_JACOBIAN_IMPL(ptype, bits, field) \ +static void ptype##proj_to_Jacobian(ptype *out, const ptype##proj *in) \ +{ \ + vec##bits ZZ; \ +\ + sqr_##field(ZZ, in->Z); \ + mul_##field(out->X, in->X, in->Z); \ + mul_##field(out->Y, in->Y, ZZ); \ + vec_copy(out->Z, in->Z, sizeof(out->Z)); \ +} + +#define POINT_TO_PROJECTIVE_IMPL(ptype, bits, field, one) \ +static void ptype##_to_projective(ptype##proj *out, const ptype *in) \ +{ \ + vec##bits ZZ; \ + limb_t is_inf = vec_is_zero(in->Z, sizeof(in->Z)); \ +\ + sqr_##field(ZZ, in->Z); \ + mul_##field(out->X, in->X, in->Z); \ + vec_select(out->Y, one, in->Y, sizeof(out->Y), is_inf); \ + mul_##field(out->Z, ZZ, in->Z); \ +} + +/******************* !!!!! NOT CONSTANT TIME !!!!! *******************/ + +/* + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-add-2008-s + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-dbl-2008-s-1 + * with twist to handle either input at infinity. Addition costs 12M+2S, + * while conditional doubling - 4M+6M+3S. 
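The conversion helpers above rely on the two coordinate conventions used throughout this file: a homogeneous projective triple (X, Y, Z) stands for the affine point (X/Z, Y/Z), while a Jacobian triple stands for (X/Z^2, Y/Z^3); multiplying a projective X by Z and Y by Z^2 therefore yields a valid Jacobian triple with the same Z, which is exactly what ptype##proj_to_Jacobian does. As a sanity check of the Jacobian convention, here is a tiny self-contained conversion back to affine over a small illustrative prime (P_TOY, jacobian_to_affine and the Fermat inverse are toy constructions, not blst API).

#include <stdint.h>
#include <stdio.h>

#define P_TOY 97ULL   /* illustrative prime, NOT the BLS12-381 field */

static uint64_t mul_mod(uint64_t a, uint64_t b) { return (a * b) % P_TOY; }

static uint64_t pow_mod(uint64_t a, uint64_t e)
{
    uint64_t r = 1;
    while (e) {
        if (e & 1) r = mul_mod(r, a);
        a = mul_mod(a, a);
        e >>= 1;
    }
    return r;
}

static uint64_t inv_mod(uint64_t a) { return pow_mod(a, P_TOY - 2); } /* Fermat */

/* Jacobian (X, Y, Z) represents the affine point (X/Z^2, Y/Z^3). */
static void jacobian_to_affine(uint64_t *x, uint64_t *y,
                               uint64_t X, uint64_t Y, uint64_t Z)
{
    uint64_t zinv  = inv_mod(Z);
    uint64_t zinv2 = mul_mod(zinv, zinv);
    *x = mul_mod(X, zinv2);
    *y = mul_mod(Y, mul_mod(zinv2, zinv));
}

int main(void)
{
    /* affine (3, 6) scaled by Z = 5: X = 3*5^2, Y = 6*5^3 (mod 97) */
    uint64_t X = mul_mod(3, 25), Y = mul_mod(6, mul_mod(25, 5)), x, y;
    jacobian_to_affine(&x, &y, X, Y, 5);
    printf("%llu %llu\n", (unsigned long long)x, (unsigned long long)y); /* 3 6 */
    return 0;
}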
+ */ +#define POINTXYZZ_DADD_IMPL(ptype, bits, field) \ +static void ptype##xyzz_dadd(ptype##xyzz *p3, const ptype##xyzz *p1, \ + const ptype##xyzz *p2) \ +{ \ + vec##bits U, S, P, R; \ +\ + if (vec_is_zero(p2->ZZZ, 2*sizeof(p2->ZZZ))) { \ + vec_copy(p3, p1, sizeof(*p3)); \ + return; \ + } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \ + vec_copy(p3, p2, sizeof(*p3)); \ + return; \ + } \ +\ + mul_##field(U, p1->X, p2->ZZ); /* U1 = X1*ZZ2 */\ + mul_##field(S, p1->Y, p2->ZZZ); /* S1 = Y1*ZZZ2 */\ + mul_##field(P, p2->X, p1->ZZ); /* U2 = X2*ZZ1 */\ + mul_##field(R, p2->Y, p1->ZZZ); /* S2 = Y2*ZZZ1 */\ + sub_##field(P, P, U); /* P = U2-U1 */\ + sub_##field(R, R, S); /* R = S2-S1 */\ +\ + if (!vec_is_zero(P, sizeof(P))) { /* X1!=X2 */\ + vec##bits PP, PPP, Q; /* add |p1| and |p2| */\ +\ + sqr_##field(PP, P); /* PP = P^2 */\ + mul_##field(PPP, PP, P); /* PPP = P*PP */\ + mul_##field(Q, U, PP); /* Q = U1*PP */\ + sqr_##field(p3->X, R); /* R^2 */\ + add_##field(P, Q, Q); \ + sub_##field(p3->X, p3->X, PPP); /* R^2-PPP */\ + sub_##field(p3->X, p3->X, P); /* X3 = R^2-PPP-2*Q */\ + sub_##field(Q, Q, p3->X); \ + mul_##field(Q, Q, R); /* R*(Q-X3) */\ + mul_##field(p3->Y, S, PPP); /* S1*PPP */\ + sub_##field(p3->Y, Q, p3->Y); /* Y3 = R*(Q-X3)-S1*PPP */\ + mul_##field(p3->ZZ, p1->ZZ, p2->ZZ); /* ZZ1*ZZ2 */\ + mul_##field(p3->ZZZ, p1->ZZZ, p2->ZZZ); /* ZZZ1*ZZZ2 */\ + mul_##field(p3->ZZ, p3->ZZ, PP); /* ZZ3 = ZZ1*ZZ2*PP */\ + mul_##field(p3->ZZZ, p3->ZZZ, PPP); /* ZZZ3 = ZZZ1*ZZZ2*PPP */\ + } else if (vec_is_zero(R, sizeof(R))) { /* X1==X2 && Y1==Y2 */\ + vec##bits V, W, M; /* double |p1| */\ +\ + add_##field(U, p1->Y, p1->Y); /* U = 2*Y1 */\ + sqr_##field(V, U); /* V = U^2 */\ + mul_##field(W, V, U); /* W = U*V */\ + mul_##field(S, p1->X, V); /* S = X1*V */\ + sqr_##field(M, p1->X); \ + mul_by_3_##field(M, M); /* M = 3*X1^2[+a*ZZ1^2] */\ + sqr_##field(p3->X, M); \ + add_##field(U, S, S); /* 2*S */\ + sub_##field(p3->X, p3->X, U); /* X3 = M^2-2*S */\ + mul_##field(p3->Y, W, p1->Y); /* W*Y1 */\ + sub_##field(S, S, p3->X); \ + mul_##field(S, S, M); /* M*(S-X3) */\ + sub_##field(p3->Y, S, p3->Y); /* Y3 = M*(S-X3)-W*Y1 */\ + mul_##field(p3->ZZ, p1->ZZ, V); /* ZZ3 = V*ZZ1 */\ + mul_##field(p3->ZZZ, p1->ZZZ, W); /* ZZ3 = W*ZZZ1 */\ + } else { /* X1==X2 && Y1==-Y2 */\ + vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ)); /* set |p3| to infinity */\ + } \ +} + +/* + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#addition-madd-2008-s + * http://hyperelliptic.org/EFD/g1p/auto-shortw-xyzz.html#doubling-mdbl-2008-s-1 + * with twists to handle even subtractions and either input at infinity. + * Addition costs 8M+2S, while conditional doubling - 2M+4M+3S. 
+ */ +#define POINTXYZZ_DADD_AFFINE_IMPL(ptype, bits, field, one) \ +static void ptype##xyzz_dadd_affine(ptype##xyzz *p3, const ptype##xyzz *p1, \ + const ptype##_affine *p2, \ + bool_t subtract) \ +{ \ + vec##bits P, R; \ +\ + if (vec_is_zero(p2, sizeof(*p2))) { \ + vec_copy(p3, p1, sizeof(*p3)); \ + return; \ + } else if (vec_is_zero(p1->ZZZ, 2*sizeof(p1->ZZZ))) { \ + vec_copy(p3->X, p2->X, 2*sizeof(p3->X));\ + cneg_##field(p3->ZZZ, one, subtract); \ + vec_copy(p3->ZZ, one, sizeof(p3->ZZ)); \ + return; \ + } \ +\ + mul_##field(P, p2->X, p1->ZZ); /* U2 = X2*ZZ1 */\ + mul_##field(R, p2->Y, p1->ZZZ); /* S2 = Y2*ZZZ1 */\ + cneg_##field(R, R, subtract); \ + sub_##field(P, P, p1->X); /* P = U2-X1 */\ + sub_##field(R, R, p1->Y); /* R = S2-Y1 */\ +\ + if (!vec_is_zero(P, sizeof(P))) { /* X1!=X2 */\ + vec##bits PP, PPP, Q; /* add |p2| to |p1| */\ +\ + sqr_##field(PP, P); /* PP = P^2 */\ + mul_##field(PPP, PP, P); /* PPP = P*PP */\ + mul_##field(Q, p1->X, PP); /* Q = X1*PP */\ + sqr_##field(p3->X, R); /* R^2 */\ + add_##field(P, Q, Q); \ + sub_##field(p3->X, p3->X, PPP); /* R^2-PPP */\ + sub_##field(p3->X, p3->X, P); /* X3 = R^2-PPP-2*Q */\ + sub_##field(Q, Q, p3->X); \ + mul_##field(Q, Q, R); /* R*(Q-X3) */\ + mul_##field(p3->Y, p1->Y, PPP); /* Y1*PPP */\ + sub_##field(p3->Y, Q, p3->Y); /* Y3 = R*(Q-X3)-Y1*PPP */\ + mul_##field(p3->ZZ, p1->ZZ, PP); /* ZZ3 = ZZ1*PP */\ + mul_##field(p3->ZZZ, p1->ZZZ, PPP); /* ZZZ3 = ZZZ1*PPP */\ + } else if (vec_is_zero(R, sizeof(R))) { /* X1==X2 && Y1==Y2 */\ + vec##bits U, S, M; /* double |p2| */\ +\ + add_##field(U, p2->Y, p2->Y); /* U = 2*Y1 */\ + sqr_##field(p3->ZZ, U); /* [ZZ3 =] V = U^2 */\ + mul_##field(p3->ZZZ, p3->ZZ, U); /* [ZZZ3 =] W = U*V */\ + mul_##field(S, p2->X, p3->ZZ); /* S = X1*V */\ + sqr_##field(M, p2->X); \ + mul_by_3_##field(M, M); /* M = 3*X1^2[+a] */\ + sqr_##field(p3->X, M); \ + add_##field(U, S, S); /* 2*S */\ + sub_##field(p3->X, p3->X, U); /* X3 = M^2-2*S */\ + mul_##field(p3->Y, p3->ZZZ, p2->Y); /* W*Y1 */\ + sub_##field(S, S, p3->X); \ + mul_##field(S, S, M); /* M*(S-X3) */\ + sub_##field(p3->Y, S, p3->Y); /* Y3 = M*(S-X3)-W*Y1 */\ + cneg_##field(p3->ZZZ, p3->ZZZ, subtract); \ + } else { /* X1==X2 && Y1==-Y2 */\ + vec_zero(p3->ZZZ, 2*sizeof(p3->ZZZ)); /* set |p3| to infinity */\ + } \ +} + +#define POINTXYZZ_TO_JACOBIAN_IMPL(ptype, bits, field) \ +static void ptype##xyzz_to_Jacobian(ptype *out, const ptype##xyzz *in) \ +{ \ + mul_##field(out->X, in->X, in->ZZ); \ + mul_##field(out->Y, in->Y, in->ZZZ); \ + vec_copy(out->Z, in->ZZ, sizeof(out->Z)); \ +} + +#define POINT_TO_XYZZ_IMPL(ptype, bits, field) \ +static void ptype##_to_xyzz(ptype##xyzz *out, const ptype *in) \ +{ \ + vec_copy(out->X, in->X, 2*sizeof(out->X)); \ + sqr_##field(out->ZZ, in->Z); \ + mul_##field(out->ZZZ, out->ZZ, in->Z); \ +} + +#endif diff --git a/crypto/blst_src/errors.h b/crypto/blst_src/errors.h new file mode 100644 index 00000000000..425daeb486f --- /dev/null +++ b/crypto/blst_src/errors.h @@ -0,0 +1,19 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_ERRORS_H__ +#define __BLS12_381_ASM_ERRORS_H__ + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, +} BLST_ERROR; + +#endif diff --git a/crypto/blst_src/exp.c b/crypto/blst_src/exp.c new file mode 100644 index 00000000000..55c5c5a7875 --- /dev/null +++ b/crypto/blst_src/exp.c @@ -0,0 +1,55 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "vect.h" + +/* + * |out| = |inp|^|pow|, small footprint, public exponent + */ +static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0) +{ +#if 1 + vec384 ret; + + vec_copy(ret, inp, sizeof(ret)); /* ret = inp^1 */ + --pow_bits; /* most significant bit is set, skip over */ + while (pow_bits--) { + sqr_mont_384(ret, ret, p, n0); + if (is_bit_set(pow, pow_bits)) + mul_mont_384(ret, ret, inp, p, n0); + } + vec_copy(out, ret, sizeof(ret)); /* out = ret */ +#else + unsigned int i; + vec384 sqr; + + vec_copy(sqr, inp, sizeof(sqr)); + for (i = 0; !is_bit_set(pow, i++);) + sqr_mont_384(sqr, sqr, sqr, p, n0); + vec_copy(out, sqr, sizeof(sqr)); + for (; i < pow_bits; i++) { + sqr_mont_384(sqr, sqr, sqr, p, n0); + if (is_bit_set(pow, i)) + mul_mont_384(out, out, sqr, p, n0); + } +#endif +} + +static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0) +{ + vec384x ret; + + vec_copy(ret, inp, sizeof(ret)); /* |ret| = |inp|^1 */ + --pow_bits; /* most significant bit is accounted for, skip over */ + while (pow_bits--) { + sqr_mont_384x(ret, ret, p, n0); + if (is_bit_set(pow, pow_bits)) + mul_mont_384x(ret, ret, inp, p, n0); + } + vec_copy(out, ret, sizeof(ret)); /* |out| = |ret| */ +} diff --git a/crypto/blst_src/exports.c b/crypto/blst_src/exports.c new file mode 100644 index 00000000000..1ca4d4757fa --- /dev/null +++ b/crypto/blst_src/exports.c @@ -0,0 +1,583 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * Why this file? Overall goal is to ensure that all internal calls + * remain internal after linking application. This is to both + * + * a) minimize possibility of external name conflicts (since all + * non-blst-prefixed and [assembly subroutines] remain static); + * b) preclude possibility of unintentional internal reference + * overload in shared library context (one can achieve same + * effect with -Bsymbolic, but we don't want to rely on end-user + * to remember to use it); + */ + +#include "fields.h" +#include "bytes.h" + +/* + * BLS12-381-specific Fr shortcuts to assembly. 
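A note on exp.c above: exp_mont_384 and exp_mont_384x are plain left-to-right square-and-multiply loops intended for public exponents only: the accumulator starts at inp^1, the (known to be set) most significant exponent bit is skipped, and each remaining bit costs one squaring plus a conditional multiplication. The same control flow over ordinary machine integers, with a toy modulus and the made-up name exp_mod_toy, looks like this (the real routines operate on 384-bit Montgomery-form vectors):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Left-to-right square-and-multiply mirroring exp_mont_384: start at inp^1,
 * skip the set top bit, then square once per bit and multiply when the bit
 * is set.  Exponent bits are read little-endian from a byte array. */
static uint64_t exp_mod_toy(uint64_t inp, const uint8_t *pow, size_t pow_bits,
                            uint64_t mod)
{
    uint64_t ret = inp % mod;           /* ret = inp^1 */
    --pow_bits;                         /* most significant bit is set, skip it */
    while (pow_bits--) {
        ret = (ret * ret) % mod;
        if ((pow[pow_bits / 8] >> (pow_bits % 8)) & 1)
            ret = (ret * inp) % mod;
    }
    return ret;
}

int main(void)
{
    uint8_t e[1] = { 0x0d };            /* exponent 13, 4 significant bits */
    /* 7^13 mod 1000003 */
    printf("%llu\n", (unsigned long long)exp_mod_toy(7, e, 4, 1000003));
    return 0;
}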
+ */ +void blst_fr_add(vec256 ret, const vec256 a, const vec256 b) +{ add_mod_256(ret, a, b, BLS12_381_r); } + +void blst_fr_sub(vec256 ret, const vec256 a, const vec256 b) +{ sub_mod_256(ret, a, b, BLS12_381_r); } + +void blst_fr_mul_by_3(vec256 ret, const vec256 a) +{ mul_by_3_mod_256(ret, a, BLS12_381_r); } + +void blst_fr_lshift(vec256 ret, const vec256 a, size_t count) +{ lshift_mod_256(ret, a, count, BLS12_381_r); } + +void blst_fr_rshift(vec256 ret, const vec256 a, size_t count) +{ rshift_mod_256(ret, a, count, BLS12_381_r); } + +void blst_fr_mul(vec256 ret, const vec256 a, const vec256 b) +{ mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } + +void blst_fr_ct_bfly(vec256 x0, vec256 x1, const vec256 twiddle) +{ + vec256 x2; + + mul_mont_sparse_256(x2, x1, twiddle, BLS12_381_r, r0); + sub_mod_256(x1, x0, x2, BLS12_381_r); + add_mod_256(x0, x0, x2, BLS12_381_r); +} + +void blst_fr_gs_bfly(vec256 x0, vec256 x1, const vec256 twiddle) +{ + vec256 x2; + + sub_mod_256(x2, x0, x1, BLS12_381_r); + add_mod_256(x0, x0, x1, BLS12_381_r); + mul_mont_sparse_256(x1, x2, twiddle, BLS12_381_r, r0); +} + +void blst_fr_sqr(vec256 ret, const vec256 a) +{ sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } + +void blst_fr_cneg(vec256 ret, const vec256 a, int flag) +{ cneg_mod_256(ret, a, is_zero(flag) ^ 1, BLS12_381_r); } + +void blst_fr_to(vec256 ret, const vec256 a) +{ mul_mont_sparse_256(ret, a, BLS12_381_rRR, BLS12_381_r, r0); } + +void blst_fr_from(vec256 ret, const vec256 a) +{ from_mont_256(ret, a, BLS12_381_r, r0); } + +void blst_fr_from_scalar(vec256 ret, const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((uptr_t)ret == (uptr_t)a && is_endian.little) { + mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR, + BLS12_381_r, r0); + } else { + vec256 out; + limbs_from_le_bytes(out, a, 32); + mul_mont_sparse_256(ret, out, BLS12_381_rRR, BLS12_381_r, r0); + vec_zero(out, sizeof(out)); + } +} + +void blst_scalar_from_fr(pow256 ret, const vec256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if ((uptr_t)ret == (uptr_t)a && is_endian.little) { + from_mont_256((limb_t *)ret, a, BLS12_381_r, r0); + } else { + vec256 out; + from_mont_256(out, a, BLS12_381_r, r0); + le_bytes_from_limbs(ret, out, 32); + vec_zero(out, sizeof(out)); + } +} + +int blst_scalar_fr_check(const pow256 a) +{ return (int)(check_mod_256(a, BLS12_381_r) | + bytes_are_zero(a, sizeof(pow256))); +} + +int blst_sk_check(const pow256 a) +{ return (int)check_mod_256(a, BLS12_381_r); } + +int blst_sk_add_n_check(pow256 ret, const pow256 a, const pow256 b) +{ return (int)add_n_check_mod_256(ret, a, b, BLS12_381_r); } + +int blst_sk_sub_n_check(pow256 ret, const pow256 a, const pow256 b) +{ return (int)sub_n_check_mod_256(ret, a, b, BLS12_381_r); } + +int blst_sk_mul_n_check(pow256 ret, const pow256 a, const pow256 b) +{ + vec256 t[2]; + const union { + long one; + char little; + } is_endian = { 1 }; + bool_t is_zero; + + if (((size_t)a|(size_t)b)%sizeof(limb_t) != 0 || !is_endian.little) { + limbs_from_le_bytes(t[0], a, sizeof(pow256)); + limbs_from_le_bytes(t[1], b, sizeof(pow256)); + a = (const byte *)t[0]; + b = (const byte *)t[1]; + } + mul_mont_sparse_256(t[0], BLS12_381_rRR, (const limb_t *)a, BLS12_381_r, r0); + mul_mont_sparse_256(t[0], t[0], (const limb_t *)b, BLS12_381_r, r0); + le_bytes_from_limbs(ret, t[0], sizeof(pow256)); + is_zero = vec_is_zero(t[0], sizeof(vec256)); + vec_zero(t, sizeof(t)); + + return (int)(is_zero^1); +} + +void blst_sk_inverse(pow256 
ret, const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (((size_t)a|(size_t)ret)%sizeof(limb_t) == 0 && is_endian.little) { + limb_t *out = (limb_t *)ret; + mul_mont_sparse_256(out, (const limb_t *)a, BLS12_381_rRR, + BLS12_381_r, r0); + reciprocal_fr(out, out); + from_mont_256(out, out, BLS12_381_r, r0); + } else { + vec256 out; + limbs_from_le_bytes(out, a, 32); + mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0); + reciprocal_fr(out, out); + from_mont_256(out, out, BLS12_381_r, r0); + le_bytes_from_limbs(ret, out, 32); + vec_zero(out, sizeof(out)); + } +} + +/* + * BLS12-381-specific Fp shortcuts to assembly. + */ +void blst_fp_add(vec384 ret, const vec384 a, const vec384 b) +{ add_fp(ret, a, b); } + +void blst_fp_sub(vec384 ret, const vec384 a, const vec384 b) +{ sub_fp(ret, a, b); } + +void blst_fp_mul_by_3(vec384 ret, const vec384 a) +{ mul_by_3_fp(ret, a); } + +void blst_fp_mul_by_8(vec384 ret, const vec384 a) +{ mul_by_8_fp(ret, a); } + +void blst_fp_lshift(vec384 ret, const vec384 a, size_t count) +{ lshift_fp(ret, a, count); } + +void blst_fp_mul(vec384 ret, const vec384 a, const vec384 b) +{ mul_fp(ret, a, b); } + +void blst_fp_sqr(vec384 ret, const vec384 a) +{ sqr_fp(ret, a); } + +void blst_fp_cneg(vec384 ret, const vec384 a, int flag) +{ cneg_fp(ret, a, is_zero(flag) ^ 1); } + +void blst_fp_to(vec384 ret, const vec384 a) +{ mul_fp(ret, a, BLS12_381_RR); } + +void blst_fp_from(vec384 ret, const vec384 a) +{ from_fp(ret, a); } + +/* + * Fp serialization/deserialization. + */ +void blst_fp_from_uint32(vec384 ret, const unsigned int a[12]) +{ + if (sizeof(limb_t) == 8) { + int i; + for (i = 0; i < 6; i++) + ret[i] = a[2*i] | ((limb_t)a[2*i+1] << (32 & (8*sizeof(limb_t)-1))); + a = (const unsigned int *)ret; + } + mul_fp(ret, (const limb_t *)a, BLS12_381_RR); +} + +void blst_uint32_from_fp(unsigned int ret[12], const vec384 a) +{ + if (sizeof(limb_t) == 4) { + from_fp((limb_t *)ret, a); + } else { + vec384 out; + int i; + + from_fp(out, a); + for (i = 0; i < 6; i++) { + limb_t limb = out[i]; + ret[2*i] = (unsigned int)limb; + ret[2*i+1] = (unsigned int)(limb >> (32 & (8*sizeof(limb_t)-1))); + } + } +} + +void blst_fp_from_uint64(vec384 ret, const unsigned long long a[6]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 4 && !is_endian.little) { + int i; + for (i = 0; i < 6; i++) { + unsigned long long limb = a[i]; + ret[2*i] = (limb_t)limb; + ret[2*i+1] = (limb_t)(limb >> 32); + } + a = (const unsigned long long *)ret; + } + mul_fp(ret, (const limb_t *)a, BLS12_381_RR); +} + +void blst_uint64_from_fp(unsigned long long ret[6], const vec384 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 8 || is_endian.little) { + from_fp((limb_t *)ret, a); + } else { + vec384 out; + int i; + + from_fp(out, a); + for (i = 0; i < 6; i++) + ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); + } +} + +void blst_fp_from_bendian(vec384 ret, const unsigned char a[48]) +{ + vec384 out; + + limbs_from_be_bytes(out, a, sizeof(vec384)); + mul_fp(ret, out, BLS12_381_RR); +} + +void blst_bendian_from_fp(unsigned char ret[48], const vec384 a) +{ + vec384 out; + + from_fp(out, a); + be_bytes_from_limbs(ret, out, sizeof(vec384)); +} + +void blst_fp_from_lendian(vec384 ret, const unsigned char a[48]) +{ + vec384 out; + + limbs_from_le_bytes(out, a, sizeof(vec384)); + mul_fp(ret, out, BLS12_381_RR); +} + +void blst_lendian_from_fp(unsigned char ret[48], const 
vec384 a) +{ + vec384 out; + + from_fp(out, a); + le_bytes_from_limbs(ret, out, sizeof(vec384)); +} + +/* + * BLS12-381-specific Fp2 shortcuts to assembly. + */ +void blst_fp2_add(vec384x ret, const vec384x a, const vec384x b) +{ add_fp2(ret, a, b); } + +void blst_fp2_sub(vec384x ret, const vec384x a, const vec384x b) +{ sub_fp2(ret, a, b); } + +void blst_fp2_mul_by_3(vec384x ret, const vec384x a) +{ mul_by_3_fp2(ret, a); } + +void blst_fp2_mul_by_8(vec384x ret, const vec384x a) +{ mul_by_8_fp2(ret, a); } + +void blst_fp2_lshift(vec384x ret, const vec384x a, size_t count) +{ lshift_fp2(ret, a, count); } + +void blst_fp2_mul(vec384x ret, const vec384x a, const vec384x b) +{ mul_fp2(ret, a, b); } + +void blst_fp2_sqr(vec384x ret, const vec384x a) +{ sqr_fp2(ret, a); } + +void blst_fp2_cneg(vec384x ret, const vec384x a, int flag) +{ cneg_fp2(ret, a, is_zero(flag) ^ 1); } + +/* + * Scalar serialization/deserialization. + */ +void blst_scalar_from_uint32(pow256 ret, const unsigned int a[8]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 8; i++) { + unsigned int w = a[i]; + *ret++ = (byte)w; + *ret++ = (byte)(w >> 8); + *ret++ = (byte)(w >> 16); + *ret++ = (byte)(w >> 24); + } +} + +void blst_uint32_from_scalar(unsigned int ret[8], const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 8; i++) { + unsigned int w = (unsigned int)(*a++); + w |= (unsigned int)(*a++) << 8; + w |= (unsigned int)(*a++) << 16; + w |= (unsigned int)(*a++) << 24; + ret[i] = w; + } +} + +void blst_scalar_from_uint64(pow256 ret, const unsigned long long a[4]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 4; i++) { + unsigned long long w = a[i]; + *ret++ = (byte)w; + *ret++ = (byte)(w >> 8); + *ret++ = (byte)(w >> 16); + *ret++ = (byte)(w >> 24); + *ret++ = (byte)(w >> 32); + *ret++ = (byte)(w >> 40); + *ret++ = (byte)(w >> 48); + *ret++ = (byte)(w >> 56); + } +} + +void blst_uint64_from_scalar(unsigned long long ret[4], const pow256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + size_t i; + + if ((uptr_t)ret==(uptr_t)a && is_endian.little) + return; + + for(i = 0; i < 4; i++) { + unsigned long long w = (unsigned long long)(*a++); + w |= (unsigned long long)(*a++) << 8; + w |= (unsigned long long)(*a++) << 16; + w |= (unsigned long long)(*a++) << 24; + w |= (unsigned long long)(*a++) << 32; + w |= (unsigned long long)(*a++) << 40; + w |= (unsigned long long)(*a++) << 48; + w |= (unsigned long long)(*a++) << 56; + ret[i] = w; + } +} + +void blst_scalar_from_bendian(pow256 ret, const unsigned char a[32]) +{ + vec256 out; + limbs_from_be_bytes(out, a, sizeof(out)); + le_bytes_from_limbs(ret, out, sizeof(out)); + vec_zero(out, sizeof(out)); +} + +void blst_bendian_from_scalar(unsigned char ret[32], const pow256 a) +{ + vec256 out; + limbs_from_le_bytes(out, a, sizeof(out)); + be_bytes_from_limbs(ret, out, sizeof(out)); + vec_zero(out, sizeof(out)); +} + +void blst_scalar_from_lendian(pow256 ret, const unsigned char a[32]) +{ + size_t i; + + if ((uptr_t)ret==(uptr_t)a) + return; + + for (i = 0; i < 32; i++) + ret[i] = a[i]; +} + +void blst_lendian_from_scalar(unsigned char ret[32], const pow256 a) +{ + size_t i; + + if ((uptr_t)ret==(uptr_t)a) + return; + 
+ for (i = 0; i < 32; i++) + ret[i] = a[i]; +} + +void blst_fr_from_uint64(vec256 ret, const unsigned long long a[4]) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 4 && !is_endian.little) { + int i; + for (i = 0; i < 4; i++) { + unsigned long long limb = a[i]; + ret[2*i] = (limb_t)limb; + ret[2*i+1] = (limb_t)(limb >> 32); + } + a = (const unsigned long long *)ret; + } + mul_mont_sparse_256(ret, (const limb_t *)a, BLS12_381_rRR, BLS12_381_r, r0); +} + +void blst_uint64_from_fr(unsigned long long ret[4], const vec256 a) +{ + const union { + long one; + char little; + } is_endian = { 1 }; + + if (sizeof(limb_t) == 8 || is_endian.little) { + from_mont_256((limb_t *)ret, a, BLS12_381_r, r0); + } else { + vec256 out; + int i; + + from_mont_256(out, a, BLS12_381_r, r0); + for (i = 0; i < 4; i++) + ret[i] = out[2*i] | ((unsigned long long)out[2*i+1] << 32); + vec_zero(out, sizeof(out)); + } +} + +int blst_scalar_from_le_bytes(pow256 out, const unsigned char *bytes, size_t n) +{ + size_t rem = (n - 1) % 32 + 1; + struct { vec256 out, digit; } t; + limb_t ret; + + vec_zero(t.out, sizeof(t.out)); + + n -= rem; + limbs_from_le_bytes(t.out, bytes += n, rem); + mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); + + while (n) { + limbs_from_le_bytes(t.digit, bytes -= 32, 32); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); + n -= 32; + } + + from_mont_256(t.out, t.out, BLS12_381_r, r0); + + ret = vec_is_zero(t.out, sizeof(t.out)); + le_bytes_from_limbs(out, t.out, 32); + vec_zero(&t, sizeof(t)); + + return (int)(ret^1); +} + +int blst_scalar_from_be_bytes(pow256 out, const unsigned char *bytes, size_t n) +{ + size_t rem = (n - 1) % 32 + 1; + struct { vec256 out, digit; } t; + limb_t ret; + + vec_zero(t.out, sizeof(t.out)); + + limbs_from_be_bytes(t.out, bytes, rem); + mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); + + while (n -= rem) { + limbs_from_be_bytes(t.digit, bytes += rem, 32); + add_mod_256(t.out, t.out, t.digit, BLS12_381_r); + mul_mont_sparse_256(t.out, BLS12_381_rRR, t.out, BLS12_381_r, r0); + rem = 32; + } + + from_mont_256(t.out, t.out, BLS12_381_r, r0); + + ret = vec_is_zero(t.out, sizeof(t.out)); + le_bytes_from_limbs(out, t.out, 32); + vec_zero(&t, sizeof(t)); + + return (int)(ret^1); +} + +/* + * Single-short SHA-256 hash function. + */ +#include "sha256.h" + +void blst_sha256(unsigned char md[32], const void *msg, size_t len) +{ + SHA256_CTX ctx; + + sha256_init(&ctx); + sha256_update(&ctx, msg, len); + sha256_final(md, &ctx); +} + +/* + * Test facilitator. + */ +void blst_scalar_from_hexascii(pow256 ret, const char *hex) +{ bytes_from_hexascii(ret, sizeof(pow256), hex); } + +void blst_fr_from_hexascii(vec256 ret, const char *hex) +{ + limbs_from_hexascii(ret, sizeof(vec256), hex); + mul_mont_sparse_256(ret, ret, BLS12_381_rRR, BLS12_381_r, r0); +} + +void blst_fp_from_hexascii(vec384 ret, const char *hex) +{ + limbs_from_hexascii(ret, sizeof(vec384), hex); + mul_fp(ret, ret, BLS12_381_RR); +} diff --git a/crypto/blst_src/fields.h b/crypto/blst_src/fields.h new file mode 100644 index 00000000000..4b2323d2cce --- /dev/null +++ b/crypto/blst_src/fields.h @@ -0,0 +1,116 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_FIELDS_H__ +#define __BLS12_381_ASM_FIELDS_H__ + +#include "vect.h" +#include "consts.h" + +/* + * BLS12-381-specific Fp shortcuts to assembly. + */ +static inline void add_fp(vec384 ret, const vec384 a, const vec384 b) +{ add_mod_384(ret, a, b, BLS12_381_P); } + +static inline void sub_fp(vec384 ret, const vec384 a, const vec384 b) +{ sub_mod_384(ret, a, b, BLS12_381_P); } + +static inline void mul_by_3_fp(vec384 ret, const vec384 a) +{ mul_by_3_mod_384(ret, a, BLS12_381_P); } + +static inline void mul_by_8_fp(vec384 ret, const vec384 a) +{ mul_by_8_mod_384(ret, a, BLS12_381_P); } + +static inline void lshift_fp(vec384 ret, const vec384 a, size_t count) +{ lshift_mod_384(ret, a, count, BLS12_381_P); } + +static inline void rshift_fp(vec384 ret, const vec384 a, size_t count) +{ rshift_mod_384(ret, a, count, BLS12_381_P); } + +static inline void div_by_2_fp(vec384 ret, const vec384 a) +{ div_by_2_mod_384(ret, a, BLS12_381_P); } + +static inline void mul_fp(vec384 ret, const vec384 a, const vec384 b) +{ mul_mont_384(ret, a, b, BLS12_381_P, p0); } + +static inline void sqr_fp(vec384 ret, const vec384 a) +{ sqr_mont_384(ret, a, BLS12_381_P, p0); } + +static inline void cneg_fp(vec384 ret, const vec384 a, bool_t flag) +{ cneg_mod_384(ret, a, flag, BLS12_381_P); } + +static inline void from_fp(vec384 ret, const vec384 a) +{ from_mont_384(ret, a, BLS12_381_P, p0); } + +static inline void redc_fp(vec384 ret, const vec768 a) +{ redc_mont_384(ret, a, BLS12_381_P, p0); } + +/* + * BLS12-381-specific Fp2 shortcuts to assembly. + */ +static inline void add_fp2(vec384x ret, const vec384x a, const vec384x b) +{ add_mod_384x(ret, a, b, BLS12_381_P); } + +static inline void sub_fp2(vec384x ret, const vec384x a, const vec384x b) +{ sub_mod_384x(ret, a, b, BLS12_381_P); } + +static inline void mul_by_3_fp2(vec384x ret, const vec384x a) +{ mul_by_3_mod_384x(ret, a, BLS12_381_P); } + +static inline void mul_by_8_fp2(vec384x ret, const vec384x a) +{ mul_by_8_mod_384x(ret, a, BLS12_381_P); } + +static inline void lshift_fp2(vec384x ret, const vec384x a, size_t count) +{ + lshift_mod_384(ret[0], a[0], count, BLS12_381_P); + lshift_mod_384(ret[1], a[1], count, BLS12_381_P); +} + +static inline void mul_fp2(vec384x ret, const vec384x a, const vec384x b) +{ mul_mont_384x(ret, a, b, BLS12_381_P, p0); } + +static inline void sqr_fp2(vec384x ret, const vec384x a) +{ sqr_mont_384x(ret, a, BLS12_381_P, p0); } + +static inline void cneg_fp2(vec384x ret, const vec384x a, bool_t flag) +{ + cneg_mod_384(ret[0], a[0], flag, BLS12_381_P); + cneg_mod_384(ret[1], a[1], flag, BLS12_381_P); +} + +#define vec_load_global vec_copy + +static void reciprocal_fp(vec384 out, const vec384 inp); +static void flt_reciprocal_fp(vec384 out, const vec384 inp); +static bool_t recip_sqrt_fp(vec384 out, const vec384 inp); +static bool_t sqrt_fp(vec384 out, const vec384 inp); + +static void reciprocal_fp2(vec384x out, const vec384x inp); +static void flt_reciprocal_fp2(vec384x out, const vec384x inp); +static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp, + const vec384x recip_ZZZ, const vec384x magic_ZZZ); +static bool_t sqrt_fp2(vec384x out, const vec384x inp); +static bool_t sqrt_align_fp2(vec384x out, const vec384x ret, + const vec384x sqrt, const vec384x inp); + +typedef vec384x vec384fp2; +typedef vec384fp2 vec384fp6[3]; +typedef vec384fp6 vec384fp12[2]; + +static void sqr_fp12(vec384fp12 ret, const vec384fp12 a); +static void cyclotomic_sqr_fp12(vec384fp12 ret, 
const vec384fp12 a); +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b); +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0); +static void conjugate_fp12(vec384fp12 a); +static void inverse_fp12(vec384fp12 ret, const vec384fp12 a); +/* caveat lector! |n| has to be non-zero and not more than 3! */ +static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n); + +#define neg_fp(r,a) cneg_fp((r),(a),1) +#define neg_fp2(r,a) cneg_fp2((r),(a),1) + +#endif /* __BLS12_381_ASM_FIELDS_H__ */ diff --git a/crypto/blst_src/fp12_tower.c b/crypto/blst_src/fp12_tower.c new file mode 100644 index 00000000000..d6c0b124eb6 --- /dev/null +++ b/crypto/blst_src/fp12_tower.c @@ -0,0 +1,789 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +/* + * Fp2 = Fp[u] / (u^2 + 1) + * Fp6 = Fp2[v] / (v^3 - u - 1) + * Fp12 = Fp6[w] / (w^2 - v) + */ + +static inline void mul_by_u_plus_1_fp2(vec384x ret, const vec384x a) +{ mul_by_1_plus_i_mod_384x(ret, a, BLS12_381_P); } + +#if 1 && !defined(__BLST_NO_ASM__) +#define __FP2x2__ +/* + * Fp2x2 is a "widened" version of Fp2, which allows to consolidate + * reductions from several multiplications. In other words instead of + * "mul_redc-mul_redc-add" we get "mul-mul-add-redc," where latter + * addition is double-width... To be more specific this gives ~7-10% + * faster pairing depending on platform... + */ +typedef vec768 vec768x[2]; + +static inline void add_fp2x2(vec768x ret, const vec768x a, const vec768x b) +{ + add_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); + add_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); +} + +static inline void sub_fp2x2(vec768x ret, const vec768x a, const vec768x b) +{ + sub_mod_384x384(ret[0], a[0], b[0], BLS12_381_P); + sub_mod_384x384(ret[1], a[1], b[1], BLS12_381_P); +} + +static inline void mul_by_u_plus_1_fp2x2(vec768x ret, const vec768x a) +{ + /* caveat lector! 
|ret| may not be same as |a| */ + sub_mod_384x384(ret[0], a[0], a[1], BLS12_381_P); + add_mod_384x384(ret[1], a[0], a[1], BLS12_381_P); +} + +static inline void redc_fp2x2(vec384x ret, const vec768x a) +{ + redc_mont_384(ret[0], a[0], BLS12_381_P, p0); + redc_mont_384(ret[1], a[1], BLS12_381_P, p0); +} + +static void mul_fp2x2(vec768x ret, const vec384x a, const vec384x b) +{ +#if 1 + mul_382x(ret, a, b, BLS12_381_P); /* +~6% in Miller loop */ +#else + union { vec384 x[2]; vec768 x2; } t; + + add_mod_384(t.x[0], a[0], a[1], BLS12_381_P); + add_mod_384(t.x[1], b[0], b[1], BLS12_381_P); + mul_384(ret[1], t.x[0], t.x[1]); + + mul_384(ret[0], a[0], b[0]); + mul_384(t.x2, a[1], b[1]); + + sub_mod_384x384(ret[1], ret[1], ret[0], BLS12_381_P); + sub_mod_384x384(ret[1], ret[1], t.x2, BLS12_381_P); + + sub_mod_384x384(ret[0], ret[0], t.x2, BLS12_381_P); +#endif +} + +static void sqr_fp2x2(vec768x ret, const vec384x a) +{ +#if 1 + sqr_382x(ret, a, BLS12_381_P); /* +~5% in final exponentiation */ +#else + vec384 t0, t1; + + add_mod_384(t0, a[0], a[1], BLS12_381_P); + sub_mod_384(t1, a[0], a[1], BLS12_381_P); + + mul_384(ret[1], a[0], a[1]); + add_mod_384x384(ret[1], ret[1], ret[1], BLS12_381_P); + + mul_384(ret[0], t0, t1); +#endif +} +#endif /* __FP2x2__ */ + +/* + * Fp6 extension + */ +#if defined(__FP2x2__) /* ~10-13% improvement for mul_fp12 and sqr_fp12 */ +typedef vec768x vec768fp6[3]; + +static inline void sub_fp6x2(vec768fp6 ret, const vec768fp6 a, + const vec768fp6 b) +{ + sub_fp2x2(ret[0], a[0], b[0]); + sub_fp2x2(ret[1], a[1], b[1]); + sub_fp2x2(ret[2], a[2], b[2]); +} + +static void mul_fp6x2(vec768fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec768x t0, t1, t2; + vec384x aa, bb; + + mul_fp2x2(t0, a[0], b[0]); + mul_fp2x2(t1, a[1], b[1]); + mul_fp2x2(t2, a[2], b[2]); + + /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 + = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ + add_fp2(aa, a[1], a[2]); + add_fp2(bb, b[1], b[2]); + mul_fp2x2(ret[0], aa, bb); + sub_fp2x2(ret[0], ret[0], t1); + sub_fp2x2(ret[0], ret[0], t2); + mul_by_u_plus_1_fp2x2(ret[1], ret[0]); /* borrow ret[1] for a moment */ + add_fp2x2(ret[0], ret[1], t0); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) + = a0*b1 + a1*b0 + a2*b2*(u+1) */ + add_fp2(aa, a[0], a[1]); + add_fp2(bb, b[0], b[1]); + mul_fp2x2(ret[1], aa, bb); + sub_fp2x2(ret[1], ret[1], t0); + sub_fp2x2(ret[1], ret[1], t1); + mul_by_u_plus_1_fp2x2(ret[2], t2); /* borrow ret[2] for a moment */ + add_fp2x2(ret[1], ret[1], ret[2]); + + /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 + = a0*b2 + a2*b0 + a1*b1 */ + add_fp2(aa, a[0], a[2]); + add_fp2(bb, b[0], b[2]); + mul_fp2x2(ret[2], aa, bb); + sub_fp2x2(ret[2], ret[2], t0); + sub_fp2x2(ret[2], ret[2], t2); + add_fp2x2(ret[2], ret[2], t1); +} + +static inline void redc_fp6x2(vec384fp6 ret, const vec768fp6 a) +{ + redc_fp2x2(ret[0], a[0]); + redc_fp2x2(ret[1], a[1]); + redc_fp2x2(ret[2], a[2]); +} + +static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec768fp6 r; + + mul_fp6x2(r, a, b); + redc_fp6x2(ret, r); /* narrow to normal width */ +} + +static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec768x s0, m01, m12, s2, rx; + + sqr_fp2x2(s0, a[0]); + + mul_fp2x2(m01, a[0], a[1]); + add_fp2x2(m01, m01, m01); + + mul_fp2x2(m12, a[1], a[2]); + add_fp2x2(m12, m12, m12); + + sqr_fp2x2(s2, a[2]); + + /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) + = a1^2 + 2*(a0*a2) */ + add_fp2(ret[2], a[2], a[1]); + add_fp2(ret[2], ret[2], a[0]); + 
sqr_fp2x2(rx, ret[2]); + sub_fp2x2(rx, rx, s0); + sub_fp2x2(rx, rx, s2); + sub_fp2x2(rx, rx, m01); + sub_fp2x2(rx, rx, m12); + redc_fp2x2(ret[2], rx); + + /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ + mul_by_u_plus_1_fp2x2(rx, m12); + add_fp2x2(rx, rx, s0); + redc_fp2x2(ret[0], rx); + + /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ + mul_by_u_plus_1_fp2x2(rx, s2); + add_fp2x2(rx, rx, m01); + redc_fp2x2(ret[1], rx); +} +#else +static void mul_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec384x t0, t1, t2, t3, t4, t5; + + mul_fp2(t0, a[0], b[0]); + mul_fp2(t1, a[1], b[1]); + mul_fp2(t2, a[2], b[2]); + + /* ret[0] = ((a1 + a2)*(b1 + b2) - a1*b1 - a2*b2)*(u+1) + a0*b0 + = (a1*b2 + a2*b1)*(u+1) + a0*b0 */ + add_fp2(t4, a[1], a[2]); + add_fp2(t5, b[1], b[2]); + mul_fp2(t3, t4, t5); + sub_fp2(t3, t3, t1); + sub_fp2(t3, t3, t2); + mul_by_u_plus_1_fp2(t3, t3); + /* add_fp2(ret[0], t3, t0); considering possible aliasing... */ + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*b2*(u+1) + = a0*b1 + a1*b0 + a2*b2*(u+1) */ + add_fp2(t4, a[0], a[1]); + add_fp2(t5, b[0], b[1]); + mul_fp2(ret[1], t4, t5); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); + mul_by_u_plus_1_fp2(t4, t2); + add_fp2(ret[1], ret[1], t4); + + /* ret[2] = (a0 + a2)*(b0 + b2) - a0*b0 - a2*b2 + a1*b1 + = a0*b2 + a2*b0 + a1*b1 */ + add_fp2(t4, a[0], a[2]); + add_fp2(t5, b[0], b[2]); + mul_fp2(ret[2], t4, t5); + sub_fp2(ret[2], ret[2], t0); + sub_fp2(ret[2], ret[2], t2); + add_fp2(ret[2], ret[2], t1); + + add_fp2(ret[0], t3, t0); /* ... moved from above */ +} + +static void sqr_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x s0, m01, m12, s2; + + sqr_fp2(s0, a[0]); + + mul_fp2(m01, a[0], a[1]); + add_fp2(m01, m01, m01); + + mul_fp2(m12, a[1], a[2]); + add_fp2(m12, m12, m12); + + sqr_fp2(s2, a[2]); + + /* ret[2] = (a0 + a1 + a2)^2 - a0^2 - a2^2 - 2*(a0*a1) - 2*(a1*a2) + = a1^2 + 2*(a0*a2) */ + add_fp2(ret[2], a[2], a[1]); + add_fp2(ret[2], ret[2], a[0]); + sqr_fp2(ret[2], ret[2]); + sub_fp2(ret[2], ret[2], s0); + sub_fp2(ret[2], ret[2], s2); + sub_fp2(ret[2], ret[2], m01); + sub_fp2(ret[2], ret[2], m12); + + /* ret[0] = a0^2 + 2*(a1*a2)*(u+1) */ + mul_by_u_plus_1_fp2(ret[0], m12); + add_fp2(ret[0], ret[0], s0); + + /* ret[1] = a2^2*(u+1) + 2*(a0*a1) */ + mul_by_u_plus_1_fp2(ret[1], s2); + add_fp2(ret[1], ret[1], m01); +} +#endif + +static void add_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + add_fp2(ret[0], a[0], b[0]); + add_fp2(ret[1], a[1], b[1]); + add_fp2(ret[2], a[2], b[2]); +} + +static void sub_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + sub_fp2(ret[0], a[0], b[0]); + sub_fp2(ret[1], a[1], b[1]); + sub_fp2(ret[2], a[2], b[2]); +} + +static void neg_fp6(vec384fp6 ret, const vec384fp6 a) +{ + neg_fp2(ret[0], a[0]); + neg_fp2(ret[1], a[1]); + neg_fp2(ret[2], a[2]); +} + +#if 0 +#define mul_by_v_fp6 mul_by_v_fp6 +static void mul_by_v_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x t; + + mul_by_u_plus_1_fp2(t, a[2]); + vec_copy(ret[2], a[1], sizeof(a[1])); + vec_copy(ret[1], a[0], sizeof(a[0])); + vec_copy(ret[0], t, sizeof(t)); +} +#endif + +/* + * Fp12 extension + */ +#if defined(__FP2x2__) +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ + vec768fp6 t0, t1, rx; + vec384fp6 t2; + + mul_fp6x2(t0, a[0], b[0]); + mul_fp6x2(t1, a[1], b[1]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + add_fp6(t2, a[0], a[1]); + add_fp6(ret[1], b[0], b[1]); + mul_fp6x2(rx, ret[1], t2); + sub_fp6x2(rx, rx, t0); + sub_fp6x2(rx, 
rx, t1); + redc_fp6x2(ret[1], rx); + + /* ret[0] = a0*b0 + a1*b1*v */ + mul_by_u_plus_1_fp2x2(rx[0], t1[2]); + add_fp2x2(rx[0], t0[0], rx[0]); + add_fp2x2(rx[1], t0[1], t1[0]); + add_fp2x2(rx[2], t0[2], t1[1]); + redc_fp6x2(ret[0], rx); +} + +static inline void mul_by_0y0_fp6x2(vec768fp6 ret, const vec384fp6 a, + const vec384fp2 b) +{ + mul_fp2x2(ret[1], a[2], b); /* borrow ret[1] for a moment */ + mul_by_u_plus_1_fp2x2(ret[0], ret[1]); + mul_fp2x2(ret[1], a[0], b); + mul_fp2x2(ret[2], a[1], b); +} + +static void mul_by_xy0_fp6x2(vec768fp6 ret, const vec384fp6 a, + const vec384fp6 b) +{ + vec768x t0, t1; + vec384x aa, bb; + + mul_fp2x2(t0, a[0], b[0]); + mul_fp2x2(t1, a[1], b[1]); + + /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 + = (a1*0 + a2*b1)*(u+1) + a0*b0 */ + mul_fp2x2(ret[1], a[2], b[1]); /* borrow ret[1] for a moment */ + mul_by_u_plus_1_fp2x2(ret[0], ret[1]); + add_fp2x2(ret[0], ret[0], t0); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) + = a0*b1 + a1*b0 + a2*0*(u+1) */ + add_fp2(aa, a[0], a[1]); + add_fp2(bb, b[0], b[1]); + mul_fp2x2(ret[1], aa, bb); + sub_fp2x2(ret[1], ret[1], t0); + sub_fp2x2(ret[1], ret[1], t1); + + /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 + = a0*0 + a2*b0 + a1*b1 */ + mul_fp2x2(ret[2], a[2], b[0]); + add_fp2x2(ret[2], ret[2], t1); +} + +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ + vec768fp6 t0, t1, rr; + vec384fp6 t2; + + mul_by_xy0_fp6x2(t0, a[0], xy00z0); + mul_by_0y0_fp6x2(t1, a[1], xy00z0[2]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); + add_fp2(t2[1], xy00z0[1], xy00z0[2]); + add_fp6(ret[1], a[0], a[1]); + mul_by_xy0_fp6x2(rr, ret[1], t2); + sub_fp6x2(rr, rr, t0); + sub_fp6x2(rr, rr, t1); + redc_fp6x2(ret[1], rr); + + /* ret[0] = a0*b0 + a1*b1*v */ + mul_by_u_plus_1_fp2x2(rr[0], t1[2]); + add_fp2x2(rr[0], t0[0], rr[0]); + add_fp2x2(rr[1], t0[1], t1[0]); + add_fp2x2(rr[2], t0[2], t1[1]); + redc_fp6x2(ret[0], rr); +} +#else +static void mul_fp12(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ + vec384fp6 t0, t1, t2; + + mul_fp6(t0, a[0], b[0]); + mul_fp6(t1, a[1], b[1]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + add_fp6(t2, a[0], a[1]); + add_fp6(ret[1], b[0], b[1]); + mul_fp6(ret[1], ret[1], t2); + sub_fp6(ret[1], ret[1], t0); + sub_fp6(ret[1], ret[1], t1); + + /* ret[0] = a0*b0 + a1*b1*v */ +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + add_fp6(ret[0], t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + add_fp2(ret[0][0], t0[0], t1[2]); + add_fp2(ret[0][1], t0[1], t1[0]); + add_fp2(ret[0][2], t0[2], t1[1]); +#endif +} + +static inline void mul_by_0y0_fp6(vec384fp6 ret, const vec384fp6 a, + const vec384fp2 b) +{ + vec384x t; + + mul_fp2(t, a[2], b); + mul_fp2(ret[2], a[1], b); + mul_fp2(ret[1], a[0], b); + mul_by_u_plus_1_fp2(ret[0], t); +} + +static void mul_by_xy0_fp6(vec384fp6 ret, const vec384fp6 a, const vec384fp6 b) +{ + vec384x t0, t1, /*t2,*/ t3, t4, t5; + + mul_fp2(t0, a[0], b[0]); + mul_fp2(t1, a[1], b[1]); + + /* ret[0] = ((a1 + a2)*(b1 + 0) - a1*b1 - a2*0)*(u+1) + a0*b0 + = (a1*0 + a2*b1)*(u+1) + a0*b0 */ + mul_fp2(t3, a[2], b[1]); + mul_by_u_plus_1_fp2(t3, t3); + /* add_fp2(ret[0], t3, t0); considering possible aliasing... 
*/ + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + a2*0*(u+1) + = a0*b1 + a1*b0 + a2*0*(u+1) */ + add_fp2(t4, a[0], a[1]); + add_fp2(t5, b[0], b[1]); + mul_fp2(ret[1], t4, t5); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); + + /* ret[2] = (a0 + a2)*(b0 + 0) - a0*b0 - a2*0 + a1*b1 + = a0*0 + a2*b0 + a1*b1 */ + mul_fp2(ret[2], a[2], b[0]); + add_fp2(ret[2], ret[2], t1); + + add_fp2(ret[0], t3, t0); /* ... moved from above */ +} + +static void mul_by_xy00z0_fp12(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ + vec384fp6 t0, t1, t2; + + mul_by_xy0_fp6(t0, a[0], xy00z0); + mul_by_0y0_fp6(t1, a[1], xy00z0[2]); + + /* ret[1] = (a0 + a1)*(b0 + b1) - a0*b0 - a1*b1 + = a0*b1 + a1*b0 */ + vec_copy(t2[0], xy00z0[0], sizeof(t2[0])); + add_fp2(t2[1], xy00z0[1], xy00z0[2]); + add_fp6(ret[1], a[0], a[1]); + mul_by_xy0_fp6(ret[1], ret[1], t2); + sub_fp6(ret[1], ret[1], t0); + sub_fp6(ret[1], ret[1], t1); + + /* ret[0] = a0*b0 + a1*b1*v */ +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + add_fp6(ret[0], t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + add_fp2(ret[0][0], t0[0], t1[2]); + add_fp2(ret[0][1], t0[1], t1[0]); + add_fp2(ret[0][2], t0[2], t1[1]); +#endif +} +#endif + +static void sqr_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp6 t0, t1; + + add_fp6(t0, a[0], a[1]); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, a[1]); + add_fp6(t1, a[0], t1); +#else + mul_by_u_plus_1_fp2(t1[2], a[1][2]); + add_fp2(t1[0], a[0][0], t1[2]); + add_fp2(t1[1], a[0][1], a[1][0]); + add_fp2(t1[2], a[0][2], a[1][1]); +#endif + mul_fp6(t0, t0, t1); + mul_fp6(t1, a[0], a[1]); + + /* ret[1] = 2*(a0*a1) */ + add_fp6(ret[1], t1, t1); + + /* ret[0] = (a0 + a1)*(a0 + a1*v) - a0*a1 - a0*a1*v + = a0^2 + a1^2*v */ + sub_fp6(ret[0], t0, t1); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + sub_fp6(ret[0], ret[0], t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + sub_fp2(ret[0][0], ret[0][0], t1[2]); + sub_fp2(ret[0][1], ret[0][1], t1[0]); + sub_fp2(ret[0][2], ret[0][2], t1[1]); +#endif +} + +static void conjugate_fp12(vec384fp12 a) +{ neg_fp6(a[1], a[1]); } + +static void inverse_fp6(vec384fp6 ret, const vec384fp6 a) +{ + vec384x c0, c1, c2, t0, t1; + + /* c0 = a0^2 - (a1*a2)*(u+1) */ + sqr_fp2(c0, a[0]); + mul_fp2(t0, a[1], a[2]); + mul_by_u_plus_1_fp2(t0, t0); + sub_fp2(c0, c0, t0); + + /* c1 = a2^2*(u+1) - (a0*a1) */ + sqr_fp2(c1, a[2]); + mul_by_u_plus_1_fp2(c1, c1); + mul_fp2(t0, a[0], a[1]); + sub_fp2(c1, c1, t0); + + /* c2 = a1^2 - a0*a2 */ + sqr_fp2(c2, a[1]); + mul_fp2(t0, a[0], a[2]); + sub_fp2(c2, c2, t0); + + /* (a2*c1 + a1*c2)*(u+1) + a0*c0 */ + mul_fp2(t0, c1, a[2]); + mul_fp2(t1, c2, a[1]); + add_fp2(t0, t0, t1); + mul_by_u_plus_1_fp2(t0, t0); + mul_fp2(t1, c0, a[0]); + add_fp2(t0, t0, t1); + + reciprocal_fp2(t1, t0); + + mul_fp2(ret[0], c0, t1); + mul_fp2(ret[1], c1, t1); + mul_fp2(ret[2], c2, t1); +} + +static void inverse_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp6 t0, t1; + + sqr_fp6(t0, a[0]); + sqr_fp6(t1, a[1]); +#ifdef mul_by_v_fp6 + mul_by_v_fp6(t1, t1); + sub_fp6(t0, t0, t1); +#else + mul_by_u_plus_1_fp2(t1[2], t1[2]); + sub_fp2(t0[0], t0[0], t1[2]); + sub_fp2(t0[1], t0[1], t1[0]); + sub_fp2(t0[2], t0[2], t1[1]); +#endif + + inverse_fp6(t1, t0); + + mul_fp6(ret[0], a[0], t1); + mul_fp6(ret[1], a[1], t1); + neg_fp6(ret[1], ret[1]); +} + +typedef vec384x vec384fp4[2]; + +#if defined(__FP2x2__) +static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) +{ + vec768x t0, t1, t2; + + sqr_fp2x2(t0, a0); + sqr_fp2x2(t1, a1); + add_fp2(ret[1], a0, 
a1); + + mul_by_u_plus_1_fp2x2(t2, t1); + add_fp2x2(t2, t2, t0); + redc_fp2x2(ret[0], t2); + + sqr_fp2x2(t2, ret[1]); + sub_fp2x2(t2, t2, t0); + sub_fp2x2(t2, t2, t1); + redc_fp2x2(ret[1], t2); +} +#else +static void sqr_fp4(vec384fp4 ret, const vec384x a0, const vec384x a1) +{ + vec384x t0, t1; + + sqr_fp2(t0, a0); + sqr_fp2(t1, a1); + add_fp2(ret[1], a0, a1); + + mul_by_u_plus_1_fp2(ret[0], t1); + add_fp2(ret[0], ret[0], t0); + + sqr_fp2(ret[1], ret[1]); + sub_fp2(ret[1], ret[1], t0); + sub_fp2(ret[1], ret[1], t1); +} +#endif + +static void cyclotomic_sqr_fp12(vec384fp12 ret, const vec384fp12 a) +{ + vec384fp4 t0, t1, t2; + + sqr_fp4(t0, a[0][0], a[1][1]); + sqr_fp4(t1, a[1][0], a[0][2]); + sqr_fp4(t2, a[0][1], a[1][2]); + + sub_fp2(ret[0][0], t0[0], a[0][0]); + add_fp2(ret[0][0], ret[0][0], ret[0][0]); + add_fp2(ret[0][0], ret[0][0], t0[0]); + + sub_fp2(ret[0][1], t1[0], a[0][1]); + add_fp2(ret[0][1], ret[0][1], ret[0][1]); + add_fp2(ret[0][1], ret[0][1], t1[0]); + + sub_fp2(ret[0][2], t2[0], a[0][2]); + add_fp2(ret[0][2], ret[0][2], ret[0][2]); + add_fp2(ret[0][2], ret[0][2], t2[0]); + + mul_by_u_plus_1_fp2(t2[1], t2[1]); + add_fp2(ret[1][0], t2[1], a[1][0]); + add_fp2(ret[1][0], ret[1][0], ret[1][0]); + add_fp2(ret[1][0], ret[1][0], t2[1]); + + add_fp2(ret[1][1], t0[1], a[1][1]); + add_fp2(ret[1][1], ret[1][1], ret[1][1]); + add_fp2(ret[1][1], ret[1][1], t0[1]); + + add_fp2(ret[1][2], t1[1], a[1][2]); + add_fp2(ret[1][2], ret[1][2], ret[1][2]); + add_fp2(ret[1][2], ret[1][2], t1[1]); +} + +/* + * caveat lector! |n| has to be non-zero and not more than 3! + */ +static inline void frobenius_map_fp2(vec384x ret, const vec384x a, size_t n) +{ + vec_copy(ret[0], a[0], sizeof(ret[0])); + cneg_fp(ret[1], a[1], n & 1); +} + +static void frobenius_map_fp6(vec384fp6 ret, const vec384fp6 a, size_t n) +{ + static const vec384x coeffs1[] = { /* (u + 1)^((P^n - 1) / 3) */ + { { 0 }, + { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) } }, + { { TO_LIMB_T(0x30f1361b798a64e8), TO_LIMB_T(0xf3b8ddab7ece5a2a), + TO_LIMB_T(0x16a8ca3ac61577f7), TO_LIMB_T(0xc26a2ff874fd029b), + TO_LIMB_T(0x3636b76660701c6e), TO_LIMB_T(0x051ba4ab241b6160) } }, + { { 0 }, { ONE_MONT_P } } + }; + static const vec384 coeffs2[] = { /* (u + 1)^((2P^n - 2) / 3) */ + { TO_LIMB_T(0x890dc9e4867545c3), TO_LIMB_T(0x2af322533285a5d5), + TO_LIMB_T(0x50880866309b7e2c), TO_LIMB_T(0xa20d1b8c7e881024), + TO_LIMB_T(0x14e4f04fe2db9068), TO_LIMB_T(0x14e56d3f1564853a) }, + { TO_LIMB_T(0xcd03c9e48671f071), TO_LIMB_T(0x5dab22461fcda5d2), + TO_LIMB_T(0x587042afd3851b95), TO_LIMB_T(0x8eb60ebe01bacb9e), + TO_LIMB_T(0x03f97d6e83d050d2), TO_LIMB_T(0x18f0206554638741) }, + { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), + TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), + TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } + }; + + frobenius_map_fp2(ret[0], a[0], n); + frobenius_map_fp2(ret[1], a[1], n); + frobenius_map_fp2(ret[2], a[2], n); + --n; /* implied ONE_MONT_P at index 0 */ + mul_fp2(ret[1], ret[1], coeffs1[n]); + mul_fp(ret[2][0], ret[2][0], coeffs2[n]); + mul_fp(ret[2][1], ret[2][1], coeffs2[n]); +} + +static void frobenius_map_fp12(vec384fp12 ret, const vec384fp12 a, size_t n) +{ + static const vec384x coeffs[] = { /* (u + 1)^((P^n - 1) / 6) */ + { { TO_LIMB_T(0x07089552b319d465), TO_LIMB_T(0xc6695f92b50a8313), + TO_LIMB_T(0x97e83cccd117228f), 
TO_LIMB_T(0xa35baecab2dc29ee), + TO_LIMB_T(0x1ce393ea5daace4d), TO_LIMB_T(0x08f2220fb0fb66eb) }, + { TO_LIMB_T(0xb2f66aad4ce5d646), TO_LIMB_T(0x5842a06bfc497cec), + TO_LIMB_T(0xcf4895d42599d394), TO_LIMB_T(0xc11b9cba40a8e8d0), + TO_LIMB_T(0x2e3813cbe5a0de89), TO_LIMB_T(0x110eefda88847faf) } }, + { { TO_LIMB_T(0xecfb361b798dba3a), TO_LIMB_T(0xc100ddb891865a2c), + TO_LIMB_T(0x0ec08ff1232bda8e), TO_LIMB_T(0xd5c13cc6f1ca4721), + TO_LIMB_T(0x47222a47bf7b5c04), TO_LIMB_T(0x0110f184e51c5f59) } }, + { { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } }, + }; + + frobenius_map_fp6(ret[0], a[0], n); + frobenius_map_fp6(ret[1], a[1], n); + --n; /* implied ONE_MONT_P at index 0 */ + mul_fp2(ret[1][0], ret[1][0], coeffs[n]); + mul_fp2(ret[1][1], ret[1][1], coeffs[n]); + mul_fp2(ret[1][2], ret[1][2], coeffs[n]); +} + + +/* + * BLS12-381-specific Fp12 shortcuts. + */ +void blst_fp12_sqr(vec384fp12 ret, const vec384fp12 a) +{ sqr_fp12(ret, a); } + +void blst_fp12_cyclotomic_sqr(vec384fp12 ret, const vec384fp12 a) +{ cyclotomic_sqr_fp12(ret, a); } + +void blst_fp12_mul(vec384fp12 ret, const vec384fp12 a, const vec384fp12 b) +{ mul_fp12(ret, a, b); } + +void blst_fp12_mul_by_xy00z0(vec384fp12 ret, const vec384fp12 a, + const vec384fp6 xy00z0) +{ mul_by_xy00z0_fp12(ret, a, xy00z0); } + +void blst_fp12_conjugate(vec384fp12 a) +{ conjugate_fp12(a); } + +void blst_fp12_inverse(vec384fp12 ret, const vec384fp12 a) +{ inverse_fp12(ret, a); } + +/* caveat lector! |n| has to be non-zero and not more than 3! */ +void blst_fp12_frobenius_map(vec384fp12 ret, const vec384fp12 a, size_t n) +{ frobenius_map_fp12(ret, a, n); } + +int blst_fp12_is_equal(const vec384fp12 a, const vec384fp12 b) +{ return (int)vec_is_equal(a, b, sizeof(vec384fp12)); } + +int blst_fp12_is_one(const vec384fp12 a) +{ + return (int)(vec_is_equal(a[0][0], BLS12_381_Rx.p2, sizeof(a[0][0])) & + vec_is_zero(a[0][1], sizeof(vec384fp12) - sizeof(a[0][0]))); +} + +const vec384fp12 *blst_fp12_one(void) +{ return (const vec384fp12 *)BLS12_381_Rx.p12; } + +void blst_bendian_from_fp12(unsigned char ret[48*12], const vec384fp12 a) +{ + size_t i, j; + vec384 out; + + for (i = 0; i < 3; i++) { + for (j = 0; j < 2; j++) { + from_fp(out, a[j][i][0]); + be_bytes_from_limbs(ret, out, sizeof(vec384)); ret += 48; + from_fp(out, a[j][i][1]); + be_bytes_from_limbs(ret, out, sizeof(vec384)); ret += 48; + } + } +} + +size_t blst_fp12_sizeof(void) +{ return sizeof(vec384fp12); } diff --git a/crypto/blst_src/hash_to_field.c b/crypto/blst_src/hash_to_field.c new file mode 100644 index 00000000000..6816ea8b922 --- /dev/null +++ b/crypto/blst_src/hash_to_field.c @@ -0,0 +1,177 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" +#include "sha256.h" + +static const vec384 BLS12_381_RRRR = { /* RR^2 */ + TO_LIMB_T(0xed48ac6bd94ca1e0), TO_LIMB_T(0x315f831e03a7adf8), + TO_LIMB_T(0x9a53352a615e29dd), TO_LIMB_T(0x34c04e5e921e1761), + TO_LIMB_T(0x2512d43565724728), TO_LIMB_T(0x0aa6346091755d4d) +}; + +#ifdef expand_message_xmd +void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len); +#else +static void sha256_init_Zpad(SHA256_CTX *ctx) +{ + ctx->h[0] = 0xda5698beU; + ctx->h[1] = 0x17b9b469U; + ctx->h[2] = 0x62335799U; + ctx->h[3] = 0x779fbecaU; + ctx->h[4] = 0x8ce5d491U; + ctx->h[5] = 0xc0d26243U; + ctx->h[6] = 0xbafef9eaU; + ctx->h[7] = 0x1837a9d8U; + ctx->N = 64; + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; +} + +static void vec_xor(void *restrict ret, const void *restrict a, + const void *restrict b, size_t num) +{ + limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = ap[i] ^ bp[i]; +} + +static void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + union { limb_t align; unsigned char c[32]; } b_0; + union { limb_t align; unsigned char c[33+256+31]; } b_i; + unsigned char *p; + size_t i, b_i_bits, b_i_blocks; + SHA256_CTX ctx; + + /* + * compose template for 'strxor(b_0, b_(i-1)) || I2OSP(i, 1) || DST_prime' + */ + if (DST_len > 255) { + sha256_init(&ctx); + sha256_update(&ctx, "H2C-OVERSIZE-DST-", 17); + sha256_update(&ctx, DST, DST_len); + sha256_final(b_0.c, &ctx); + DST = b_0.c, DST_len = 32; + } + b_i_blocks = ((33 + DST_len + 1 + 9) + 63) & -64; + vec_zero(b_i.c + b_i_blocks - 64, 64); + + p = b_i.c + 33; + for (i = 0; i < DST_len; i++) + p[i] = DST[i]; + p[i++] = (unsigned char)DST_len; + p[i++] = 0x80; + p[i+6] = p[i+5] = p[i+4] = p[i+3] = p[i+2] = p[i+1] = p[i+0] = 0; + b_i_bits = (33 + DST_len + 1) * 8; + p = b_i.c + b_i_blocks; + p[-2] = (unsigned char)(b_i_bits >> 8); + p[-1] = (unsigned char)(b_i_bits); + + sha256_init_Zpad(&ctx); /* Z_pad | */ + sha256_update(&ctx, aug, aug_len); /* | aug | */ + sha256_update(&ctx, msg, msg_len); /* | msg | */ + /* | I2OSP(len_in_bytes, 2) || I2OSP(0, 1) || DST_prime */ + b_i.c[30] = (unsigned char)(len_in_bytes >> 8); + b_i.c[31] = (unsigned char)(len_in_bytes); + b_i.c[32] = 0; + sha256_update(&ctx, b_i.c + 30, 3 + DST_len + 1); + sha256_final(b_0.c, &ctx); + + sha256_init_h(ctx.h); + vec_copy(b_i.c, b_0.c, 32); + ++b_i.c[32]; + sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); + sha256_emit(bytes, ctx.h); + + len_in_bytes += 31; /* ell = ceil(len_in_bytes / b_in_bytes), with */ + len_in_bytes /= 32; /* caller being responsible for accordingly large + * buffer. hash_to_field passes one with length + * divisible by 64, remember? which works... 
*/ + while (--len_in_bytes) { + sha256_init_h(ctx.h); + vec_xor(b_i.c, b_0.c, bytes, 32); + bytes += 32; + ++b_i.c[32]; + sha256_block_data_order(ctx.h, b_i.c, b_i_blocks / 64); + sha256_emit(bytes, ctx.h); + } +} +#endif + +/* + * |nelems| is 'count * m' from spec + */ +static void hash_to_field(vec384 elems[], size_t nelems, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + size_t L = sizeof(vec384) + 128/8; /* ceil((ceil(log2(p)) + k) / 8) */ + size_t len_in_bytes = L * nelems; /* divisible by 64, hurray! */ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ + || defined(__STDC_NO_VLA__) + limb_t *pseudo_random = alloca(len_in_bytes); +#else + limb_t pseudo_random[len_in_bytes/sizeof(limb_t)]; +#endif + unsigned char *bytes; + vec768 elem; + + aug_len = aug!=NULL ? aug_len : 0; + DST_len = DST!=NULL ? DST_len : 0; + + expand_message_xmd((unsigned char *)pseudo_random, len_in_bytes, + aug, aug_len, msg, msg_len, DST, DST_len); + + vec_zero(elem, sizeof(elem)); + bytes = (unsigned char *)pseudo_random; + while (nelems--) { + limbs_from_be_bytes(elem, bytes, L); + bytes += L; + /* + * L-bytes block % P, output is in Montgomery domain... + */ + redc_mont_384(elems[0], elem, BLS12_381_P, p0); + mul_mont_384(elems[0], elems[0], BLS12_381_RRRR, BLS12_381_P, p0); + elems++; + } +} + +void blst_expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len) +{ + size_t buf_len = (len_in_bytes+31) & ((size_t)0-32); + unsigned char *buf_ptr = bytes; + + if (buf_len > 255*32) + return; + + if (buf_len != len_in_bytes) + buf_ptr = alloca(buf_len); + + expand_message_xmd(buf_ptr, len_in_bytes, NULL, 0, msg, msg_len, + DST, DST_len); + if (buf_ptr != bytes) { + unsigned char *ptr = buf_ptr; + while (len_in_bytes--) + *bytes++ = *ptr++; + vec_zero(buf_ptr, buf_len); + } +} diff --git a/crypto/blst_src/keygen.c b/crypto/blst_src/keygen.c new file mode 100644 index 00000000000..9b62f16b534 --- /dev/null +++ b/crypto/blst_src/keygen.c @@ -0,0 +1,319 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "consts.h" +#include "bytes.h" +#include "sha256.h" + +typedef struct { + SHA256_CTX ctx; + unsigned int h_ipad[8]; + unsigned int h_opad[8]; + union { limb_t l[64/sizeof(limb_t)]; unsigned char c[64]; } tail; +} HMAC_SHA256_CTX; + +static void HMAC_init(HMAC_SHA256_CTX *ctx, const void *K, size_t K_len) +{ + size_t i; + + if (K == NULL) { /* reuse h_ipad and h_opad */ + sha256_hcopy(ctx->ctx.h, ctx->h_ipad); + ctx->ctx.N = 64; + vec_zero(ctx->ctx.buf, sizeof(ctx->ctx.buf)); + ctx->ctx.off = 0; + + return; + } + + vec_zero(ctx->tail.c, sizeof(ctx->tail)); + if (K_len > 64) { + sha256_init(&ctx->ctx); + sha256_update(&ctx->ctx, K, K_len); + sha256_final(ctx->tail.c, &ctx->ctx); + } else { + sha256_bcopy(ctx->tail.c, K, K_len); + } + + for (i = 0; i < 64/sizeof(limb_t); i++) + ctx->tail.l[i] ^= (limb_t)0x3636363636363636; + + sha256_init(&ctx->ctx); + sha256_update(&ctx->ctx, ctx->tail.c, 64); + sha256_hcopy(ctx->h_ipad, ctx->ctx.h); + + for (i = 0; i < 64/sizeof(limb_t); i++) + ctx->tail.l[i] ^= (limb_t)(0x3636363636363636 ^ 0x5c5c5c5c5c5c5c5c); + + sha256_init_h(ctx->h_opad); + sha256_block_data_order(ctx->h_opad, ctx->tail.c, 1); + + vec_zero(ctx->tail.c, sizeof(ctx->tail)); + ctx->tail.c[32] = 0x80; + ctx->tail.c[62] = 3; /* (64+32)*8 in big endian */ + ctx->tail.c[63] = 0; +} + +static void HMAC_update(HMAC_SHA256_CTX *ctx, const unsigned char *inp, + size_t len) +{ sha256_update(&ctx->ctx, inp, len); } + +static void HMAC_final(unsigned char md[32], HMAC_SHA256_CTX *ctx) +{ + sha256_final(ctx->tail.c, &ctx->ctx); + sha256_hcopy(ctx->ctx.h, ctx->h_opad); + sha256_block_data_order(ctx->ctx.h, ctx->tail.c, 1); + sha256_emit(md, ctx->ctx.h); +} + +static void HKDF_Extract(unsigned char PRK[32], + const void *salt, size_t salt_len, + const void *IKM, size_t IKM_len, +#ifndef __BLST_HKDF_TESTMODE__ + int IKM_fixup, +#endif + HMAC_SHA256_CTX *ctx) +{ + unsigned char zero[1] = { 0 }; + + HMAC_init(ctx, salt != NULL ? 
salt : zero, salt_len); + HMAC_update(ctx, IKM, IKM_len); +#ifndef __BLST_HKDF_TESTMODE__ + if (IKM_fixup) { + /* Section 2.3 KeyGen in BLS-signature draft */ + HMAC_update(ctx, zero, 1); + } +#endif + HMAC_final(PRK, ctx); +} + +static void HKDF_Expand(unsigned char *OKM, size_t L, + const unsigned char PRK[32], + const void *info, size_t info_len, +#ifndef __BLST_HKDF_TESTMODE__ + int info_fixup, +#endif + HMAC_SHA256_CTX *ctx) +{ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ + || defined(__STDC_NO_VLA__) + unsigned char *info_prime = alloca(info_len + 2 + 1); +#else + unsigned char info_prime[info_len + 2 + 1]; +#endif + + HMAC_init(ctx, PRK, 32); + + if (info_len != 0) + sha256_bcopy(info_prime, info, info_len); +#ifndef __BLST_HKDF_TESTMODE__ + if (info_fixup) { + /* Section 2.3 KeyGen in BLS-signature draft */ + info_prime[info_len + 0] = (unsigned char)(L >> 8); + info_prime[info_len + 1] = (unsigned char)(L); + info_len += 2; + } +#endif + info_prime[info_len] = 1; /* counter */ + HMAC_update(ctx, info_prime, info_len + 1); + HMAC_final(ctx->tail.c, ctx); + while (L > 32) { + sha256_hcopy((unsigned int *)OKM, (const unsigned int *)ctx->tail.c); + OKM += 32; L -= 32; + ++info_prime[info_len]; /* counter */ + HMAC_init(ctx, NULL, 0); + HMAC_update(ctx, ctx->tail.c, 32); + HMAC_update(ctx, info_prime, info_len + 1); + HMAC_final(ctx->tail.c, ctx); + } + sha256_bcopy(OKM, ctx->tail.c, L); +} + +#ifndef __BLST_HKDF_TESTMODE__ +static void keygen(pow256 SK, const void *IKM, size_t IKM_len, + const void *salt, size_t salt_len, + const void *info, size_t info_len, + int version) +{ + struct { + HMAC_SHA256_CTX ctx; + unsigned char PRK[32], OKM[48]; + vec512 key; + } scratch; + unsigned char salt_prime[32] = "BLS-SIG-KEYGEN-SALT-"; + + if (IKM_len < 32 || (version > 4 && salt == NULL)) { + vec_zero(SK, sizeof(pow256)); + return; + } + + /* + * Vet |info| since some callers were caught to be sloppy, e.g. + * SWIG-4.0-generated Python wrapper... + */ + info_len = info==NULL ? 0 : info_len; + + if (salt == NULL) { + salt = salt_prime; + salt_len = 20; + } + + if (version == 4) { + /* salt = H(salt) */ + sha256_init(&scratch.ctx.ctx); + sha256_update(&scratch.ctx.ctx, salt, salt_len); + sha256_final(salt_prime, &scratch.ctx.ctx); + salt = salt_prime; + salt_len = sizeof(salt_prime); + } + + while (1) { + /* PRK = HKDF-Extract(salt, IKM || I2OSP(0, 1)) */ + HKDF_Extract(scratch.PRK, salt, salt_len, + IKM, IKM_len, 1, &scratch.ctx); + + /* OKM = HKDF-Expand(PRK, key_info || I2OSP(L, 2), L) */ + HKDF_Expand(scratch.OKM, sizeof(scratch.OKM), scratch.PRK, + info, info_len, 1, &scratch.ctx); + + /* SK = OS2IP(OKM) mod r */ + vec_zero(scratch.key, sizeof(scratch.key)); + limbs_from_be_bytes(scratch.key, scratch.OKM, sizeof(scratch.OKM)); + redc_mont_256(scratch.key, scratch.key, BLS12_381_r, r0); + /* + * Given that mul_mont_sparse_256 has special boundary conditions + * it's appropriate to mention that redc_mont_256 output is fully + * reduced at this point. Because we started with 384-bit input, + * one with most significant half smaller than the modulus. 
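+         * (i.e. the 512-bit buffer holds a value below r*2^256, the precondition for redc_mont_256 to return a fully reduced result)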
+ */ + mul_mont_sparse_256(scratch.key, scratch.key, BLS12_381_rRR, + BLS12_381_r, r0); + + if (version < 4 || !vec_is_zero(scratch.key, sizeof(vec256))) + break; + + /* salt = H(salt) */ + sha256_init(&scratch.ctx.ctx); + sha256_update(&scratch.ctx.ctx, salt, salt_len); + sha256_final(salt_prime, &scratch.ctx.ctx); + salt = salt_prime; + salt_len = sizeof(salt_prime); + } + + le_bytes_from_limbs(SK, scratch.key, sizeof(pow256)); + + /* + * scrub the stack just in case next callee inadvertently flashes + * a fragment across application boundary... + */ + vec_zero(&scratch, sizeof(scratch)); +} + +void blst_keygen(pow256 SK, const void *IKM, size_t IKM_len, + const void *info, size_t info_len) +{ keygen(SK, IKM, IKM_len, NULL, 0, info, info_len, 4); } + +void blst_keygen_v3(pow256 SK, const void *IKM, size_t IKM_len, + const void *info, size_t info_len) +{ keygen(SK, IKM, IKM_len, NULL, 0, info, info_len, 3); } + +void blst_keygen_v4_5(pow256 SK, const void *IKM, size_t IKM_len, + const void *salt, size_t salt_len, + const void *info, size_t info_len) +{ keygen(SK, IKM, IKM_len, salt, salt_len, info, info_len, 4); } + +void blst_keygen_v5(pow256 SK, const void *IKM, size_t IKM_len, + const void *salt, size_t salt_len, + const void *info, size_t info_len) +{ keygen(SK, IKM, IKM_len, salt, salt_len, info, info_len, 5); } + +/* + * https://eips.ethereum.org/EIPS/eip-2333 + */ +void blst_derive_master_eip2333(pow256 SK, const void *seed, size_t seed_len) +{ keygen(SK, seed, seed_len, NULL, 0, NULL, 0, 4); } + +static void parent_SK_to_lamport_PK(pow256 PK, const pow256 parent_SK, + unsigned int index) +{ + size_t i; + struct { + HMAC_SHA256_CTX ctx; + SHA256_CTX ret; + unsigned char PRK[32], IKM[32]; + unsigned char lamport[255][32]; + } scratch; + + /* salt = I2OSP(index, 4) */ + unsigned char salt[4] = { (unsigned char)(index>>24), + (unsigned char)(index>>16), + (unsigned char)(index>>8), + (unsigned char)(index) }; + + /* IKM = I2OSP(parent_SK, 32) */ + for (i = 0; i < 32; i++) + scratch.IKM[i] = parent_SK[31-i]; + + /* lamport_0 = IKM_to_lamport_SK(IKM, salt) */ + HKDF_Extract(scratch.PRK, salt, sizeof(salt), scratch.IKM, 32, 0, + &scratch.ctx); + HKDF_Expand(scratch.lamport[0], sizeof(scratch.lamport), + scratch.PRK, NULL, 0, 0, &scratch.ctx); + + vec_zero(scratch.ctx.ctx.buf, sizeof(scratch.ctx.ctx.buf)); + scratch.ctx.ctx.buf[32] = 0x80; + scratch.ctx.ctx.buf[62] = 1; /* 32*8 in big endian */ + scratch.ctx.ctx.buf[63] = 0; + for (i = 0; i < 255; i++) { + /* lamport_PK = lamport_PK | SHA256(lamport_0[i]) */ + sha256_init_h(scratch.ctx.ctx.h); + sha256_bcopy(scratch.ctx.ctx.buf, scratch.lamport[i], 32); + sha256_block_data_order(scratch.ctx.ctx.h, scratch.ctx.ctx.buf, 1); + sha256_emit(scratch.lamport[i], scratch.ctx.ctx.h); + } + + /* compressed_lamport_PK = SHA256(lamport_PK) */ + sha256_init(&scratch.ret); + sha256_update(&scratch.ret, scratch.lamport, sizeof(scratch.lamport)); + + /* not_IKM = flip_bits(IKM) */ + for (i = 0; i< 32; i++) + scratch.IKM[i] = ~scratch.IKM[i]; + + /* lamport_1 = IKM_to_lamport_SK(not_IKM, salt) */ + HKDF_Extract(scratch.PRK, salt, sizeof(salt), scratch.IKM, 32, 0, + &scratch.ctx); + HKDF_Expand(scratch.lamport[0], sizeof(scratch.lamport), + scratch.PRK, NULL, 0, 0, &scratch.ctx); + + vec_zero(scratch.ctx.ctx.buf, sizeof(scratch.ctx.ctx.buf)); + scratch.ctx.ctx.buf[32] = 0x80; + scratch.ctx.ctx.buf[62] = 1; + for (i = 0; i < 255; i++) { + /* lamport_PK = lamport_PK | SHA256(lamport_1[i]) */ + sha256_init_h(scratch.ctx.ctx.h); + 
sha256_bcopy(scratch.ctx.ctx.buf, scratch.lamport[i], 32); + sha256_block_data_order(scratch.ctx.ctx.h, scratch.ctx.ctx.buf, 1); + sha256_emit(scratch.lamport[i], scratch.ctx.ctx.h); + } + + /* compressed_lamport_PK = SHA256(lamport_PK) */ + sha256_update(&scratch.ret, scratch.lamport, sizeof(scratch.lamport)); + sha256_final(PK, &scratch.ret); + + /* + * scrub the stack just in case next callee inadvertently flashes + * a fragment across application boundary... + */ + vec_zero(&scratch, sizeof(scratch)); +} + +void blst_derive_child_eip2333(pow256 SK, const pow256 parent_SK, + unsigned int child_index) +{ + parent_SK_to_lamport_PK(SK, parent_SK, child_index); + keygen(SK, SK, sizeof(pow256), NULL, 0, NULL, 0, 4); +} +#endif diff --git a/crypto/blst_src/map_to_g1.c b/crypto/blst_src/map_to_g1.c new file mode 100644 index 00000000000..6613d68bb29 --- /dev/null +++ b/crypto/blst_src/map_to_g1.c @@ -0,0 +1,559 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * y^2 = x^3 + A'*x + B', isogenous one + */ +static const vec384 Aprime_E1 = { + /* (0x00144698a3b8e9433d693a02c96d4982b0ea985383ee66a8 + d8e8981aefd881ac98936f8da0e0f97f5cf428082d584c1d << 384) % P */ + TO_LIMB_T(0x2f65aa0e9af5aa51), TO_LIMB_T(0x86464c2d1e8416c3), + TO_LIMB_T(0xb85ce591b7bd31e2), TO_LIMB_T(0x27e11c91b5f24e7c), + TO_LIMB_T(0x28376eda6bfc1835), TO_LIMB_T(0x155455c3e5071d85) +}; +static const vec384 Bprime_E1 = { + /* (0x12e2908d11688030018b12e8753eee3b2016c1f0f24f4070 + a0b9c14fcef35ef55a23215a316ceaa5d1cc48e98e172be0 << 384) % P */ + TO_LIMB_T(0xfb996971fe22a1e0), TO_LIMB_T(0x9aa93eb35b742d6f), + TO_LIMB_T(0x8c476013de99c5c4), TO_LIMB_T(0x873e27c3a221e571), + TO_LIMB_T(0xca72b5e45a52d888), TO_LIMB_T(0x06824061418a386b) +}; + +static void map_fp_times_Zz(vec384 map[], const vec384 isogeny_map[], + const vec384 Zz_powers[], size_t n) +{ + while (n--) + mul_fp(map[n], isogeny_map[n], Zz_powers[n]); +} + +static void map_fp(vec384 acc, const vec384 x, const vec384 map[], size_t n) +{ + while (n--) { + mul_fp(acc, acc, x); + add_fp(acc, acc, map[n]); + } +} + +static void isogeny_map_to_E1(POINTonE1 *out, const POINTonE1 *p) +{ + /* + * x = x_num / x_den, where + * x_num = k_(1,11) * x'^11 + k_(1,10) * x'^10 + k_(1,9) * x'^9 + + * ... + k_(1,0) + * ... 
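+     * (the constants below are the k coefficients pre-multiplied by 2^384 mod P, i.e. in Montgomery form; this is the 11-isogeny used for G1)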
+ */ + static const vec384 isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ + { TO_LIMB_T(0x4d18b6f3af00131c), TO_LIMB_T(0x19fa219793fee28c), + TO_LIMB_T(0x3f2885f1467f19ae), TO_LIMB_T(0x23dcea34f2ffb304), + TO_LIMB_T(0xd15b58d2ffc00054), TO_LIMB_T(0x0913be200a20bef4) }, + { TO_LIMB_T(0x898985385cdbbd8b), TO_LIMB_T(0x3c79e43cc7d966aa), + TO_LIMB_T(0x1597e193f4cd233a), TO_LIMB_T(0x8637ef1e4d6623ad), + TO_LIMB_T(0x11b22deed20d827b), TO_LIMB_T(0x07097bc5998784ad) }, + { TO_LIMB_T(0xa542583a480b664b), TO_LIMB_T(0xfc7169c026e568c6), + TO_LIMB_T(0x5ba2ef314ed8b5a6), TO_LIMB_T(0x5b5491c05102f0e7), + TO_LIMB_T(0xdf6e99707d2a0079), TO_LIMB_T(0x0784151ed7605524) }, + { TO_LIMB_T(0x494e212870f72741), TO_LIMB_T(0xab9be52fbda43021), + TO_LIMB_T(0x26f5577994e34c3d), TO_LIMB_T(0x049dfee82aefbd60), + TO_LIMB_T(0x65dadd7828505289), TO_LIMB_T(0x0e93d431ea011aeb) }, + { TO_LIMB_T(0x90ee774bd6a74d45), TO_LIMB_T(0x7ada1c8a41bfb185), + TO_LIMB_T(0x0f1a8953b325f464), TO_LIMB_T(0x104c24211be4805c), + TO_LIMB_T(0x169139d319ea7a8f), TO_LIMB_T(0x09f20ead8e532bf6) }, + { TO_LIMB_T(0x6ddd93e2f43626b7), TO_LIMB_T(0xa5482c9aa1ccd7bd), + TO_LIMB_T(0x143245631883f4bd), TO_LIMB_T(0x2e0a94ccf77ec0db), + TO_LIMB_T(0xb0282d480e56489f), TO_LIMB_T(0x18f4bfcbb4368929) }, + { TO_LIMB_T(0x23c5f0c953402dfd), TO_LIMB_T(0x7a43ff6958ce4fe9), + TO_LIMB_T(0x2c390d3d2da5df63), TO_LIMB_T(0xd0df5c98e1f9d70f), + TO_LIMB_T(0xffd89869a572b297), TO_LIMB_T(0x1277ffc72f25e8fe) }, + { TO_LIMB_T(0x79f4f0490f06a8a6), TO_LIMB_T(0x85f894a88030fd81), + TO_LIMB_T(0x12da3054b18b6410), TO_LIMB_T(0xe2a57f6505880d65), + TO_LIMB_T(0xbba074f260e400f1), TO_LIMB_T(0x08b76279f621d028) }, + { TO_LIMB_T(0xe67245ba78d5b00b), TO_LIMB_T(0x8456ba9a1f186475), + TO_LIMB_T(0x7888bff6e6b33bb4), TO_LIMB_T(0xe21585b9a30f86cb), + TO_LIMB_T(0x05a69cdcef55feee), TO_LIMB_T(0x09e699dd9adfa5ac) }, + { TO_LIMB_T(0x0de5c357bff57107), TO_LIMB_T(0x0a0db4ae6b1a10b2), + TO_LIMB_T(0xe256bb67b3b3cd8d), TO_LIMB_T(0x8ad456574e9db24f), + TO_LIMB_T(0x0443915f50fd4179), TO_LIMB_T(0x098c4bf7de8b6375) }, + { TO_LIMB_T(0xe6b0617e7dd929c7), TO_LIMB_T(0xfe6e37d442537375), + TO_LIMB_T(0x1dafdeda137a489e), TO_LIMB_T(0xe4efd1ad3f767ceb), + TO_LIMB_T(0x4a51d8667f0fe1cf), TO_LIMB_T(0x054fdf4bbf1d821c) }, + { TO_LIMB_T(0x72db2a50658d767b), TO_LIMB_T(0x8abf91faa257b3d5), + TO_LIMB_T(0xe969d6833764ab47), TO_LIMB_T(0x464170142a1009eb), + TO_LIMB_T(0xb14f01aadb30be2f), TO_LIMB_T(0x18ae6a856f40715d) } + }; + /* ... + * x_den = x'^10 + k_(2,9) * x'^9 + k_(2,8) * x'^8 + ... 
+ k_(2,0) + */ + static const vec384 isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ + { TO_LIMB_T(0xb962a077fdb0f945), TO_LIMB_T(0xa6a9740fefda13a0), + TO_LIMB_T(0xc14d568c3ed6c544), TO_LIMB_T(0xb43fc37b908b133e), + TO_LIMB_T(0x9c0b3ac929599016), TO_LIMB_T(0x0165aa6c93ad115f) }, + { TO_LIMB_T(0x23279a3ba506c1d9), TO_LIMB_T(0x92cfca0a9465176a), + TO_LIMB_T(0x3b294ab13755f0ff), TO_LIMB_T(0x116dda1c5070ae93), + TO_LIMB_T(0xed4530924cec2045), TO_LIMB_T(0x083383d6ed81f1ce) }, + { TO_LIMB_T(0x9885c2a6449fecfc), TO_LIMB_T(0x4a2b54ccd37733f0), + TO_LIMB_T(0x17da9ffd8738c142), TO_LIMB_T(0xa0fba72732b3fafd), + TO_LIMB_T(0xff364f36e54b6812), TO_LIMB_T(0x0f29c13c660523e2) }, + { TO_LIMB_T(0xe349cc118278f041), TO_LIMB_T(0xd487228f2f3204fb), + TO_LIMB_T(0xc9d325849ade5150), TO_LIMB_T(0x43a92bd69c15c2df), + TO_LIMB_T(0x1c2c7844bc417be4), TO_LIMB_T(0x12025184f407440c) }, + { TO_LIMB_T(0x587f65ae6acb057b), TO_LIMB_T(0x1444ef325140201f), + TO_LIMB_T(0xfbf995e71270da49), TO_LIMB_T(0xccda066072436a42), + TO_LIMB_T(0x7408904f0f186bb2), TO_LIMB_T(0x13b93c63edf6c015) }, + { TO_LIMB_T(0xfb918622cd141920), TO_LIMB_T(0x4a4c64423ecaddb4), + TO_LIMB_T(0x0beb232927f7fb26), TO_LIMB_T(0x30f94df6f83a3dc2), + TO_LIMB_T(0xaeedd424d780f388), TO_LIMB_T(0x06cc402dd594bbeb) }, + { TO_LIMB_T(0xd41f761151b23f8f), TO_LIMB_T(0x32a92465435719b3), + TO_LIMB_T(0x64f436e888c62cb9), TO_LIMB_T(0xdf70a9a1f757c6e4), + TO_LIMB_T(0x6933a38d5b594c81), TO_LIMB_T(0x0c6f7f7237b46606) }, + { TO_LIMB_T(0x693c08747876c8f7), TO_LIMB_T(0x22c9850bf9cf80f0), + TO_LIMB_T(0x8e9071dab950c124), TO_LIMB_T(0x89bc62d61c7baf23), + TO_LIMB_T(0xbc6be2d8dad57c23), TO_LIMB_T(0x17916987aa14a122) }, + { TO_LIMB_T(0x1be3ff439c1316fd), TO_LIMB_T(0x9965243a7571dfa7), + TO_LIMB_T(0xc7f7f62962f5cd81), TO_LIMB_T(0x32c6aa9af394361c), + TO_LIMB_T(0xbbc2ee18e1c227f4), TO_LIMB_T(0x0c102cbac531bb34) }, + { TO_LIMB_T(0x997614c97bacbf07), TO_LIMB_T(0x61f86372b99192c0), + TO_LIMB_T(0x5b8c95fc14353fc3), TO_LIMB_T(0xca2b066c2a87492f), + TO_LIMB_T(0x16178f5bbf698711), TO_LIMB_T(0x12a6dcd7f0f4e0e8) } + }; + /* + * y = y' * y_num / y_den, where + * y_num = k_(3,15) * x'^15 + k_(3,14) * x'^14 + k_(3,13) * x'^13 + + * ... + k_(3,0) + * ... 
+ */ + static const vec384 isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ + { TO_LIMB_T(0x2b567ff3e2837267), TO_LIMB_T(0x1d4d9e57b958a767), + TO_LIMB_T(0xce028fea04bd7373), TO_LIMB_T(0xcc31a30a0b6cd3df), + TO_LIMB_T(0x7d7b18a682692693), TO_LIMB_T(0x0d300744d42a0310) }, + { TO_LIMB_T(0x99c2555fa542493f), TO_LIMB_T(0xfe7f53cc4874f878), + TO_LIMB_T(0x5df0608b8f97608a), TO_LIMB_T(0x14e03832052b49c8), + TO_LIMB_T(0x706326a6957dd5a4), TO_LIMB_T(0x0a8dadd9c2414555) }, + { TO_LIMB_T(0x13d942922a5cf63a), TO_LIMB_T(0x357e33e36e261e7d), + TO_LIMB_T(0xcf05a27c8456088d), TO_LIMB_T(0x0000bd1de7ba50f0), + TO_LIMB_T(0x83d0c7532f8c1fde), TO_LIMB_T(0x13f70bf38bbf2905) }, + { TO_LIMB_T(0x5c57fd95bfafbdbb), TO_LIMB_T(0x28a359a65e541707), + TO_LIMB_T(0x3983ceb4f6360b6d), TO_LIMB_T(0xafe19ff6f97e6d53), + TO_LIMB_T(0xb3468f4550192bf7), TO_LIMB_T(0x0bb6cde49d8ba257) }, + { TO_LIMB_T(0x590b62c7ff8a513f), TO_LIMB_T(0x314b4ce372cacefd), + TO_LIMB_T(0x6bef32ce94b8a800), TO_LIMB_T(0x6ddf84a095713d5f), + TO_LIMB_T(0x64eace4cb0982191), TO_LIMB_T(0x0386213c651b888d) }, + { TO_LIMB_T(0xa5310a31111bbcdd), TO_LIMB_T(0xa14ac0f5da148982), + TO_LIMB_T(0xf9ad9cc95423d2e9), TO_LIMB_T(0xaa6ec095283ee4a7), + TO_LIMB_T(0xcf5b1f022e1c9107), TO_LIMB_T(0x01fddf5aed881793) }, + { TO_LIMB_T(0x65a572b0d7a7d950), TO_LIMB_T(0xe25c2d8183473a19), + TO_LIMB_T(0xc2fcebe7cb877dbd), TO_LIMB_T(0x05b2d36c769a89b0), + TO_LIMB_T(0xba12961be86e9efb), TO_LIMB_T(0x07eb1b29c1dfde1f) }, + { TO_LIMB_T(0x93e09572f7c4cd24), TO_LIMB_T(0x364e929076795091), + TO_LIMB_T(0x8569467e68af51b5), TO_LIMB_T(0xa47da89439f5340f), + TO_LIMB_T(0xf4fa918082e44d64), TO_LIMB_T(0x0ad52ba3e6695a79) }, + { TO_LIMB_T(0x911429844e0d5f54), TO_LIMB_T(0xd03f51a3516bb233), + TO_LIMB_T(0x3d587e5640536e66), TO_LIMB_T(0xfa86d2a3a9a73482), + TO_LIMB_T(0xa90ed5adf1ed5537), TO_LIMB_T(0x149c9c326a5e7393) }, + { TO_LIMB_T(0x462bbeb03c12921a), TO_LIMB_T(0xdc9af5fa0a274a17), + TO_LIMB_T(0x9a558ebde836ebed), TO_LIMB_T(0x649ef8f11a4fae46), + TO_LIMB_T(0x8100e1652b3cdc62), TO_LIMB_T(0x1862bd62c291dacb) }, + { TO_LIMB_T(0x05c9b8ca89f12c26), TO_LIMB_T(0x0194160fa9b9ac4f), + TO_LIMB_T(0x6a643d5a6879fa2c), TO_LIMB_T(0x14665bdd8846e19d), + TO_LIMB_T(0xbb1d0d53af3ff6bf), TO_LIMB_T(0x12c7e1c3b28962e5) }, + { TO_LIMB_T(0xb55ebf900b8a3e17), TO_LIMB_T(0xfedc77ec1a9201c4), + TO_LIMB_T(0x1f07db10ea1a4df4), TO_LIMB_T(0x0dfbd15dc41a594d), + TO_LIMB_T(0x389547f2334a5391), TO_LIMB_T(0x02419f98165871a4) }, + { TO_LIMB_T(0xb416af000745fc20), TO_LIMB_T(0x8e563e9d1ea6d0f5), + TO_LIMB_T(0x7c763e17763a0652), TO_LIMB_T(0x01458ef0159ebbef), + TO_LIMB_T(0x8346fe421f96bb13), TO_LIMB_T(0x0d2d7b829ce324d2) }, + { TO_LIMB_T(0x93096bb538d64615), TO_LIMB_T(0x6f2a2619951d823a), + TO_LIMB_T(0x8f66b3ea59514fa4), TO_LIMB_T(0xf563e63704f7092f), + TO_LIMB_T(0x724b136c4cf2d9fa), TO_LIMB_T(0x046959cfcfd0bf49) }, + { TO_LIMB_T(0xea748d4b6e405346), TO_LIMB_T(0x91e9079c2c02d58f), + TO_LIMB_T(0x41064965946d9b59), TO_LIMB_T(0xa06731f1d2bbe1ee), + TO_LIMB_T(0x07f897e267a33f1b), TO_LIMB_T(0x1017290919210e5f) }, + { TO_LIMB_T(0x872aa6c17d985097), TO_LIMB_T(0xeecc53161264562a), + TO_LIMB_T(0x07afe37afff55002), TO_LIMB_T(0x54759078e5be6838), + TO_LIMB_T(0xc4b92d15db8acca8), TO_LIMB_T(0x106d87d1b51d13b9) } + }; + /* ... + * y_den = x'^15 + k_(4,14) * x'^14 + k_(4,13) * x'^13 + ... 
+ k_(4,0) + */ + static const vec384 isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ + { TO_LIMB_T(0xeb6c359d47e52b1c), TO_LIMB_T(0x18ef5f8a10634d60), + TO_LIMB_T(0xddfa71a0889d5b7e), TO_LIMB_T(0x723e71dcc5fc1323), + TO_LIMB_T(0x52f45700b70d5c69), TO_LIMB_T(0x0a8b981ee47691f1) }, + { TO_LIMB_T(0x616a3c4f5535b9fb), TO_LIMB_T(0x6f5f037395dbd911), + TO_LIMB_T(0xf25f4cc5e35c65da), TO_LIMB_T(0x3e50dffea3c62658), + TO_LIMB_T(0x6a33dca523560776), TO_LIMB_T(0x0fadeff77b6bfe3e) }, + { TO_LIMB_T(0x2be9b66df470059c), TO_LIMB_T(0x24a2c159a3d36742), + TO_LIMB_T(0x115dbe7ad10c2a37), TO_LIMB_T(0xb6634a652ee5884d), + TO_LIMB_T(0x04fe8bb2b8d81af4), TO_LIMB_T(0x01c2a7a256fe9c41) }, + { TO_LIMB_T(0xf27bf8ef3b75a386), TO_LIMB_T(0x898b367476c9073f), + TO_LIMB_T(0x24482e6b8c2f4e5f), TO_LIMB_T(0xc8e0bbd6fe110806), + TO_LIMB_T(0x59b0c17f7631448a), TO_LIMB_T(0x11037cd58b3dbfbd) }, + { TO_LIMB_T(0x31c7912ea267eec6), TO_LIMB_T(0x1dbf6f1c5fcdb700), + TO_LIMB_T(0xd30d4fe3ba86fdb1), TO_LIMB_T(0x3cae528fbee9a2a4), + TO_LIMB_T(0xb1cce69b6aa9ad9a), TO_LIMB_T(0x044393bb632d94fb) }, + { TO_LIMB_T(0xc66ef6efeeb5c7e8), TO_LIMB_T(0x9824c289dd72bb55), + TO_LIMB_T(0x71b1a4d2f119981d), TO_LIMB_T(0x104fc1aafb0919cc), + TO_LIMB_T(0x0e49df01d942a628), TO_LIMB_T(0x096c3a09773272d4) }, + { TO_LIMB_T(0x9abc11eb5fadeff4), TO_LIMB_T(0x32dca50a885728f0), + TO_LIMB_T(0xfb1fa3721569734c), TO_LIMB_T(0xc4b76271ea6506b3), + TO_LIMB_T(0xd466a75599ce728e), TO_LIMB_T(0x0c81d4645f4cb6ed) }, + { TO_LIMB_T(0x4199f10e5b8be45b), TO_LIMB_T(0xda64e495b1e87930), + TO_LIMB_T(0xcb353efe9b33e4ff), TO_LIMB_T(0x9e9efb24aa6424c6), + TO_LIMB_T(0xf08d33680a237465), TO_LIMB_T(0x0d3378023e4c7406) }, + { TO_LIMB_T(0x7eb4ae92ec74d3a5), TO_LIMB_T(0xc341b4aa9fac3497), + TO_LIMB_T(0x5be603899e907687), TO_LIMB_T(0x03bfd9cca75cbdeb), + TO_LIMB_T(0x564c2935a96bfa93), TO_LIMB_T(0x0ef3c33371e2fdb5) }, + { TO_LIMB_T(0x7ee91fd449f6ac2e), TO_LIMB_T(0xe5d5bd5cb9357a30), + TO_LIMB_T(0x773a8ca5196b1380), TO_LIMB_T(0xd0fda172174ed023), + TO_LIMB_T(0x6cb95e0fa776aead), TO_LIMB_T(0x0d22d5a40cec7cff) }, + { TO_LIMB_T(0xf727e09285fd8519), TO_LIMB_T(0xdc9d55a83017897b), + TO_LIMB_T(0x7549d8bd057894ae), TO_LIMB_T(0x178419613d90d8f8), + TO_LIMB_T(0xfce95ebdeb5b490a), TO_LIMB_T(0x0467ffaef23fc49e) }, + { TO_LIMB_T(0xc1769e6a7c385f1b), TO_LIMB_T(0x79bc930deac01c03), + TO_LIMB_T(0x5461c75a23ede3b5), TO_LIMB_T(0x6e20829e5c230c45), + TO_LIMB_T(0x828e0f1e772a53cd), TO_LIMB_T(0x116aefa749127bff) }, + { TO_LIMB_T(0x101c10bf2744c10a), TO_LIMB_T(0xbbf18d053a6a3154), + TO_LIMB_T(0xa0ecf39ef026f602), TO_LIMB_T(0xfc009d4996dc5153), + TO_LIMB_T(0xb9000209d5bd08d3), TO_LIMB_T(0x189e5fe4470cd73c) }, + { TO_LIMB_T(0x7ebd546ca1575ed2), TO_LIMB_T(0xe47d5a981d081b55), + TO_LIMB_T(0x57b2b625b6d4ca21), TO_LIMB_T(0xb0a1ba04228520cc), + TO_LIMB_T(0x98738983c2107ff3), TO_LIMB_T(0x13dddbc4799d81d6) }, + { TO_LIMB_T(0x09319f2e39834935), TO_LIMB_T(0x039e952cbdb05c21), + TO_LIMB_T(0x55ba77a9a2f76493), TO_LIMB_T(0xfd04e3dfc6086467), + TO_LIMB_T(0xfb95832e7d78742e), TO_LIMB_T(0x0ef9c24eccaf5e0e) } + }; + vec384 Zz_powers[15], map[15], xn, xd, yn, yd; + + /* lay down Z^2 powers in descending order */ + sqr_fp(Zz_powers[14], p->Z); /* ZZ^1 */ +#ifdef __OPTIMIZE_SIZE__ + for (size_t i = 14; i > 0; i--) + mul_fp(Zz_powers[i-1], Zz_powers[i], Zz_powers[14]); +#else + sqr_fp(Zz_powers[13], Zz_powers[14]); /* ZZ^2 1+1 */ + mul_fp(Zz_powers[12], Zz_powers[14], Zz_powers[13]);/* ZZ^3 2+1 */ + sqr_fp(Zz_powers[11], Zz_powers[13]); /* ZZ^4 2+2 */ + mul_fp(Zz_powers[10], Zz_powers[13], Zz_powers[12]);/* ZZ^5 2+3 */ + 
sqr_fp(Zz_powers[9], Zz_powers[12]); /* ZZ^6 3+3 */ + mul_fp(Zz_powers[8], Zz_powers[12], Zz_powers[11]);/* ZZ^7 3+4 */ + sqr_fp(Zz_powers[7], Zz_powers[11]); /* ZZ^8 4+4 */ + mul_fp(Zz_powers[6], Zz_powers[11], Zz_powers[10]);/* ZZ^9 4+5 */ + sqr_fp(Zz_powers[5], Zz_powers[10]); /* ZZ^10 5+5 */ + mul_fp(Zz_powers[4], Zz_powers[10], Zz_powers[9]); /* ZZ^11 5+6 */ + sqr_fp(Zz_powers[3], Zz_powers[9]); /* ZZ^12 6+6 */ + mul_fp(Zz_powers[2], Zz_powers[9], Zz_powers[8]); /* ZZ^13 6+7 */ + sqr_fp(Zz_powers[1], Zz_powers[8]); /* ZZ^14 7+7 */ + mul_fp(Zz_powers[0], Zz_powers[8], Zz_powers[7]); /* ZZ^15 7+8 */ +#endif + + map_fp_times_Zz(map, isogeny_map_x_num, Zz_powers + 4, 11); + mul_fp(xn, p->X, isogeny_map_x_num[11]); + add_fp(xn, xn, map[10]); + map_fp(xn, p->X, map, 10); + + map_fp_times_Zz(map, isogeny_map_x_den, Zz_powers + 5, 10); + add_fp(xd, p->X, map[9]); + map_fp(xd, p->X, map, 9); + mul_fp(xd, xd, Zz_powers[14]); /* xd *= Z^2 */ + + map_fp_times_Zz(map, isogeny_map_y_num, Zz_powers, 15); + mul_fp(yn, p->X, isogeny_map_y_num[15]); + add_fp(yn, yn, map[14]); + map_fp(yn, p->X, map, 14); + mul_fp(yn, yn, p->Y); /* yn *= Y */ + + map_fp_times_Zz(map, isogeny_map_y_den, Zz_powers, 15); + add_fp(yd, p->X, map[14]); + map_fp(yd, p->X, map, 14); + mul_fp(Zz_powers[14], Zz_powers[14], p->Z); + mul_fp(yd, yd, Zz_powers[14]); /* yd *= Z^3 */ + + /* convert (xn, xd, yn, yd) to Jacobian coordinates */ + mul_fp(out->Z, xd, yd); /* Z = xd * yd */ + mul_fp(out->X, xn, yd); + mul_fp(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ + sqr_fp(out->Y, out->Z); + mul_fp(out->Y, out->Y, xd); + mul_fp(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ +} + +static void map_to_isogenous_E1(POINTonE1 *p, const vec384 u) +{ + static const vec384 minus_A = { /* P - A */ + TO_LIMB_T(0x8a9955f1650a005a), TO_LIMB_T(0x9865b3d192cfe93c), + TO_LIMB_T(0xaed3ed0f3ef3c441), TO_LIMB_T(0x3c962ef33d92c442), + TO_LIMB_T(0x22e438dbd74f94a2), TO_LIMB_T(0x04acbc265478c915) + }; + static const vec384 Z = { /* (11<<384) % P */ + TO_LIMB_T(0x886c00000023ffdc), TO_LIMB_T(0x0f70008d3090001d), + TO_LIMB_T(0x77672417ed5828c3), TO_LIMB_T(0x9dac23e943dc1740), + TO_LIMB_T(0x50553f1b9c131521), TO_LIMB_T(0x078c712fbe0ab6e8) + }; + static const vec384 sqrt_minus_ZZZ = { + TO_LIMB_T(0x43b571cad3215f1f), TO_LIMB_T(0xccb460ef1c702dc2), + TO_LIMB_T(0x742d884f4f97100b), TO_LIMB_T(0xdb2c3e3238a3382b), + TO_LIMB_T(0xe40f3fa13fce8f88), TO_LIMB_T(0x0073a2af9892a2ff) + }; + static const vec384 ZxA = { + TO_LIMB_T(0x7f674ea0a8915178), TO_LIMB_T(0xb0f945fc13b8fa65), + TO_LIMB_T(0x4b46759a38e87d76), TO_LIMB_T(0x2e7a929641bbb6a1), + TO_LIMB_T(0x1668ddfa462bf6b6), TO_LIMB_T(0x00960e2ed1cf294c) + }; + vec384 uu, tv2, x2n, gx1, gxd, y2; +#if 0 + vec384 xn, x1n, xd, y, y1, Zuu, tv4; +#else +# define xn p->X +# define y p->Y +# define xd p->Z +# define x1n xn +# define y1 y +# define Zuu x2n +# define tv4 y1 +#endif +#define sgn0_fp(a) (sgn0_pty_mont_384((a), BLS12_381_P, p0) & 1) + bool_t e1, e2; + + /* + * as per map_to_curve() from poc/sswu_opt.sage at + * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve + */ + /* x numerator variants */ + sqr_fp(uu, u); /* uu = u^2 */ + mul_fp(Zuu, Z, uu); /* Zuu = Z * uu */ + sqr_fp(tv2, Zuu); /* tv2 = Zuu^2 */ + add_fp(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ + add_fp(x1n, tv2, BLS12_381_Rx.p); /* x1n = tv2 + 1 */ + mul_fp(x1n, x1n, Bprime_E1); /* x1n = x1n * B */ + mul_fp(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ + + /* x denumenator */ + mul_fp(xd, minus_A, tv2); /* xd = -A * tv2 */ + e1 = vec_is_zero(xd, sizeof(xd)); 
/* e1 = xd == 0 */ + vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ + + /* y numerators variants */ + sqr_fp(tv2, xd); /* tv2 = xd^2 */ + mul_fp(gxd, xd, tv2); /* gxd = xd^3 */ + mul_fp(tv2, Aprime_E1, tv2); /* tv2 = A * tv2 */ + sqr_fp(gx1, x1n); /* gx1 = x1n^2 */ + add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ + mul_fp(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ + mul_fp(tv2, Bprime_E1, gxd); /* tv2 = B * gxd */ + add_fp(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ + sqr_fp(tv4, gxd); /* tv4 = gxd^2 */ + mul_fp(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ + mul_fp(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ + e2 = recip_sqrt_fp(y1, tv4); /* y1 = tv4^c1 # (gx1*gxd^3)^((p-3)/4) */ + mul_fp(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ + mul_fp(y2, y1, sqrt_minus_ZZZ); /* y2 = y1 * c2 # y2 = y1*sqrt(-Z^3) */ + mul_fp(y2, y2, uu); /* y2 = y2 * uu */ + mul_fp(y2, y2, u); /* y2 = y2 * u */ + + /* choose numerators */ + vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ + vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? y1 : y2 */ + + e1 = sgn0_fp(u); + e2 = sgn0_fp(y); + cneg_fp(y, y, e1^e2); /* fix sign of y */ + /* return (xn, xd, y, 1) */ + + /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ + mul_fp(p->X, xn, xd); /* X = xn * xd */ + mul_fp(p->Y, y, gxd); /* Y = y * xd^3 */ +#ifndef xd + vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ +#else +# undef xn +# undef y +# undef xd +# undef x1n +# undef y1 +# undef Zuu +# undef tv4 +#endif +#undef sgn0_fp +} + +static void POINTonE1_add_n_dbl(POINTonE1 *out, const POINTonE1 *p, size_t n) +{ + POINTonE1_dadd(out, out, p, NULL); + while(n--) + POINTonE1_double(out, out); +} + +static void POINTonE1_times_minus_z(POINTonE1 *out, const POINTonE1 *in) +{ + POINTonE1_double(out, in); /* 1: 0x2 */ + POINTonE1_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ + POINTonE1_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ + POINTonE1_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ + POINTonE1_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ + POINTonE1_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ +} + +/* + * |u|, |v| are expected to be in Montgomery representation + */ +static void map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) +{ + POINTonE1 p; + + map_to_isogenous_E1(&p, u); + + if (v != NULL) { + map_to_isogenous_E1(out, v); /* borrow |out| */ + POINTonE1_dadd(&p, &p, out, Aprime_E1); + } + + isogeny_map_to_E1(&p, &p); /* sprinkle isogenous powder */ + + /* clear the cofactor by multiplying |p| by 1-z, 0xd201000000010001 */ + POINTonE1_times_minus_z(out, &p); + POINTonE1_dadd(out, out, &p, NULL); +} + +void blst_map_to_g1(POINTonE1 *out, const vec384 u, const vec384 v) +{ map_to_g1(out, u, v); } + +static void Encode_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384 u[1]; + + hash_to_field(u, 1, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g1(p, u[0], NULL); +} + +void blst_encode_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Encode_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void Hash_to_G1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384 u[2]; + + hash_to_field(u, 2, aug, 
aug_len, msg, msg_len, DST, DST_len); + map_to_g1(p, u[0], u[1]); +} + +void blst_hash_to_g1(POINTonE1 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Hash_to_G1(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void sigma(POINTonE1 *out, const POINTonE1 *in); + +#if 0 +#ifdef __OPTIMIZE_SIZE__ +static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, + const POINTonE1 *in) +{ + static const byte zz_minus_1_div_by_3[] = { + TO_BYTES(0x0000000055555555ULL), TO_BYTES(0x396c8c005555e156) + }; + size_t n = 126-1; + const POINTonE1 *dblin = in; + + while(n--) { + POINTonE1_double(out, dblin); dblin = out; + if (is_bit_set(zz_minus_1_div_by_3, n)) + POINTonE1_dadd(out, out, in, NULL); + } +} +#else +static void POINTonE1_dbl_n_add(POINTonE1 *out, size_t n, const POINTonE1 *p) +{ + while(n--) + POINTonE1_double(out, out); + POINTonE1_dadd(out, out, p, NULL); +} + +static void POINTonE1_times_zz_minus_1_div_by_3(POINTonE1 *out, + const POINTonE1 *in) +{ + POINTonE1 t3, t5, t7, t11, t85; + + POINTonE1_double(&t7, in); /* 2P */ + POINTonE1_dadd(&t3, &t7, in, NULL); /* 3P */ + POINTonE1_dadd(&t5, &t3, &t7, NULL); /* 5P */ + POINTonE1_dadd(&t7, &t5, &t7, NULL); /* 7P */ + POINTonE1_double(&t85, &t5); /* 10P */ + POINTonE1_dadd(&t11, &t85, in, NULL); /* 11P */ + POINTonE1_dbl_n_add(&t85, 3, &t5); /* 0x55P */ + /* (-0xd201000000010000^2 - 1) / 3 */ + POINTonE1_double(out, &t7); /* 0xe */ + POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb */ + POINTonE1_dbl_n_add(out, 3, &t3); /* 0xe5b */ + POINTonE1_dbl_n_add(out, 3, in); /* 0x72d9 */ + POINTonE1_dbl_n_add(out, 5, &t3); /* 0xe5b23 */ + POINTonE1_dbl_n_add(out, 18, &t85); /* 0x396c8c0055 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555 */ + POINTonE1_dbl_n_add(out, 3, &t7); /* 0x1cb646002aaaf */ + POINTonE1_dbl_n_add(out, 7, &t5); /* 0xe5b23001555785 */ + POINTonE1_dbl_n_add(out, 5, &t11); /* 0x1cb646002aaaf0ab */ + POINTonE1_dbl_n_add(out, 41, &t85); /* 0x396c8c005555e1560000000055 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e156000000005555 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e15600000000555555 */ + POINTonE1_dbl_n_add(out, 8, &t85); /* 0x396c8c005555e1560000000055555555 */ +} +#endif + +static bool_t POINTonE1_in_G1(const POINTonE1 *P) +{ + POINTonE1 t0, t1, t2; + + /* Bowe, S., "Faster subgroup checks for BLS12-381" */ + sigma(&t0, P); /* σ(P) */ + sigma(&t1, &t0); /* σ²(P) */ + + POINTonE1_double(&t0, &t0); /* 2σ(P) */ + POINTonE1_dadd(&t2, &t1, P, NULL); /* P + σ²(P) */ + POINTonE1_cneg(&t2, 1); /* - P - σ²(P) */ + POINTonE1_dadd(&t2, &t2, &t0, NULL); /* 2σ(P) - P - σ²(P) */ + POINTonE1_times_zz_minus_1_div_by_3( &t0, &t2); + POINTonE1_cneg(&t1, 1); + POINTonE1_dadd(&t0, &t0, &t1, NULL); /* [(z²-1)/3](2σ(P) - P - σ²(P)) */ + /* - σ²(P) */ + return vec_is_zero(t0.Z, sizeof(t0.Z)); +} +#else +static bool_t POINTonE1_in_G1(const POINTonE1 *P) +{ + POINTonE1 t0, t1; + + /* Scott, M., https://eprint.iacr.org/2021/1130 */ + POINTonE1_times_minus_z(&t0, P); + POINTonE1_times_minus_z(&t1, &t0); + POINTonE1_cneg(&t1, 1); /* [-z²]P */ + + sigma(&t0, P); /* σ(P) */ + sigma(&t0, &t0); /* σ²(P) */ + + return POINTonE1_is_equal(&t0, &t1); +} +#endif + +int blst_p1_in_g1(const POINTonE1 *p) +{ return (int)POINTonE1_in_G1(p); } + +int blst_p1_affine_in_g1(const POINTonE1_affine *p) +{ + POINTonE1 P; + + vec_copy(P.X, p->X, 2*sizeof(P.X)); + vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z), + vec_is_zero(p, sizeof(*p))); + + 
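+    /* Z was selected above: zero for the all-zero (infinity) encoding of |p|, one in Montgomery form otherwise */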
return (int)POINTonE1_in_G1(&P); +} diff --git a/crypto/blst_src/map_to_g2.c b/crypto/blst_src/map_to_g2.c new file mode 100644 index 00000000000..90fd86e9d31 --- /dev/null +++ b/crypto/blst_src/map_to_g2.c @@ -0,0 +1,444 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "point.h" +#include "fields.h" + +/* + * y^2 = x^3 + A'*x + B', isogenous one + */ +static const vec384x Aprime_E2 = { /* 240*i */ + { 0 }, + { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), + TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), + TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) } +}; +static const vec384x Bprime_E2 = { /* 1012 + 1012*i */ + { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), + TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), + TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) }, + { TO_LIMB_T(0x22ea00000cf89db2), TO_LIMB_T(0x6ec832df71380aa4), + TO_LIMB_T(0x6e1b94403db5a66e), TO_LIMB_T(0x75bf3c53a79473ba), + TO_LIMB_T(0x3dd3a569412c0a34), TO_LIMB_T(0x125cdb5e74dc4fd1) } +}; + +static void map_fp2_times_Zz(vec384x map[], const vec384x isogeny_map[], + const vec384x Zz_powers[], size_t n) +{ + while (n--) + mul_fp2(map[n], isogeny_map[n], Zz_powers[n]); +} + +static void map_fp2(vec384x acc, const vec384x x, const vec384x map[], size_t n) +{ + while (n--) { + mul_fp2(acc, acc, x); + add_fp2(acc, acc, map[n]); + } +} + +static void isogeny_map_to_E2(POINTonE2 *out, const POINTonE2 *p) +{ + /* + * x = x_num / x_den, where + * x_num = k_(1,3) * x'^3 + k_(1,2) * x'^2 + k_(1,1) * x' + k_(1,0) + * ... + */ + static const vec384x isogeny_map_x_num[] = { /* (k_(1,*)<<384) % P */ + {{ TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), + TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), + TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }, + { TO_LIMB_T(0x47f671c71ce05e62), TO_LIMB_T(0x06dd57071206393e), + TO_LIMB_T(0x7c80cd2af3fd71a2), TO_LIMB_T(0x048103ea9e6cd062), + TO_LIMB_T(0xc54516acc8d037f6), TO_LIMB_T(0x13808f550920ea41) }}, + {{ 0 }, + { TO_LIMB_T(0x5fe55555554c71d0), TO_LIMB_T(0x873fffdd236aaaa3), + TO_LIMB_T(0x6a6b4619b26ef918), TO_LIMB_T(0x21c2888408874945), + TO_LIMB_T(0x2836cda7028cabc5), TO_LIMB_T(0x0ac73310a7fd5abd) }}, + {{ TO_LIMB_T(0x0a0c5555555971c3), TO_LIMB_T(0xdb0c00101f9eaaae), + TO_LIMB_T(0xb1fb2f941d797997), TO_LIMB_T(0xd3960742ef416e1c), + TO_LIMB_T(0xb70040e2c20556f4), TO_LIMB_T(0x149d7861e581393b) }, + { TO_LIMB_T(0xaff2aaaaaaa638e8), TO_LIMB_T(0x439fffee91b55551), + TO_LIMB_T(0xb535a30cd9377c8c), TO_LIMB_T(0x90e144420443a4a2), + TO_LIMB_T(0x941b66d3814655e2), TO_LIMB_T(0x0563998853fead5e) }}, + {{ TO_LIMB_T(0x40aac71c71c725ed), TO_LIMB_T(0x190955557a84e38e), + TO_LIMB_T(0xd817050a8f41abc3), TO_LIMB_T(0xd86485d4c87f6fb1), + TO_LIMB_T(0x696eb479f885d059), TO_LIMB_T(0x198e1a74328002d2) }, + { 0 }} + }; + /* ... 
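+     * (the G2 isogeny has degree 3, so these polynomials are much shorter than their degree-11 G1 counterparts)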
+ * x_den = x'^2 + k_(2,1) * x' + k_(2,0) + */ + static const vec384x isogeny_map_x_den[] = { /* (k_(2,*)<<384) % P */ + {{ 0 }, + { TO_LIMB_T(0x1f3affffff13ab97), TO_LIMB_T(0xf25bfc611da3ff3e), + TO_LIMB_T(0xca3757cb3819b208), TO_LIMB_T(0x3e6427366f8cec18), + TO_LIMB_T(0x03977bc86095b089), TO_LIMB_T(0x04f69db13f39a952) }}, + {{ TO_LIMB_T(0x447600000027552e), TO_LIMB_T(0xdcb8009a43480020), + TO_LIMB_T(0x6f7ee9ce4a6e8b59), TO_LIMB_T(0xb10330b7c0a95bc6), + TO_LIMB_T(0x6140b1fcfb1e54b7), TO_LIMB_T(0x0381be097f0bb4e1) }, + { TO_LIMB_T(0x7588ffffffd8557d), TO_LIMB_T(0x41f3ff646e0bffdf), + TO_LIMB_T(0xf7b1e8d2ac426aca), TO_LIMB_T(0xb3741acd32dbb6f8), + TO_LIMB_T(0xe9daf5b9482d581f), TO_LIMB_T(0x167f53e0ba7431b8) }} + }; + /* + * y = y' * y_num / y_den, where + * y_num = k_(3,3) * x'^3 + k_(3,2) * x'^2 + k_(3,1) * x' + k_(3,0) + * ... + */ + static const vec384x isogeny_map_y_num[] = { /* (k_(3,*)<<384) % P */ + {{ TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), + TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), + TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }, + { TO_LIMB_T(0x96d8f684bdfc77be), TO_LIMB_T(0xb530e4f43b66d0e2), + TO_LIMB_T(0x184a88ff379652fd), TO_LIMB_T(0x57cb23ecfae804e1), + TO_LIMB_T(0x0fd2e39eada3eba9), TO_LIMB_T(0x08c8055e31c5d5c3) }}, + {{ 0 }, + { TO_LIMB_T(0xbf0a71c71c91b406), TO_LIMB_T(0x4d6d55d28b7638fd), + TO_LIMB_T(0x9d82f98e5f205aee), TO_LIMB_T(0xa27aa27b1d1a18d5), + TO_LIMB_T(0x02c3b2b2d2938e86), TO_LIMB_T(0x0c7d13420b09807f) }}, + {{ TO_LIMB_T(0xd7f9555555531c74), TO_LIMB_T(0x21cffff748daaaa8), + TO_LIMB_T(0x5a9ad1866c9bbe46), TO_LIMB_T(0x4870a2210221d251), + TO_LIMB_T(0x4a0db369c0a32af1), TO_LIMB_T(0x02b1ccc429ff56af) }, + { TO_LIMB_T(0xe205aaaaaaac8e37), TO_LIMB_T(0xfcdc000768795556), + TO_LIMB_T(0x0c96011a8a1537dd), TO_LIMB_T(0x1c06a963f163406e), + TO_LIMB_T(0x010df44c82a881e6), TO_LIMB_T(0x174f45260f808feb) }}, + {{ TO_LIMB_T(0xa470bda12f67f35c), TO_LIMB_T(0xc0fe38e23327b425), + TO_LIMB_T(0xc9d3d0f2c6f0678d), TO_LIMB_T(0x1c55c9935b5a982e), + TO_LIMB_T(0x27f6c0e2f0746764), TO_LIMB_T(0x117c5e6e28aa9054) }, + { 0 }} + }; + /* ... 
+ * y_den = x'^3 + k_(4,2) * x'^2 + k_(4,1) * x' + k_(4,0) + */ + static const vec384x isogeny_map_y_den[] = { /* (k_(4,*)<<384) % P */ + {{ TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), + TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), + TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }, + { TO_LIMB_T(0x0162fffffa765adf), TO_LIMB_T(0x8f7bea480083fb75), + TO_LIMB_T(0x561b3c2259e93611), TO_LIMB_T(0x11e19fc1a9c875d5), + TO_LIMB_T(0xca713efc00367660), TO_LIMB_T(0x03c6a03d41da1151) }}, + {{ 0 }, + { TO_LIMB_T(0x5db0fffffd3b02c5), TO_LIMB_T(0xd713f52358ebfdba), + TO_LIMB_T(0x5ea60761a84d161a), TO_LIMB_T(0xbb2c75a34ea6c44a), + TO_LIMB_T(0x0ac6735921c1119b), TO_LIMB_T(0x0ee3d913bdacfbf6) }}, + {{ TO_LIMB_T(0x66b10000003affc5), TO_LIMB_T(0xcb1400e764ec0030), + TO_LIMB_T(0xa73e5eb56fa5d106), TO_LIMB_T(0x8984c913a0fe09a9), + TO_LIMB_T(0x11e10afb78ad7f13), TO_LIMB_T(0x05429d0e3e918f52) }, + { TO_LIMB_T(0x534dffffffc4aae6), TO_LIMB_T(0x5397ff174c67ffcf), + TO_LIMB_T(0xbff273eb870b251d), TO_LIMB_T(0xdaf2827152870915), + TO_LIMB_T(0x393a9cbaca9e2dc3), TO_LIMB_T(0x14be74dbfaee5748) }} + }; + vec384x Zz_powers[3], map[3], xn, xd, yn, yd; + + /* lay down Z^2 powers in descending order */ + sqr_fp2(Zz_powers[2], p->Z); /* ZZ^1 */ + sqr_fp2(Zz_powers[1], Zz_powers[2]); /* ZZ^2 1+1 */ + mul_fp2(Zz_powers[0], Zz_powers[2], Zz_powers[1]); /* ZZ^3 2+1 */ + + map_fp2_times_Zz(map, isogeny_map_x_num, Zz_powers, 3); + mul_fp2(xn, p->X, isogeny_map_x_num[3]); + add_fp2(xn, xn, map[2]); + map_fp2(xn, p->X, map, 2); + + map_fp2_times_Zz(map, isogeny_map_x_den, Zz_powers + 1, 2); + add_fp2(xd, p->X, map[1]); + map_fp2(xd, p->X, map, 1); + mul_fp2(xd, xd, Zz_powers[2]); /* xd *= Z^2 */ + + map_fp2_times_Zz(map, isogeny_map_y_num, Zz_powers, 3); + mul_fp2(yn, p->X, isogeny_map_y_num[3]); + add_fp2(yn, yn, map[2]); + map_fp2(yn, p->X, map, 2); + mul_fp2(yn, yn, p->Y); /* yn *= Y */ + + map_fp2_times_Zz(map, isogeny_map_y_den, Zz_powers, 3); + add_fp2(yd, p->X, map[2]); + map_fp2(yd, p->X, map, 2); + mul_fp2(Zz_powers[2], Zz_powers[2], p->Z); + mul_fp2(yd, yd, Zz_powers[2]); /* yd *= Z^3 */ + + /* convert (xn, xd, yn, yd) to Jacobian coordinates */ + mul_fp2(out->Z, xd, yd); /* Z = xd * yd */ + mul_fp2(out->X, xn, yd); + mul_fp2(out->X, out->X, out->Z); /* X = xn * xd * yd^2 */ + sqr_fp2(out->Y, out->Z); + mul_fp2(out->Y, out->Y, xd); + mul_fp2(out->Y, out->Y, yn); /* Y = yn * xd^3 * yd^2 */ +} + +static void map_to_isogenous_E2(POINTonE2 *p, const vec384x u) +{ + static const vec384x minus_A = { + { 0 }, + { TO_LIMB_T(0xd4c4fffffcec5869), TO_LIMB_T(0x1da3f3eed25bfd79), + TO_LIMB_T(0x7fa833c5136fff67), TO_LIMB_T(0x59261433cd540cbd), + TO_LIMB_T(0x48450f5f2b84682c), TO_LIMB_T(0x07e05d00bf959233) } + }; + static const vec384x Z = { /* -2 - i */ + { TO_LIMB_T(0x87ebfffffff9555c), TO_LIMB_T(0x656fffe5da8ffffa), + TO_LIMB_T(0x0fd0749345d33ad2), TO_LIMB_T(0xd951e663066576f4), + TO_LIMB_T(0xde291a3d41e980d3), TO_LIMB_T(0x0815664c7dfe040d) }, + { TO_LIMB_T(0x43f5fffffffcaaae), TO_LIMB_T(0x32b7fff2ed47fffd), + TO_LIMB_T(0x07e83a49a2e99d69), TO_LIMB_T(0xeca8f3318332bb7a), + TO_LIMB_T(0xef148d1ea0f4c069), TO_LIMB_T(0x040ab3263eff0206) } + }; + static const vec384x recip_ZZZ = { /* 1/(Z^3) */ + { TO_LIMB_T(0x65018f5c28f598eb), TO_LIMB_T(0xe6020417f022d916), + TO_LIMB_T(0xd6327313288369c7), TO_LIMB_T(0x622ded8eb447156f), + TO_LIMB_T(0xe52a2aee72c2a01f), TO_LIMB_T(0x089812fb8481ffe4) }, + { TO_LIMB_T(0x2574eb851eb8619f), TO_LIMB_T(0xdba2e97912925604), + TO_LIMB_T(0x67e495a909e7a18e), 
TO_LIMB_T(0xdf2da23b8145b8f7), + TO_LIMB_T(0xcf5d3728310ebf6d), TO_LIMB_T(0x11be446236f4c116) } + }; + static const vec384x magic_ZZZ = { /* 1/Z^3 = a + b*i */ + /* a^2 + b^2 */ + { TO_LIMB_T(0xaa7eb851eb8508e0), TO_LIMB_T(0x1c54fdf360989374), + TO_LIMB_T(0xc87f2fc6e716c62e), TO_LIMB_T(0x0124aefb1f9efea7), + TO_LIMB_T(0xb2f8be63e844865c), TO_LIMB_T(0x08b47f775a7ef35a) }, + /* (a^2 + b^2)^((P-3)/4) */ + { TO_LIMB_T(0xe4132bbd838cf70a), TO_LIMB_T(0x01d769ac83772c19), + TO_LIMB_T(0xa83dd6e974c22e45), TO_LIMB_T(0xbc8ec3e777b08dff), + TO_LIMB_T(0xc035c2042ecf5da3), TO_LIMB_T(0x073929e97f0850bf) } + }; + static const vec384x ZxA = { /* 240 - 480*i */ + { TO_LIMB_T(0xe53a000003135242), TO_LIMB_T(0x01080c0fdef80285), + TO_LIMB_T(0xe7889edbe340f6bd), TO_LIMB_T(0x0b51375126310601), + TO_LIMB_T(0x02d6985717c744ab), TO_LIMB_T(0x1220b4e979ea5467) }, + { TO_LIMB_T(0xa989fffff9d8b0d2), TO_LIMB_T(0x3b47e7dda4b7faf3), + TO_LIMB_T(0xff50678a26dffece), TO_LIMB_T(0xb24c28679aa8197a), + TO_LIMB_T(0x908a1ebe5708d058), TO_LIMB_T(0x0fc0ba017f2b2466) } + }; + vec384x uu, tv2, tv4, x2n, gx1, gxd, y2; +#if 0 + vec384x xn, x1n, xd, y, y1, Zuu; +#else +# define xn p->X +# define y p->Y +# define xd p->Z +# define x1n xn +# define y1 y +# define Zuu x2n +#endif +#define sgn0_fp2(a) (sgn0_pty_mont_384x((a), BLS12_381_P, p0) & 1) + bool_t e1, e2; + + /* + * as per map_to_curve() from poc/sswu_opt.sage at + * https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve + * with 9mod16 twists... + */ + /* x numerator variants */ + sqr_fp2(uu, u); /* uu = u^2 */ + mul_fp2(Zuu, Z, uu); /* Zuu = Z * uu */ + sqr_fp2(tv2, Zuu); /* tv2 = Zuu^2 */ + add_fp2(tv2, tv2, Zuu); /* tv2 = tv2 + Zuu */ + add_fp2(x1n, tv2, BLS12_381_Rx.p2); /* x1n = tv2 + 1 */ + mul_fp2(x1n, x1n, Bprime_E2); /* x1n = x1n * B */ + mul_fp2(x2n, Zuu, x1n); /* x2n = Zuu * x1n */ + + /* x denumenator */ + mul_fp2(xd, minus_A, tv2); /* xd = -A * tv2 */ + e1 = vec_is_zero(xd, sizeof(xd)); /* e1 = xd == 0 */ + vec_select(xd, ZxA, xd, sizeof(xd), e1); /* # If xd == 0, set xd = Z*A */ + + /* y numerators variants */ + sqr_fp2(tv2, xd); /* tv2 = xd^2 */ + mul_fp2(gxd, xd, tv2); /* gxd = xd^3 */ + mul_fp2(tv2, Aprime_E2, tv2); /* tv2 = A * tv2 */ + sqr_fp2(gx1, x1n); /* gx1 = x1n^2 */ + add_fp2(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1n^2 + A*xd^2 */ + mul_fp2(gx1, gx1, x1n); /* gx1 = gx1 * x1n # x1n^3 + A*x1n*xd^2 */ + mul_fp2(tv2, Bprime_E2, gxd); /* tv2 = B * gxd */ + add_fp2(gx1, gx1, tv2); /* gx1 = gx1 + tv2 # x1^3 + A*x1*xd^2 + B*xd^3 */ + sqr_fp2(tv4, gxd); /* tv4 = gxd^2 */ + mul_fp2(tv2, gx1, gxd); /* tv2 = gx1 * gxd */ + mul_fp2(tv4, tv4, tv2); /* tv4 = tv4 * tv2 # gx1*gxd^3 */ + e2 = recip_sqrt_fp2(y1, tv4, /* y1 = tv4^c1 # (gx1*gxd^3)^((p^2-9)/16) */ + recip_ZZZ, magic_ZZZ); + mul_fp2(y1, y1, tv2); /* y1 = y1 * tv2 # gx1*gxd*y1 */ + mul_fp2(y2, y1, uu); /* y2 = y1 * uu */ + mul_fp2(y2, y2, u); /* y2 = y2 * u */ + + /* choose numerators */ + vec_select(xn, x1n, x2n, sizeof(xn), e2); /* xn = e2 ? x1n : x2n */ + vec_select(y, y1, y2, sizeof(y), e2); /* y = e2 ? 
y1 : y2 */ + + e1 = sgn0_fp2(u); + e2 = sgn0_fp2(y); + cneg_fp2(y, y, e1^e2); /* fix sign of y */ + /* return (xn, xd, y, 1) */ + + /* convert (xn, xd, y, 1) to Jacobian projective coordinates */ + mul_fp2(p->X, xn, xd); /* X = xn * xd */ + mul_fp2(p->Y, y, gxd); /* Y = y * xd^3 */ +#ifndef xd + vec_copy(p->Z, xd, sizeof(xd)); /* Z = xd */ +#else +# undef xn +# undef y +# undef xd +# undef x1n +# undef y1 +# undef Zuu +# undef tv4 +#endif +#undef sgn0_fp2 +} + +#if 0 +static const byte h_eff[] = { + TO_BYTES(0xe8020005aaa95551), TO_BYTES(0x59894c0adebbf6b4), + TO_BYTES(0xe954cbc06689f6a3), TO_BYTES(0x2ec0ec69d7477c1a), + TO_BYTES(0x6d82bf015d1212b0), TO_BYTES(0x329c2f178731db95), + TO_BYTES(0x9986ff031508ffe1), TO_BYTES(0x88e2a8e9145ad768), + TO_BYTES(0x584c6a0ea91b3528), TO_BYTES(0x0bc69f08f2ee75b3) +}; + +static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) +{ POINTonE2_mult_w5(out, p, h_eff, 636); } +#else +/* + * As per suggestions in "7. Clearing the cofactor" at + * https://tools.ietf.org/html/draft-irtf-cfrg-hash-to-curve-06 + */ +static void POINTonE2_add_n_dbl(POINTonE2 *out, const POINTonE2 *p, size_t n) +{ + POINTonE2_dadd(out, out, p, NULL); + while(n--) + POINTonE2_double(out, out); +} + +static void POINTonE2_times_minus_z(POINTonE2 *out, const POINTonE2 *in) +{ + POINTonE2_double(out, in); /* 1: 0x2 */ + POINTonE2_add_n_dbl(out, in, 2); /* 2..4: 0x3..0xc */ + POINTonE2_add_n_dbl(out, in, 3); /* 5..8: 0xd..0x68 */ + POINTonE2_add_n_dbl(out, in, 9); /* 9..18: 0x69..0xd200 */ + POINTonE2_add_n_dbl(out, in, 32); /* 19..51: ..0xd20100000000 */ + POINTonE2_add_n_dbl(out, in, 16); /* 52..68: ..0xd201000000010000 */ +} + +static void psi(POINTonE2 *out, const POINTonE2 *in); + +static void clear_cofactor(POINTonE2 *out, const POINTonE2 *p) +{ + POINTonE2 t0, t1; + + /* A.Budroni, F.Pintore, "Efficient hash maps to G2 on BLS curves" */ + POINTonE2_double(out, p); /* out = 2P */ + psi(out, out); /* out = Ψ(2P) */ + psi(out, out); /* out = Ψ²(2P) */ + + vec_copy(&t0, p, sizeof(t0)); + POINTonE2_cneg(&t0, 1); /* t0 = -P */ + psi(&t1, &t0); /* t1 = -Ψ(P) */ + POINTonE2_dadd(out, out, &t0, NULL);/* out = Ψ²(2P) - P */ + POINTonE2_dadd(out, out, &t1, NULL);/* out = Ψ²(2P) - P - Ψ(P) */ + + POINTonE2_times_minus_z(&t0, p); /* t0 = [-z]P */ + POINTonE2_dadd(&t0, &t0, p, NULL); /* t0 = [-z + 1]P */ + POINTonE2_dadd(&t0, &t0, &t1, NULL);/* t0 = [-z + 1]P - Ψ(P) */ + POINTonE2_times_minus_z(&t1, &t0); /* t1 = [z² - z]P + [z]Ψ(P) */ + POINTonE2_dadd(out, out, &t1, NULL);/* out = [z² - z - 1]P */ + /* + [z - 1]Ψ(P) */ + /* + Ψ²(2P) */ +} +#endif + +/* + * |u|, |v| are expected to be in Montgomery representation + */ +static void map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) +{ + POINTonE2 p; + + map_to_isogenous_E2(&p, u); + + if (v != NULL) { + map_to_isogenous_E2(out, v); /* borrow |out| */ + POINTonE2_dadd(&p, &p, out, Aprime_E2); + } + + isogeny_map_to_E2(&p, &p); /* sprinkle isogenous powder */ + clear_cofactor(out, &p); +} + +void blst_map_to_g2(POINTonE2 *out, const vec384x u, const vec384x v) +{ map_to_g2(out, u, v); } + +static void Encode_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384x u[1]; + + hash_to_field(u[0], 2, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g2(p, u[0], NULL); +} + +void blst_encode_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t 
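
The sign fix at the end of map_to_isogenous_E2 (`cneg_fp2(y, y, e1^e2)`) and the pervasive `vec_select` calls avoid data-dependent branches by masking. A minimal standalone sketch of that idiom on single 64-bit words follows; the helper names `cneg64`/`csel64` and the test values are illustrative assumptions, not blst API:

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

/* branchless conditional negate/select, the pattern behind cneg_* and
 * vec_select in the vendored code; |flag| must be exactly 0 or 1 */
static uint64_t cneg64(uint64_t x, uint64_t flag)
{
    uint64_t mask = 0 - flag;            /* all-zeros or all-ones */
    return (x ^ mask) + flag;            /* flag ? -x : x (two's complement) */
}

static uint64_t csel64(uint64_t a, uint64_t b, uint64_t flag)
{
    uint64_t mask = 0 - flag;
    return (a & mask) | (b & ~mask);     /* flag ? a : b, no data-dependent branch */
}

int main(void)
{
    assert(cneg64(5, 0) == 5);
    assert(cneg64(5, 1) == (uint64_t)-5);
    assert(csel64(7, 9, 1) == 7 && csel64(7, 9, 0) == 9);
    printf("constant-time select/negate ok\n");
    return 0;
}
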
aug_len) +{ Encode_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static void Hash_to_G2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ + vec384x u[2]; + + hash_to_field(u[0], 4, aug, aug_len, msg, msg_len, DST, DST_len); + map_to_g2(p, u[0], u[1]); +} + +void blst_hash_to_g2(POINTonE2 *p, const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len, + const unsigned char *aug, size_t aug_len) +{ Hash_to_G2(p, msg, msg_len, DST, DST_len, aug, aug_len); } + +static bool_t POINTonE2_in_G2(const POINTonE2 *P) +{ +#if 0 + POINTonE2 t0, t1, t2; + + /* Bowe, S., "Faster subgroup checks for BLS12-381" */ + psi(&t0, P); /* Ψ(P) */ + psi(&t0, &t0); /* Ψ²(P) */ + psi(&t1, &t0); /* Ψ³(P) */ + + POINTonE2_times_minus_z(&t2, &t1); + POINTonE2_dadd(&t0, &t0, &t2, NULL); + POINTonE2_cneg(&t0, 1); + POINTonE2_dadd(&t0, &t0, P, NULL); /* [z]Ψ³(P) - Ψ²(P) + P */ + + return vec_is_zero(t0.Z, sizeof(t0.Z)); +#else + POINTonE2 t0, t1; + + /* Scott, M., https://eprint.iacr.org/2021/1130 */ + psi(&t0, P); /* Ψ(P) */ + + POINTonE2_times_minus_z(&t1, P); + POINTonE2_cneg(&t1, 1); /* [z]P */ + + return POINTonE2_is_equal(&t0, &t1); +#endif +} + +int blst_p2_in_g2(const POINTonE2 *p) +{ return (int)POINTonE2_in_G2(p); } + +int blst_p2_affine_in_g2(const POINTonE2_affine *p) +{ + POINTonE2 P; + + vec_copy(P.X, p->X, 2*sizeof(P.X)); + vec_select(P.Z, p->X, BLS12_381_Rx.p, sizeof(P.Z), + vec_is_zero(p, sizeof(*p))); + + return (int)POINTonE2_in_G2(&P); +} diff --git a/crypto/blst_src/multi_scalar.c b/crypto/blst_src/multi_scalar.c new file mode 100644 index 00000000000..55ab8227718 --- /dev/null +++ b/crypto/blst_src/multi_scalar.c @@ -0,0 +1,427 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" +#include "point.h" + +/* + * Infinite point among inputs would be devastating. Shall we change it? + */ +#define POINTS_TO_AFFINE_IMPL(prefix, ptype, bits, field) \ +static void ptype##s_to_affine(ptype##_affine dst[], \ + const ptype *const points[], size_t npoints) \ +{ \ + size_t i; \ + vec##bits *acc, ZZ, ZZZ; \ + const ptype *point = NULL; \ + const size_t stride = sizeof(ptype)==sizeof(POINTonE1) ? 1536 : 768; \ +\ + while (npoints) { \ + const ptype *p, *const *walkback; \ + size_t delta = strideZ, sizeof(vec##bits)); \ + for (i = 1; i < delta; i++, acc++) \ + point = *points ? *points++ : point+1, \ + mul_##field(acc[0], acc[-1], point->Z); \ +\ + --acc; reciprocal_##field(acc[0], acc[0]); \ +\ + walkback = points-1, p = point, --delta, dst += delta; \ + for (i = 0; i < delta; i++, acc--, dst--) { \ + mul_##field(acc[-1], acc[-1], acc[0]); /* 1/Z */\ + sqr_##field(ZZ, acc[-1]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[-1]); /* 1/Z^3 */\ + mul_##field(acc[-1], p->Z, acc[0]); \ + mul_##field(dst->X, p->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, p->Y, ZZZ); /* Y = Y'/Z^3 */\ + p = (p == *walkback) ? 
*--walkback : p-1; \ + } \ + sqr_##field(ZZ, acc[0]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[0]); /* 1/Z^3 */\ + mul_##field(dst->X, p->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, p->Y, ZZZ); /* Y = Y'/Z^3 */\ + ++delta, dst += delta, npoints -= delta; \ + } \ +} \ +\ +void prefix##s_to_affine(ptype##_affine dst[], const ptype *const points[], \ + size_t npoints) \ +{ ptype##s_to_affine(dst, points, npoints); } + +POINTS_TO_AFFINE_IMPL(blst_p1, POINTonE1, 384, fp) +POINTS_TO_AFFINE_IMPL(blst_p2, POINTonE2, 384x, fp2) + +/* + * This is two-step multi-scalar multiplication procedure. First, given + * a set of points you pre-compute a table for chosen windowing factor + * [expressed in bits with value between 2 and 14], and then you pass + * this table to the actual multiplication procedure along with scalars. + * Idea is that the pre-computed table will be reused multiple times. In + * which case multiplication runs faster than below Pippenger algorithm + * implementation for up to ~16K points for wbits=8, naturally at the + * expense of multi-megabyte table. One can trade even more memory for + * performance, but each wbits increment doubles the memory requirement, + * so at some point it gets prohibively large... For reference, without + * reusing the table it's faster than Pippenger algorithm for up ~32 + * points [with wbits=5]... + */ + +#define SCRATCH_SZ(ptype) (sizeof(ptype)==sizeof(POINTonE1) ? 8192 : 4096) + +#define PRECOMPUTE_WBITS_IMPL(prefix, ptype, bits, field, one) \ +static void ptype##_precompute_row_wbits(ptype row[], size_t wbits, \ + const ptype##_affine *point) \ +{ \ + size_t i, j, n = (size_t)1 << (wbits-1); \ + /* row[-1] is implicit infinity */\ + vec_copy(&row[0], point, sizeof(*point)); /* row[0]=p*1 */\ + vec_copy(&row[0].Z, one, sizeof(row[0].Z)); \ + ptype##_double(&row[1], &row[0]); /* row[1]=p*(1+1) */\ + for (i = 2, j = 1; i < n; i += 2, j++) \ + ptype##_add_affine(&row[i], &row[i-1], point), /* row[2]=p*(2+1) */\ + ptype##_double(&row[i+1], &row[j]); /* row[3]=p*(2+2) */\ +} /* row[4] ... */\ +\ +static void ptype##s_to_affine_row_wbits(ptype##_affine dst[], ptype src[], \ + size_t wbits, size_t npoints) \ +{ \ + size_t total = npoints << (wbits-1); \ + size_t nwin = (size_t)1 << (wbits-1); \ + size_t i, j; \ + vec##bits *acc, ZZ, ZZZ; \ +\ + src += total; \ + acc = (vec##bits *)src; \ + vec_copy(acc++, one, sizeof(vec##bits)); \ + for (i = 0; i < npoints; i++) \ + for (j = nwin; --src, --j; acc++) \ + mul_##field(acc[0], acc[-1], src->Z); \ +\ + --acc; reciprocal_##field(acc[0], acc[0]); \ +\ + for (i = 0; i < npoints; i++) { \ + vec_copy(dst++, src++, sizeof(ptype##_affine)); \ + for (j = 1; j < nwin; j++, acc--, src++, dst++) { \ + mul_##field(acc[-1], acc[-1], acc[0]); /* 1/Z */\ + sqr_##field(ZZ, acc[-1]); /* 1/Z^2 */\ + mul_##field(ZZZ, ZZ, acc[-1]); /* 1/Z^3 */\ + mul_##field(acc[-1], src->Z, acc[0]); \ + mul_##field(dst->X, src->X, ZZ); /* X = X'/Z^2 */\ + mul_##field(dst->Y, src->Y, ZZZ); /* Y = Y'/Z^3 */\ + } \ + } \ +} \ +\ +/* flat |points[n]| can be placed at the end of |table[n<<(wbits-1)]| */\ +static void ptype##s_precompute_wbits(ptype##_affine table[], size_t wbits, \ + const ptype##_affine *const points[], \ + size_t npoints) \ +{ \ + size_t total = npoints << (wbits-1); \ + size_t nwin = (size_t)1 << (wbits-1); \ + size_t nmin = wbits>9 ? 
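
The ptype##s_to_affine macro above converts a whole batch of Jacobian points with a single field inversion: accumulate running products of the Z coordinates, invert the total once, then walk back multiplying by the saved prefixes. A standalone sketch of that trick over a toy 61-bit prime field follows (the prime, helper names, and Fermat-based inversion are illustrative assumptions; blst inverts in the 381-bit base field and this sketch relies on the GCC/Clang `unsigned __int128` extension):

#include <stdint.h>
#include <stdio.h>

#define P ((uint64_t)0x1FFFFFFFFFFFFFFF)   /* 2^61 - 1, a Mersenne prime */

static uint64_t mulmod(uint64_t a, uint64_t b)
{   return (uint64_t)(((unsigned __int128)a * b) % P);   }

static uint64_t powmod(uint64_t a, uint64_t e)
{
    uint64_t r = 1;
    while (e) {
        if (e & 1) r = mulmod(r, a);
        a = mulmod(a, a);
        e >>= 1;
    }
    return r;
}

/* invert n non-zero elements at the cost of one exponentiation */
static void batch_invert(uint64_t out[], const uint64_t in[], size_t n)
{
    uint64_t acc = 1, inv;
    size_t i;

    for (i = 0; i < n; i++) {           /* prefix products: out[i] = in[0]*...*in[i] */
        acc = mulmod(acc, in[i]);
        out[i] = acc;
    }
    inv = powmod(acc, P - 2);           /* single inversion, Fermat's little theorem */
    for (i = n; i-- > 1;) {
        out[i] = mulmod(inv, out[i-1]); /* 1/in[i] = inv(prefix_i) * prefix_{i-1} */
        inv = mulmod(inv, in[i]);       /* drop in[i] from the running inverse */
    }
    out[0] = inv;
}

int main(void)
{
    uint64_t z[4] = {3, 7, 1234567, 89}, zi[4];
    size_t i;

    batch_invert(zi, z, 4);
    for (i = 0; i < 4; i++)             /* each product prints 1 */
        printf("z=%llu z*z^-1=%llu\n", (unsigned long long)z[i],
               (unsigned long long)mulmod(z[i], zi[i]));
    return 0;
}
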
(size_t)1: (size_t)1 << (9-wbits); \ + size_t i, top = 0; \ + ptype *rows, *row; \ + const ptype##_affine *point = NULL; \ + size_t stride = ((512*1024)/sizeof(ptype##_affine)) >> wbits; \ + if (stride == 0) stride = 1; \ +\ + while (npoints >= nmin) { \ + size_t limit = total - npoints; \ +\ + if (top + (stride << wbits) > limit) { \ + stride = (limit - top) >> wbits; \ + if (stride == 0) break; \ + } \ + rows = row = (ptype *)(&table[top]); \ + for (i = 0; i < stride; i++, row += nwin) \ + point = *points ? *points++ : point+1, \ + ptype##_precompute_row_wbits(row, wbits, point); \ + ptype##s_to_affine_row_wbits(&table[top], rows, wbits, stride); \ + top += stride << (wbits-1); \ + npoints -= stride; \ + } \ + rows = row = alloca(2*sizeof(ptype##_affine) * npoints * nwin); \ + for (i = 0; i < npoints; i++, row += nwin) \ + point = *points ? *points++ : point+1, \ + ptype##_precompute_row_wbits(row, wbits, point); \ + ptype##s_to_affine_row_wbits(&table[top], rows, wbits, npoints); \ +} \ +\ +size_t prefix##s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints) \ +{ return (sizeof(ptype##_affine)*npoints) << (wbits-1); } \ +void prefix##s_mult_wbits_precompute(ptype##_affine table[], size_t wbits, \ + const ptype##_affine *const points[], \ + size_t npoints) \ +{ ptype##s_precompute_wbits(table, wbits, points, npoints); } + +#define POINTS_MULT_WBITS_IMPL(prefix, ptype, bits, field, one) \ +static void ptype##_gather_booth_wbits(ptype *p, const ptype##_affine row[], \ + size_t wbits, limb_t booth_idx) \ +{ \ + bool_t booth_sign = (booth_idx >> wbits) & 1; \ + bool_t idx_is_zero; \ + static const ptype##_affine infinity = { 0 }; \ +\ + booth_idx &= ((limb_t)1 << wbits) - 1; \ + idx_is_zero = is_zero(booth_idx); \ + booth_idx -= 1 ^ idx_is_zero; \ + vec_select(p, &infinity, &row[booth_idx], sizeof(row[0]), idx_is_zero); \ + ptype##_cneg(p, booth_sign); \ +} \ +\ +static void ptype##s_mult_wbits(ptype *ret, const ptype##_affine table[], \ + size_t wbits, size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype scratch[]) \ +{ \ + limb_t wmask, wval; \ + size_t i, j, z, nbytes, window, nwin = (size_t)1 << (wbits-1); \ + const byte *scalar, *const *scalar_s = scalars; \ + const ptype##_affine *row = table; \ +\ + size_t scratch_sz = SCRATCH_SZ(ptype); \ + if (scratch == NULL) { \ + scratch_sz /= 4; /* limit to 288K */ \ + scratch_sz = scratch_sz < npoints ? scratch_sz : npoints; \ + scratch = alloca(sizeof(ptype) * scratch_sz); \ + } \ +\ + nbytes = (nbits + 7)/8; /* convert |nbits| to bytes */ \ + scalar = *scalar_s++; \ +\ + /* top excess bits modulo target window size */ \ + window = nbits % wbits; /* yes, it may be zero */ \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ +\ + nbits -= window; \ + z = is_zero(nbits); \ + wval = (get_wval_limb(scalar, nbits - (z^1), wbits + (z^1)) << z) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[0], row, wbits, wval); \ + row += nwin; \ +\ + i = 1; vec_zero(ret, sizeof(*ret)); \ + while (nbits > 0) { \ + for (j = i; i < npoints; i++, j++, row += nwin) { \ + if (j == scratch_sz) \ + ptype##s_accumulate(ret, scratch, j), j = 0; \ + scalar = *scalar_s ? 
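
booth_encode and ptype##_gather_booth_wbits above work because a scalar can be rewritten in signed wbits-wide digits whose magnitudes never exceed 2^(wbits-1); that is why each precomputed row only stores nwin = 1<<(wbits-1) multiples and negative digits are handled by point negation. A standalone recoding sketch follows (this routine is not blst's booth_encode; the function, digit bound check, and test scalar are illustrative assumptions, and it uses the GCC/Clang `__int128` extension for the check):

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

/* recode a 64-bit scalar into signed w-bit digits, least significant first;
 * every digit ends up in [-(2^(w-1)-1), 2^(w-1)] */
static size_t signed_window_recode(int32_t digits[], uint64_t k, unsigned w)
{
    size_t n = 0;
    int carry = 0;

    while (k || carry) {
        int d = (int)(k & ((1u << w) - 1)) + carry;
        k >>= w;
        carry = d > (1 << (w - 1));      /* push values above 2^(w-1) upward */
        if (carry) d -= 1 << w;          /* this digit becomes negative      */
        digits[n++] = d;
    }
    return n;
}

int main(void)
{
    uint64_t k = 0xd201000000010000;     /* |z|, the BLS12-381 curve parameter */
    int32_t d[32];
    size_t n = signed_window_recode(d, k, 5), i;
    __int128 acc = 0, scale = 1;

    for (i = 0; i < n; i++, scale *= 32)
        acc += (__int128)d[i] * scale;
    assert(acc == (__int128)k);          /* signed digits reconstruct the scalar */
    printf("recoded into %zu signed 5-bit digits\n", n);
    return 0;
}
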
*scalar_s++ : scalar+nbytes; \ + wval = get_wval_limb(scalar, nbits - 1, window + 1) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \ + } \ + ptype##s_accumulate(ret, scratch, j); \ +\ + for (j = 0; j < wbits; j++) \ + ptype##_double(ret, ret); \ +\ + window = wbits; \ + wmask = ((limb_t)1 << (window + 1)) - 1; \ + nbits -= window; \ + i = 0; row = table; scalar_s = scalars; \ + } \ +\ + for (j = i; i < npoints; i++, j++, row += nwin) { \ + if (j == scratch_sz) \ + ptype##s_accumulate(ret, scratch, j), j = 0; \ + scalar = *scalar_s ? *scalar_s++ : scalar+nbytes; \ + wval = (get_wval_limb(scalar, 0, wbits) << 1) & wmask; \ + wval = booth_encode(wval, wbits); \ + ptype##_gather_booth_wbits(&scratch[j], row, wbits, wval); \ + } \ + ptype##s_accumulate(ret, scratch, j); \ +} \ +\ +size_t prefix##s_mult_wbits_scratch_sizeof(size_t npoints) \ +{ \ + const size_t scratch_sz = SCRATCH_SZ(ptype); \ + return sizeof(ptype) * (npoints < scratch_sz ? npoints : scratch_sz); \ +} \ +void prefix##s_mult_wbits(ptype *ret, const ptype##_affine table[], \ + size_t wbits, size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype scratch[]) \ +{ ptype##s_mult_wbits(ret, table, wbits, npoints, scalars, nbits, scratch); } + +PRECOMPUTE_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p) +POINTS_MULT_WBITS_IMPL(blst_p1, POINTonE1, 384, fp, BLS12_381_Rx.p) + +PRECOMPUTE_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINTS_MULT_WBITS_IMPL(blst_p2, POINTonE2, 384x, fp2, BLS12_381_Rx.p2) + +/* + * Pippenger algorithm implementation, fastest option for larger amount + * of points... + */ + +static size_t pippenger_window_size(size_t npoints) +{ + size_t wbits; + + for (wbits=0; npoints>>=1; wbits++) ; + + return wbits>12 ? wbits-3 : (wbits>4 ? wbits-2 : (wbits ? 2 : 1)); +} + +#define DECLARE_PRIVATE_POINTXYZZ(ptype, bits) \ +typedef struct { vec##bits X,Y,ZZZ,ZZ; } ptype##xyzz; + +#define POINTS_MULT_PIPPENGER_IMPL(prefix, ptype) \ +static void ptype##_integrate_buckets(ptype *out, ptype##xyzz buckets[], \ + size_t wbits) \ +{ \ + ptype##xyzz ret[1], acc[1]; \ + size_t n = (size_t)1 << wbits; \ +\ + /* Calculate sum of x[i-1]*i for i=1 through 1<<|wbits|. 
*/\ + vec_copy(acc, &buckets[--n], sizeof(acc)); \ + vec_copy(ret, &buckets[n], sizeof(ret)); \ + vec_zero(&buckets[n], sizeof(buckets[n])); \ + while (n--) { \ + ptype##xyzz_dadd(acc, acc, &buckets[n]); \ + ptype##xyzz_dadd(ret, ret, acc); \ + vec_zero(&buckets[n], sizeof(buckets[n])); \ + } \ + ptype##xyzz_to_Jacobian(out, ret); \ +} \ +\ +static void ptype##_bucket(ptype##xyzz buckets[], limb_t booth_idx, \ + size_t wbits, const ptype##_affine *p) \ +{ \ + bool_t booth_sign = (booth_idx >> wbits) & 1; \ +\ + booth_idx &= (1< nbits) wbits = nbits - bit0, cbits = wbits + 1; \ + else wbits = cbits = window; \ + ptype##s_tile_pippenger(ret, points, npoints, scalars, nbits, scratch, \ + bit0, wbits, cbits); \ +} \ +void prefix##s_mult_pippenger(ptype *ret, \ + const ptype##_affine *const points[], \ + size_t npoints, \ + const byte *const scalars[], size_t nbits, \ + ptype##xyzz scratch[]) \ +{ \ + if (npoints == 1) { \ + prefix##_from_affine(ret, points[0]); \ + prefix##_mult(ret, ret, scalars[0], nbits); \ + return; \ + } \ + if ((npoints * sizeof(ptype##_affine) * 8 * 3) <= SCRATCH_LIMIT) { \ + ptype##_affine *table = alloca(npoints * sizeof(ptype##_affine) * 8); \ + ptype##s_precompute_wbits(table, 4, points, npoints); \ + ptype##s_mult_wbits(ret, table, 4, npoints, scalars, nbits, NULL); \ + return; \ + } \ + ptype##s_mult_pippenger(ret, points, npoints, scalars, nbits, scratch, 0); \ +} + +DECLARE_PRIVATE_POINTXYZZ(POINTonE1, 384) +POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE1, 384, fp) +POINTXYZZ_DADD_IMPL(POINTonE1, 384, fp) +POINTXYZZ_DADD_AFFINE_IMPL(POINTonE1, 384, fp, BLS12_381_Rx.p) +POINTS_MULT_PIPPENGER_IMPL(blst_p1, POINTonE1) + +DECLARE_PRIVATE_POINTXYZZ(POINTonE2, 384x) +POINTXYZZ_TO_JACOBIAN_IMPL(POINTonE2, 384x, fp2) +POINTXYZZ_DADD_IMPL(POINTonE2, 384x, fp2) +POINTXYZZ_DADD_AFFINE_IMPL(POINTonE2, 384x, fp2, BLS12_381_Rx.p2) +POINTS_MULT_PIPPENGER_IMPL(blst_p2, POINTonE2) diff --git a/crypto/blst_src/no_asm.h b/crypto/blst_src/no_asm.h new file mode 100644 index 00000000000..be7bf47e197 --- /dev/null +++ b/crypto/blst_src/no_asm.h @@ -0,0 +1,1345 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
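
The comment in ptype##_integrate_buckets above ("sum of x[i-1]*i") is easiest to see with plain integers: one pass of suffix sums feeding a second accumulator yields the weighted sum without any multiplications. A standalone sketch with toy values (the names and values here are assumptions, not blst code):

#include <stdio.h>
#include <assert.h>

/* integer analogue of ptype##_integrate_buckets: the same recurrence over
 * plain longs computes 1*b[0] + 2*b[1] + ... + n*b[n-1] */
static long integrate_buckets(long bucket[], size_t n)
{
    long acc = bucket[n-1], ret = bucket[n-1];
    size_t i = n - 1;

    while (i--) {
        acc += bucket[i];   /* acc = b[i] + b[i+1] + ... + b[n-1]          */
        ret += acc;         /* ret picks up one extra copy of the tail sum */
    }
    return ret;
}

int main(void)
{
    long b[5] = {4, 0, 7, 1, 3}, expect = 0;
    size_t i;

    for (i = 0; i < 5; i++)
        expect += (long)(i + 1) * b[i];
    assert(integrate_buckets(b, 5) == expect);
    printf("weighted bucket sum = %ld\n", expect);
    return 0;
}
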
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#if LIMB_T_BITS==32 +typedef unsigned long long llimb_t; +#endif + +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 || defined(__STDC_NO_VLA__) +# error "unsupported compiler" +#endif + +#if defined(__clang__) +# pragma GCC diagnostic ignored "-Wstatic-in-inline" +#endif + +#if !defined(__clang__) && !defined(__builtin_assume) +# if defined(__GNUC__) && __GNUC__>=5 +# define __builtin_assume(condition) if (!(condition)) __builtin_unreachable() +# elif defined(_MSC_VER) +# define __builtin_assume(condition) __assume(condition) +# else +# define __builtin_assume(condition) (void)(condition) +# endif +#endif + +static void mul_mont_n(limb_t ret[], const limb_t a[], const limb_t b[], + const limb_t p[], limb_t n0, size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t mask, borrow, mx, hi, tmp[n+1], carry; + size_t i, j; + + for (mx=b[0], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + + for (carry=0, j=0; ; ) { + limbx = (mx * (llimb_t)p[0]) + tmp[0]; + hi = (limb_t)(limbx >> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + limbx = tmp[i] + (hi + (llimb_t)carry); + tmp[i-1] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + + if (++j==n) + break; + + for (mx=b[j], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + limbx = hi + (llimb_t)carry; + tmp[i] = (limb_t)limbx; + carry = (limb_t)(limbx >> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + launder(mask); + + for (carry=0, i=0; i> LIMB_T_BITS); + } +} + +#define SUB_MOD_IMPL(bits) \ +inline void sub_mod_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits b, const vec##bits p) \ +{ sub_mod_n(ret, a, b, p, NLIMBS(bits)); } + +SUB_MOD_IMPL(256) +SUB_MOD_IMPL(384) + +static void mul_by_3_mod_n(limb_t ret[], const limb_t a[], const limb_t p[], + size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t mask, carry, borrow, tmp[n], two_a[n]; + size_t i; + + for (carry=0, i=0; i>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS) & 1; + } + + flag &= vec_is_zero(a, sizeof(tmp)) ^ 1; + mask = (limb_t)0 - flag; + + for(i=0; i> LIMB_T_BITS) & 1; + } + + return borrow & (is_zero(acc) ^ 1); +} + +#define CHECK_MOD_IMPL(bits) \ +inline limb_t check_mod_##bits(const pow##bits a, const vec##bits p) \ +{ return check_mod_n(a, p, NLIMBS(bits)); } + +CHECK_MOD_IMPL(256) + +static limb_t add_n_check_mod_n(byte ret[], const byte a[], const byte b[], + const limb_t p[], size_t n) +{ + __builtin_assume(n != 0); + limb_t ret_[n], a_[n], b_[n], zero; + + limbs_from_le_bytes(a_, a, sizeof(a_)); + limbs_from_le_bytes(b_, b, sizeof(b_)); + + add_mod_n(ret_, a_, b_, p, n); + zero = vec_is_zero(ret_, sizeof(ret_)); + + le_bytes_from_limbs(ret, ret_, sizeof(ret_)); + + return zero^1; +} + +#define ADD_N_CHECK_MOD_IMPL(bits) \ +inline limb_t add_n_check_mod_##bits(pow##bits ret, const pow##bits a, \ + const pow##bits b, const vec##bits p) \ +{ return 
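
mul_mont_n above is the multi-limb, constant-time form of Montgomery multiplication. The single-limb case shows the shape of each iteration: multiply, derive m = t*n0 mod R with n0 = -p^-1 mod R, add m*p to cancel the low limb, divide by R, and conditionally subtract p. A standalone sketch using the 64-bit "Goldilocks" prime as a stand-in modulus (the prime, its n0 constant, and the `unsigned __int128` extension are illustrative assumptions; BLS12-381's modulus is 384 bits wide):

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

typedef unsigned __int128 u128;

static const uint64_t P  = 0xffffffff00000001ULL;  /* 2^64 - 2^32 + 1          */
static const uint64_t N0 = 0xfffffffeffffffffULL;  /* -P^-1 mod 2^64           */

/* one Montgomery multiplication with R = 2^64 */
static uint64_t mont_mul(uint64_t a, uint64_t b)
{
    u128 t = (u128)a * b;
    uint64_t m = (uint64_t)t * N0;          /* m = t * n0 mod R              */
    u128 u = (t + (u128)m * P) >> 64;       /* (t + m*p) / R, exact division */
    return (uint64_t)(u >= P ? u - P : u);  /* final conditional subtraction */
}

int main(void)
{
    /* Montgomery form of x is x*R mod p; multiplying two Montgomery-form
     * values returns a*b*R mod p, i.e. the result stays in Montgomery form */
    uint64_t a = 123456789, b = 987654321;
    uint64_t aR  = (uint64_t)(((u128)a << 64) % P);
    uint64_t bR  = (uint64_t)(((u128)b << 64) % P);
    uint64_t abR = mont_mul(aR, bR);
    uint64_t ab  = mont_mul(abR, 1);        /* multiplying by 1 strips one R */

    assert(ab == (uint64_t)(((u128)a * b) % P));
    printf("montgomery multiplication ok\n");
    return 0;
}
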
add_n_check_mod_n(ret, a, b, p, NLIMBS(bits)); } + +ADD_N_CHECK_MOD_IMPL(256) + +static limb_t sub_n_check_mod_n(byte ret[], const byte a[], const byte b[], + const limb_t p[], size_t n) +{ + __builtin_assume(n != 0); + limb_t ret_[n], a_[n], b_[n], zero; + + limbs_from_le_bytes(a_, a, sizeof(a_)); + limbs_from_le_bytes(b_, b, sizeof(b_)); + + sub_mod_n(ret_, a_, b_, p, n); + zero = vec_is_zero(ret_, sizeof(ret_)); + + le_bytes_from_limbs(ret, ret_, sizeof(ret_)); + + return zero^1; +} + +#define SUB_N_CHECK_MOD_IMPL(bits) \ +inline limb_t sub_n_check_mod_##bits(pow##bits ret, const pow##bits a, \ + const pow##bits b, const vec##bits p) \ +{ return sub_n_check_mod_n(ret, a, b, p, NLIMBS(bits)); } + +SUB_N_CHECK_MOD_IMPL(256) + +static void from_mont_n(limb_t ret[], const limb_t a[], + const limb_t p[], limb_t n0, size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t mask, borrow, mx, hi, tmp[n]; + size_t i, j; + + for (j=0; j> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + tmp[i-1] = hi; + a = tmp; + } + + /* this is needed only if input can be non-fully-reduced */ + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + tmp[i-1] = hi; + b = tmp; + } + + for (carry=0, i=0; i> LIMB_T_BITS); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + mask = carry - borrow; + launder(mask); + + for(i=0; i> LIMB_T_BITS); + } + + for (next=ret[0], i=0; i> 1; + next = ret[i+1]; + ret[i] = limb | next << (LIMB_T_BITS-1); + } + ret[i] = next >> 1 | carry << (LIMB_T_BITS-1); + + a = ret; + } +} + +#define RSHIFT_MOD_IMPL(bits) \ +inline void rshift_mod_##bits(vec##bits ret, const vec##bits a, size_t count, \ + const vec##bits p) \ +{ rshift_mod_n(ret, a, count, p, NLIMBS(bits)); } + +RSHIFT_MOD_IMPL(256) +RSHIFT_MOD_IMPL(384) + +#define DIV_BY_2_MOD_IMPL(bits) \ +inline void div_by_2_mod_##bits(vec##bits ret, const vec##bits a, \ + const vec##bits p) \ +{ rshift_mod_n(ret, a, 1, p, NLIMBS(bits)); } + +DIV_BY_2_MOD_IMPL(384) + +static limb_t sgn0_pty_mod_n(const limb_t a[], const limb_t p[], size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t carry, borrow, ret, tmp[n]; + size_t i; + + ret = a[0] & 1; /* parity */ + + for (carry=0, i=0; i>(LIMB_T_BITS-1); + } + + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + + ret |= ((carry - borrow) & 2) ^ 2; + + return ret; +} + +inline limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p) +{ return sgn0_pty_mod_n(a, p, NLIMBS(384)); } + +inline limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0) +{ + vec384 tmp; + + from_mont_n(tmp, a, p, n0, NLIMBS(384)); + + return sgn0_pty_mod_n(tmp, p, NLIMBS(384)); +} + +inline limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p) +{ + limb_t re, im, sign, prty; + + re = sgn0_pty_mod_n(a[0], p, NLIMBS(384)); + im = sgn0_pty_mod_n(a[1], p, NLIMBS(384)); + + /* a->im!=0 ? sgn0(a->im) : sgn0(a->re) */ + sign = (limb_t)0 - vec_is_zero(a[1], sizeof(vec384)); + sign = (re & sign) | (im & ~sign); + + /* a->re==0 ? 
prty(a->im) : prty(a->re) */ + prty = (limb_t)0 - vec_is_zero(a[0], sizeof(vec384)); + prty = (im & prty) | (re & ~prty); + + return (sign & 2) | (prty & 1); +} + +inline limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0) +{ + vec384x tmp; + + from_mont_n(tmp[0], a[0], p, n0, NLIMBS(384)); + from_mont_n(tmp[1], a[1], p, n0, NLIMBS(384)); + + return sgn0_pty_mod_384x(tmp, p); +} + +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p, limb_t n0) +{ + vec384 aa, bb, cc; + + add_mod_n(aa, a[0], a[1], p, NLIMBS(384)); + add_mod_n(bb, b[0], b[1], p, NLIMBS(384)); + mul_mont_n(bb, bb, aa, p, n0, NLIMBS(384)); + mul_mont_n(aa, a[0], b[0], p, n0, NLIMBS(384)); + mul_mont_n(cc, a[1], b[1], p, n0, NLIMBS(384)); + sub_mod_n(ret[0], aa, cc, p, NLIMBS(384)); + sub_mod_n(ret[1], bb, aa, p, NLIMBS(384)); + sub_mod_n(ret[1], ret[1], cc, p, NLIMBS(384)); +} + +/* + * mul_mont_n without final conditional subtraction, which implies + * that modulus is one bit short, which in turn means that there are + * no carries to handle between iterations... + */ +static void mul_mont_nonred_n(limb_t ret[], const limb_t a[], const limb_t b[], + const limb_t p[], limb_t n0, size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t mx, hi, tmp[n+1]; + size_t i, j; + + for (mx=b[0], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + + for (j=0; ; ) { + limbx = (mx * (llimb_t)p[0]) + tmp[0]; + hi = (limb_t)(limbx >> LIMB_T_BITS); + for (i=1; i> LIMB_T_BITS); + } + tmp[i-1] = tmp[i] + hi; + + if (++j==n) + break; + + for (mx=b[j], hi=0, i=0; i> LIMB_T_BITS); + } + mx = n0*tmp[0]; + tmp[i] = hi; + } + + vec_copy(ret, tmp, sizeof(tmp)-sizeof(limb_t)); +} + +void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b) +{ + __builtin_assume(count != 0); + while(count--) { + mul_mont_nonred_n(ret, a, a, p, n0, NLIMBS(384)); + a = ret; + } + mul_mont_n(ret, ret, b, p, n0, NLIMBS(384)); +} + +void sqr_mont_382x(vec384x ret, const vec384x a, + const vec384 p, limb_t n0) +{ + llimb_t limbx; + limb_t mask, carry, borrow; + size_t i; + vec384 t0, t1; + + /* "add_mod_n(t0, a[0], a[1], p, NLIMBS(384));" */ + for (carry=0, i=0; i> LIMB_T_BITS); + } + + /* "sub_mod_n(t1, a[0], a[1], p, NLIMBS(384));" */ + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + mask = 0 - borrow; + launder(mask); + + /* "mul_mont_n(ret[1], a[0], a[1], p, n0, NLIMBS(384));" */ + mul_mont_nonred_n(ret[1], a[0], a[1], p, n0, NLIMBS(384)); + + /* "add_mod_n(ret[1], ret[1], ret[1], p, NLIMBS(384));" */ + for (carry=0, i=0; i>(LIMB_T_BITS-1); + } + + /* "mul_mont_n(ret[0], t0, t1, p, n0, NLIMBS(384));" */ + mul_mont_nonred_n(ret[0], t0, t1, p, n0, NLIMBS(384)); + + /* account for t1's sign... 
*/ + for (borrow=0, i=0; i> LIMB_T_BITS) & 1; + } + mask = 0 - borrow; + launder(mask); + for (carry=0, i=0; i> LIMB_T_BITS); + } +} + +#if defined(__GNUC__) || defined(__clang__) +# define MSB(x) ({ limb_t ret = (x) >> (LIMB_T_BITS-1); launder(ret); ret; }) +#else +# define MSB(x) ((x) >> (LIMB_T_BITS-1)) +#endif + +static size_t num_bits(limb_t l) +{ + limb_t x, mask; + size_t bits = is_zero(l) ^ 1; + + if (sizeof(limb_t) == 8) { + x = l >> (32 & (8*sizeof(limb_t)-1)); + mask = 0 - MSB(0 - x); + bits += 32 & mask; + l ^= (x ^ l) & mask; + } + + x = l >> 16; + mask = 0 - MSB(0 - x); + bits += 16 & mask; + l ^= (x ^ l) & mask; + + x = l >> 8; + mask = 0 - MSB(0 - x); + bits += 8 & mask; + l ^= (x ^ l) & mask; + + x = l >> 4; + mask = 0 - MSB(0 - x); + bits += 4 & mask; + l ^= (x ^ l) & mask; + + x = l >> 2; + mask = 0 - MSB(0 - x); + bits += 2 & mask; + l ^= (x ^ l) & mask; + + bits += l >> 1; + + return bits; +} + +#if defined(__clang_major__) && __clang_major__>7 +__attribute__((optnone)) +#endif +static limb_t lshift_2(limb_t hi, limb_t lo, size_t l) +{ + size_t r = LIMB_T_BITS - l; + limb_t mask = 0 - (is_zero(l)^1); + return (hi << (l&(LIMB_T_BITS-1))) | ((lo & mask) >> (r&(LIMB_T_BITS-1))); +} + +/* + * https://eprint.iacr.org/2020/972 with 'k' being LIMB_T_BITS-1. + */ +static void ab_approximation_n(limb_t a_[2], const limb_t a[], + limb_t b_[2], const limb_t b[], size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + limb_t a_hi, a_lo, b_hi, b_lo, mask; + size_t i; + + i = n-1; + a_hi = a[i], a_lo = a[i-1]; + b_hi = b[i], b_lo = b[i-1]; + for (i--; --i;) { + mask = 0 - is_zero(a_hi | b_hi); + a_hi = ((a_lo ^ a_hi) & mask) ^ a_hi; + b_hi = ((b_lo ^ b_hi) & mask) ^ b_hi; + a_lo = ((a[i] ^ a_lo) & mask) ^ a_lo; + b_lo = ((b[i] ^ b_lo) & mask) ^ b_lo; + } + i = LIMB_T_BITS - num_bits(a_hi | b_hi); + /* |i| can be LIMB_T_BITS if all a[2..]|b[2..] 
were zeros */ + + a_[0] = a[0], a_[1] = lshift_2(a_hi, a_lo, i); + b_[0] = b[0], b_[1] = lshift_2(b_hi, b_lo, i); +} + +typedef struct { limb_t f0, g0, f1, g1; } factors; + +static void inner_loop_n(factors *fg, const limb_t a_[2], const limb_t b_[2], + size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t f0 = 1, g0 = 0, f1 = 0, g1 = 1; + limb_t a_lo, a_hi, b_lo, b_hi, t_lo, t_hi, odd, borrow, xorm; + + a_lo = a_[0], a_hi = a_[1]; + b_lo = b_[0], b_hi = b_[1]; + + while(n--) { + odd = 0 - (a_lo&1); + + /* a_ -= b_ if a_ is odd */ + t_lo = a_lo, t_hi = a_hi; + limbx = a_lo - (llimb_t)(b_lo & odd); + a_lo = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow); + a_hi = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS); + + /* negate a_-b_ if it borrowed */ + a_lo ^= borrow; + a_hi ^= borrow; + limbx = a_lo + (llimb_t)(borrow & 1); + a_lo = (limb_t)limbx; + a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1; + + /* b_=a_ if a_-b_ borrowed */ + b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; + b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; + + /* exchange f0 and f1 if a_-b_ borrowed */ + xorm = (f0 ^ f1) & borrow; + f0 ^= xorm; + f1 ^= xorm; + + /* exchange g0 and g1 if a_-b_ borrowed */ + xorm = (g0 ^ g1) & borrow; + g0 ^= xorm; + g1 ^= xorm; + + /* subtract if a_ was odd */ + f0 -= f1 & odd; + g0 -= g1 & odd; + + f1 <<= 1; + g1 <<= 1; + a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1); + a_hi >>= 1; + } + + fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1= g1; +} + +static limb_t cneg_n(limb_t ret[], const limb_t a[], limb_t neg, size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx = 0; + limb_t carry; + size_t i; + + for (carry=neg&1, i=0; i> LIMB_T_BITS); + } + + return 0 - MSB((limb_t)limbx); +} + +static limb_t add_n(limb_t ret[], const limb_t a[], limb_t b[], size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t carry; + size_t i; + + for (carry=0, i=0; i> LIMB_T_BITS); + } + + return carry; +} + +static limb_t umul_n(limb_t ret[], const limb_t a[], limb_t b, size_t n) +{ + __builtin_assume(n != 0); + llimb_t limbx; + limb_t hi; + size_t i; + + for (hi=0, i=0; i> LIMB_T_BITS); + } + + return hi; +} + +static limb_t smul_n_shift_n(limb_t ret[], const limb_t a[], limb_t *f_, + const limb_t b[], limb_t *g_, + size_t n) +{ + __builtin_assume(n != 0); + limb_t a_[n+1], b_[n+1], f, g, neg, carry, hi; + size_t i; + + /* |a|*|f_| */ + f = *f_; + neg = 0 - MSB(f); + f = (f ^ neg) - neg; /* ensure |f| is positive */ + (void)cneg_n(a_, a, neg, n); + hi = umul_n(a_, a_, f, n); + a_[n] = hi - (f & neg); + + /* |b|*|g_| */ + g = *g_; + neg = 0 - MSB(g); + g = (g ^ neg) - neg; /* ensure |g| is positive */ + (void)cneg_n(b_, b, neg, n); + hi = umul_n(b_, b_, g, n); + b_[n] = hi - (g & neg); + + /* |a|*|f_| + |b|*|g_| */ + (void)add_n(a_, a_, b_, n+1); + + /* (|a|*|f_| + |b|*|g_|) >> k */ + for (carry=a_[0], i=0; i> (LIMB_T_BITS-2); + carry = a_[i+1]; + ret[i] = hi | (carry << 2); + } + + /* ensure result is non-negative, fix up |f_| and |g_| accordingly */ + neg = 0 - MSB(carry); + *f_ = (*f_ ^ neg) - neg; + *g_ = (*g_ ^ neg) - neg; + (void)cneg_n(ret, ret, neg, n); + + return neg; +} + +static limb_t smul_2n(limb_t ret[], const limb_t u[], limb_t f, + const limb_t v[], limb_t g, size_t n) +{ + __builtin_assume(n != 0); + limb_t u_[n], v_[n], neg, hi; + + /* |u|*|f_| */ + neg = 0 - MSB(f); + f = (f ^ neg) - neg; /* ensure |f| is positive */ + neg = cneg_n(u_, u, neg, n); + hi = umul_n(u_, u_, f, n) - (f&neg); + + /* |v|*|g_| */ + neg = 0 
- MSB(g); + g = (g ^ neg) - neg; /* ensure |g| is positive */ + neg = cneg_n(v_, v, neg, n); + hi += umul_n(v_, v_, g, n) - (g&neg); + + /* |u|*|f_| + |v|*|g_| */ + hi += add_n(ret, u_, v_, n); + + return hi; +} + +static void ct_inverse_mod_n(limb_t ret[], const limb_t inp[], + const limb_t mod[], const limb_t modx[], size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t a[n], b[n], u[2*n], v[2*n], t[2*n]; + limb_t a_[2], b_[2], sign, carry, top; + factors fg; + size_t i; + + vec_copy(a, inp, sizeof(a)); + vec_copy(b, mod, sizeof(b)); + vec_zero(u, sizeof(u)); u[0] = 1; + vec_zero(v, sizeof(v)); + + for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) { + ab_approximation_n(a_, a, b_, b, n); + inner_loop_n(&fg, a_, b_, LIMB_T_BITS-2); + (void)smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n); + (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n); + vec_copy(a, t, sizeof(a)); + smul_2n(t, u, fg.f0, v, fg.g0, 2*n); + smul_2n(v, u, fg.f1, v, fg.g1, 2*n); + vec_copy(u, t, sizeof(u)); + } + + inner_loop_n(&fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2)); + top = smul_2n(ret, u, fg.f1, v, fg.g1, 2*n); + + sign = 0 - MSB(top); /* top is 1, 0 or -1 */ + for (carry=0, i=0; i> LIMB_T_BITS); + } + top += carry; + sign = 0 - top; /* top is 1, 0 or -1 */ + top |= sign; + for (i=0; i> LIMB_T_BITS) & 1; + limbx = a_hi - ((llimb_t)(b_hi & odd) + borrow); + a_hi = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS); + + L += ((t_lo & b_lo) >> 1) & borrow; + + /* negate a_-b_ if it borrowed */ + a_lo ^= borrow; + a_hi ^= borrow; + limbx = a_lo + (llimb_t)(borrow & 1); + a_lo = (limb_t)limbx; + a_hi += (limb_t)(limbx >> LIMB_T_BITS) & 1; + + /* b_=a_ if a_-b_ borrowed */ + b_lo = ((t_lo ^ b_lo) & borrow) ^ b_lo; + b_hi = ((t_hi ^ b_hi) & borrow) ^ b_hi; + + /* exchange f0 and f1 if a_-b_ borrowed */ + xorm = (f0 ^ f1) & borrow; + f0 ^= xorm; + f1 ^= xorm; + + /* exchange g0 and g1 if a_-b_ borrowed */ + xorm = (g0 ^ g1) & borrow; + g0 ^= xorm; + g1 ^= xorm; + + /* subtract if a_ was odd */ + f0 -= f1 & odd; + g0 -= g1 & odd; + + f1 <<= 1; + g1 <<= 1; + a_lo >>= 1; a_lo |= a_hi << (LIMB_T_BITS-1); + a_hi >>= 1; + + L += (b_lo + 2) >> 2; + } + + fg->f0 = f0, fg->g0 = g0, fg->f1 = f1, fg->g1 = g1; + + return L; +} + +static bool_t ct_is_sqr_mod_n(const limb_t inp[], const limb_t mod[], size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + limb_t a[n], b[n], t[n]; + limb_t a_[2], b_[2], neg, L = 0; + factors fg; + size_t i; + + vec_copy(a, inp, sizeof(a)); + vec_copy(b, mod, sizeof(b)); + + for (i=0; i<(2*n*LIMB_T_BITS)/(LIMB_T_BITS-2); i++) { + ab_approximation_n(a_, a, b_, b, n); + L = legendre_loop_n(L, &fg, a_, b_, LIMB_T_BITS-2); + neg = smul_n_shift_n(t, a, &fg.f0, b, &fg.g0, n); + (void)smul_n_shift_n(b, a, &fg.f1, b, &fg.g1, n); + vec_copy(a, t, sizeof(a)); + L += (b[0] >> 1) & neg; + } + + L = legendre_loop_n(L, &fg, a, b, (2*n*LIMB_T_BITS)%(LIMB_T_BITS-2)); + + return (L & 1) ^ 1; +} + +#define CT_IS_SQR_MOD_IMPL(bits) \ +inline bool_t ct_is_square_mod_##bits(const vec##bits inp, \ + const vec##bits mod) \ +{ return ct_is_sqr_mod_n(inp, mod, NLIMBS(bits)); } + +CT_IS_SQR_MOD_IMPL(384) + +/* + * |div_top| points at two most significant limbs of the dividend, |d_hi| + * and |d_lo| are two most significant limbs of the divisor. If divisor + * is only one limb, it is to be passed in |d_hi| with zero in |d_lo|. + * The divisor is required to be "bitwise left-aligned," and dividend's + * top limbs to be not larger than the divisor's. 
The latter limitation + * can be problematic in the first iteration of multi-precision division, + * where in most general case the condition would have to be "smaller." + * The subroutine considers four limbs, two of which are "overlapping," + * hence the name... Another way to look at it is to think of the pair + * of the dividend's limbs being suffixed with a zero: + * +-------+-------+-------+ + * R | | | 0 | + * +-------+-------+-------+ + * +-------+-------+ + * D | | | + * +-------+-------+ + */ +limb_t div_3_limbs(const limb_t div_top[2], limb_t d_lo, limb_t d_hi) +{ + llimb_t Rx; + limb_t r_lo = div_top[0], r_hi = div_top[1]; + limb_t Q = 0, mask, borrow, rx; + size_t i; + + for (i = 0; i < LIMB_T_BITS; i++) { + /* "borrow, Rx = R - D" */ + Rx = (llimb_t)r_lo - d_lo; + rx = (limb_t)Rx; + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + Rx = r_hi - (d_hi + (llimb_t)borrow); + borrow = (limb_t)(Rx >> LIMB_T_BITS); + + /* "if (R >= D) R -= D" */ + r_lo = ((r_lo ^ rx) & borrow) ^ rx; + rx = (limb_t)Rx; + r_hi = ((r_hi ^ rx) & borrow) ^ rx; + + Q <<= 1; + Q |= ~borrow & 1; + + /* "D >>= 1" */ + d_lo >>= 1; d_lo |= d_hi << (LIMB_T_BITS - 1); + d_hi >>= 1; + } + + mask = 0 - MSB(Q); /* does it overflow? */ + + /* "borrow, Rx = R - D" */ + Rx = (llimb_t)r_lo - d_lo; + rx = (limb_t)Rx; + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + Rx = r_hi - (d_hi + (llimb_t)borrow); + borrow = (limb_t)(Rx >> LIMB_T_BITS) & 1; + + Q <<= 1; + Q |= borrow ^ 1; + + return (Q | mask); +} + +static limb_t quot_rem_n(limb_t *div_rem, const limb_t *divisor, + limb_t quotient, size_t n) +{ + __builtin_assume(n != 0 && n%2 == 0); + llimb_t limbx; + limb_t tmp[n+1], carry, mask, borrow; + size_t i; + + /* divisor*quotient */ + for (carry=0, i=0; i> LIMB_T_BITS); + } + tmp[i] = carry; + + /* remainder = dividend - divisor*quotient */ + for (borrow=0, i=0; i<=n; i++) { + limbx = div_rem[i] - (tmp[i] + (llimb_t)borrow); + tmp[i] = (limb_t)limbx; + borrow = (limb_t)(limbx >> LIMB_T_BITS) & 1; + } + + mask = 0 - borrow; + launder(mask); + + /* if quotient was off by one, add divisor to the remainder */ + for (carry=0, i=0; i> LIMB_T_BITS) & 1; + } + + return (div_rem[i] = quotient + mask); +} + +inline limb_t quot_rem_128(limb_t *div_rem, const limb_t *divisor, + limb_t quotient) +{ return quot_rem_n(div_rem, divisor, quotient, NLIMBS(128)); } + +inline limb_t quot_rem_64(limb_t *div_rem, const limb_t *divisor, + limb_t quotient) +{ return quot_rem_n(div_rem, divisor, quotient, NLIMBS(64)); } + +/* + * Unlock reference implementations in vect.c + */ +#define mul_by_8_mod_384 mul_by_8_mod_384 +#define mul_by_8_mod_384x mul_by_8_mod_384x +#define mul_by_3_mod_384x mul_by_3_mod_384x +#define mul_by_1_plus_i_mod_384x mul_by_1_plus_i_mod_384x +#define add_mod_384x add_mod_384x +#define sub_mod_384x sub_mod_384x +#define lshift_mod_384x lshift_mod_384x +#define sqr_mont_384x sqr_mont_384x + +inline void vec_prefetch(const void *ptr, size_t len) +{ (void)ptr; (void)len; } + +/* + * SHA-256 + */ +#define ROTR(x,n) ((x)>>n | (x)<<(32-n)) +#define Sigma0(x) (ROTR((x),2) ^ ROTR((x),13) ^ ROTR((x),22)) +#define Sigma1(x) (ROTR((x),6) ^ ROTR((x),11) ^ ROTR((x),25)) +#define sigma0(x) (ROTR((x),7) ^ ROTR((x),18) ^ ((x)>>3)) +#define sigma1(x) (ROTR((x),17) ^ ROTR((x),19) ^ ((x)>>10)) +#define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z))) +#define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + +void blst_sha256_block_data_order(unsigned int *v, const void *inp, + size_t blocks) +{ + static const unsigned int K256[64] = { + 0x428a2f98, 
0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 + }; + unsigned int X[16], l, a, b, c, d, e, f, g, h, s0, s1, T1, T2; + const unsigned char *data = inp; + size_t round; + + a = v[0]; + b = v[1]; + c = v[2]; + d = v[3]; + e = v[4]; + f = v[5]; + g = v[6]; + h = v[7]; + + while (blocks--) { + for (round = 0; round < 16; round++) { + l = (unsigned int)data[0] << 24; + l |= (unsigned int)data[1] << 16; + l |= (unsigned int)data[2] << 8; + l |= (unsigned int)data[3]; + data += 4; + T1 = X[round] = l; + T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round]; + T2 = Sigma0(a) + Maj(a, b, c); + h = g; + g = f; + f = e; + e = d + T1; + d = c; + c = b; + b = a; + a = T1 + T2; + } + + for (; round < 64; round++) { + s0 = X[(round + 1) & 0x0f]; + s0 = sigma0(s0); + s1 = X[(round + 14) & 0x0f]; + s1 = sigma1(s1); + + T1 = X[round & 0xf] += s0 + s1 + X[(round + 9) & 0xf]; + T1 += h + Sigma1(e) + Ch(e, f, g) + K256[round]; + T2 = Sigma0(a) + Maj(a, b, c); + h = g; + g = f; + f = e; + e = d + T1; + d = c; + c = b; + b = a; + a = T1 + T2; + } + + a += v[0]; v[0] = a; + b += v[1]; v[1] = b; + c += v[2]; v[2] = c; + d += v[3]; v[3] = d; + e += v[4]; v[4] = e; + f += v[5]; v[5] = f; + g += v[6]; v[6] = g; + h += v[7]; v[7] = h; + } +} +#undef ROTR +#undef Sigma0 +#undef Sigma1 +#undef sigma0 +#undef sigma1 +#undef Ch +#undef Maj + +void blst_sha256_hcopy(unsigned int dst[8], const unsigned int src[8]) +{ + size_t i; + + for (i=0; i<8; i++) + dst[i] = src[i]; +} + +void blst_sha256_emit(unsigned char md[32], const unsigned int h[8]) +{ + size_t i; + + for (i=0; i<8; i++, md+=4) { + unsigned int h_i = h[i]; + md[0] = (unsigned char)(h_i >> 24); + md[1] = (unsigned char)(h_i >> 16); + md[2] = (unsigned char)(h_i >> 8); + md[3] = (unsigned char)h_i; + } +} + +void blst_sha256_bcopy(void *dst_, const void *src_, size_t len) +{ + unsigned char *dst = dst_; + const unsigned char *src = src_; + size_t i; + + for (i=0; iZ); /* Z1Z1 = Z1^2 */ + mul_fp2(U2, Q->X, Z1Z1); /* U2 = X2*Z1Z1 */ + + mul_fp2(S2, Q->Y, R->Z); + mul_fp2(S2, S2, Z1Z1); /* S2 = Y2*Z1*Z1Z1 */ + + sub_fp2(H, U2, R->X); /* H = U2-X1 */ + + sqr_fp2(HH, H); /* HH = H^2 */ + add_fp2(I, HH, HH); + add_fp2(I, I, I); /* I = 4*HH */ + + mul_fp2(J, H, I); /* J = H*I */ + + sub_fp2(r, S2, R->Y); + add_fp2(r, r, r); /* r = 2*(S2-Y1) */ + + mul_fp2(V, R->X, I); /* V = X1*I */ + + sqr_fp2(T->X, r); + sub_fp2(T->X, T->X, J); + sub_fp2(T->X, T->X, V); + sub_fp2(T->X, T->X, V); /* X3 = r^2-J-2*V */ + + mul_fp2(J, J, R->Y); + sub_fp2(T->Y, V, T->X); + mul_fp2(T->Y, T->Y, r); + sub_fp2(T->Y, T->Y, J); + sub_fp2(T->Y, T->Y, J); /* Y3 = r*(V-X3)-2*Y1*J */ + + add_fp2(T->Z, R->Z, H); + sqr_fp2(T->Z, T->Z); + sub_fp2(T->Z, T->Z, Z1Z1); + sub_fp2(T->Z, T->Z, HH); /* Z3 = (Z1+H)^2-Z1Z1-HH */ + + /* + * line evaluation + */ + mul_fp2(I, r, Q->X); + mul_fp2(J, 
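
The reference SHA-256 block function above can be exercised directly: for a message that fits one padded block, initialize the state with the standard IV, run one compression, and emit the digest. A minimal harness follows, assuming it is compiled and linked together with the vendored blst_sha256_block_data_order and blst_sha256_emit; the "abc" digest is the standard test vector:

#include <stdio.h>
#include <string.h>

/* prototypes as declared in the vendored reference implementation above */
void blst_sha256_block_data_order(unsigned int *v, const void *inp, size_t blocks);
void blst_sha256_emit(unsigned char md[32], const unsigned int h[8]);

int main(void)
{
    /* one pre-padded 64-byte block holding the message "abc" */
    unsigned char block[64] = { 'a', 'b', 'c', 0x80 };
    unsigned int v[8] = {                 /* SHA-256 initial hash values */
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
    };
    static const unsigned char expect[32] = {
        0xba,0x78,0x16,0xbf,0x8f,0x01,0xcf,0xea,0x41,0x41,0x40,0xde,
        0x5d,0xae,0x22,0x23,0xb0,0x03,0x61,0xa3,0x96,0x17,0x7a,0x9c,
        0xb4,0x10,0xff,0x61,0xf2,0x00,0x15,0xad
    };
    unsigned char md[32];

    block[63] = 24;                       /* bit length of "abc", big-endian */
    blst_sha256_block_data_order(v, block, 1);
    blst_sha256_emit(md, v);

    printf("%s\n", memcmp(md, expect, 32) == 0 ? "abc digest matches" : "MISMATCH");
    return 0;
}
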
Q->Y, T->Z); + sub_fp2(I, I, J); + add_fp2(line[0], I, I); /* 2*(r*X2 - Y2*Z3) */ +#ifdef r +# undef r +#else + vec_copy(line[1], r, sizeof(r)); +#endif + vec_copy(line[2], T->Z, sizeof(T->Z)); +} + +static void line_dbl(vec384fp6 line, POINTonE2 *T, const POINTonE2 *Q) +{ + vec384x ZZ, A, B, C, D, E, F; + + /* + * https://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-0.html#doubling-dbl-2009-alnr + */ + sqr_fp2(A, Q->X); /* A = X1^2 */ + sqr_fp2(B, Q->Y); /* B = Y1^2 */ + sqr_fp2(ZZ, Q->Z); /* ZZ = Z1^2 */ + sqr_fp2(C, B); /* C = B^2 */ + + add_fp2(D, Q->X, B); /* X1+B */ + sqr_fp2(D, D); /* (X1+B)^2 */ + sub_fp2(D, D, A); /* (X1+B)^2-A */ + sub_fp2(D, D, C); /* (X1+B)^2-A-C */ + add_fp2(D, D, D); /* D = 2*((X1+B)^2-A-C) */ + + mul_by_3_fp2(E, A); /* E = 3*A */ + sqr_fp2(F, E); /* F = E^2 */ + + add_fp2(line[0], E, Q->X); /* 3*A+X1 for line evaluation */ + + sub_fp2(T->X, F, D); + sub_fp2(T->X, T->X, D); /* X3 = F-2*D */ + + add_fp2(T->Z, Q->Y, Q->Z); + sqr_fp2(T->Z, T->Z); + sub_fp2(T->Z, T->Z, B); + sub_fp2(T->Z, T->Z, ZZ); /* Z3 = (Y1+Z1)^2-B-ZZ */ + + mul_by_8_fp2(C, C); /* 8*C */ + sub_fp2(T->Y, D, T->X); /* D-X3 */ + mul_fp2(T->Y, T->Y, E); /* E*(D-X3) */ + sub_fp2(T->Y, T->Y, C); /* Y3 = E*(D-X3)-8*C */ + + /* + * line evaluation + */ + sqr_fp2(line[0], line[0]); + sub_fp2(line[0], line[0], A); + sub_fp2(line[0], line[0], F); /* (3*A+X1)^2 - X1^2 - 9*A^2 */ + lshift_fp2(B, B, 2); + sub_fp2(line[0], line[0], B); /* 6*X1^3 - 4*Y1^2 */ + + mul_fp2(line[1], E, ZZ); /* 3*X1^2 * Z1^2 */ + + mul_fp2(line[2], T->Z, ZZ); /* Z3 * Z1^2 */ +} + +static void line_by_Px2(vec384fp6 line, const POINTonE1_affine *Px2) +{ + mul_fp(line[1][0], line[1][0], Px2->X); /* "b01" *= -2*P->X */ + mul_fp(line[1][1], line[1][1], Px2->X); + + mul_fp(line[2][0], line[2][0], Px2->Y); /* "b11" *= 2*P->Y */ + mul_fp(line[2][1], line[2][1], Px2->Y); +} + +#if 0 +static void add_n_dbl(vec384fp12 ret, POINTonE2 *T, const POINTonE2_affine *Q, + const POINTonE1_affine *Px2, vec384fp6 line, size_t n) +{ + line_add(line, T, T, Q); line_by_Px2(line, Px2); + mul_by_xy00z0_fp12(ret, ret, line); + while (n--) { + sqr_fp12(ret, ret); + line_dbl(line, T, T); line_by_Px2(line, Px2); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void miller_loop(vec384fp12 ret, const POINTonE2 *Q, const POINTonE1 *P) +{ +#define Q ((const POINTonE2_affine *)Q) + POINTonE2 T[1]; + POINTonE1_affine Px2[1]; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* Move common expression from line evaluation to line_by_Px2. 
*/ + add_fp(Px2->X, P->X, P->X); + neg_fp(Px2->X, Px2->X); + add_fp(Px2->Y, P->Y, P->Y); + + vec_copy(T->X, Q->X, 2*sizeof(T->X)); + vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z)); + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + line_dbl(line, T, T); /* 0x2 */ + line_by_Px2(line, Px2); + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + add_n_dbl(ret, T, Q, Px2, line, 2); /* ..0xc */ + add_n_dbl(ret, T, Q, Px2, line, 3); /* ..0x68 */ + add_n_dbl(ret, T, Q, Px2, line, 9); /* ..0xd200 */ + add_n_dbl(ret, T, Q, Px2, line, 32); /* ..0xd20100000000 */ + add_n_dbl(ret, T, Q, Px2, line, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +#undef Q +} +#endif + +static void start_dbl_n(vec384fp12 ret, POINTonE2 T[], + const POINTonE1_affine Px2[], size_t n) +{ + size_t i; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + line_dbl(line, T+0, T+0); line_by_Px2(line, Px2+0); + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + + for (i = 1; i < n; i++) { + line_dbl(line, T+i, T+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void add_n_dbl_n(vec384fp12 ret, POINTonE2 T[], + const POINTonE2_affine Q[], + const POINTonE1_affine Px2[], + size_t n, size_t k) +{ + size_t i; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + for (i = 0; i < n; i++) { + line_add(line, T+i, T+i, Q+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } + while (k--) { + sqr_fp12(ret, ret); + for (i = 0; i < n; i++) { + line_dbl(line, T+i, T+i); line_by_Px2(line, Px2+i); + mul_by_xy00z0_fp12(ret, ret, line); + } + } +} + +static void miller_loop_n(vec384fp12 ret, const POINTonE2_affine Q[], + const POINTonE1_affine P[], size_t n) +{ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 \ + || defined(__STDC_NO_VLA__) + POINTonE2 *T = alloca(n*sizeof(POINTonE2)); + POINTonE1_affine *Px2 = alloca(n*sizeof(POINTonE1_affine)); +#else + POINTonE2 T[n]; + POINTonE1_affine Px2[n]; +#endif + size_t i; + + if ((n == 1) && (vec_is_zero(&Q[0], sizeof(Q[0])) | + vec_is_zero(&P[0], sizeof(P[0]))) ) { + /* + * Special case of infinite aggregated signature, pair the additive + * group's identity with the multiplicative group's identity. + */ + vec_copy(ret, BLS12_381_Rx.p12, sizeof(vec384fp12)); + return; + } + + for (i = 0; i < n; i++) { + /* Move common expression from line evaluation to line_by_Px2. 
*/ + add_fp(Px2[i].X, P[i].X, P[i].X); + neg_fp(Px2[i].X, Px2[i].X); + add_fp(Px2[i].Y, P[i].Y, P[i].Y); + + vec_copy(T[i].X, Q[i].X, 2*sizeof(T[i].X)); + vec_copy(T[i].Z, BLS12_381_Rx.p2, sizeof(T[i].Z)); + } + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + start_dbl_n(ret, T, Px2, n); /* 0x2 */ + add_n_dbl_n(ret, T, Q, Px2, n, 2); /* ..0xc */ + add_n_dbl_n(ret, T, Q, Px2, n, 3); /* ..0x68 */ + add_n_dbl_n(ret, T, Q, Px2, n, 9); /* ..0xd200 */ + add_n_dbl_n(ret, T, Q, Px2, n, 32); /* ..0xd20100000000 */ + add_n_dbl_n(ret, T, Q, Px2, n, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +static void pre_add_n_dbl(vec384fp6 lines[], POINTonE2 *T, + const POINTonE2_affine *Q, + size_t n) +{ + line_add(lines++[0], T, T, Q); + while (n--) + line_dbl(lines++[0], T, T); +} + +static void precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q) +{ + POINTonE2 T[1]; + + vec_copy(T->X, Q->X, 2*sizeof(T->X)); + vec_copy(T->Z, BLS12_381_Rx.p2, sizeof(T->Z)); + + line_dbl(Qlines[0], T, T); /* 0x2 */ + pre_add_n_dbl(&Qlines[1], T, Q, 2); /* ..0xc */ + pre_add_n_dbl(&Qlines[4], T, Q, 3); /* ..0x68 */ + pre_add_n_dbl(&Qlines[8], T, Q, 9); /* ..0xd200 */ + pre_add_n_dbl(&Qlines[18], T, Q, 32); /* ..0xd20100000000 */ + pre_add_n_dbl(&Qlines[51], T, Q, 16); /* ..0xd201000000010000 */ +} + +static void post_line_by_Px2(vec384fp6 out, const vec384fp6 in, + const POINTonE1_affine *Px2) +{ + vec_copy(out[0], in[0], sizeof(out[0])); + + mul_fp(out[1][0], in[1][0], Px2->X); /* "b01" *= -2*P->X */ + mul_fp(out[1][1], in[1][1], Px2->X); + + mul_fp(out[2][0], in[2][0], Px2->Y); /* "b11" *= 2*P->Y */ + mul_fp(out[2][1], in[2][1], Px2->Y); +} + +static void post_add_n_dbl(vec384fp12 ret, const vec384fp6 lines[], + const POINTonE1_affine *Px2, size_t n) +{ + vec384fp6 line; + + post_line_by_Px2(line, lines++[0], Px2); + mul_by_xy00z0_fp12(ret, ret, line); + while (n--) { + sqr_fp12(ret, ret); + post_line_by_Px2(line, lines++[0], Px2); + mul_by_xy00z0_fp12(ret, ret, line); + } +} + +static void miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68], + const POINTonE1_affine *P) +{ + POINTonE1_affine Px2[1]; + vec384fp6 line; /* it's not actual fp6, but 3 packed fp2, "xy00z0" */ + + /* Move common expression from line evaluation to line_by_Px2. 
*/ + add_fp(Px2->X, P->X, P->X); + neg_fp(Px2->X, Px2->X); + add_fp(Px2->Y, P->Y, P->Y); + + /* first step is ret = 1^2*line, which is replaced with ret = line */ + post_line_by_Px2(line, Qlines[0], Px2); /* 0x2 */ + vec_zero(ret, sizeof(vec384fp12)); + vec_copy(ret[0][0], line[0], 2*sizeof(vec384fp2)); + vec_copy(ret[1][1], line[2], sizeof(vec384fp2)); + post_add_n_dbl(ret, &Qlines[1], Px2, 2); /* ..0xc */ + post_add_n_dbl(ret, &Qlines[4], Px2, 3); /* ..0x68 */ + post_add_n_dbl(ret, &Qlines[8], Px2, 9); /* ..0xd200 */ + post_add_n_dbl(ret, &Qlines[18], Px2, 32); /* ..0xd20100000000 */ + post_add_n_dbl(ret, &Qlines[51], Px2, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +#ifdef INTERNAL_TESTMODE +static void miller_loop_alt(vec384fp12 ret, const POINTonE2_affine *Q, + const POINTonE1_affine *P) +{ + vec384fp6 lines[68]; + + precompute_lines(lines, Q); + miller_loop_lines(ret, lines, P); +} +#endif + +static void mul_n_sqr(vec384fp12 ret, const vec384fp12 a, size_t n) +{ + mul_fp12(ret, ret, a); + while (n--) + cyclotomic_sqr_fp12(ret, ret); +} + +static void raise_to_z_div_by_2(vec384fp12 ret, const vec384fp12 a) +{ + cyclotomic_sqr_fp12(ret, a); /* 0x2 */ + mul_n_sqr(ret, a, 2); /* ..0xc */ + mul_n_sqr(ret, a, 3); /* ..0x68 */ + mul_n_sqr(ret, a, 9); /* ..0xd200 */ + mul_n_sqr(ret, a, 32); /* ..0xd20100000000 */ + mul_n_sqr(ret, a, 16-1); /* ..0x6900800000008000 */ + conjugate_fp12(ret); /* account for z being negative */ +} + +#define raise_to_z(a, b) (raise_to_z_div_by_2(a, b), cyclotomic_sqr_fp12(a, a)) + +/* + * Adaptation from /pairing/src/bls12_381/mod.rs + */ +static void final_exp(vec384fp12 ret, const vec384fp12 f) +{ + vec384fp12 y0, y1, y2, y3; + + vec_copy(y1, f, sizeof(y1)); + conjugate_fp12(y1); + inverse_fp12(y2, f); + mul_fp12(ret, y1, y2); + frobenius_map_fp12(y2, ret, 2); + mul_fp12(ret, ret, y2); + + cyclotomic_sqr_fp12(y0, ret); + raise_to_z(y1, y0); + raise_to_z_div_by_2(y2, y1); + vec_copy(y3, ret, sizeof(y3)); + conjugate_fp12(y3); + mul_fp12(y1, y1, y3); + conjugate_fp12(y1); + mul_fp12(y1, y1, y2); + raise_to_z(y2, y1); + raise_to_z(y3, y2); + conjugate_fp12(y1); + mul_fp12(y3, y3, y1); + conjugate_fp12(y1); + frobenius_map_fp12(y1, y1, 3); + frobenius_map_fp12(y2, y2, 2); + mul_fp12(y1, y1, y2); + raise_to_z(y2, y3); + mul_fp12(y2, y2, y0); + mul_fp12(y2, y2, ret); + mul_fp12(y1, y1, y2); + frobenius_map_fp12(y2, y3, 1); + mul_fp12(ret, y1, y2); +} + +void blst_miller_loop(vec384fp12 ret, const POINTonE2_affine *Q, + const POINTonE1_affine *P) +{ miller_loop_n(ret, Q ? Q : (const POINTonE2_affine *)&BLS12_381_G2, + P ? P : (const POINTonE1_affine *)&BLS12_381_G1, 1); +} + +#ifndef MILLER_LOOP_N_MAX +# define MILLER_LOOP_N_MAX 16 +#endif + +void blst_miller_loop_n(vec384fp12 out, const POINTonE2_affine *const Qs[], + const POINTonE1_affine *const Ps[], + size_t n) +{ /* ~10KB of stack storage */ + POINTonE2 T[MILLER_LOOP_N_MAX]; + POINTonE2_affine Q[MILLER_LOOP_N_MAX]; + POINTonE1_affine Px2[MILLER_LOOP_N_MAX]; + const POINTonE2_affine *Qptr = NULL; + const POINTonE1_affine *Pptr = NULL; + size_t i, j; + + for (i = 0, j = 0; j < n; j++) { + Qptr = *Qs ? *Qs++ : Qptr+1; + Pptr = *Ps ? *Ps++ : Pptr+1; + + /* Move common expression from line evaluation to line_by_Px2. 
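
The double-and-add schedule that keeps reappearing above (0x2, ..0xc, ..0x68, ..0xd200, ..0xd20100000000, ..0xd201000000010000) in POINTonE2_times_minus_z, add_n_dbl_n and raise_to_z_div_by_2 is simply an addition chain for the absolute value of the BLS12-381 parameter z; z itself is negative, hence the conjugate/cneg steps. A standalone integer check of that chain (the helper name is an assumption):

#include <stdint.h>
#include <stdio.h>
#include <assert.h>

/* integer analogue of one add_n_dbl step: add the base once, then double n times */
static uint64_t add_n_dbl(uint64_t acc, uint64_t in, unsigned n)
{
    acc += in;
    while (n--)
        acc *= 2;
    return acc;
}

int main(void)
{
    uint64_t in = 1, acc = in * 2;        /* 0x2                    */
    acc = add_n_dbl(acc, in, 2);          /* ..0xc                  */
    acc = add_n_dbl(acc, in, 3);          /* ..0x68                 */
    acc = add_n_dbl(acc, in, 9);          /* ..0xd200               */
    acc = add_n_dbl(acc, in, 32);         /* ..0xd20100000000       */
    acc = add_n_dbl(acc, in, 16);         /* ..0xd201000000010000   */
    assert(acc == 0xd201000000010000ULL); /* |z| for BLS12-381      */
    printf("0x%llx\n", (unsigned long long)acc);
    return 0;
}
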
*/ + add_fp(Px2[i].X, Pptr->X, Pptr->X); + neg_fp(Px2[i].X, Px2[i].X); + add_fp(Px2[i].Y, Pptr->Y, Pptr->Y); + + vec_copy(Q[i].X, Qptr->X, 2*sizeof(Q[i].X)); + vec_copy(T[i].X, Qptr->X, 2*sizeof(T[i].X)); + vec_copy(T[i].Z, BLS12_381_Rx.p2, sizeof(T[i].Z)); + + if (++i == MILLER_LOOP_N_MAX || j == n-1) { + vec384fp12 tmp; + vec384fp6 *ret = j < MILLER_LOOP_N_MAX ? out : tmp; + + /* first step is ret = 1^2*line, which is just ret = line */ + start_dbl_n(ret, T, Px2, i); /* 0x2 */ + add_n_dbl_n(ret, T, Q, Px2, i, 2); /* ..0xc */ + add_n_dbl_n(ret, T, Q, Px2, i, 3); /* ..0x68 */ + add_n_dbl_n(ret, T, Q, Px2, i, 9); /* ..0xd200 */ + add_n_dbl_n(ret, T, Q, Px2, i, 32); /* ..0xd20100000000 */ + add_n_dbl_n(ret, T, Q, Px2, i, 16); /* ..0xd201000000010000 */ + conjugate_fp12(ret); /* account for z being negative */ + + if (j >= MILLER_LOOP_N_MAX) + mul_fp12(out, out, ret); + + i = 0; + } + } +} + +void blst_final_exp(vec384fp12 ret, const vec384fp12 f) +{ final_exp(ret, f); } + +void blst_precompute_lines(vec384fp6 Qlines[68], const POINTonE2_affine *Q) +{ precompute_lines(Qlines, Q); } + +void blst_miller_loop_lines(vec384fp12 ret, const vec384fp6 Qlines[68], + const POINTonE1_affine *P) +{ miller_loop_lines(ret, Qlines, P); } + +static bool_t is_cyclotomic(const vec384fp12 f) +{ + vec384fp12 a, b; + + frobenius_map_fp12(a, f, 2); + frobenius_map_fp12(b, a, 2); + mul_fp12(b, b, f); + + return vec_is_equal(a, b, sizeof(a)); +} + +int blst_fp12_in_group(const vec384fp12 f) +{ + vec384fp12 a, b; + + if (vec_is_zero(f, sizeof(vec384fp12)) || !is_cyclotomic(f)) + return 0; + + frobenius_map_fp12(a, f, 1); + raise_to_z(b, f); + + return (int)vec_is_equal(a, b, sizeof(a)); +} diff --git a/crypto/blst_src/pentaroot-addchain.h b/crypto/blst_src/pentaroot-addchain.h new file mode 100644 index 00000000000..5bdd9ddf7f7 --- /dev/null +++ b/crypto/blst_src/pentaroot-addchain.h @@ -0,0 +1,333 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is 1/5 modulo BLS12_381_r-1. Exponentiation to which + * yields 5th root of the base. 
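+ *
+ * To see why this yields a 5th root: with k equal to the exponent above
+ * (0x2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332cccccccd, the
+ * value reached at the last step of the chain below), 5*k = 2*(BLS12_381_r - 1) + 1,
+ * so for any non-zero x in Fr, (x^k)^5 = x^(2*(r-1)) * x = x by Fermat's
+ * little theorem. blst_fr_pentapow() in pentaroot.c computes the inverse
+ * map x -> x^5.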
+ * + * Generated with 'addchain 20974350070050476191779096203274386335076221000211055129041463479975432473805' + * https://github.com/kwantam/addchain + * # Bos-Coster (win=4) : 307 (15) + * # Bos-Coster (win=10) : 307 (18) + * # Yacobi : 319 (16) + * # Bos-Coster (win=2) : 319 ( 5) + * # Bos-Coster (win=5) : 306 (19) <<< + * # Bos-Coster (win=7) : 311 (22) + * # Bos-Coster (win=9) : 313 (20) + * # Bos-Coster (win=3) : 314 ( 9) + * # Bos-Coster (win=6) : 309 (21) + * # Bos-Coster (win=8) : 309 (23) + * # Bergeron-Berstel-Brlek-Duboc : 334 ( 5) + */ + +#define PENTAROOT_MOD_BLS12_381_r(out, inp, ptype) do { \ +ptype t[19]; \ +vec_copy(t[1], inp, sizeof(ptype)); /* 0: 1 */\ +sqr(t[7], t[1]); /* 1: 2 */\ +sqr(t[0], t[7]); /* 2: 4 */\ +sqr(t[2], t[0]); /* 3: 8 */\ +mul(t[10], t[2], t[1]); /* 4: 9 */\ +mul(t[3], t[10], t[7]); /* 5: b */\ +mul(t[1], t[10], t[0]); /* 6: d */\ +mul(t[5], t[3], t[0]); /* 7: f */\ +mul(t[9], t[10], t[2]); /* 8: 11 */\ +mul(t[4], t[3], t[2]); /* 9: 13 */\ +mul(t[15], t[5], t[2]); /* 10: 17 */\ +mul(t[8], t[15], t[2]); /* 11: 1f */\ +mul(t[13], t[8], t[7]); /* 12: 21 */\ +mul(t[14], t[8], t[0]); /* 13: 23 */\ +mul(t[12], t[13], t[0]); /* 14: 25 */\ +mul(t[6], t[8], t[2]); /* 15: 27 */\ +mul(t[11], t[14], t[2]); /* 16: 2b */\ +sqr(t[0], t[15]); /* 17: 2e */\ +mul(t[18], t[6], t[2]); /* 18: 2f */\ +mul(t[2], t[11], t[2]); /* 19: 33 */\ +mul(t[16], t[2], t[7]); /* 20: 35 */\ +mul(t[7], t[0], t[3]); /* 21: 39 */\ +mul(t[17], t[0], t[5]); /* 22: 3d */\ +/* sqr(t[0], t[0]); */ /* 23: 5c */\ +/* sqr(t[0], t[0]); */ /* 24: b8 */\ +/* sqr(t[0], t[0]); */ /* 25: 170 */\ +/* sqr(t[0], t[0]); */ /* 26: 2e0 */\ +/* sqr(t[0], t[0]); */ /* 27: 5c0 */\ +/* sqr(t[0], t[0]); */ /* 28: b80 */\ +/* sqr(t[0], t[0]); */ /* 29: 1700 */\ +sqr_n_mul(t[0], t[0], 7, t[18]); /* 30: 172f */\ +/* sqr(t[0], t[0]); */ /* 31: 2e5e */\ +/* sqr(t[0], t[0]); */ /* 32: 5cbc */\ +/* sqr(t[0], t[0]); */ /* 33: b978 */\ +/* sqr(t[0], t[0]); */ /* 34: 172f0 */\ +/* sqr(t[0], t[0]); */ /* 35: 2e5e0 */\ +/* sqr(t[0], t[0]); */ /* 36: 5cbc0 */\ +sqr_n_mul(t[0], t[0], 6, t[13]); /* 37: 5cbe1 */\ +/* sqr(t[0], t[0]); */ /* 38: b97c2 */\ +/* sqr(t[0], t[0]); */ /* 39: 172f84 */\ +/* sqr(t[0], t[0]); */ /* 40: 2e5f08 */\ +/* sqr(t[0], t[0]); */ /* 41: 5cbe10 */\ +/* sqr(t[0], t[0]); */ /* 42: b97c20 */\ +/* sqr(t[0], t[0]); */ /* 43: 172f840 */\ +sqr_n_mul(t[0], t[0], 6, t[17]); /* 44: 172f87d */\ +/* sqr(t[0], t[0]); */ /* 45: 2e5f0fa */\ +/* sqr(t[0], t[0]); */ /* 46: 5cbe1f4 */\ +/* sqr(t[0], t[0]); */ /* 47: b97c3e8 */\ +/* sqr(t[0], t[0]); */ /* 48: 172f87d0 */\ +/* sqr(t[0], t[0]); */ /* 49: 2e5f0fa0 */\ +/* sqr(t[0], t[0]); */ /* 50: 5cbe1f40 */\ +sqr_n_mul(t[0], t[0], 6, t[16]); /* 51: 5cbe1f75 */\ +/* sqr(t[0], t[0]); */ /* 52: b97c3eea */\ +/* sqr(t[0], t[0]); */ /* 53: 172f87dd4 */\ +/* sqr(t[0], t[0]); */ /* 54: 2e5f0fba8 */\ +/* sqr(t[0], t[0]); */ /* 55: 5cbe1f750 */\ +/* sqr(t[0], t[0]); */ /* 56: b97c3eea0 */\ +sqr_n_mul(t[0], t[0], 5, t[15]); /* 57: b97c3eeb7 */\ +/* sqr(t[0], t[0]); */ /* 58: 172f87dd6e */\ +/* sqr(t[0], t[0]); */ /* 59: 2e5f0fbadc */\ +/* sqr(t[0], t[0]); */ /* 60: 5cbe1f75b8 */\ +/* sqr(t[0], t[0]); */ /* 61: b97c3eeb70 */\ +/* sqr(t[0], t[0]); */ /* 62: 172f87dd6e0 */\ +/* sqr(t[0], t[0]); */ /* 63: 2e5f0fbadc0 */\ +sqr_n_mul(t[0], t[0], 6, t[15]); /* 64: 2e5f0fbadd7 */\ +/* sqr(t[0], t[0]); */ /* 65: 5cbe1f75bae */\ +/* sqr(t[0], t[0]); */ /* 66: b97c3eeb75c */\ +/* sqr(t[0], t[0]); */ /* 67: 172f87dd6eb8 */\ +/* sqr(t[0], t[0]); */ /* 68: 2e5f0fbadd70 */\ +/* sqr(t[0], 
t[0]); */ /* 69: 5cbe1f75bae0 */\ +/* sqr(t[0], t[0]); */ /* 70: b97c3eeb75c0 */\ +/* sqr(t[0], t[0]); */ /* 71: 172f87dd6eb80 */\ +/* sqr(t[0], t[0]); */ /* 72: 2e5f0fbadd700 */\ +sqr_n_mul(t[0], t[0], 8, t[14]); /* 73: 2e5f0fbadd723 */\ +/* sqr(t[0], t[0]); */ /* 74: 5cbe1f75bae46 */\ +/* sqr(t[0], t[0]); */ /* 75: b97c3eeb75c8c */\ +/* sqr(t[0], t[0]); */ /* 76: 172f87dd6eb918 */\ +/* sqr(t[0], t[0]); */ /* 77: 2e5f0fbadd7230 */\ +/* sqr(t[0], t[0]); */ /* 78: 5cbe1f75bae460 */\ +/* sqr(t[0], t[0]); */ /* 79: b97c3eeb75c8c0 */\ +/* sqr(t[0], t[0]); */ /* 80: 172f87dd6eb9180 */\ +/* sqr(t[0], t[0]); */ /* 81: 2e5f0fbadd72300 */\ +sqr_n_mul(t[0], t[0], 8, t[13]); /* 82: 2e5f0fbadd72321 */\ +/* sqr(t[0], t[0]); */ /* 83: 5cbe1f75bae4642 */\ +/* sqr(t[0], t[0]); */ /* 84: b97c3eeb75c8c84 */\ +/* sqr(t[0], t[0]); */ /* 85: 172f87dd6eb91908 */\ +/* sqr(t[0], t[0]); */ /* 86: 2e5f0fbadd723210 */\ +/* sqr(t[0], t[0]); */ /* 87: 5cbe1f75bae46420 */\ +/* sqr(t[0], t[0]); */ /* 88: b97c3eeb75c8c840 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 89: b97c3eeb75c8c873 */\ +/* sqr(t[0], t[0]); */ /* 90: 172f87dd6eb9190e6 */\ +/* sqr(t[0], t[0]); */ /* 91: 2e5f0fbadd72321cc */\ +/* sqr(t[0], t[0]); */ /* 92: 5cbe1f75bae464398 */\ +/* sqr(t[0], t[0]); */ /* 93: b97c3eeb75c8c8730 */\ +/* sqr(t[0], t[0]); */ /* 94: 172f87dd6eb9190e60 */\ +/* sqr(t[0], t[0]); */ /* 95: 2e5f0fbadd72321cc0 */\ +sqr_n_mul(t[0], t[0], 6, t[13]); /* 96: 2e5f0fbadd72321ce1 */\ +/* sqr(t[0], t[0]); */ /* 97: 5cbe1f75bae46439c2 */\ +/* sqr(t[0], t[0]); */ /* 98: b97c3eeb75c8c87384 */\ +/* sqr(t[0], t[0]); */ /* 99: 172f87dd6eb9190e708 */\ +/* sqr(t[0], t[0]); */ /* 100: 2e5f0fbadd72321ce10 */\ +/* sqr(t[0], t[0]); */ /* 101: 5cbe1f75bae46439c20 */\ +/* sqr(t[0], t[0]); */ /* 102: b97c3eeb75c8c873840 */\ +/* sqr(t[0], t[0]); */ /* 103: 172f87dd6eb9190e7080 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 104: 172f87dd6eb9190e70a5 */\ +/* sqr(t[0], t[0]); */ /* 105: 2e5f0fbadd72321ce14a */\ +/* sqr(t[0], t[0]); */ /* 106: 5cbe1f75bae46439c294 */\ +/* sqr(t[0], t[0]); */ /* 107: b97c3eeb75c8c8738528 */\ +/* sqr(t[0], t[0]); */ /* 108: 172f87dd6eb9190e70a50 */\ +/* sqr(t[0], t[0]); */ /* 109: 2e5f0fbadd72321ce14a0 */\ +/* sqr(t[0], t[0]); */ /* 110: 5cbe1f75bae46439c2940 */\ +/* sqr(t[0], t[0]); */ /* 111: b97c3eeb75c8c87385280 */\ +/* sqr(t[0], t[0]); */ /* 112: 172f87dd6eb9190e70a500 */\ +sqr_n_mul(t[0], t[0], 8, t[11]); /* 113: 172f87dd6eb9190e70a52b */\ +/* sqr(t[0], t[0]); */ /* 114: 2e5f0fbadd72321ce14a56 */\ +/* sqr(t[0], t[0]); */ /* 115: 5cbe1f75bae46439c294ac */\ +/* sqr(t[0], t[0]); */ /* 116: b97c3eeb75c8c873852958 */\ +/* sqr(t[0], t[0]); */ /* 117: 172f87dd6eb9190e70a52b0 */\ +/* sqr(t[0], t[0]); */ /* 118: 2e5f0fbadd72321ce14a560 */\ +/* sqr(t[0], t[0]); */ /* 119: 5cbe1f75bae46439c294ac0 */\ +sqr_n_mul(t[0], t[0], 6, t[1]); /* 120: 5cbe1f75bae46439c294acd */\ +/* sqr(t[0], t[0]); */ /* 121: b97c3eeb75c8c873852959a */\ +/* sqr(t[0], t[0]); */ /* 122: 172f87dd6eb9190e70a52b34 */\ +/* sqr(t[0], t[0]); */ /* 123: 2e5f0fbadd72321ce14a5668 */\ +/* sqr(t[0], t[0]); */ /* 124: 5cbe1f75bae46439c294acd0 */\ +/* sqr(t[0], t[0]); */ /* 125: b97c3eeb75c8c873852959a0 */\ +/* sqr(t[0], t[0]); */ /* 126: 172f87dd6eb9190e70a52b340 */\ +/* sqr(t[0], t[0]); */ /* 127: 2e5f0fbadd72321ce14a56680 */\ +/* sqr(t[0], t[0]); */ /* 128: 5cbe1f75bae46439c294acd00 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 129: 5cbe1f75bae46439c294acd33 */\ +/* sqr(t[0], t[0]); */ /* 130: b97c3eeb75c8c873852959a66 */\ +/* sqr(t[0], t[0]); */ /* 131: 172f87dd6eb9190e70a52b34cc */\ 
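+/* Reading aid: each commented-out sqr() line above and below records the    */\
+/* running exponent in hex; sqr_n_mul(t[0], t[0], n, t[k]) then performs     */\
+/* those n squarings followed by one multiplication by the small power t[k]. */\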
+/* sqr(t[0], t[0]); */ /* 132: 2e5f0fbadd72321ce14a566998 */\ +/* sqr(t[0], t[0]); */ /* 133: 5cbe1f75bae46439c294acd330 */\ +/* sqr(t[0], t[0]); */ /* 134: b97c3eeb75c8c873852959a660 */\ +/* sqr(t[0], t[0]); */ /* 135: 172f87dd6eb9190e70a52b34cc0 */\ +sqr_n_mul(t[0], t[0], 6, t[11]); /* 136: 172f87dd6eb9190e70a52b34ceb */\ +/* sqr(t[0], t[0]); */ /* 137: 2e5f0fbadd72321ce14a56699d6 */\ +/* sqr(t[0], t[0]); */ /* 138: 5cbe1f75bae46439c294acd33ac */\ +/* sqr(t[0], t[0]); */ /* 139: b97c3eeb75c8c873852959a6758 */\ +/* sqr(t[0], t[0]); */ /* 140: 172f87dd6eb9190e70a52b34ceb0 */\ +sqr_n_mul(t[0], t[0], 4, t[10]); /* 141: 172f87dd6eb9190e70a52b34ceb9 */\ +/* sqr(t[0], t[0]); */ /* 142: 2e5f0fbadd72321ce14a56699d72 */\ +/* sqr(t[0], t[0]); */ /* 143: 5cbe1f75bae46439c294acd33ae4 */\ +/* sqr(t[0], t[0]); */ /* 144: b97c3eeb75c8c873852959a675c8 */\ +/* sqr(t[0], t[0]); */ /* 145: 172f87dd6eb9190e70a52b34ceb90 */\ +/* sqr(t[0], t[0]); */ /* 146: 2e5f0fbadd72321ce14a56699d720 */\ +sqr_n_mul(t[0], t[0], 5, t[8]); /* 147: 2e5f0fbadd72321ce14a56699d73f */\ +/* sqr(t[0], t[0]); */ /* 148: 5cbe1f75bae46439c294acd33ae7e */\ +/* sqr(t[0], t[0]); */ /* 149: b97c3eeb75c8c873852959a675cfc */\ +/* sqr(t[0], t[0]); */ /* 150: 172f87dd6eb9190e70a52b34ceb9f8 */\ +/* sqr(t[0], t[0]); */ /* 151: 2e5f0fbadd72321ce14a56699d73f0 */\ +/* sqr(t[0], t[0]); */ /* 152: 5cbe1f75bae46439c294acd33ae7e0 */\ +/* sqr(t[0], t[0]); */ /* 153: b97c3eeb75c8c873852959a675cfc0 */\ +/* sqr(t[0], t[0]); */ /* 154: 172f87dd6eb9190e70a52b34ceb9f80 */\ +/* sqr(t[0], t[0]); */ /* 155: 2e5f0fbadd72321ce14a56699d73f00 */\ +/* sqr(t[0], t[0]); */ /* 156: 5cbe1f75bae46439c294acd33ae7e00 */\ +/* sqr(t[0], t[0]); */ /* 157: b97c3eeb75c8c873852959a675cfc00 */\ +/* sqr(t[0], t[0]); */ /* 158: 172f87dd6eb9190e70a52b34ceb9f800 */\ +/* sqr(t[0], t[0]); */ /* 159: 2e5f0fbadd72321ce14a56699d73f000 */\ +/* sqr(t[0], t[0]); */ /* 160: 5cbe1f75bae46439c294acd33ae7e000 */\ +/* sqr(t[0], t[0]); */ /* 161: b97c3eeb75c8c873852959a675cfc000 */\ +/* sqr(t[0], t[0]); */ /* 162: 172f87dd6eb9190e70a52b34ceb9f8000 */\ +sqr_n_mul(t[0], t[0], 15, t[9]); /* 163: 172f87dd6eb9190e70a52b34ceb9f8011 */\ +/* sqr(t[0], t[0]); */ /* 164: 2e5f0fbadd72321ce14a56699d73f0022 */\ +/* sqr(t[0], t[0]); */ /* 165: 5cbe1f75bae46439c294acd33ae7e0044 */\ +/* sqr(t[0], t[0]); */ /* 166: b97c3eeb75c8c873852959a675cfc0088 */\ +/* sqr(t[0], t[0]); */ /* 167: 172f87dd6eb9190e70a52b34ceb9f80110 */\ +/* sqr(t[0], t[0]); */ /* 168: 2e5f0fbadd72321ce14a56699d73f00220 */\ +/* sqr(t[0], t[0]); */ /* 169: 5cbe1f75bae46439c294acd33ae7e00440 */\ +/* sqr(t[0], t[0]); */ /* 170: b97c3eeb75c8c873852959a675cfc00880 */\ +/* sqr(t[0], t[0]); */ /* 171: 172f87dd6eb9190e70a52b34ceb9f801100 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 172: 172f87dd6eb9190e70a52b34ceb9f80110b */\ +/* sqr(t[0], t[0]); */ /* 173: 2e5f0fbadd72321ce14a56699d73f002216 */\ +/* sqr(t[0], t[0]); */ /* 174: 5cbe1f75bae46439c294acd33ae7e00442c */\ +/* sqr(t[0], t[0]); */ /* 175: b97c3eeb75c8c873852959a675cfc008858 */\ +/* sqr(t[0], t[0]); */ /* 176: 172f87dd6eb9190e70a52b34ceb9f80110b0 */\ +/* sqr(t[0], t[0]); */ /* 177: 2e5f0fbadd72321ce14a56699d73f0022160 */\ +sqr_n_mul(t[0], t[0], 5, t[8]); /* 178: 2e5f0fbadd72321ce14a56699d73f002217f */\ +/* sqr(t[0], t[0]); */ /* 179: 5cbe1f75bae46439c294acd33ae7e00442fe */\ +/* sqr(t[0], t[0]); */ /* 180: b97c3eeb75c8c873852959a675cfc00885fc */\ +/* sqr(t[0], t[0]); */ /* 181: 172f87dd6eb9190e70a52b34ceb9f80110bf8 */\ +/* sqr(t[0], t[0]); */ /* 182: 2e5f0fbadd72321ce14a56699d73f002217f0 */\ +/* 
sqr(t[0], t[0]); */ /* 183: 5cbe1f75bae46439c294acd33ae7e00442fe0 */\ +/* sqr(t[0], t[0]); */ /* 184: b97c3eeb75c8c873852959a675cfc00885fc0 */\ +/* sqr(t[0], t[0]); */ /* 185: 172f87dd6eb9190e70a52b34ceb9f80110bf80 */\ +/* sqr(t[0], t[0]); */ /* 186: 2e5f0fbadd72321ce14a56699d73f002217f00 */\ +/* sqr(t[0], t[0]); */ /* 187: 5cbe1f75bae46439c294acd33ae7e00442fe00 */\ +/* sqr(t[0], t[0]); */ /* 188: b97c3eeb75c8c873852959a675cfc00885fc00 */\ +sqr_n_mul(t[0], t[0], 10, t[7]); /* 189: b97c3eeb75c8c873852959a675cfc00885fc39 */\ +/* sqr(t[0], t[0]); */ /* 190: 172f87dd6eb9190e70a52b34ceb9f80110bf872 */\ +/* sqr(t[0], t[0]); */ /* 191: 2e5f0fbadd72321ce14a56699d73f002217f0e4 */\ +/* sqr(t[0], t[0]); */ /* 192: 5cbe1f75bae46439c294acd33ae7e00442fe1c8 */\ +/* sqr(t[0], t[0]); */ /* 193: b97c3eeb75c8c873852959a675cfc00885fc390 */\ +/* sqr(t[0], t[0]); */ /* 194: 172f87dd6eb9190e70a52b34ceb9f80110bf8720 */\ +/* sqr(t[0], t[0]); */ /* 195: 2e5f0fbadd72321ce14a56699d73f002217f0e40 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 196: 2e5f0fbadd72321ce14a56699d73f002217f0e67 */\ +/* sqr(t[0], t[0]); */ /* 197: 5cbe1f75bae46439c294acd33ae7e00442fe1cce */\ +/* sqr(t[0], t[0]); */ /* 198: b97c3eeb75c8c873852959a675cfc00885fc399c */\ +/* sqr(t[0], t[0]); */ /* 199: 172f87dd6eb9190e70a52b34ceb9f80110bf87338 */\ +/* sqr(t[0], t[0]); */ /* 200: 2e5f0fbadd72321ce14a56699d73f002217f0e670 */\ +/* sqr(t[0], t[0]); */ /* 201: 5cbe1f75bae46439c294acd33ae7e00442fe1cce0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 202: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3 */\ +/* sqr(t[0], t[0]); */ /* 203: b97c3eeb75c8c873852959a675cfc00885fc399e6 */\ +/* sqr(t[0], t[0]); */ /* 204: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cc */\ +/* sqr(t[0], t[0]); */ /* 205: 2e5f0fbadd72321ce14a56699d73f002217f0e6798 */\ +/* sqr(t[0], t[0]); */ /* 206: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf30 */\ +/* sqr(t[0], t[0]); */ /* 207: b97c3eeb75c8c873852959a675cfc00885fc399e60 */\ +/* sqr(t[0], t[0]); */ /* 208: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cc0 */\ +/* sqr(t[0], t[0]); */ /* 209: 2e5f0fbadd72321ce14a56699d73f002217f0e67980 */\ +/* sqr(t[0], t[0]); */ /* 210: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 211: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf333 */\ +/* sqr(t[0], t[0]); */ /* 212: b97c3eeb75c8c873852959a675cfc00885fc399e666 */\ +/* sqr(t[0], t[0]); */ /* 213: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc */\ +/* sqr(t[0], t[0]); */ /* 214: 2e5f0fbadd72321ce14a56699d73f002217f0e679998 */\ +/* sqr(t[0], t[0]); */ /* 215: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3330 */\ +/* sqr(t[0], t[0]); */ /* 216: b97c3eeb75c8c873852959a675cfc00885fc399e6660 */\ +/* sqr(t[0], t[0]); */ /* 217: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc0 */\ +/* sqr(t[0], t[0]); */ /* 218: 2e5f0fbadd72321ce14a56699d73f002217f0e6799980 */\ +sqr_n_mul(t[0], t[0], 7, t[5]); /* 219: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f */\ +/* sqr(t[0], t[0]); */ /* 220: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e */\ +/* sqr(t[0], t[0]); */ /* 221: b97c3eeb75c8c873852959a675cfc00885fc399e6663c */\ +/* sqr(t[0], t[0]); */ /* 222: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78 */\ +/* sqr(t[0], t[0]); */ /* 223: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f0 */\ +/* sqr(t[0], t[0]); */ /* 224: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e0 */\ +/* sqr(t[0], t[0]); */ /* 225: b97c3eeb75c8c873852959a675cfc00885fc399e6663c0 */\ +/* sqr(t[0], t[0]); */ /* 226: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc780 */\ +/* sqr(t[0], t[0]); */ /* 227: 
2e5f0fbadd72321ce14a56699d73f002217f0e679998f00 */\ +/* sqr(t[0], t[0]); */ /* 228: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e00 */\ +sqr_n_mul(t[0], t[0], 9, t[2]); /* 229: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33 */\ +/* sqr(t[0], t[0]); */ /* 230: b97c3eeb75c8c873852959a675cfc00885fc399e6663c66 */\ +/* sqr(t[0], t[0]); */ /* 231: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc */\ +/* sqr(t[0], t[0]); */ /* 232: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f198 */\ +/* sqr(t[0], t[0]); */ /* 233: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e330 */\ +/* sqr(t[0], t[0]); */ /* 234: b97c3eeb75c8c873852959a675cfc00885fc399e6663c660 */\ +/* sqr(t[0], t[0]); */ /* 235: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc0 */\ +/* sqr(t[0], t[0]); */ /* 236: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1980 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 237: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993 */\ +/* sqr(t[0], t[0]); */ /* 238: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326 */\ +/* sqr(t[0], t[0]); */ /* 239: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664c */\ +/* sqr(t[0], t[0]); */ /* 240: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc98 */\ +/* sqr(t[0], t[0]); */ /* 241: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19930 */\ +/* sqr(t[0], t[0]); */ /* 242: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33260 */\ +/* sqr(t[0], t[0]); */ /* 243: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664c0 */\ +/* sqr(t[0], t[0]); */ /* 244: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc980 */\ +/* sqr(t[0], t[0]); */ /* 245: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 246: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199333 */\ +/* sqr(t[0], t[0]); */ /* 247: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666 */\ +/* sqr(t[0], t[0]); */ /* 248: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccc */\ +/* sqr(t[0], t[0]); */ /* 249: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9998 */\ +/* sqr(t[0], t[0]); */ /* 250: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993330 */\ +/* sqr(t[0], t[0]); */ /* 251: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326660 */\ +/* sqr(t[0], t[0]); */ /* 252: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccc0 */\ +/* sqr(t[0], t[0]); */ /* 253: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99980 */\ +/* sqr(t[0], t[0]); */ /* 254: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 255: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333 */\ +/* sqr(t[0], t[0]); */ /* 256: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666 */\ +/* sqr(t[0], t[0]); */ /* 257: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccc */\ +/* sqr(t[0], t[0]); */ /* 258: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999998 */\ +/* sqr(t[0], t[0]); */ /* 259: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f199333330 */\ +/* sqr(t[0], t[0]); */ /* 260: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666660 */\ +/* sqr(t[0], t[0]); */ /* 261: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccc0 */\ +/* sqr(t[0], t[0]); */ /* 262: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999980 */\ +/* sqr(t[0], t[0]); */ /* 263: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993333300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 264: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f1993333333 */\ +/* sqr(t[0], t[0]); */ /* 265: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666 */\ +/* sqr(t[0], t[0]); */ /* 266: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccccc */\ +/* sqr(t[0], 
t[0]); */ /* 267: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999998 */\ +/* sqr(t[0], t[0]); */ /* 268: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333330 */\ +/* sqr(t[0], t[0]); */ /* 269: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666660 */\ +/* sqr(t[0], t[0]); */ /* 270: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664ccccccc0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 271: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb */\ +/* sqr(t[0], t[0]); */ /* 272: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996 */\ +/* sqr(t[0], t[0]); */ /* 273: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332c */\ +/* sqr(t[0], t[0]); */ /* 274: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666658 */\ +/* sqr(t[0], t[0]); */ /* 275: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb0 */\ +/* sqr(t[0], t[0]); */ /* 276: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999960 */\ +/* sqr(t[0], t[0]); */ /* 277: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332c0 */\ +/* sqr(t[0], t[0]); */ /* 278: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666580 */\ +/* sqr(t[0], t[0]); */ /* 279: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb00 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 280: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb33 */\ +/* sqr(t[0], t[0]); */ /* 281: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999999666 */\ +/* sqr(t[0], t[0]); */ /* 282: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccc */\ +/* sqr(t[0], t[0]); */ /* 283: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666665998 */\ +/* sqr(t[0], t[0]); */ /* 284: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb330 */\ +/* sqr(t[0], t[0]); */ /* 285: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996660 */\ +/* sqr(t[0], t[0]); */ /* 286: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccc0 */\ +/* sqr(t[0], t[0]); */ /* 287: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666659980 */\ +/* sqr(t[0], t[0]); */ /* 288: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 289: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3333 */\ +/* sqr(t[0], t[0]); */ /* 290: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999966666 */\ +/* sqr(t[0], t[0]); */ /* 291: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccc */\ +/* sqr(t[0], t[0]); */ /* 292: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e3326666666599998 */\ +/* sqr(t[0], t[0]); */ /* 293: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb33330 */\ +/* sqr(t[0], t[0]); */ /* 294: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc99999999666660 */\ +/* sqr(t[0], t[0]); */ /* 295: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccc0 */\ +/* sqr(t[0], t[0]); */ /* 296: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e33266666665999980 */\ +/* sqr(t[0], t[0]); */ /* 297: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb333300 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 298: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb333333 */\ +/* sqr(t[0], t[0]); */ /* 299: 172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc999999996666666 */\ +/* sqr(t[0], t[0]); */ /* 300: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccccc */\ +/* sqr(t[0], t[0]); */ /* 301: 5cbe1f75bae46439c294acd33ae7e00442fe1ccf3331e332666666659999998 */\ +/* sqr(t[0], t[0]); */ /* 302: b97c3eeb75c8c873852959a675cfc00885fc399e6663c664cccccccb3333330 */\ +/* sqr(t[0], t[0]); */ /* 303: 
172f87dd6eb9190e70a52b34ceb9f80110bf8733cccc78cc9999999966666660 */\ +/* sqr(t[0], t[0]); */ /* 304: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332ccccccc0 */\ +sqr_n_mul(out, t[0], 6, t[1]); /* 305: 2e5f0fbadd72321ce14a56699d73f002217f0e679998f19933333332cccccccd */\ +} while(0) diff --git a/crypto/blst_src/pentaroot.c b/crypto/blst_src/pentaroot.c new file mode 100644 index 00000000000..71f334df50a --- /dev/null +++ b/crypto/blst_src/pentaroot.c @@ -0,0 +1,76 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +static inline void mul_fr(vec256 ret, const vec256 a, const vec256 b) +{ mul_mont_sparse_256(ret, a, b, BLS12_381_r, r0); } + +static inline void sqr_fr(vec256 ret, const vec256 a) +{ sqr_mont_sparse_256(ret, a, BLS12_381_r, r0); } + +#ifdef __OPTIMIZE_SIZE__ +void blst_fr_pentaroot(vec256 out, const vec256 inp) +{ + static const byte pow[] = { + TO_BYTES(0x33333332cccccccd), TO_BYTES(0x217f0e679998f199), + TO_BYTES(0xe14a56699d73f002), TO_BYTES(0x2e5f0fbadd72321c) + }; + size_t pow_bits = 254; + vec256 ret; + + vec_copy(ret, inp, sizeof(ret)); /* ret = inp^1 */ + --pow_bits; /* most significant bit is set, skip over */ + while (pow_bits--) { + sqr_fr(ret, ret); + if (is_bit_set(pow, pow_bits)) + mul_fr(ret, ret, inp); + } + vec_copy(out, ret, sizeof(ret)); /* out = ret */ +} +#else +# if 0 +/* + * "255"-bit variant omits full reductions at the ends of squarings, + * not implemented yet[?]. + */ +static inline void sqr_n_mul_fr(vec256 out, const vec256 a, size_t count, + const vec256 b) +{ sqr_n_mul_mont_255(out, a, count, BLS12_381_r, r0, b); } +# else +static void sqr_n_mul_fr(vec256 out, const vec256 a, size_t count, + const vec256 b) +{ + do { + sqr_fr(out, a); + a = out; + } while (--count); + mul_fr(out, out, b); +} +# endif + +# define sqr(ret,a) sqr_fr(ret,a) +# define mul(ret,a,b) mul_fr(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fr(ret,a,n,b) + +# include "pentaroot-addchain.h" +void blst_fr_pentaroot(vec256 out, const vec256 inp) +{ PENTAROOT_MOD_BLS12_381_r(out, inp, vec256); } +# undef PENTAROOT_MOD_BLS12_381_r + +# undef sqr_n_mul +# undef sqr +# undef mul +#endif + +void blst_fr_pentapow(vec256 out, const vec256 inp) +{ + vec256 tmp; + + sqr_fr(tmp, inp); + sqr_fr(tmp, tmp); + mul_fr(out, tmp, inp); +} diff --git a/crypto/blst_src/point.h b/crypto/blst_src/point.h new file mode 100644 index 00000000000..0aa7379671f --- /dev/null +++ b/crypto/blst_src/point.h @@ -0,0 +1,62 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_POINT_H__ +#define __BLS12_381_ASM_POINT_H__ + +#include "vect.h" +#include "bytes.h" + +#define DECLARE_POINT(ptype, bits) \ +typedef struct { vec##bits X,Y,Z; } ptype; \ +typedef struct { vec##bits X,Y; } ptype##_affine; \ +\ +static void ptype##_dadd(ptype *out, const ptype *p1, const ptype *p2, \ + const vec##bits a4); \ +static void ptype##_dadd_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2); \ +static void ptype##_add(ptype *out, const ptype *p1, const ptype *p2); \ +static void ptype##_add_affine(ptype *out, const ptype *p1, \ + const ptype##_affine *p2); \ +static void ptype##_double(ptype *out, const ptype *p1); \ +static void ptype##_mult_w5(ptype *out, const ptype *point, \ + const byte *scalar, size_t nbits); \ +static void ptype##_cneg(ptype *p, limb_t cbit); \ +static void ptype##_to_affine(ptype##_affine *out, const ptype *in); \ +static void ptype##_from_Jacobian(ptype *out, const ptype *in); \ +\ +static inline void ptype##_cswap(ptype *restrict a, \ + ptype *restrict b, bool_t cbit) { \ + vec_cswap(a, b, sizeof(ptype), cbit); \ +} \ +static inline void ptype##_ccopy(ptype *restrict a, \ + const ptype *restrict b, bool_t cbit) {\ + vec_select(a, b, a, sizeof(ptype), cbit); \ +} + +#define DECLARE_PRIVATE_POINTXZ(ptype, bits) \ +typedef struct { vec##bits X,Z; } ptype##xz; \ +\ +static void ptype##xz_ladder_pre(ptype##xz *out, const ptype *in); \ +static void ptype##xz_ladder_step(ptype##xz *r, ptype##xz *s, \ + const ptype##xz *p); \ +static void ptype##xz_ladder_post(ptype *ret, \ + const ptype##xz *r, const ptype##xz *s, \ + const ptype##xz *p, const vec##bits Y1);\ +\ +static inline void ptype##xz_cswap(ptype##xz *restrict a, \ + ptype##xz *restrict b, bool_t cbit) {\ + vec_cswap(a, b, sizeof(ptype##xz), cbit); \ +} + +DECLARE_POINT(POINTonE1, 384) + +DECLARE_POINT(POINTonE2, 384x) + +#ifdef __GNUC__ +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#endif diff --git a/crypto/blst_src/rb_tree.c b/crypto/blst_src/rb_tree.c new file mode 100644 index 00000000000..207becdad18 --- /dev/null +++ b/crypto/blst_src/rb_tree.c @@ -0,0 +1,145 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include <stddef.h> + +/* + * Red-black tree tailored for uniqueness test. Amount of messages to be + * checked is known prior to context initialization, implementation is + * insert-only, failure is returned if message is already in the tree.
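+ *
+ * Illustrative usage sketch (everything except the blst_uniq_* calls is
+ * hypothetical): size and allocate the context for a known message count n,
+ * then feed each message through blst_uniq_test(), which returns 0 when a
+ * duplicate is seen:
+ *
+ *     struct rb_tree *tree = malloc(blst_uniq_sizeof(n));
+ *     blst_uniq_init(tree);
+ *     for (size_t i = 0; i < n; i++)
+ *         if (!blst_uniq_test(tree, msgs[i], msg_lens[i]))
+ *             return 0;   // duplicate message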
+ */ + +struct node { + struct node *leafs[2]; + const void *data; + size_t len_n_colour; /* len<<1 | colour */ +}; + +struct rb_tree { + struct node *root; + size_t n_nodes; + struct node nodes[1]; +}; + +static long bytes_compare(const unsigned char *ptr0, size_t len0, + const unsigned char *ptr1, size_t len1) +{ + size_t i, len = len0len_n_colour &= ~(size_t)1) +#define PAINT_RED(p) ((p)->len_n_colour |= 1) +#define IS_RED(p) ((p)->len_n_colour & 1) + +static int rb_tree_insert(struct rb_tree *tree, const void *data, size_t len) +{ + struct node *nodes[8*sizeof(void *)]; /* visited nodes */ + unsigned char dirs[8*sizeof(void *)]; /* taken directions */ + size_t k = 0; /* walked distance */ + struct node *p, *y, *z; + + for (p = tree->root; p != NULL; k++) { + long cmp = bytes_compare(data, len, p->data, p->len_n_colour>>1); + + if (cmp == 0) + return 0; /* already in tree, no insertion */ + + /* record the step */ + nodes[k] = p; + p = p->leafs[(dirs[k] = cmp>0)]; + } + + /* allocate new node */ + z = &tree->nodes[tree->n_nodes++]; + z->leafs[0] = z->leafs[1] = NULL; + z->data = data; + z->len_n_colour = len<<1; + PAINT_RED(z); + + /* graft |z| */ + if (k > 0) + nodes[k-1]->leafs[dirs[k-1]] = z; + else + tree->root = z; + + /* re-balance |tree| */ + while (k >= 2 && IS_RED(y = nodes[k-1])) { + size_t ydir = dirs[k-2]; + struct node *x = nodes[k-2], /* |z|'s grandparent */ + *s = x->leafs[ydir^1]; /* |z|'s uncle */ + + if (s != NULL && IS_RED(s)) { + PAINT_RED(x); + PAINT_BLACK(y); + PAINT_BLACK(s); + k -= 2; + } else { + if (dirs[k-1] != ydir) { + /* | | + * x x + * / \ \ + * y s -> z s + * \ / + * z y + * / \ + * ? ? + */ + struct node *t = y; + y = y->leafs[ydir^1]; + t->leafs[ydir^1] = y->leafs[ydir]; + y->leafs[ydir] = t; + } + + /* | | + * x y + * \ / \ + * y s -> z x + * / \ / \ + * z ? ? s + */ + x->leafs[ydir] = y->leafs[ydir^1]; + y->leafs[ydir^1] = x; + + PAINT_RED(x); + PAINT_BLACK(y); + + if (k > 2) + nodes[k-3]->leafs[dirs[k-3]] = y; + else + tree->root = y; + + break; + } + } + + PAINT_BLACK(tree->root); + + return 1; +} + +#undef IS_RED +#undef PAINT_RED +#undef PAINT_BLACK + +size_t blst_uniq_sizeof(size_t n_nodes) +{ return sizeof(struct rb_tree) + sizeof(struct node)*(n_nodes-1); } + +void blst_uniq_init(struct rb_tree *tree) +{ + tree->root = NULL; + tree->n_nodes = 0; +} + +int blst_uniq_test(struct rb_tree *tree, const void *data, size_t len) +{ return (int)rb_tree_insert(tree, data, len); } diff --git a/crypto/blst_src/recip-addchain.h b/crypto/blst_src/recip-addchain.h new file mode 100644 index 00000000000..e4e436a3f09 --- /dev/null +++ b/crypto/blst_src/recip-addchain.h @@ -0,0 +1,489 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is BLS12_381_P-2. Exponentiation to which yields + * reciprocal to input base. 
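+ *
+ * In other words, this is inversion via Fermat's little theorem: BLS12_381_P
+ * is prime, so every non-zero a in Fp satisfies a^(P-1) = 1, hence
+ * a * a^(P-2) = 1 and a^(P-2) is the reciprocal of a. Raising to this fixed
+ * public exponent takes the same sequence of operations for every input,
+ * unlike a textbook extended-Euclidean inversion.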
+ * + * Generated with 'addchain 4002409555221667393417789825735904156556882819939007885332058136124031650490837864442687629129015664037894272559785' + * https://github.com/kwantam/addchain + * + * # Bos-Coster (win=4) : 461 (16) <<< + * # Bos-Coster (win=3) : 464 ( 9) + * # Bos-Coster (win=8) : 469 (35) + * # Bos-Coster (win=5) : 463 (28) + * # Bos-Coster (win=9) : 467 (32) + * # Bos-Coster (win=7) : 462 (27) + * # Yacobi : 481 (31) + * # Bos-Coster (win=10) : 475 (30) + * # Bos-Coster (win=6) : 463 (32) + * # Bos-Coster (win=2) : 489 ( 5) + * # Bergeron-Berstel-Brlek-Duboc : 498 ( 5) + */ + +#define RECIPROCAL_MOD_BLS12_381_P(out, inp, ptype) do { \ +ptype t[16]; \ +vec_copy(t[1], inp, sizeof(ptype)); /* 0: 1 */\ +sqr(t[0], t[1]); /* 1: 2 */\ +mul(t[9], t[0], t[1]); /* 2: 3 */\ +sqr(t[5], t[0]); /* 3: 4 */\ +mul(t[2], t[9], t[0]); /* 4: 5 */\ +mul(t[7], t[5], t[9]); /* 5: 7 */\ +mul(t[10], t[2], t[5]); /* 6: 9 */\ +mul(t[13], t[7], t[5]); /* 7: b */\ +mul(t[4], t[10], t[5]); /* 8: d */\ +mul(t[8], t[13], t[5]); /* 9: f */\ +mul(t[15], t[4], t[5]); /* 10: 11 */\ +mul(t[11], t[8], t[5]); /* 11: 13 */\ +mul(t[3], t[15], t[5]); /* 12: 15 */\ +mul(t[12], t[11], t[5]); /* 13: 17 */\ +sqr(t[0], t[4]); /* 14: 1a */\ +mul(t[14], t[12], t[5]); /* 15: 1b */\ +mul(t[6], t[0], t[9]); /* 16: 1d */\ +mul(t[5], t[0], t[2]); /* 17: 1f */\ +/* sqr(t[0], t[0]); */ /* 18: 34 */\ +/* sqr(t[0], t[0]); */ /* 19: 68 */\ +/* sqr(t[0], t[0]); */ /* 20: d0 */\ +/* sqr(t[0], t[0]); */ /* 21: 1a0 */\ +/* sqr(t[0], t[0]); */ /* 22: 340 */\ +/* sqr(t[0], t[0]); */ /* 23: 680 */\ +/* sqr(t[0], t[0]); */ /* 24: d00 */\ +/* sqr(t[0], t[0]); */ /* 25: 1a00 */\ +/* sqr(t[0], t[0]); */ /* 26: 3400 */\ +/* sqr(t[0], t[0]); */ /* 27: 6800 */\ +/* sqr(t[0], t[0]); */ /* 28: d000 */\ +/* sqr(t[0], t[0]); */ /* 29: 1a000 */\ +sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ +/* sqr(t[0], t[0]); */ /* 31: 34022 */\ +/* sqr(t[0], t[0]); */ /* 32: 68044 */\ +/* sqr(t[0], t[0]); */ /* 33: d0088 */\ +/* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ +/* sqr(t[0], t[0]); */ /* 35: 340220 */\ +/* sqr(t[0], t[0]); */ /* 36: 680440 */\ +/* sqr(t[0], t[0]); */ /* 37: d00880 */\ +sqr_n_mul(t[0], t[0], 7, t[8]); /* 38: d0088f */\ +/* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ +/* sqr(t[0], t[0]); */ /* 40: 340223c */\ +/* sqr(t[0], t[0]); */ /* 41: 6804478 */\ +/* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 43: d0088f5 */\ +/* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ +/* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ +/* sqr(t[0], t[0]); */ /* 46: 680447a8 */\ +/* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ +/* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ +/* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ +sqr_n_mul(t[0], t[0], 6, t[7]); /* 50: 340223d47 */\ +/* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ +/* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ +/* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ +/* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ +/* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ +/* sqr(t[0], t[0]); */ /* 56: d0088f51c0 */\ +/* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 58: 1a0111ea397 */\ +/* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ +/* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ +/* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ +/* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ +/* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 64: 340223d472ff */\ +/* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ +/* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ +sqr_n_mul(t[0], t[0], 2, 
t[9]); /* 67: d0088f51cbff */\ +/* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ +/* sqr(t[0], t[0]); */ /* 69: 340223d472ffc */\ +/* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ +/* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ +/* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ +/* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 74: 340223d472ffcd */\ +/* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ +/* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ +/* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ +/* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ +/* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ +/* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 81: d0088f51cbff34d */\ +/* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ +/* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ +/* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ +/* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ +/* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ +/* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ +sqr_n_mul(t[0], t[0], 6, t[10]); /* 88: 340223d472ffcd349 */\ +/* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ +/* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ +/* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 92: 1a0111ea397fe69a4b */\ +/* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ +/* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ +/* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ +/* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ +/* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ +/* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ +/* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 100: d0088f51cbff34d258d */\ +/* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ +/* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ +/* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ +/* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 105: d0088f51cbff34d258dd */\ +/* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ +/* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ +/* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ +/* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ +/* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ +/* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ +sqr_n_mul(t[0], t[0], 6, t[8]); /* 112: 340223d472ffcd3496374f */\ +/* sqr(t[0], t[0]); */ /* 113: 680447a8e5ff9a692c6e9e */\ +/* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ +/* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ +/* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ +/* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ +/* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ +/* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ +/* sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ +/* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ +sqr_n_mul(t[0], t[0], 3, t[1]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ +/* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ +/* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ +/* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ +/* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ +/* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ +/* sqr(t[0], t[0]); */ /* 129: 
1a0111ea397fe69a4b1ba7b640 */\ +/* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ +/* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ +sqr_n_mul(t[0], t[0], 8, t[4]); /* 132: 680447a8e5ff9a692c6e9ed90d */\ +/* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ +/* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ +/* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ +/* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ +/* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ +/* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ +/* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 140: 340223d472ffcd3496374f6c8697 */\ +/* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ +/* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ +/* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ +/* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ +/* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ +sqr_n_mul(t[0], t[0], 5, t[13]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ +/* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ +/* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ +/* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ +/* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ +/* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ +/* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ +/* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ +/* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ +/* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ +/* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ +/* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ +/* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ +/* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ +/* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ +/* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ +/* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ +sqr_n_mul(t[0], t[0], 4, t[10]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ +/* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ +/* sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ +/* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ +/* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ +/* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ +/* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ +/* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ +/* sqr(t[0], t[0]); */ /* 173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\ +sqr_n_mul(t[0], t[0], 8, t[6]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ +/* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ +/* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ +/* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ +/* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ +/* sqr(t[0], t[0]); */ /* 180: 
d0088f51cbff34d258dd3db21a5d66bb23ba */\ +/* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ +/* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ +/* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ +/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ +/* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ +/* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ +sqr_n_mul(t[0], t[0], 7, t[12]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ +/* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ +/* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ +/* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ +/* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ +/* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ +/* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ +/* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ +/* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ +/* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ +sqr_n_mul(t[0], t[0], 9, t[11]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ +/* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ +/* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ +sqr_n_mul(t[0], t[0], 2, t[9]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ +/* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ +/* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ +/* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ +/* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ +/* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ +sqr_n_mul(t[0], t[0], 5, t[7]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ +/* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ +/* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ +/* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ +/* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ +/* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ +/* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ +/* sqr(t[0], t[0]); */ /* 213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ +/* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ +/* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ +/* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ +/* sqr(t[0], t[0]); */ /* 218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ +/* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ +/* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ +/* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ +sqr_n_mul(t[0], t[0], 7, t[10]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ +/* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ +/* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ +/* 
sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ +/* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ +/* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ +/* sqr(t[0], t[0]); */ /* 228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ +sqr_n_mul(t[0], t[0], 6, t[12]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ +/* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ +/* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ +/* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ +/* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ +/* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ +/* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ +/* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ +/* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ +/* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ +/* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[11]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ +/* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ +/* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ +/* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ +/* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ +/* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ +sqr_n_mul(t[0], t[0], 5, t[11]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ +/* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ +/* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ +/* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ +/* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ +/* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ +/* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ +/* sqr(t[0], t[0]); */ /* 254: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ +/* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ +sqr_n_mul(t[0], t[0], 8, t[4]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ +/* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ +/* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ +/* sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ +/* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ +/* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ +/* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ +/* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ +sqr_n_mul(t[0], t[0], 7, t[3]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ +/* 
sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ +/* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ +/* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ +/* sqr(t[0], t[0]); */ /* 268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ +/* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ +/* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ +/* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ +/* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ +/* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ +sqr_n_mul(t[0], t[0], 9, t[8]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ +/* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ +/* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ +/* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ +/* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ +/* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ +/* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ +/* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ +/* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ +/* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ +/* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ +/* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ +/* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ +/* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ +/* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ +/* sqr(t[0], t[0]); */ /* 291: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ +/* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ +sqr_n_mul(t[0], t[0], 8, t[8]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ +/* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ +/* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ +/* sqr(t[0], t[0]); */ /* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ +/* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ +/* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ +/* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ +/* sqr(t[0], t[0]); */ /* 301: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ +/* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ +/* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ +/* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ +sqr_n_mul(t[0], t[0], 7, t[10]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ +/* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ +/* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ +/* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ +/* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ +/* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ +/* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ +/* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ +/* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ +/* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ +sqr_n_mul(t[0], t[0], 9, t[8]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ +/* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ +/* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ +/* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ +/* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ +/* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ +/* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ +/* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ +/* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ +/* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ +/* sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ +/* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ +/* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ +sqr_n_mul(t[0], t[0], 6, t[5]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ +/* sqr(t[0], t[0]); */ /* 330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ +/* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ +/* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ +/* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ +/* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 335: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ +/* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ +/* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ +/* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ +/* sqr(t[0], t[0]); */ /* 339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ +/* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ +/* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ +/* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ +/* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ +/* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ +/* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ +/* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ +/* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ +sqr_n_mul(t[0], t[0], 3, t[9]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ +/* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ +/* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ +/* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ +/* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ +/* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ +/* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ +/* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ +/* sqr(t[0], t[0]); */ /* 358: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ +/* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ +/* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ +/* sqr(t[0], t[0]); */ /* 362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ +/* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ +/* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ +/* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ +/* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ +sqr_n_mul(t[0], t[0], 7, 
t[5]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ +/* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ +/* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ +/* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ +/* sqr(t[0], t[0]); */ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ +/* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ +/* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ +/* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ +/* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ +/* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ +/* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ +/* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ +/* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ +/* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ +/* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[8]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ +/* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ +/* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ +/* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ +/* sqr(t[0], t[0]); */ /* 388: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[7]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ +/* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ +/* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ +/* sqr(t[0], t[0]); */ /* 392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ +/* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ +/* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ +/* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ +/* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ +sqr_n_mul(t[0], t[0], 7, t[5]); /* 397: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ +/* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ +/* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ +/* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ +/* sqr(t[0], t[0]); */ /* 401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ +/* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ +/* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ +/* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ +/* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ +/* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ +/* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ +/* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ +/* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ +/* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ +/* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ +/* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ +/* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ +/* sqr(t[0], t[0]); */ /* 417: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ +/* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ +/* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ +/* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ +/* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ +/* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ +/* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ +/* sqr(t[0], t[0]); */ /* 425: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ +/* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ +/* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ +/* sqr(t[0], t[0]); */ /* 429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ +/* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ +/* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ +/* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ +/* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ +/* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ +/* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ +/* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ +/* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ +/* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ +/* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ +/* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ +/* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[4]); /* 444: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ +/* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ +/* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ +/* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ +/* sqr(t[0], t[0]); */ /* 448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ +/* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ +/* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ +/* sqr(t[0], t[0]); */ /* 452: 
680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ +/* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ +/* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ +/* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ +sqr_n_mul(t[0], t[0], 4, t[2]); /* 456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\ +/* sqr(t[0], t[0]); */ /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\ +/* sqr(t[0], t[0]); */ /* 458: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd554 */\ +/* sqr(t[0], t[0]); */ /* 459: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa8 */\ +sqr_n_mul(out, t[0], 3, t[1]); /* 460: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaaa9 */\ +} while(0) diff --git a/crypto/blst_src/recip.c b/crypto/blst_src/recip.c new file mode 100644 index 00000000000..e0c700635ed --- /dev/null +++ b/crypto/blst_src/recip.c @@ -0,0 +1,139 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +#ifdef __OPTIMIZE_SIZE__ +/* + * 608 multiplications for scalar inversion modulo BLS12-381 prime, 32% + * more than corresponding optimal addition-chain, plus mispredicted + * branch penalties on top of that... The addition chain below was + * measured to be >50% faster. 
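+ * Either way flt_reciprocal_fp() computes inp^(P-2) mod P, which by
+ * Fermat's little theorem (inp^(P-1) == 1 mod P for non-zero inp) is
+ * the multiplicative inverse of inp.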
+ */ +static void flt_reciprocal_fp(vec384 out, const vec384 inp) +{ + static const byte BLS12_381_P_minus_2[] = { + TO_BYTES(0xb9feffffffffaaa9), TO_BYTES(0x1eabfffeb153ffff), + TO_BYTES(0x6730d2a0f6b0f624), TO_BYTES(0x64774b84f38512bf), + TO_BYTES(0x4b1ba7b6434bacd7), TO_BYTES(0x1a0111ea397fe69a) + }; + + exp_mont_384(out, inp, BLS12_381_P_minus_2, 381, BLS12_381_P, p0); +} +#else +# define sqr(ret,a) sqr_fp(ret,a) +# define mul(ret,a,b) mul_fp(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fp(ret,a,n,b) + +# include "recip-addchain.h" +static void flt_reciprocal_fp(vec384 out, const vec384 inp) +{ + RECIPROCAL_MOD_BLS12_381_P(out, inp, vec384); +} +# undef RECIPROCAL_MOD_BLS12_381_P +# undef sqr_n_mul +# undef mul +# undef sqr +#endif + +static void flt_reciprocal_fp2(vec384x out, const vec384x inp) +{ + vec384 t0, t1; + + /* + * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i + */ + sqr_fp(t0, inp[0]); + sqr_fp(t1, inp[1]); + add_fp(t0, t0, t1); + flt_reciprocal_fp(t1, t0); + mul_fp(out[0], inp[0], t1); + mul_fp(out[1], inp[1], t1); + neg_fp(out[1], out[1]); +} + +static void reciprocal_fp(vec384 out, const vec384 inp) +{ + static const vec384 Px8 = { /* left-aligned value of the modulus */ + TO_LIMB_T(0xcff7fffffffd5558), TO_LIMB_T(0xf55ffff58a9ffffd), + TO_LIMB_T(0x39869507b587b120), TO_LIMB_T(0x23ba5c279c2895fb), + TO_LIMB_T(0x58dd3db21a5d66bb), TO_LIMB_T(0xd0088f51cbff34d2) + }; +#ifdef __BLST_NO_ASM__ +# define RRx4 BLS12_381_RR +#else + static const vec384 RRx4 = { /* (4<<768)%P */ + TO_LIMB_T(0x5f7e7cd070d107c2), TO_LIMB_T(0xec839a9ac49c13c8), + TO_LIMB_T(0x6933786f44f4ef0b), TO_LIMB_T(0xd6bf8b9c676be983), + TO_LIMB_T(0xd3adaaaa4dcefb06), TO_LIMB_T(0x12601bc1d82bc175) + }; +#endif + union { vec768 x; vec384 r[2]; } temp; + + ct_inverse_mod_383(temp.x, inp, BLS12_381_P, Px8); + redc_mont_384(temp.r[0], temp.x, BLS12_381_P, p0); + mul_mont_384(temp.r[0], temp.r[0], RRx4, BLS12_381_P, p0); + +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + /* sign goes straight to flt_reciprocal */ + mul_mont_384(temp.r[1], temp.r[0], inp, BLS12_381_P, p0); + if (vec_is_equal(temp.r[1], BLS12_381_Rx.p, sizeof(vec384)) | + vec_is_zero(temp.r[1], sizeof(vec384))) + vec_copy(out, temp.r[0], sizeof(vec384)); + else + flt_reciprocal_fp(out, inp); +#else + vec_copy(out, temp.r[0], sizeof(vec384)); +#endif +#undef RRx4 +} + +void blst_fp_inverse(vec384 out, const vec384 inp) +{ reciprocal_fp(out, inp); } + +void blst_fp_eucl_inverse(vec384 ret, const vec384 a) +{ reciprocal_fp(ret, a); } + +static void reciprocal_fp2(vec384x out, const vec384x inp) +{ + vec384 t0, t1; + + /* + * |out| = 1/(a + b*i) = a/(a^2+b^2) - b/(a^2+b^2)*i + */ + sqr_fp(t0, inp[0]); + sqr_fp(t1, inp[1]); + add_fp(t0, t0, t1); + reciprocal_fp(t1, t0); + mul_fp(out[0], inp[0], t1); + mul_fp(out[1], inp[1], t1); + neg_fp(out[1], out[1]); +} + +void blst_fp2_inverse(vec384x out, const vec384x inp) +{ reciprocal_fp2(out, inp); } + +void blst_fp2_eucl_inverse(vec384x out, const vec384x inp) +{ reciprocal_fp2(out, inp); } + +static void reciprocal_fr(vec256 out, const vec256 inp) +{ + static const vec256 rx2 = { /* left-aligned value of the modulus */ + TO_LIMB_T(0xfffffffe00000002), TO_LIMB_T(0xa77b4805fffcb7fd), + TO_LIMB_T(0x6673b0101343b00a), TO_LIMB_T(0xe7db4ea6533afa90), + }; + vec512 temp; + + ct_inverse_mod_256(temp, inp, BLS12_381_r, rx2); + redc_mont_256(out, temp, BLS12_381_r, r0); + mul_mont_sparse_256(out, out, BLS12_381_rRR, BLS12_381_r, r0); +} + +void blst_fr_inverse(vec256 out, const vec256 inp) +{ 
reciprocal_fr(out, inp); } + +void blst_fr_eucl_inverse(vec256 out, const vec256 inp) +{ reciprocal_fr(out, inp); } diff --git a/crypto/blst_src/sha256.h b/crypto/blst_src/sha256.h new file mode 100644 index 00000000000..77ddb6dc848 --- /dev/null +++ b/crypto/blst_src/sha256.h @@ -0,0 +1,140 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_SHA256_H__ +#define __BLS12_381_ASM_SHA256_H__ + +#include "vect.h" + +#if (defined(__x86_64__) || defined(__x86_64) || defined(_M_X64)) && \ + defined(__SHA__) /* -msha */ && !defined(__BLST_PORTABLE__) +# define sha256_block_data_order blst_sha256_block_data_order_shaext +#elif defined(__aarch64__) && \ + defined(__ARM_FEATURE_CRYPTO) && !defined(__BLST_PORTABLE__) +# define sha256_block_data_order blst_sha256_block_armv8 +#else +# define sha256_block_data_order blst_sha256_block_data_order +#endif +#define sha256_hcopy blst_sha256_hcopy +#define sha256_bcopy blst_sha256_bcopy +#define sha256_emit blst_sha256_emit + +void sha256_block_data_order(unsigned int *h, const void *inp, size_t blocks); +void sha256_hcopy(unsigned int dst[8], const unsigned int src[8]); +void sha256_bcopy(void *dst, const void *src, size_t len); + +/* + * If SHA256_CTX conflicts with something, just redefine it to alternative + * custom name prior including this header. + */ +typedef struct { + unsigned int h[8]; + unsigned long long N; + unsigned char buf[64]; + size_t off; +} SHA256_CTX; + + +static void sha256_init_h(unsigned int h[8]) +{ + h[0] = 0x6a09e667U; + h[1] = 0xbb67ae85U; + h[2] = 0x3c6ef372U; + h[3] = 0xa54ff53aU; + h[4] = 0x510e527fU; + h[5] = 0x9b05688cU; + h[6] = 0x1f83d9abU; + h[7] = 0x5be0cd19U; +} + +static void sha256_init(SHA256_CTX *ctx) +{ + sha256_init_h(ctx->h); + ctx->N = 0; + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; +} + +static void sha256_update(SHA256_CTX *ctx, const void *_inp, size_t len) +{ + size_t n; + const unsigned char *inp = _inp; + + ctx->N += len; + + if ((len != 0) & ((n = ctx->off) != 0)) { + size_t rem = sizeof(ctx->buf) - n; + + if (rem > len) { + sha256_bcopy(ctx->buf + n, inp, len); + ctx->off += len; + return; + } else { + sha256_bcopy(ctx->buf + n, inp, rem); + inp += rem; + len -= rem; + sha256_block_data_order(ctx->h, ctx->buf, 1); + vec_zero(ctx->buf, sizeof(ctx->buf)); + ctx->off = 0; + } + } + + n = len / sizeof(ctx->buf); + if (n > 0) { + sha256_block_data_order(ctx->h, inp, n); + n *= sizeof(ctx->buf); + inp += n; + len -= n; + } + + if (len) + sha256_bcopy(ctx->buf, inp, ctx->off = len); +} + +#define __TOBE32(ptr, val) ((ptr)[0] = (unsigned char)((val)>>24), \ + (ptr)[1] = (unsigned char)((val)>>16), \ + (ptr)[2] = (unsigned char)((val)>>8), \ + (ptr)[3] = (unsigned char)(val)) + +#if 1 +void sha256_emit(unsigned char md[32], const unsigned int h[8]); +#else +static void sha256_emit(unsigned char md[32], const unsigned int h[8]) +{ + unsigned int h_i; + + h_i = h[0]; __TOBE32(md + 0, h_i); + h_i = h[1]; __TOBE32(md + 4, h_i); + h_i = h[2]; __TOBE32(md + 8, h_i); + h_i = h[3]; __TOBE32(md + 12, h_i); + h_i = h[4]; __TOBE32(md + 16, h_i); + h_i = h[5]; __TOBE32(md + 20, h_i); + h_i = h[6]; __TOBE32(md + 24, h_i); + h_i = h[7]; __TOBE32(md + 28, h_i); +} +#endif + +static void sha256_final(unsigned char md[32], SHA256_CTX *ctx) +{ + unsigned long long bits = ctx->N * 8; + size_t n = ctx->off; + unsigned char *tail; + + ctx->buf[n++] = 0x80; + + if (n > (sizeof(ctx->buf) - 8)) 
{ + sha256_block_data_order(ctx->h, ctx->buf, 1); + vec_zero(ctx->buf, sizeof(ctx->buf)); + } + + tail = ctx->buf + sizeof(ctx->buf) - 8; + __TOBE32(tail, (unsigned int)(bits >> 32)); + __TOBE32(tail + 4, (unsigned int)bits); + sha256_block_data_order(ctx->h, ctx->buf, 1); + sha256_emit(md, ctx->h); +} + +#undef __TOBE32 +#endif diff --git a/crypto/blst_src/sqrt-addchain.h b/crypto/blst_src/sqrt-addchain.h new file mode 100644 index 00000000000..4e7f0beb6b1 --- /dev/null +++ b/crypto/blst_src/sqrt-addchain.h @@ -0,0 +1,489 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +/* + * The "magic" number is (BLS12_381_P-3)/4. Exponentiation to which + * yields reciprocal of sqrt(x), which is used in simplified Shallue- + * van de Woestijne-Ulas map-to-curve method, but it's trivial to adapt + * it for more "traditional" sqrt(x) as 'x*ret' (or for is_square(x) + * as 'x*ret^2==1'). + * + * Generated with 'addchain 1000602388805416848354447456433976039139220704984751971333014534031007912622709466110671907282253916009473568139946' + * https://github.com/kwantam/addchain + * + * # Bos-Coster (win=4) : 458 (16) <<< + * # Bos-Coster (win=5) : 460 (28) + * # Bos-Coster (win=6) : 461 (33) + * # Bos-Coster (win=7) : 460 (28) + * # Bos-Coster (win=3) : 462 ( 9) + * # Bos-Coster (win=8) : 466 (34) + * # Bos-Coster (win=9) : 464 (31) + * # Yacobi : 478 (31) + * # Bos-Coster (win=10) : 473 (30) + * # Bos-Coster (win=2) : 486 ( 5) + * # Bergeron-Berstel-Brlek-Duboc : 489 ( 5) + */ + +#define RECIP_SQRT_MOD_BLS12_381_P(out, inp, ptype) do { \ +ptype t[16]; \ +vec_copy(t[13], inp, sizeof(ptype));/* 0: 1 */\ +sqr(t[0], t[13]); /* 1: 2 */\ +mul(t[8], t[0], t[13]); /* 2: 3 */\ +sqr(t[4], t[0]); /* 3: 4 */\ +mul(t[1], t[8], t[0]); /* 4: 5 */\ +mul(t[6], t[4], t[8]); /* 5: 7 */\ +mul(t[9], t[1], t[4]); /* 6: 9 */\ +mul(t[12], t[6], t[4]); /* 7: b */\ +mul(t[3], t[9], t[4]); /* 8: d */\ +mul(t[7], t[12], t[4]); /* 9: f */\ +mul(t[15], t[3], t[4]); /* 10: 11 */\ +mul(t[10], t[7], t[4]); /* 11: 13 */\ +mul(t[2], t[15], t[4]); /* 12: 15 */\ +mul(t[11], t[10], t[4]); /* 13: 17 */\ +sqr(t[0], t[3]); /* 14: 1a */\ +mul(t[14], t[11], t[4]); /* 15: 1b */\ +mul(t[5], t[0], t[8]); /* 16: 1d */\ +mul(t[4], t[0], t[1]); /* 17: 1f */\ +/* sqr(t[0], t[0]); */ /* 18: 34 */\ +/* sqr(t[0], t[0]); */ /* 19: 68 */\ +/* sqr(t[0], t[0]); */ /* 20: d0 */\ +/* sqr(t[0], t[0]); */ /* 21: 1a0 */\ +/* sqr(t[0], t[0]); */ /* 22: 340 */\ +/* sqr(t[0], t[0]); */ /* 23: 680 */\ +/* sqr(t[0], t[0]); */ /* 24: d00 */\ +/* sqr(t[0], t[0]); */ /* 25: 1a00 */\ +/* sqr(t[0], t[0]); */ /* 26: 3400 */\ +/* sqr(t[0], t[0]); */ /* 27: 6800 */\ +/* sqr(t[0], t[0]); */ /* 28: d000 */\ +/* sqr(t[0], t[0]); */ /* 29: 1a000 */\ +sqr_n_mul(t[0], t[0], 12, t[15]); /* 30: 1a011 */\ +/* sqr(t[0], t[0]); */ /* 31: 34022 */\ +/* sqr(t[0], t[0]); */ /* 32: 68044 */\ +/* sqr(t[0], t[0]); */ /* 33: d0088 */\ +/* sqr(t[0], t[0]); */ /* 34: 1a0110 */\ +/* sqr(t[0], t[0]); */ /* 35: 340220 */\ +/* sqr(t[0], t[0]); */ /* 36: 680440 */\ +/* sqr(t[0], t[0]); */ /* 37: d00880 */\ +sqr_n_mul(t[0], t[0], 7, t[7]); /* 38: d0088f */\ +/* sqr(t[0], t[0]); */ /* 39: 1a0111e */\ +/* sqr(t[0], t[0]); */ /* 40: 340223c */\ +/* sqr(t[0], t[0]); */ /* 41: 6804478 */\ +/* sqr(t[0], t[0]); */ /* 42: d0088f0 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 43: d0088f5 */\ +/* sqr(t[0], t[0]); */ /* 44: 1a0111ea */\ +/* sqr(t[0], t[0]); */ /* 45: 340223d4 */\ +/* sqr(t[0], t[0]); */ /* 46: 
680447a8 */\ +/* sqr(t[0], t[0]); */ /* 47: d0088f50 */\ +/* sqr(t[0], t[0]); */ /* 48: 1a0111ea0 */\ +/* sqr(t[0], t[0]); */ /* 49: 340223d40 */\ +sqr_n_mul(t[0], t[0], 6, t[6]); /* 50: 340223d47 */\ +/* sqr(t[0], t[0]); */ /* 51: 680447a8e */\ +/* sqr(t[0], t[0]); */ /* 52: d0088f51c */\ +/* sqr(t[0], t[0]); */ /* 53: 1a0111ea38 */\ +/* sqr(t[0], t[0]); */ /* 54: 340223d470 */\ +/* sqr(t[0], t[0]); */ /* 55: 680447a8e0 */\ +/* sqr(t[0], t[0]); */ /* 56: d0088f51c0 */\ +/* sqr(t[0], t[0]); */ /* 57: 1a0111ea380 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 58: 1a0111ea397 */\ +/* sqr(t[0], t[0]); */ /* 59: 340223d472e */\ +/* sqr(t[0], t[0]); */ /* 60: 680447a8e5c */\ +/* sqr(t[0], t[0]); */ /* 61: d0088f51cb8 */\ +/* sqr(t[0], t[0]); */ /* 62: 1a0111ea3970 */\ +/* sqr(t[0], t[0]); */ /* 63: 340223d472e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 64: 340223d472ff */\ +/* sqr(t[0], t[0]); */ /* 65: 680447a8e5fe */\ +/* sqr(t[0], t[0]); */ /* 66: d0088f51cbfc */\ +sqr_n_mul(t[0], t[0], 2, t[8]); /* 67: d0088f51cbff */\ +/* sqr(t[0], t[0]); */ /* 68: 1a0111ea397fe */\ +/* sqr(t[0], t[0]); */ /* 69: 340223d472ffc */\ +/* sqr(t[0], t[0]); */ /* 70: 680447a8e5ff8 */\ +/* sqr(t[0], t[0]); */ /* 71: d0088f51cbff0 */\ +/* sqr(t[0], t[0]); */ /* 72: 1a0111ea397fe0 */\ +/* sqr(t[0], t[0]); */ /* 73: 340223d472ffc0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 74: 340223d472ffcd */\ +/* sqr(t[0], t[0]); */ /* 75: 680447a8e5ff9a */\ +/* sqr(t[0], t[0]); */ /* 76: d0088f51cbff34 */\ +/* sqr(t[0], t[0]); */ /* 77: 1a0111ea397fe68 */\ +/* sqr(t[0], t[0]); */ /* 78: 340223d472ffcd0 */\ +/* sqr(t[0], t[0]); */ /* 79: 680447a8e5ff9a0 */\ +/* sqr(t[0], t[0]); */ /* 80: d0088f51cbff340 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 81: d0088f51cbff34d */\ +/* sqr(t[0], t[0]); */ /* 82: 1a0111ea397fe69a */\ +/* sqr(t[0], t[0]); */ /* 83: 340223d472ffcd34 */\ +/* sqr(t[0], t[0]); */ /* 84: 680447a8e5ff9a68 */\ +/* sqr(t[0], t[0]); */ /* 85: d0088f51cbff34d0 */\ +/* sqr(t[0], t[0]); */ /* 86: 1a0111ea397fe69a0 */\ +/* sqr(t[0], t[0]); */ /* 87: 340223d472ffcd340 */\ +sqr_n_mul(t[0], t[0], 6, t[9]); /* 88: 340223d472ffcd349 */\ +/* sqr(t[0], t[0]); */ /* 89: 680447a8e5ff9a692 */\ +/* sqr(t[0], t[0]); */ /* 90: d0088f51cbff34d24 */\ +/* sqr(t[0], t[0]); */ /* 91: 1a0111ea397fe69a48 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 92: 1a0111ea397fe69a4b */\ +/* sqr(t[0], t[0]); */ /* 93: 340223d472ffcd3496 */\ +/* sqr(t[0], t[0]); */ /* 94: 680447a8e5ff9a692c */\ +/* sqr(t[0], t[0]); */ /* 95: d0088f51cbff34d258 */\ +/* sqr(t[0], t[0]); */ /* 96: 1a0111ea397fe69a4b0 */\ +/* sqr(t[0], t[0]); */ /* 97: 340223d472ffcd34960 */\ +/* sqr(t[0], t[0]); */ /* 98: 680447a8e5ff9a692c0 */\ +/* sqr(t[0], t[0]); */ /* 99: d0088f51cbff34d2580 */\ +sqr_n_mul(t[0], t[0], 7, t[3]); /* 100: d0088f51cbff34d258d */\ +/* sqr(t[0], t[0]); */ /* 101: 1a0111ea397fe69a4b1a */\ +/* sqr(t[0], t[0]); */ /* 102: 340223d472ffcd349634 */\ +/* sqr(t[0], t[0]); */ /* 103: 680447a8e5ff9a692c68 */\ +/* sqr(t[0], t[0]); */ /* 104: d0088f51cbff34d258d0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 105: d0088f51cbff34d258dd */\ +/* sqr(t[0], t[0]); */ /* 106: 1a0111ea397fe69a4b1ba */\ +/* sqr(t[0], t[0]); */ /* 107: 340223d472ffcd3496374 */\ +/* sqr(t[0], t[0]); */ /* 108: 680447a8e5ff9a692c6e8 */\ +/* sqr(t[0], t[0]); */ /* 109: d0088f51cbff34d258dd0 */\ +/* sqr(t[0], t[0]); */ /* 110: 1a0111ea397fe69a4b1ba0 */\ +/* sqr(t[0], t[0]); */ /* 111: 340223d472ffcd34963740 */\ +sqr_n_mul(t[0], t[0], 6, t[7]); /* 112: 340223d472ffcd3496374f */\ +/* sqr(t[0], t[0]); */ /* 113: 
680447a8e5ff9a692c6e9e */\ +/* sqr(t[0], t[0]); */ /* 114: d0088f51cbff34d258dd3c */\ +/* sqr(t[0], t[0]); */ /* 115: 1a0111ea397fe69a4b1ba78 */\ +/* sqr(t[0], t[0]); */ /* 116: 340223d472ffcd3496374f0 */\ +/* sqr(t[0], t[0]); */ /* 117: 680447a8e5ff9a692c6e9e0 */\ +/* sqr(t[0], t[0]); */ /* 118: d0088f51cbff34d258dd3c0 */\ +sqr_n_mul(t[0], t[0], 6, t[14]); /* 119: d0088f51cbff34d258dd3db */\ +/* sqr(t[0], t[0]); */ /* 120: 1a0111ea397fe69a4b1ba7b6 */\ +/* sqr(t[0], t[0]); */ /* 121: 340223d472ffcd3496374f6c */\ +/* sqr(t[0], t[0]); */ /* 122: 680447a8e5ff9a692c6e9ed8 */\ +sqr_n_mul(t[0], t[0], 3, t[13]); /* 123: 680447a8e5ff9a692c6e9ed9 */\ +/* sqr(t[0], t[0]); */ /* 124: d0088f51cbff34d258dd3db2 */\ +/* sqr(t[0], t[0]); */ /* 125: 1a0111ea397fe69a4b1ba7b64 */\ +/* sqr(t[0], t[0]); */ /* 126: 340223d472ffcd3496374f6c8 */\ +/* sqr(t[0], t[0]); */ /* 127: 680447a8e5ff9a692c6e9ed90 */\ +/* sqr(t[0], t[0]); */ /* 128: d0088f51cbff34d258dd3db20 */\ +/* sqr(t[0], t[0]); */ /* 129: 1a0111ea397fe69a4b1ba7b640 */\ +/* sqr(t[0], t[0]); */ /* 130: 340223d472ffcd3496374f6c80 */\ +/* sqr(t[0], t[0]); */ /* 131: 680447a8e5ff9a692c6e9ed900 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 132: 680447a8e5ff9a692c6e9ed90d */\ +/* sqr(t[0], t[0]); */ /* 133: d0088f51cbff34d258dd3db21a */\ +/* sqr(t[0], t[0]); */ /* 134: 1a0111ea397fe69a4b1ba7b6434 */\ +/* sqr(t[0], t[0]); */ /* 135: 340223d472ffcd3496374f6c868 */\ +/* sqr(t[0], t[0]); */ /* 136: 680447a8e5ff9a692c6e9ed90d0 */\ +/* sqr(t[0], t[0]); */ /* 137: d0088f51cbff34d258dd3db21a0 */\ +/* sqr(t[0], t[0]); */ /* 138: 1a0111ea397fe69a4b1ba7b64340 */\ +/* sqr(t[0], t[0]); */ /* 139: 340223d472ffcd3496374f6c8680 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 140: 340223d472ffcd3496374f6c8697 */\ +/* sqr(t[0], t[0]); */ /* 141: 680447a8e5ff9a692c6e9ed90d2e */\ +/* sqr(t[0], t[0]); */ /* 142: d0088f51cbff34d258dd3db21a5c */\ +/* sqr(t[0], t[0]); */ /* 143: 1a0111ea397fe69a4b1ba7b6434b8 */\ +/* sqr(t[0], t[0]); */ /* 144: 340223d472ffcd3496374f6c86970 */\ +/* sqr(t[0], t[0]); */ /* 145: 680447a8e5ff9a692c6e9ed90d2e0 */\ +sqr_n_mul(t[0], t[0], 5, t[12]); /* 146: 680447a8e5ff9a692c6e9ed90d2eb */\ +/* sqr(t[0], t[0]); */ /* 147: d0088f51cbff34d258dd3db21a5d6 */\ +/* sqr(t[0], t[0]); */ /* 148: 1a0111ea397fe69a4b1ba7b6434bac */\ +/* sqr(t[0], t[0]); */ /* 149: 340223d472ffcd3496374f6c869758 */\ +/* sqr(t[0], t[0]); */ /* 150: 680447a8e5ff9a692c6e9ed90d2eb0 */\ +/* sqr(t[0], t[0]); */ /* 151: d0088f51cbff34d258dd3db21a5d60 */\ +/* sqr(t[0], t[0]); */ /* 152: 1a0111ea397fe69a4b1ba7b6434bac0 */\ +sqr_n_mul(t[0], t[0], 6, t[3]); /* 153: 1a0111ea397fe69a4b1ba7b6434bacd */\ +/* sqr(t[0], t[0]); */ /* 154: 340223d472ffcd3496374f6c869759a */\ +/* sqr(t[0], t[0]); */ /* 155: 680447a8e5ff9a692c6e9ed90d2eb34 */\ +/* sqr(t[0], t[0]); */ /* 156: d0088f51cbff34d258dd3db21a5d668 */\ +/* sqr(t[0], t[0]); */ /* 157: 1a0111ea397fe69a4b1ba7b6434bacd0 */\ +/* sqr(t[0], t[0]); */ /* 158: 340223d472ffcd3496374f6c869759a0 */\ +/* sqr(t[0], t[0]); */ /* 159: 680447a8e5ff9a692c6e9ed90d2eb340 */\ +sqr_n_mul(t[0], t[0], 6, t[5]); /* 160: 680447a8e5ff9a692c6e9ed90d2eb35d */\ +/* sqr(t[0], t[0]); */ /* 161: d0088f51cbff34d258dd3db21a5d66ba */\ +/* sqr(t[0], t[0]); */ /* 162: 1a0111ea397fe69a4b1ba7b6434bacd74 */\ +/* sqr(t[0], t[0]); */ /* 163: 340223d472ffcd3496374f6c869759ae8 */\ +/* sqr(t[0], t[0]); */ /* 164: 680447a8e5ff9a692c6e9ed90d2eb35d0 */\ +sqr_n_mul(t[0], t[0], 4, t[9]); /* 165: 680447a8e5ff9a692c6e9ed90d2eb35d9 */\ +/* sqr(t[0], t[0]); */ /* 166: d0088f51cbff34d258dd3db21a5d66bb2 */\ +/* 
sqr(t[0], t[0]); */ /* 167: 1a0111ea397fe69a4b1ba7b6434bacd764 */\ +/* sqr(t[0], t[0]); */ /* 168: 340223d472ffcd3496374f6c869759aec8 */\ +/* sqr(t[0], t[0]); */ /* 169: 680447a8e5ff9a692c6e9ed90d2eb35d90 */\ +/* sqr(t[0], t[0]); */ /* 170: d0088f51cbff34d258dd3db21a5d66bb20 */\ +/* sqr(t[0], t[0]); */ /* 171: 1a0111ea397fe69a4b1ba7b6434bacd7640 */\ +/* sqr(t[0], t[0]); */ /* 172: 340223d472ffcd3496374f6c869759aec80 */\ +/* sqr(t[0], t[0]); */ /* 173: 680447a8e5ff9a692c6e9ed90d2eb35d900 */\ +sqr_n_mul(t[0], t[0], 8, t[5]); /* 174: 680447a8e5ff9a692c6e9ed90d2eb35d91d */\ +/* sqr(t[0], t[0]); */ /* 175: d0088f51cbff34d258dd3db21a5d66bb23a */\ +/* sqr(t[0], t[0]); */ /* 176: 1a0111ea397fe69a4b1ba7b6434bacd76474 */\ +/* sqr(t[0], t[0]); */ /* 177: 340223d472ffcd3496374f6c869759aec8e8 */\ +/* sqr(t[0], t[0]); */ /* 178: 680447a8e5ff9a692c6e9ed90d2eb35d91d0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 179: 680447a8e5ff9a692c6e9ed90d2eb35d91dd */\ +/* sqr(t[0], t[0]); */ /* 180: d0088f51cbff34d258dd3db21a5d66bb23ba */\ +/* sqr(t[0], t[0]); */ /* 181: 1a0111ea397fe69a4b1ba7b6434bacd764774 */\ +/* sqr(t[0], t[0]); */ /* 182: 340223d472ffcd3496374f6c869759aec8ee8 */\ +/* sqr(t[0], t[0]); */ /* 183: 680447a8e5ff9a692c6e9ed90d2eb35d91dd0 */\ +/* sqr(t[0], t[0]); */ /* 184: d0088f51cbff34d258dd3db21a5d66bb23ba0 */\ +/* sqr(t[0], t[0]); */ /* 185: 1a0111ea397fe69a4b1ba7b6434bacd7647740 */\ +/* sqr(t[0], t[0]); */ /* 186: 340223d472ffcd3496374f6c869759aec8ee80 */\ +sqr_n_mul(t[0], t[0], 7, t[11]); /* 187: 340223d472ffcd3496374f6c869759aec8ee97 */\ +/* sqr(t[0], t[0]); */ /* 188: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e */\ +/* sqr(t[0], t[0]); */ /* 189: d0088f51cbff34d258dd3db21a5d66bb23ba5c */\ +/* sqr(t[0], t[0]); */ /* 190: 1a0111ea397fe69a4b1ba7b6434bacd764774b8 */\ +/* sqr(t[0], t[0]); */ /* 191: 340223d472ffcd3496374f6c869759aec8ee970 */\ +/* sqr(t[0], t[0]); */ /* 192: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e0 */\ +/* sqr(t[0], t[0]); */ /* 193: d0088f51cbff34d258dd3db21a5d66bb23ba5c0 */\ +/* sqr(t[0], t[0]); */ /* 194: 1a0111ea397fe69a4b1ba7b6434bacd764774b80 */\ +/* sqr(t[0], t[0]); */ /* 195: 340223d472ffcd3496374f6c869759aec8ee9700 */\ +/* sqr(t[0], t[0]); */ /* 196: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e00 */\ +sqr_n_mul(t[0], t[0], 9, t[10]); /* 197: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13 */\ +/* sqr(t[0], t[0]); */ /* 198: d0088f51cbff34d258dd3db21a5d66bb23ba5c26 */\ +/* sqr(t[0], t[0]); */ /* 199: 1a0111ea397fe69a4b1ba7b6434bacd764774b84c */\ +sqr_n_mul(t[0], t[0], 2, t[8]); /* 200: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f */\ +/* sqr(t[0], t[0]); */ /* 201: 340223d472ffcd3496374f6c869759aec8ee9709e */\ +/* sqr(t[0], t[0]); */ /* 202: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13c */\ +/* sqr(t[0], t[0]); */ /* 203: d0088f51cbff34d258dd3db21a5d66bb23ba5c278 */\ +/* sqr(t[0], t[0]); */ /* 204: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f0 */\ +/* sqr(t[0], t[0]); */ /* 205: 340223d472ffcd3496374f6c869759aec8ee9709e0 */\ +sqr_n_mul(t[0], t[0], 5, t[6]); /* 206: 340223d472ffcd3496374f6c869759aec8ee9709e7 */\ +/* sqr(t[0], t[0]); */ /* 207: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce */\ +/* sqr(t[0], t[0]); */ /* 208: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c */\ +/* sqr(t[0], t[0]); */ /* 209: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38 */\ +/* sqr(t[0], t[0]); */ /* 210: 340223d472ffcd3496374f6c869759aec8ee9709e70 */\ +/* sqr(t[0], t[0]); */ /* 211: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce0 */\ +/* sqr(t[0], t[0]); */ /* 212: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c0 */\ +/* sqr(t[0], t[0]); */ /* 
213: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f380 */\ +sqr_n_mul(t[0], t[0], 7, t[1]); /* 214: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385 */\ +/* sqr(t[0], t[0]); */ /* 215: 340223d472ffcd3496374f6c869759aec8ee9709e70a */\ +/* sqr(t[0], t[0]); */ /* 216: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce14 */\ +/* sqr(t[0], t[0]); */ /* 217: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c28 */\ +/* sqr(t[0], t[0]); */ /* 218: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f3850 */\ +/* sqr(t[0], t[0]); */ /* 219: 340223d472ffcd3496374f6c869759aec8ee9709e70a0 */\ +/* sqr(t[0], t[0]); */ /* 220: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce140 */\ +/* sqr(t[0], t[0]); */ /* 221: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c280 */\ +sqr_n_mul(t[0], t[0], 7, t[9]); /* 222: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c289 */\ +/* sqr(t[0], t[0]); */ /* 223: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512 */\ +/* sqr(t[0], t[0]); */ /* 224: 340223d472ffcd3496374f6c869759aec8ee9709e70a24 */\ +/* sqr(t[0], t[0]); */ /* 225: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce1448 */\ +/* sqr(t[0], t[0]); */ /* 226: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2890 */\ +/* sqr(t[0], t[0]); */ /* 227: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f385120 */\ +/* sqr(t[0], t[0]); */ /* 228: 340223d472ffcd3496374f6c869759aec8ee9709e70a240 */\ +sqr_n_mul(t[0], t[0], 6, t[11]); /* 229: 340223d472ffcd3496374f6c869759aec8ee9709e70a257 */\ +/* sqr(t[0], t[0]); */ /* 230: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae */\ +/* sqr(t[0], t[0]); */ /* 231: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895c */\ +/* sqr(t[0], t[0]); */ /* 232: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512b8 */\ +/* sqr(t[0], t[0]); */ /* 233: 340223d472ffcd3496374f6c869759aec8ee9709e70a2570 */\ +/* sqr(t[0], t[0]); */ /* 234: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144ae0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 235: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd */\ +/* sqr(t[0], t[0]); */ /* 236: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa */\ +/* sqr(t[0], t[0]); */ /* 237: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf4 */\ +/* sqr(t[0], t[0]); */ /* 238: 340223d472ffcd3496374f6c869759aec8ee9709e70a257e8 */\ +/* sqr(t[0], t[0]); */ /* 239: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd0 */\ +/* sqr(t[0], t[0]); */ /* 240: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 241: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3 */\ +/* sqr(t[0], t[0]); */ /* 242: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf66 */\ +/* sqr(t[0], t[0]); */ /* 243: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ecc */\ +/* sqr(t[0], t[0]); */ /* 244: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd98 */\ +/* sqr(t[0], t[0]); */ /* 245: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb30 */\ +/* sqr(t[0], t[0]); */ /* 246: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf660 */\ +sqr_n_mul(t[0], t[0], 5, t[10]); /* 247: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf673 */\ +/* sqr(t[0], t[0]); */ /* 248: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece6 */\ +/* sqr(t[0], t[0]); */ /* 249: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc */\ +/* sqr(t[0], t[0]); */ /* 250: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398 */\ +/* sqr(t[0], t[0]); */ /* 251: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730 */\ +/* sqr(t[0], t[0]); */ /* 252: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece60 */\ +/* sqr(t[0], t[0]); */ /* 253: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc0 */\ +/* sqr(t[0], t[0]); */ /* 254: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3980 */\ +/* sqr(t[0], t[0]); */ /* 255: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf67300 */\ +sqr_n_mul(t[0], t[0], 8, t[3]); /* 256: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d */\ +/* sqr(t[0], t[0]); */ /* 257: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a */\ +/* sqr(t[0], t[0]); */ /* 258: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34 */\ +/* sqr(t[0], t[0]); */ /* 259: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39868 */\ +/* sqr(t[0], t[0]); */ /* 260: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d0 */\ +/* sqr(t[0], t[0]); */ /* 261: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a0 */\ +/* sqr(t[0], t[0]); */ /* 262: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc340 */\ +/* sqr(t[0], t[0]); */ /* 263: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398680 */\ +sqr_n_mul(t[0], t[0], 7, t[2]); /* 264: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695 */\ +/* sqr(t[0], t[0]); */ /* 265: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a */\ +/* sqr(t[0], t[0]); */ /* 266: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a54 */\ +/* sqr(t[0], t[0]); */ /* 267: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a8 */\ +/* sqr(t[0], t[0]); */ /* 268: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb3986950 */\ +/* sqr(t[0], t[0]); */ /* 269: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0 */\ +/* sqr(t[0], t[0]); */ /* 270: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a540 */\ +/* sqr(t[0], t[0]); */ /* 271: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a80 */\ +/* sqr(t[0], t[0]); */ /* 272: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869500 */\ +/* sqr(t[0], t[0]); */ /* 273: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a00 */\ +sqr_n_mul(t[0], t[0], 9, t[7]); /* 274: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f */\ +/* sqr(t[0], t[0]); */ /* 275: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e */\ +/* sqr(t[0], t[0]); */ /* 276: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83c */\ +/* sqr(t[0], t[0]); */ /* 277: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb398695078 */\ +/* sqr(t[0], t[0]); */ /* 278: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f0 */\ +/* sqr(t[0], t[0]); */ /* 279: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541e0 */\ +sqr_n_mul(t[0], t[0], 5, t[3]); /* 280: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed */\ +/* sqr(t[0], t[0]); */ /* 281: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83da */\ +/* sqr(t[0], t[0]); */ /* 282: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b4 */\ +/* sqr(t[0], t[0]); */ /* 283: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f68 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 284: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b */\ +/* sqr(t[0], t[0]); */ /* 285: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed6 */\ +/* sqr(t[0], t[0]); */ /* 286: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac */\ +/* sqr(t[0], t[0]); */ /* 287: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b58 */\ +/* sqr(t[0], t[0]); */ /* 288: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0 */\ +/* sqr(t[0], t[0]); */ /* 289: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed60 */\ +/* sqr(t[0], t[0]); */ /* 290: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac0 */\ +/* sqr(t[0], t[0]); */ /* 291: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b580 */\ +/* sqr(t[0], t[0]); */ /* 292: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b00 */\ +sqr_n_mul(t[0], t[0], 8, t[7]); /* 293: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f */\ +/* sqr(t[0], t[0]); */ /* 294: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61e */\ +/* sqr(t[0], t[0]); */ /* 295: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3c */\ +/* sqr(t[0], t[0]); */ /* 296: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b5878 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 297: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b */\ +/* sqr(t[0], t[0]); */ /* 298: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6 */\ +/* sqr(t[0], t[0]); */ /* 299: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec */\ +/* sqr(t[0], t[0]); */ /* 300: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8 */\ +/* sqr(t[0], t[0]); */ /* 301: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b0 */\ +/* sqr(t[0], t[0]); */ /* 302: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f60 */\ +/* sqr(t[0], t[0]); */ /* 303: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec0 */\ +/* sqr(t[0], t[0]); */ /* 304: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d80 */\ +sqr_n_mul(t[0], t[0], 7, t[9]); /* 305: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89 */\ +/* sqr(t[0], t[0]); */ /* 306: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b12 */\ +/* sqr(t[0], t[0]); */ /* 307: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f624 */\ +/* sqr(t[0], t[0]); */ /* 308: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec48 */\ +/* sqr(t[0], t[0]); */ /* 309: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d890 */\ +/* sqr(t[0], t[0]); */ /* 310: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120 */\ +/* sqr(t[0], t[0]); */ /* 311: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6240 */\ +/* sqr(t[0], t[0]); */ /* 312: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec480 */\ +/* sqr(t[0], t[0]); */ /* 313: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8900 */\ +/* sqr(t[0], t[0]); */ /* 314: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b1200 */\ +sqr_n_mul(t[0], t[0], 9, t[7]); /* 315: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f */\ +/* sqr(t[0], t[0]); */ /* 316: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e */\ +/* sqr(t[0], t[0]); */ /* 317: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c */\ +/* sqr(t[0], t[0]); */ /* 318: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d89078 */\ +/* sqr(t[0], t[0]); */ /* 319: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f0 */\ +/* sqr(t[0], t[0]); */ /* 320: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241e0 */\ +/* sqr(t[0], t[0]); */ /* 321: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483c0 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 322: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d5 */\ +/* sqr(t[0], t[0]); */ /* 323: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa */\ +/* sqr(t[0], t[0]); */ /* 324: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f54 */\ +/* sqr(t[0], t[0]); */ /* 325: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241ea8 */\ +/* 
sqr(t[0], t[0]); */ /* 326: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d50 */\ +/* sqr(t[0], t[0]); */ /* 327: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aa0 */\ +/* sqr(t[0], t[0]); */ /* 328: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f540 */\ +sqr_n_mul(t[0], t[0], 6, t[4]); /* 329: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f */\ +/* sqr(t[0], t[0]); */ /* 330: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe */\ +/* sqr(t[0], t[0]); */ /* 331: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57c */\ +/* sqr(t[0], t[0]); */ /* 332: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaf8 */\ +/* sqr(t[0], t[0]); */ /* 333: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55f0 */\ +/* sqr(t[0], t[0]); */ /* 334: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 335: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff */\ +/* sqr(t[0], t[0]); */ /* 336: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe */\ +/* sqr(t[0], t[0]); */ /* 337: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffc */\ +/* sqr(t[0], t[0]); */ /* 338: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ff8 */\ +/* sqr(t[0], t[0]); */ /* 339: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabff0 */\ +/* sqr(t[0], t[0]); */ /* 340: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 341: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff */\ +/* sqr(t[0], t[0]); */ /* 342: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aafffe */\ +/* sqr(t[0], t[0]); */ /* 343: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55fffc */\ +/* sqr(t[0], t[0]); */ /* 344: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfff8 */\ +/* sqr(t[0], t[0]); */ /* 345: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 346: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd */\ +/* sqr(t[0], t[0]); */ /* 347: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffa */\ +/* sqr(t[0], t[0]); */ /* 348: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff4 */\ +/* sqr(t[0], t[0]); */ /* 349: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffe8 */\ +sqr_n_mul(t[0], t[0], 3, t[8]); /* 350: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb */\ +/* sqr(t[0], t[0]); */ /* 351: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd6 */\ +/* sqr(t[0], t[0]); */ /* 352: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac */\ +/* sqr(t[0], t[0]); */ /* 353: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58 */\ +/* sqr(t[0], t[0]); */ /* 354: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb0 */\ +/* sqr(t[0], t[0]); */ /* 355: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd60 */\ +/* sqr(t[0], t[0]); */ /* 356: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac0 */\ +/* sqr(t[0], t[0]); */ /* 357: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff580 */\ +/* sqr(t[0], t[0]); */ /* 358: 
1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb00 */\ +sqr_n_mul(t[0], t[0], 8, t[2]); /* 359: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb15 */\ +/* sqr(t[0], t[0]); */ /* 360: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a */\ +/* sqr(t[0], t[0]); */ /* 361: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54 */\ +/* sqr(t[0], t[0]); */ /* 362: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a8 */\ +/* sqr(t[0], t[0]); */ /* 363: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb150 */\ +/* sqr(t[0], t[0]); */ /* 364: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a0 */\ +/* sqr(t[0], t[0]); */ /* 365: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac540 */\ +/* sqr(t[0], t[0]); */ /* 366: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a80 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 367: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f */\ +/* sqr(t[0], t[0]); */ /* 368: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e */\ +/* sqr(t[0], t[0]); */ /* 369: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7c */\ +/* sqr(t[0], t[0]); */ /* 370: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54f8 */\ +/* sqr(t[0], t[0]); */ /* 371: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9f0 */\ +/* sqr(t[0], t[0]); */ /* 372: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 373: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff */\ +/* sqr(t[0], t[0]); */ /* 374: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe */\ +/* sqr(t[0], t[0]); */ /* 375: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffc */\ +/* sqr(t[0], t[0]); */ /* 376: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ff8 */\ +/* sqr(t[0], t[0]); */ /* 377: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ff0 */\ +/* sqr(t[0], t[0]); */ /* 378: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 379: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff */\ +/* sqr(t[0], t[0]); */ /* 380: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54fffe */\ +/* sqr(t[0], t[0]); */ /* 381: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9fffc */\ +/* sqr(t[0], t[0]); */ /* 382: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153fff8 */\ +/* sqr(t[0], t[0]); */ /* 383: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7fff0 */\ +sqr_n_mul(t[0], t[0], 4, t[7]); /* 384: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff */\ +/* sqr(t[0], t[0]); */ /* 385: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffe */\ +/* sqr(t[0], t[0]); */ /* 386: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffc */\ +/* sqr(t[0], t[0]); */ /* 387: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffff8 */\ +/* sqr(t[0], t[0]); */ /* 388: 
340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[6]); /* 389: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff7 */\ +/* sqr(t[0], t[0]); */ /* 390: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee */\ +/* sqr(t[0], t[0]); */ /* 391: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc */\ +/* sqr(t[0], t[0]); */ /* 392: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb8 */\ +/* sqr(t[0], t[0]); */ /* 393: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff70 */\ +/* sqr(t[0], t[0]); */ /* 394: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee0 */\ +/* sqr(t[0], t[0]); */ /* 395: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdc0 */\ +/* sqr(t[0], t[0]); */ /* 396: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb80 */\ +sqr_n_mul(t[0], t[0], 7, t[4]); /* 397: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f */\ +/* sqr(t[0], t[0]); */ /* 398: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e */\ +/* sqr(t[0], t[0]); */ /* 399: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7c */\ +/* sqr(t[0], t[0]); */ /* 400: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcf8 */\ +/* sqr(t[0], t[0]); */ /* 401: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9f0 */\ +/* sqr(t[0], t[0]); */ /* 402: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73e0 */\ +sqr_n_mul(t[0], t[0], 5, t[5]); /* 403: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd */\ +/* sqr(t[0], t[0]); */ /* 404: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa */\ +/* sqr(t[0], t[0]); */ /* 405: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff4 */\ +/* sqr(t[0], t[0]); */ /* 406: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fe8 */\ +/* sqr(t[0], t[0]); */ /* 407: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fd0 */\ +/* sqr(t[0], t[0]); */ /* 408: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fa0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 409: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf */\ +/* sqr(t[0], t[0]); */ /* 410: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e */\ +/* sqr(t[0], t[0]); */ /* 411: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefc */\ +/* sqr(t[0], t[0]); */ /* 412: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdf8 */\ +/* sqr(t[0], t[0]); */ /* 413: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbf0 */\ +/* sqr(t[0], t[0]); */ /* 414: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7e0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 415: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff */\ +/* sqr(t[0], t[0]); */ /* 416: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe */\ +/* sqr(t[0], t[0]); */ /* 417: 
340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffc */\ +/* sqr(t[0], t[0]); */ /* 418: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbff8 */\ +/* sqr(t[0], t[0]); */ /* 419: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ff0 */\ +/* sqr(t[0], t[0]); */ /* 420: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 421: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff */\ +/* sqr(t[0], t[0]); */ /* 422: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe */\ +/* sqr(t[0], t[0]); */ /* 423: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffc */\ +/* sqr(t[0], t[0]); */ /* 424: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fff8 */\ +/* sqr(t[0], t[0]); */ /* 425: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffff0 */\ +/* sqr(t[0], t[0]); */ /* 426: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 427: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff */\ +/* sqr(t[0], t[0]); */ /* 428: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe */\ +/* sqr(t[0], t[0]); */ /* 429: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7ffffc */\ +/* sqr(t[0], t[0]); */ /* 430: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffff8 */\ +/* sqr(t[0], t[0]); */ /* 431: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffff0 */\ +/* sqr(t[0], t[0]); */ /* 432: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 433: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff */\ +/* sqr(t[0], t[0]); */ /* 434: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe */\ +/* sqr(t[0], t[0]); */ /* 435: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffc */\ +/* sqr(t[0], t[0]); */ /* 436: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffff8 */\ +/* sqr(t[0], t[0]); */ /* 437: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbffffff0 */\ +/* sqr(t[0], t[0]); */ /* 438: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffe0 */\ +sqr_n_mul(t[0], t[0], 5, t[4]); /* 439: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff */\ +/* sqr(t[0], t[0]); */ /* 440: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9fefffffffe */\ +/* sqr(t[0], t[0]); */ /* 441: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdfffffffc */\ +/* sqr(t[0], t[0]); */ /* 442: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffff8 */\ +/* sqr(t[0], t[0]); */ /* 443: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffff0 */\ +sqr_n_mul(t[0], t[0], 4, t[3]); /* 444: 
d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd */\ +/* sqr(t[0], t[0]); */ /* 445: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa */\ +/* sqr(t[0], t[0]); */ /* 446: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff4 */\ +/* sqr(t[0], t[0]); */ /* 447: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffe8 */\ +/* sqr(t[0], t[0]); */ /* 448: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd0 */\ +/* sqr(t[0], t[0]); */ /* 449: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffa0 */\ +/* sqr(t[0], t[0]); */ /* 450: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff40 */\ +sqr_n_mul(t[0], t[0], 6, t[2]); /* 451: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff55 */\ +/* sqr(t[0], t[0]); */ /* 452: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaa */\ +/* sqr(t[0], t[0]); */ /* 453: d0088f51cbff34d258dd3db21a5d66bb23ba5c279c2895fb39869507b587b120f55ffff58a9ffffdcff7fffffffd54 */\ +/* sqr(t[0], t[0]); */ /* 454: 1a0111ea397fe69a4b1ba7b6434bacd764774b84f38512bf6730d2a0f6b0f6241eabfffeb153ffffb9feffffffffaa8 */\ +/* sqr(t[0], t[0]); */ /* 455: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff550 */\ +sqr_n_mul(t[0], t[0], 4, t[1]); /* 456: 340223d472ffcd3496374f6c869759aec8ee9709e70a257ece61a541ed61ec483d57fffd62a7ffff73fdffffffff555 */\ +sqr(out, t[0]); /* 457: 680447a8e5ff9a692c6e9ed90d2eb35d91dd2e13ce144afd9cc34a83dac3d8907aaffffac54ffffee7fbfffffffeaaa */\ +} while(0) diff --git a/crypto/blst_src/sqrt.c b/crypto/blst_src/sqrt.c new file mode 100644 index 00000000000..cf149fd1124 --- /dev/null +++ b/crypto/blst_src/sqrt.c @@ -0,0 +1,261 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "fields.h" + +#ifdef __OPTIMIZE_SIZE__ +static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp) +{ + static const byte BLS_12_381_P_minus_3_div_4[] = { + TO_BYTES(0xee7fbfffffffeaaa), TO_BYTES(0x07aaffffac54ffff), + TO_BYTES(0xd9cc34a83dac3d89), TO_BYTES(0xd91dd2e13ce144af), + TO_BYTES(0x92c6e9ed90d2eb35), TO_BYTES(0x0680447a8e5ff9a6) + }; + + exp_mont_384(out, inp, BLS_12_381_P_minus_3_div_4, 379, BLS12_381_P, p0); +} +#else +# if 1 +/* + * "383"-bit variant omits full reductions at the ends of squarings, + * which results in up to ~15% improvement. [One can improve further + * by omitting full reductions even after multiplications and + * performing final reduction at the very end of the chain.] 
+ */ +static inline void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count, + const vec384 b) +{ sqr_n_mul_mont_383(out, a, count, BLS12_381_P, p0, b); } +# else +static void sqr_n_mul_fp(vec384 out, const vec384 a, size_t count, + const vec384 b) +{ + while(count--) { + sqr_fp(out, a); + a = out; + } + mul_fp(out, out, b); +} +# endif + +# define sqr(ret,a) sqr_fp(ret,a) +# define mul(ret,a,b) mul_fp(ret,a,b) +# define sqr_n_mul(ret,a,n,b) sqr_n_mul_fp(ret,a,n,b) + +# include "sqrt-addchain.h" +static void recip_sqrt_fp_3mod4(vec384 out, const vec384 inp) +{ + RECIP_SQRT_MOD_BLS12_381_P(out, inp, vec384); +} +# undef RECIP_SQRT_MOD_BLS12_381_P + +# undef sqr_n_mul +# undef sqr +# undef mul +#endif + +static bool_t recip_sqrt_fp(vec384 out, const vec384 inp) +{ + vec384 t0, t1; + bool_t ret; + + recip_sqrt_fp_3mod4(t0, inp); + + mul_fp(t1, t0, inp); + sqr_fp(t1, t1); + ret = vec_is_equal(t1, inp, sizeof(t1)); + vec_copy(out, t0, sizeof(t0)); + + return ret; +} + +static bool_t sqrt_fp(vec384 out, const vec384 inp) +{ + vec384 t0, t1; + bool_t ret; + + recip_sqrt_fp_3mod4(t0, inp); + + mul_fp(t0, t0, inp); + sqr_fp(t1, t0); + ret = vec_is_equal(t1, inp, sizeof(t1)); + vec_copy(out, t0, sizeof(t0)); + + return ret; +} + +int blst_fp_sqrt(vec384 out, const vec384 inp) +{ return (int)sqrt_fp(out, inp); } + +int blst_fp_is_square(const vec384 inp) +{ + return (int)ct_is_square_mod_384(inp, BLS12_381_P); +} + +static bool_t sqrt_align_fp2(vec384x out, const vec384x ret, + const vec384x sqrt, const vec384x inp) +{ + static const vec384x sqrt_minus_1 = { { 0 }, { ONE_MONT_P } }; + static const vec384x sqrt_sqrt_minus_1 = { + /* + * "magic" number is ±2^((p-3)/4)%p, which is "1/sqrt(2)", + * in quotes because 2*"1/sqrt(2)"^2 == -1 mod p, not 1, + * but it pivots into "complex" plane nevertheless... + */ + { TO_LIMB_T(0x3e2f585da55c9ad1), TO_LIMB_T(0x4294213d86c18183), + TO_LIMB_T(0x382844c88b623732), TO_LIMB_T(0x92ad2afd19103e18), + TO_LIMB_T(0x1d794e4fac7cf0b9), TO_LIMB_T(0x0bd592fc7d825ec8) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } + }; + static const vec384x sqrt_minus_sqrt_minus_1 = { + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) }, + { TO_LIMB_T(0x7bcfa7a25aa30fda), TO_LIMB_T(0xdc17dec12a927e7c), + TO_LIMB_T(0x2f088dd86b4ebef1), TO_LIMB_T(0xd1ca2087da74d4a7), + TO_LIMB_T(0x2da2596696cebc1d), TO_LIMB_T(0x0e2b7eedbbfd87d2) } + }; + vec384x coeff, t0, t1; + bool_t is_sqrt, flag; + + /* + * Instead of multiple trial squarings we can perform just one + * and see if the result is "rotated by multiple of 90°" in + * relation to |inp|, and "rotate" |ret| accordingly. + */ + sqr_fp2(t0, sqrt); + /* "sqrt(|inp|)"^2 = (a + b*i)^2 = (a^2-b^2) + 2ab*i */ + + /* (a^2-b^2) + 2ab*i == |inp| ? |ret| is spot on */ + sub_fp2(t1, t0, inp); + is_sqrt = vec_is_zero(t1, sizeof(t1)); + vec_copy(coeff, BLS12_381_Rx.p2, sizeof(coeff)); + + /* -(a^2-b^2) - 2ab*i == |inp| ? "rotate |ret| by 90°" */ + add_fp2(t1, t0, inp); + vec_select(coeff, sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* 2ab - (a^2-b^2)*i == |inp| ? 
"rotate |ret| by 135°" */ + sub_fp(t1[0], t0[0], inp[1]); + add_fp(t1[1], t0[1], inp[0]); + vec_select(coeff, sqrt_sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* -2ab + (a^2-b^2)*i == |inp| ? "rotate |ret| by 45°" */ + add_fp(t1[0], t0[0], inp[1]); + sub_fp(t1[1], t0[1], inp[0]); + vec_select(coeff, sqrt_minus_sqrt_minus_1, coeff, sizeof(coeff), + flag = vec_is_zero(t1, sizeof(t1))); + is_sqrt |= flag; + + /* actual "rotation" */ + mul_fp2(out, ret, coeff); + + return is_sqrt; +} + +/* + * |inp| = a + b*i + */ +static bool_t recip_sqrt_fp2(vec384x out, const vec384x inp, + const vec384x recip_ZZZ, + const vec384x magic_ZZZ) +{ + vec384 aa, bb, cc; + vec384x inp_; + bool_t is_sqrt; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + is_sqrt = recip_sqrt_fp(cc, aa); /* 1/sqrt(a²+b²) */ + + /* if |inp| doesn't have quadratic residue, multiply by "1/Z³" ... */ + mul_fp2(inp_, inp, recip_ZZZ); + /* ... and adjust |aa| and |cc| accordingly */ + { + vec384 za, zc; + + mul_fp(za, aa, magic_ZZZ[0]); /* aa*(za² + zb²) */ + mul_fp(zc, cc, magic_ZZZ[1]); /* cc*(za² + zb²)^((p-3)/4) */ + vec_select(aa, aa, za, sizeof(aa), is_sqrt); + vec_select(cc, cc, zc, sizeof(cc), is_sqrt); + } + vec_select(inp_, inp, inp_, sizeof(inp_), is_sqrt); + + mul_fp(aa, aa, cc); /* sqrt(a²+b²) */ + + sub_fp(bb, inp_[0], aa); + add_fp(aa, inp_[0], aa); + vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa))); + div_by_2_fp(aa, aa); /* (a ± sqrt(a²+b²))/2 */ + + /* if it says "no sqrt," final "align" will find right one... */ + (void)recip_sqrt_fp(out[0], aa); /* 1/sqrt((a ± sqrt(a²+b²))/2) */ + + div_by_2_fp(out[1], inp_[1]); + mul_fp(out[1], out[1], out[0]); /* b/(2*sqrt((a ± sqrt(a²+b²))/2)) */ + mul_fp(out[0], out[0], aa); /* sqrt((a ± sqrt(a²+b²))/2) */ + + /* bound to succeed */ + (void)sqrt_align_fp2(out, out, out, inp_); + + mul_fp(out[0], out[0], cc); /* inverse the result */ + mul_fp(out[1], out[1], cc); + neg_fp(out[1], out[1]); + + return is_sqrt; +} + +static bool_t sqrt_fp2(vec384x out, const vec384x inp) +{ + vec384x ret; + vec384 aa, bb; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + /* don't pay attention to return value, final "align" will tell... */ + (void)sqrt_fp(aa, aa); /* sqrt(a²+b²) */ + + sub_fp(bb, inp[0], aa); + add_fp(aa, inp[0], aa); + vec_select(aa, bb, aa, sizeof(aa), vec_is_zero(aa, sizeof(aa))); + div_by_2_fp(aa, aa); /* (a ± sqrt(a²+b²))/2 */ + + /* if it says "no sqrt," final "align" will find right one... */ + (void)recip_sqrt_fp(ret[0], aa); /* 1/sqrt((a ± sqrt(a²+b²))/2) */ + + div_by_2_fp(ret[1], inp[1]); + mul_fp(ret[1], ret[1], ret[0]); /* b/(2*sqrt((a ± sqrt(a²+b²))/2)) */ + mul_fp(ret[0], ret[0], aa); /* sqrt((a ± sqrt(a²+b²))/2) */ + + /* + * Now see if |ret| is or can be made sqrt(|inp|)... + */ + + return sqrt_align_fp2(out, ret, ret, inp); +} + +int blst_fp2_sqrt(vec384x out, const vec384x inp) +{ return (int)sqrt_fp2(out, inp); } + +int blst_fp2_is_square(const vec384x inp) +{ + vec384 aa, bb; + + sqr_fp(aa, inp[0]); + sqr_fp(bb, inp[1]); + add_fp(aa, aa, bb); + + return (int)ct_is_square_mod_384(aa, BLS12_381_P); +} diff --git a/crypto/blst_src/vect.c b/crypto/blst_src/vect.c new file mode 100644 index 00000000000..1834a48fadd --- /dev/null +++ b/crypto/blst_src/vect.c @@ -0,0 +1,176 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +#include "vect.h" + +#ifdef __BLST_NO_ASM__ +# include "no_asm.h" +#endif + +/* + * Following are some reference C implementations to assist new + * assembly modules development, as starting-point stand-ins and for + * cross-checking. In order to "polyfil" specific subroutine redefine + * it on compiler command line, e.g. -Dmul_mont_384x=_mul_mont_384x. + */ + +#ifdef lshift_mod_384 +inline void lshift_mod_384(vec384 ret, const vec384 a, size_t n, + const vec384 mod) +{ + while(n--) + add_mod_384(ret, a, a, mod), a = ret; +} +#endif + +#ifdef mul_by_8_mod_384 +inline void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 mod) +{ lshift_mod_384(ret, a, 3, mod); } +#endif + +#ifdef mul_by_3_mod_384 +inline void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 mod) +{ + vec384 t; + + add_mod_384(t, a, a, mod); + add_mod_384(ret, t, a, mod); +} +#endif + +#ifdef mul_by_3_mod_384x +inline void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 mod) +{ + mul_by_3_mod_384(ret[0], a[0], mod); + mul_by_3_mod_384(ret[1], a[1], mod); +} +#endif + +#ifdef mul_by_8_mod_384x +inline void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 mod) +{ + mul_by_8_mod_384(ret[0], a[0], mod); + mul_by_8_mod_384(ret[1], a[1], mod); +} +#endif + +#ifdef mul_by_1_plus_i_mod_384x +inline void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, + const vec384 mod) +{ + vec384 t; + + add_mod_384(t, a[0], a[1], mod); + sub_mod_384(ret[0], a[0], a[1], mod); + vec_copy(ret[1], t, sizeof(t)); +} +#endif + +#ifdef add_mod_384x +inline void add_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod) +{ + add_mod_384(ret[0], a[0], b[0], mod); + add_mod_384(ret[1], a[1], b[1], mod); +} +#endif + +#ifdef sub_mod_384x +inline void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod) +{ + sub_mod_384(ret[0], a[0], b[0], mod); + sub_mod_384(ret[1], a[1], b[1], mod); +} +#endif + +#ifdef lshift_mod_384x +inline void lshift_mod_384x(vec384x ret, const vec384x a, size_t n, + const vec384 mod) +{ + lshift_mod_384(ret[0], a[0], n, mod); + lshift_mod_384(ret[1], a[1], n, mod); +} +#endif + +#if defined(mul_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__)) +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 mod, limb_t n0) +{ + vec768 t0, t1, t2; + vec384 aa, bb; + + mul_384(t0, a[0], b[0]); + mul_384(t1, a[1], b[1]); + + add_mod_384(aa, a[0], a[1], mod); + add_mod_384(bb, b[0], b[1], mod); + mul_384(t2, aa, bb); + sub_mod_384x384(t2, t2, t0, mod); + sub_mod_384x384(t2, t2, t1, mod); + + sub_mod_384x384(t0, t0, t1, mod); + + redc_mont_384(ret[0], t0, mod, n0); + redc_mont_384(ret[1], t2, mod, n0); +} +#endif + +#if defined(sqr_mont_384x) && !(defined(__ADX__) && !defined(__BLST_PORTABLE__)) +void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 mod, limb_t n0) +{ + vec384 t0, t1; + + add_mod_384(t0, a[0], a[1], mod); + sub_mod_384(t1, a[0], a[1], mod); + + mul_mont_384(ret[1], a[0], a[1], mod, n0); + add_mod_384(ret[1], ret[1], ret[1], mod); + + mul_mont_384(ret[0], t0, t1, mod, n0); +} +#endif + +limb_t div_3_limbs(const limb_t dividend_top[2], limb_t d_lo, limb_t d_hi); +limb_t quot_rem_128(limb_t *quot_rem, const limb_t *divisor, limb_t quotient); +limb_t quot_rem_64(limb_t *quot_rem, const limb_t *divisor, limb_t quotient); + +/* + * Divide 255-bit |val| by z^2 yielding 128-bit quotient and remainder in place. 
+ */ +static void div_by_zz(limb_t val[]) +{ + static const limb_t zz[] = { TO_LIMB_T(0x0000000100000000), + TO_LIMB_T(0xac45a4010001a402) }; + size_t loop, zz_len = sizeof(zz)/sizeof(zz[0]); + limb_t d_lo, d_hi; + + d_lo = zz[zz_len - 2]; + d_hi = zz[zz_len - 1]; + for (loop = zz_len, zz_len--; loop--;) { + limb_t q = div_3_limbs(val + loop + zz_len, d_lo, d_hi); + (void)quot_rem_128(val + loop, zz, q); + } + /* remainder is in low half of val[], quotient is in high */ +} + +/* + * Divide 128-bit |val| by z yielding 64-bit quotient and remainder in place. + */ +static void div_by_z(limb_t val[]) +{ + static const limb_t z[] = { TO_LIMB_T(0xd201000000010000) }; + size_t loop, z_len = sizeof(z)/sizeof(z[0]); + limb_t d_lo, d_hi; + + d_lo = (sizeof(z) == sizeof(limb_t)) ? 0 : z[z_len - 2]; + d_hi = z[z_len - 1]; + for (loop = z_len, z_len--; loop--;) { + limb_t q = div_3_limbs(val + loop + z_len, d_lo, d_hi); + (void)quot_rem_64(val + loop, z, q); + } + /* remainder is in low half of val[], quotient is in high */ +} diff --git a/crypto/blst_src/vect.h b/crypto/blst_src/vect.h new file mode 100644 index 00000000000..554dd5daefc --- /dev/null +++ b/crypto/blst_src/vect.h @@ -0,0 +1,418 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLS12_381_ASM_VECT_H__ +#define __BLS12_381_ASM_VECT_H__ + +#include + +#if defined(__x86_64__) || defined(__aarch64__) +/* These are available even in ILP32 flavours, but even then they are + * capable of performing 64-bit operations as efficiently as in *P64. */ +typedef unsigned long long limb_t; +# define LIMB_T_BITS 64 + +#elif defined(_WIN64) /* Win64 is P64 */ +typedef unsigned __int64 limb_t; +# define LIMB_T_BITS 64 + +#elif defined(__BLST_NO_ASM__) || defined(__wasm64__) +typedef unsigned int limb_t; +# define LIMB_T_BITS 32 +# ifndef __BLST_NO_ASM__ +# define __BLST_NO_ASM__ +# endif + +#else /* 32 bits on 32-bit platforms, 64 - on 64-bit */ +typedef unsigned long limb_t; +# ifdef _LP64 +# define LIMB_T_BITS 64 +# else +# define LIMB_T_BITS 32 +# define __BLST_NO_ASM__ +# endif +#endif + +/* + * Why isn't LIMB_T_BITS defined as 8*sizeof(limb_t)? Because pre-processor + * knows nothing about sizeof(anything)... + */ +#if LIMB_T_BITS == 64 +# define TO_LIMB_T(limb64) limb64 +#else +# define TO_LIMB_T(limb64) (limb_t)limb64,(limb_t)(limb64>>32) +#endif + +#define NLIMBS(bits) (bits/LIMB_T_BITS) + +typedef limb_t vec256[NLIMBS(256)]; +typedef limb_t vec512[NLIMBS(512)]; +typedef limb_t vec384[NLIMBS(384)]; +typedef limb_t vec768[NLIMBS(768)]; +typedef vec384 vec384x[2]; /* 0 is "real" part, 1 is "imaginary" */ + +typedef unsigned char byte; +#define TO_BYTES(limb64) (byte)limb64,(byte)(limb64>>8),\ + (byte)(limb64>>16),(byte)(limb64>>24),\ + (byte)(limb64>>32),(byte)(limb64>>40),\ + (byte)(limb64>>48),(byte)(limb64>>56) +typedef byte pow256[256/8]; + +/* + * Internal Boolean type, Boolean by value, hence safe to cast to or + * reinterpret as 'bool'. + */ +typedef limb_t bool_t; + +/* + * Assembly subroutines... + */ +#if defined(__ADX__) /* e.g. 
-march=broadwell */ && !defined(__BLST_PORTABLE__)\ + && !defined(__BLST_NO_ASM__) +# define mul_mont_sparse_256 mulx_mont_sparse_256 +# define sqr_mont_sparse_256 sqrx_mont_sparse_256 +# define from_mont_256 fromx_mont_256 +# define redc_mont_256 redcx_mont_256 +# define mul_mont_384 mulx_mont_384 +# define sqr_mont_384 sqrx_mont_384 +# define sqr_n_mul_mont_384 sqrx_n_mul_mont_384 +# define sqr_n_mul_mont_383 sqrx_n_mul_mont_383 +# define mul_384 mulx_384 +# define sqr_384 sqrx_384 +# define redc_mont_384 redcx_mont_384 +# define from_mont_384 fromx_mont_384 +# define sgn0_pty_mont_384 sgn0x_pty_mont_384 +# define sgn0_pty_mont_384x sgn0x_pty_mont_384x +# define ct_inverse_mod_383 ctx_inverse_mod_383 +#elif defined(__BLST_NO_ASM__) +# define ct_inverse_mod_383 ct_inverse_mod_384 +#endif + +void mul_mont_sparse_256(vec256 ret, const vec256 a, const vec256 b, + const vec256 p, limb_t n0); +void sqr_mont_sparse_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0); +void redc_mont_256(vec256 ret, const vec512 a, const vec256 p, limb_t n0); +void from_mont_256(vec256 ret, const vec256 a, const vec256 p, limb_t n0); + +void add_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p); +void sub_mod_256(vec256 ret, const vec256 a, const vec256 b, const vec256 p); +void mul_by_3_mod_256(vec256 ret, const vec256 a, const vec256 p); +void cneg_mod_256(vec256 ret, const vec256 a, bool_t flag, const vec256 p); +void lshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p); +void rshift_mod_256(vec256 ret, const vec256 a, size_t count, const vec256 p); +bool_t eucl_inverse_mod_256(vec256 ret, const vec256 a, const vec256 p, + const vec256 one); +limb_t check_mod_256(const pow256 a, const vec256 p); +limb_t add_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b, + const vec256 p); +limb_t sub_n_check_mod_256(pow256 ret, const pow256 a, const pow256 b, + const vec256 p); + +void vec_prefetch(const void *ptr, size_t len); + +void mul_mont_384(vec384 ret, const vec384 a, const vec384 b, + const vec384 p, limb_t n0); +void sqr_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0); +void sqr_n_mul_mont_384(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b); +void sqr_n_mul_mont_383(vec384 ret, const vec384 a, size_t count, + const vec384 p, limb_t n0, const vec384 b); + +void mul_384(vec768 ret, const vec384 a, const vec384 b); +void sqr_384(vec768 ret, const vec384 a); +void redc_mont_384(vec384 ret, const vec768 a, const vec384 p, limb_t n0); +void from_mont_384(vec384 ret, const vec384 a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mont_384(const vec384 a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mont_384x(const vec384x a, const vec384 p, limb_t n0); +limb_t sgn0_pty_mod_384(const vec384 a, const vec384 p); +limb_t sgn0_pty_mod_384x(const vec384x a, const vec384 p); + +void add_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p); +void sub_mod_384(vec384 ret, const vec384 a, const vec384 b, const vec384 p); +void mul_by_8_mod_384(vec384 ret, const vec384 a, const vec384 p); +void mul_by_3_mod_384(vec384 ret, const vec384 a, const vec384 p); +void cneg_mod_384(vec384 ret, const vec384 a, bool_t flag, const vec384 p); +void lshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p); +void rshift_mod_384(vec384 ret, const vec384 a, size_t count, const vec384 p); +void div_by_2_mod_384(vec384 ret, const vec384 a, const vec384 p); +void ct_inverse_mod_383(vec768 ret, const vec384 inp, 
const vec384 mod, + const vec384 modx); +void ct_inverse_mod_256(vec512 ret, const vec256 inp, const vec256 mod, + const vec256 modx); +bool_t ct_is_square_mod_384(const vec384 inp, const vec384 mod); + +#if defined(__ADX__) /* e.g. -march=broadwell */ && !defined(__BLST_PORTABLE__) +# define mul_mont_384x mulx_mont_384x +# define sqr_mont_384x sqrx_mont_384x +# define sqr_mont_382x sqrx_mont_382x +# define mul_382x mulx_382x +# define sqr_382x sqrx_382x +#endif + +void mul_mont_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p, limb_t n0); +void sqr_mont_384x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); +void sqr_mont_382x(vec384x ret, const vec384x a, const vec384 p, limb_t n0); +void mul_382x(vec768 ret[2], const vec384x a, const vec384x b, const vec384 p); +void sqr_382x(vec768 ret[2], const vec384x a, const vec384 p); + +void add_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p); +void sub_mod_384x(vec384x ret, const vec384x a, const vec384x b, + const vec384 p); +void mul_by_8_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void mul_by_3_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void mul_by_1_plus_i_mod_384x(vec384x ret, const vec384x a, const vec384 p); +void add_mod_384x384(vec768 ret, const vec768 a, const vec768 b, + const vec384 p); +void sub_mod_384x384(vec768 ret, const vec768 a, const vec768 b, + const vec384 p); + +/* + * C subroutines + */ +static void exp_mont_384(vec384 out, const vec384 inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0); +static void exp_mont_384x(vec384x out, const vec384x inp, const byte *pow, + size_t pow_bits, const vec384 p, limb_t n0); +static void div_by_zz(limb_t val[]); +static void div_by_z(limb_t val[]); + +#ifdef __UINTPTR_TYPE__ +typedef __UINTPTR_TYPE__ uptr_t; +#else +typedef const void *uptr_t; +#endif + +#if !defined(restrict) +# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 +# if defined(__GNUC__) && __GNUC__>=2 +# define restrict __restrict__ +# elif defined(_MSC_VER) +# define restrict __restrict +# else +# define restrict +# endif +# endif +#endif + +#if !defined(inline) && !defined(__cplusplus) +# if !defined(__STDC_VERSION__) || __STDC_VERSION__<199901 +# if defined(__GNUC__) && __GNUC__>=2 +# define inline __inline__ +# elif defined(_MSC_VER) +# define inline __inline +# else +# define inline +# endif +# endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +# define launder(var) __asm__ __volatile__("" : "+r"(var)) +#else +# define launder(var) +#endif + +static inline bool_t is_bit_set(const byte *v, size_t i) +{ + bool_t ret = (v[i/8] >> (i%8)) & 1; + launder(ret); + return ret; +} + +static inline bool_t byte_is_zero(unsigned char c) +{ + limb_t ret = ((limb_t)(c) - 1) >> (LIMB_T_BITS - 1); + launder(ret); + return ret; +} + +static inline bool_t bytes_are_zero(const unsigned char *a, size_t num) +{ + unsigned char acc; + size_t i; + + for (acc = 0, i = 0; i < num; i++) + acc |= a[i]; + + return byte_is_zero(acc); +} + +static inline void vec_cswap(void *restrict a, void *restrict b, size_t num, + bool_t cbit) +{ + limb_t ai, *ap = (limb_t *)a; + limb_t bi, *bp = (limb_t *)b; + limb_t xorm, mask; + size_t i; + + launder(cbit); + mask = (limb_t)0 - cbit; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) { + xorm = ((ai = ap[i]) ^ (bi = bp[i])) & mask; + ap[i] = ai ^ xorm; + bp[i] = bi ^ xorm; + } +} + +/* ret = bit ? 
a : b */ +void vec_select_32(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_48(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_96(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_144(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_192(void *ret, const void *a, const void *b, bool_t sel_a); +void vec_select_288(void *ret, const void *a, const void *b, bool_t sel_a); +static inline void vec_select(void *ret, const void *a, const void *b, + size_t num, bool_t sel_a) +{ + launder(sel_a); +#ifndef __BLST_NO_ASM__ + if (num == 32) vec_select_32(ret, a, b, sel_a); + else if (num == 48) vec_select_48(ret, a, b, sel_a); + else if (num == 96) vec_select_96(ret, a, b, sel_a); + else if (num == 144) vec_select_144(ret, a, b, sel_a); + else if (num == 192) vec_select_192(ret, a, b, sel_a); + else if (num == 288) vec_select_288(ret, a, b, sel_a); +#else + if (0) ; +#endif + else { + limb_t bi; + volatile limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + limb_t xorm, mask = (limb_t)0 - sel_a; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) { + xorm = (ap[i] ^ (bi = bp[i])) & mask; + rp[i] = bi ^ xorm; + } + } +} + +static inline bool_t is_zero(limb_t l) +{ + limb_t ret = (~l & (l - 1)) >> (LIMB_T_BITS - 1); + launder(ret); + return ret; +} + +static inline bool_t vec_is_zero(const void *a, size_t num) +{ + const limb_t *ap = (const limb_t *)a; + limb_t acc; + size_t i; + +#ifndef __BLST_NO_ASM__ + bool_t vec_is_zero_16x(const void *a, size_t num); + if ((num & 15) == 0) + return vec_is_zero_16x(a, num); +#endif + + num /= sizeof(limb_t); + + for (acc = 0, i = 0; i < num; i++) + acc |= ap[i]; + + return is_zero(acc); +} + +static inline bool_t vec_is_equal(const void *a, const void *b, size_t num) +{ + const limb_t *ap = (const limb_t *)a; + const limb_t *bp = (const limb_t *)b; + limb_t acc; + size_t i; + +#ifndef __BLST_NO_ASM__ + bool_t vec_is_equal_16x(const void *a, const void *b, size_t num); + if ((num & 15) == 0) + return vec_is_equal_16x(a, b, num); +#endif + + num /= sizeof(limb_t); + + for (acc = 0, i = 0; i < num; i++) + acc |= ap[i] ^ bp[i]; + + return is_zero(acc); +} + +static inline void cneg_mod_384x(vec384x ret, const vec384x a, bool_t flag, + const vec384 p) +{ + cneg_mod_384(ret[0], a[0], flag, p); + cneg_mod_384(ret[1], a[1], flag, p); +} + +static inline void vec_copy(void *restrict ret, const void *a, size_t num) +{ + limb_t *rp = (limb_t *)ret; + const limb_t *ap = (const limb_t *)a; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = ap[i]; +} + +static inline void vec_zero(void *ret, size_t num) +{ + volatile limb_t *rp = (volatile limb_t *)ret; + size_t i; + + num /= sizeof(limb_t); + + for (i = 0; i < num; i++) + rp[i] = 0; + +#if defined(__GNUC__) || defined(__clang__) + __asm__ __volatile__("" : : "r"(ret) : "memory"); +#endif +} + +/* + * Some compilers get arguably overzealous(*) when passing pointer to + * multi-dimensional array [such as vec384x] as 'const' argument. + * General direction seems to be to legitimize such constification, + * so it's argued that suppressing the warning is appropriate. 
+ * + * (*) http://www.open-std.org/jtc1/sc22/wg14/www/docs/n1923.htm + */ +#if defined(__INTEL_COMPILER) +# pragma warning(disable:167) +# pragma warning(disable:556) +#elif defined(__GNUC__) && !defined(__clang__) +# pragma GCC diagnostic ignored "-Wpedantic" +#elif defined(_MSC_VER) +# pragma warning(disable: 4127 4189) +#endif + +#if !defined(__wasm__) && __STDC_HOSTED__-0 != 0 +# include +#endif + +#if defined(__GNUC__) +# ifndef alloca +# define alloca(s) __builtin_alloca(s) +# endif +#elif defined(__sun) +# include +#elif defined(_WIN32) +# include +# ifndef alloca +# define alloca(s) _alloca(s) +# endif +#endif + +#endif /* __BLS12_381_ASM_VECT_H__ */ diff --git a/crypto/build_dependency.sh b/crypto/build_dependency.sh deleted file mode 100644 index 4bfe99dbad2..00000000000 --- a/crypto/build_dependency.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -PKG_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -RELIC_DIR_NAME="relic" -RELIC_DIR="${PKG_DIR}/${RELIC_DIR_NAME}" - -# grant permissions if not existant -if [[ ! -r ${PKG_DIR} || ! -w ${PKG_DIR} || ! -x ${PKG_DIR} ]]; then - chmod -R 755 "${PKG_DIR}" -fi - -rm -rf "${RELIC_DIR}" - -# relic version or tag -relic_version="7d885d1ba34be61bf22190943a73549a910c1714" - -# clone a specific version of Relic without history if it's tagged. -# git -c http.sslVerify=true clone --branch $(relic_version) --single-branch --depth 1 https://github.com/relic-toolkit/relic.git ${RELIC_DIR_NAME} || { echo "git clone failed"; exit 1; } - -# clone all the history if the version is only defined by a commit hash. -git -c http.sslVerify=true clone --branch main --single-branch https://github.com/relic-toolkit/relic.git ${RELIC_DIR_NAME} || { echo "git clone failed"; exit 1; } - -if [ -d "${RELIC_DIR}" ] -then - ( - cd ${RELIC_DIR_NAME} || { echo "cd relic failed"; exit 1; } - git checkout $relic_version - ) - # build relic - bash relic_build.sh -else - { echo "couldn't find relic directory"; exit 1; } -fi - diff --git a/crypto/common.go b/crypto/common.go index f476de92e3f..b9e072c9930 100644 --- a/crypto/common.go +++ b/crypto/common.go @@ -8,9 +8,6 @@ import ( //revive:disable:var-naming -// the `go generate` command requires bash scripting, `cmake` and `git`. -//go:generate bash ./build_dependency.sh - const ( // Minimum targeted bits of security. // This is used as a reference but it doesn't mean all implemented primitives provide this minimum. @@ -21,9 +18,6 @@ const ( // it is still recommened that seed is generated using a secure RNG. KeyGenSeedMinLen = 2 * (securityBits / 8) KeyGenSeedMaxLen = 256 - - // max relic PRG seed length in bytes - maxRelicPrgSeed = 1 << 32 ) // TODO: update this code to make sure diff --git a/crypto/dkg.go b/crypto/dkg.go index 6e74f3d54a5..03305d016c7 100644 --- a/crypto/dkg.go +++ b/crypto/dkg.go @@ -22,7 +22,7 @@ import ( // Flow uses DKG with the value t = floor((n-1)/2) to optimize for unforgeability and robustness // of the threshold signature scheme using the output keys. // -// Private keys are scalar in Zr, where r is the group order of G1/G2. +// Private keys are scalar in Fr, where r is the group order of G1/G2. // Public keys are in G2. 
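For context on the threshold rule referenced in the dkg.go comment above (t = floor((n-1)/2)), here is a minimal standalone Go sketch — illustrative only, not part of this change, and the function names are invented for the example. It shows the value of t and the two tolerances this choice balances.

package main

import "fmt"

// dkgThreshold returns t = floor((n-1)/2) for a DKG group of size n.
// At most t shares reveal nothing about the group key (unforgeability),
// and any t+1 of the n participants suffice to sign, so up to n-t-1
// participants may be offline (robustness); this choice of t balances the two.
func dkgThreshold(n int) int {
	return (n - 1) / 2
}

func main() {
	for _, n := range []int{4, 7, 254} {
		t := dkgThreshold(n)
		fmt.Printf("n=%d: t=%d, signers needed=%d, offline tolerated=%d\n",
			n, t, t+1, n-t-1)
	}
}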
const ( @@ -34,9 +34,6 @@ const ( DKGMinSize int = MinimumThreshold + 1 // DKGMaxSize is the maximum size of a group participating in a DKG protocol DKGMaxSize int = 254 - // SeedMinLenDKG is the minumum seed length required to participate in a DKG protocol - SeedMinLenDKG = securityBits / 8 - SeedMaxLenDKG = maxRelicPrgSeed ) type DKGState interface { diff --git a/crypto/dkg_core.c b/crypto/dkg_core.c index 3a2bce01559..c8fee6917f6 100644 --- a/crypto/dkg_core.c +++ b/crypto/dkg_core.c @@ -1,127 +1,109 @@ -// +build relic - #include "dkg_include.h" - -#define N_max 250 -#define N_bits_max 8 // log(250) -#define T_max ((N_max-1)/2) - -// computes P(x) = a_0 + a_1*x + .. + a_n x^n (mod r) -// r being the order of G1 -// writes P(x) in out and P(x).g2 in y if y is non NULL -// x being a small integer -void Zr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_size, const byte x){ - bn_t image; - bn_new(image); - Zr_polynomialImage(image, y, a, a_size, x); - // exports the result - const int out_size = Fr_BYTES; - bn_write_bin(out, out_size, image); - bn_free(image); +// computes P(x) = a_0 + a_1*x + .. + a_n x^n in F_r +// where `x` is a small integer (byte) and `degree` is P's degree n. +// P(x) is written in `out` and P(x).g2 is written in `y` if `y` is non NULL. +void Fr_polynomial_image_write(byte *out, E2 *y, const Fr *a, const int degree, + const byte x) { + Fr image; + Fr_polynomial_image(&image, y, a, degree, x); + // exports the result + Fr_write_bytes(out, &image); } -// computes P(x) = a_0 + a_1*x + .. + a_n x^n (mod r) -// r being the order of G1 -// writes P(x) in out and P(x).g2 in y if y is non NULL -// x being a small integer -void Zr_polynomialImage(bn_t image, ep2_t y, const bn_st *a, const int a_size, const byte x){ - bn_t r; - bn_new(r); - g2_get_ord(r); - - // temp variables - bn_t acc; - bn_new(acc); - bn_new_size(acc, BITS_TO_DIGITS(Fr_BITS+8+1)); - bn_set_dig(acc, 0); - - for (int i=a_size-1; i >= 0; i--) { - bn_mul_dig(acc, acc, x); - // Use basic reduction as it's an 9-bits reduction - // in the worst case (|acc|<|r|+9 ) - bn_mod_basic(acc, acc, r); - bn_add(acc, acc, &a[i]); - } - // export the result - bn_mod_basic(image, acc, r); - - // compute y = P(x).g2 - if (y) g2_mul_gen(y, acc); +// computes P(x) = a_0 + a_1 * x + .. + a_n * x^n where P is in Fr[X]. +// a_i are all in Fr, `degree` is P's degree, x is a small integer less than +// `MAX_IND` (currently 255). +// The function writes P(x) in `image` and P(x).g2 in `y` if `y` is non NULL. +void Fr_polynomial_image(Fr *image, E2 *y, const Fr *a, const int degree, + const byte x) { + Fr_set_zero(image); + // convert `x` to Montgomery form + Fr xR; + Fr_set_limb(&xR, (limb_t)x); + Fr_to_montg(&xR, &xR); - bn_free(acc) - bn_free(r); + for (int i = degree; i >= 0; i--) { + Fr_mul_montg(image, image, &xR); + Fr_add(image, image, &a[i]); // image is in normal form + } + // compute y = P(x).g2 + if (y) { + G2_mult_gen(y, image); + } } // computes Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 -// and stores the point in y -// r is the order of G2 -static void G2_polynomialImage(ep2_t y, const ep2_st* A, const int len_A, - const byte x, const bn_t r){ - - bn_t bn_x; - bn_new(bn_x); - ep2_set_infty(y); - bn_set_dig(bn_x, x); - for (int i = len_A-1; i >= 0 ; i--) { - ep2_mul_lwnaf(y, y, bn_x); - ep2_add_projc(y, y, (ep2_st*)&A[i]); - } - - ep2_norm(y, y); // not necessary but left here to optimize the - // multiple pairing computations with the same public key - bn_free(bn_x); +// and stores the point in y. 
+// - A_i being G2 points +// - x being a small scalar (less than `MAX_IND`) +static void E2_polynomial_image(E2 *y, const E2 *A, const int degree, + const byte x) { + E2_set_infty(y); + for (int i = degree; i >= 0; i--) { + E2_mult_small_expo(y, y, x); + E2_add(y, y, &A[i]); + } } -// compute the participants public keys from the verification vector -// y[i] = Q(i+1) for all participants i, with: -// Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 -void G2_polynomialImages(ep2_st *y, const int len_y, const ep2_st* A, const int len_A) { - // order r - bn_t r; - bn_new(r); - g2_get_ord(r); - for (byte i=0; i 0 { + // genarate a_i on F_r, for 0 0 { - for i := 1; i < s.threshold; i++ { - C.bn_new_wrapper((*C.bn_st)(&s.a[i])) - randZr(&s.a[i]) - generatorScalarMultG2(&s.vA[i], &s.a[i]) - } - // non-zero a[t] to enforce the polynomial degree - randZrStar(&s.a[s.threshold]) - generatorScalarMultG2(&s.vA[s.threshold], &s.a[s.threshold]) + return fmt.Errorf("failed to generate random polynomial: %w", err) + } + + // compute the verification vector A_i = g2^a_i + s.vA = make([]pointE2, s.threshold+1) + for i := 0; i <= s.threshold; i++ { + generatorScalarMultG2(&s.vA[i], &s.a[i]) } // compute the shares @@ -287,17 +319,17 @@ func (s *feldmanVSSstate) generateShares(seed []byte) error { // the dealer's own share if i-1 == s.myIndex { xdata := make([]byte, shareSize) - zrPolynomialImage(xdata, s.a, i, &s.y[i-1]) - C.bn_read_bin((*C.bn_st)(&s.x), - (*C.uchar)(&xdata[0]), - PrKeyLenBLSBLS12381, - ) + frPolynomialImage(xdata, s.a, i, &s.y[i-1]) + err := readScalarFrStar(&s.x, xdata) + if err != nil { + return fmt.Errorf("unexpected error when generating the dealer's own share: %w", err) + } continue } // the-other-participant shares data := make([]byte, shareSize+1) data[0] = byte(feldmanVSSShare) - zrPolynomialImage(data[1:], s.a, i, &s.y[i-1]) + frPolynomialImage(data[1:], s.a, i, &s.y[i-1]) s.processor.PrivateSend(int(i-1), data) } // broadcast the vector @@ -350,13 +382,11 @@ func (s *feldmanVSSstate) receiveShare(origin index, data []byte) { } // read the participant private share - if C.bn_read_Zr_bin((*C.bn_st)(&s.x), - (*C.uchar)(&data[0]), - PrKeyLenBLSBLS12381, - ) != valid { + err := readScalarFrStar(&s.x, data) + if err != nil { s.validKey = false s.processor.FlagMisbehavior(int(origin), - fmt.Sprintf("invalid share value %x", data)) + fmt.Sprintf("invalid share value %x: %s", data, err)) return } @@ -365,9 +395,9 @@ func (s *feldmanVSSstate) receiveShare(origin index, data []byte) { } } -// receives the public vector from the +// receives the public vector from the dealer func (s *feldmanVSSstate) receiveVerifVector(origin index, data []byte) { - // only accept the verification vector from the . + // only accept the verification vector from the dealer. if origin != s.dealerIndex { return } @@ -387,7 +417,7 @@ func (s *feldmanVSSstate) receiveVerifVector(origin index, data []byte) { return } // read the verification vector - s.vA = make([]pointG2, s.threshold+1) + s.vA = make([]pointE2, s.threshold+1) err := readVerifVector(s.vA, data) if err != nil { s.vAReceived = true @@ -396,7 +426,7 @@ func (s *feldmanVSSstate) receiveVerifVector(origin index, data []byte) { fmt.Sprintf("reading the verification vector failed: %s", err)) } - s.y = make([]pointG2, s.size) + s.y = make([]pointE2, s.size) s.computePublicKeys() s.vAReceived = true @@ -405,44 +435,46 @@ func (s *feldmanVSSstate) receiveVerifVector(origin index, data []byte) { } } -// zrPolynomialImage computes P(x) = a_0 + a_1*x + .. 
+ a_n*x^n (mod r) in Z/Zr +// frPolynomialImage computes P(x) = a_0 + a_1*x + .. + a_n*x^n (mod r) in Fr[X] // r being the order of G1 // P(x) is written in dest, while g2^P(x) is written in y // x being a small integer -func zrPolynomialImage(dest []byte, a []scalar, x index, y *pointG2) { - C.Zr_polynomialImage_export((*C.uchar)(&dest[0]), - (*C.ep2_st)(y), - (*C.bn_st)(&a[0]), (C.int)(len(a)), +func frPolynomialImage(dest []byte, a []scalar, x index, y *pointE2) { + C.Fr_polynomial_image_write((*C.uchar)(&dest[0]), + (*C.E2)(y), + (*C.Fr)(&a[0]), (C.int)(len(a)-1), (C.uint8_t)(x), ) } // writeVerifVector exports a vector A into an array of bytes // assuming the array length matches the vector length -func writeVerifVector(dest []byte, A []pointG2) { - C.ep2_vector_write_bin((*C.uchar)(&dest[0]), - (*C.ep2_st)(&A[0]), +func writeVerifVector(dest []byte, A []pointE2) { + C.E2_vector_write_bytes((*C.uchar)(&dest[0]), + (*C.E2)(&A[0]), (C.int)(len(A)), ) } -// readVerifVector imports A vector from an array of bytes, -// assuming the slice length matches the vector length -func readVerifVector(A []pointG2, src []byte) error { - read := C.ep2_vector_read_bin((*C.ep2_st)(&A[0]), +// readVerifVector imports A vector (G2 points) from an array of bytes, +// assuming the slice length matches the vector length. +func readVerifVector(A []pointE2, src []byte) error { + read := C.G2_vector_read_bytes( + (*C.E2)(&A[0]), (*C.uchar)(&src[0]), (C.int)(len(A))) if read == valid { return nil } // invalid A vector - return invalidInputsErrorf("the verifcation vector does not serialize G2 points") + return invalidInputsErrorf("the verification vector does not serialize valid G2 points: error code %d", read) } func (s *feldmanVSSstate) verifyShare() bool { // check y[current] == x.G2 - return C.verifyshare((*C.bn_st)(&s.x), - (*C.ep2_st)(&s.y[s.myIndex])) == 1 + return bool(C.G2_check_log( + (*C.Fr)(&s.x), + (*C.E2)(&s.y[s.myIndex]))) } // computePublicKeys extracts the participants public keys from the verification vector @@ -450,8 +482,8 @@ func (s *feldmanVSSstate) verifyShare() bool { // // Q(x) = A_0 + A_1*x + ... + A_n*x^n in G2 func (s *feldmanVSSstate) computePublicKeys() { - C.G2_polynomialImages( - (*C.ep2_st)(&s.y[0]), (C.int)(len(s.y)), - (*C.ep2_st)(&s.vA[0]), (C.int)(len(s.vA)), + C.E2_polynomial_images( + (*C.E2)(&s.y[0]), (C.int)(len(s.y)), + (*C.E2)(&s.vA[0]), (C.int)(len(s.vA)-1), ) } diff --git a/crypto/dkg_feldmanvssq.go b/crypto/dkg_feldmanvssq.go index 335ce6fc86d..c3aca992ee2 100644 --- a/crypto/dkg_feldmanvssq.go +++ b/crypto/dkg_feldmanvssq.go @@ -1,9 +1,5 @@ -//go:build relic -// +build relic - package crypto -// #cgo CFLAGS: -g -Wall -std=c99 // #include "dkg_include.h" import "C" @@ -27,7 +23,7 @@ import ( // a complaint answer. The protocol ends with all honest participants // reaching a consensus about the dealer qualification/disqualification. -// Private keys are scalar in Zr, where r is the group order of G1/G2 +// Private keys are scalar in Fr, where r is the group order of G1/G2 // Public keys are in G2. 
// feldman VSS protocol, with complaint mechanism, implements DKGState @@ -162,7 +158,7 @@ func (s *feldmanVSSQualState) End() (PrivateKey, PublicKey, []PublicKey, error) if c.received && !c.answerReceived { s.disqualified = true s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("complaint from %d was not answered", + fmt.Sprintf("complaint from (%d) was not answered", complainer)) break } @@ -204,9 +200,9 @@ func (s *feldmanVSSQualState) End() (PrivateKey, PublicKey, []PublicKey, error) return x, Y, y, nil } -const ( +var ( complaintSize = 1 - complaintAnswerSize = 1 + PrKeyLenBLSBLS12381 + complaintAnswerSize = 1 + frBytesLen ) // HandleBroadcastMsg processes a new broadcasted message received by the current participant. @@ -402,19 +398,17 @@ func (s *feldmanVSSQualState) receiveShare(origin index, data []byte) { return } // read the participant private share - if C.bn_read_Zr_bin((*C.bn_st)(&s.x), - (*C.uchar)(&data[0]), - PrKeyLenBLSBLS12381, - ) != valid { + err := readScalarFrStar(&s.x, data) + if err != nil { s.buildAndBroadcastComplaint() s.processor.FlagMisbehavior(int(origin), - fmt.Sprintf("invalid share value %x", data)) + fmt.Sprintf("invalid share value %x: %s", data, err)) return } if s.vAReceived { if !s.verifyShare() { - // otherwise, build a complaint + // build a complaint s.buildAndBroadcastComplaint() } } @@ -448,7 +442,7 @@ func (s *feldmanVSSQualState) receiveVerifVector(origin index, data []byte) { return } // read the verification vector - s.vA = make([]pointG2, s.threshold+1) + s.vA = make([]pointE2, s.threshold+1) err := readVerifVector(s.vA, data) if err != nil { s.disqualified = true @@ -457,7 +451,8 @@ func (s *feldmanVSSQualState) receiveVerifVector(origin index, data []byte) { return } - s.y = make([]pointG2, s.size) + s.y = make([]pointE2, s.size) + // compute all public keys s.computePublicKeys() // check the (already) registered complaints @@ -466,8 +461,8 @@ func (s *feldmanVSSQualState) receiveVerifVector(origin index, data []byte) { if s.checkComplaint(complainer, c) { s.disqualified = true s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("verification vector received: a complaint answer to %d is invalid", - complainer)) + fmt.Sprintf("verification vector received: a complaint answer to (%d) is invalid, answer is %s, computed key is %s", + complainer, &c.answer, &s.y[complainer])) return } } @@ -483,6 +478,14 @@ func (s *feldmanVSSQualState) receiveVerifVector(origin index, data []byte) { // build a complaint against the dealer, add it to the local // complaint map and broadcast it func (s *feldmanVSSQualState) buildAndBroadcastComplaint() { + var logMsg string + if s.vAReceived && s.xReceived { + logMsg = fmt.Sprintf("building a complaint, share is %s, computed public key is %s", + &s.x, &s.y[s.myIndex]) + } else { + logMsg = "building a complaint" + } + s.processor.FlagMisbehavior(int(s.dealerIndex), logMsg) s.complaints[s.myIndex] = &complaint{ received: true, answerReceived: false, @@ -497,7 +500,7 @@ func (s *feldmanVSSQualState) buildAndBroadcastComplaintAnswer(complainee index) data := make([]byte, complaintAnswerSize+1) data[0] = byte(feldmanVSSComplaintAnswer) data[1] = byte(complainee) - zrPolynomialImage(data[2:], s.a, complainee+1, nil) + frPolynomialImage(data[2:], s.a, complainee+1, nil) s.complaints[complainee].answerReceived = true s.processor.Broadcast(data) } @@ -507,8 +510,10 @@ func (s *feldmanVSSQualState) buildAndBroadcastComplaintAnswer(complainee index) // - true if the complaint answer is not correct func (s 
*feldmanVSSQualState) checkComplaint(complainer index, c *complaint) bool { // check y[complainer] == share.G2 - return C.verifyshare((*C.bn_st)(&c.answer), - (*C.ep2_st)(&s.y[complainer])) == 0 + isLog := C.G2_check_log( + (*C.Fr)(&c.answer), + (*C.E2)(&s.y[complainer])) + return !bool(isLog) } // data = |complainee| @@ -582,8 +587,8 @@ func (s *feldmanVSSQualState) receiveComplaint(origin index, data []byte) { s.disqualified = s.checkComplaint(origin, c) if s.disqualified { s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("complaint received: complaint answer to %d is invalid", - origin)) + fmt.Sprintf("complaint received: answer to (%d) is invalid, answer is %s, computed public key is %s", + origin, &c.answer, &s.y[origin])) } return } @@ -624,14 +629,11 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) } // read the complainer private share - C.bn_new_wrapper((*C.bn_st)(&s.complaints[complainer].answer)) - if C.bn_read_Zr_bin((*C.bn_st)(&s.complaints[complainer].answer), - (*C.uchar)(&data[1]), - PrKeyLenBLSBLS12381, - ) != valid { + err := readScalarFrStar(&s.complaints[complainer].answer, data[1:]) + if err != nil { s.disqualified = true s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("invalid complaint answer value %x", data)) + fmt.Sprintf("invalid complaint answer value %x: %s", data, err)) return } return @@ -648,22 +650,19 @@ func (s *feldmanVSSQualState) receiveComplaintAnswer(origin index, data []byte) // flag check is a sanity check if c.received { // read the complainer private share - C.bn_new_wrapper((*C.bn_st)(&c.answer)) - if C.bn_read_Zr_bin((*C.bn_st)(&c.answer), - (*C.uchar)(&data[1]), - PrKeyLenBLSBLS12381, - ) != valid { + err := readScalarFrStar(&c.answer, data[1:]) + if err != nil { s.disqualified = true s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("invalid complaint answer value %x", data)) + fmt.Sprintf("invalid complaint answer value %x: %s", data, err)) return } if s.vAReceived { s.disqualified = s.checkComplaint(complainer, c) if s.disqualified { s.processor.Disqualify(int(s.dealerIndex), - fmt.Sprintf("complaint answer received: complaint answer to %d is invalid", - complainer)) + fmt.Sprintf("complaint answer received: answer to (%d) is invalid, answer is %s, computed key is %s", + complainer, &c.answer, &s.y[complainer])) } } diff --git a/crypto/dkg_include.h b/crypto/dkg_include.h index 5e518300071..02fb9a363f4 100644 --- a/crypto/dkg_include.h +++ b/crypto/dkg_include.h @@ -1,19 +1,15 @@ -// +build relic - -#ifndef _REL_DKG_INCLUDE_H -#define _REL_DKG_INCLUDE_H +#ifndef _DKG_INCLUDE_H +#define _DKG_INCLUDE_H #include "bls12381_utils.h" -// the highest index of a DKG participant -#define MAX_IND 255 -#define MAX_IND_BITS 8 - -void Zr_polynomialImage_export(byte* out, ep2_t y, const bn_st* a, const int a_size, const byte x); -void Zr_polynomialImage(bn_t out, ep2_t y, const bn_st* a, const int a_size, const byte x); -void G2_polynomialImages(ep2_st* y, const int len_y, const ep2_st* A, const int len_A); -void ep2_vector_write_bin(byte* out, const ep2_st* A, const int len); -int ep2_vector_read_bin(ep2_st* A, const byte* src, const int len); -int verifyshare(const bn_t x, const ep2_t y); +void Fr_polynomial_image_write(byte *out, E2 *y, const Fr *a, const int deg, + const byte x); +void Fr_polynomial_image(Fr *out, E2 *y, const Fr *a, const int deg, + const byte x); +void E2_polynomial_images(E2 *y, const int len_y, const E2 *A, const int deg); +void E2_vector_write_bytes(byte *out, const E2 *A, 
const int len); +ERROR G2_vector_read_bytes(E2 *A, const byte *src, const int len); +bool G2_check_log(const Fr *x, const E2 *y); #endif diff --git a/crypto/dkg_jointfeldman.go b/crypto/dkg_jointfeldman.go index 7b63f88e810..115730e33d9 100644 --- a/crypto/dkg_jointfeldman.go +++ b/crypto/dkg_jointfeldman.go @@ -1,10 +1,5 @@ -//go:build relic -// +build relic - package crypto -// #cgo CFLAGS: -g -Wall -std=c99 -// #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "dkg_include.h" import "C" @@ -34,7 +29,7 @@ import ( // from the protocol, and the overall key is taking into account // all chunks from qualified dealers. -// Private keys are scalar in Zr, where r is the group order of G1/G2 +// Private keys are scalar in Fr, where r is the group order of G1/G2 // Public keys are in G2. // Joint Feldman protocol, with complaint mechanism, implements DKGState @@ -45,11 +40,11 @@ type JointFeldmanState struct { // feldmanVSSQualState parallel states fvss []feldmanVSSQualState // is the group public key - jointPublicKey pointG2 + jointPublicKey pointE2 // Private share of the current participant jointx scalar // Public keys of the group participants, the vector size is (n) - jointy []pointG2 + jointy []pointE2 } // NewJointFeldman creates a new instance of a Joint Feldman protocol. @@ -194,7 +189,7 @@ func (s *JointFeldmanState) End() (PrivateKey, PublicKey, []PublicKey, error) { if disqualifiedTotal > s.threshold || s.size-disqualifiedTotal <= s.threshold { return nil, nil, nil, dkgFailureErrorf( - "Joint-Feldman failed because the diqualified participants number is high: %d disqualified, threshold is %d, size is %d", + "Joint-Feldman failed because the disqualified participants number is high: %d disqualified, threshold is %d, size is %d", disqualifiedTotal, s.threshold, s.size) } @@ -298,34 +293,33 @@ func (s *JointFeldmanState) ForceDisqualify(participant int) error { } // sum up the 3 type of keys from all qualified dealers to end the protocol -func (s *JointFeldmanState) sumUpQualifiedKeys(qualified int) (*scalar, *pointG2, []pointG2) { +func (s *JointFeldmanState) sumUpQualifiedKeys(qualified int) (*scalar, *pointE2, []pointE2) { qualifiedx, qualifiedPubKey, qualifiedy := s.getQualifiedKeys(qualified) // sum up x var jointx scalar - C.bn_new_wrapper((*C.bn_st)(&jointx)) - C.bn_sum_vector((*C.bn_st)(&jointx), (*C.bn_st)(&qualifiedx[0]), + C.Fr_sum_vector((*C.Fr)(&jointx), (*C.Fr)(&qualifiedx[0]), (C.int)(qualified)) // sum up Y - var jointPublicKey pointG2 - C.ep2_sum_vector((*C.ep2_st)(&jointPublicKey), - (*C.ep2_st)(&qualifiedPubKey[0]), (C.int)(qualified)) + var jointPublicKey pointE2 + C.E2_sum_vector_to_affine((*C.E2)(&jointPublicKey), + (*C.E2)(&qualifiedPubKey[0]), (C.int)(qualified)) // sum up []y - jointy := make([]pointG2, s.size) + jointy := make([]pointE2, s.size) for i := 0; i < s.size; i++ { - C.ep2_sum_vector((*C.ep2_st)(&jointy[i]), - (*C.ep2_st)(&qualifiedy[i][0]), (C.int)(qualified)) + C.E2_sum_vector_to_affine((*C.E2)(&jointy[i]), + (*C.E2)(&qualifiedy[i][0]), (C.int)(qualified)) } return &jointx, &jointPublicKey, jointy } // get the 3 type of keys from all qualified dealers -func (s *JointFeldmanState) getQualifiedKeys(qualified int) ([]scalar, []pointG2, [][]pointG2) { +func (s *JointFeldmanState) getQualifiedKeys(qualified int) ([]scalar, []pointE2, [][]pointE2) { qualifiedx := make([]scalar, 0, qualified) - qualifiedPubKey := make([]pointG2, 0, qualified) - qualifiedy := make([][]pointG2, s.size) + qualifiedPubKey := make([]pointE2, 0, qualified) + 
qualifiedy := make([][]pointE2, s.size) for i := 0; i < s.size; i++ { - qualifiedy[i] = make([]pointG2, 0, qualified) + qualifiedy[i] = make([]pointE2, 0, qualified) } for i := 0; i < s.size; i++ { diff --git a/crypto/dkg_test.go b/crypto/dkg_test.go index 3cc1d172cca..2bd4dc51fa0 100644 --- a/crypto/dkg_test.go +++ b/crypto/dkg_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto import ( @@ -292,7 +289,7 @@ func dkgCommonTest(t *testing.T, dkg int, n int, threshold int, test testCase) { // start DKG in all participants // start listening on the channels - seed := make([]byte, SeedMinLenDKG) + seed := make([]byte, KeyGenSeedMinLen) sync.Add(n) log.Info("DKG protocol starts") @@ -366,7 +363,6 @@ func dkgCommonTest(t *testing.T, dkg int, n int, threshold int, test testCase) { "2 group public keys are mismatching") } } - } // time after which a silent channel causes switching to the next dkg phase @@ -591,13 +587,12 @@ func (proc *testDKGProcessor) invalidShareSend(dest int, data []byte) { } } else { - gt.Logf("turns out to be a honest send\n%x\n", data) + gt.Logf("%d to %d: turns out to be a honest send\n%x\n", data, proc.current, dest) } // honest send case: this is the only message sent // malicious send case: this is a second correct send, to test the second message gets ignored // by the receiver (sender has been tagged malicious after the first send) proc.chans[dest] <- originalMsg - } // This is a testing function @@ -771,7 +766,7 @@ func TestDKGTransitionErrors(t *testing.T) { threshold := 3 myIndex := 0 dealer := 1 - seed := make([]byte, SeedMinLenDKG) + seed := make([]byte, KeyGenSeedMinLen) t.Run("feldman VSS", func(t *testing.T) { state, err := NewFeldmanVSS(n, threshold, myIndex, dummyTestDKGProcessor{}, dealer) diff --git a/crypto/ecdsa.go b/crypto/ecdsa.go index dca3604570a..b09d3d5922f 100644 --- a/crypto/ecdsa.go +++ b/crypto/ecdsa.go @@ -321,7 +321,7 @@ func (a *ecdsaAlgo) decodePublicKeyCompressed(pkBytes []byte) (PublicKey, error) return &pubKeyECDSA{a, goPubKey}, nil } -// prKeyECDSA is the private key of ECDSA, it implements the generic PrivateKey +// prKeyECDSA is the private key of ECDSA, it implements the interface PrivateKey type prKeyECDSA struct { // the signature algo alg *ecdsaAlgo @@ -331,6 +331,8 @@ type prKeyECDSA struct { pubKey *pubKeyECDSA } +var _ PrivateKey = (*prKeyECDSA)(nil) + // Algorithm returns the algo related to the private key func (sk *prKeyECDSA) Algorithm() SigningAlgorithm { return sk.alg.algo @@ -399,6 +401,8 @@ type pubKeyECDSA struct { goPubKey *ecdsa.PublicKey } +var _ PublicKey = (*pubKeyECDSA)(nil) + // Algorithm returns the the algo related to the private key func (pk *pubKeyECDSA) Algorithm() SigningAlgorithm { return pk.alg.algo diff --git a/crypto/ecdsa_test.go b/crypto/ecdsa_test.go index cf9a137e1e7..ed005a11e07 100644 --- a/crypto/ecdsa_test.go +++ b/crypto/ecdsa_test.go @@ -1,6 +1,3 @@ -//go:build !relic -// +build !relic - package crypto import ( @@ -160,7 +157,7 @@ func TestECDSAUtils(t *testing.T) { // TestScalarMult is a unit test of the scalar multiplication // This is only a sanity check meant to make sure the curve implemented // is checked against an independent test vector -func TestScalarMult(t *testing.T) { +func TestScalarMultP256_secp256k1(t *testing.T) { secp256k1 := secp256k1Instance.curve p256 := p256Instance.curve genericMultTests := []struct { diff --git a/crypto/go.mod b/crypto/go.mod index 9895e1c35db..d31f36cf023 100644 --- a/crypto/go.mod +++ b/crypto/go.mod @@ -6,7 +6,6 @@ require ( 
github.com/btcsuite/btcd/btcec/v2 v2.2.1 github.com/sirupsen/logrus v1.4.2 github.com/stretchr/testify v1.8.0 - github.com/supranational/blst v0.3.10 golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d gonum.org/v1/gonum v0.6.1 pgregory.net/rapid v0.4.7 diff --git a/crypto/go.sum b/crypto/go.sum index 19a05d05d6d..820bb87a41c 100644 --- a/crypto/go.sum +++ b/crypto/go.sum @@ -28,8 +28,6 @@ github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXf github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/supranational/blst v0.3.10 h1:CMciDZ/h4pXDDXQASe8ZGTNKUiVNxVVA5hpci2Uuhuk= -github.com/supranational/blst v0.3.10/go.mod h1:jZJtfjgudtNl4en1tzwPIV3KjUnQUvG3/j+w+fVonLw= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d h1:sK3txAijHtOK88l68nt020reeT1ZdKLIYetKl95FzVY= golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= diff --git a/crypto/internal/blst/blst.go b/crypto/internal/blst/blst.go new file mode 100644 index 00000000000..c890f55e367 --- /dev/null +++ b/crypto/internal/blst/blst.go @@ -0,0 +1,3434 @@ +//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +// DO NOT EDIT THIS FILE!! +// The file is generated from *.tgo by generate.py +//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +package blst + +// #cgo CFLAGS: -I${SRCDIR}/.. 
-I${SRCDIR}/../../build -I${SRCDIR}/../../src -D__BLST_CGO__ -fno-builtin-memcpy -fno-builtin-memset +// #cgo amd64 CFLAGS: -D__ADX__ -mno-avx +// #cgo mips64 mips64le ppc64 ppc64le riscv64 s390x CFLAGS: -D__BLST_NO_ASM__ +// #include "blst.h" +// +// #if defined(__x86_64__) && (defined(__unix__) || defined(__APPLE__)) +// # include +// # include +// static void handler(int signum) +// { ssize_t n = write(2, "Caught SIGILL in blst_cgo_init, " +// "consult /bindings/go/README.md.\n", 70); +// _exit(128+SIGILL); +// (void)n; +// } +// __attribute__((constructor)) static void blst_cgo_init() +// { blst_fp temp = { 0 }; +// struct sigaction act = { handler }, oact; +// sigaction(SIGILL, &act, &oact); +// blst_fp_sqr(&temp, &temp); +// sigaction(SIGILL, &oact, NULL); +// } +// #endif +// +// static size_t go_pairing_sizeof(size_t DST_len) +// { return (blst_pairing_sizeof() + DST_len + sizeof(blst_pairing) - 1) / +// sizeof(blst_pairing); +// } +// static void go_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, +// const byte *DST, size_t DST_len) +// { if (DST != NULL) { +// byte *dst = (byte*)new_ctx + blst_pairing_sizeof(); +// for(size_t i = 0; i < DST_len; i++) dst[i] = DST[i]; +// DST = dst; +// } +// blst_pairing_init(new_ctx, hash_or_encode, DST, DST_len); +// } +// static void go_pairing_as_fp12(blst_fp12 *pt, blst_pairing *ctx) +// { *pt = *blst_pairing_as_fp12(ctx); } +// +// static void go_p1slice_to_affine(blst_p1_affine dst[], +// const blst_p1 points[], size_t npoints) +// { const blst_p1 *ppoints[2] = { points, NULL }; +// blst_p1s_to_affine(dst, ppoints, npoints); +// } +// static void go_p1slice_add(blst_p1 *dst, const blst_p1_affine points[], +// size_t npoints) +// { const blst_p1_affine *ppoints[2] = { points, NULL }; +// blst_p1s_add(dst, ppoints, npoints); +// } +// static void go_p2slice_to_affine(blst_p2_affine dst[], +// const blst_p2 points[], size_t npoints) +// { const blst_p2 *ppoints[2] = { points, NULL }; +// blst_p2s_to_affine(dst, ppoints, npoints); +// } +// static void go_p2slice_add(blst_p2 *dst, const blst_p2_affine points[], +// size_t npoints) +// { const blst_p2_affine *ppoints[2] = { points, NULL }; +// blst_p2s_add(dst, ppoints, npoints); +// } +// +// static void go_p1_mult_n_acc(blst_p1 *acc, const blst_fp *x, bool affine, +// const byte *scalar, size_t nbits) +// { blst_p1 m[1]; +// const void *p = x; +// if (p == NULL) +// p = blst_p1_generator(); +// else if (affine) +// blst_p1_from_affine(m, p), p = m; +// blst_p1_mult(m, p, scalar, nbits); +// blst_p1_add_or_double(acc, acc, m); +// } +// static void go_p2_mult_n_acc(blst_p2 *acc, const blst_fp2 *x, bool affine, +// const byte *scalar, size_t nbits) +// { blst_p2 m[1]; +// const void *p = x; +// if (p == NULL) +// p = blst_p2_generator(); +// else if (affine) +// blst_p2_from_affine(m, p), p = m; +// blst_p2_mult(m, p, scalar, nbits); +// blst_p2_add_or_double(acc, acc, m); +// } +// +// static void go_p1_sub_assign(blst_p1 *a, const blst_fp *x, bool affine) +// { blst_p1 minus_b; +// if (affine) +// blst_p1_from_affine(&minus_b, (const blst_p1_affine*)x); +// else +// minus_b = *(const blst_p1*)x; +// blst_p1_cneg(&minus_b, 1); +// blst_p1_add_or_double(a, a, &minus_b); +// } +// +// static void go_p2_sub_assign(blst_p2 *a, const blst_fp2 *x, bool affine) +// { blst_p2 minus_b; +// if (affine) +// blst_p2_from_affine(&minus_b, (const blst_p2_affine*)x); +// else +// minus_b = *(const blst_p2*)x; +// blst_p2_cneg(&minus_b, 1); +// blst_p2_add_or_double(a, a, &minus_b); +// } +// +// 
static bool go_scalar_from_bendian(blst_scalar *ret, const byte *in) +// { blst_scalar_from_bendian(ret, in); +// return blst_sk_check(ret); +// } +// static bool go_hash_to_scalar(blst_scalar *ret, +// const byte *msg, size_t msg_len, +// const byte *DST, size_t DST_len) +// { byte elem[48]; +// blst_expand_message_xmd(elem, sizeof(elem), msg, msg_len, DST, DST_len); +// return blst_scalar_from_be_bytes(ret, elem, sizeof(elem)); +// } +// static void go_miller_loop_n(blst_fp12 *dst, const blst_p2_affine Q[], +// const blst_p1_affine P[], +// size_t npoints, bool acc) +// { const blst_p2_affine *Qs[2] = { Q, NULL }; +// const blst_p1_affine *Ps[2] = { P, NULL }; +// if (acc) { +// blst_fp12 tmp; +// blst_miller_loop_n(&tmp, Qs, Ps, npoints); +// blst_fp12_mul(dst, dst, &tmp); +// } else { +// blst_miller_loop_n(dst, Qs, Ps, npoints); +// } +// } +// static void go_fp12slice_mul(blst_fp12 *dst, const blst_fp12 in[], size_t n) +// { size_t i; +// blst_fp12_mul(dst, &in[0], &in[1]); +// for (i = 2; i < n; i++) +// blst_fp12_mul(dst, dst, &in[i]); +// } +import "C" +import ( + "fmt" + "math/bits" + "runtime" + "sync" + "sync/atomic" +) + +const BLST_SCALAR_BYTES = 256 / 8 +const BLST_FP_BYTES = 384 / 8 +const BLST_P1_COMPRESS_BYTES = BLST_FP_BYTES +const BLST_P1_SERIALIZE_BYTES = BLST_FP_BYTES * 2 +const BLST_P2_COMPRESS_BYTES = BLST_FP_BYTES * 2 +const BLST_P2_SERIALIZE_BYTES = BLST_FP_BYTES * 4 + +type Scalar = C.blst_scalar +type Fp = C.blst_fp +type Fp2 = C.blst_fp2 +type Fp6 = C.blst_fp6 +type Fp12 = C.blst_fp12 +type P1 = C.blst_p1 +type P2 = C.blst_p2 +type P1Affine = C.blst_p1_affine +type P2Affine = C.blst_p2_affine +type Message = []byte +type Pairing = []C.blst_pairing +type SecretKey = Scalar +type P1s []P1 +type P2s []P2 +type P1Affines []P1Affine +type P2Affines []P2Affine + +// +// Configuration +// + +var maxProcs = initMaxProcs() + +func initMaxProcs() int { + maxProcs := runtime.GOMAXPROCS(0) + var version float32 + _, err := fmt.Sscanf(runtime.Version(), "go%f", &version) + if err != nil || version < 1.14 { + // be cooperative and leave one processor for the application + maxProcs -= 1 + } + if maxProcs <= 0 { + maxProcs = 1 + } + return maxProcs +} + +func SetMaxProcs(max int) { + if max <= 0 { + max = 1 + } + maxProcs = max +} + +// Secret key +func (sk *SecretKey) Zeroize() { + var zero SecretKey + *sk = zero +} + +func KeyGen(ikm []byte, optional ...[]byte) *SecretKey { + var sk SecretKey + var info []byte + var infoP *C.byte + if len(optional) > 0 { + info = optional[0] + if len(info) > 0 { + infoP = (*C.byte)(&info[0]) + } + } + if len(ikm) < 32 { + return nil + } + C.blst_keygen(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + infoP, C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func KeyGenV3(ikm []byte, optional ...[]byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + var info []byte + var infoP *C.byte + if len(optional) > 0 { + info = optional[0] + if len(info) > 0 { + infoP = (*C.byte)(&info[0]) + } + } + C.blst_keygen_v3(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + infoP, C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... 
+ runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func KeyGenV45(ikm []byte, salt []byte, optional ...[]byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + var info []byte + var infoP *C.byte + if len(optional) > 0 { + info = optional[0] + if len(info) > 0 { + infoP = (*C.byte)(&info[0]) + } + } + C.blst_keygen_v4_5(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + (*C.byte)(&salt[0]), C.size_t(len(salt)), + infoP, C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func KeyGenV5(ikm []byte, salt []byte, optional ...[]byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + var info []byte + var infoP *C.byte + if len(optional) > 0 { + info = optional[0] + if len(info) > 0 { + infoP = (*C.byte)(&info[0]) + } + } + C.blst_keygen_v5(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm)), + (*C.byte)(&salt[0]), C.size_t(len(salt)), + infoP, C.size_t(len(info))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func DeriveMasterEip2333(ikm []byte) *SecretKey { + if len(ikm) < 32 { + return nil + } + var sk SecretKey + C.blst_derive_master_eip2333(&sk, (*C.byte)(&ikm[0]), C.size_t(len(ikm))) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... + runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +func (master *SecretKey) DeriveChildEip2333(child_index uint32) *SecretKey { + var sk SecretKey + C.blst_derive_child_eip2333(&sk, master, C.uint(child_index)) + // Postponing secret key zeroing till garbage collection can be too + // late to be effective, but every little bit helps... 
+ runtime.SetFinalizer(&sk, func(sk *SecretKey) { sk.Zeroize() }) + return &sk +} + +// Pairing +func PairingCtx(hash_or_encode bool, DST []byte) Pairing { + DST_len := C.size_t(len(DST)) + ctx := make([]C.blst_pairing, int(C.go_pairing_sizeof(DST_len))) + var uDST *C.byte + if DST_len > 0 { + uDST = (*C.byte)(&DST[0]) + } + C.go_pairing_init(&ctx[0], C.bool(hash_or_encode), uDST, DST_len) + return ctx +} + +func PairingCommit(ctx Pairing) { + C.blst_pairing_commit(&ctx[0]) +} + +func PairingMerge(ctx Pairing, ctx1 Pairing) int { + r := C.blst_pairing_merge(&ctx[0], &ctx1[0]) + return int(r) +} + +func PairingFinalVerify(ctx Pairing, optional ...*Fp12) bool { + var gtsig *Fp12 = nil + if len(optional) > 0 { + gtsig = optional[0] + } + return bool(C.blst_pairing_finalverify(&ctx[0], gtsig)) +} + +func PairingRawAggregate(ctx Pairing, q *P2Affine, p *P1Affine) { + C.blst_pairing_raw_aggregate(&ctx[0], q, p) +} + +func PairingAsFp12(ctx Pairing) *Fp12 { + var pt Fp12 + C.go_pairing_as_fp12(&pt, &ctx[0]) + return &pt +} + +func Fp12One() Fp12 { + return *C.blst_fp12_one() +} + +func Fp12FinalVerify(pt1 *Fp12, pt2 *Fp12) bool { + return bool(C.blst_fp12_finalverify(pt1, pt2)) +} + +func Fp12MillerLoop(q *P2Affine, p *P1Affine) *Fp12 { + var pt Fp12 + C.blst_miller_loop(&pt, q, p) + return &pt +} + +func Fp12MillerLoopN(qs []P2Affine, ps []P1Affine) *Fp12 { + if len(qs) != len(ps) || len(qs) == 0 { + panic("inputs' lengths mismatch") + } + + nElems := uint32(len(qs)) + nThreads := uint32(maxProcs) + + if nThreads == 1 || nElems == 1 { + var pt Fp12 + C.go_miller_loop_n(&pt, &qs[0], &ps[0], C.size_t(nElems), false) + return &pt + } + + stride := (nElems + nThreads - 1) / nThreads + if stride > 16 { + stride = 16 + } + + strides := (nElems + stride - 1) / stride + if nThreads > strides { + nThreads = strides + } + + msgsCh := make(chan Fp12, nThreads) + curElem := uint32(0) + + for tid := uint32(0); tid < nThreads; tid++ { + go func() { + acc := Fp12One() + first := true + for { + work := atomic.AddUint32(&curElem, stride) - stride + if work >= nElems { + break + } + n := nElems - work + if n > stride { + n = stride + } + C.go_miller_loop_n(&acc, &qs[work], &ps[work], C.size_t(n), + C.bool(!first)) + first = false + } + msgsCh <- acc + }() + } + + var ret = make([]Fp12, nThreads) + for i := range ret { + ret[i] = <-msgsCh + } + + var pt Fp12 + C.go_fp12slice_mul(&pt, &ret[0], C.size_t(nThreads)) + return &pt +} + +func (pt *Fp12) MulAssign(p *Fp12) { + C.blst_fp12_mul(pt, pt, p) +} + +func (pt *Fp12) FinalExp() { + C.blst_final_exp(pt, pt) +} + +func (pt *Fp12) InGroup() bool { + return bool(C.blst_fp12_in_group(pt)) +} + +func (pt *Fp12) ToBendian() []byte { + var out [BLST_FP_BYTES * 12]byte + C.blst_bendian_from_fp12((*C.byte)(&out[0]), pt) + return out[:] +} + +func (pt1 *Fp12) Equals(pt2 *Fp12) bool { + return *pt1 == *pt2 +} + +// +// MIN-PK +// + +// +// PublicKey +// + +func (pk *P1Affine) From(s *Scalar) *P1Affine { + C.blst_sk_to_pk2_in_g1(nil, pk, s) + return pk +} + +func (pk *P1Affine) KeyValidate() bool { + return !bool(C.blst_p1_affine_is_inf(pk)) && + bool(C.blst_p1_affine_in_g1(pk)) +} + +// sigInfcheck, check for infinity, is a way to avoid going +// into resource-consuming verification. Passing 'false' is +// always cryptographically safe, but application might want +// to guard against obviously bogus individual[!] signatures. 
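
A minimal usage sketch (not part of the generated bindings) of key generation and public key derivation in the MIN-PK setting, built only on KeyGen, From and KeyValidate defined above. The helper name, the "crypto/rand" and "fmt" imports, and the import path are assumptions for illustration; as an internal package it is only importable from within the crypto module.

import (
	"crypto/rand"
	"fmt"

	blst "github.com/onflow/flow-go/crypto/internal/blst" // assumed path
)

// newKeyPair is a hypothetical helper: it draws 32 bytes of IKM and derives a
// MIN-PK key pair (secret scalar, public key in G1).
func newKeyPair() (*blst.SecretKey, *blst.P1Affine, error) {
	ikm := make([]byte, 32) // KeyGen returns nil for fewer than 32 bytes of key material
	if _, err := rand.Read(ikm); err != nil {
		return nil, nil, err
	}
	sk := blst.KeyGen(ikm)
	pk := new(blst.P1Affine).From(sk)
	if !pk.KeyValidate() { // rejects the identity point and points outside G1
		return nil, nil, fmt.Errorf("generated public key failed KeyValidate")
	}
	return sk, pk, nil
}
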
+func (sig *P2Affine) SigValidate(sigInfcheck bool) bool { + if sigInfcheck && bool(C.blst_p2_affine_is_inf(sig)) { + return false + } + return bool(C.blst_p2_affine_in_g2(sig)) +} + +// +// Sign +// + +func (sig *P2Affine) Sign(sk *SecretKey, msg []byte, dst []byte, + optional ...interface{}) *P2Affine { + augSingle, aug, useHash, ok := parseOpts(optional...) + if !ok || len(aug) != 0 { + return nil + } + + var q *P2 + if useHash { + q = HashToG2(msg, dst, augSingle) + } else { + q = EncodeToG2(msg, dst, augSingle) + } + C.blst_sign_pk2_in_g1(nil, sig, q, sk) + return sig +} + +// +// Signature +// + +// Functions to return a signature and public key+augmentation tuple. +// This enables point decompression (if needed) to happen in parallel. +type sigGetterP2 func() *P2Affine +type pkGetterP1 func(i uint32, temp *P1Affine) (*P1Affine, []byte) + +// Single verify with decompressed pk +func (sig *P2Affine) Verify(sigGroupcheck bool, pk *P1Affine, pkValidate bool, + msg Message, dst []byte, + optional ...interface{}) bool { // useHash bool, aug []byte + + aug, _, useHash, ok := parseOpts(optional...) + if !ok { + return false + } + return sig.AggregateVerify(sigGroupcheck, []*P1Affine{pk}, pkValidate, + []Message{msg}, dst, useHash, [][]byte{aug}) +} + +// Single verify with compressed pk +// Uses a dummy signature to get the correct type +func (dummy *P2Affine) VerifyCompressed(sig []byte, sigGroupcheck bool, + pk []byte, pkValidate bool, msg Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + return dummy.AggregateVerifyCompressed(sig, sigGroupcheck, + [][]byte{pk}, pkValidate, + []Message{msg}, dst, optional...) +} + +// Aggregate verify with uncompressed signature and public keys +// Note that checking message uniqueness, if required, is left to the user. +// Not all signature schemes require it and this keeps the binding minimal +// and fast. Refer to the Uniq function for one method method of performing +// this check. +func (sig *P2Affine) AggregateVerify(sigGroupcheck bool, + pks []*P1Affine, pksVerify bool, msgs []Message, dst []byte, + optional ...interface{}) bool { // useHash bool, augs [][]byte + + // sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...) 
+ useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + sigFn := func() *P2Affine { + return sig + } + + pkFn := func(i uint32, _ *P1Affine) (*P1Affine, []byte) { + if useAugs { + return pks[i], augs[i] + } else { + return pks[i], nil + } + } + + return coreAggregateVerifyPkInG1(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +// Aggregate verify with compressed signature and public keys +// Uses a dummy signature to get the correct type +func (dummy *P2Affine) AggregateVerifyCompressed(sig []byte, sigGroupcheck bool, + pks [][]byte, pksVerify bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + // sanity checks and argument parsing + if len(pks) != len(msgs) { + return false + } + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + usePksAsAugs := false + if len(optional) > 1 { + usePksAsAugs = optional[1] + } + + sigFn := func() *P2Affine { + sigP := new(P2Affine) + if sigP.Uncompress(sig) == nil { + return nil + } + return sigP + } + pkFn := func(i uint32, pk *P1Affine) (*P1Affine, []byte) { + bytes := pks[i] + if len(bytes) == BLST_P1_SERIALIZE_BYTES && (bytes[0]&0x80) == 0 { + // Not compressed + if pk.Deserialize(bytes) == nil { + return nil, nil + } + } else if len(bytes) == BLST_P1_COMPRESS_BYTES && (bytes[0]&0x80) != 0 { + if pk.Uncompress(bytes) == nil { + return nil, nil + } + } else { + return nil, nil + } + if usePksAsAugs { + return pk, bytes + } + return pk, nil + } + return coreAggregateVerifyPkInG1(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +func coreAggregateVerifyPkInG1(sigFn sigGetterP2, sigGroupcheck bool, + pkFn pkGetterP1, pkValidate bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash + + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. + msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + mutex := sync.Mutex{} + + mutex.Lock() + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var temp P1Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } else if work == 0 && maxProcs == numCores-1 && + numThreads == maxProcs { + // Avoid consuming all cores by waiting until the + // main thread has completed its miller loop before + // proceeding. 
+ mutex.Lock() + mutex.Unlock() + } + + // Pull Public Key and augmentation blob + curPk, aug := pkFn(work, &temp) + if curPk == nil { + atomic.StoreInt32(&valid, 0) + break + } + + // Pairing and accumulate + ret := PairingAggregatePkInG1(pairing, curPk, pkValidate, + nil, false, msgs[work], aug) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Uncompress and check signature + var gtsig Fp12 + sig := sigFn() + if sig == nil { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 && sigGroupcheck && + !sig.SigValidate(false) { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 { + C.blst_aggregated_in_g2(>sig, sig) + } + mutex.Unlock() + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, >sig) +} + +func CoreVerifyPkInG1(pk *P1Affine, sig *P2Affine, hash_or_encode bool, + msg Message, dst []byte, optional ...[]byte) int { + + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + + if runtime.NumGoroutine() < maxProcs { + sigFn := func() *P2Affine { + return sig + } + pkFn := func(_ uint32, _ *P1Affine) (*P1Affine, []byte) { + return pk, aug + } + if !coreAggregateVerifyPkInG1(sigFn, true, pkFn, true, []Message{msg}, + dst, hash_or_encode) { + return C.BLST_VERIFY_FAIL + } + return C.BLST_SUCCESS + } + + var udst *C.byte + if len(dst) > 0 { + udst = (*C.byte)(&dst[0]) + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + return int(C.blst_core_verify_pk_in_g1(pk, sig, C.bool(hash_or_encode), + umsg, C.size_t(len(msg)), + udst, C.size_t(len(dst)), + uaug, C.size_t(len(aug)))) +} + +// pks are assumed to be verified for proof of possession, +// which implies that they are already group-checked +func (sig *P2Affine) FastAggregateVerify(sigGroupcheck bool, + pks []*P1Affine, msg Message, dst []byte, + optional ...interface{}) bool { // pass-through to Verify + n := len(pks) + + // TODO: return value for length zero? + if n == 0 { + return false + } + + aggregator := new(P1Aggregate) + if !aggregator.Aggregate(pks, false) { + return false + } + pkAff := aggregator.ToAffine() + + // Verify + return sig.Verify(sigGroupcheck, pkAff, false, msg, dst, optional...) +} + +func (dummy *P2Affine) MultipleAggregateVerify(sigs []*P2Affine, + sigsGroupcheck bool, pks []*P1Affine, pksVerify bool, + msgs []Message, dst []byte, randFn func(*Scalar), randBits int, + optional ...interface{}) bool { // useHash + + // Sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n || len(sigs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...) 
+ useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + paramsFn := + func(work uint32, sig *P2Affine, pk *P1Affine, rand *Scalar) ( + *P2Affine, *P1Affine, *Scalar, []byte) { + randFn(rand) + var aug []byte + if useAugs { + aug = augs[work] + } + return sigs[work], pks[work], rand, aug + } + + return multipleAggregateVerifyPkInG1(paramsFn, sigsGroupcheck, pksVerify, + msgs, dst, randBits, useHash) +} + +type mulAggGetterPkInG1 func(work uint32, sig *P2Affine, pk *P1Affine, + rand *Scalar) (*P2Affine, *P1Affine, *Scalar, []byte) + +func multipleAggregateVerifyPkInG1(paramsFn mulAggGetterPkInG1, + sigsGroupcheck bool, pksVerify bool, msgs []Message, + dst []byte, randBits int, + optional ...bool) bool { // useHash + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. + msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var tempRand Scalar + var tempPk P1Affine + var tempSig P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + curSig, curPk, curRand, aug := paramsFn(work, &tempSig, + &tempPk, &tempRand) + + if PairingMulNAggregatePkInG1(pairing, curPk, pksVerify, + curSig, sigsGroupcheck, curRand, + randBits, msgs[work], aug) != + C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, nil) +} + +// +// Aggregate P2 +// + +type aggGetterP2 func(i uint32, temp *P2Affine) *P2Affine +type P2Aggregate struct { + v *P2 +} + +// Aggregate uncompressed elements +func (agg *P2Aggregate) Aggregate(elmts []*P2Affine, + groupcheck bool) bool { + if len(elmts) == 0 { + return true + } + getter := func(i uint32, _ *P2Affine) *P2Affine { return elmts[i] } + return agg.aggregate(getter, groupcheck, len(elmts)) +} + +// Aggregate compressed elements +func (agg *P2Aggregate) AggregateCompressed(elmts [][]byte, + groupcheck bool) bool { + if len(elmts) == 0 { + return true + } + getter := func(i uint32, p *P2Affine) *P2Affine { + bytes := elmts[i] + if p.Uncompress(bytes) == nil { + return nil + } + return p + } + return agg.aggregate(getter, groupcheck, len(elmts)) +} + +func (agg *P2Aggregate) AddAggregate(other *P2Aggregate) { + if other.v == nil { + // do nothing + } else if agg.v == nil { + agg.v = other.v + } else { + C.blst_p2_add_or_double(agg.v, agg.v, other.v) + } +} + 
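
For orientation, a self-contained sketch of a MIN-PK sign/verify round trip with Sign and Verify as declared above; it is not part of the generated file, the import path is assumed as before, and the DST is only an example domain separation tag.

package main

import (
	"crypto/rand"
	"fmt"

	blst "github.com/onflow/flow-go/crypto/internal/blst" // assumed path; internal to the crypto module
)

func main() {
	ikm := make([]byte, 32)
	if _, err := rand.Read(ikm); err != nil {
		panic(err)
	}
	sk := blst.KeyGen(ikm)
	pk := new(blst.P1Affine).From(sk)

	msg := []byte("example message")
	// Example hash-to-curve DST; use the tag of the ciphersuite actually in use.
	dst := []byte("BLS_SIG_BLS12381G2_XMD:SHA-256_SSWU_RO_NUL_")

	sig := new(blst.P2Affine).Sign(sk, msg, dst) // hashes msg to G2 and signs with sk
	// First bool: subgroup-check the signature; third bool: run KeyValidate on pk.
	fmt.Println("valid:", sig.Verify(true, pk, true, msg, dst))
}
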
+func (agg *P2Aggregate) Add(elmt *P2Affine, groupcheck bool) bool { + if groupcheck && !bool(C.blst_p2_affine_in_g2(elmt)) { + return false + } + if agg.v == nil { + agg.v = new(P2) + C.blst_p2_from_affine(agg.v, elmt) + } else { + C.blst_p2_add_or_double_affine(agg.v, agg.v, elmt) + } + return true +} + +func (agg *P2Aggregate) ToAffine() *P2Affine { + if agg.v == nil { + return new(P2Affine) + } + return agg.v.ToAffine() +} + +func (agg *P2Aggregate) aggregate(getter aggGetterP2, groupcheck bool, + n int) bool { + + if n == 0 { + return true + } + // operations are considered short enough for not to care about + // keeping one core free... + numThreads := runtime.GOMAXPROCS(0) + if numThreads > n { + numThreads = n + } + + valid := int32(1) + type result struct { + agg *P2 + empty bool + } + msgs := make(chan result, numThreads) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + first := true + var agg P2 + var temp P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + // Signature validate + curElmt := getter(work, &temp) + if curElmt == nil { + atomic.StoreInt32(&valid, 0) + break + } + if groupcheck && !bool(C.blst_p2_affine_in_g2(curElmt)) { + atomic.StoreInt32(&valid, 0) + break + } + if first { + C.blst_p2_from_affine(&agg, curElmt) + first = false + } else { + C.blst_p2_add_or_double_affine(&agg, &agg, curElmt) + } + // application might have some async work to do + runtime.Gosched() + } + if first { + msgs <- result{nil, true} + } else if atomic.LoadInt32(&valid) > 0 { + msgs <- result{&agg, false} + } else { + msgs <- result{nil, false} + } + }() + } + + // Accumulate the thread results + first := agg.v == nil + validLocal := true + for i := 0; i < numThreads; i++ { + msg := <-msgs + if !validLocal || msg.empty { + // do nothing + } else if msg.agg == nil { + validLocal = false + // This should be unnecessary but seems safer + atomic.StoreInt32(&valid, 0) + } else { + if first { + agg.v = msg.agg + first = false + } else { + C.blst_p2_add_or_double(agg.v, agg.v, msg.agg) + } + } + } + if atomic.LoadInt32(&valid) == 0 { + agg.v = nil + return false + } + return true +} + +// +// MIN-SIG +// + +// +// PublicKey +// + +func (pk *P2Affine) From(s *Scalar) *P2Affine { + C.blst_sk_to_pk2_in_g2(nil, pk, s) + return pk +} + +func (pk *P2Affine) KeyValidate() bool { + return !bool(C.blst_p2_affine_is_inf(pk)) && + bool(C.blst_p2_affine_in_g2(pk)) +} + +// sigInfcheck, check for infinity, is a way to avoid going +// into resource-consuming verification. Passing 'false' is +// always cryptographically safe, but application might want +// to guard against obviously bogus individual[!] signatures. +func (sig *P1Affine) SigValidate(sigInfcheck bool) bool { + if sigInfcheck && bool(C.blst_p1_affine_is_inf(sig)) { + return false + } + return bool(C.blst_p1_affine_in_g1(sig)) +} + +// +// Sign +// + +func (sig *P1Affine) Sign(sk *SecretKey, msg []byte, dst []byte, + optional ...interface{}) *P1Affine { + augSingle, aug, useHash, ok := parseOpts(optional...) + if !ok || len(aug) != 0 { + return nil + } + + var q *P1 + if useHash { + q = HashToG1(msg, dst, augSingle) + } else { + q = EncodeToG1(msg, dst, augSingle) + } + C.blst_sign_pk2_in_g2(nil, sig, q, sk) + return sig +} + +// +// Signature +// + +// Functions to return a signature and public key+augmentation tuple. +// This enables point decompression (if needed) to happen in parallel. 
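
The P2Aggregate type above is what an application would use to sum individual MIN-PK signatures before a single verification. A short sketch, assuming the same import as in the earlier examples, keys produced as sketched before, and public keys already checked for proof of possession (as the FastAggregateVerify comment requires); the helper name is illustrative.

// verifyAggregated signs one common message under every secret key, sums the
// signatures with P2Aggregate and checks the sum against all public keys.
func verifyAggregated(sks []*blst.SecretKey, pks []*blst.P1Affine, msg, dst []byte) bool {
	sigs := make([]*blst.P2Affine, len(sks))
	for i, sk := range sks {
		sigs[i] = new(blst.P2Affine).Sign(sk, msg, dst)
	}

	var agg blst.P2Aggregate
	if !agg.Aggregate(sigs, false) { // groupcheck=false: Sign outputs are already in G2
		return false
	}
	aggSig := agg.ToAffine()

	// FastAggregateVerify aggregates the public keys internally (via P1Aggregate).
	return aggSig.FastAggregateVerify(true, pks, msg, dst)
}
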
+type sigGetterP1 func() *P1Affine +type pkGetterP2 func(i uint32, temp *P2Affine) (*P2Affine, []byte) + +// Single verify with decompressed pk +func (sig *P1Affine) Verify(sigGroupcheck bool, pk *P2Affine, pkValidate bool, + msg Message, dst []byte, + optional ...interface{}) bool { // useHash bool, aug []byte + + aug, _, useHash, ok := parseOpts(optional...) + if !ok { + return false + } + return sig.AggregateVerify(sigGroupcheck, []*P2Affine{pk}, pkValidate, + []Message{msg}, dst, useHash, [][]byte{aug}) +} + +// Single verify with compressed pk +// Uses a dummy signature to get the correct type +func (dummy *P1Affine) VerifyCompressed(sig []byte, sigGroupcheck bool, + pk []byte, pkValidate bool, msg Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + return dummy.AggregateVerifyCompressed(sig, sigGroupcheck, + [][]byte{pk}, pkValidate, + []Message{msg}, dst, optional...) +} + +// Aggregate verify with uncompressed signature and public keys +// Note that checking message uniqueness, if required, is left to the user. +// Not all signature schemes require it and this keeps the binding minimal +// and fast. Refer to the Uniq function for one method method of performing +// this check. +func (sig *P1Affine) AggregateVerify(sigGroupcheck bool, + pks []*P2Affine, pksVerify bool, msgs []Message, dst []byte, + optional ...interface{}) bool { // useHash bool, augs [][]byte + + // sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...) + useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + sigFn := func() *P1Affine { + return sig + } + + pkFn := func(i uint32, _ *P2Affine) (*P2Affine, []byte) { + if useAugs { + return pks[i], augs[i] + } else { + return pks[i], nil + } + } + + return coreAggregateVerifyPkInG2(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +// Aggregate verify with compressed signature and public keys +// Uses a dummy signature to get the correct type +func (dummy *P1Affine) AggregateVerifyCompressed(sig []byte, sigGroupcheck bool, + pks [][]byte, pksVerify bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash bool, usePksAsAugs bool + + // sanity checks and argument parsing + if len(pks) != len(msgs) { + return false + } + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + usePksAsAugs := false + if len(optional) > 1 { + usePksAsAugs = optional[1] + } + + sigFn := func() *P1Affine { + sigP := new(P1Affine) + if sigP.Uncompress(sig) == nil { + return nil + } + return sigP + } + pkFn := func(i uint32, pk *P2Affine) (*P2Affine, []byte) { + bytes := pks[i] + if len(bytes) == BLST_P2_SERIALIZE_BYTES && (bytes[0]&0x80) == 0 { + // Not compressed + if pk.Deserialize(bytes) == nil { + return nil, nil + } + } else if len(bytes) == BLST_P2_COMPRESS_BYTES && (bytes[0]&0x80) != 0 { + if pk.Uncompress(bytes) == nil { + return nil, nil + } + } else { + return nil, nil + } + if usePksAsAugs { + return pk, bytes + } + return pk, nil + } + return coreAggregateVerifyPkInG2(sigFn, sigGroupcheck, pkFn, pksVerify, + msgs, dst, useHash) +} + +func coreAggregateVerifyPkInG2(sigFn sigGetterP1, sigGroupcheck bool, + pkFn pkGetterP2, pkValidate bool, msgs []Message, dst []byte, + optional ...bool) bool { // useHash + + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numCores := runtime.GOMAXPROCS(0) + 
numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. + msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + mutex := sync.Mutex{} + + mutex.Lock() + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var temp P2Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } else if work == 0 && maxProcs == numCores-1 && + numThreads == maxProcs { + // Avoid consuming all cores by waiting until the + // main thread has completed its miller loop before + // proceeding. + mutex.Lock() + mutex.Unlock() + } + + // Pull Public Key and augmentation blob + curPk, aug := pkFn(work, &temp) + if curPk == nil { + atomic.StoreInt32(&valid, 0) + break + } + + // Pairing and accumulate + ret := PairingAggregatePkInG2(pairing, curPk, pkValidate, + nil, false, msgs[work], aug) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Uncompress and check signature + var gtsig Fp12 + sig := sigFn() + if sig == nil { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 && sigGroupcheck && + !sig.SigValidate(false) { + atomic.StoreInt32(&valid, 0) + } + if atomic.LoadInt32(&valid) > 0 { + C.blst_aggregated_in_g1(>sig, sig) + } + mutex.Unlock() + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, >sig) +} + +func CoreVerifyPkInG2(pk *P2Affine, sig *P1Affine, hash_or_encode bool, + msg Message, dst []byte, optional ...[]byte) int { + + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + + if runtime.NumGoroutine() < maxProcs { + sigFn := func() *P1Affine { + return sig + } + pkFn := func(_ uint32, _ *P2Affine) (*P2Affine, []byte) { + return pk, aug + } + if !coreAggregateVerifyPkInG2(sigFn, true, pkFn, true, []Message{msg}, + dst, hash_or_encode) { + return C.BLST_VERIFY_FAIL + } + return C.BLST_SUCCESS + } + + var udst *C.byte + if len(dst) > 0 { + udst = (*C.byte)(&dst[0]) + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + return int(C.blst_core_verify_pk_in_g2(pk, sig, C.bool(hash_or_encode), + umsg, C.size_t(len(msg)), + udst, C.size_t(len(dst)), + uaug, C.size_t(len(aug)))) +} + +// pks are assumed to be verified for proof of possession, +// which implies that they are already group-checked +func (sig *P1Affine) FastAggregateVerify(sigGroupcheck bool, + pks []*P2Affine, msg Message, dst []byte, + optional ...interface{}) bool { // pass-through to Verify + n := len(pks) + + // TODO: return value for length zero? 
+ if n == 0 { + return false + } + + aggregator := new(P2Aggregate) + if !aggregator.Aggregate(pks, false) { + return false + } + pkAff := aggregator.ToAffine() + + // Verify + return sig.Verify(sigGroupcheck, pkAff, false, msg, dst, optional...) +} + +func (dummy *P1Affine) MultipleAggregateVerify(sigs []*P1Affine, + sigsGroupcheck bool, pks []*P2Affine, pksVerify bool, + msgs []Message, dst []byte, randFn func(*Scalar), randBits int, + optional ...interface{}) bool { // useHash + + // Sanity checks and argument parsing + n := len(pks) + if n == 0 || len(msgs) != n || len(sigs) != n { + return false + } + _, augs, useHash, ok := parseOpts(optional...) + useAugs := len(augs) != 0 + if !ok || (useAugs && len(augs) != n) { + return false + } + + paramsFn := + func(work uint32, sig *P1Affine, pk *P2Affine, rand *Scalar) ( + *P1Affine, *P2Affine, *Scalar, []byte) { + randFn(rand) + var aug []byte + if useAugs { + aug = augs[work] + } + return sigs[work], pks[work], rand, aug + } + + return multipleAggregateVerifyPkInG2(paramsFn, sigsGroupcheck, pksVerify, + msgs, dst, randBits, useHash) +} + +type mulAggGetterPkInG2 func(work uint32, sig *P1Affine, pk *P2Affine, + rand *Scalar) (*P1Affine, *P2Affine, *Scalar, []byte) + +func multipleAggregateVerifyPkInG2(paramsFn mulAggGetterPkInG2, + sigsGroupcheck bool, pksVerify bool, msgs []Message, + dst []byte, randBits int, + optional ...bool) bool { // useHash + n := len(msgs) + if n == 0 { + return false + } + + useHash := true + if len(optional) > 0 { + useHash = optional[0] + } + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding pk,msg[,aug] tuple and + // repeat until n is exceeded. The resulting accumulations will be + // fed into the msgsCh channel. 
+ msgsCh := make(chan Pairing, numThreads) + valid := int32(1) + curItem := uint32(0) + + for tid := 0; tid < numThreads; tid++ { + go func() { + pairing := PairingCtx(useHash, dst) + var tempRand Scalar + var tempPk P2Affine + var tempSig P1Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + curSig, curPk, curRand, aug := paramsFn(work, &tempSig, + &tempPk, &tempRand) + + if PairingMulNAggregatePkInG2(pairing, curPk, pksVerify, + curSig, sigsGroupcheck, curRand, + randBits, msgs[work], aug) != + C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + break + } + + // application might have some async work to do + runtime.Gosched() + } + if atomic.LoadInt32(&valid) > 0 { + PairingCommit(pairing) + msgsCh <- pairing + } else { + msgsCh <- nil + } + }() + } + + // Accumulate the thread results + var pairings Pairing + for i := 0; i < numThreads; i++ { + msg := <-msgsCh + if msg != nil { + if pairings == nil { + pairings = msg + } else { + ret := PairingMerge(pairings, msg) + if ret != C.BLST_SUCCESS { + atomic.StoreInt32(&valid, 0) + } + } + } + } + if atomic.LoadInt32(&valid) == 0 || pairings == nil { + return false + } + + return PairingFinalVerify(pairings, nil) +} + +// +// Aggregate P1 +// + +type aggGetterP1 func(i uint32, temp *P1Affine) *P1Affine +type P1Aggregate struct { + v *P1 +} + +// Aggregate uncompressed elements +func (agg *P1Aggregate) Aggregate(elmts []*P1Affine, + groupcheck bool) bool { + if len(elmts) == 0 { + return true + } + getter := func(i uint32, _ *P1Affine) *P1Affine { return elmts[i] } + return agg.aggregate(getter, groupcheck, len(elmts)) +} + +// Aggregate compressed elements +func (agg *P1Aggregate) AggregateCompressed(elmts [][]byte, + groupcheck bool) bool { + if len(elmts) == 0 { + return true + } + getter := func(i uint32, p *P1Affine) *P1Affine { + bytes := elmts[i] + if p.Uncompress(bytes) == nil { + return nil + } + return p + } + return agg.aggregate(getter, groupcheck, len(elmts)) +} + +func (agg *P1Aggregate) AddAggregate(other *P1Aggregate) { + if other.v == nil { + // do nothing + } else if agg.v == nil { + agg.v = other.v + } else { + C.blst_p1_add_or_double(agg.v, agg.v, other.v) + } +} + +func (agg *P1Aggregate) Add(elmt *P1Affine, groupcheck bool) bool { + if groupcheck && !bool(C.blst_p1_affine_in_g1(elmt)) { + return false + } + if agg.v == nil { + agg.v = new(P1) + C.blst_p1_from_affine(agg.v, elmt) + } else { + C.blst_p1_add_or_double_affine(agg.v, agg.v, elmt) + } + return true +} + +func (agg *P1Aggregate) ToAffine() *P1Affine { + if agg.v == nil { + return new(P1Affine) + } + return agg.v.ToAffine() +} + +func (agg *P1Aggregate) aggregate(getter aggGetterP1, groupcheck bool, + n int) bool { + + if n == 0 { + return true + } + // operations are considered short enough for not to care about + // keeping one core free... 
+ numThreads := runtime.GOMAXPROCS(0) + if numThreads > n { + numThreads = n + } + + valid := int32(1) + type result struct { + agg *P1 + empty bool + } + msgs := make(chan result, numThreads) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + first := true + var agg P1 + var temp P1Affine + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + + // Signature validate + curElmt := getter(work, &temp) + if curElmt == nil { + atomic.StoreInt32(&valid, 0) + break + } + if groupcheck && !bool(C.blst_p1_affine_in_g1(curElmt)) { + atomic.StoreInt32(&valid, 0) + break + } + if first { + C.blst_p1_from_affine(&agg, curElmt) + first = false + } else { + C.blst_p1_add_or_double_affine(&agg, &agg, curElmt) + } + // application might have some async work to do + runtime.Gosched() + } + if first { + msgs <- result{nil, true} + } else if atomic.LoadInt32(&valid) > 0 { + msgs <- result{&agg, false} + } else { + msgs <- result{nil, false} + } + }() + } + + // Accumulate the thread results + first := agg.v == nil + validLocal := true + for i := 0; i < numThreads; i++ { + msg := <-msgs + if !validLocal || msg.empty { + // do nothing + } else if msg.agg == nil { + validLocal = false + // This should be unnecessary but seems safer + atomic.StoreInt32(&valid, 0) + } else { + if first { + agg.v = msg.agg + first = false + } else { + C.blst_p1_add_or_double(agg.v, agg.v, msg.agg) + } + } + } + if atomic.LoadInt32(&valid) == 0 { + agg.v = nil + return false + } + return true +} +func PairingAggregatePkInG1(ctx Pairing, PK *P1Affine, pkValidate bool, + sig *P2Affine, sigGroupcheck bool, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + r := C.blst_pairing_chk_n_aggr_pk_in_g1(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + umsg, C.size_t(len(msg)), + uaug, C.size_t(len(aug))) + + return int(r) +} + +func PairingMulNAggregatePkInG1(ctx Pairing, PK *P1Affine, pkValidate bool, + sig *P2Affine, sigGroupcheck bool, + rand *Scalar, randBits int, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + r := C.blst_pairing_chk_n_mul_n_aggr_pk_in_g1(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + &rand.b[0], C.size_t(randBits), + umsg, C.size_t(len(msg)), + uaug, C.size_t(len(aug))) + + return int(r) +} + +// +// Serialization/Deserialization. 
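
Both groups expose matching Serialize/Deserialize and Compress/Uncompress pairs; in the MIN-PK layout a compressed public key is BLST_P1_COMPRESS_BYTES (48) bytes and a compressed signature is BLST_P2_COMPRESS_BYTES (96) bytes. A wire round-trip sketch under the same assumptions as the earlier examples, with an illustrative helper name:

// roundTrip re-encodes a MIN-PK key/signature pair through the compressed wire
// format and verifies the decoded values.
func roundTrip(pk *blst.P1Affine, sig *blst.P2Affine, msg, dst []byte) bool {
	pkBytes := pk.Compress()   // 48 bytes (BLST_P1_COMPRESS_BYTES)
	sigBytes := sig.Compress() // 96 bytes (BLST_P2_COMPRESS_BYTES)

	pk2 := new(blst.P1Affine).Uncompress(pkBytes)
	sig2 := new(blst.P2Affine).Uncompress(sigBytes)
	if pk2 == nil || sig2 == nil { // Uncompress returns nil on wrong length or invalid encoding
		return false
	}
	return sig2.Verify(true, pk2, true, msg, dst)
}
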
+// + +// P1 Serdes +func (p1 *P1Affine) Serialize() []byte { + var out [BLST_P1_SERIALIZE_BYTES]byte + C.blst_p1_affine_serialize((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1Affine) Deserialize(in []byte) *P1Affine { + if len(in) != BLST_P1_SERIALIZE_BYTES { + return nil + } + if C.blst_p1_deserialize(p1, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p1 +} +func (p1 *P1Affine) Compress() []byte { + var out [BLST_P1_COMPRESS_BYTES]byte + C.blst_p1_affine_compress((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1Affine) Uncompress(in []byte) *P1Affine { + if len(in) != BLST_P1_COMPRESS_BYTES { + return nil + } + if C.blst_p1_uncompress(p1, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p1 +} + +func (p1 *P1Affine) InG1() bool { + return bool(C.blst_p1_affine_in_g1(p1)) +} + +func (dummy *P1Affine) BatchUncompress(in [][]byte) []*P1Affine { + // Allocate space for all of the resulting points. Later we'll save pointers + // and return those so that the result could be used in other functions, + // such as MultipleAggregateVerify. + n := len(in) + points := make([]P1Affine, n) + pointsPtrs := make([]*P1Affine, n) + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding point, and + // repeat until n is exceeded. Each thread will send a result (true for + // success, false for failure) into the channel when complete. + resCh := make(chan bool, numThreads) + valid := int32(1) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + if points[work].Uncompress(in[work]) == nil { + atomic.StoreInt32(&valid, 0) + break + } + pointsPtrs[work] = &points[work] + } + if atomic.LoadInt32(&valid) > 0 { + resCh <- true + } else { + resCh <- false + } + }() + } + + // Collect the threads + result := true + for i := 0; i < numThreads; i++ { + if !<-resCh { + result = false + } + } + if atomic.LoadInt32(&valid) == 0 || !result { + return nil + } + return pointsPtrs +} + +func (p1 *P1) Serialize() []byte { + var out [BLST_P1_SERIALIZE_BYTES]byte + C.blst_p1_serialize((*C.byte)(&out[0]), p1) + return out[:] +} +func (p1 *P1) Compress() []byte { + var out [BLST_P1_COMPRESS_BYTES]byte + C.blst_p1_compress((*C.byte)(&out[0]), p1) + return out[:] +} + +func (p1 *P1) MultAssign(scalarIf interface{}, optional ...int) *P1 { + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val) * 8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.blst_p1_mult(p1, p1, scalar, C.size_t(nbits)) + return p1 +} + +func (p1 *P1) Mult(scalarIf interface{}, optional ...int) *P1 { + ret := *p1 + return ret.MultAssign(scalarIf, optional...) 
+} + +func (p1 *P1) AddAssign(pointIf interface{}) *P1 { + switch val := pointIf.(type) { + case *P1: + C.blst_p1_add_or_double(p1, p1, val) + case *P1Affine: + C.blst_p1_add_or_double_affine(p1, p1, val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + return p1 +} + +func (p1 *P1) Add(pointIf interface{}) *P1 { + ret := *p1 + return ret.AddAssign(pointIf) +} + +func (p1 *P1) SubAssign(pointIf interface{}) *P1 { + var x *Fp + var affine C.bool + switch val := pointIf.(type) { + case *P1: + x = &val.x + affine = false + case *P1Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + C.go_p1_sub_assign(p1, x, affine) + return p1 +} + +func (p1 *P1) Sub(pointIf interface{}) *P1 { + ret := *p1 + return ret.SubAssign(pointIf) +} + +func P1Generator() *P1 { + return C.blst_p1_generator() +} + +// 'acc += point * scalar', passing 'nil' for 'point' means "use the +// +// group generator point" +func (acc *P1) MultNAccumulate(pointIf interface{}, scalarIf interface{}, + optional ...int) *P1 { + var x *Fp + var affine C.bool + if pointIf != nil { + switch val := pointIf.(type) { + case *P1: + x = &val.x + affine = false + case *P1Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + } + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val) * 8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.go_p1_mult_n_acc(acc, x, affine, scalar, C.size_t(nbits)) + return acc +} + +// +// Affine +// + +func (p *P1) ToAffine() *P1Affine { + var pa P1Affine + C.blst_p1_to_affine(&pa, p) + return &pa +} + +func (p *P1) FromAffine(pa *P1Affine) { + C.blst_p1_from_affine(p, pa) +} + +// Hash +func HashToG1(msg []byte, dst []byte, + optional ...[]byte) *P1 { // aug + var q P1 + + // Handle zero length message + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + var aug []byte + var augC *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + augC = (*C.byte)(&aug[0]) + } + } + + C.blst_hash_to_g1(&q, msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst)), + augC, C.size_t(len(aug))) + return &q +} + +func EncodeToG1(msg []byte, dst []byte, + optional ...[]byte) *P1 { // aug + var q P1 + + // Handle zero length message + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + var aug []byte + var augC *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + augC = (*C.byte)(&aug[0]) + } + } + + C.blst_encode_to_g1(&q, msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst)), + augC, C.size_t(len(aug))) + return &q +} + +// +// Multi-point/scalar operations +// + +func P1sToAffine(points []*P1, optional ...int) P1Affines { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + ret := make([]P1Affine, npoints) + _cgoCheckPointer := func(...interface{}) {} + C.blst_p1s_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret +} + +func (points P1s) ToAffine(optional ...P1Affines) P1Affines { + npoints := len(points) + var ret P1Affines + + if len(optional) > 0 { // used in benchmark + ret = optional[0] + if len(ret) < npoints { + panic("npoints 
mismatch") + } + } else { + ret = make([]P1Affine, npoints) + } + + if maxProcs < 2 || npoints < 768 { + C.go_p1slice_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices+1, npoints%nslices + + var wg sync.WaitGroup + wg.Add(nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 1 + go func(out *P1Affine, inp *P1, delta int) { + C.go_p1slice_to_affine(out, inp, C.size_t(delta)) + wg.Done() + }(&ret[x], &points[x], delta) + } + wg.Wait() + + return ret +} + +// +// Batch addition +// + +func P1AffinesAdd(points []*P1Affine, optional ...int) *P1 { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + var ret P1 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p1s_add(&ret, &points[0], C.size_t(npoints)) + return &ret +} + +func (points P1Affines) Add() *P1 { + npoints := len(points) + if maxProcs < 2 || npoints < 768 { + var ret P1 + C.go_p1slice_add(&ret, &points[0], C.size_t(npoints)) + return &ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices+1, npoints%nslices + + msgs := make(chan P1, nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 1 + go func(points *P1Affine, delta int) { + var ret P1 + C.go_p1slice_add(&ret, points, C.size_t(delta)) + msgs <- ret + }(&points[x], delta) + } + + ret := <-msgs + for i := 1; i < nslices; i++ { + msg := <-msgs + C.blst_p1_add_or_double(&ret, &ret, &msg) + } + return &ret +} + +func (points P1s) Add() *P1 { + return points.ToAffine().Add() +} + +// +// Multi-scalar multiplication +// + +func P1AffinesMult(pointsIf interface{}, scalarsIf interface{}, nbits int) *P1 { + var npoints int + switch val := pointsIf.(type) { + case []*P1Affine: + npoints = len(val) + case []P1Affine: + npoints = len(val) + case P1Affines: + npoints = len(val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + nbytes := (nbits + 7) / 8 + var scalars []*C.byte + switch val := scalarsIf.(type) { + case []byte: + if len(val) < npoints*nbytes { + return nil + } + case [][]byte: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = (*C.byte)(&val[i][0]) + } + case []Scalar: + if len(val) < npoints { + return nil + } + if nbits <= 248 { + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + } + case []*Scalar: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + numThreads := maxProcs + numCores := runtime.GOMAXPROCS(0) + if numCores < maxProcs { + numThreads = numCores + } + + if numThreads < 2 || npoints < 32 { + sz := int(C.blst_p1s_mult_pippenger_scratch_sizeof(C.size_t(npoints))) / 8 + scratch := make([]uint64, sz) + + pointsBySlice := [2]*P1Affine{nil, nil} + var p_points **P1Affine + switch val := pointsIf.(type) { + case []*P1Affine: + p_points = &val[0] + case []P1Affine: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + case P1Affines: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + } + + scalarsBySlice := [2]*C.byte{nil, nil} + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = 
(*C.byte)(&val[0]) + p_scalars = &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[0] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = (*C.byte)(&val[0].b[0]) + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[0] + } + case []*Scalar: + p_scalars = &scalars[0] + } + + var ret P1 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p1s_mult_pippenger(&ret, p_points, C.size_t(npoints), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0])) + + for i := range scalars { + scalars[i] = nil + } + + return &ret + } + + // this is sizeof(scratch[0]) + sz := int(C.blst_p1s_mult_pippenger_scratch_sizeof(0)) / 8 + + nx, ny, window := breakdown(nbits, pippenger_window_size(npoints), + numThreads) + + // |grid[]| holds "coordinates" and place for result + grid := make([]struct { + x, dx, y, dy int + point P1 + }, nx*ny) + + dx := npoints / nx + y := window * (ny - 1) + total := 0 + for ; total < nx; total++ { + grid[total].x = total * dx + grid[total].dx = dx + grid[total].y = y + grid[total].dy = nbits - y + } + grid[total-1].dx = npoints - grid[total-1].x + + for y > 0 { + y -= window + for i := 0; i < nx; i++ { + grid[total].x = grid[i].x + grid[total].dx = grid[i].dx + grid[total].y = y + grid[total].dy = window + total++ + } + } + + if numThreads > total { + numThreads = total + } + + msgsCh := make(chan int, ny) + rowSync := make([]int32, ny) // count up to |nx| + curItem := int32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + scratch := make([]uint64, sz<= total { + break + } + + x := grid[workItem].x + y := grid[workItem].y + + var p_points **P1Affine + switch val := pointsIf.(type) { + case []*P1Affine: + p_points = &val[x] + case []P1Affine: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + case P1Affines: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + } + + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = (*C.byte)(&val[x*nbytes]) + p_scalars = &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[x] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = (*C.byte)(&val[x].b[0]) + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[x] + } + case []*Scalar: + p_scalars = &scalars[x] + } + + C.blst_p1s_tile_pippenger(&grid[workItem].point, + p_points, C.size_t(grid[workItem].dx), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0]), + C.size_t(y), C.size_t(window)) + + if atomic.AddInt32(&rowSync[y/window], 1) == int32(nx) { + msgsCh <- y // "row" is done + } else { + runtime.Gosched() // be nice to the application + } + } + + pointsBySlice[0] = nil + scalarsBySlice[0] = nil + }() + } + + var ret P1 + rows := make([]bool, ny) + row := 0 // actually index in |grid[]| + for i := 0; i < ny; i++ { // we expect |ny| messages, one per "row" + y := <-msgsCh + rows[y/window] = true // mark the "row" + for grid[row].y == y { // if it's current "row", process it + for row < total && grid[row].y == y { + C.blst_p1_add_or_double(&ret, &ret, &grid[row].point) + row++ + } + if y == 0 { + break // one can as well 'return &ret' here + } + for j := 0; j < window; j++ { + C.blst_p1_double(&ret, &ret) + } + y -= window + if !rows[y/window] { // see if next "row" was marked already + break + } + } + } + + for i := range scalars { + scalars[i] = nil + } + + return &ret +} + +func (points P1Affines) Mult(scalarsIf interface{}, nbits int) *P1 { + return P1AffinesMult(points, scalarsIf, nbits) +} + +func (points P1s) Mult(scalarsIf interface{}, nbits int) *P1 { + return 
points.ToAffine().Mult(scalarsIf, nbits) +} +func PairingAggregatePkInG2(ctx Pairing, PK *P2Affine, pkValidate bool, + sig *P1Affine, sigGroupcheck bool, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + r := C.blst_pairing_chk_n_aggr_pk_in_g2(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + umsg, C.size_t(len(msg)), + uaug, C.size_t(len(aug))) + + return int(r) +} + +func PairingMulNAggregatePkInG2(ctx Pairing, PK *P2Affine, pkValidate bool, + sig *P1Affine, sigGroupcheck bool, + rand *Scalar, randBits int, msg []byte, + optional ...[]byte) int { // aug + var aug []byte + var uaug *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + uaug = (*C.byte)(&aug[0]) + } + } + var umsg *C.byte + if len(msg) > 0 { + umsg = (*C.byte)(&msg[0]) + } + + r := C.blst_pairing_chk_n_mul_n_aggr_pk_in_g2(&ctx[0], + PK, C.bool(pkValidate), + sig, C.bool(sigGroupcheck), + &rand.b[0], C.size_t(randBits), + umsg, C.size_t(len(msg)), + uaug, C.size_t(len(aug))) + + return int(r) +} + +// +// Serialization/Deserialization. +// + +// P2 Serdes +func (p2 *P2Affine) Serialize() []byte { + var out [BLST_P2_SERIALIZE_BYTES]byte + C.blst_p2_affine_serialize((*C.byte)(&out[0]), p2) + return out[:] +} + +func (p2 *P2Affine) Deserialize(in []byte) *P2Affine { + if len(in) != BLST_P2_SERIALIZE_BYTES { + return nil + } + if C.blst_p2_deserialize(p2, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p2 +} +func (p2 *P2Affine) Compress() []byte { + var out [BLST_P2_COMPRESS_BYTES]byte + C.blst_p2_affine_compress((*C.byte)(&out[0]), p2) + return out[:] +} + +func (p2 *P2Affine) Uncompress(in []byte) *P2Affine { + if len(in) != BLST_P2_COMPRESS_BYTES { + return nil + } + if C.blst_p2_uncompress(p2, (*C.byte)(&in[0])) != C.BLST_SUCCESS { + return nil + } + return p2 +} + +func (p2 *P2Affine) InG2() bool { + return bool(C.blst_p2_affine_in_g2(p2)) +} + +func (dummy *P2Affine) BatchUncompress(in [][]byte) []*P2Affine { + // Allocate space for all of the resulting points. Later we'll save pointers + // and return those so that the result could be used in other functions, + // such as MultipleAggregateVerify. + n := len(in) + points := make([]P2Affine, n) + pointsPtrs := make([]*P2Affine, n) + + numCores := runtime.GOMAXPROCS(0) + numThreads := maxProcs + if numThreads > numCores { + numThreads = numCores + } + if numThreads > n { + numThreads = n + } + // Each thread will determine next message to process by atomically + // incrementing curItem, process corresponding point, and + // repeat until n is exceeded. Each thread will send a result (true for + // success, false for failure) into the channel when complete. 
+ resCh := make(chan bool, numThreads) + valid := int32(1) + curItem := uint32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + for atomic.LoadInt32(&valid) > 0 { + // Get a work item + work := atomic.AddUint32(&curItem, 1) - 1 + if work >= uint32(n) { + break + } + if points[work].Uncompress(in[work]) == nil { + atomic.StoreInt32(&valid, 0) + break + } + pointsPtrs[work] = &points[work] + } + if atomic.LoadInt32(&valid) > 0 { + resCh <- true + } else { + resCh <- false + } + }() + } + + // Collect the threads + result := true + for i := 0; i < numThreads; i++ { + if !<-resCh { + result = false + } + } + if atomic.LoadInt32(&valid) == 0 || !result { + return nil + } + return pointsPtrs +} + +func (p2 *P2) Serialize() []byte { + var out [BLST_P2_SERIALIZE_BYTES]byte + C.blst_p2_serialize((*C.byte)(&out[0]), p2) + return out[:] +} +func (p2 *P2) Compress() []byte { + var out [BLST_P2_COMPRESS_BYTES]byte + C.blst_p2_compress((*C.byte)(&out[0]), p2) + return out[:] +} + +func (p2 *P2) MultAssign(scalarIf interface{}, optional ...int) *P2 { + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val) * 8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.blst_p2_mult(p2, p2, scalar, C.size_t(nbits)) + return p2 +} + +func (p2 *P2) Mult(scalarIf interface{}, optional ...int) *P2 { + ret := *p2 + return ret.MultAssign(scalarIf, optional...) +} + +func (p2 *P2) AddAssign(pointIf interface{}) *P2 { + switch val := pointIf.(type) { + case *P2: + C.blst_p2_add_or_double(p2, p2, val) + case *P2Affine: + C.blst_p2_add_or_double_affine(p2, p2, val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + return p2 +} + +func (p2 *P2) Add(pointIf interface{}) *P2 { + ret := *p2 + return ret.AddAssign(pointIf) +} + +func (p2 *P2) SubAssign(pointIf interface{}) *P2 { + var x *Fp2 + var affine C.bool + switch val := pointIf.(type) { + case *P2: + x = &val.x + affine = false + case *P2Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + C.go_p2_sub_assign(p2, x, affine) + return p2 +} + +func (p2 *P2) Sub(pointIf interface{}) *P2 { + ret := *p2 + return ret.SubAssign(pointIf) +} + +func P2Generator() *P2 { + return C.blst_p2_generator() +} + +// 'acc += point * scalar', passing 'nil' for 'point' means "use the +// +// group generator point" +func (acc *P2) MultNAccumulate(pointIf interface{}, scalarIf interface{}, + optional ...int) *P2 { + var x *Fp2 + var affine C.bool + if pointIf != nil { + switch val := pointIf.(type) { + case *P2: + x = &val.x + affine = false + case *P2Affine: + x = &val.x + affine = true + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + } + var nbits int + var scalar *C.byte + switch val := scalarIf.(type) { + case []byte: + scalar = (*C.byte)(&val[0]) + nbits = len(val) * 8 + case *Scalar: + scalar = &val.b[0] + nbits = 255 + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + if len(optional) > 0 { + nbits = optional[0] + } + C.go_p2_mult_n_acc(acc, x, affine, scalar, C.size_t(nbits)) + return acc +} + +// +// Affine +// + +func (p *P2) ToAffine() *P2Affine { + var pa P2Affine + C.blst_p2_to_affine(&pa, p) + return &pa +} + +func (p *P2) FromAffine(pa *P2Affine) { + C.blst_p2_from_affine(p, pa) +} + +// Hash +func HashToG2(msg []byte, dst []byte, + optional ...[]byte) *P2 { // aug + var q P2 + + 
// Handle zero length message + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + var aug []byte + var augC *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + augC = (*C.byte)(&aug[0]) + } + } + + C.blst_hash_to_g2(&q, msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst)), + augC, C.size_t(len(aug))) + return &q +} + +func EncodeToG2(msg []byte, dst []byte, + optional ...[]byte) *P2 { // aug + var q P2 + + // Handle zero length message + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + var aug []byte + var augC *C.byte + if len(optional) > 0 { + aug = optional[0] + if len(aug) > 0 { + augC = (*C.byte)(&aug[0]) + } + } + + C.blst_encode_to_g2(&q, msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst)), + augC, C.size_t(len(aug))) + return &q +} + +// +// Multi-point/scalar operations +// + +func P2sToAffine(points []*P2, optional ...int) P2Affines { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + ret := make([]P2Affine, npoints) + _cgoCheckPointer := func(...interface{}) {} + C.blst_p2s_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret +} + +func (points P2s) ToAffine(optional ...P2Affines) P2Affines { + npoints := len(points) + var ret P2Affines + + if len(optional) > 0 { // used in benchmark + ret = optional[0] + if len(ret) < npoints { + panic("npoints mismatch") + } + } else { + ret = make([]P2Affine, npoints) + } + + if maxProcs < 2 || npoints < 768 { + C.go_p2slice_to_affine(&ret[0], &points[0], C.size_t(npoints)) + return ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices+1, npoints%nslices + + var wg sync.WaitGroup + wg.Add(nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 1 + go func(out *P2Affine, inp *P2, delta int) { + C.go_p2slice_to_affine(out, inp, C.size_t(delta)) + wg.Done() + }(&ret[x], &points[x], delta) + } + wg.Wait() + + return ret +} + +// +// Batch addition +// + +func P2AffinesAdd(points []*P2Affine, optional ...int) *P2 { + var npoints int + if len(optional) > 0 { + npoints = optional[0] + } else { + npoints = len(points) + } + var ret P2 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p2s_add(&ret, &points[0], C.size_t(npoints)) + return &ret +} + +func (points P2Affines) Add() *P2 { + npoints := len(points) + if maxProcs < 2 || npoints < 768 { + var ret P2 + C.go_p2slice_add(&ret, &points[0], C.size_t(npoints)) + return &ret + } + + nslices := (npoints + 511) / 512 + if nslices > maxProcs { + nslices = maxProcs + } + delta, rem := npoints/nslices+1, npoints%nslices + + msgs := make(chan P2, nslices) + for x := 0; x < npoints; x += delta { + if rem == 0 { + delta -= 1 + } + rem -= 1 + go func(points *P2Affine, delta int) { + var ret P2 + C.go_p2slice_add(&ret, points, C.size_t(delta)) + msgs <- ret + }(&points[x], delta) + } + + ret := <-msgs + for i := 1; i < nslices; i++ { + msg := <-msgs + C.blst_p2_add_or_double(&ret, &ret, &msg) + } + return &ret +} + +func (points P2s) Add() *P2 { + return points.ToAffine().Add() +} + +// +// Multi-scalar multiplication +// + +func P2AffinesMult(pointsIf interface{}, scalarsIf interface{}, nbits int) *P2 { + var npoints int + switch val := pointsIf.(type) { + case []*P2Affine: + npoints = len(val) + case []P2Affine: + 
npoints = len(val) + case P2Affines: + npoints = len(val) + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + nbytes := (nbits + 7) / 8 + var scalars []*C.byte + switch val := scalarsIf.(type) { + case []byte: + if len(val) < npoints*nbytes { + return nil + } + case [][]byte: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = (*C.byte)(&val[i][0]) + } + case []Scalar: + if len(val) < npoints { + return nil + } + if nbits <= 248 { + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + } + case []*Scalar: + if len(val) < npoints { + return nil + } + scalars = make([]*C.byte, npoints) + for i := range scalars { + scalars[i] = &val[i].b[0] + } + default: + panic(fmt.Sprintf("unsupported type %T", val)) + } + + numThreads := maxProcs + numCores := runtime.GOMAXPROCS(0) + if numCores < maxProcs { + numThreads = numCores + } + + if numThreads < 2 || npoints < 32 { + sz := int(C.blst_p2s_mult_pippenger_scratch_sizeof(C.size_t(npoints))) / 8 + scratch := make([]uint64, sz) + + pointsBySlice := [2]*P2Affine{nil, nil} + var p_points **P2Affine + switch val := pointsIf.(type) { + case []*P2Affine: + p_points = &val[0] + case []P2Affine: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + case P2Affines: + pointsBySlice[0] = &val[0] + p_points = &pointsBySlice[0] + } + + scalarsBySlice := [2]*C.byte{nil, nil} + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = (*C.byte)(&val[0]) + p_scalars = &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[0] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = (*C.byte)(&val[0].b[0]) + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[0] + } + case []*Scalar: + p_scalars = &scalars[0] + } + + var ret P2 + _cgoCheckPointer := func(...interface{}) {} + C.blst_p2s_mult_pippenger(&ret, p_points, C.size_t(npoints), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0])) + + for i := range scalars { + scalars[i] = nil + } + + return &ret + } + + // this is sizeof(scratch[0]) + sz := int(C.blst_p2s_mult_pippenger_scratch_sizeof(0)) / 8 + + nx, ny, window := breakdown(nbits, pippenger_window_size(npoints), + numThreads) + + // |grid[]| holds "coordinates" and place for result + grid := make([]struct { + x, dx, y, dy int + point P2 + }, nx*ny) + + dx := npoints / nx + y := window * (ny - 1) + total := 0 + for ; total < nx; total++ { + grid[total].x = total * dx + grid[total].dx = dx + grid[total].y = y + grid[total].dy = nbits - y + } + grid[total-1].dx = npoints - grid[total-1].x + + for y > 0 { + y -= window + for i := 0; i < nx; i++ { + grid[total].x = grid[i].x + grid[total].dx = grid[i].dx + grid[total].y = y + grid[total].dy = window + total++ + } + } + + if numThreads > total { + numThreads = total + } + + msgsCh := make(chan int, ny) + rowSync := make([]int32, ny) // count up to |nx| + curItem := int32(0) + for tid := 0; tid < numThreads; tid++ { + go func() { + scratch := make([]uint64, sz<<uint(window-1)) + pointsBySlice := [2]*P2Affine{nil, nil} + scalarsBySlice := [2]*C.byte{nil, nil} + _cgoCheckPointer := func(...interface{}) {} + + for { + workItem := atomic.AddInt32(&curItem, 1) - 1 + if int(workItem) >= total { + break + } + + x := grid[workItem].x + y := grid[workItem].y + + var p_points **P2Affine + switch val := pointsIf.(type) { + case []*P2Affine: + p_points = &val[x] + case []P2Affine: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + case P2Affines: + pointsBySlice[0] = &val[x] + p_points = &pointsBySlice[0] + } + + var p_scalars **C.byte + switch val := scalarsIf.(type) { + case []byte: + scalarsBySlice[0] = (*C.byte)(&val[x*nbytes]) + p_scalars
= &scalarsBySlice[0] + case [][]byte: + p_scalars = &scalars[x] + case []Scalar: + if nbits > 248 { + scalarsBySlice[0] = (*C.byte)(&val[x].b[0]) + p_scalars = &scalarsBySlice[0] + } else { + p_scalars = &scalars[x] + } + case []*Scalar: + p_scalars = &scalars[x] + } + + C.blst_p2s_tile_pippenger(&grid[workItem].point, + p_points, C.size_t(grid[workItem].dx), + p_scalars, C.size_t(nbits), + (*C.limb_t)(&scratch[0]), + C.size_t(y), C.size_t(window)) + + if atomic.AddInt32(&rowSync[y/window], 1) == int32(nx) { + msgsCh <- y // "row" is done + } else { + runtime.Gosched() // be nice to the application + } + } + + pointsBySlice[0] = nil + scalarsBySlice[0] = nil + }() + } + + var ret P2 + rows := make([]bool, ny) + row := 0 // actually index in |grid[]| + for i := 0; i < ny; i++ { // we expect |ny| messages, one per "row" + y := <-msgsCh + rows[y/window] = true // mark the "row" + for grid[row].y == y { // if it's current "row", process it + for row < total && grid[row].y == y { + C.blst_p2_add_or_double(&ret, &ret, &grid[row].point) + row++ + } + if y == 0 { + break // one can as well 'return &ret' here + } + for j := 0; j < window; j++ { + C.blst_p2_double(&ret, &ret) + } + y -= window + if !rows[y/window] { // see if next "row" was marked already + break + } + } + } + + for i := range scalars { + scalars[i] = nil + } + + return &ret +} + +func (points P2Affines) Mult(scalarsIf interface{}, nbits int) *P2 { + return P2AffinesMult(points, scalarsIf, nbits) +} + +func (points P2s) Mult(scalarsIf interface{}, nbits int) *P2 { + return points.ToAffine().Mult(scalarsIf, nbits) +} + +func parseOpts(optional ...interface{}) ([]byte, [][]byte, bool, bool) { + var aug [][]byte // For aggregate verify + var augSingle []byte // For signing + useHash := true // hash (true), encode (false) + + for _, arg := range optional { + switch v := arg.(type) { + case []byte: + augSingle = v + case [][]byte: + aug = v + case bool: + useHash = v + default: + return nil, nil, useHash, false + } + } + return augSingle, aug, useHash, true +} + +func bytesAllZero(s []byte) bool { + for _, v := range s { + if v != 0 { + return false + } + } + return true +} + +// These methods are inefficient because of cgo call overhead. For this +// reason they should be used primarily for prototyping with a goal to +// formulate interfaces that would process multiple scalars per cgo call. +func (a *Scalar) MulAssign(b *Scalar) (*Scalar, bool) { + return a, bool(C.blst_sk_mul_n_check(a, a, b)) +} + +func (a *Scalar) Mul(b *Scalar) (*Scalar, bool) { + var ret Scalar + return &ret, bool(C.blst_sk_mul_n_check(&ret, a, b)) +} + +func (a *Scalar) AddAssign(b *Scalar) (*Scalar, bool) { + return a, bool(C.blst_sk_add_n_check(a, a, b)) +} + +func (a *Scalar) Add(b *Scalar) (*Scalar, bool) { + var ret Scalar + return &ret, bool(C.blst_sk_add_n_check(&ret, a, b)) +} + +func (a *Scalar) SubAssign(b *Scalar) (*Scalar, bool) { + return a, bool(C.blst_sk_sub_n_check(a, a, b)) +} + +func (a *Scalar) Sub(b *Scalar) (*Scalar, bool) { + var ret Scalar + return &ret, bool(C.blst_sk_sub_n_check(&ret, a, b)) +} + +func (a *Scalar) Inverse() *Scalar { + var ret Scalar + C.blst_sk_inverse(&ret, a) + return &ret +} + +// +// Serialization/Deserialization. 
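
[Editor's sketch, not part of the diff] The comment above notes that each of these Scalar helpers crosses the cgo boundary once per call, so they are intended for prototyping rather than hot loops. The following minimal Go sketch shows how the helpers defined in this file compose; the package name blst and the function name are assumptions made for illustration only.

package blst

import "fmt"

// scalarOpsSketch exercises the scalar arithmetic helpers defined above.
// Each Add/Mul/Inverse call is a single cgo round trip.
func scalarOpsSketch() {
	a := HashToScalar([]byte("input a"), []byte("EXAMPLE-DST"))
	b := HashToScalar([]byte("input b"), []byte("EXAMPLE-DST"))
	if a == nil || b == nil {
		panic("hash-to-scalar failed")
	}

	sum, sumOK := a.Add(b)   // (a + b) mod r, plus a validity flag
	prod, prodOK := a.Mul(b) // (a * b) mod r
	inv := a.Inverse()       // a^(-1) mod r

	fmt.Println("valid:", sumOK, prodOK)
	fmt.Printf("sum:  %x\n", sum.Serialize())
	fmt.Printf("prod: %x\n", prod.Serialize())
	fmt.Printf("inv:  %x\n", inv.Serialize())
}
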
+// + +// Scalar serdes +func (s *Scalar) Serialize() []byte { + var out [BLST_SCALAR_BYTES]byte + C.blst_bendian_from_scalar((*C.byte)(&out[0]), s) + return out[:] +} + +func (s *Scalar) Deserialize(in []byte) *Scalar { + if len(in) != BLST_SCALAR_BYTES || + !C.go_scalar_from_bendian(s, (*C.byte)(&in[0])) { + return nil + } + return s +} + +func (s *Scalar) Valid() bool { + return bool(C.blst_sk_check(s)) +} + +func (s *Scalar) HashTo(msg []byte, dst []byte) bool { + ret := HashToScalar(msg, dst) + if ret != nil { + *s = *ret + return true + } + return false +} + +func HashToScalar(msg []byte, dst []byte) *Scalar { + var ret Scalar + + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + if C.go_hash_to_scalar(&ret, msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst))) { + return &ret + } + + return nil +} + +// +// LEndian +// + +func (fr *Scalar) ToLEndian() []byte { + var arr [BLST_SCALAR_BYTES]byte + C.blst_lendian_from_scalar((*C.byte)(&arr[0]), fr) + return arr[:] +} + +func (fp *Fp) ToLEndian() []byte { + var arr [BLST_FP_BYTES]byte + C.blst_lendian_from_fp((*C.byte)(&arr[0]), fp) + return arr[:] +} + +func (fr *Scalar) FromLEndian(arr []byte) *Scalar { + nbytes := len(arr) + if nbytes < BLST_SCALAR_BYTES || + !C.blst_scalar_from_le_bytes(fr, (*C.byte)(&arr[0]), C.size_t(nbytes)) { + return nil + } + return fr +} + +func (fp *Fp) FromLEndian(arr []byte) *Fp { + if len(arr) != BLST_FP_BYTES { + return nil + } + C.blst_fp_from_lendian(fp, (*C.byte)(&arr[0])) + return fp +} + +// +// BEndian +// + +func (fr *Scalar) ToBEndian() []byte { + var arr [BLST_SCALAR_BYTES]byte + C.blst_bendian_from_scalar((*C.byte)(&arr[0]), fr) + return arr[:] +} + +func (fp *Fp) ToBEndian() []byte { + var arr [BLST_FP_BYTES]byte + C.blst_bendian_from_fp((*C.byte)(&arr[0]), fp) + return arr[:] +} + +func (fr *Scalar) FromBEndian(arr []byte) *Scalar { + nbytes := len(arr) + if nbytes < BLST_SCALAR_BYTES || + !C.blst_scalar_from_be_bytes(fr, (*C.byte)(&arr[0]), C.size_t(nbytes)) { + return nil + } + return fr +} + +func (fp *Fp) FromBEndian(arr []byte) *Fp { + if len(arr) != BLST_FP_BYTES { + return nil + } + C.blst_fp_from_bendian(fp, (*C.byte)(&arr[0])) + return fp +} + +// +// Printing +// + +func PrintBytes(val []byte, name string) { + fmt.Printf("%s = %02x\n", name, val) +} + +func (s *Scalar) Print(name string) { + arr := s.ToBEndian() + PrintBytes(arr[:], name) +} + +func (p *P1Affine) Print(name string) { + fmt.Printf("%s:\n", name) + arr := p.x.ToBEndian() + PrintBytes(arr, " x") + arr = p.y.ToBEndian() + PrintBytes(arr, " y") +} + +func (p *P1) Print(name string) { + fmt.Printf("%s:\n", name) + aff := p.ToAffine() + aff.Print(name) +} + +func (f *Fp2) Print(name string) { + fmt.Printf("%s:\n", name) + arr := f.fp[0].ToBEndian() + PrintBytes(arr, " 0") + arr = f.fp[1].ToBEndian() + PrintBytes(arr, " 1") +} + +func (p *P2Affine) Print(name string) { + fmt.Printf("%s:\n", name) + p.x.Print(" x") + p.y.Print(" y") +} + +func (p *P2) Print(name string) { + fmt.Printf("%s:\n", name) + aff := p.ToAffine() + aff.Print(name) +} + +// +// Equality +// + +func (s1 *Scalar) Equals(s2 *Scalar) bool { + return *s1 == *s2 +} + +func (e1 *Fp) Equals(e2 *Fp) bool { + return *e1 == *e2 +} + +func (e1 *Fp2) Equals(e2 *Fp2) bool { + return *e1 == *e2 +} + +func (e1 *P1Affine) Equals(e2 *P1Affine) bool { + return bool(C.blst_p1_affine_is_equal(e1, e2)) +} + +func (e1 *P1) Equals(e2 *P1) bool { + return 
bool(C.blst_p1_is_equal(e1, e2)) +} + +func (e1 *P2Affine) Equals(e2 *P2Affine) bool { + return bool(C.blst_p2_affine_is_equal(e1, e2)) +} + +func (e1 *P2) Equals(e2 *P2) bool { + return bool(C.blst_p2_is_equal(e1, e2)) +} + +// private thunk for testing + +func expandMessageXmd(msg []byte, dst []byte, len_in_bytes int) []byte { + ret := make([]byte, len_in_bytes) + + var msgC *C.byte + if len(msg) > 0 { + msgC = (*C.byte)(&msg[0]) + } + + var dstC *C.byte + if len(dst) > 0 { + dstC = (*C.byte)(&dst[0]) + } + + C.blst_expand_message_xmd((*C.byte)(&ret[0]), C.size_t(len(ret)), + msgC, C.size_t(len(msg)), + dstC, C.size_t(len(dst))) + return ret +} + +func breakdown(nbits, window, ncpus int) (int, int, int) { + var nx, ny, wnd int + + if nbits > window*ncpus { + nx = 1 + wnd = bits.Len(uint(ncpus) / 4) + if (window + wnd) > 18 { + wnd = window - wnd + } else { + wnd = (nbits/window + ncpus - 1) / ncpus + if (nbits/(window+1)+ncpus-1)/ncpus < wnd { + wnd = window + 1 + } else { + wnd = window + } + } + } else { + nx = 2 + wnd = window - 2 + for (nbits/wnd+1)*nx < ncpus { + nx += 1 + wnd = window - bits.Len(3*uint(nx)/2) + } + nx -= 1 + wnd = window - bits.Len(3*uint(nx)/2) + } + ny = nbits/wnd + 1 + wnd = nbits/ny + 1 + + return nx, ny, wnd +} + +func pippenger_window_size(npoints int) int { + wbits := bits.Len(uint(npoints)) + + if wbits > 13 { + return wbits - 4 + } + if wbits > 5 { + return wbits - 3 + } + return 2 +} diff --git a/crypto/internal/blst/blst.h b/crypto/internal/blst/blst.h new file mode 100644 index 00000000000..1349896a3f8 --- /dev/null +++ b/crypto/internal/blst/blst.h @@ -0,0 +1,482 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLST_H__ +#define __BLST_H__ + +#ifdef __SIZE_TYPE__ +typedef __SIZE_TYPE__ size_t; +#else +#include +#endif + +#if defined(__UINT8_TYPE__) && defined(__UINT32_TYPE__) \ + && defined(__UINT64_TYPE__) +typedef __UINT8_TYPE__ uint8_t; +typedef __UINT32_TYPE__ uint32_t; +typedef __UINT64_TYPE__ uint64_t; +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#elif defined(__BLST_CGO__) +typedef _Bool bool; /* it's assumed that cgo calls modern enough compiler */ +#elif defined(__STDC_VERSION__) && __STDC_VERSION__>=199901 +# define bool _Bool +#else +# define bool int +#endif + +#ifdef SWIG +# define DEFNULL =NULL +#elif defined __cplusplus +# define DEFNULL =0 +#else +# define DEFNULL +#endif + +typedef enum { + BLST_SUCCESS = 0, + BLST_BAD_ENCODING, + BLST_POINT_NOT_ON_CURVE, + BLST_POINT_NOT_IN_GROUP, + BLST_AGGR_TYPE_MISMATCH, + BLST_VERIFY_FAIL, + BLST_PK_IS_INFINITY, + BLST_BAD_SCALAR, +} BLST_ERROR; + +typedef uint8_t byte; +typedef uint64_t limb_t; + +typedef struct { byte b[256/8]; } blst_scalar; +typedef struct { limb_t l[256/8/sizeof(limb_t)]; } blst_fr; +typedef struct { limb_t l[384/8/sizeof(limb_t)]; } blst_fp; +/* 0 is "real" part, 1 is "imaginary" */ +typedef struct { blst_fp fp[2]; } blst_fp2; +typedef struct { blst_fp2 fp2[3]; } blst_fp6; +typedef struct { blst_fp6 fp6[2]; } blst_fp12; + +void blst_scalar_from_uint32(blst_scalar *out, const uint32_t a[8]); +void blst_uint32_from_scalar(uint32_t out[8], const blst_scalar *a); +void blst_scalar_from_uint64(blst_scalar *out, const uint64_t a[4]); +void blst_uint64_from_scalar(uint64_t out[4], const blst_scalar *a); +void blst_scalar_from_bendian(blst_scalar *out, const byte a[32]); +void blst_bendian_from_scalar(byte out[32], const blst_scalar *a); +void 
blst_scalar_from_lendian(blst_scalar *out, const byte a[32]); +void blst_lendian_from_scalar(byte out[32], const blst_scalar *a); +bool blst_scalar_fr_check(const blst_scalar *a); +bool blst_sk_check(const blst_scalar *a); +bool blst_sk_add_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +bool blst_sk_sub_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +bool blst_sk_mul_n_check(blst_scalar *out, const blst_scalar *a, + const blst_scalar *b); +void blst_sk_inverse(blst_scalar *out, const blst_scalar *a); +bool blst_scalar_from_le_bytes(blst_scalar *out, const byte *in, size_t len); +bool blst_scalar_from_be_bytes(blst_scalar *out, const byte *in, size_t len); + +#ifndef SWIG +/* + * BLS12-381-specific Fr operations. + */ +void blst_fr_add(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sub(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_mul_by_3(blst_fr *ret, const blst_fr *a); +void blst_fr_lshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_rshift(blst_fr *ret, const blst_fr *a, size_t count); +void blst_fr_mul(blst_fr *ret, const blst_fr *a, const blst_fr *b); +void blst_fr_sqr(blst_fr *ret, const blst_fr *a); +void blst_fr_cneg(blst_fr *ret, const blst_fr *a, bool flag); +void blst_fr_eucl_inverse(blst_fr *ret, const blst_fr *a); +void blst_fr_inverse(blst_fr *ret, const blst_fr *a); + +void blst_fr_from_uint64(blst_fr *ret, const uint64_t a[4]); +void blst_uint64_from_fr(uint64_t ret[4], const blst_fr *a); +void blst_fr_from_scalar(blst_fr *ret, const blst_scalar *a); +void blst_scalar_from_fr(blst_scalar *ret, const blst_fr *a); + +/* + * BLS12-381-specific Fp operations. + */ +void blst_fp_add(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sub(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_mul_by_3(blst_fp *ret, const blst_fp *a); +void blst_fp_mul_by_8(blst_fp *ret, const blst_fp *a); +void blst_fp_lshift(blst_fp *ret, const blst_fp *a, size_t count); +void blst_fp_mul(blst_fp *ret, const blst_fp *a, const blst_fp *b); +void blst_fp_sqr(blst_fp *ret, const blst_fp *a); +void blst_fp_cneg(blst_fp *ret, const blst_fp *a, bool flag); +void blst_fp_eucl_inverse(blst_fp *ret, const blst_fp *a); +void blst_fp_inverse(blst_fp *ret, const blst_fp *a); +bool blst_fp_sqrt(blst_fp *ret, const blst_fp *a); + +void blst_fp_from_uint32(blst_fp *ret, const uint32_t a[12]); +void blst_uint32_from_fp(uint32_t ret[12], const blst_fp *a); +void blst_fp_from_uint64(blst_fp *ret, const uint64_t a[6]); +void blst_uint64_from_fp(uint64_t ret[6], const blst_fp *a); +void blst_fp_from_bendian(blst_fp *ret, const byte a[48]); +void blst_bendian_from_fp(byte ret[48], const blst_fp *a); +void blst_fp_from_lendian(blst_fp *ret, const byte a[48]); +void blst_lendian_from_fp(byte ret[48], const blst_fp *a); + +/* + * BLS12-381-specific Fp2 operations. 
+ */ +void blst_fp2_add(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sub(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_mul_by_3(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_mul_by_8(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_lshift(blst_fp2 *ret, const blst_fp2 *a, size_t count); +void blst_fp2_mul(blst_fp2 *ret, const blst_fp2 *a, const blst_fp2 *b); +void blst_fp2_sqr(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_cneg(blst_fp2 *ret, const blst_fp2 *a, bool flag); +void blst_fp2_eucl_inverse(blst_fp2 *ret, const blst_fp2 *a); +void blst_fp2_inverse(blst_fp2 *ret, const blst_fp2 *a); +bool blst_fp2_sqrt(blst_fp2 *ret, const blst_fp2 *a); + +/* + * BLS12-381-specific Fp12 operations. + */ +void blst_fp12_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_cyclotomic_sqr(blst_fp12 *ret, const blst_fp12 *a); +void blst_fp12_mul(blst_fp12 *ret, const blst_fp12 *a, const blst_fp12 *b); +void blst_fp12_mul_by_xy00z0(blst_fp12 *ret, const blst_fp12 *a, + const blst_fp6 *xy00z0); +void blst_fp12_conjugate(blst_fp12 *a); +void blst_fp12_inverse(blst_fp12 *ret, const blst_fp12 *a); +/* caveat lector! |n| has to be non-zero and not more than 3! */ +void blst_fp12_frobenius_map(blst_fp12 *ret, const blst_fp12 *a, size_t n); +bool blst_fp12_is_equal(const blst_fp12 *a, const blst_fp12 *b); +bool blst_fp12_is_one(const blst_fp12 *a); +bool blst_fp12_in_group(const blst_fp12 *a); +const blst_fp12 *blst_fp12_one(void); +#endif // SWIG + +/* + * BLS12-381-specific point operations. + */ +typedef struct { blst_fp x, y, z; } blst_p1; +typedef struct { blst_fp x, y; } blst_p1_affine; + +void blst_p1_add(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_or_double(blst_p1 *out, const blst_p1 *a, const blst_p1 *b); +void blst_p1_add_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_add_or_double_affine(blst_p1 *out, const blst_p1 *a, + const blst_p1_affine *b); +void blst_p1_double(blst_p1 *out, const blst_p1 *a); +void blst_p1_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, + size_t nbits); +void blst_p1_cneg(blst_p1 *p, bool cbit); +void blst_p1_to_affine(blst_p1_affine *out, const blst_p1 *in); +void blst_p1_from_affine(blst_p1 *out, const blst_p1_affine *in); +bool blst_p1_on_curve(const blst_p1 *p); +bool blst_p1_in_g1(const blst_p1 *p); +bool blst_p1_is_equal(const blst_p1 *a, const blst_p1 *b); +bool blst_p1_is_inf(const blst_p1 *a); +const blst_p1 *blst_p1_generator(void); + +bool blst_p1_affine_on_curve(const blst_p1_affine *p); +bool blst_p1_affine_in_g1(const blst_p1_affine *p); +bool blst_p1_affine_is_equal(const blst_p1_affine *a, const blst_p1_affine *b); +bool blst_p1_affine_is_inf(const blst_p1_affine *a); +const blst_p1_affine *blst_p1_affine_generator(void); + +typedef struct { blst_fp2 x, y, z; } blst_p2; +typedef struct { blst_fp2 x, y; } blst_p2_affine; + +void blst_p2_add(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_or_double(blst_p2 *out, const blst_p2 *a, const blst_p2 *b); +void blst_p2_add_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_add_or_double_affine(blst_p2 *out, const blst_p2 *a, + const blst_p2_affine *b); +void blst_p2_double(blst_p2 *out, const blst_p2 *a); +void blst_p2_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, + size_t nbits); +void blst_p2_cneg(blst_p2 *p, bool cbit); +void blst_p2_to_affine(blst_p2_affine *out, const blst_p2 *in); +void blst_p2_from_affine(blst_p2 *out, const 
blst_p2_affine *in); +bool blst_p2_on_curve(const blst_p2 *p); +bool blst_p2_in_g2(const blst_p2 *p); +bool blst_p2_is_equal(const blst_p2 *a, const blst_p2 *b); +bool blst_p2_is_inf(const blst_p2 *a); +const blst_p2 *blst_p2_generator(void); + +bool blst_p2_affine_on_curve(const blst_p2_affine *p); +bool blst_p2_affine_in_g2(const blst_p2_affine *p); +bool blst_p2_affine_is_equal(const blst_p2_affine *a, const blst_p2_affine *b); +bool blst_p2_affine_is_inf(const blst_p2_affine *a); +const blst_p2_affine *blst_p2_affine_generator(void); + +/* + * Multi-scalar multiplications and other multi-point operations. + */ + +void blst_p1s_to_affine(blst_p1_affine dst[], const blst_p1 *const points[], + size_t npoints); +void blst_p1s_add(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints); + +size_t blst_p1s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); +void blst_p1s_mult_wbits_precompute(blst_p1_affine table[], size_t wbits, + const blst_p1_affine *const points[], + size_t npoints); +size_t blst_p1s_mult_wbits_scratch_sizeof(size_t npoints); +void blst_p1s_mult_wbits(blst_p1 *ret, const blst_p1_affine table[], + size_t wbits, size_t npoints, + const byte *const scalars[], size_t nbits, + limb_t *scratch); + +size_t blst_p1s_mult_pippenger_scratch_sizeof(size_t npoints); +void blst_p1s_mult_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch); +void blst_p1s_tile_pippenger(blst_p1 *ret, const blst_p1_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch, + size_t bit0, size_t window); + +void blst_p2s_to_affine(blst_p2_affine dst[], const blst_p2 *const points[], + size_t npoints); +void blst_p2s_add(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints); + +size_t blst_p2s_mult_wbits_precompute_sizeof(size_t wbits, size_t npoints); +void blst_p2s_mult_wbits_precompute(blst_p2_affine table[], size_t wbits, + const blst_p2_affine *const points[], + size_t npoints); +size_t blst_p2s_mult_wbits_scratch_sizeof(size_t npoints); +void blst_p2s_mult_wbits(blst_p2 *ret, const blst_p2_affine table[], + size_t wbits, size_t npoints, + const byte *const scalars[], size_t nbits, + limb_t *scratch); + +size_t blst_p2s_mult_pippenger_scratch_sizeof(size_t npoints); +void blst_p2s_mult_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch); +void blst_p2s_tile_pippenger(blst_p2 *ret, const blst_p2_affine *const points[], + size_t npoints, const byte *const scalars[], + size_t nbits, limb_t *scratch, + size_t bit0, size_t window); + +/* + * Hash-to-curve operations. 
+ */ +#ifndef SWIG +void blst_map_to_g1(blst_p1 *out, const blst_fp *u, const blst_fp *v DEFNULL); +void blst_map_to_g2(blst_p2 *out, const blst_fp2 *u, const blst_fp2 *v DEFNULL); +#endif + +void blst_encode_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g1(blst_p1 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +void blst_encode_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); +void blst_hash_to_g2(blst_p2 *out, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, size_t DST_len DEFNULL, + const byte *aug DEFNULL, size_t aug_len DEFNULL); + +/* + * Zcash-compatible serialization/deserialization. + */ +void blst_p1_serialize(byte out[96], const blst_p1 *in); +void blst_p1_compress(byte out[48], const blst_p1 *in); +void blst_p1_affine_serialize(byte out[96], const blst_p1_affine *in); +void blst_p1_affine_compress(byte out[48], const blst_p1_affine *in); +BLST_ERROR blst_p1_uncompress(blst_p1_affine *out, const byte in[48]); +BLST_ERROR blst_p1_deserialize(blst_p1_affine *out, const byte in[96]); + +void blst_p2_serialize(byte out[192], const blst_p2 *in); +void blst_p2_compress(byte out[96], const blst_p2 *in); +void blst_p2_affine_serialize(byte out[192], const blst_p2_affine *in); +void blst_p2_affine_compress(byte out[96], const blst_p2_affine *in); +BLST_ERROR blst_p2_uncompress(blst_p2_affine *out, const byte in[96]); +BLST_ERROR blst_p2_deserialize(blst_p2_affine *out, const byte in[192]); + +/* + * Specification defines two variants, 'minimal-signature-size' and + * 'minimal-pubkey-size'. To unify appearance we choose to distinguish + * them by suffix referring to the public key type, more specifically + * _pk_in_g1 corresponds to 'minimal-pubkey-size' and _pk_in_g2 - to + * 'minimal-signature-size'. It might appear a bit counterintuitive + * in sign call, but no matter how you twist it, something is bound to + * turn a little odd. + */ +/* + * Secret-key operations. + */ +void blst_keygen(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_sk_to_pk_in_g1(blst_p1 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g1(blst_p2 *out_sig, const blst_p2 *hash, + const blst_scalar *SK); +void blst_sk_to_pk_in_g2(blst_p2 *out_pk, const blst_scalar *SK); +void blst_sign_pk_in_g2(blst_p1 *out_sig, const blst_p1 *hash, + const blst_scalar *SK); + +/* + * Pairing interface. 
+ */ +#ifndef SWIG +void blst_miller_loop(blst_fp12 *ret, const blst_p2_affine *Q, + const blst_p1_affine *P); +void blst_miller_loop_n(blst_fp12 *ret, const blst_p2_affine *const Qs[], + const blst_p1_affine *const Ps[], + size_t n); +void blst_final_exp(blst_fp12 *ret, const blst_fp12 *f); +void blst_precompute_lines(blst_fp6 Qlines[68], const blst_p2_affine *Q); +void blst_miller_loop_lines(blst_fp12 *ret, const blst_fp6 Qlines[68], + const blst_p1_affine *P); +bool blst_fp12_finalverify(const blst_fp12 *gt1, const blst_fp12 *gt2); +#endif + +#ifdef __BLST_CGO__ +typedef limb_t blst_pairing; +#elif defined(__BLST_RUST_BINDGEN__) +typedef struct {} blst_pairing; +#else +typedef struct blst_opaque blst_pairing; +#endif + +size_t blst_pairing_sizeof(void); +void blst_pairing_init(blst_pairing *new_ctx, bool hash_or_encode, + const byte *DST DEFNULL, size_t DST_len DEFNULL); +const byte *blst_pairing_get_dst(const blst_pairing *ctx); +void blst_pairing_commit(blst_pairing *ctx); +BLST_ERROR blst_pairing_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *signature, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + bool pk_grpchk, + const blst_p1_affine *signature, + bool sig_grpchk, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + const blst_p1_affine *sig, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g2(blst_pairing *ctx, + const blst_p2_affine *PK, + bool pk_grpchk, + const blst_p1_affine *sig, + bool sig_grpchk, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *signature, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_aggr_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + bool pk_grpchk, + const blst_p2_affine *signature, + bool sig_grpchk, + const byte *msg, size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_mul_n_aggregate_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + const blst_p2_affine *sig, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_chk_n_mul_n_aggr_pk_in_g1(blst_pairing *ctx, + const blst_p1_affine *PK, + bool pk_grpchk, + const blst_p2_affine *sig, + bool sig_grpchk, + const byte *scalar, + size_t nbits, + const byte *msg, + size_t msg_len, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_pairing_merge(blst_pairing *ctx, const blst_pairing *ctx1); +bool blst_pairing_finalverify(const blst_pairing *ctx, + const blst_fp12 *gtsig DEFNULL); + + +/* + * Customarily applications aggregate signatures separately. + * In which case application would have to pass NULLs for |signature| + * to blst_pairing_aggregate calls and pass aggregated signature + * collected with these calls to blst_pairing_finalverify. Inputs are + * Zcash-compatible "straight-from-wire" byte vectors, compressed or + * not. 
+ */ +BLST_ERROR blst_aggregate_in_g1(blst_p1 *out, const blst_p1 *in, + const byte *zwire); +BLST_ERROR blst_aggregate_in_g2(blst_p2 *out, const blst_p2 *in, + const byte *zwire); + +void blst_aggregated_in_g1(blst_fp12 *out, const blst_p1_affine *signature); +void blst_aggregated_in_g2(blst_fp12 *out, const blst_p2_affine *signature); + +/* + * "One-shot" CoreVerify entry points. + */ +BLST_ERROR blst_core_verify_pk_in_g1(const blst_p1_affine *pk, + const blst_p2_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); +BLST_ERROR blst_core_verify_pk_in_g2(const blst_p2_affine *pk, + const blst_p1_affine *signature, + bool hash_or_encode, + const byte *msg, size_t msg_len, + const byte *DST DEFNULL, + size_t DST_len DEFNULL, + const byte *aug DEFNULL, + size_t aug_len DEFNULL); + +extern const blst_p1_affine BLS12_381_G1; +extern const blst_p1_affine BLS12_381_NEG_G1; +extern const blst_p2_affine BLS12_381_G2; +extern const blst_p2_affine BLS12_381_NEG_G2; + +#include "blst_aux.h" + +#ifdef __cplusplus +} +#endif +#endif diff --git a/crypto/internal/blst/blst_aux.h b/crypto/internal/blst/blst_aux.h new file mode 100644 index 00000000000..3de0850e330 --- /dev/null +++ b/crypto/internal/blst/blst_aux.h @@ -0,0 +1,117 @@ +/* + * Copyright Supranational LLC + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ +#ifndef __BLST_AUX_H__ +#define __BLST_AUX_H__ +/* + * This file lists interfaces that might be promoted to blst.h or removed, + * depending on their proven/unproven worthiness. + */ + +void blst_fr_ct_bfly(blst_fr *x0, blst_fr *x1, const blst_fr *twiddle); +void blst_fr_gs_bfly(blst_fr *x0, blst_fr *x1, const blst_fr *twiddle); +void blst_fr_to(blst_fr *ret, const blst_fr *a); +void blst_fr_from(blst_fr *ret, const blst_fr *a); +#ifdef BLST_FR_PENTAROOT +void blst_fr_pentaroot(blst_fr *ret, const blst_fr *a); +void blst_fr_pentapow(blst_fr *ret, const blst_fr *a); +#endif + +void blst_fp_to(blst_fp *ret, const blst_fp *a); +void blst_fp_from(blst_fp *ret, const blst_fp *a); + +bool blst_fp_is_square(const blst_fp *a); +bool blst_fp2_is_square(const blst_fp2 *a); + +void blst_p1_from_jacobian(blst_p1 *out, const blst_p1 *in); +void blst_p2_from_jacobian(blst_p2 *out, const blst_p2 *in); + +/* + * Below functions produce both point and deserialized outcome of + * SkToPk and Sign. However, deserialized outputs are pre-decorated + * with sign and infinity bits. This means that you have to bring the + * output into compliance prior returning to application. If you want + * compressed point value, then do [equivalent of] + * + * byte temp[96]; + * blst_sk_to_pk2_in_g1(temp, out_pk, SK); + * temp[0] |= 0x80; + * memcpy(out, temp, 48); + * + * Otherwise do + * + * blst_sk_to_pk2_in_g1(out, out_pk, SK); + * out[0] &= ~0x20; + * + * Either |out| or |out_| can be NULL. 
+ */ +void blst_sk_to_pk2_in_g1(byte out[96], blst_p1_affine *out_pk, + const blst_scalar *SK); +void blst_sign_pk2_in_g1(byte out[192], blst_p2_affine *out_sig, + const blst_p2 *hash, const blst_scalar *SK); +void blst_sk_to_pk2_in_g2(byte out[192], blst_p2_affine *out_pk, + const blst_scalar *SK); +void blst_sign_pk2_in_g2(byte out[96], blst_p1_affine *out_sig, + const blst_p1 *hash, const blst_scalar *SK); + +#ifdef __BLST_RUST_BINDGEN__ +typedef struct {} blst_uniq; +#else +typedef struct blst_opaque blst_uniq; +#endif + +size_t blst_uniq_sizeof(size_t n_nodes); +void blst_uniq_init(blst_uniq *tree); +bool blst_uniq_test(blst_uniq *tree, const byte *msg, size_t len); + +#ifdef expand_message_xmd +void expand_message_xmd(unsigned char *bytes, size_t len_in_bytes, + const unsigned char *aug, size_t aug_len, + const unsigned char *msg, size_t msg_len, + const unsigned char *DST, size_t DST_len); +#else +void blst_expand_message_xmd(byte *out, size_t out_len, + const byte *msg, size_t msg_len, + const byte *DST, size_t DST_len); +#endif + +void blst_p1_unchecked_mult(blst_p1 *out, const blst_p1 *p, const byte *scalar, + size_t nbits); +void blst_p2_unchecked_mult(blst_p2 *out, const blst_p2 *p, const byte *scalar, + size_t nbits); + +void blst_pairing_raw_aggregate(blst_pairing *ctx, const blst_p2_affine *q, + const blst_p1_affine *p); +blst_fp12 *blst_pairing_as_fp12(blst_pairing *ctx); +void blst_bendian_from_fp12(byte out[48*12], const blst_fp12 *a); + +void blst_keygen_v3(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_keygen_v4_5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *salt, size_t salt_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_keygen_v5(blst_scalar *out_SK, const byte *IKM, size_t IKM_len, + const byte *salt, size_t salt_len, + const byte *info DEFNULL, size_t info_len DEFNULL); +void blst_derive_master_eip2333(blst_scalar *out_SK, + const byte *IKM, size_t IKM_len); +void blst_derive_child_eip2333(blst_scalar *out_SK, const blst_scalar *SK, + uint32_t child_index); + +void blst_scalar_from_hexascii(blst_scalar *out, const byte *hex); +void blst_fr_from_hexascii(blst_fr *ret, const byte *hex); +void blst_fp_from_hexascii(blst_fp *ret, const byte *hex); + +size_t blst_p1_sizeof(void); +size_t blst_p1_affine_sizeof(void); +size_t blst_p2_sizeof(void); +size_t blst_p2_affine_sizeof(void); +size_t blst_fp12_sizeof(void); + +/* + * Single-shot SHA-256 hash function. + */ +void blst_sha256(byte out[32], const byte *msg, size_t msg_len); +#endif diff --git a/crypto/relic_build.sh b/crypto/relic_build.sh deleted file mode 100755 index 6cff3a6b478..00000000000 --- a/crypto/relic_build.sh +++ /dev/null @@ -1,90 +0,0 @@ -#!/bin/bash - -set -euo pipefail - -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -pushd "$DIR" - -# Ensure the directory is writeable -chmod -R +w "$(pwd)" - -mkdir -p "$DIR/relic/build" -pushd "$DIR/relic/build" - - -# make cmake print its CC interpretation -CMAKE_FILE="${DIR}/relic/CMakeLists.txt" -# parameter expansion is not suitable here -# shellcheck disable=SC2089 -CMAKE_PRINT_CC="message ( STATUS \"CC=\$ENV{CC}\" )" -# Make the cmake run print its interpretation of CC -echo "$CMAKE_PRINT_CC" >> "${CMAKE_FILE}" - -# Probe cmake's MakeFile generation and extract the CC version -CMAKE_TEMP=$(mktemp) -cmake .. 
> "$CMAKE_TEMP" -CC_VAL="$(tail -n 5 "$CMAKE_TEMP" | grep -oE -m 1 'CC=.*$')" -CC_VAL="${CC_VAL:3}" - -# de-mangle the CMakeLists file, using a temporary file for BSD compatibility -sed '$d' ../CMakeLists.txt > "$CMAKE_TEMP" -mv "$CMAKE_TEMP" ../CMakeLists.txt - -# default to which -CC_VAL=${CC_VAL:-"$(which cc)"} -CC_VERSION_STR="$($CC_VAL --version)" - -# we use uname to record which arch we are running on -ARCH=$(uname -m 2>/dev/null || true) - -if [[ "$ARCH" =~ "x86_64" ]]; then - # Compile as westmere arch to avoid cross-compilation issues on machines not supporting AVX extensions. - # Relic performance as used in flow crypto library is not impacted by whether it is compiled with "native" or "westmere", as proven by benchmark results. - MARCH="-march=westmere" -elif [[ "$ARCH" =~ ^(arm64|armv7|armv7s)$ && "${CC_VERSION_STR[0]}" =~ (clang) ]]; then - # the "-march=native" option is not supported with clang on ARM - MARCH="" -else - MARCH="-march=native" -fi - -# Set RELIC config for Flow -COMP=(-DCFLAGS="-O3 -funroll-loops -fomit-frame-pointer ${MARCH} -mtune=native") -GENERAL=(-DTIMER=CYCLE -DCHECK=OFF -DVERBS=OFF) -LIBS=(-DSHLIB=OFF -DSTLIB=ON) -RAND=(-DRAND=HASHD -DSEED=) - -# -BN_REP=(-DALLOC=AUTO -DALIGN=1 -DWSIZE=64 -DBN_PRECI=1024 -DBN_MAGNI=DOUBLE) -ARITH=(-DARITH=EASY) -PRIME=(-DFP_PRIME=381) - -# -BN_METH=(-DBN_KARAT=0 -DBN_METHD="COMBA;COMBA;MONTY;SLIDE;BINAR;BASIC") -FP_METH=(-DFP_KARAT=0 -DFP_METHD="INTEG;INTEG;INTEG;MONTY;MONTY;JMPDS;SLIDE") -PRIMES=(-DFP_PMERS=OFF -DFP_QNRES=ON) -FPX_METH=(-DFPX_METHD="INTEG;INTEG;LAZYR") -EP_METH=(-DEP_MIXED=ON -DEP_PLAIN=OFF -DEP_ENDOM=ON -DEP_SUPER=OFF\ - -DEP_CTMAP=ON -DEP_METHD="JACOB;LWNAF;COMBS;INTER") -PP_METH=(-DPP_METHD="LAZYR;OATEP") - -# run cmake -cmake "${COMP[@]}" "${GENERAL[@]}" \ - "${LIBS[@]}" "${RAND[@]}" \ - "${BN_REP[@]}" "${ARITH[@]}" \ - "${PRIME[@]}" "${PRIMES[@]}" \ - "${EP_METH[@]}" \ - "${BN_METH[@]}" \ - "${FP_METH[@]}" \ - "${FPX_METH[@]}" \ - "${PP_METH[@]}" .. - - -# Compile the static library -make clean -make relic_s -j8 -rm -f CMakeCache.txt - -popd -popd diff --git a/crypto/sign.go b/crypto/sign.go index 68196acba2d..d400898d97d 100644 --- a/crypto/sign.go +++ b/crypto/sign.go @@ -49,44 +49,36 @@ type signer interface { decodePublicKeyCompressed([]byte) (PublicKey, error) } -// newNonRelicSigner returns a signer that does not depend on Relic library. 
-func newNonRelicSigner(algo SigningAlgorithm) (signer, error) { +// newSigner returns a signer instance +func newSigner(algo SigningAlgorithm) (signer, error) { switch algo { case ECDSAP256: return p256Instance, nil case ECDSASecp256k1: return secp256k1Instance, nil + case BLSBLS12381: + return blsInstance, nil default: return nil, invalidInputsErrorf("the signature scheme %s is not supported", algo) } } -// Initialize the context of all algos not requiring Relic -func initNonRelic() { - // P-256 +// Initialize the context of all algos +func init() { + // ECDSA p256Instance = &(ecdsaAlgo{ curve: elliptic.P256(), algo: ECDSAP256, }) - - // secp256k1 secp256k1Instance = &(ecdsaAlgo{ curve: btcec.S256(), algo: ECDSASecp256k1, }) -} -// Signature format Check for non-relic algos (ECDSA) -func signatureFormatCheckNonRelic(algo SigningAlgorithm, s Signature) (bool, error) { - switch algo { - case ECDSAP256: - return p256Instance.signatureFormatCheck(s), nil - case ECDSASecp256k1: - return secp256k1Instance.signatureFormatCheck(s), nil - default: - return false, invalidInputsErrorf( - "the signature scheme %s is not supported", - algo) + // BLS + initBLS12381() + blsInstance = &blsBLS12381Algo{ + algo: BLSBLS12381, } } @@ -98,8 +90,16 @@ func signatureFormatCheckNonRelic(algo SigningAlgorithm, s Signature) (bool, err // If SignatureFormatCheck returns false then the input is not a valid // signature and will fail a verification against any message and public key. func SignatureFormatCheck(algo SigningAlgorithm, s Signature) (bool, error) { - // For now, signatureFormatCheckNonRelic is only defined for non-Relic algos. - return signatureFormatCheckNonRelic(algo, s) + switch algo { + case ECDSAP256: + return p256Instance.signatureFormatCheck(s), nil + case ECDSASecp256k1: + return secp256k1Instance.signatureFormatCheck(s), nil + default: + return false, invalidInputsErrorf( + "the signature scheme %s is not supported", + algo) + } } // GeneratePrivateKey generates a private key of the algorithm using the entropy of the given seed. diff --git a/crypto/sign_norelic.go b/crypto/sign_norelic.go deleted file mode 100644 index 7e6dd4c0d10..00000000000 --- a/crypto/sign_norelic.go +++ /dev/null @@ -1,13 +0,0 @@ -//go:build !relic -// +build !relic - -package crypto - -// newSigner chooses and initializes a signature scheme -func newSigner(algo SigningAlgorithm) (signer, error) { - return newNonRelicSigner(algo) -} - -func init() { - initNonRelic() -} diff --git a/crypto/sign_relic.go b/crypto/sign_relic.go deleted file mode 100644 index 980fca20c51..00000000000 --- a/crypto/sign_relic.go +++ /dev/null @@ -1,42 +0,0 @@ -//go:build relic -// +build relic - -package crypto - -import ( - "fmt" -) - -// newSigner chooses and initializes a signature scheme -func newSigner(algo SigningAlgorithm) (signer, error) { - // try Relic algos - if signer := relicSigner(algo); signer != nil { - return signer, nil - } - // return a non-Relic algo - return newNonRelicSigner(algo) -} - -// relicSigner returns a signer that depends on Relic library. 
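
[Editor's sketch, not part of the diff] With the unified newSigner and init shown in sign.go above, BLS on BLS12-381 is available to importers without any build tag. The hedged Go sketch below mirrors the calls exercised by the tests later in this diff (GeneratePrivateKey, DecodePrivateKey, KeyGenSeedMinLen); it is illustrative only and makes no claim about the package beyond what the diff shows.

package main

import (
	crand "crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto"
)

func main() {
	// BLS key generation through the generic signer path, no `relic` tag needed.
	seed := make([]byte, crypto.KeyGenSeedMinLen)
	if _, err := crand.Read(seed); err != nil {
		panic(err)
	}

	sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed)
	if err != nil {
		panic(err)
	}

	// Encoding round trip using the generic key API.
	enc := sk.Encode()
	back, err := crypto.DecodePrivateKey(crypto.BLSBLS12381, enc)
	if err != nil {
		panic(err)
	}
	fmt.Println("round trip ok:", sk.Equals(back), "pk size:", sk.PublicKey().Size())
}
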
-func relicSigner(algo SigningAlgorithm) signer { - if algo == BLSBLS12381 { - return blsInstance - } - return nil -} - -// Initialize Relic with the BLS context on BLS 12-381 -func init() { - initRelic() - initNonRelic() -} - -// Initialize the context of all algos requiring Relic -func initRelic() { - blsInstance = &blsBLS12381Algo{ - algo: BLSBLS12381, - } - if err := blsInstance.init(); err != nil { - panic(fmt.Sprintf("initialization of BLS failed: %s", err.Error())) - } -} diff --git a/crypto/sign_test_utils.go b/crypto/sign_test_utils.go index a98f7d0713b..9ecc684a4be 100644 --- a/crypto/sign_test_utils.go +++ b/crypto/sign_test_utils.go @@ -55,60 +55,55 @@ func TestHasherErrors(t *testing.T) { // tests sign and verify are consistent for multiple generated keys and messages func testGenSignVerify(t *testing.T, salg SigningAlgorithm, halg hash.Hasher) { - t.Logf("Testing Generation/Signature/Verification for %s", salg) - // make sure the length is larger than minimum lengths of all the signaure algos - seedMinLength := 48 - seed := make([]byte, seedMinLength) - input := make([]byte, 100) - rand := getPRG(t) - - loops := 50 - for j := 0; j < loops; j++ { - n, err := rand.Read(seed) - require.Equal(t, n, seedMinLength) - require.NoError(t, err) - sk, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - _, err = rand.Read(input) - require.NoError(t, err) - s, err := sk.Sign(input, halg) - require.NoError(t, err) - pk := sk.PublicKey() + t.Run(fmt.Sprintf("Generation/Signature/Verification for %s", salg), func(t *testing.T) { + seed := make([]byte, KeyGenSeedMinLen) + input := make([]byte, 100) + rand := getPRG(t) - // test a valid signature - result, err := pk.Verify(s, input, halg) - require.NoError(t, err) - assert.True(t, result, fmt.Sprintf( - "Verification should succeed:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) + loops := 50 + for j := 0; j < loops; j++ { + n, err := rand.Read(seed) + require.Equal(t, n, KeyGenSeedMinLen) + require.NoError(t, err) + sk, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + _, err = rand.Read(input) + require.NoError(t, err) + s, err := sk.Sign(input, halg) + require.NoError(t, err) + pk := sk.PublicKey() - // test with a different message - input[0] ^= 1 - result, err = pk.Verify(s, input, halg) - require.NoError(t, err) - assert.False(t, result, fmt.Sprintf( - "Verification should fail:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) - input[0] ^= 1 + // test a valid signature + result, err := pk.Verify(s, input, halg) + require.NoError(t, err) + assert.True(t, result) - // test with a valid but different key - seed[0] ^= 1 - wrongSk, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - result, err = wrongSk.PublicKey().Verify(s, input, halg) - require.NoError(t, err) - assert.False(t, result, fmt.Sprintf( - "Verification should fail:\n signature:%s\n message:%x\n private key:%s", s, input, sk)) + // test with a different message + input[0] ^= 1 + result, err = pk.Verify(s, input, halg) + require.NoError(t, err) + assert.False(t, result) + input[0] ^= 1 - // test a wrong signature length - invalidLen := rand.Intn(2 * len(s)) // try random invalid lengths - if invalidLen == len(s) { // map to an invalid length - invalidLen = 0 + // test with a valid but different key + seed[0] ^= 1 + wrongSk, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + result, err = wrongSk.PublicKey().Verify(s, input, halg) + require.NoError(t, err) + assert.False(t, result) + + // test 
a wrong signature length + invalidLen := rand.Intn(2 * len(s)) // try random invalid lengths + if invalidLen == len(s) { // map to an invalid length + invalidLen = 0 + } + invalidSig := make([]byte, invalidLen) + result, err = pk.Verify(invalidSig, input, halg) + require.NoError(t, err) + assert.False(t, result) } - invalidSig := make([]byte, invalidLen) - result, err = pk.Verify(invalidSig, input, halg) - require.NoError(t, err) - assert.False(t, result, fmt.Sprintf( - "Verification should fail:\n signature:%s\n with invalid length %d", invalidSig, invalidLen)) - } + }) } // tests the key generation constraints with regards to the input seed, mainly @@ -138,7 +133,6 @@ func testKeyGenSeed(t *testing.T, salg SigningAlgorithm, minLen int, maxLen int) }) t.Run("deterministic generation", func(t *testing.T) { - // same seed results in the same key seed := make([]byte, minLen) read, err := crand.Read(seed) @@ -162,161 +156,164 @@ var BLS12381Order = []byte{0x73, 0xED, 0xA7, 0x53, 0x29, 0x9D, 0x7D, 0x48, 0x33, 0x5B, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x01} func testEncodeDecode(t *testing.T, salg SigningAlgorithm) { - t.Logf("Testing encode/decode for %s", salg) - rand := getPRG(t) - // make sure the length is larger than minimum lengths of all the signaure algos - seedMinLength := 48 + t.Run(fmt.Sprintf("generic encode/decode for %s", salg), func(t *testing.T) { + rand := getPRG(t) + + t.Run("happy path tests", func(t *testing.T) { + loops := 50 + for j := 0; j < loops; j++ { + // generate a private key + seed := make([]byte, KeyGenSeedMinLen) + read, err := rand.Read(seed) + require.Equal(t, read, KeyGenSeedMinLen) + require.NoError(t, err) + sk, err := GeneratePrivateKey(salg, seed) + assert.Nil(t, err) + seed[0] ^= 1 // alter the seed to get a new private key + distinctSk, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + + // check private key encoding + skBytes := sk.Encode() + skCheck, err := DecodePrivateKey(salg, skBytes) + require.Nil(t, err) + assert.True(t, sk.Equals(skCheck)) + skCheckBytes := skCheck.Encode() + assert.Equal(t, skBytes, skCheckBytes) + distinctSkBytes := distinctSk.Encode() + assert.NotEqual(t, skBytes, distinctSkBytes) + + // check public key encoding + pk := sk.PublicKey() + pkBytes := pk.Encode() + pkCheck, err := DecodePublicKey(salg, pkBytes) + require.Nil(t, err) + assert.True(t, pk.Equals(pkCheck)) + pkCheckBytes := pkCheck.Encode() + assert.Equal(t, pkBytes, pkCheckBytes) + distinctPkBytes := distinctSk.PublicKey().Encode() + assert.NotEqual(t, pkBytes, distinctPkBytes) + + // same for the compressed encoding + // skip is BLS is used and compression isn't supported + if !(salg == BLSBLS12381 && !isG2Compressed()) { + pkComprBytes := pk.EncodeCompressed() + pkComprCheck, err := DecodePublicKeyCompressed(salg, pkComprBytes) + require.Nil(t, err) + assert.True(t, pk.Equals(pkComprCheck)) + pkCheckComprBytes := pkComprCheck.EncodeCompressed() + assert.Equal(t, pkComprBytes, pkCheckComprBytes) + distinctPkComprBytes := distinctSk.PublicKey().EncodeCompressed() + assert.NotEqual(t, pkComprBytes, distinctPkComprBytes) + } + } + }) + + // test invalid private keys (equal to the curve group order) + + t.Run("private keys equal to the group order", func(t *testing.T) { + groupOrder := make(map[SigningAlgorithm][]byte) + groupOrder[ECDSAP256] = []byte{255, 255, 255, 255, 0, 0, 0, 0, 255, 255, 255, + 255, 255, 255, 255, 255, 188, 230, 250, 173, 167, + 23, 158, 132, 243, 185, 202, 194, 252, 99, 37, 81} + + groupOrder[ECDSASecp256k1] = 
[]byte{255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 254, 186, 174, 220, 230, + 175, 72, 160, 59, 191, 210, 94, 140, 208, 54, 65, 65} + + groupOrder[BLSBLS12381] = BLS12381Order + + sk, err := DecodePrivateKey(salg, groupOrder[salg]) + require.Error(t, err) + assert.True(t, IsInvalidInputsError(err)) + assert.Nil(t, sk) + }) + + // test invalid private and public keys (invalid length) + t.Run("invalid key length", func(t *testing.T) { + // private key + skLens := make(map[SigningAlgorithm]int) + skLens[ECDSAP256] = PrKeyLenECDSAP256 + skLens[ECDSASecp256k1] = PrKeyLenECDSASecp256k1 + skLens[BLSBLS12381] = 32 + + bytes := make([]byte, skLens[salg]+1) + sk, err := DecodePrivateKey(salg, bytes) + require.Error(t, err) + assert.True(t, IsInvalidInputsError(err)) + assert.Nil(t, sk) - t.Run("happy path tests", func(t *testing.T) { - loops := 50 - for j := 0; j < loops; j++ { - // generate a private key - seed := make([]byte, seedMinLength) - read, err := rand.Read(seed) - require.Equal(t, read, seedMinLength) - require.NoError(t, err) - sk, err := GeneratePrivateKey(salg, seed) - assert.Nil(t, err, "the key generation failed") - seed[0] ^= 1 // alter the seed to get a new private key - distinctSk, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) + // public key + pkLens := make(map[SigningAlgorithm]int) + pkLens[ECDSAP256] = PubKeyLenECDSAP256 + pkLens[ECDSASecp256k1] = PubKeyLenECDSASecp256k1 + pkLens[BLSBLS12381] = 96 - // check private key encoding - skBytes := sk.Encode() - skCheck, err := DecodePrivateKey(salg, skBytes) - require.Nil(t, err, "the key decoding failed") - assert.True(t, sk.Equals(skCheck), "key equality check failed") - skCheckBytes := skCheck.Encode() - assert.Equal(t, skBytes, skCheckBytes, "keys should be equal") - distinctSkBytes := distinctSk.Encode() - assert.NotEqual(t, skBytes, distinctSkBytes, "keys should be different") - - // check public key encoding - pk := sk.PublicKey() - pkBytes := pk.Encode() - pkCheck, err := DecodePublicKey(salg, pkBytes) - require.Nil(t, err, "the key decoding failed") - assert.True(t, pk.Equals(pkCheck), "key equality check failed") - pkCheckBytes := pkCheck.Encode() - assert.Equal(t, pkBytes, pkCheckBytes, "keys should be equal") - distinctPkBytes := distinctSk.PublicKey().Encode() - assert.NotEqual(t, pkBytes, distinctPkBytes, "keys should be different") - - // same for the compressed encoding - pkComprBytes := pk.EncodeCompressed() - pkComprCheck, err := DecodePublicKeyCompressed(salg, pkComprBytes) - require.Nil(t, err, "the key decoding failed") - assert.True(t, pk.Equals(pkComprCheck), "key equality check failed") - pkCheckComprBytes := pkComprCheck.EncodeCompressed() - assert.Equal(t, pkComprBytes, pkCheckComprBytes, "keys should be equal") - distinctPkComprBytes := distinctSk.PublicKey().EncodeCompressed() - assert.NotEqual(t, pkComprBytes, distinctPkComprBytes, "keys should be different") - } + bytes = make([]byte, pkLens[salg]+1) + pk, err := DecodePublicKey(salg, bytes) + require.Error(t, err) + assert.True(t, IsInvalidInputsError(err)) + assert.Nil(t, pk) + }) }) +} - // test invalid private keys (equal to the curve group order) - t.Run("private keys equal to the group order", func(t *testing.T) { - groupOrder := make(map[SigningAlgorithm][]byte) - groupOrder[ECDSAP256] = []byte{255, 255, 255, 255, 0, 0, 0, 0, 255, 255, 255, - 255, 255, 255, 255, 255, 188, 230, 250, 173, 167, - 23, 158, 132, 243, 185, 202, 194, 252, 99, 37, 81} - - groupOrder[ECDSASecp256k1] = []byte{255, 255, 
255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 254, 186, 174, 220, 230, - 175, 72, 160, 59, 191, 210, 94, 140, 208, 54, 65, 65} +func testEquals(t *testing.T, salg SigningAlgorithm, otherSigAlgo SigningAlgorithm) { + t.Run(fmt.Sprintf("equals for %s", salg), func(t *testing.T) { + rand := getPRG(t) + // generate a key pair + seed := make([]byte, KeyGenSeedMinLen) + n, err := rand.Read(seed) + require.Equal(t, n, KeyGenSeedMinLen) + require.NoError(t, err) - groupOrder[BLSBLS12381] = BLS12381Order + // first pair + sk1, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + pk1 := sk1.PublicKey() - sk, err := DecodePrivateKey(salg, groupOrder[salg]) - require.Error(t, err, "the key decoding should fail - private key value is too large") - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, sk) - }) + // second pair without changing the seed + sk2, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + pk2 := sk2.PublicKey() - // test invalid private and public keys (invalid length) - t.Run("invalid key length", func(t *testing.T) { - // private key - skLens := make(map[SigningAlgorithm]int) - skLens[ECDSAP256] = PrKeyLenECDSAP256 - skLens[ECDSASecp256k1] = PrKeyLenECDSASecp256k1 - skLens[BLSBLS12381] = 32 - - bytes := make([]byte, skLens[salg]+1) - sk, err := DecodePrivateKey(salg, bytes) - require.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, sk) + // unrelated algo pair + sk3, err := GeneratePrivateKey(otherSigAlgo, seed) + require.NoError(t, err) + pk3 := sk3.PublicKey() - // public key - pkLens := make(map[SigningAlgorithm]int) - pkLens[ECDSAP256] = PubKeyLenECDSAP256 - pkLens[ECDSASecp256k1] = PubKeyLenECDSASecp256k1 - pkLens[BLSBLS12381] = 96 + // fourth pair with same algo but a different seed + seed[0] ^= 1 + sk4, err := GeneratePrivateKey(salg, seed) + require.NoError(t, err) + pk4 := sk4.PublicKey() - bytes = make([]byte, pkLens[salg]+1) - pk, err := DecodePublicKey(salg, bytes) - require.Error(t, err) - assert.True(t, IsInvalidInputsError(err)) - assert.Nil(t, pk) + // tests + assert.True(t, sk1.Equals(sk2)) + assert.True(t, pk1.Equals(pk2)) + assert.False(t, sk1.Equals(sk3)) + assert.False(t, pk1.Equals(pk3)) + assert.False(t, sk1.Equals(sk4)) + assert.False(t, pk1.Equals(pk4)) }) } -func testEquals(t *testing.T, salg SigningAlgorithm, otherSigAlgo SigningAlgorithm) { - t.Logf("Testing Equals for %s", salg) - rand := getPRG(t) - // make sure the length is larger than minimum lengths of all the signaure algos - seedMinLength := 48 - - // generate a key pair - seed := make([]byte, seedMinLength) - n, err := rand.Read(seed) - require.Equal(t, n, seedMinLength) - require.NoError(t, err) - - // first pair - sk1, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - pk1 := sk1.PublicKey() - - // second pair without changing the seed - sk2, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - pk2 := sk2.PublicKey() - - // unrelated algo pair - sk3, err := GeneratePrivateKey(otherSigAlgo, seed) - require.NoError(t, err) - pk3 := sk3.PublicKey() - - // fourth pair with same algo but a different seed - seed[0] ^= 1 - sk4, err := GeneratePrivateKey(salg, seed) - require.NoError(t, err) - pk4 := sk4.PublicKey() - - // tests - assert.True(t, sk1.Equals(sk2), "key equality should return true") - assert.True(t, pk1.Equals(pk2), "key equality should return true") - assert.False(t, sk1.Equals(sk3), "key equality should return false") - assert.False(t, pk1.Equals(pk3), "key equality should return 
false") - assert.False(t, sk1.Equals(sk4), "key equality should return false") - assert.False(t, pk1.Equals(pk4), "key equality should return false") -} - func testKeysAlgorithm(t *testing.T, sk PrivateKey, salg SigningAlgorithm) { - t.Logf("Testing key.Algorithm for %s", salg) - alg := sk.Algorithm() - assert.Equal(t, alg, salg) - alg = sk.PublicKey().Algorithm() - assert.Equal(t, alg, salg) + t.Run(fmt.Sprintf("key.Algorithm for %s", salg), func(t *testing.T) { + alg := sk.Algorithm() + assert.Equal(t, alg, salg) + alg = sk.PublicKey().Algorithm() + assert.Equal(t, alg, salg) + }) } func testKeySize(t *testing.T, sk PrivateKey, skLen int, pkLen int) { - t.Logf("Testing key.Size for %s", sk.Algorithm()) - size := sk.Size() - assert.Equal(t, size, skLen) - size = sk.PublicKey().Size() - assert.Equal(t, size, pkLen) + t.Run(fmt.Sprintf("key.Size for %s", sk.Algorithm()), func(t *testing.T) { + size := sk.Size() + assert.Equal(t, size, skLen) + size = sk.PublicKey().Size() + assert.Equal(t, size, pkLen) + }) } func benchVerify(b *testing.B, algo SigningAlgorithm, halg hash.Hasher) { diff --git a/crypto/spock.go b/crypto/spock.go index 2487f39ce1b..da269c23ac1 100644 --- a/crypto/spock.go +++ b/crypto/spock.go @@ -1,13 +1,8 @@ -//go:build relic -// +build relic - package crypto // SPoCK design based on the BLS signature scheme. // BLS is using BLS12-381 curve and the same settings in bls.go. -// #cgo CFLAGS: -g -Wall -std=c99 -// #cgo LDFLAGS: -L${SRCDIR}/relic/build/lib -l relic_s // #include "bls_include.h" import "C" import ( @@ -78,7 +73,7 @@ func SPOCKVerify(pk1 PublicKey, proof1 Signature, pk2 PublicKey, proof2 Signatur return false, notBLSKeyError } - if len(proof1) != signatureLengthBLSBLS12381 || len(proof2) != signatureLengthBLSBLS12381 { + if len(proof1) != g1BytesLen || len(proof2) != g1BytesLen { return false, nil } @@ -90,9 +85,9 @@ func SPOCKVerify(pk1 PublicKey, proof1 Signature, pk2 PublicKey, proof2 Signatur } // verify the spock proof using the secret data - verif := C.bls_spock_verify((*C.ep2_st)(&blsPk1.point), + verif := C.bls_spock_verify((*C.E2)(&blsPk1.point), (*C.uchar)(&proof1[0]), - (*C.ep2_st)(&blsPk2.point), + (*C.E2)(&blsPk2.point), (*C.uchar)(&proof2[0])) switch verif { diff --git a/crypto/spock_test.go b/crypto/spock_test.go index 596968234e4..59498a42f6f 100644 --- a/crypto/spock_test.go +++ b/crypto/spock_test.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package crypto import ( @@ -72,7 +69,7 @@ func TestSPOCKProveVerifyAgainstData(t *testing.T) { t.Run("identity proof", func(t *testing.T) { // verifying with a pair of (proof, publicKey) equal to (identity_signature, identity_key) should // return false - identityProof := identityBLSSignature + identityProof := g1Serialization result, err := SPOCKVerifyAgainstData(IdentityBLSPublicKey(), identityProof, data, kmac) assert.NoError(t, err) assert.False(t, result) @@ -169,7 +166,7 @@ func TestSPOCKProveVerify(t *testing.T) { t.Run("identity proof", func(t *testing.T) { // verifying with either pair of (proof, publicKey) equal to (identity_signature, identity_key) should // return falsen with any other (proof, key) pair. 
- identityProof := identityBLSSignature + identityProof := g1Serialization result, err := SPOCKVerify(IdentityBLSPublicKey(), identityProof, sk2.PublicKey(), pr2) assert.NoError(t, err) assert.False(t, result) diff --git a/crypto_adx_flag.mk b/crypto_adx_flag.mk new file mode 100644 index 00000000000..667c8d493d3 --- /dev/null +++ b/crypto_adx_flag.mk @@ -0,0 +1,17 @@ +# `ADX_SUPPORT` is 1 if ADX instructions are supported and 0 otherwise. +ifeq ($(shell uname -s),Linux) +# detect ADX support on the CURRENT linux machine. + ADX_SUPPORT := $(shell if ([ -f "/proc/cpuinfo" ] && grep -q -e '^flags.*\badx\b' /proc/cpuinfo); then echo 1; else echo 0; fi) +else +# on non-linux machines, set the flag to 1 by default + ADX_SUPPORT := 1 +endif + +# the crypto package uses BLST source files underneath which may use ADX instructions. +ifeq ($(ADX_SUPPORT), 1) +# if ADX instructions are supported, default is to use a fast ADX BLST implementation + CRYPTO_FLAG := "" +else +# if ADX instructions aren't supported, this CGO flags uses a slower non-ADX BLST implementation + CRYPTO_FLAG := "-O -D__BLST_PORTABLE__" +endif \ No newline at end of file diff --git a/crypto_setup.sh b/crypto_setup.sh deleted file mode 100644 index e9789c74a23..00000000000 --- a/crypto_setup.sh +++ /dev/null @@ -1,32 +0,0 @@ - -#!/bin/bash - -# crypto package -PKG_NAME="github.com/onflow/flow-go/crypto" - -# go.mod -MOD_FILE="./go.mod" - -# the version of onflow/flow-go/crypto used in the project is read from the go.mod file -if [ -f "${MOD_FILE}" ] -then - # extract the imported version - VERSION="$(go list -f '{{.Version}}' -m ${PKG_NAME})" - # go get the package - go get "${PKG_NAME}@${VERSION}" || { echo "go get the package failed"; exit 1; } - # using the right version, get the package directory path - PKG_DIR="$(go env GOPATH)/pkg/mod/${PKG_NAME}@${VERSION}" -else - { echo "couldn't find go.mod file - make sure the script is in the project root directory"; exit 1; } -fi - -# grant permissions if not existant -if [[ ! -r ${PKG_DIR} || ! -w ${PKG_DIR} || ! 
-x ${PKG_DIR} ]]; then - chmod -R 755 "${PKG_DIR}" -fi - -# get into the package directory and set up the external dependencies -( - cd "${PKG_DIR}" || { echo "cd into the GOPATH package folder failed"; exit 1; } - go generate -) diff --git a/engine/access/access_test.go b/engine/access/access_test.go index 66c7904bfa0..39dbc155634 100644 --- a/engine/access/access_test.go +++ b/engine/access/access_test.go @@ -552,15 +552,12 @@ func (suite *Suite) TestGetExecutionResultByBlockID() { for i, serviceEvent := range executionResult.ServiceEvents { assert.Equal(suite.T(), serviceEvent.Type.String(), er.ServiceEvents[i].Type) event := serviceEvent.Event - marshalledEvent, err := json.Marshal(event) require.NoError(suite.T(), err) - assert.Equal(suite.T(), marshalledEvent, er.ServiceEvents[i].Payload) } parsedExecResult, err := convert.MessageToExecutionResult(resp.ExecutionResult) require.NoError(suite.T(), err) - assert.Equal(suite.T(), parsedExecResult, executionResult) assert.Equal(suite.T(), parsedExecResult.ID(), executionResult.ID()) } diff --git a/engine/collection/test/cluster_switchover_test.go b/engine/collection/test/cluster_switchover_test.go index a8f04173099..15a23823ab3 100644 --- a/engine/collection/test/cluster_switchover_test.go +++ b/engine/collection/test/cluster_switchover_test.go @@ -212,7 +212,7 @@ func (tc *ClusterSwitchoverTestCase) StartNodes() { nodes = append(nodes, node) } - unittest.RequireCloseBefore(tc.T(), util.AllReady(nodes...), time.Second, "could not start nodes") + unittest.RequireCloseBefore(tc.T(), util.AllReady(nodes...), 3*time.Second, "could not start nodes") // start continuous delivery for all nodes for _, node := range tc.nodes { diff --git a/engine/consensus/dkg/reactor_engine.go b/engine/consensus/dkg/reactor_engine.go index 1704483ef48..1d23344e4c6 100644 --- a/engine/consensus/dkg/reactor_engine.go +++ b/engine/consensus/dkg/reactor_engine.go @@ -348,7 +348,7 @@ func (e *ReactorEngine) getDKGInfo(firstBlockID flow.Identifier) (*dkgInfo, erro if err != nil { return nil, fmt.Errorf("could not retrieve epoch dkg final views: %w", err) } - seed := make([]byte, crypto.SeedMinLenDKG) + seed := make([]byte, crypto.KeyGenSeedMinLen) _, err = rand.Read(seed) if err != nil { return nil, fmt.Errorf("could not generate random seed: %w", err) diff --git a/engine/execution/computation/computer/result_collector.go b/engine/execution/computation/computer/result_collector.go index 4b367fda739..703fae44488 100644 --- a/engine/execution/computation/computer/result_collector.go +++ b/engine/execution/computation/computer/result_collector.go @@ -194,7 +194,7 @@ func (collector *resultCollector) commitCollection( spock, err := collector.signer.SignFunc( collectionExecutionSnapshot.SpockSecret, collector.spockHasher, - SPOCKProve) + crypto.SPOCKProve) if err != nil { return fmt.Errorf("signing spock hash failed: %w", err) } diff --git a/engine/execution/computation/computer/spock_norelic.go b/engine/execution/computation/computer/spock_norelic.go deleted file mode 100644 index 81678d94f33..00000000000 --- a/engine/execution/computation/computer/spock_norelic.go +++ /dev/null @@ -1,26 +0,0 @@ -//go:build !relic -// +build !relic - -package computer - -import ( - "github.com/onflow/flow-go/crypto" - "github.com/onflow/flow-go/crypto/hash" -) - -// This is a temporary wrapper that simulates a call to SPoCK prove, -// required for the emulator build. The function is never called by the -// emulator although it is required for a successful build. 
-// -// TODO(tarak): remove once the crypto module properly implements a non-relic -// version of SPOCKProve. -func SPOCKProve( - sk crypto.PrivateKey, - data []byte, - kmac hash.Hasher, -) ( - crypto.Signature, - error, -) { - panic("SPoCK prove not supported when flow-go is built without relic") -} diff --git a/engine/execution/computation/computer/spock_relic.go b/engine/execution/computation/computer/spock_relic.go deleted file mode 100644 index 89a8182ba8f..00000000000 --- a/engine/execution/computation/computer/spock_relic.go +++ /dev/null @@ -1,24 +0,0 @@ -//go:build relic -// +build relic - -package computer - -import ( - "github.com/onflow/flow-go/crypto" - "github.com/onflow/flow-go/crypto/hash" -) - -// This is a temporary wrapper that around the crypto library. -// -// TODO(tarak): remove once the crypto module properly implements a non-relic -// version of SPOCKProve. -func SPOCKProve( - sk crypto.PrivateKey, - data []byte, - kmac hash.Hasher, -) ( - crypto.Signature, - error, -) { - return crypto.SPOCKProve(sk, data, kmac) -} diff --git a/fvm/crypto/crypto_test.go b/fvm/crypto/crypto_test.go index fe6c400c1b4..ffbdec3a730 100644 --- a/fvm/crypto/crypto_test.go +++ b/fvm/crypto/crypto_test.go @@ -425,16 +425,13 @@ func TestVerifySignatureFromTransaction(t *testing.T) { func TestValidatePublicKey(t *testing.T) { - // make sure the seed length is larger than miniumum seed lengths of all signature schemes - seedLength := 64 - validPublicKey := func(t *testing.T, s runtime.SignatureAlgorithm) []byte { - seed := make([]byte, seedLength) + seed := make([]byte, gocrypto.KeyGenSeedMinLen) _, err := rand.Read(seed) require.NoError(t, err) - pk, err := gocrypto.GeneratePrivateKey(crypto.RuntimeToCryptoSigningAlgorithm(s), seed) + sk, err := gocrypto.GeneratePrivateKey(crypto.RuntimeToCryptoSigningAlgorithm(s), seed) require.NoError(t, err) - return pk.PublicKey().Encode() + return sk.PublicKey().Encode() } t.Run("Unknown algorithm should return false", func(t *testing.T) { @@ -463,12 +460,14 @@ func TestValidatePublicKey(t *testing.T) { runtime.SignatureAlgorithmBLS_BLS12_381, } for i, s := range signatureAlgos { + t.Run(fmt.Sprintf("case %v: %v", i, s), func(t *testing.T) { key := validPublicKey(t, s) + // This may cause flakiness depending on the public key + // deserialization scheme used!! 
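To make the flakiness remark above concrete, here is a rough standalone sketch (not part of this diff) of why a single flipped bit can survive decoding when a public key is serialized as raw affine coordinates: a random x-coordinate lies on P-256 roughly half the time. The program and its numbers are illustrative only.

```go
package main

import (
	"crypto/elliptic"
	"crypto/rand"
	"fmt"
	"math/big"
)

func main() {
	curve := elliptic.P256()
	p := curve.Params().P
	b := curve.Params().B

	onCurve := 0
	trials := 1000
	for i := 0; i < trials; i++ {
		// Pick a random candidate x-coordinate below the field prime.
		x, _ := rand.Int(rand.Reader, p)

		// y^2 = x^3 - 3x + b (mod p); the point exists iff y^2 is a square mod p.
		y2 := new(big.Int).Exp(x, big.NewInt(3), p)
		y2.Sub(y2, new(big.Int).Mul(x, big.NewInt(3)))
		y2.Add(y2, b)
		y2.Mod(y2, p)
		if new(big.Int).ModSqrt(y2, p) != nil {
			onCurve++
		}
	}
	// Expect a count near trials/2, which is why a one-bit corruption of a raw
	// X||Y encoding is not guaranteed to be rejected.
	fmt.Printf("%d/%d random x-coordinates are on P-256\n", onCurve, trials)
}
```

Encodings with stricter checks (for example BLS public keys, which must also pass a subgroup check) reject a corrupted byte far more reliably, which is what the comment's caveat is getting at.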
key[0] ^= 1 // alter one bit of the valid key - err := crypto.ValidatePublicKey(s, key) - require.Error(t, err) + require.Errorf(t, err, "key is %#x", key) }) } }) diff --git a/go.mod b/go.mod index 9965cc6cd49..63f54a46fae 100644 --- a/go.mod +++ b/go.mod @@ -309,3 +309,5 @@ require ( lukechampine.com/blake3 v1.2.1 // indirect nhooyr.io/websocket v1.8.7 // indirect ) + +replace github.com/onflow/flow-go/crypto => ./crypto diff --git a/go.sum b/go.sum index 8e42fee0444..b5cbe076f36 100644 --- a/go.sum +++ b/go.sum @@ -200,6 +200,7 @@ github.com/btcsuite/btcd v0.20.1-beta/go.mod h1:wVuoA8VJLEcwgqHBwHmzLRazpKxTv13P github.com/btcsuite/btcd v0.21.0-beta/go.mod h1:ZSWyehm27aAuS9bvkATT+Xte3hjHZ+MRgMY/8NJ7K94= github.com/btcsuite/btcd/btcec/v2 v2.2.1 h1:xP60mv8fvp+0khmrN0zTdPC3cNm24rfeE6lh2R/Yv3E= github.com/btcsuite/btcd/btcec/v2 v2.2.1/go.mod h1:9/CSmJxmuvqzX9Wh2fXMWToLOHhPd11lSPuIupwTkI8= +github.com/btcsuite/btcd/chaincfg/chainhash v1.0.1/go.mod h1:7SFka0XMvUgj3hfZtydOrQY2mwhPclbT2snogU7SQQc= github.com/btcsuite/btcd/chaincfg/chainhash v1.0.2 h1:KdUfX2zKommPRa+PD0sWZUyXe9w277ABlgELO7H04IM= github.com/btcsuite/btcd/chaincfg/chainhash v1.0.2/go.mod h1:7SFka0XMvUgj3hfZtydOrQY2mwhPclbT2snogU7SQQc= github.com/btcsuite/btclog v0.0.0-20170628155309-84c8d2346e9f/go.mod h1:TdznJufoqS23FtqVCzL0ZqgP5MqXbb4fg/WgDys70nA= @@ -297,7 +298,9 @@ github.com/davidlazar/go-crypto v0.0.0-20200604182044-b73af7476f6c/go.mod h1:6Uh github.com/deckarep/golang-set v0.0.0-20180603214616-504e848d77ea/go.mod h1:93vsz/8Wt4joVM7c2AVqh+YRMiUSc14yDtF28KmMOgQ= github.com/deckarep/golang-set/v2 v2.1.0 h1:g47V4Or+DUdzbs8FxCCmgb6VYd+ptPAngjM6dtGktsI= github.com/deckarep/golang-set/v2 v2.1.0/go.mod h1:VAky9rY/yGXJOLEDv3OMci+7wtDpOF4IN+y82NBOac4= +github.com/decred/dcrd/crypto/blake256 v1.0.0/go.mod h1:sQl2p6Y26YV+ZOcSTP6thNdn47hh8kt6rqSlvmrXFAc= github.com/decred/dcrd/crypto/blake256 v1.0.1 h1:7PltbUIQB7u/FfZ39+DGa/ShuMyJ5ilcvdfma9wOH6Y= +github.com/decred/dcrd/dcrec/secp256k1/v4 v4.0.1/go.mod h1:hyedUtir6IdtD/7lIxGeCxkaw7y45JueMRL4DIyJDKs= github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0 h1:8UrgZ3GkP4i/CLijOJx79Yu+etlyjdBU4sfcs2WYQMs= github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0/go.mod h1:v57UDF4pDQJcEfFUCRop3lJL149eHGSe9Jvczhzjo/0= github.com/decred/dcrd/lru v1.0.0/go.mod h1:mxKOwFd7lFjN2GZYsiz/ecgqR6kkYAl+0pz0tEMk218= @@ -1338,9 +1341,6 @@ github.com/onflow/flow-ft/lib/go/contracts v0.7.1-0.20230711213910-baad011d2b13/ github.com/onflow/flow-go-sdk v0.24.0/go.mod h1:IoptMLPyFXWvyd9yYA6/4EmSeeozl6nJoIv4FaEMg74= github.com/onflow/flow-go-sdk v0.41.16 h1:HsmHwEVmj+iK+GszHbFseHh7Ii5W3PWOIRNAH/En08Q= github.com/onflow/flow-go-sdk v0.41.16/go.mod h1:bVrVNoJKiwB6vW5Qbm5tFAfJBQ5we4uSQWnn9gNAFhQ= -github.com/onflow/flow-go/crypto v0.21.3/go.mod h1:vI6V4CY3R6c4JKBxdcRiR/AnjBfL8OSD97bJc60cLuQ= -github.com/onflow/flow-go/crypto v0.24.9 h1:0EQp+kSZYJepMIiSypfJVe7tzsPcb6UXOdOtsTCDhBs= -github.com/onflow/flow-go/crypto v0.24.9/go.mod h1:fqCzkIBBMRRkciVrvW21rECKq1oD7Q6u+bCI78lfNX0= github.com/onflow/flow-nft/lib/go/contracts v1.1.0 h1:rhUDeD27jhLwOqQKI/23008CYfnqXErrJvc4EFRP2a0= github.com/onflow/flow-nft/lib/go/contracts v1.1.0/go.mod h1:YsvzYng4htDgRB9sa9jxdwoTuuhjK8WYWXTyLkIigZY= github.com/onflow/flow/protobuf/go/flow v0.2.2/go.mod h1:gQxYqCfkI8lpnKsmIjwtN2mV/N2PIwc1I+RUK4HPIc8= @@ -1611,8 +1611,6 @@ github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcU github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/subosito/gotenv v1.4.2 
h1:X1TuBLAMDFbaTAChgCBLu3DU3UPyELpnF2jjJ2cz/S8= github.com/subosito/gotenv v1.4.2/go.mod h1:ayKnFf/c6rvx/2iiLrJUk1e6plDbT3edrFNGqEflhK0= -github.com/supranational/blst v0.3.4/go.mod h1:jZJtfjgudtNl4en1tzwPIV3KjUnQUvG3/j+w+fVonLw= -github.com/supranational/blst v0.3.11-0.20230406105308-e9dfc5ee724b h1:u49mjRnygnB34h8OKbnNJFVUtWSKIKb1KukdV8bILUM= github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= github.com/syndtr/goleveldb v1.0.1-0.20190923125748-758128399b1d/go.mod h1:9OrXJhf154huy1nPWmuSrkgjPUtUNhA+Zmy+6AESzuA= github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7 h1:epCh84lMvA70Z7CTTCmYQn2CKbY8j86K7/FAIr141uY= @@ -1781,7 +1779,6 @@ golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191227163750-53104e6ec876/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200115085410-6d4e4cb37c7d/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20200117160349-530e935923ad/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200221231518-2aa609cf4a9d/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200423211502-4bdfaf469ed5/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= @@ -1797,6 +1794,7 @@ golang.org/x/crypto v0.0.0-20210817164053-32db794688a5/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220314234659-1baeb1ce4c0b/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.12.0 h1:tFM/ta59kqch6LlvYnPa0yx5a83cL2nHflFhYKvv9Yk= golang.org/x/crypto v0.12.0/go.mod h1:NF0Gs7EO5K4qLn+Ylc+fih8BSTeIjAP05siRnAh98yw= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -1996,7 +1994,6 @@ golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20191220142924-d4481acd189f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200107162124-548cf772de50/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/insecure/Makefile b/insecure/Makefile index 70eeff5a46f..f38a03381b3 100644 --- a/insecure/Makefile +++ b/insecure/Makefile @@ -8,15 +8,19 @@ else RACE_FLAG := endif +include ../crypto_adx_flag.mk + +CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) + # runs all unit tests of the insecure module .PHONY: test test: - go test $(if $(VERBOSE),-v,) 
-coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) --tags relic ./... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./... .PHONY: lint lint: tidy # revive -config revive.toml -exclude storage/ledger/trie ./... - golangci-lint run -v --build-tags relic ./... + golangci-lint run -v # this ensures there is no unused dependency being added by accident .PHONY: tidy @@ -26,4 +30,4 @@ tidy: cd crypto; go mod tidy -v cd cmd/testclient; go mod tidy -v cd insecure; go mod tidy -v - git diff --exit-code \ No newline at end of file + git diff --exit-code diff --git a/insecure/go.mod b/insecure/go.mod index a6547997c8e..60eb94d0758 100644 --- a/insecure/go.mod +++ b/insecure/go.mod @@ -298,3 +298,5 @@ require ( ) replace github.com/onflow/flow-go => ../ + +replace github.com/onflow/flow-go/crypto => ../crypto diff --git a/insecure/go.sum b/insecure/go.sum index 81aa4cfd693..39a542b340a 100644 --- a/insecure/go.sum +++ b/insecure/go.sum @@ -189,6 +189,7 @@ github.com/btcsuite/btcd v0.20.1-beta/go.mod h1:wVuoA8VJLEcwgqHBwHmzLRazpKxTv13P github.com/btcsuite/btcd v0.21.0-beta/go.mod h1:ZSWyehm27aAuS9bvkATT+Xte3hjHZ+MRgMY/8NJ7K94= github.com/btcsuite/btcd/btcec/v2 v2.2.1 h1:xP60mv8fvp+0khmrN0zTdPC3cNm24rfeE6lh2R/Yv3E= github.com/btcsuite/btcd/btcec/v2 v2.2.1/go.mod h1:9/CSmJxmuvqzX9Wh2fXMWToLOHhPd11lSPuIupwTkI8= +github.com/btcsuite/btcd/chaincfg/chainhash v1.0.1/go.mod h1:7SFka0XMvUgj3hfZtydOrQY2mwhPclbT2snogU7SQQc= github.com/btcsuite/btcd/chaincfg/chainhash v1.0.2 h1:KdUfX2zKommPRa+PD0sWZUyXe9w277ABlgELO7H04IM= github.com/btcsuite/btcd/chaincfg/chainhash v1.0.2/go.mod h1:7SFka0XMvUgj3hfZtydOrQY2mwhPclbT2snogU7SQQc= github.com/btcsuite/btclog v0.0.0-20170628155309-84c8d2346e9f/go.mod h1:TdznJufoqS23FtqVCzL0ZqgP5MqXbb4fg/WgDys70nA= @@ -285,7 +286,9 @@ github.com/davidlazar/go-crypto v0.0.0-20200604182044-b73af7476f6c/go.mod h1:6Uh github.com/deckarep/golang-set v0.0.0-20180603214616-504e848d77ea/go.mod h1:93vsz/8Wt4joVM7c2AVqh+YRMiUSc14yDtF28KmMOgQ= github.com/deckarep/golang-set/v2 v2.1.0 h1:g47V4Or+DUdzbs8FxCCmgb6VYd+ptPAngjM6dtGktsI= github.com/deckarep/golang-set/v2 v2.1.0/go.mod h1:VAky9rY/yGXJOLEDv3OMci+7wtDpOF4IN+y82NBOac4= +github.com/decred/dcrd/crypto/blake256 v1.0.0/go.mod h1:sQl2p6Y26YV+ZOcSTP6thNdn47hh8kt6rqSlvmrXFAc= github.com/decred/dcrd/crypto/blake256 v1.0.1 h1:7PltbUIQB7u/FfZ39+DGa/ShuMyJ5ilcvdfma9wOH6Y= +github.com/decred/dcrd/dcrec/secp256k1/v4 v4.0.1/go.mod h1:hyedUtir6IdtD/7lIxGeCxkaw7y45JueMRL4DIyJDKs= github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0 h1:8UrgZ3GkP4i/CLijOJx79Yu+etlyjdBU4sfcs2WYQMs= github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0/go.mod h1:v57UDF4pDQJcEfFUCRop3lJL149eHGSe9Jvczhzjo/0= github.com/decred/dcrd/lru v1.0.0/go.mod h1:mxKOwFd7lFjN2GZYsiz/ecgqR6kkYAl+0pz0tEMk218= @@ -1313,9 +1316,6 @@ github.com/onflow/flow-ft/lib/go/contracts v0.7.1-0.20230711213910-baad011d2b13/ github.com/onflow/flow-go-sdk v0.24.0/go.mod h1:IoptMLPyFXWvyd9yYA6/4EmSeeozl6nJoIv4FaEMg74= github.com/onflow/flow-go-sdk v0.41.16 h1:HsmHwEVmj+iK+GszHbFseHh7Ii5W3PWOIRNAH/En08Q= github.com/onflow/flow-go-sdk v0.41.16/go.mod h1:bVrVNoJKiwB6vW5Qbm5tFAfJBQ5we4uSQWnn9gNAFhQ= -github.com/onflow/flow-go/crypto v0.21.3/go.mod h1:vI6V4CY3R6c4JKBxdcRiR/AnjBfL8OSD97bJc60cLuQ= -github.com/onflow/flow-go/crypto v0.24.9 h1:0EQp+kSZYJepMIiSypfJVe7tzsPcb6UXOdOtsTCDhBs= -github.com/onflow/flow-go/crypto v0.24.9/go.mod 
h1:fqCzkIBBMRRkciVrvW21rECKq1oD7Q6u+bCI78lfNX0= github.com/onflow/flow-nft/lib/go/contracts v1.1.0 h1:rhUDeD27jhLwOqQKI/23008CYfnqXErrJvc4EFRP2a0= github.com/onflow/flow-nft/lib/go/contracts v1.1.0/go.mod h1:YsvzYng4htDgRB9sa9jxdwoTuuhjK8WYWXTyLkIigZY= github.com/onflow/flow/protobuf/go/flow v0.2.2/go.mod h1:gQxYqCfkI8lpnKsmIjwtN2mV/N2PIwc1I+RUK4HPIc8= @@ -1584,8 +1584,6 @@ github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcU github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/subosito/gotenv v1.4.2 h1:X1TuBLAMDFbaTAChgCBLu3DU3UPyELpnF2jjJ2cz/S8= github.com/subosito/gotenv v1.4.2/go.mod h1:ayKnFf/c6rvx/2iiLrJUk1e6plDbT3edrFNGqEflhK0= -github.com/supranational/blst v0.3.4/go.mod h1:jZJtfjgudtNl4en1tzwPIV3KjUnQUvG3/j+w+fVonLw= -github.com/supranational/blst v0.3.11-0.20230406105308-e9dfc5ee724b h1:u49mjRnygnB34h8OKbnNJFVUtWSKIKb1KukdV8bILUM= github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= github.com/syndtr/goleveldb v1.0.1-0.20190923125748-758128399b1d/go.mod h1:9OrXJhf154huy1nPWmuSrkgjPUtUNhA+Zmy+6AESzuA= github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7 h1:epCh84lMvA70Z7CTTCmYQn2CKbY8j86K7/FAIr141uY= @@ -1756,7 +1754,6 @@ golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191227163750-53104e6ec876/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200115085410-6d4e4cb37c7d/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20200117160349-530e935923ad/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200221231518-2aa609cf4a9d/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200423211502-4bdfaf469ed5/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= @@ -1772,6 +1769,7 @@ golang.org/x/crypto v0.0.0-20210817164053-32db794688a5/go.mod h1:GvvjBRRGRdwPK5y golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20211108221036-ceb1ce70b4fa/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220314234659-1baeb1ce4c0b/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/crypto v0.0.0-20220622213112-05595931fe9d/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/crypto v0.12.0 h1:tFM/ta59kqch6LlvYnPa0yx5a83cL2nHflFhYKvv9Yk= golang.org/x/crypto v0.12.0/go.mod h1:NF0Gs7EO5K4qLn+Ylc+fih8BSTeIjAP05siRnAh98yw= golang.org/x/exp v0.0.0-20180321215751-8460e604b9de/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= @@ -1963,7 +1961,6 @@ golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20191220142924-d4481acd189f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200107162124-548cf772de50/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/integration/Makefile b/integration/Makefile index b538289ae82..361013a41d1 100644 --- a/integration/Makefile +++ b/integration/Makefile @@ -8,6 +8,10 @@ else RACE_FLAG := endif +include ../crypto_adx_flag.mk + +CGO_FLAG := CGO_CFLAGS=$(CRYPTO_FLAG) + # Run the integration test suite .PHONY: integration-test integration-test: access-tests ghost-tests mvp-tests execution-tests verification-tests upgrades-tests collection-tests epochs-cohort1-tests epochs-cohort2-tests network-tests consensus-tests @@ -15,82 +19,78 @@ integration-test: access-tests ghost-tests mvp-tests execution-tests verificatio .PHONY: ci-integration-test ci-integration-test: access-tests ghost-tests mvp-tests epochs-cohort1-tests epochs-cohort2-tests consensus-tests execution-tests verification-tests upgrades-tests network-tests collection-tests -############################################################################################ -# CAUTION: DO NOT MODIFY THE TARGETS BELOW! DOING SO WILL BREAK THE FLAKY TEST MONITOR -# In particular, do not skip tests by commenting them out here. - # Run unit tests for test utilities in this module .PHONY: test test: - go test $(if $(VERBOSE),-v,) -tags relic -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) `go list ./... | grep -v -e integration/tests` + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) -coverprofile=$(COVER_PROFILE) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) `go list ./... | grep -v -e integration/tests` .PHONY: access-cohort1-tests access-cohort1-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/access/cohort1/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/access/cohort1/... .PHONY: access-cohort2-tests access-cohort2-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/access/cohort2/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/access/cohort2/... .PHONY: access-cohort3-tests access-cohort3-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/access/cohort3/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/access/cohort3/... .PHONY: collection-tests collection-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/collection/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/collection/... .PHONY: consensus-tests consensus-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/consensus/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/consensus/... 
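Not part of the diff: every target in this Makefile now expands `$(CGO_FLAG)` to `CGO_CFLAGS=$(CRYPTO_FLAG)`, with `crypto_adx_flag.mk` choosing the flag by grepping /proc/cpuinfo for ADX. The sketch below is a rough Go rendering of that detection for anyone invoking `go test` by hand; it assumes the `golang.org/x/sys/cpu` package (not used by the Makefiles themselves) and, unlike the Makefile (which defaults non-Linux hosts to the ADX path), it only covers x86 hosts.

```go
package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

func main() {
	// Mirrors the decision in crypto_adx_flag.mk: ADX-capable CPUs can use the
	// fast BLST code path (empty flags), others need the portable build.
	flags := ""
	if !cpu.X86.HasADX {
		flags = "-O -D__BLST_PORTABLE__"
	}
	// Use the result the same way the Makefiles do, e.g.:
	//   CGO_CFLAGS="-O -D__BLST_PORTABLE__" go test ./...
	fmt.Printf("CGO_CFLAGS=%q\n", flags)
}
```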
.PHONY: epochs-cohort1-tests epochs-cohort1-tests: # Use a higher timeout of 20m for the suite of tests which span full epochs - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic -timeout 20m ./tests/epochs/cohort1/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -timeout 20m ./tests/epochs/cohort1/... .PHONY: epochs-cohort2-tests epochs-cohort2-tests: # Use a higher timeout of 20m for the suite of tests which span full epochs - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic -timeout 20m ./tests/epochs/cohort2/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -timeout 20m ./tests/epochs/cohort2/... .PHONY: ghost-tests ghost-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/ghost/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/ghost/... .PHONY: mvp-tests mvp-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/mvp/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/mvp/... .PHONY: execution-tests execution-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/execution/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/execution/... .PHONY: verification-tests verification-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/verification/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/verification/... # upgrades-tests tests need to be run sequentially (-p 1) due to interference between different Docker networks when tests are run in parallel .PHONY: upgrades-tests upgrades-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/upgrades/... -p 1 + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/upgrades/... -p 1 .PHONY: network-tests network-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/network/... + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/network/... # BFT tests need to be run sequentially (-p 1) due to interference between different Docker networks when tests are run in parallel .PHONY: bft-framework-tests bft-framework-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/bft/framework/... -p 1 + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/framework/... 
-p 1 .PHONY: bft-protocol-tests bft-protocol-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/bft/protocol/... -p 1 + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/protocol/... -p 1 .PHONY: bft-gossipsub-tests bft-gossipsub-tests: - go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) -tags relic ./tests/bft/gossipsub/... -p 1 + $(CGO_FLAG) go test $(if $(VERBOSE),-v,) $(RACE_FLAG) $(if $(JSON_OUTPUT),-json,) $(if $(NUM_RUNS),-count $(NUM_RUNS),) ./tests/bft/gossipsub/... -p 1 .PHONY: bft-tests bft-tests: bft-framework-tests bft-protocol-tests bft-gossipsub-tests -############################################################################################ + diff --git a/integration/benchmark/cmd/manual/Dockerfile b/integration/benchmark/cmd/manual/Dockerfile index 58f2b71d42b..788c2e6edb0 100644 --- a/integration/benchmark/cmd/manual/Dockerfile +++ b/integration/benchmark/cmd/manual/Dockerfile @@ -4,20 +4,11 @@ FROM golang:1.20-buster AS build-setup RUN apt-get update -RUN apt-get -y install cmake zip - -## (1) Build Relic first to maximize caching -FROM build-setup AS build-relic +RUN apt-get -y install zip RUN mkdir /build WORKDIR /build -# Copy over the crypto package -COPY crypto ./crypto - -# Build Relic (this places build artifacts in /build/relic/build) -RUN cd ./crypto/ && go generate - ## (2) Build the app binary FROM build-setup AS build-env @@ -35,12 +26,12 @@ ARG TARGET COPY . . -# Copy over Relic build artifacts -COPY --from=build-relic /build/crypto/relic/build ./crypto/relic/build - FROM build-env as build-production WORKDIR /app +# CGO_FLAG can be overwritten +ARG CGO_FLAG + # Keep Go's build cache between builds. # https://github.com/golang/go/issues/27719#issuecomment-514747274 # Also, allow ssh access @@ -48,7 +39,7 @@ RUN --mount=type=cache,sharing=locked,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ --mount=type=ssh \ cd integration && \ - CGO_ENABLED=1 go build --tags relic -ldflags "-extldflags -static" -o ./app ./${TARGET} + CGO_ENABLED=1 CGO_FLAGS="${CGO_FLAG}" go build -ldflags "-extldflags -static" -o ./app ./${TARGET} RUN mv /app/integration/app /app/app diff --git a/integration/benchmark/server/bench.sh b/integration/benchmark/server/bench.sh index 6ada16119a1..8c87214a3b1 100755 --- a/integration/benchmark/server/bench.sh +++ b/integration/benchmark/server/bench.sh @@ -22,8 +22,6 @@ while read -r branch_hash; do git log --oneline | head -1 git describe - make -C ../.. 
crypto_setup_gopath - # instead of running "make stop" which uses docker-compose for a lot of older versions, # we explicitly run the command here with "docker compose" DOCKER_BUILDKIT=1 COMPOSE_DOCKER_CLI_BUILD=1 docker compose -f docker-compose.nodes.yml down -v --remove-orphans @@ -36,7 +34,7 @@ while read -r branch_hash; do # sleep is workaround for slow initialization of some node types, so that benchmark does not quit immediately with "connection refused" sleep 30; - go run -tags relic ../benchmark/cmd/ci -log-level debug -git-repo-path ../../ -tps-initial 800 -tps-min 1 -tps-max 1200 -duration 30m + go run ../benchmark/cmd/ci -log-level debug -git-repo-path ../../ -tps-initial 800 -tps-min 1 -tps-max 1200 -duration 30m # instead of running "make stop" which uses docker-compose for a lot of older versions, # we explicitly run the command here with "docker compose" diff --git a/integration/benchnet2/Makefile b/integration/benchnet2/Makefile index facb25dc152..b7911fdc0f9 100644 --- a/integration/benchnet2/Makefile +++ b/integration/benchnet2/Makefile @@ -29,13 +29,12 @@ endif # assumes there is a checked out version of flow-go in a "flow-go" sub-folder at this level so that the bootstrap executable # for the checked out version will be run in the sub folder but the bootstrap folder will be created here (outside of the checked out flow-go in the sub folder) gen-bootstrap: clone-flow - cd flow-go && make crypto_setup_gopath - cd flow-go/cmd/bootstrap && go run -tags relic . genconfig --address-format "%s%d-${NETWORK_ID}.${NAMESPACE}:3569" --access $(ACCESS) --collection $(COLLECTION) --consensus $(CONSENSUS) --execution $(EXECUTION) --verification $(VERIFICATION) --weight 100 -o ./ --config ../../../bootstrap/conf/node-config.json - cd flow-go/cmd/bootstrap && go run -tags relic . keygen --machine-account --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/keys + cd flow-go/cmd/bootstrap && go run . genconfig --address-format "%s%d-${NETWORK_ID}.${NAMESPACE}:3569" --access $(ACCESS) --collection $(COLLECTION) --consensus $(CONSENSUS) --execution $(EXECUTION) --verification $(VERIFICATION) --weight 100 -o ./ --config ../../../bootstrap/conf/node-config.json + cd flow-go/cmd/bootstrap && go run . keygen --machine-account --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/keys echo {} > ./bootstrap/conf/partner-stakes.json mkdir ./bootstrap/partner-nodes - cd flow-go/cmd/bootstrap && go run -tags relic . rootblock --root-chain bench --root-height 0 --root-parent 0000000000000000000000000000000000000000000000000000000000000000 --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/ --partner-dir ../../../bootstrap/partner-nodes --partner-weights ../../../bootstrap/conf/partner-stakes.json --internal-priv-dir ../../../bootstrap/keys/private-root-information - cd flow-go/cmd/bootstrap && go run -tags relic . 
finalize --root-commit 0000000000000000000000000000000000000000000000000000000000000000 --service-account-public-key-json "{\"PublicKey\":\"R7MTEDdLclRLrj2MI1hcp4ucgRTpR15PCHAWLM5nks6Y3H7+PGkfZTP2di2jbITooWO4DD1yqaBSAVK8iQ6i0A==\",\"SignAlgo\":2,\"HashAlgo\":1,\"SeqNumber\":0,\"Weight\":1000}" --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/ --partner-dir ../../../bootstrap/partner-nodes --partner-weights ../../../bootstrap/conf/partner-stakes.json --collection-clusters 1 --epoch-counter 0 --epoch-length 30000 --epoch-staking-phase-length 20000 --epoch-dkg-phase-length 2000 --genesis-token-supply="1000000000.0" --protocol-version=0 --internal-priv-dir ../../../bootstrap/keys/private-root-information --dkg-data ../../../bootstrap/private-root-information/root-dkg-data.priv.json --root-block ../../../bootstrap/public-root-information/root-block.json --root-block-votes-dir ../../../bootstrap/public-root-information/root-block-votes/ --epoch-commit-safety-threshold=1000 + cd flow-go/cmd/bootstrap && go run . rootblock --root-chain bench --root-height 0 --root-parent 0000000000000000000000000000000000000000000000000000000000000000 --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/ --partner-dir ../../../bootstrap/partner-nodes --partner-weights ../../../bootstrap/conf/partner-stakes.json --internal-priv-dir ../../../bootstrap/keys/private-root-information + cd flow-go/cmd/bootstrap && go run . finalize --root-commit 0000000000000000000000000000000000000000000000000000000000000000 --service-account-public-key-json "{\"PublicKey\":\"R7MTEDdLclRLrj2MI1hcp4ucgRTpR15PCHAWLM5nks6Y3H7+PGkfZTP2di2jbITooWO4DD1yqaBSAVK8iQ6i0A==\",\"SignAlgo\":2,\"HashAlgo\":1,\"SeqNumber\":0,\"Weight\":1000}" --config ../../../bootstrap/conf/node-config.json -o ../../../bootstrap/ --partner-dir ../../../bootstrap/partner-nodes --partner-weights ../../../bootstrap/conf/partner-stakes.json --collection-clusters 1 --epoch-counter 0 --epoch-length 30000 --epoch-staking-phase-length 20000 --epoch-dkg-phase-length 2000 --genesis-token-supply="1000000000.0" --protocol-version=0 --internal-priv-dir ../../../bootstrap/keys/private-root-information --dkg-data ../../../bootstrap/private-root-information/root-dkg-data.priv.json --root-block ../../../bootstrap/public-root-information/root-block.json --root-block-votes-dir ../../../bootstrap/public-root-information/root-block-votes/ --epoch-commit-safety-threshold=1000 gen-helm-l1: go run automate/cmd/level1/bootstrap.go --data bootstrap/public-root-information/root-protocol-state-snapshot.json --dockerTag $(NETWORK_ID) --dockerRegistry $(DOCKER_REGISTRY) diff --git a/integration/go.mod b/integration/go.mod index b898ee82001..ca0511bbeff 100644 --- a/integration/go.mod +++ b/integration/go.mod @@ -355,3 +355,5 @@ require ( replace github.com/onflow/flow-go => ../ replace github.com/onflow/flow-go/insecure => ../insecure + +replace github.com/onflow/flow-go/crypto => ../crypto diff --git a/integration/go.sum b/integration/go.sum index 9f7646c1abd..b4799f1dcf0 100644 --- a/integration/go.sum +++ b/integration/go.sum @@ -216,6 +216,7 @@ github.com/btcsuite/btcd v0.20.1-beta/go.mod h1:wVuoA8VJLEcwgqHBwHmzLRazpKxTv13P github.com/btcsuite/btcd v0.21.0-beta/go.mod h1:ZSWyehm27aAuS9bvkATT+Xte3hjHZ+MRgMY/8NJ7K94= github.com/btcsuite/btcd/btcec/v2 v2.2.1 h1:xP60mv8fvp+0khmrN0zTdPC3cNm24rfeE6lh2R/Yv3E= github.com/btcsuite/btcd/btcec/v2 v2.2.1/go.mod h1:9/CSmJxmuvqzX9Wh2fXMWToLOHhPd11lSPuIupwTkI8= +github.com/btcsuite/btcd/chaincfg/chainhash 
v1.0.1/go.mod h1:7SFka0XMvUgj3hfZtydOrQY2mwhPclbT2snogU7SQQc= github.com/btcsuite/btcd/chaincfg/chainhash v1.0.2 h1:KdUfX2zKommPRa+PD0sWZUyXe9w277ABlgELO7H04IM= github.com/btcsuite/btcd/chaincfg/chainhash v1.0.2/go.mod h1:7SFka0XMvUgj3hfZtydOrQY2mwhPclbT2snogU7SQQc= github.com/btcsuite/btclog v0.0.0-20170628155309-84c8d2346e9f/go.mod h1:TdznJufoqS23FtqVCzL0ZqgP5MqXbb4fg/WgDys70nA= @@ -322,7 +323,9 @@ github.com/davidlazar/go-crypto v0.0.0-20200604182044-b73af7476f6c/go.mod h1:6Uh github.com/deckarep/golang-set v0.0.0-20180603214616-504e848d77ea/go.mod h1:93vsz/8Wt4joVM7c2AVqh+YRMiUSc14yDtF28KmMOgQ= github.com/deckarep/golang-set/v2 v2.1.0 h1:g47V4Or+DUdzbs8FxCCmgb6VYd+ptPAngjM6dtGktsI= github.com/deckarep/golang-set/v2 v2.1.0/go.mod h1:VAky9rY/yGXJOLEDv3OMci+7wtDpOF4IN+y82NBOac4= +github.com/decred/dcrd/crypto/blake256 v1.0.0/go.mod h1:sQl2p6Y26YV+ZOcSTP6thNdn47hh8kt6rqSlvmrXFAc= github.com/decred/dcrd/crypto/blake256 v1.0.1 h1:7PltbUIQB7u/FfZ39+DGa/ShuMyJ5ilcvdfma9wOH6Y= +github.com/decred/dcrd/dcrec/secp256k1/v4 v4.0.1/go.mod h1:hyedUtir6IdtD/7lIxGeCxkaw7y45JueMRL4DIyJDKs= github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0 h1:8UrgZ3GkP4i/CLijOJx79Yu+etlyjdBU4sfcs2WYQMs= github.com/decred/dcrd/dcrec/secp256k1/v4 v4.2.0/go.mod h1:v57UDF4pDQJcEfFUCRop3lJL149eHGSe9Jvczhzjo/0= github.com/decred/dcrd/lru v1.0.0/go.mod h1:mxKOwFd7lFjN2GZYsiz/ecgqR6kkYAl+0pz0tEMk218= @@ -1407,9 +1410,6 @@ github.com/onflow/flow-ft/lib/go/contracts v0.7.1-0.20230711213910-baad011d2b13/ github.com/onflow/flow-go-sdk v0.24.0/go.mod h1:IoptMLPyFXWvyd9yYA6/4EmSeeozl6nJoIv4FaEMg74= github.com/onflow/flow-go-sdk v0.41.16 h1:HsmHwEVmj+iK+GszHbFseHh7Ii5W3PWOIRNAH/En08Q= github.com/onflow/flow-go-sdk v0.41.16/go.mod h1:bVrVNoJKiwB6vW5Qbm5tFAfJBQ5we4uSQWnn9gNAFhQ= -github.com/onflow/flow-go/crypto v0.21.3/go.mod h1:vI6V4CY3R6c4JKBxdcRiR/AnjBfL8OSD97bJc60cLuQ= -github.com/onflow/flow-go/crypto v0.24.9 h1:0EQp+kSZYJepMIiSypfJVe7tzsPcb6UXOdOtsTCDhBs= -github.com/onflow/flow-go/crypto v0.24.9/go.mod h1:fqCzkIBBMRRkciVrvW21rECKq1oD7Q6u+bCI78lfNX0= github.com/onflow/flow-nft/lib/go/contracts v1.1.0 h1:rhUDeD27jhLwOqQKI/23008CYfnqXErrJvc4EFRP2a0= github.com/onflow/flow-nft/lib/go/contracts v1.1.0/go.mod h1:YsvzYng4htDgRB9sa9jxdwoTuuhjK8WYWXTyLkIigZY= github.com/onflow/flow/protobuf/go/flow v0.2.2/go.mod h1:gQxYqCfkI8lpnKsmIjwtN2mV/N2PIwc1I+RUK4HPIc8= @@ -1703,8 +1703,6 @@ github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcU github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/subosito/gotenv v1.4.2 h1:X1TuBLAMDFbaTAChgCBLu3DU3UPyELpnF2jjJ2cz/S8= github.com/subosito/gotenv v1.4.2/go.mod h1:ayKnFf/c6rvx/2iiLrJUk1e6plDbT3edrFNGqEflhK0= -github.com/supranational/blst v0.3.4/go.mod h1:jZJtfjgudtNl4en1tzwPIV3KjUnQUvG3/j+w+fVonLw= -github.com/supranational/blst v0.3.11-0.20230406105308-e9dfc5ee724b h1:u49mjRnygnB34h8OKbnNJFVUtWSKIKb1KukdV8bILUM= github.com/syndtr/goleveldb v1.0.0/go.mod h1:ZVVdQEZoIme9iO1Ch2Jdy24qqXrMMOU6lpPAyBWyWuQ= github.com/syndtr/goleveldb v1.0.1-0.20190923125748-758128399b1d/go.mod h1:9OrXJhf154huy1nPWmuSrkgjPUtUNhA+Zmy+6AESzuA= github.com/syndtr/goleveldb v1.0.1-0.20210819022825-2ae1ddf74ef7 h1:epCh84lMvA70Z7CTTCmYQn2CKbY8j86K7/FAIr141uY= @@ -1884,7 +1882,6 @@ golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= golang.org/x/crypto v0.0.0-20191227163750-53104e6ec876/go.mod 
h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200115085410-6d4e4cb37c7d/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20200117160349-530e935923ad/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200221231518-2aa609cf4a9d/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200423211502-4bdfaf469ed5/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= @@ -2106,7 +2103,6 @@ golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20191220142924-d4481acd189f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191228213918-04cbcbbfeed8/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200106162015-b016eb3dc98e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20200107162124-548cf772de50/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200113162924-86b910548bc1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/integration/localnet/Makefile b/integration/localnet/Makefile index a2a035f4a93..d889d22e150 100644 --- a/integration/localnet/Makefile +++ b/integration/localnet/Makefile @@ -47,7 +47,7 @@ ifeq ($(strip $(VALID_EXECUTION)), 1) else ifeq ($(strip $(VALID_CONSENSUS)), 1) $(error Number of Consensus nodes should be no less than 2) else - go run -tags relic \ + go run \ -ldflags="-X 'github.com/onflow/flow-go/cmd/build.commit=${COMMIT}' \ -X 'github.com/onflow/flow-go/cmd/build.semver=${VERSION}'" \ builder/*.go \ @@ -123,15 +123,15 @@ stop: .PHONY: load load: - go run --tags relic ../benchmark/cmd/manual -log-level info -tps 1,10,100 -tps-durations 30s,30s + go run ../benchmark/cmd/manual -log-level info -tps 1,10,100 -tps-durations 30s,30s .PHONY: tps-ci-smoke tps-ci-smoke: - go run --tags relic ../benchmark/cmd/ci -log-level info -tps-initial 1 -tps-min 1 -tps-max 10 -duration 20s -tps-adjust-interval 1s -stat-interval 1s -bigquery-upload=false + go run ../benchmark/cmd/ci -log-level info -tps-initial 1 -tps-min 1 -tps-max 10 -duration 20s -tps-adjust-interval 1s -stat-interval 1s -bigquery-upload=false .PHONY: tps-ci tps-ci: bootstrap-ci build-flow start-flow - go run --tags relic ../benchmark/cmd/ci -log-level info -tps-initial $(TPS_INIT) -tps-min $(TPS_MIN) -tps-max $(TPS_MAX) -duration $(DURATION) + go run ../benchmark/cmd/ci -log-level info -tps-initial $(TPS_INIT) -tps-min $(TPS_MIN) -tps-max $(TPS_MAX) -duration $(DURATION) .PHONY: clean-data clean-data: diff --git a/integration/testnet/util.go b/integration/testnet/util.go index ad45be97c82..52ab6af17a0 100644 --- a/integration/testnet/util.go +++ b/integration/testnet/util.go @@ -71,7 +71,7 @@ func toNodeInfos(confs []ContainerConfig) []bootstrap.NodeInfo { } func getSeed() ([]byte, error) { - seedLen := int(math.Max(crypto.SeedMinLenDKG, crypto.KeyGenSeedMinLen)) + seedLen := int(math.Max(crypto.KeyGenSeedMinLen, crypto.KeyGenSeedMinLen)) seed := make([]byte, seedLen) n, err := rand.Read(seed) if err != nil || n != seedLen { diff --git a/integration/tests/access/cohort1/access_api_test.go 
b/integration/tests/access/cohort1/access_api_test.go index cb5a175130d..24409f84ad2 100644 --- a/integration/tests/access/cohort1/access_api_test.go +++ b/integration/tests/access/cohort1/access_api_test.go @@ -87,7 +87,12 @@ func (s *AccessAPISuite) SetupTest() { ) consensusConfigs := []func(config *testnet.NodeConfig){ - testnet.WithAdditionalFlag("--cruise-ctl-fallback-proposal-duration=100ms"), + // `cruise-ctl-fallback-proposal-duration` is set to 250ms instead to of 100ms + // to purposely slow down the block rate. This is needed since the crypto module + // update providing faster BLS operations. + // TODO: fix the access integration test logic to function without slowing down + // the block rate + testnet.WithAdditionalFlag("--cruise-ctl-fallback-proposal-duration=250ms"), testnet.WithAdditionalFlagf("--required-verification-seal-approvals=%d", 1), testnet.WithAdditionalFlagf("--required-construction-seal-approvals=%d", 1), testnet.WithLogLevel(zerolog.FatalLevel), diff --git a/integration/tests/bft/base_suite.go b/integration/tests/bft/base_suite.go index b50085a9e50..2e6e74de881 100644 --- a/integration/tests/bft/base_suite.go +++ b/integration/tests/bft/base_suite.go @@ -77,7 +77,12 @@ func (b *BaseSuite) SetupSuite() { testnet.WithLogLevel(zerolog.FatalLevel), testnet.WithAdditionalFlag("--required-verification-seal-approvals=1"), testnet.WithAdditionalFlag("--required-construction-seal-approvals=1"), - testnet.WithAdditionalFlag("--cruise-ctl-fallback-proposal-duration=1ms"), + // `cruise-ctl-fallback-proposal-duration` is set to 250ms instead to of 1ms + // to purposely slow down the block rate. This is needed since the crypto module + // update providing faster BLS operations. + // TODO: fix the access integration test logic to function without slowing down + // the block rate + testnet.WithAdditionalFlag("--cruise-ctl-fallback-proposal-duration=250ms"), ) b.NodeConfigs = append(b.NodeConfigs, nodeConfig) } diff --git a/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go b/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go index fb825e447a6..d101af6371d 100644 --- a/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go +++ b/integration/tests/epochs/cohort2/epoch_join_and_leave_sn_test.go @@ -2,6 +2,7 @@ package cohort2 import ( "testing" + "time" "github.com/stretchr/testify/suite" @@ -17,6 +18,15 @@ type EpochJoinAndLeaveSNSuite struct { epochs.DynamicEpochTransitionSuite } +func (s *EpochJoinAndLeaveSNSuite) SetupTest() { + // slow down the block rate. This is needed since the crypto module + // update provides faster BLS operations. + // TODO: fix the access integration test logic to function without slowing down + // the block rate + s.ConsensusProposalDuration = time.Millisecond * 250 + s.DynamicEpochTransitionSuite.SetupTest() +} + // TestEpochJoinAndLeaveSN should update consensus nodes and assert healthy network conditions // after the epoch transition completes. See health check function for details. 
func (s *EpochJoinAndLeaveSNSuite) TestEpochJoinAndLeaveSN() { diff --git a/integration/tests/epochs/cohort2/epoch_join_and_leave_vn_test.go b/integration/tests/epochs/cohort2/epoch_join_and_leave_vn_test.go index d4b46693bb9..ed8f7ef1ae1 100644 --- a/integration/tests/epochs/cohort2/epoch_join_and_leave_vn_test.go +++ b/integration/tests/epochs/cohort2/epoch_join_and_leave_vn_test.go @@ -32,7 +32,7 @@ func (s *EpochJoinAndLeaveVNSuite) SetupTest() { s.DKGPhaseLen = 100 s.EpochLen = 450 s.EpochCommitSafetyThreshold = 20 - s.DynamicEpochTransitionSuite.Suite.SetupTest() + s.Suite.SetupTest() } // TestEpochJoinAndLeaveVN should update verification nodes and assert healthy network conditions diff --git a/integration/tests/upgrades/suite.go b/integration/tests/upgrades/suite.go index dbc40e810aa..93094b8c13b 100644 --- a/integration/tests/upgrades/suite.go +++ b/integration/tests/upgrades/suite.go @@ -83,10 +83,12 @@ func (s *Suite) SetupTest() { testnet.WithLogLevel(zerolog.WarnLevel), testnet.WithID(s.exe1ID), testnet.WithAdditionalFlag("--extensive-logging=true"), + testnet.WithAdditionalFlag("--max-graceful-stop-duration=1s"), ), testnet.NewNodeConfig( flow.RoleExecution, testnet.WithLogLevel(zerolog.WarnLevel), + testnet.WithAdditionalFlag("--max-graceful-stop-duration=1s"), ), testnet.NewNodeConfig(flow.RoleConsensus, consensusConfigs...), testnet.NewNodeConfig(flow.RoleConsensus, consensusConfigs...), diff --git a/model/bootstrap/node_info.go b/model/bootstrap/node_info.go index cdc6f855c4a..62a33f6f442 100644 --- a/model/bootstrap/node_info.go +++ b/model/bootstrap/node_info.go @@ -174,6 +174,18 @@ type decodableNodeInfoPub struct { Stake uint64 } +func (info *NodeInfoPub) Equals(other *NodeInfoPub) bool { + if other == nil { + return false + } + return info.Address == other.Address && + info.NodeID == other.NodeID && + info.Role == other.Role && + info.Weight == other.Weight && + info.NetworkPubKey.PublicKey.Equals(other.NetworkPubKey.PublicKey) && + info.StakingPubKey.PublicKey.Equals(other.StakingPubKey.PublicKey) +} + func (info *NodeInfoPub) UnmarshalJSON(b []byte) error { var decodable decodableNodeInfoPub err := json.Unmarshal(b, &decodable) diff --git a/model/bootstrap/node_info_test.go b/model/bootstrap/node_info_test.go index 536c0c808f9..39294de5f69 100644 --- a/model/bootstrap/node_info_test.go +++ b/model/bootstrap/node_info_test.go @@ -50,7 +50,7 @@ func TestNodeInfoPubEncodingJSON(t *testing.T) { var dec bootstrap.NodeInfoPub err = json.Unmarshal(enc, &dec) require.NoError(t, err) - assert.Equal(t, conf, dec) + assert.True(t, dec.Equals(&conf)) }) t.Run("compat: should accept old files using Stake field", func(t *testing.T) { conf := unittest.NodeInfoFixture().Public() @@ -61,6 +61,6 @@ func TestNodeInfoPubEncodingJSON(t *testing.T) { var dec bootstrap.NodeInfoPub err = json.Unmarshal(enc, &dec) require.NoError(t, err) - assert.Equal(t, conf, dec) + assert.True(t, dec.Equals(&conf)) }) } diff --git a/model/encodable/keys_test.go b/model/encodable/keys_test.go index ccdf63cd044..338c1708366 100644 --- a/model/encodable/keys_test.go +++ b/model/encodable/keys_test.go @@ -247,7 +247,7 @@ func TestEncodableRandomBeaconPrivKeyMsgPack(t *testing.T) { err = key.UnmarshalMsgpack(b) require.NoError(t, err) - require.Equal(t, oldPubKey, key.PublicKey) + require.True(t, oldPubKey.Equals(key.PublicKey)) } func generateRandomSeed(t *testing.T) []byte { diff --git a/model/flow/identity.go b/model/flow/identity.go index c44c394cb06..975baa556e9 100644 --- a/model/flow/identity.go +++ 
b/model/flow/identity.go @@ -61,6 +61,19 @@ type Identity struct { NetworkPubKey crypto.PublicKey } +func (id *Identity) Equals(other *Identity) bool { + if other == nil { + return false + } + return id.NodeID == other.NodeID && + id.Address == other.Address && + id.Role == other.Role && + id.Weight == other.Weight && + id.Ejected == other.Ejected && + id.StakingPubKey.Equals(other.StakingPubKey) && + id.NetworkPubKey.Equals(other.NetworkPubKey) +} + // ParseIdentity parses a string representation of an identity. func ParseIdentity(identity string) (*Identity, error) { diff --git a/model/flow/identity_test.go b/model/flow/identity_test.go index 891a854aca6..849db712d7d 100644 --- a/model/flow/identity_test.go +++ b/model/flow/identity_test.go @@ -58,7 +58,7 @@ func TestIdentityEncodingJSON(t *testing.T) { var dec flow.Identity err = json.Unmarshal(enc, &dec) require.NoError(t, err) - require.Equal(t, identity, &dec) + require.True(t, identity.Equals(&dec)) }) t.Run("empty address should be omitted", func(t *testing.T) { @@ -71,7 +71,7 @@ func TestIdentityEncodingJSON(t *testing.T) { var dec flow.Identity err = json.Unmarshal(enc, &dec) require.NoError(t, err) - require.Equal(t, identity, &dec) + require.True(t, identity.Equals(&dec)) }) t.Run("compat: should accept old files using Stake field", func(t *testing.T) { @@ -83,7 +83,7 @@ func TestIdentityEncodingJSON(t *testing.T) { var dec flow.Identity err = json.Unmarshal(enc, &dec) require.NoError(t, err) - require.Equal(t, identity, &dec) + require.True(t, identity.Equals(&dec)) }) } @@ -94,7 +94,7 @@ func TestIdentityEncodingMsgpack(t *testing.T) { var dec flow.Identity err = msgpack.Unmarshal(enc, &dec) require.NoError(t, err) - require.Equal(t, identity, &dec) + require.True(t, identity.Equals(&dec)) } func TestIdentityList_Exists(t *testing.T) { diff --git a/module/dkg/controller_test.go b/module/dkg/controller_test.go index e8f8d253537..3d9d1676a6a 100644 --- a/module/dkg/controller_test.go +++ b/module/dkg/controller_test.go @@ -248,7 +248,7 @@ func initNodes(t *testing.T, n int, phase1Duration, phase2Duration, phase3Durati logger: logger, } - seed := unittest.SeedFixture(20) + seed := unittest.SeedFixture(crypto.KeyGenSeedMinLen) dkg, err := crypto.NewJointFeldman(n, signature.RandomBeaconThreshold(n), i, broker) require.NoError(t, err) diff --git a/module/dkg_broker.go b/module/dkg_broker.go index 49ebb0ad051..7e64353816e 100644 --- a/module/dkg_broker.go +++ b/module/dkg_broker.go @@ -1,6 +1,3 @@ -//go:build relic -// +build relic - package module import ( diff --git a/module/metrics/example/README.md b/module/metrics/example/README.md index f693cac0780..ec319414ad8 100644 --- a/module/metrics/example/README.md +++ b/module/metrics/example/README.md @@ -18,7 +18,7 @@ You can choose one of the following: Note: Running example with `-happypath` flag examines the metrics collection on a real happy path of verification node. 
diff --git a/module/dkg_broker.go b/module/dkg_broker.go
index 49ebb0ad051..7e64353816e 100644
--- a/module/dkg_broker.go
+++ b/module/dkg_broker.go
@@ -1,6 +1,3 @@
-//go:build relic
-// +build relic
-
 package module
 
 import (
diff --git a/module/metrics/example/README.md b/module/metrics/example/README.md
index f693cac0780..ec319414ad8 100644
--- a/module/metrics/example/README.md
+++ b/module/metrics/example/README.md
@@ -18,7 +18,7 @@ You can choose one of the following:
 Note: Running example with `-happypath` flag examines the metrics collection on a real happy path of verification node.
 
 ```
-   go run --tags=relic module/metrics/example/verification/main.go
+   go run module/metrics/example/verification/main.go
 ```
 - Consensus Node:
 ```
diff --git a/module/signature/aggregation.go b/module/signature/aggregation.go
index 99129c656dc..76101ee3805 100644
--- a/module/signature/aggregation.go
+++ b/module/signature/aggregation.go
@@ -1,6 +1,3 @@
-//go:build relic
-// +build relic
-
 package signature
 
 import (
diff --git a/module/signature/aggregation_no_relic.go b/module/signature/aggregation_no_relic.go
deleted file mode 100644
index 6b51c6f35a3..00000000000
--- a/module/signature/aggregation_no_relic.go
+++ /dev/null
@@ -1,34 +0,0 @@
-//go:build !relic
-// +build !relic
-
-package signature
-
-import (
-	"github.com/onflow/flow-go/crypto"
-)
-
-const panic_relic = "function only supported with the relic build tag"
-
-// These functions are the non-relic versions of some public functions from the package.
-// The functions are here to allow the build of flow-emulator, since the emulator is built
-// without the "relic" build tag, and does not run the functions below.
-type SignatureAggregatorSameMessage struct{}
-
-func NewSignatureAggregatorSameMessage(
-	message []byte,
-	dsTag string,
-	publicKeys []crypto.PublicKey,
-) (*SignatureAggregatorSameMessage, error) {
-	panic(panic_relic)
-}
-
-func (s *SignatureAggregatorSameMessage) Verify(signer int, sig crypto.Signature) (bool, error) {
-	panic(panic_relic)
-}
-func (s *SignatureAggregatorSameMessage) TrustedAdd(signer int, sig crypto.Signature) error {
-	panic(panic_relic)
-}
-
-func (s *SignatureAggregatorSameMessage) Aggregate() ([]int, crypto.Signature, error) {
-	panic(panic_relic)
-}
diff --git a/module/signature/aggregation_test.go b/module/signature/aggregation_test.go
index aebc696b091..87a31561753 100644
--- a/module/signature/aggregation_test.go
+++ b/module/signature/aggregation_test.go
@@ -1,6 +1,3 @@
-//go:build relic
-// +build relic
-
 package signature
 
 import (
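With the `!relic` stubs deleted, `SignatureAggregatorSameMessage` always refers to the real implementation in `module/signature`. A rough usage sketch based only on the constructor and method signatures visible in the removed stub; the domain tag, participant count, and the assumption that the aggregator's internal hasher matches `crypto.NewExpandMsgXOFKMAC128(dsTag)` are illustrative and not confirmed by this diff:

```go
package main

import (
	"crypto/rand"
	"fmt"

	"github.com/onflow/flow-go/crypto"
	msig "github.com/onflow/flow-go/module/signature"
)

func main() {
	const dsTag = "example_tag"
	message := []byte("message signed by every participant")

	// The signatures must be produced with the hasher the aggregator derives
	// from dsTag; assumed here to be the BLS KMAC128 expand-message hasher.
	hasher := crypto.NewExpandMsgXOFKMAC128(dsTag)

	n := 2
	pks := make([]crypto.PublicKey, n)
	sigs := make([]crypto.Signature, n)
	for i := 0; i < n; i++ {
		seed := make([]byte, crypto.KeyGenSeedMinLen)
		if _, err := rand.Read(seed); err != nil {
			panic(err)
		}
		sk, err := crypto.GeneratePrivateKey(crypto.BLSBLS12381, seed)
		if err != nil {
			panic(err)
		}
		pks[i] = sk.PublicKey()
		sigs[i], err = sk.Sign(message, hasher)
		if err != nil {
			panic(err)
		}
	}

	// Verify each share, add it as trusted, then aggregate.
	agg, err := msig.NewSignatureAggregatorSameMessage(message, dsTag, pks)
	if err != nil {
		panic(err)
	}
	for i := 0; i < n; i++ {
		ok, err := agg.Verify(i, sigs[i])
		if err != nil || !ok {
			panic("invalid signature share")
		}
		if err := agg.TrustedAdd(i, sigs[i]); err != nil {
			panic(err)
		}
	}
	signers, aggSig, err := agg.Aggregate()
	if err != nil {
		panic(err)
	}
	fmt.Println("signers:", signers, "aggregated signature bytes:", len(aggSig))
}
```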
"test-result-crypto-hash-1-count-skip-pass.json", }, - // raw results generated with: go test -json -count 1 --tags relic ./utils/unittest/... + // raw results generated with: go test -json -count 1 ./utils/unittest/... "2 count all pass": { ExpectedLevel1Summary: testdata.GetTestData_Level1_2CountPass(), RawJSONTestRunFile: "test-result-crypto-hash-2-count-pass.json", }, - // raw results generated with: go test -json -count 1 --tags relic ./utils/unittest/... + // raw results generated with: go test -json -count 1 ./utils/unittest/... "10 count all pass": { ExpectedLevel1Summary: testdata.GetTestData_Level1_10CountPass(), RawJSONTestRunFile: "test-result-crypto-hash-10-count-pass.json", }, - // raw results generated with: go test -json -count 1 --tags relic ./utils/unittest/... + // raw results generated with: go test -json -count 1 ./utils/unittest/... "10 count some failures": { ExpectedLevel1Summary: testdata.GetTestData_Level1_10CountSomeFailures(), RawJSONTestRunFile: "test-result-crypto-hash-10-count-fail.json", @@ -54,14 +54,14 @@ func TestGenerateLevel1Summary_Struct(t *testing.T) { // no result tests - tests below don't generate pass/fail result due to `go test` bug // with using `fmt.printf("log message")` without newline `\n` - // raw results generated with: go test -v -tags relic -count=1 -json ./model/encodable/. -test.run TestEncodableRandomBeaconPrivKeyMsgPack + // raw results generated with: go test -v -count=1 -json ./model/encodable/. -test.run TestEncodableRandomBeaconPrivKeyMsgPack // this is a single unit test that produces a no result "1 count single no result test": { ExpectedLevel1Summary: testdata.GetTestData_Level1_1CountSingleExceptionTest(), RawJSONTestRunFile: "test-result-exception-single-1-count-pass.json", }, - //raw results generated with: go test -v -tags relic -count=5 -json ./model/encodable/. -test.run TestEncodableRandomBeaconPrivKeyMsgPack + //raw results generated with: go test -v -count=5 -json ./model/encodable/. -test.run TestEncodableRandomBeaconPrivKeyMsgPack //multiple no result tests in a row "5 no result tests in a row": { ExpectedLevel1Summary: testdata.GetTestData_Level1_5CountSingleExceptionTest(), @@ -74,7 +74,7 @@ func TestGenerateLevel1Summary_Struct(t *testing.T) { RawJSONTestRunFile: "test-result-exception-single-5-count-4-nil-1-normal-pass.json", }, - // raw results generated with: go test -v -tags relic -count=3 -json ./model/encodable/. + // raw results generated with: go test -v -count=3 -json ./model/encodable/. // group of unit tests with a single no result test "3 count no result test with normal tests": { ExpectedLevel1Summary: testdata.GetTestData_Leve1_3CountExceptionWithNormalTests(), diff --git a/utils/binstat/binstat_external_test.go b/utils/binstat/binstat_external_test.go index 9ffa7b23065..10f8b911ff9 100644 --- a/utils/binstat/binstat_external_test.go +++ b/utils/binstat/binstat_external_test.go @@ -28,7 +28,7 @@ import ( * 5. Strip "time" field from JSON log line output for shorter read, and * 6. Show the amount of code coverage from the tests. * - * pushd utils/binstat ; go fmt ./*.go ; golangci-lint run && go test -v -vv -coverprofile=coverage.txt -covermode=atomic --tags relic ./... | perl -lane 's~\\n~\n~g; s~"time".*?,~~g; print;' ; go tool cover -func=coverage.txt ; popd + * pushd utils/binstat ; go fmt ./*.go ; golangci-lint run && go test -v -vv -coverprofile=coverage.txt -covermode=atomic ./... | perl -lane 's~\\n~\n~g; s~"time".*?,~~g; print;' ; go tool cover -func=coverage.txt ; popd */ /*