From 50965113d4dfa300dc1ec3e112c366aa34f67777 Mon Sep 17 00:00:00 2001
From: Tino Reichardt
Date: Wed, 11 May 2022 00:39:44 +0200
Subject: [PATCH 01/10] Add new files for BLAKE3

This commit adds the cryptographic hash function BLAKE3 to the icp
module. The code for this hash was put into the public domain, so the
CDDL could be used for all files.

Homepage of BLAKE3: https://github.com/BLAKE3-team/BLAKE3
Wikipedia: https://en.wikipedia.org/wiki/BLAKE_(hash_function)#BLAKE3

Short description from Wikipedia:

BLAKE3 is a cryptographic hash function based on Bao and BLAKE2,
created by Jack O'Connor, Jean-Philippe Aumasson, Samuel Neves, and
Zooko Wilcox-O'Hearn. It was announced on January 9, 2020, at Real
World Crypto. BLAKE3 is a single algorithm with many desirable
features (parallelism, XOF, KDF, PRF and MAC), in contrast to BLAKE
and BLAKE2, which are algorithm families with multiple variants.
BLAKE3 has a binary tree structure, so it supports a practically
unlimited degree of parallelism (both SIMD and multithreading) given
enough input. The official Rust and C implementations are
dual-licensed as public domain (CC0) and the Apache License.

The new code is split into these parts:
- 1x cross-platform generic C variant: blake3_generic.c
- 4x assembly for X86-64 (SSE2, SSE4.1, AVX2, AVX512)
- 2x assembly for ARMv8 (NEON converted from SSE2)
- 2x assembly for PPC64-LE (POWER8 converted from SSE2)
- one file for switching between the implementations

Signed-off-by: Tino Reichardt
Co-authored-by: Rich Ercolani
Closes #10058
---
 config/always-arch.m4 | 2 +
 lib/libicp/Makefile.am | 25 +-
 lib/libzpool/Makefile.am | 2 +
 module/Kbuild.in | 32 +-
 module/Makefile.bsd | 34 +-
 module/icp/algs/blake3/blake3.c | 732 ++++
 module/icp/algs/blake3/blake3_generic.c | 202 ++
 module/icp/algs/blake3/blake3_impl.c | 256 ++
 module/icp/algs/blake3/blake3_impl.h | 213 ++
 module/icp/algs/blake3/blake3_x86-64.c | 244 ++
 .../icp/asm-aarch64/blake3/b3_aarch64_sse2.S | 2450 +++++++++++++
 .../icp/asm-aarch64/blake3/b3_aarch64_sse41.S | 2463 +++++++++++++
 module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S | 2823 +++++++++++++++
 .../icp/asm-ppc64/blake3/b3_ppc64le_sse41.S | 3064 +++++++++++++++++
 module/icp/asm-x86_64/blake3/blake3_avx2.S | 1845 ++++++++++
 module/icp/asm-x86_64/blake3/blake3_avx512.S | 2618 ++++++++++++++
 module/icp/asm-x86_64/blake3/blake3_sse2.S | 2323 +++++++++++++
 module/icp/asm-x86_64/blake3/blake3_sse41.S | 2058 +++++++++++
 18 files changed, 21377 insertions(+), 9 deletions(-)
 create mode 100644 module/icp/algs/blake3/blake3.c
 create mode 100644 module/icp/algs/blake3/blake3_generic.c
 create mode 100644 module/icp/algs/blake3/blake3_impl.c
 create mode 100644 module/icp/algs/blake3/blake3_impl.h
 create mode 100644 module/icp/algs/blake3/blake3_x86-64.c
 create mode 100644 module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S
 create mode 100644 module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
 create mode 100644 module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S
 create mode 100644 module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S
 create mode 100644 module/icp/asm-x86_64/blake3/blake3_avx2.S
 create mode 100644 module/icp/asm-x86_64/blake3/blake3_avx512.S
 create mode 100644 module/icp/asm-x86_64/blake3/blake3_sse2.S
 create mode 100644 module/icp/asm-x86_64/blake3/blake3_sse41.S

diff --git a/config/always-arch.m4 b/config/always-arch.m4
index 02c8e4775b95..f7090a4826ba 100644
--- a/config/always-arch.m4
+++ b/config/always-arch.m4
@@ -30,6 +30,8 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_ARCH], [
 		;;
 	esac
 
+
AM_CONDITIONAL([TARGET_CPU_AARCH64], test $TARGET_CPU = aarch64) AM_CONDITIONAL([TARGET_CPU_X86_64], test $TARGET_CPU = x86_64) AM_CONDITIONAL([TARGET_CPU_POWERPC], test $TARGET_CPU = powerpc) + AM_CONDITIONAL([TARGET_CPU_SPARC64], test $TARGET_CPU = sparc64) ]) diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am index 304f49e39005..b7f1d0e1b1e4 100644 --- a/lib/libicp/Makefile.am +++ b/lib/libicp/Makefile.am @@ -13,6 +13,10 @@ nodist_libicp_la_SOURCES = \ module/icp/algs/aes/aes_impl_x86-64.c \ module/icp/algs/aes/aes_impl.c \ module/icp/algs/aes/aes_modes.c \ + module/icp/algs/blake3/blake3.c \ + module/icp/algs/blake3/blake3_generic.c \ + module/icp/algs/blake3/blake3_impl.c \ + module/icp/algs/blake3/blake3_x86-64.c \ module/icp/algs/edonr/edonr.c \ module/icp/algs/modes/modes.c \ module/icp/algs/modes/cbc.c \ @@ -36,15 +40,30 @@ nodist_libicp_la_SOURCES = \ module/icp/core/kcf_mech_tabs.c \ module/icp/core/kcf_prov_tabs.c -if TARGET_CPU_X86_64 +if TARGET_CPU_AARCH64 +nodist_libicp_la_SOURCES += \ + module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S \ + module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S +endif + +if TARGET_CPU_POWERPC nodist_libicp_la_SOURCES += \ - module/icp/asm-x86_64/aes/aeskey.c + module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S \ + module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S +endif + +if TARGET_CPU_X86_64 nodist_libicp_la_SOURCES += \ + module/icp/asm-x86_64/aes/aeskey.c \ module/icp/asm-x86_64/aes/aes_amd64.S \ module/icp/asm-x86_64/aes/aes_aesni.S \ module/icp/asm-x86_64/modes/gcm_pclmulqdq.S \ module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S \ module/icp/asm-x86_64/modes/ghash-x86_64.S \ module/icp/asm-x86_64/sha2/sha256_impl.S \ - module/icp/asm-x86_64/sha2/sha512_impl.S + module/icp/asm-x86_64/sha2/sha512_impl.S \ + module/icp/asm-x86_64/blake3/blake3_avx2.S \ + module/icp/asm-x86_64/blake3/blake3_avx512.S \ + module/icp/asm-x86_64/blake3/blake3_sse2.S \ + module/icp/asm-x86_64/blake3/blake3_sse41.S endif diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 60eb30749847..eaa920e56106 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -67,6 +67,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/abd.c \ module/zfs/aggsum.c \ module/zfs/arc.c \ + module/zfs/blake3_zfs.c \ module/zfs/blkptr.c \ module/zfs/bplist.c \ module/zfs/bpobj.c \ @@ -171,6 +172,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/zcp_synctask.c \ module/zfs/zfeature.c \ module/zfs/zfs_byteswap.c \ + module/zfs/zfs_chksum.c \ module/zfs/zfs_fm.c \ module/zfs/zfs_fuid.c \ module/zfs/zfs_ratelimit.c \ diff --git a/module/Kbuild.in b/module/Kbuild.in index 11099999fb87..ed8dc23a90d3 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -75,6 +75,10 @@ ICP_OBJS := \ algs/aes/aes_impl.o \ algs/aes/aes_impl_generic.o \ algs/aes/aes_modes.o \ + algs/blake3/blake3.o \ + algs/blake3/blake3_generic.o \ + algs/blake3/blake3_impl.o \ + algs/blake3/blake3_x86-64.o \ algs/edonr/edonr.o \ algs/modes/cbc.o \ algs/modes/ccm.o \ @@ -105,23 +109,44 @@ ICP_OBJS_X86_64 := \ asm-x86_64/aes/aes_aesni.o \ asm-x86_64/aes/aes_amd64.o \ asm-x86_64/aes/aeskey.o \ + asm-x86_64/blake3/blake3_avx2.o \ + asm-x86_64/blake3/blake3_avx512.o \ + asm-x86_64/blake3/blake3_sse2.o \ + asm-x86_64/blake3/blake3_sse41.o \ asm-x86_64/modes/aesni-gcm-x86_64.o \ asm-x86_64/modes/gcm_pclmulqdq.o \ asm-x86_64/modes/ghash-x86_64.o \ asm-x86_64/sha2/sha256_impl.o \ asm-x86_64/sha2/sha512_impl.o + ICP_OBJS_X86 := \ algs/aes/aes_impl_aesni.o \ algs/aes/aes_impl_x86-64.o \ algs/modes/gcm_pclmulqdq.o + 
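
For orientation before the new sources below, here is a minimal, hypothetical sketch of how a caller could drive the one-shot hashing API this patch adds in blake3.c (Blake3_Init, Blake3_Update, Blake3_Final). The <sys/blake3.h> header name and the example function are assumptions made only for illustration; BLAKE3_CTX, BLAKE3_OUT_LEN and the Blake3_* entry points themselves come from the code added by this patch.

/*
 * Illustrative sketch only -- not part of this patch. Assumes the
 * Blake3_* prototypes plus BLAKE3_CTX and BLAKE3_OUT_LEN are made
 * available by a header such as <sys/blake3.h>.
 */
#include <sys/blake3.h>

static void
blake3_digest_example(const void *buf, size_t len,
    uint8_t digest[BLAKE3_OUT_LEN])
{
	BLAKE3_CTX ctx;

	Blake3_Init(&ctx);		/* unkeyed hash */
	Blake3_Update(&ctx, buf, len);	/* may be called repeatedly */
	Blake3_Final(&ctx, digest);	/* writes BLAKE3_OUT_LEN bytes */
}

Blake3_InitKeyed() (also added below) replaces Blake3_Init() when a BLAKE3_KEY_LEN-byte key is supplied, and Blake3_FinalSeek() can be used instead of Blake3_Final() to read extended (XOF) output at an arbitrary offset.
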
+ICP_OBJS_ARM64 := \ + asm-aarch64/blake3/b3_aarch64_sse2.o \ + asm-aarch64/blake3/b3_aarch64_sse41.o + + +ICP_OBJS_PPC_PPC64 := \ + asm-ppc64/blake3/b3_ppc64le_sse2.o \ + asm-ppc64/blake3/b3_ppc64le_sse41.o + zfs-objs += $(addprefix icp/,$(ICP_OBJS)) zfs-$(CONFIG_X86) += $(addprefix icp/,$(ICP_OBJS_X86)) zfs-$(CONFIG_X86_64) += $(addprefix icp/,$(ICP_OBJS_X86_64)) +zfs-$(CONFIG_ARM64) += $(addprefix icp/,$(ICP_OBJS_ARM64)) +zfs-$(CONFIG_PPC) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64)) +zfs-$(CONFIG_PPC64) += $(addprefix icp/,$(ICP_OBJS_PPC_PPC64)) + +$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \ + $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : asflags-y += -I$(icp_include) -$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64)) : asflags-y += -I$(icp_include) -$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64)) : ccflags-y += -I$(icp_include) +$(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64) \ + $(ICP_OBJS_ARM64) $(ICP_OBJS_PPC_PPC64)) : ccflags-y += -I$(icp_include) # Suppress objtool "can't find jump dest instruction at" warnings. They # are caused by the constants which are defined in the text section of the @@ -129,6 +154,7 @@ $(addprefix $(obj)/icp/,$(ICP_OBJS) $(ICP_OBJS_X86) $(ICP_OBJS_X86_64)) : ccflag # utility tries to interpret them as opcodes and obviously fails doing so. OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y OBJECT_FILES_NON_STANDARD_ghash-x86_64.o := y + # Suppress objtool "unsupported stack pointer realignment" warnings. We are # not using a DRAP register while aligning the stack to a 64 byte boundary. # See #6950 for the reasoning. @@ -261,6 +287,7 @@ ZFS_OBJS := \ abd.o \ aggsum.o \ arc.o \ + blake3_zfs.o \ blkptr.o \ bplist.o \ bpobj.o \ @@ -358,6 +385,7 @@ ZFS_OBJS := \ zcp_synctask.o \ zfeature.o \ zfs_byteswap.o \ + zfs_chksum.o \ zfs_fm.o \ zfs_fuid.o \ zfs_ioctl.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 61f02152d334..589ca60b29be 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -10,6 +10,10 @@ INCDIR=${.CURDIR:H}/include KMOD= openzfs .PATH: ${SRCDIR}/avl \ + ${SRCDIR}/icp/algs/blake3 \ + ${SRCDIR}/icp/asm-aarch64/blake3 \ + ${SRCDIR}/icp/asm-ppc64/blake3 \ + ${SRCDIR}/icp/asm-x86_64/blake3 \ ${SRCDIR}/lua \ ${SRCDIR}/nvpair \ ${SRCDIR}/icp/algs/edonr \ @@ -31,6 +35,7 @@ CFLAGS+= -I${INCDIR}/os/freebsd CFLAGS+= -I${INCDIR}/os/freebsd/spl CFLAGS+= -I${INCDIR}/os/freebsd/zfs CFLAGS+= -I${SRCDIR}/zstd/include +CFLAGS+= -I${SRCDIR}/icp/include CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 \ @@ -38,7 +43,8 @@ CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 \ -D_SYS_VMEM_H_ -DKDTRACE_HOOKS -DSMP -DCOMPAT_FREEBSD11 .if ${MACHINE_ARCH} == "amd64" -CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F -DHAVE_SSSE3 +CFLAGS+= -D__x86_64 -DHAVE_SSE2 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 \ + -DHAVE_AVX -DHAVE_AVX2 -DHAVE_AVX512F -DHAVE_AVX512VL .endif .if defined(WITH_DEBUG) && ${WITH_DEBUG} == "true" @@ -73,12 +79,32 @@ CFLAGS+= -DBITS_PER_LONG=64 SRCS= vnode_if.h device_if.h bus_if.h -# avl +#avl SRCS+= avl.c # icp SRCS+= edonr.c +#icp/algs/blake3 +SRCS+= blake3.c \ + blake3_generic.c \ + blake3_impl.c \ + blake3_x86-64.c + +#icp/asm-aarch64/blake3 +SRCS+= b3_aarch64_sse2.S \ + b3_aarch64_sse41.S + +#icp/asm-ppc64/blake3 +SRCS+= b3_ppc64le_sse2.S \ + b3_ppc64le_sse41.S + +#icp/asm-x86_64/blake3 +SRCS+= blake3_avx2.S \ + 
blake3_avx512.S \ + blake3_sse2.S \ + blake3_sse41.S + #lua SRCS+= lapi.c \ lauxlib.c \ @@ -189,6 +215,7 @@ SRCS+= zfeature_common.c \ SRCS+= abd.c \ aggsum.c \ arc.c \ + blake3_zfs.c \ blkptr.c \ bplist.c \ bpobj.c \ @@ -291,6 +318,7 @@ SRCS+= abd.c \ zcp_synctask.c \ zfeature.c \ zfs_byteswap.c \ + zfs_chksum.c \ zfs_file_os.c \ zfs_fm.c \ zfs_fuid.c \ @@ -337,8 +365,6 @@ SRCS+= zfs_zstd.c \ zstd_decompress.c \ zstd_decompress_block.c - - beforeinstall: .if ${MK_DEBUG_FILES} != "no" mtree -eu \ diff --git a/module/icp/algs/blake3/blake3.c b/module/icp/algs/blake3/blake3.c new file mode 100644 index 000000000000..8c9c06eb9d9f --- /dev/null +++ b/module/icp/algs/blake3/blake3.c @@ -0,0 +1,732 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor + * Copyright (c) 2021-2022 Tino Reichardt + */ + +#include +#include + +#include "blake3_impl.h" + +/* + * We need 1056 byte stack for blake3_compress_subtree_wide() + * - we define this pragma to make gcc happy + */ +#if defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wframe-larger-than=" +#endif + +/* internal used */ +typedef struct { + uint32_t input_cv[8]; + uint64_t counter; + uint8_t block[BLAKE3_BLOCK_LEN]; + uint8_t block_len; + uint8_t flags; +} output_t; + +/* internal flags */ +enum blake3_flags { + CHUNK_START = 1 << 0, + CHUNK_END = 1 << 1, + PARENT = 1 << 2, + ROOT = 1 << 3, + KEYED_HASH = 1 << 4, + DERIVE_KEY_CONTEXT = 1 << 5, + DERIVE_KEY_MATERIAL = 1 << 6, +}; + +/* internal start */ +static void chunk_state_init(blake3_chunk_state_t *ctx, + const uint32_t key[8], uint8_t flags) +{ + memcpy(ctx->cv, key, BLAKE3_KEY_LEN); + ctx->chunk_counter = 0; + memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); + ctx->buf_len = 0; + ctx->blocks_compressed = 0; + ctx->flags = flags; +} + +static void chunk_state_reset(blake3_chunk_state_t *ctx, + const uint32_t key[8], uint64_t chunk_counter) +{ + memcpy(ctx->cv, key, BLAKE3_KEY_LEN); + ctx->chunk_counter = chunk_counter; + ctx->blocks_compressed = 0; + memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); + ctx->buf_len = 0; +} + +static size_t chunk_state_len(const blake3_chunk_state_t *ctx) +{ + return (BLAKE3_BLOCK_LEN * (size_t)ctx->blocks_compressed) + + ((size_t)ctx->buf_len); +} + +static size_t chunk_state_fill_buf(blake3_chunk_state_t *ctx, + const uint8_t *input, size_t input_len) +{ + size_t take = BLAKE3_BLOCK_LEN - ((size_t)ctx->buf_len); + if (take > input_len) { + take = input_len; + } + uint8_t *dest = ctx->buf + ((size_t)ctx->buf_len); + memcpy(dest, input, take); + ctx->buf_len += (uint8_t)take; + return (take); +} + +static uint8_t chunk_state_maybe_start_flag(const blake3_chunk_state_t *ctx) 
+{ + if (ctx->blocks_compressed == 0) { + return (CHUNK_START); + } else { + return (0); + } +} + +static output_t make_output(const uint32_t input_cv[8], + const uint8_t *block, uint8_t block_len, + uint64_t counter, uint8_t flags) +{ + output_t ret; + memcpy(ret.input_cv, input_cv, 32); + memcpy(ret.block, block, BLAKE3_BLOCK_LEN); + ret.block_len = block_len; + ret.counter = counter; + ret.flags = flags; + return (ret); +} + +/* + * Chaining values within a given chunk (specifically the compress_in_place + * interface) are represented as words. This avoids unnecessary bytes<->words + * conversion overhead in the portable implementation. However, the hash_many + * interface handles both user input and parent node blocks, so it accepts + * bytes. For that reason, chaining values in the CV stack are represented as + * bytes. + */ +static void output_chaining_value(const blake3_impl_ops_t *ops, + const output_t *ctx, uint8_t cv[32]) +{ + uint32_t cv_words[8]; + memcpy(cv_words, ctx->input_cv, 32); + ops->compress_in_place(cv_words, ctx->block, ctx->block_len, + ctx->counter, ctx->flags); + store_cv_words(cv, cv_words); +} + +static void output_root_bytes(const blake3_impl_ops_t *ops, const output_t *ctx, + uint64_t seek, uint8_t *out, size_t out_len) +{ + uint64_t output_block_counter = seek / 64; + size_t offset_within_block = seek % 64; + uint8_t wide_buf[64]; + while (out_len > 0) { + ops->compress_xof(ctx->input_cv, ctx->block, ctx->block_len, + output_block_counter, ctx->flags | ROOT, wide_buf); + size_t available_bytes = 64 - offset_within_block; + size_t memcpy_len; + if (out_len > available_bytes) { + memcpy_len = available_bytes; + } else { + memcpy_len = out_len; + } + memcpy(out, wide_buf + offset_within_block, memcpy_len); + out += memcpy_len; + out_len -= memcpy_len; + output_block_counter += 1; + offset_within_block = 0; + } +} + +static void chunk_state_update(const blake3_impl_ops_t *ops, + blake3_chunk_state_t *ctx, const uint8_t *input, size_t input_len) +{ + if (ctx->buf_len > 0) { + size_t take = chunk_state_fill_buf(ctx, input, input_len); + input += take; + input_len -= take; + if (input_len > 0) { + ops->compress_in_place(ctx->cv, ctx->buf, + BLAKE3_BLOCK_LEN, ctx->chunk_counter, + ctx->flags|chunk_state_maybe_start_flag(ctx)); + ctx->blocks_compressed += 1; + ctx->buf_len = 0; + memset(ctx->buf, 0, BLAKE3_BLOCK_LEN); + } + } + + while (input_len > BLAKE3_BLOCK_LEN) { + ops->compress_in_place(ctx->cv, input, BLAKE3_BLOCK_LEN, + ctx->chunk_counter, + ctx->flags|chunk_state_maybe_start_flag(ctx)); + ctx->blocks_compressed += 1; + input += BLAKE3_BLOCK_LEN; + input_len -= BLAKE3_BLOCK_LEN; + } + + size_t take = chunk_state_fill_buf(ctx, input, input_len); + input += take; + input_len -= take; +} + +static output_t chunk_state_output(const blake3_chunk_state_t *ctx) +{ + uint8_t block_flags = + ctx->flags | chunk_state_maybe_start_flag(ctx) | CHUNK_END; + return (make_output(ctx->cv, ctx->buf, ctx->buf_len, ctx->chunk_counter, + block_flags)); +} + +static output_t parent_output(const uint8_t block[BLAKE3_BLOCK_LEN], + const uint32_t key[8], uint8_t flags) +{ + return (make_output(key, block, BLAKE3_BLOCK_LEN, 0, flags | PARENT)); +} + +/* + * Given some input larger than one chunk, return the number of bytes that + * should go in the left subtree. This is the largest power-of-2 number of + * chunks that leaves at least 1 byte for the right subtree. + */ +static size_t left_len(size_t content_len) +{ + /* + * Subtract 1 to reserve at least one byte for the right side. 
+ * content_len + * should always be greater than BLAKE3_CHUNK_LEN. + */ + size_t full_chunks = (content_len - 1) / BLAKE3_CHUNK_LEN; + return (round_down_to_power_of_2(full_chunks) * BLAKE3_CHUNK_LEN); +} + +/* + * Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time + * on a single thread. Write out the chunk chaining values and return the + * number of chunks hashed. These chunks are never the root and never empty; + * those cases use a different codepath. + */ +static size_t compress_chunks_parallel(const blake3_impl_ops_t *ops, + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t *out) +{ + const uint8_t *chunks_array[MAX_SIMD_DEGREE]; + size_t input_position = 0; + size_t chunks_array_len = 0; + while (input_len - input_position >= BLAKE3_CHUNK_LEN) { + chunks_array[chunks_array_len] = &input[input_position]; + input_position += BLAKE3_CHUNK_LEN; + chunks_array_len += 1; + } + + ops->hash_many(chunks_array, chunks_array_len, BLAKE3_CHUNK_LEN / + BLAKE3_BLOCK_LEN, key, chunk_counter, B_TRUE, flags, CHUNK_START, + CHUNK_END, out); + + /* + * Hash the remaining partial chunk, if there is one. Note that the + * empty chunk (meaning the empty message) is a different codepath. + */ + if (input_len > input_position) { + uint64_t counter = chunk_counter + (uint64_t)chunks_array_len; + blake3_chunk_state_t chunk_state; + chunk_state_init(&chunk_state, key, flags); + chunk_state.chunk_counter = counter; + chunk_state_update(ops, &chunk_state, &input[input_position], + input_len - input_position); + output_t output = chunk_state_output(&chunk_state); + output_chaining_value(ops, &output, &out[chunks_array_len * + BLAKE3_OUT_LEN]); + return (chunks_array_len + 1); + } else { + return (chunks_array_len); + } +} + +/* + * Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time + * on a single thread. Write out the parent chaining values and return the + * number of parents hashed. (If there's an odd input chaining value left over, + * return it as an additional output.) These parents are never the root and + * never empty; those cases use a different codepath. + */ +static size_t compress_parents_parallel(const blake3_impl_ops_t *ops, + const uint8_t *child_chaining_values, size_t num_chaining_values, + const uint32_t key[8], uint8_t flags, uint8_t *out) +{ + const uint8_t *parents_array[MAX_SIMD_DEGREE_OR_2]; + size_t parents_array_len = 0; + + while (num_chaining_values - (2 * parents_array_len) >= 2) { + parents_array[parents_array_len] = &child_chaining_values[2 * + parents_array_len * BLAKE3_OUT_LEN]; + parents_array_len += 1; + } + + ops->hash_many(parents_array, parents_array_len, 1, key, 0, B_FALSE, + flags | PARENT, 0, 0, out); + + /* If there's an odd child left over, it becomes an output. */ + if (num_chaining_values > 2 * parents_array_len) { + memcpy(&out[parents_array_len * BLAKE3_OUT_LEN], + &child_chaining_values[2 * parents_array_len * + BLAKE3_OUT_LEN], BLAKE3_OUT_LEN); + return (parents_array_len + 1); + } else { + return (parents_array_len); + } +} + +/* + * The wide helper function returns (writes out) an array of chaining values + * and returns the length of that array. The number of chaining values returned + * is the dyanmically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, + * if the input is shorter than that many chunks. 
The reason for maintaining a + * wide array of chaining values going back up the tree, is to allow the + * implementation to hash as many parents in parallel as possible. + * + * As a special case when the SIMD degree is 1, this function will still return + * at least 2 outputs. This guarantees that this function doesn't perform the + * root compression. (If it did, it would use the wrong flags, and also we + * wouldn't be able to implement exendable ouput.) Note that this function is + * not used when the whole input is only 1 chunk long; that's a different + * codepath. + * + * Why not just have the caller split the input on the first update(), instead + * of implementing this special rule? Because we don't want to limit SIMD or + * multi-threading parallelism for that update(). + */ +static size_t blake3_compress_subtree_wide(const blake3_impl_ops_t *ops, + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t *out) +{ + /* + * Note that the single chunk case does *not* bump the SIMD degree up + * to 2 when it is 1. If this implementation adds multi-threading in + * the future, this gives us the option of multi-threading even the + * 2-chunk case, which can help performance on smaller platforms. + */ + if (input_len <= (size_t)(ops->degree * BLAKE3_CHUNK_LEN)) { + return (compress_chunks_parallel(ops, input, input_len, key, + chunk_counter, flags, out)); + } + + + /* + * With more than simd_degree chunks, we need to recurse. Start by + * dividing the input into left and right subtrees. (Note that this is + * only optimal as long as the SIMD degree is a power of 2. If we ever + * get a SIMD degree of 3 or something, we'll need a more complicated + * strategy.) + */ + size_t left_input_len = left_len(input_len); + size_t right_input_len = input_len - left_input_len; + const uint8_t *right_input = &input[left_input_len]; + uint64_t right_chunk_counter = chunk_counter + + (uint64_t)(left_input_len / BLAKE3_CHUNK_LEN); + + /* + * Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 + * to account for the special case of returning 2 outputs when the + * SIMD degree is 1. + */ + uint8_t cv_array[2 * MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t degree = ops->degree; + if (left_input_len > BLAKE3_CHUNK_LEN && degree == 1) { + + /* + * The special case: We always use a degree of at least two, + * to make sure there are two outputs. Except, as noted above, + * at the chunk level, where we allow degree=1. (Note that the + * 1-chunk-input case is a different codepath.) + */ + degree = 2; + } + uint8_t *right_cvs = &cv_array[degree * BLAKE3_OUT_LEN]; + + /* + * Recurse! If this implementation adds multi-threading support in the + * future, this is where it will go. + */ + size_t left_n = blake3_compress_subtree_wide(ops, input, left_input_len, + key, chunk_counter, flags, cv_array); + size_t right_n = blake3_compress_subtree_wide(ops, right_input, + right_input_len, key, right_chunk_counter, flags, right_cvs); + + /* + * The special case again. If simd_degree=1, then we'll have left_n=1 + * and right_n=1. Rather than compressing them into a single output, + * return them directly, to make sure we always have at least two + * outputs. + */ + if (left_n == 1) { + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); + return (2); + } + + /* Otherwise, do one layer of parent node compression. 
*/ + size_t num_chaining_values = left_n + right_n; + return compress_parents_parallel(ops, cv_array, + num_chaining_values, key, flags, out); +} + +/* + * Hash a subtree with compress_subtree_wide(), and then condense the resulting + * list of chaining values down to a single parent node. Don't compress that + * last parent node, however. Instead, return its message bytes (the + * concatenated chaining values of its children). This is necessary when the + * first call to update() supplies a complete subtree, because the topmost + * parent node of that subtree could end up being the root. It's also necessary + * for extended output in the general case. + * + * As with compress_subtree_wide(), this function is not used on inputs of 1 + * chunk or less. That's a different codepath. + */ +static void compress_subtree_to_parent_node(const blake3_impl_ops_t *ops, + const uint8_t *input, size_t input_len, const uint32_t key[8], + uint64_t chunk_counter, uint8_t flags, uint8_t out[2 * BLAKE3_OUT_LEN]) +{ + uint8_t cv_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN]; + size_t num_cvs = blake3_compress_subtree_wide(ops, input, input_len, + key, chunk_counter, flags, cv_array); + + /* + * If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + * compress_subtree_wide() returns more than 2 chaining values. Condense + * them into 2 by forming parent nodes repeatedly. + */ + uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2]; + while (num_cvs > 2) { + num_cvs = compress_parents_parallel(ops, cv_array, num_cvs, key, + flags, out_array); + memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN); + } + memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN); +} + +static void hasher_init_base(BLAKE3_CTX *ctx, const uint32_t key[8], + uint8_t flags) +{ + memcpy(ctx->key, key, BLAKE3_KEY_LEN); + chunk_state_init(&ctx->chunk, key, flags); + ctx->cv_stack_len = 0; + ctx->ops = blake3_impl_get_ops(); +} + +/* + * As described in hasher_push_cv() below, we do "lazy merging", delaying + * merges until right before the next CV is about to be added. This is + * different from the reference implementation. Another difference is that we + * aren't always merging 1 chunk at a time. Instead, each CV might represent + * any power-of-two number of chunks, as long as the smaller-above-larger + * stack order is maintained. Instead of the "count the trailing 0-bits" + * algorithm described in the spec, we use a "count the total number of + * 1-bits" variant that doesn't require us to retain the subtree size of the + * CV on top of the stack. The principle is the same: each CV that should + * remain in the stack is represented by a 1-bit in the total number of chunks + * (or bytes) so far. + */ +static void hasher_merge_cv_stack(BLAKE3_CTX *ctx, uint64_t total_len) +{ + size_t post_merge_stack_len = (size_t)popcnt(total_len); + while (ctx->cv_stack_len > post_merge_stack_len) { + uint8_t *parent_node = + &ctx->cv_stack[(ctx->cv_stack_len - 2) * BLAKE3_OUT_LEN]; + output_t output = + parent_output(parent_node, ctx->key, ctx->chunk.flags); + output_chaining_value(ctx->ops, &output, parent_node); + ctx->cv_stack_len -= 1; + } +} + +/* + * In reference_impl.rs, we merge the new CV with existing CVs from the stack + * before pushing it. We can do that because we know more input is coming, so + * we know none of the merges are root. + * + * This setting is different. We want to feed as much input as possible to + * compress_subtree_wide(), without setting aside anything for the chunk_state. 
+ * If the user gives us 64 KiB, we want to parallelize over all 64 KiB at once + * as a single subtree, if at all possible. + * + * This leads to two problems: + * 1) This 64 KiB input might be the only call that ever gets made to update. + * In this case, the root node of the 64 KiB subtree would be the root node + * of the whole tree, and it would need to be ROOT finalized. We can't + * compress it until we know. + * 2) This 64 KiB input might complete a larger tree, whose root node is + * similarly going to be the the root of the whole tree. For example, maybe + * we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the + * node at the root of the 256 KiB subtree until we know how to finalize it. + * + * The second problem is solved with "lazy merging". That is, when we're about + * to add a CV to the stack, we don't merge it with anything first, as the + * reference impl does. Instead we do merges using the *previous* CV that was + * added, which is sitting on top of the stack, and we put the new CV + * (unmerged) on top of the stack afterwards. This guarantees that we never + * merge the root node until finalize(). + * + * Solving the first problem requires an additional tool, + * compress_subtree_to_parent_node(). That function always returns the top + * *two* chaining values of the subtree it's compressing. We then do lazy + * merging with each of them separately, so that the second CV will always + * remain unmerged. (That also helps us support extendable output when we're + * hashing an input all-at-once.) + */ +static void hasher_push_cv(BLAKE3_CTX *ctx, uint8_t new_cv[BLAKE3_OUT_LEN], + uint64_t chunk_counter) +{ + hasher_merge_cv_stack(ctx, chunk_counter); + memcpy(&ctx->cv_stack[ctx->cv_stack_len * BLAKE3_OUT_LEN], new_cv, + BLAKE3_OUT_LEN); + ctx->cv_stack_len += 1; +} + +void +Blake3_Init(BLAKE3_CTX *ctx) +{ + hasher_init_base(ctx, BLAKE3_IV, 0); +} + +void +Blake3_InitKeyed(BLAKE3_CTX *ctx, const uint8_t key[BLAKE3_KEY_LEN]) +{ + uint32_t key_words[8]; + load_key_words(key, key_words); + hasher_init_base(ctx, key_words, KEYED_HASH); +} + +static void +Blake3_Update2(BLAKE3_CTX *ctx, const void *input, size_t input_len) +{ + /* + * Explicitly checking for zero avoids causing UB by passing a null + * pointer to memcpy. This comes up in practice with things like: + * std::vector v; + * blake3_hasher_update(&hasher, v.data(), v.size()); + */ + if (input_len == 0) { + return; + } + + const uint8_t *input_bytes = (const uint8_t *)input; + + /* + * If we have some partial chunk bytes in the internal chunk_state, we + * need to finish that chunk first. + */ + if (chunk_state_len(&ctx->chunk) > 0) { + size_t take = BLAKE3_CHUNK_LEN - chunk_state_len(&ctx->chunk); + if (take > input_len) { + take = input_len; + } + chunk_state_update(ctx->ops, &ctx->chunk, input_bytes, take); + input_bytes += take; + input_len -= take; + /* + * If we've filled the current chunk and there's more coming, + * finalize this chunk and proceed. In this case we know it's + * not the root. + */ + if (input_len > 0) { + output_t output = chunk_state_output(&ctx->chunk); + uint8_t chunk_cv[32]; + output_chaining_value(ctx->ops, &output, chunk_cv); + hasher_push_cv(ctx, chunk_cv, ctx->chunk.chunk_counter); + chunk_state_reset(&ctx->chunk, ctx->key, + ctx->chunk.chunk_counter + 1); + } else { + return; + } + } + + /* + * Now the chunk_state is clear, and we have more input. 
If there's + * more than a single chunk (so, definitely not the root chunk), hash + * the largest whole subtree we can, with the full benefits of SIMD + * (and maybe in the future, multi-threading) parallelism. Two + * restrictions: + * - The subtree has to be a power-of-2 number of chunks. Only + * subtrees along the right edge can be incomplete, and we don't know + * where the right edge is going to be until we get to finalize(). + * - The subtree must evenly divide the total number of chunks up + * until this point (if total is not 0). If the current incomplete + * subtree is only waiting for 1 more chunk, we can't hash a subtree + * of 4 chunks. We have to complete the current subtree first. + * Because we might need to break up the input to form powers of 2, or + * to evenly divide what we already have, this part runs in a loop. + */ + while (input_len > BLAKE3_CHUNK_LEN) { + size_t subtree_len = round_down_to_power_of_2(input_len); + uint64_t count_so_far = + ctx->chunk.chunk_counter * BLAKE3_CHUNK_LEN; + /* + * Shrink the subtree_len until it evenly divides the count so + * far. We know that subtree_len itself is a power of 2, so we + * can use a bitmasking trick instead of an actual remainder + * operation. (Note that if the caller consistently passes + * power-of-2 inputs of the same size, as is hopefully + * typical, this loop condition will always fail, and + * subtree_len will always be the full length of the input.) + * + * An aside: We don't have to shrink subtree_len quite this + * much. For example, if count_so_far is 1, we could pass 2 + * chunks to compress_subtree_to_parent_node. Since we'll get + * 2 CVs back, we'll still get the right answer in the end, + * and we might get to use 2-way SIMD parallelism. The problem + * with this optimization, is that it gets us stuck always + * hashing 2 chunks. The total number of chunks will remain + * odd, and we'll never graduate to higher degrees of + * parallelism. See + * https://github.com/BLAKE3-team/BLAKE3/issues/69. + */ + while ((((uint64_t)(subtree_len - 1)) & count_so_far) != 0) { + subtree_len /= 2; + } + /* + * The shrunken subtree_len might now be 1 chunk long. If so, + * hash that one chunk by itself. Otherwise, compress the + * subtree into a pair of CVs. + */ + uint64_t subtree_chunks = subtree_len / BLAKE3_CHUNK_LEN; + if (subtree_len <= BLAKE3_CHUNK_LEN) { + blake3_chunk_state_t chunk_state; + chunk_state_init(&chunk_state, ctx->key, + ctx->chunk.flags); + chunk_state.chunk_counter = ctx->chunk.chunk_counter; + chunk_state_update(ctx->ops, &chunk_state, input_bytes, + subtree_len); + output_t output = chunk_state_output(&chunk_state); + uint8_t cv[BLAKE3_OUT_LEN]; + output_chaining_value(ctx->ops, &output, cv); + hasher_push_cv(ctx, cv, chunk_state.chunk_counter); + } else { + /* + * This is the high-performance happy path, though + * getting here depends on the caller giving us a long + * enough input. + */ + uint8_t cv_pair[2 * BLAKE3_OUT_LEN]; + compress_subtree_to_parent_node(ctx->ops, input_bytes, + subtree_len, ctx->key, ctx-> chunk.chunk_counter, + ctx->chunk.flags, cv_pair); + hasher_push_cv(ctx, cv_pair, ctx->chunk.chunk_counter); + hasher_push_cv(ctx, &cv_pair[BLAKE3_OUT_LEN], + ctx->chunk.chunk_counter + (subtree_chunks / 2)); + } + ctx->chunk.chunk_counter += subtree_chunks; + input_bytes += subtree_len; + input_len -= subtree_len; + } + + /* + * If there's any remaining input less than a full chunk, add it to + * the chunk state. 
In that case, also do a final merge loop to make + * sure the subtree stack doesn't contain any unmerged pairs. The + * remaining input means we know these merges are non-root. This merge + * loop isn't strictly necessary here, because hasher_push_chunk_cv + * already does its own merge loop, but it simplifies + * blake3_hasher_finalize below. + */ + if (input_len > 0) { + chunk_state_update(ctx->ops, &ctx->chunk, input_bytes, + input_len); + hasher_merge_cv_stack(ctx, ctx->chunk.chunk_counter); + } +} + +void +Blake3_Update(BLAKE3_CTX *ctx, const void *input, size_t todo) +{ + size_t done = 0; + const uint8_t *data = input; + const size_t block_max = 1024 * 64; + + /* max feed buffer to leave the stack size small */ + while (todo != 0) { + size_t block = (todo >= block_max) ? block_max : todo; + Blake3_Update2(ctx, data + done, block); + done += block; + todo -= block; + } +} + +void +Blake3_Final(const BLAKE3_CTX *ctx, uint8_t *out) +{ + Blake3_FinalSeek(ctx, 0, out, BLAKE3_OUT_LEN); +} + +void +Blake3_FinalSeek(const BLAKE3_CTX *ctx, uint64_t seek, uint8_t *out, + size_t out_len) +{ + /* + * Explicitly checking for zero avoids causing UB by passing a null + * pointer to memcpy. This comes up in practice with things like: + * std::vector v; + * blake3_hasher_finalize(&hasher, v.data(), v.size()); + */ + if (out_len == 0) { + return; + } + /* If the subtree stack is empty, then the current chunk is the root. */ + if (ctx->cv_stack_len == 0) { + output_t output = chunk_state_output(&ctx->chunk); + output_root_bytes(ctx->ops, &output, seek, out, out_len); + return; + } + /* + * If there are any bytes in the chunk state, finalize that chunk and + * do a roll-up merge between that chunk hash and every subtree in the + * stack. In this case, the extra merge loop at the end of + * blake3_hasher_update guarantees that none of the subtrees in the + * stack need to be merged with each other first. Otherwise, if there + * are no bytes in the chunk state, then the top of the stack is a + * chunk hash, and we start the merge from that. + */ + output_t output; + size_t cvs_remaining; + if (chunk_state_len(&ctx->chunk) > 0) { + cvs_remaining = ctx->cv_stack_len; + output = chunk_state_output(&ctx->chunk); + } else { + /* There are always at least 2 CVs in the stack in this case. */ + cvs_remaining = ctx->cv_stack_len - 2; + output = parent_output(&ctx->cv_stack[cvs_remaining * 32], + ctx->key, ctx->chunk.flags); + } + while (cvs_remaining > 0) { + cvs_remaining -= 1; + uint8_t parent_block[BLAKE3_BLOCK_LEN]; + memcpy(parent_block, &ctx->cv_stack[cvs_remaining * 32], 32); + output_chaining_value(ctx->ops, &output, &parent_block[32]); + output = parent_output(parent_block, ctx->key, + ctx->chunk.flags); + } + output_root_bytes(ctx->ops, &output, seek, out, out_len); +} diff --git a/module/icp/algs/blake3/blake3_generic.c b/module/icp/algs/blake3/blake3_generic.c new file mode 100644 index 000000000000..6ff9a845ccdc --- /dev/null +++ b/module/icp/algs/blake3/blake3_generic.c @@ -0,0 +1,202 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor + * Copyright (c) 2021-2022 Tino Reichardt + */ + +#include +#include "blake3_impl.h" + +#define rotr32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +static inline void g(uint32_t *state, size_t a, size_t b, size_t c, size_t d, + uint32_t x, uint32_t y) +{ + state[a] = state[a] + state[b] + x; + state[d] = rotr32(state[d] ^ state[a], 16); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 12); + state[a] = state[a] + state[b] + y; + state[d] = rotr32(state[d] ^ state[a], 8); + state[c] = state[c] + state[d]; + state[b] = rotr32(state[b] ^ state[c], 7); +} + +static inline void round_fn(uint32_t state[16], const uint32_t *msg, + size_t round) +{ + /* Select the message schedule based on the round. */ + const uint8_t *schedule = BLAKE3_MSG_SCHEDULE[round]; + + /* Mix the columns. */ + g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + /* Mix the rows. */ + g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +static inline void compress_pre(uint32_t state[16], const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) +{ + uint32_t block_words[16]; + block_words[0] = load32(block + 4 * 0); + block_words[1] = load32(block + 4 * 1); + block_words[2] = load32(block + 4 * 2); + block_words[3] = load32(block + 4 * 3); + block_words[4] = load32(block + 4 * 4); + block_words[5] = load32(block + 4 * 5); + block_words[6] = load32(block + 4 * 6); + block_words[7] = load32(block + 4 * 7); + block_words[8] = load32(block + 4 * 8); + block_words[9] = load32(block + 4 * 9); + block_words[10] = load32(block + 4 * 10); + block_words[11] = load32(block + 4 * 11); + block_words[12] = load32(block + 4 * 12); + block_words[13] = load32(block + 4 * 13); + block_words[14] = load32(block + 4 * 14); + block_words[15] = load32(block + 4 * 15); + + state[0] = cv[0]; + state[1] = cv[1]; + state[2] = cv[2]; + state[3] = cv[3]; + state[4] = cv[4]; + state[5] = cv[5]; + state[6] = cv[6]; + state[7] = cv[7]; + state[8] = BLAKE3_IV[0]; + state[9] = BLAKE3_IV[1]; + state[10] = BLAKE3_IV[2]; + state[11] = BLAKE3_IV[3]; + state[12] = counter_low(counter); + state[13] = counter_high(counter); + state[14] = (uint32_t)block_len; + state[15] = (uint32_t)flags; + + round_fn(state, &block_words[0], 0); + round_fn(state, &block_words[0], 1); + round_fn(state, &block_words[0], 2); + round_fn(state, &block_words[0], 3); + round_fn(state, &block_words[0], 4); + round_fn(state, &block_words[0], 5); + round_fn(state, &block_words[0], 6); +} + +static inline void blake3_compress_in_place_generic(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t 
block_len, + uint64_t counter, uint8_t flags) +{ + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +static inline void hash_one_generic(const uint8_t *input, size_t blocks, + const uint32_t key[8], uint64_t counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) +{ + uint32_t cv[8]; + memcpy(cv, key, BLAKE3_KEY_LEN); + uint8_t block_flags = flags | flags_start; + while (blocks > 0) { + if (blocks == 1) { + block_flags |= flags_end; + } + blake3_compress_in_place_generic(cv, input, BLAKE3_BLOCK_LEN, + counter, block_flags); + input = &input[BLAKE3_BLOCK_LEN]; + blocks -= 1; + block_flags = flags; + } + store_cv_words(out, cv); +} + +static inline void blake3_compress_xof_generic(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) +{ + uint32_t state[16]; + compress_pre(state, cv, block, block_len, counter, flags); + + store32(&out[0 * 4], state[0] ^ state[8]); + store32(&out[1 * 4], state[1] ^ state[9]); + store32(&out[2 * 4], state[2] ^ state[10]); + store32(&out[3 * 4], state[3] ^ state[11]); + store32(&out[4 * 4], state[4] ^ state[12]); + store32(&out[5 * 4], state[5] ^ state[13]); + store32(&out[6 * 4], state[6] ^ state[14]); + store32(&out[7 * 4], state[7] ^ state[15]); + store32(&out[8 * 4], state[8] ^ cv[0]); + store32(&out[9 * 4], state[9] ^ cv[1]); + store32(&out[10 * 4], state[10] ^ cv[2]); + store32(&out[11 * 4], state[11] ^ cv[3]); + store32(&out[12 * 4], state[12] ^ cv[4]); + store32(&out[13 * 4], state[13] ^ cv[5]); + store32(&out[14 * 4], state[14] ^ cv[6]); + store32(&out[15 * 4], state[15] ^ cv[7]); +} + +static inline void blake3_hash_many_generic(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], uint64_t counter, + boolean_t increment_counter, uint8_t flags, uint8_t flags_start, + uint8_t flags_end, uint8_t *out) +{ + while (num_inputs > 0) { + hash_one_generic(inputs[0], blocks, key, counter, flags, + flags_start, flags_end, out); + if (increment_counter) { + counter += 1; + } + inputs += 1; + num_inputs -= 1; + out = &out[BLAKE3_OUT_LEN]; + } +} + +static inline boolean_t blake3_is_generic_supported(void) +{ + return (B_TRUE); +} + +const blake3_impl_ops_t blake3_generic_impl = { + .compress_in_place = blake3_compress_in_place_generic, + .compress_xof = blake3_compress_xof_generic, + .hash_many = blake3_hash_many_generic, + .is_supported = blake3_is_generic_supported, + .degree = 4, + .name = "generic" +}; diff --git a/module/icp/algs/blake3/blake3_impl.c b/module/icp/algs/blake3/blake3_impl.c new file mode 100644 index 000000000000..c3268ec13dad --- /dev/null +++ b/module/icp/algs/blake3/blake3_impl.c @@ -0,0 +1,256 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2021-2022 Tino Reichardt + */ + +#include +#include + +#include "blake3_impl.h" + +static const blake3_impl_ops_t *const blake3_impls[] = { + &blake3_generic_impl, +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + &blake3_sse2_impl, +#endif +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + &blake3_sse41_impl, +#endif +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) + &blake3_avx2_impl, +#endif +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) + &blake3_avx512_impl, +#endif +}; + +/* this pointer holds current ops for implementation */ +static const blake3_impl_ops_t *blake3_selected_impl = &blake3_generic_impl; + +/* special implementation selections */ +#define IMPL_FASTEST (UINT32_MAX) +#define IMPL_CYCLE (UINT32_MAX-1) +#define IMPL_USER (UINT32_MAX-2) +#define IMPL_PARAM (UINT32_MAX-3) + +#define IMPL_READ(i) (*(volatile uint32_t *) &(i)) +static uint32_t icp_blake3_impl = IMPL_FASTEST; + +#define BLAKE3_IMPL_NAME_MAX 16 + +/* id of fastest implementation */ +static uint32_t blake3_fastest_id = 0; + +/* currently used id */ +static uint32_t blake3_current_id = 0; + +/* id of module parameter (-1 == unused) */ +static int blake3_param_id = -1; + +/* return number of supported implementations */ +int +blake3_get_impl_count(void) +{ + static int impls = 0; + int i; + + if (impls) + return (impls); + + for (i = 0; i < ARRAY_SIZE(blake3_impls); i++) { + if (!blake3_impls[i]->is_supported()) continue; + impls++; + } + + return (impls); +} + +/* return id of selected implementation */ +int +blake3_get_impl_id(void) +{ + return (blake3_current_id); +} + +/* return name of selected implementation */ +const char * +blake3_get_impl_name(void) +{ + return (blake3_selected_impl->name); +} + +/* setup id as fastest implementation */ +void +blake3_set_impl_fastest(uint32_t id) +{ + blake3_fastest_id = id; +} + +/* set implementation by id */ +void +blake3_set_impl_id(uint32_t id) +{ + int i, cid; + + /* select fastest */ + if (id == IMPL_FASTEST) + id = blake3_fastest_id; + + /* select next or first */ + if (id == IMPL_CYCLE) + id = (++blake3_current_id) % blake3_get_impl_count(); + + /* 0..N for the real impl */ + for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { + if (!blake3_impls[i]->is_supported()) continue; + if (cid == id) { + blake3_current_id = cid; + blake3_selected_impl = blake3_impls[i]; + return; + } + cid++; + } +} + +/* set implementation by name */ +int +blake3_set_impl_name(const char *name) +{ + int i, cid; + + if (strcmp(name, "fastest") == 0) { + atomic_swap_32(&icp_blake3_impl, IMPL_FASTEST); + blake3_set_impl_id(IMPL_FASTEST); + return (0); + } else if (strcmp(name, "cycle") == 0) { + atomic_swap_32(&icp_blake3_impl, IMPL_CYCLE); + blake3_set_impl_id(IMPL_CYCLE); + return (0); + } + + for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { + if (!blake3_impls[i]->is_supported()) continue; + if (strcmp(name, blake3_impls[i]->name) == 0) { + if 
(icp_blake3_impl == IMPL_PARAM) { + blake3_param_id = cid; + return (0); + } + blake3_selected_impl = blake3_impls[i]; + blake3_current_id = cid; + return (0); + } + cid++; + } + + return (-EINVAL); +} + +/* setup implementation */ +void +blake3_setup_impl(void) +{ + switch (IMPL_READ(icp_blake3_impl)) { + case IMPL_PARAM: + blake3_set_impl_id(blake3_param_id); + atomic_swap_32(&icp_blake3_impl, IMPL_USER); + break; + case IMPL_FASTEST: + blake3_set_impl_id(IMPL_FASTEST); + break; + case IMPL_CYCLE: + blake3_set_impl_id(IMPL_CYCLE); + break; + default: + blake3_set_impl_id(blake3_current_id); + break; + } +} + +/* return selected implementation */ +const blake3_impl_ops_t * +blake3_impl_get_ops(void) +{ + /* each call to ops will cycle */ + if (icp_blake3_impl == IMPL_CYCLE) + blake3_set_impl_id(IMPL_CYCLE); + + return (blake3_selected_impl); +} + +#if defined(_KERNEL) && defined(__linux__) +static int +icp_blake3_impl_set(const char *name, zfs_kernel_param_t *kp) +{ + char req_name[BLAKE3_IMPL_NAME_MAX]; + size_t i; + + /* sanitize input */ + i = strnlen(name, BLAKE3_IMPL_NAME_MAX); + if (i == 0 || i >= BLAKE3_IMPL_NAME_MAX) + return (-EINVAL); + + strlcpy(req_name, name, BLAKE3_IMPL_NAME_MAX); + while (i > 0 && isspace(req_name[i-1])) + i--; + req_name[i] = '\0'; + + atomic_swap_32(&icp_blake3_impl, IMPL_PARAM); + return (blake3_set_impl_name(req_name)); +} + +static int +icp_blake3_impl_get(char *buffer, zfs_kernel_param_t *kp) +{ + int i, cid, cnt = 0; + char *fmt; + + /* cycling */ + fmt = (icp_blake3_impl == IMPL_CYCLE) ? "[cycle] " : "cycle "; + cnt += sprintf(buffer + cnt, fmt); + + /* fastest one */ + fmt = (icp_blake3_impl == IMPL_FASTEST) ? "[fastest] " : "fastest "; + cnt += sprintf(buffer + cnt, fmt); + + /* user selected */ + for (i = 0, cid = 0; i < ARRAY_SIZE(blake3_impls); i++) { + if (!blake3_impls[i]->is_supported()) continue; + fmt = (icp_blake3_impl == IMPL_USER && + cid == blake3_current_id) ? "[%s] " : "%s "; + cnt += sprintf(buffer + cnt, fmt, blake3_impls[i]->name); + cid++; + } + + buffer[cnt] = 0; + + return (cnt); +} + +module_param_call(icp_blake3_impl, icp_blake3_impl_set, icp_blake3_impl_get, + NULL, 0644); +MODULE_PARM_DESC(icp_blake3_impl, "Select BLAKE3 implementation."); +#endif diff --git a/module/icp/algs/blake3/blake3_impl.h b/module/icp/algs/blake3/blake3_impl.h new file mode 100644 index 000000000000..7b40cc4d3f02 --- /dev/null +++ b/module/icp/algs/blake3/blake3_impl.h @@ -0,0 +1,213 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor + * Copyright (c) 2021-2022 Tino Reichardt + */ + +#ifndef BLAKE3_IMPL_H +#define BLAKE3_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +/* + * Methods used to define BLAKE3 assembler implementations + */ +typedef void (*blake3_compress_in_place_f)(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], + uint8_t block_len, uint64_t counter, + uint8_t flags); + +typedef void (*blake3_compress_xof_f)(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +typedef void (*blake3_hash_many_f)(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +typedef boolean_t (*blake3_is_supported_f)(void); + +typedef struct blake3_impl_ops { + blake3_compress_in_place_f compress_in_place; + blake3_compress_xof_f compress_xof; + blake3_hash_many_f hash_many; + blake3_is_supported_f is_supported; + int degree; + const char *name; +} blake3_impl_ops_t; + +/* Return selected BLAKE3 implementation ops */ +extern const blake3_impl_ops_t *blake3_impl_get_ops(void); + +extern const blake3_impl_ops_t blake3_generic_impl; + +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) +extern const blake3_impl_ops_t blake3_sse2_impl; +#endif + +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) +extern const blake3_impl_ops_t blake3_sse41_impl; +#endif + +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) +extern const blake3_impl_ops_t blake3_avx2_impl; +#endif + +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) +extern const blake3_impl_ops_t blake3_avx512_impl; +#endif + +#if defined(__x86_64) +#define MAX_SIMD_DEGREE 16 +#else +#define MAX_SIMD_DEGREE 4 +#endif + +#define MAX_SIMD_DEGREE_OR_2 (MAX_SIMD_DEGREE > 2 ? 
MAX_SIMD_DEGREE : 2) + +static const uint32_t BLAKE3_IV[8] = { + 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL, + 0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL}; + +static const uint8_t BLAKE3_MSG_SCHEDULE[7][16] = { + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, + {2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8}, + {3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1}, + {10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6}, + {12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4}, + {9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7}, + {11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13}, +}; + +/* Find index of the highest set bit */ +static inline unsigned int highest_one(uint64_t x) { +#if defined(__GNUC__) || defined(__clang__) + return (63 ^ __builtin_clzll(x)); +#elif defined(_MSC_VER) && defined(IS_X86_64) + unsigned long index; + _BitScanReverse64(&index, x); + return (index); +#elif defined(_MSC_VER) && defined(IS_X86_32) + if (x >> 32) { + unsigned long index; + _BitScanReverse(&index, x >> 32); + return (32 + index); + } else { + unsigned long index; + _BitScanReverse(&index, x); + return (index); + } +#else + unsigned int c = 0; + if (x & 0xffffffff00000000ULL) { x >>= 32; c += 32; } + if (x & 0x00000000ffff0000ULL) { x >>= 16; c += 16; } + if (x & 0x000000000000ff00ULL) { x >>= 8; c += 8; } + if (x & 0x00000000000000f0ULL) { x >>= 4; c += 4; } + if (x & 0x000000000000000cULL) { x >>= 2; c += 2; } + if (x & 0x0000000000000002ULL) { c += 1; } + return (c); +#endif +} + +/* Count the number of 1 bits. */ +static inline unsigned int popcnt(uint64_t x) { + unsigned int count = 0; + + while (x != 0) { + count += 1; + x &= x - 1; + } + + return (count); +} + +/* + * Largest power of two less than or equal to x. + * As a special case, returns 1 when x is 0. 
+ */ +static inline uint64_t round_down_to_power_of_2(uint64_t x) { + return (1ULL << highest_one(x | 1)); +} + +static inline uint32_t counter_low(uint64_t counter) { + return ((uint32_t)counter); +} + +static inline uint32_t counter_high(uint64_t counter) { + return ((uint32_t)(counter >> 32)); +} + +static inline uint32_t load32(const void *src) { + const uint8_t *p = (const uint8_t *)src; + return ((uint32_t)(p[0]) << 0) | ((uint32_t)(p[1]) << 8) | + ((uint32_t)(p[2]) << 16) | ((uint32_t)(p[3]) << 24); +} + +static inline void load_key_words(const uint8_t key[BLAKE3_KEY_LEN], + uint32_t key_words[8]) { + key_words[0] = load32(&key[0 * 4]); + key_words[1] = load32(&key[1 * 4]); + key_words[2] = load32(&key[2 * 4]); + key_words[3] = load32(&key[3 * 4]); + key_words[4] = load32(&key[4 * 4]); + key_words[5] = load32(&key[5 * 4]); + key_words[6] = load32(&key[6 * 4]); + key_words[7] = load32(&key[7 * 4]); +} + +static inline void store32(void *dst, uint32_t w) { + uint8_t *p = (uint8_t *)dst; + p[0] = (uint8_t)(w >> 0); + p[1] = (uint8_t)(w >> 8); + p[2] = (uint8_t)(w >> 16); + p[3] = (uint8_t)(w >> 24); +} + +static inline void store_cv_words(uint8_t bytes_out[32], uint32_t cv_words[8]) { + store32(&bytes_out[0 * 4], cv_words[0]); + store32(&bytes_out[1 * 4], cv_words[1]); + store32(&bytes_out[2 * 4], cv_words[2]); + store32(&bytes_out[3 * 4], cv_words[3]); + store32(&bytes_out[4 * 4], cv_words[4]); + store32(&bytes_out[5 * 4], cv_words[5]); + store32(&bytes_out[6 * 4], cv_words[6]); + store32(&bytes_out[7 * 4], cv_words[7]); +} + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_IMPL_H */ diff --git a/module/icp/algs/blake3/blake3_x86-64.c b/module/icp/algs/blake3/blake3_x86-64.c new file mode 100644 index 000000000000..8502f3094b5e --- /dev/null +++ b/module/icp/algs/blake3/blake3_x86-64.c @@ -0,0 +1,244 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2021-2022 Tino Reichardt + */ + +#include "blake3_impl.h" + +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE2)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + +extern void zfs_blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_sse2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_sse2(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_sse2(cv, block, block_len, counter, + flags); + kfpu_end(); +} + +static void blake3_compress_xof_sse2(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_sse2(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} + +static void blake3_hash_many_sse2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_sse2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} + +static boolean_t blake3_is_sse2_supported(void) +{ +#if defined(__x86_64) + return (kfpu_allowed() && zfs_sse2_available()); +#else + return (kfpu_allowed()); +#endif +} + +const blake3_impl_ops_t blake3_sse2_impl = { + .compress_in_place = blake3_compress_in_place_sse2, + .compress_xof = blake3_compress_xof_sse2, + .hash_many = blake3_hash_many_sse2, + .is_supported = blake3_is_sse2_supported, + .degree = 4, + .name = "sse2" +}; +#endif + +#if defined(__aarch64__) || \ + (defined(__x86_64) && defined(HAVE_SSE4_1)) || \ + (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + +extern void zfs_blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_sse41(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_sse41(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_sse41(cv, block, block_len, counter, + flags); + kfpu_end(); +} + +static void blake3_compress_xof_sse41(const uint32_t cv[8], + const uint8_t
block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_sse41(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} + +static void blake3_hash_many_sse41(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} + +static boolean_t blake3_is_sse41_supported(void) +{ +#if defined(__x86_64) + return (kfpu_allowed() && zfs_sse4_1_available()); +#else + return (kfpu_allowed()); +#endif +} + +const blake3_impl_ops_t blake3_sse41_impl = { + .compress_in_place = blake3_compress_in_place_sse41, + .compress_xof = blake3_compress_xof_sse41, + .hash_many = blake3_hash_many_sse41, + .is_supported = blake3_is_sse41_supported, + .degree = 4, + .name = "sse41" +}; +#endif + +#if defined(__x86_64) && defined(HAVE_SSE4_1) && defined(HAVE_AVX2) +extern void zfs_blake3_hash_many_avx2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_hash_many_avx2(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_avx2(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} + +static boolean_t blake3_is_avx2_supported(void) +{ + return (kfpu_allowed() && zfs_sse4_1_available() && + zfs_avx2_available()); +} + +const blake3_impl_ops_t blake3_avx2_impl = { + .compress_in_place = blake3_compress_in_place_sse41, + .compress_xof = blake3_compress_xof_sse41, + .hash_many = blake3_hash_many_avx2, + .is_supported = blake3_is_avx2_supported, + .degree = 8, + .name = "avx2" +}; +#endif + +#if defined(__x86_64) && defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) +extern void zfs_blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags); + +extern void zfs_blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]); + +extern void zfs_blake3_hash_many_avx512(const uint8_t * const *inputs, + size_t num_inputs, size_t blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out); + +static void blake3_compress_in_place_avx512(uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags) { + kfpu_begin(); + zfs_blake3_compress_in_place_avx512(cv, block, block_len, counter, + flags); + kfpu_end(); +} + +static void blake3_compress_xof_avx512(const uint32_t cv[8], + const uint8_t block[BLAKE3_BLOCK_LEN], uint8_t block_len, + uint64_t counter, uint8_t flags, uint8_t out[64]) { + kfpu_begin(); + zfs_blake3_compress_xof_avx512(cv, block, block_len, counter, flags, + out); + kfpu_end(); +} + +static void blake3_hash_many_avx512(const uint8_t * const *inputs, + size_t num_inputs, size_t 
blocks, const uint32_t key[8], + uint64_t counter, boolean_t increment_counter, uint8_t flags, + uint8_t flags_start, uint8_t flags_end, uint8_t *out) { + kfpu_begin(); + zfs_blake3_hash_many_avx512(inputs, num_inputs, blocks, key, counter, + increment_counter, flags, flags_start, flags_end, out); + kfpu_end(); +} + +static boolean_t blake3_is_avx512_supported(void) +{ + return (kfpu_allowed() && zfs_avx512f_available() && + zfs_avx512vl_available()); +} + +const blake3_impl_ops_t blake3_avx512_impl = { + .compress_in_place = blake3_compress_in_place_avx512, + .compress_xof = blake3_compress_xof_avx512, + .hash_many = blake3_hash_many_avx512, + .is_supported = blake3_is_avx512_supported, + .degree = 16, + .name = "avx512" +}; +#endif diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S new file mode 100644 index 000000000000..59a4d9afd437 --- /dev/null +++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse2.S @@ -0,0 +1,2450 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale + * Copyright (c) 2022 Tino Reichardt + * + * This is converted assembly: SSE2 -> ARMv8-A + * Used tools: SIMDe https://github.com/simd-everywhere/simde + */ + +#if defined(__aarch64__) + .text + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI0_0: + .word 1779033703 + .word 3144134277 + .word 1013904242 + .word 2773480762 +.LCPI0_1: + .xword 0 + .xword -4294967296 +.LCPI0_2: + .xword -1 + .xword 4294967295 + .text + .globl zfs_blake3_compress_in_place_sse2 + .p2align 2 + .type zfs_blake3_compress_in_place_sse2,@function +zfs_blake3_compress_in_place_sse2: + .cfi_startproc + ldp q3, q2, [x0] + ldp q5, q6, [x1] + add x10, x1, #32 + lsr x11, x3, #32 + fmov s4, w3 + ld2 { v17.4s, v18.4s }, [x10] + adrp x10, .LCPI0_2 + and w8, w2, #0xff + mov v4.s[1], w11 + ldr q1, [x10, :lo12:.LCPI0_2] + and w9, w4, #0xff + adrp x12, .LCPI0_0 + mov v4.s[2], w8 + uzp1 v19.4s, v5.4s, v6.4s + add v3.4s, v2.4s, v3.4s + ldr q7, [x12, :lo12:.LCPI0_0] + mov v4.s[3], w9 + add v3.4s, v3.4s, v19.4s + uzp2 v5.4s, v5.4s, v6.4s + ext v21.16b, v18.16b, v18.16b, #12 + uzp1 v6.4s, v19.4s, v19.4s + ext v22.16b, v19.16b, v19.16b, #12 + eor v4.16b, v3.16b, v4.16b + ext v20.16b, v17.16b, v17.16b, #12 + ext v6.16b, v6.16b, v19.16b, #8 + ext v19.16b, v19.16b, v22.16b, #12 + zip1 v22.2d, v21.2d, v5.2d + rev32 v24.8h, v4.8h + mov v4.16b, v1.16b + zip2 v23.4s, v5.4s, v21.4s + uzp2 v6.4s, v6.4s, v5.4s + bsl v4.16b, v22.16b, v20.16b + add v3.4s, v3.4s, v5.4s + zip1 v5.4s, v23.4s, v20.4s + zip1 v22.4s, v20.4s, v23.4s + add v23.4s, v24.4s, v7.4s + ext v7.16b, v6.16b, v6.16b, #4 + ext v25.16b, 
v4.16b, v4.16b, #12 + ext v5.16b, v22.16b, v5.16b, #8 + eor v2.16b, v23.16b, v2.16b + uzp1 v4.4s, v4.4s, v25.4s + uzp1 v22.4s, v7.4s, v7.4s + ext v25.16b, v7.16b, v7.16b, #12 + ext v22.16b, v22.16b, v7.16b, #8 + ext v7.16b, v7.16b, v25.16b, #12 + ushr v25.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v2.16b, v2.16b, v25.16b + add v3.4s, v3.4s, v2.4s + eor v24.16b, v3.16b, v24.16b + add v3.4s, v3.4s, v17.4s + ushr v17.4s, v24.4s, #8 + shl v18.4s, v24.4s, #24 + orr v17.16b, v18.16b, v17.16b + add v18.4s, v17.4s, v23.4s + eor v2.16b, v18.16b, v2.16b + ushr v23.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ext v3.16b, v3.16b, v3.16b, #12 + orr v2.16b, v2.16b, v23.16b + ext v17.16b, v17.16b, v17.16b, #8 + add v3.4s, v2.4s, v3.4s + adrp x11, .LCPI0_1 + eor v17.16b, v3.16b, v17.16b + ldr q16, [x11, :lo12:.LCPI0_1] + ext v18.16b, v18.16b, v18.16b, #4 + rev32 v24.8h, v17.8h + movi v0.2d, #0xffffffff00000000 + add v23.4s, v3.4s, v21.4s + mov v21.s[1], v20.s[2] + add v20.4s, v18.4s, v24.4s + bit v19.16b, v21.16b, v0.16b + eor v3.16b, v20.16b, v2.16b + uzp2 v2.4s, v22.4s, v19.4s + zip1 v17.2d, v5.2d, v19.2d + zip2 v18.4s, v19.4s, v5.4s + ushr v21.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + ext v22.16b, v2.16b, v2.16b, #4 + bsl v16.16b, v4.16b, v17.16b + zip1 v17.4s, v18.4s, v4.4s + zip1 v18.4s, v4.4s, v18.4s + orr v21.16b, v3.16b, v21.16b + ext v25.16b, v16.16b, v16.16b, #12 + ext v3.16b, v18.16b, v17.16b, #8 + uzp1 v18.4s, v22.4s, v22.4s + ext v26.16b, v22.16b, v22.16b, #12 + add v23.4s, v23.4s, v21.4s + uzp1 v17.4s, v16.4s, v25.4s + ext v16.16b, v18.16b, v22.16b, #8 + ext v18.16b, v22.16b, v26.16b, #12 + eor v22.16b, v23.16b, v24.16b + add v6.4s, v23.4s, v6.4s + ushr v23.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + orr v22.16b, v22.16b, v23.16b + add v20.4s, v22.4s, v20.4s + eor v21.16b, v20.16b, v21.16b + ushr v23.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v21.16b, v21.16b, v23.16b + ext v22.16b, v22.16b, v22.16b, #8 + add v6.4s, v21.4s, v6.4s + eor v22.16b, v6.16b, v22.16b + ext v20.16b, v20.16b, v20.16b, #12 + add v6.4s, v6.4s, v19.4s + rev32 v19.8h, v22.8h + add v20.4s, v20.4s, v19.4s + eor v21.16b, v20.16b, v21.16b + ushr v22.4s, v21.4s, #12 + shl v21.4s, v21.4s, #20 + orr v21.16b, v21.16b, v22.16b + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + ushr v22.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v19.16b, v19.16b, v22.16b + add v20.4s, v19.4s, v20.4s + eor v21.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #12 + ushr v22.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + add v6.4s, v6.4s, v4.4s + orr v21.16b, v21.16b, v22.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + ext v20.16b, v20.16b, v20.16b, #4 + rev32 v19.8h, v19.8h + add v20.4s, v20.4s, v19.4s + add v6.4s, v6.4s, v5.4s + mov v5.s[1], v4.s[2] + eor v4.16b, v20.16b, v21.16b + ushr v21.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + orr v21.16b, v4.16b, v21.16b + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + add v2.4s, v6.4s, v2.4s + ushr v6.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v6.16b, v19.16b, v6.16b + add v19.4s, v6.4s, v20.4s + eor v20.16b, v19.16b, v21.16b + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v20.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v2.4s, v20.4s, v2.4s + eor v6.16b, v2.16b, v6.16b + ext v19.16b, v19.16b, v19.16b, #12 + rev32 v6.8h, v6.8h + add v19.4s, v19.4s, v6.4s + mov v22.16b, v0.16b + eor v20.16b, v19.16b, v20.16b + bsl v22.16b, v5.16b, v7.16b + 
ushr v21.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + add v2.4s, v2.4s, v22.4s + orr v20.16b, v20.16b, v21.16b + add v2.4s, v2.4s, v20.4s + eor v6.16b, v2.16b, v6.16b + ushr v21.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + orr v6.16b, v6.16b, v21.16b + add v19.4s, v6.4s, v19.4s + eor v20.16b, v19.16b, v20.16b + ext v2.16b, v2.16b, v2.16b, #12 + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + add v2.4s, v2.4s, v17.4s + orr v20.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v2.4s, v2.4s, v20.4s + eor v6.16b, v2.16b, v6.16b + uzp2 v5.4s, v16.4s, v22.4s + zip1 v7.2d, v3.2d, v22.2d + zip2 v16.4s, v22.4s, v3.4s + ext v19.16b, v19.16b, v19.16b, #4 + rev32 v22.8h, v6.8h + ext v23.16b, v5.16b, v5.16b, #4 + bif v7.16b, v17.16b, v1.16b + zip1 v24.4s, v16.4s, v17.4s + zip1 v16.4s, v17.4s, v16.4s + add v21.4s, v2.4s, v3.4s + mov v3.s[1], v17.s[2] + add v17.4s, v19.4s, v22.4s + mov v19.16b, v0.16b + ext v25.16b, v7.16b, v7.16b, #12 + ext v4.16b, v16.16b, v24.16b, #8 + uzp1 v16.4s, v23.4s, v23.4s + bsl v19.16b, v3.16b, v18.16b + eor v2.16b, v17.16b, v20.16b + uzp1 v7.4s, v7.4s, v25.4s + ext v25.16b, v16.16b, v23.16b, #8 + zip1 v3.2d, v4.2d, v19.2d + ushr v20.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + ext v24.16b, v23.16b, v23.16b, #12 + uzp2 v6.4s, v25.4s, v19.4s + zip2 v18.4s, v19.4s, v4.4s + bif v3.16b, v7.16b, v1.16b + orr v20.16b, v2.16b, v20.16b + ext v16.16b, v23.16b, v24.16b, #12 + ext v23.16b, v6.16b, v6.16b, #4 + zip1 v24.4s, v18.4s, v7.4s + zip1 v18.4s, v7.4s, v18.4s + ext v25.16b, v3.16b, v3.16b, #12 + add v21.4s, v21.4s, v20.4s + ext v2.16b, v18.16b, v24.16b, #8 + uzp1 v18.4s, v23.4s, v23.4s + ext v24.16b, v23.16b, v23.16b, #12 + uzp1 v3.4s, v3.4s, v25.4s + eor v22.16b, v21.16b, v22.16b + ext v25.16b, v18.16b, v23.16b, #8 + dup v18.4s, v2.s[3] + ext v23.16b, v23.16b, v24.16b, #12 + add v5.4s, v21.4s, v5.4s + trn1 v21.4s, v3.4s, v3.4s + ushr v24.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + ext v18.16b, v21.16b, v18.16b, #8 + orr v21.16b, v22.16b, v24.16b + add v17.4s, v21.4s, v17.4s + eor v20.16b, v17.16b, v20.16b + ushr v22.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v20.16b, v20.16b, v22.16b + ext v21.16b, v21.16b, v21.16b, #8 + add v5.4s, v20.4s, v5.4s + eor v21.16b, v5.16b, v21.16b + ext v17.16b, v17.16b, v17.16b, #12 + add v5.4s, v5.4s, v19.4s + rev32 v19.8h, v21.8h + add v17.4s, v17.4s, v19.4s + eor v20.16b, v17.16b, v20.16b + ushr v21.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + orr v20.16b, v20.16b, v21.16b + add v5.4s, v5.4s, v20.4s + eor v19.16b, v5.16b, v19.16b + ushr v21.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v19.16b, v19.16b, v21.16b + add v17.4s, v19.4s, v17.4s + eor v20.16b, v17.16b, v20.16b + ext v5.16b, v5.16b, v5.16b, #12 + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + add v5.4s, v5.4s, v7.4s + orr v20.16b, v20.16b, v21.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v5.4s, v5.4s, v20.4s + eor v19.16b, v5.16b, v19.16b + ext v17.16b, v17.16b, v17.16b, #4 + rev32 v22.8h, v19.8h + add v21.4s, v5.4s, v4.4s + mov v4.s[1], v7.s[2] + add v19.4s, v17.4s, v22.4s + bit v16.16b, v4.16b, v0.16b + eor v5.16b, v19.16b, v20.16b + uzp2 v4.4s, v25.4s, v16.4s + zip1 v7.2d, v2.2d, v16.2d + zip2 v17.4s, v16.4s, v2.4s + ushr v20.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + ext v24.16b, v4.16b, v4.16b, #4 + bif v7.16b, v3.16b, v1.16b + zip1 v25.4s, v17.4s, v3.4s + zip1 v17.4s, v3.4s, v17.4s + orr v20.16b, v5.16b, v20.16b + ext v26.16b, v7.16b, v7.16b, #12 + ext v5.16b, v17.16b, v25.16b, #8 + uzp1 v17.4s, v24.4s, v24.4s + ext v25.16b, v24.16b, 
v24.16b, #12 + bit v23.16b, v18.16b, v0.16b + add v21.4s, v21.4s, v20.4s + uzp1 v7.4s, v7.4s, v26.4s + ext v26.16b, v17.16b, v24.16b, #8 + ext v17.16b, v24.16b, v25.16b, #12 + eor v22.16b, v21.16b, v22.16b + add v6.4s, v21.4s, v6.4s + zip1 v21.2d, v5.2d, v23.2d + zip2 v24.4s, v23.4s, v5.4s + bif v21.16b, v7.16b, v1.16b + zip1 v1.4s, v24.4s, v7.4s + zip1 v24.4s, v7.4s, v24.4s + ext v1.16b, v24.16b, v1.16b, #8 + ushr v24.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + orr v22.16b, v22.16b, v24.16b + add v19.4s, v22.4s, v19.4s + ext v24.16b, v21.16b, v21.16b, #12 + eor v20.16b, v19.16b, v20.16b + uzp1 v21.4s, v21.4s, v24.4s + ushr v24.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + orr v20.16b, v20.16b, v24.16b + ext v6.16b, v6.16b, v6.16b, #4 + ext v22.16b, v22.16b, v22.16b, #8 + add v6.4s, v20.4s, v6.4s + eor v22.16b, v6.16b, v22.16b + ext v19.16b, v19.16b, v19.16b, #12 + add v6.4s, v6.4s, v16.4s + rev32 v16.8h, v22.8h + add v19.4s, v19.4s, v16.4s + eor v20.16b, v19.16b, v20.16b + ushr v22.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + orr v20.16b, v20.16b, v22.16b + add v6.4s, v6.4s, v20.4s + eor v16.16b, v6.16b, v16.16b + ext v6.16b, v6.16b, v6.16b, #12 + add v3.4s, v6.4s, v3.4s + ushr v6.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + orr v6.16b, v16.16b, v6.16b + add v16.4s, v6.4s, v19.4s + eor v19.16b, v16.16b, v20.16b + ushr v20.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v19.16b, v19.16b, v20.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v3.4s, v3.4s, v19.4s + eor v6.16b, v3.16b, v6.16b + ext v16.16b, v16.16b, v16.16b, #4 + add v2.4s, v3.4s, v2.4s + rev32 v3.8h, v6.8h + add v6.4s, v16.4s, v3.4s + eor v16.16b, v6.16b, v19.16b + ushr v19.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + orr v16.16b, v16.16b, v19.16b + add v2.4s, v2.4s, v16.4s + eor v3.16b, v2.16b, v3.16b + add v2.4s, v2.4s, v4.4s + ushr v4.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v4.16b + add v4.4s, v3.4s, v6.4s + eor v6.16b, v4.16b, v16.16b + ushr v16.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v6.16b, v6.16b, v16.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v6.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #12 + rev32 v3.8h, v3.8h + add v4.4s, v4.4s, v3.4s + eor v6.16b, v4.16b, v6.16b + ushr v16.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + add v2.4s, v2.4s, v23.4s + orr v6.16b, v6.16b, v16.16b + add v2.4s, v2.4s, v6.4s + eor v3.16b, v2.16b, v3.16b + ushr v16.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v16.16b + add v4.4s, v3.4s, v4.4s + eor v6.16b, v4.16b, v6.16b + ext v2.16b, v2.16b, v2.16b, #12 + ushr v16.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + add v2.4s, v2.4s, v7.4s + orr v6.16b, v6.16b, v16.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v2.4s, v6.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #4 + rev32 v3.8h, v3.8h + add v2.4s, v2.4s, v5.4s + mov v5.s[1], v7.s[2] + add v4.4s, v4.4s, v3.4s + bsl v0.16b, v5.16b, v17.16b + eor v5.16b, v4.16b, v6.16b + ushr v6.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v6.16b + add v2.4s, v2.4s, v5.4s + eor v3.16b, v2.16b, v3.16b + ushr v6.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v6.16b + add v4.4s, v3.4s, v4.4s + uzp2 v18.4s, v26.4s, v18.4s + eor v5.16b, v4.16b, v5.16b + add v2.4s, v2.4s, v18.4s + ushr v6.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v5.16b, v5.16b, v6.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v5.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #12 + add v0.4s, v2.4s, v0.4s + rev32 
v2.8h, v3.8h + add v3.4s, v4.4s, v2.4s + eor v4.16b, v3.16b, v5.16b + ushr v5.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + orr v4.16b, v4.16b, v5.16b + add v0.4s, v0.4s, v4.4s + eor v2.16b, v0.16b, v2.16b + ushr v5.4s, v2.4s, #8 + shl v2.4s, v2.4s, #24 + orr v2.16b, v2.16b, v5.16b + add v3.4s, v2.4s, v3.4s + eor v4.16b, v3.16b, v4.16b + ext v0.16b, v0.16b, v0.16b, #12 + ushr v5.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + add v0.4s, v0.4s, v21.4s + orr v4.16b, v4.16b, v5.16b + ext v2.16b, v2.16b, v2.16b, #8 + add v0.4s, v0.4s, v4.4s + eor v2.16b, v0.16b, v2.16b + ext v3.16b, v3.16b, v3.16b, #4 + add v0.4s, v0.4s, v1.4s + rev32 v1.8h, v2.8h + add v2.4s, v3.4s, v1.4s + eor v3.16b, v2.16b, v4.16b + ushr v4.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + orr v3.16b, v3.16b, v4.16b + add v0.4s, v0.4s, v3.4s + eor v1.16b, v0.16b, v1.16b + ushr v4.4s, v1.4s, #8 + shl v1.4s, v1.4s, #24 + orr v1.16b, v1.16b, v4.16b + add v2.4s, v1.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ext v0.16b, v0.16b, v0.16b, #4 + ext v2.16b, v2.16b, v2.16b, #12 + ushr v4.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + ext v1.16b, v1.16b, v1.16b, #8 + eor v0.16b, v2.16b, v0.16b + orr v2.16b, v3.16b, v4.16b + eor v1.16b, v2.16b, v1.16b + stp q0, q1, [x0] + ret +.Lfunc_end0: + .size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-zfs_blake3_compress_in_place_sse2 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI1_0: + .word 1779033703 + .word 3144134277 + .word 1013904242 + .word 2773480762 +.LCPI1_1: + .xword 0 + .xword -4294967296 +.LCPI1_2: + .xword -1 + .xword 4294967295 + .text + .globl zfs_blake3_compress_xof_sse2 + .p2align 2 + .type zfs_blake3_compress_xof_sse2,@function +zfs_blake3_compress_xof_sse2: + .cfi_startproc + ldp q3, q2, [x0] + ldp q5, q6, [x1] + add x10, x1, #32 + lsr x11, x3, #32 + fmov s4, w3 + ld2 { v17.4s, v18.4s }, [x10] + adrp x10, .LCPI1_2 + and w8, w2, #0xff + mov v4.s[1], w11 + ldr q1, [x10, :lo12:.LCPI1_2] + and w9, w4, #0xff + adrp x12, .LCPI1_0 + mov v4.s[2], w8 + uzp1 v19.4s, v5.4s, v6.4s + add v3.4s, v2.4s, v3.4s + ldr q7, [x12, :lo12:.LCPI1_0] + mov v4.s[3], w9 + add v3.4s, v3.4s, v19.4s + uzp2 v5.4s, v5.4s, v6.4s + ext v21.16b, v18.16b, v18.16b, #12 + uzp1 v6.4s, v19.4s, v19.4s + ext v22.16b, v19.16b, v19.16b, #12 + eor v4.16b, v3.16b, v4.16b + ext v20.16b, v17.16b, v17.16b, #12 + ext v6.16b, v6.16b, v19.16b, #8 + ext v19.16b, v19.16b, v22.16b, #12 + zip1 v22.2d, v21.2d, v5.2d + rev32 v24.8h, v4.8h + mov v4.16b, v1.16b + zip2 v23.4s, v5.4s, v21.4s + uzp2 v6.4s, v6.4s, v5.4s + bsl v4.16b, v22.16b, v20.16b + add v3.4s, v3.4s, v5.4s + zip1 v5.4s, v23.4s, v20.4s + zip1 v22.4s, v20.4s, v23.4s + add v23.4s, v24.4s, v7.4s + ext v7.16b, v6.16b, v6.16b, #4 + ext v25.16b, v4.16b, v4.16b, #12 + ext v5.16b, v22.16b, v5.16b, #8 + eor v2.16b, v23.16b, v2.16b + uzp1 v4.4s, v4.4s, v25.4s + uzp1 v22.4s, v7.4s, v7.4s + ext v25.16b, v7.16b, v7.16b, #12 + ext v22.16b, v22.16b, v7.16b, #8 + ext v7.16b, v7.16b, v25.16b, #12 + ushr v25.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v2.16b, v2.16b, v25.16b + add v3.4s, v3.4s, v2.4s + eor v24.16b, v3.16b, v24.16b + add v3.4s, v3.4s, v17.4s + ushr v17.4s, v24.4s, #8 + shl v18.4s, v24.4s, #24 + orr v17.16b, v18.16b, v17.16b + add v18.4s, v17.4s, v23.4s + eor v2.16b, v18.16b, v2.16b + ushr v23.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + ext v3.16b, v3.16b, v3.16b, #12 + orr v2.16b, v2.16b, v23.16b + ext v17.16b, v17.16b, v17.16b, #8 + add v3.4s, v2.4s, v3.4s + adrp x11, .LCPI1_1 + eor v17.16b, v3.16b, v17.16b + ldr q16, [x11, :lo12:.LCPI1_1] + ext v18.16b, v18.16b, 
v18.16b, #4 + rev32 v24.8h, v17.8h + movi v0.2d, #0xffffffff00000000 + add v23.4s, v3.4s, v21.4s + mov v21.s[1], v20.s[2] + add v20.4s, v18.4s, v24.4s + bit v19.16b, v21.16b, v0.16b + eor v3.16b, v20.16b, v2.16b + uzp2 v2.4s, v22.4s, v19.4s + zip1 v17.2d, v5.2d, v19.2d + zip2 v18.4s, v19.4s, v5.4s + ushr v21.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + ext v22.16b, v2.16b, v2.16b, #4 + bsl v16.16b, v4.16b, v17.16b + zip1 v17.4s, v18.4s, v4.4s + zip1 v18.4s, v4.4s, v18.4s + orr v21.16b, v3.16b, v21.16b + ext v25.16b, v16.16b, v16.16b, #12 + ext v3.16b, v18.16b, v17.16b, #8 + uzp1 v18.4s, v22.4s, v22.4s + ext v26.16b, v22.16b, v22.16b, #12 + add v23.4s, v23.4s, v21.4s + uzp1 v17.4s, v16.4s, v25.4s + ext v16.16b, v18.16b, v22.16b, #8 + ext v18.16b, v22.16b, v26.16b, #12 + eor v22.16b, v23.16b, v24.16b + add v6.4s, v23.4s, v6.4s + ushr v23.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + orr v22.16b, v22.16b, v23.16b + add v20.4s, v22.4s, v20.4s + eor v21.16b, v20.16b, v21.16b + ushr v23.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v21.16b, v21.16b, v23.16b + ext v22.16b, v22.16b, v22.16b, #8 + add v6.4s, v21.4s, v6.4s + eor v22.16b, v6.16b, v22.16b + ext v20.16b, v20.16b, v20.16b, #12 + add v6.4s, v6.4s, v19.4s + rev32 v19.8h, v22.8h + add v20.4s, v20.4s, v19.4s + eor v21.16b, v20.16b, v21.16b + ushr v22.4s, v21.4s, #12 + shl v21.4s, v21.4s, #20 + orr v21.16b, v21.16b, v22.16b + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + ushr v22.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v19.16b, v19.16b, v22.16b + add v20.4s, v19.4s, v20.4s + eor v21.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #12 + ushr v22.4s, v21.4s, #7 + shl v21.4s, v21.4s, #25 + add v6.4s, v6.4s, v4.4s + orr v21.16b, v21.16b, v22.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + ext v20.16b, v20.16b, v20.16b, #4 + rev32 v19.8h, v19.8h + add v20.4s, v20.4s, v19.4s + add v6.4s, v6.4s, v5.4s + mov v5.s[1], v4.s[2] + eor v4.16b, v20.16b, v21.16b + ushr v21.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + orr v21.16b, v4.16b, v21.16b + add v6.4s, v6.4s, v21.4s + eor v19.16b, v6.16b, v19.16b + add v2.4s, v6.4s, v2.4s + ushr v6.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v6.16b, v19.16b, v6.16b + add v19.4s, v6.4s, v20.4s + eor v20.16b, v19.16b, v21.16b + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v20.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v2.4s, v20.4s, v2.4s + eor v6.16b, v2.16b, v6.16b + ext v19.16b, v19.16b, v19.16b, #12 + rev32 v6.8h, v6.8h + add v19.4s, v19.4s, v6.4s + mov v22.16b, v0.16b + eor v20.16b, v19.16b, v20.16b + bsl v22.16b, v5.16b, v7.16b + ushr v21.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + add v2.4s, v2.4s, v22.4s + orr v20.16b, v20.16b, v21.16b + add v2.4s, v2.4s, v20.4s + eor v6.16b, v2.16b, v6.16b + ushr v21.4s, v6.4s, #8 + shl v6.4s, v6.4s, #24 + orr v6.16b, v6.16b, v21.16b + add v19.4s, v6.4s, v19.4s + eor v20.16b, v19.16b, v20.16b + ext v2.16b, v2.16b, v2.16b, #12 + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + add v2.4s, v2.4s, v17.4s + orr v20.16b, v20.16b, v21.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v2.4s, v2.4s, v20.4s + eor v6.16b, v2.16b, v6.16b + uzp2 v5.4s, v16.4s, v22.4s + zip1 v7.2d, v3.2d, v22.2d + zip2 v16.4s, v22.4s, v3.4s + ext v19.16b, v19.16b, v19.16b, #4 + rev32 v22.8h, v6.8h + ext v23.16b, v5.16b, v5.16b, #4 + bif v7.16b, v17.16b, v1.16b + zip1 v24.4s, v16.4s, v17.4s + zip1 v16.4s, v17.4s, v16.4s + add v21.4s, v2.4s, v3.4s + mov 
v3.s[1], v17.s[2] + add v17.4s, v19.4s, v22.4s + mov v19.16b, v0.16b + ext v25.16b, v7.16b, v7.16b, #12 + ext v4.16b, v16.16b, v24.16b, #8 + uzp1 v16.4s, v23.4s, v23.4s + bsl v19.16b, v3.16b, v18.16b + eor v2.16b, v17.16b, v20.16b + uzp1 v7.4s, v7.4s, v25.4s + ext v25.16b, v16.16b, v23.16b, #8 + zip1 v3.2d, v4.2d, v19.2d + ushr v20.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + ext v24.16b, v23.16b, v23.16b, #12 + uzp2 v6.4s, v25.4s, v19.4s + zip2 v18.4s, v19.4s, v4.4s + bif v3.16b, v7.16b, v1.16b + orr v20.16b, v2.16b, v20.16b + ext v16.16b, v23.16b, v24.16b, #12 + ext v23.16b, v6.16b, v6.16b, #4 + zip1 v24.4s, v18.4s, v7.4s + zip1 v18.4s, v7.4s, v18.4s + ext v25.16b, v3.16b, v3.16b, #12 + add v21.4s, v21.4s, v20.4s + ext v2.16b, v18.16b, v24.16b, #8 + uzp1 v18.4s, v23.4s, v23.4s + ext v24.16b, v23.16b, v23.16b, #12 + uzp1 v3.4s, v3.4s, v25.4s + eor v22.16b, v21.16b, v22.16b + ext v25.16b, v18.16b, v23.16b, #8 + dup v18.4s, v2.s[3] + ext v23.16b, v23.16b, v24.16b, #12 + add v5.4s, v21.4s, v5.4s + trn1 v21.4s, v3.4s, v3.4s + ushr v24.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + ext v18.16b, v21.16b, v18.16b, #8 + orr v21.16b, v22.16b, v24.16b + add v17.4s, v21.4s, v17.4s + eor v20.16b, v17.16b, v20.16b + ushr v22.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v20.16b, v20.16b, v22.16b + ext v21.16b, v21.16b, v21.16b, #8 + add v5.4s, v20.4s, v5.4s + eor v21.16b, v5.16b, v21.16b + ext v17.16b, v17.16b, v17.16b, #12 + add v5.4s, v5.4s, v19.4s + rev32 v19.8h, v21.8h + add v17.4s, v17.4s, v19.4s + eor v20.16b, v17.16b, v20.16b + ushr v21.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + orr v20.16b, v20.16b, v21.16b + add v5.4s, v5.4s, v20.4s + eor v19.16b, v5.16b, v19.16b + ushr v21.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v19.16b, v19.16b, v21.16b + add v17.4s, v19.4s, v17.4s + eor v20.16b, v17.16b, v20.16b + ext v5.16b, v5.16b, v5.16b, #12 + ushr v21.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + add v5.4s, v5.4s, v7.4s + orr v20.16b, v20.16b, v21.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v5.4s, v5.4s, v20.4s + eor v19.16b, v5.16b, v19.16b + ext v17.16b, v17.16b, v17.16b, #4 + rev32 v22.8h, v19.8h + add v21.4s, v5.4s, v4.4s + mov v4.s[1], v7.s[2] + add v19.4s, v17.4s, v22.4s + bit v16.16b, v4.16b, v0.16b + eor v5.16b, v19.16b, v20.16b + uzp2 v4.4s, v25.4s, v16.4s + zip1 v7.2d, v2.2d, v16.2d + zip2 v17.4s, v16.4s, v2.4s + ushr v20.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + ext v24.16b, v4.16b, v4.16b, #4 + bif v7.16b, v3.16b, v1.16b + zip1 v25.4s, v17.4s, v3.4s + zip1 v17.4s, v3.4s, v17.4s + orr v20.16b, v5.16b, v20.16b + ext v26.16b, v7.16b, v7.16b, #12 + ext v5.16b, v17.16b, v25.16b, #8 + uzp1 v17.4s, v24.4s, v24.4s + ext v25.16b, v24.16b, v24.16b, #12 + bit v23.16b, v18.16b, v0.16b + add v21.4s, v21.4s, v20.4s + uzp1 v7.4s, v7.4s, v26.4s + ext v26.16b, v17.16b, v24.16b, #8 + ext v17.16b, v24.16b, v25.16b, #12 + eor v22.16b, v21.16b, v22.16b + add v6.4s, v21.4s, v6.4s + zip1 v21.2d, v5.2d, v23.2d + zip2 v24.4s, v23.4s, v5.4s + bif v21.16b, v7.16b, v1.16b + zip1 v1.4s, v24.4s, v7.4s + zip1 v24.4s, v7.4s, v24.4s + ext v1.16b, v24.16b, v1.16b, #8 + ushr v24.4s, v22.4s, #8 + shl v22.4s, v22.4s, #24 + orr v22.16b, v22.16b, v24.16b + add v19.4s, v22.4s, v19.4s + ext v24.16b, v21.16b, v21.16b, #12 + eor v20.16b, v19.16b, v20.16b + uzp1 v21.4s, v21.4s, v24.4s + ushr v24.4s, v20.4s, #7 + shl v20.4s, v20.4s, #25 + orr v20.16b, v20.16b, v24.16b + ext v6.16b, v6.16b, v6.16b, #4 + ext v22.16b, v22.16b, v22.16b, #8 + add v6.4s, v20.4s, v6.4s + eor v22.16b, v6.16b, v22.16b + ext 
v19.16b, v19.16b, v19.16b, #12 + add v6.4s, v6.4s, v16.4s + rev32 v16.8h, v22.8h + add v19.4s, v19.4s, v16.4s + eor v20.16b, v19.16b, v20.16b + ushr v22.4s, v20.4s, #12 + shl v20.4s, v20.4s, #20 + orr v20.16b, v20.16b, v22.16b + add v6.4s, v6.4s, v20.4s + eor v16.16b, v6.16b, v16.16b + ext v6.16b, v6.16b, v6.16b, #12 + add v3.4s, v6.4s, v3.4s + ushr v6.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + orr v6.16b, v16.16b, v6.16b + add v16.4s, v6.4s, v19.4s + eor v19.16b, v16.16b, v20.16b + ushr v20.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v19.16b, v19.16b, v20.16b + ext v6.16b, v6.16b, v6.16b, #8 + add v3.4s, v3.4s, v19.4s + eor v6.16b, v3.16b, v6.16b + ext v16.16b, v16.16b, v16.16b, #4 + add v2.4s, v3.4s, v2.4s + rev32 v3.8h, v6.8h + add v6.4s, v16.4s, v3.4s + eor v16.16b, v6.16b, v19.16b + ushr v19.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + orr v16.16b, v16.16b, v19.16b + add v2.4s, v2.4s, v16.4s + eor v3.16b, v2.16b, v3.16b + add v2.4s, v2.4s, v4.4s + ushr v4.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v4.16b + add v4.4s, v3.4s, v6.4s + eor v6.16b, v4.16b, v16.16b + ushr v16.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v6.16b, v6.16b, v16.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v6.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #12 + rev32 v3.8h, v3.8h + add v4.4s, v4.4s, v3.4s + eor v6.16b, v4.16b, v6.16b + ushr v16.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + add v2.4s, v2.4s, v23.4s + orr v6.16b, v6.16b, v16.16b + add v2.4s, v2.4s, v6.4s + eor v3.16b, v2.16b, v3.16b + ushr v16.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v16.16b + add v4.4s, v3.4s, v4.4s + eor v6.16b, v4.16b, v6.16b + ext v2.16b, v2.16b, v2.16b, #12 + ushr v16.4s, v6.4s, #7 + shl v6.4s, v6.4s, #25 + add v2.4s, v2.4s, v7.4s + orr v6.16b, v6.16b, v16.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v2.4s, v6.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #4 + rev32 v3.8h, v3.8h + add v2.4s, v2.4s, v5.4s + mov v5.s[1], v7.s[2] + add v4.4s, v4.4s, v3.4s + bsl v0.16b, v5.16b, v17.16b + eor v5.16b, v4.16b, v6.16b + ushr v6.4s, v5.4s, #12 + shl v5.4s, v5.4s, #20 + orr v5.16b, v5.16b, v6.16b + add v2.4s, v2.4s, v5.4s + eor v3.16b, v2.16b, v3.16b + ushr v6.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v3.16b, v3.16b, v6.16b + add v4.4s, v3.4s, v4.4s + uzp2 v18.4s, v26.4s, v18.4s + eor v5.16b, v4.16b, v5.16b + add v2.4s, v2.4s, v18.4s + ushr v6.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + orr v5.16b, v5.16b, v6.16b + ext v3.16b, v3.16b, v3.16b, #8 + add v2.4s, v5.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ext v4.16b, v4.16b, v4.16b, #12 + add v0.4s, v2.4s, v0.4s + rev32 v2.8h, v3.8h + add v3.4s, v4.4s, v2.4s + eor v4.16b, v3.16b, v5.16b + ushr v5.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + orr v4.16b, v4.16b, v5.16b + add v0.4s, v0.4s, v4.4s + eor v2.16b, v0.16b, v2.16b + ushr v5.4s, v2.4s, #8 + shl v2.4s, v2.4s, #24 + orr v2.16b, v2.16b, v5.16b + add v3.4s, v2.4s, v3.4s + eor v4.16b, v3.16b, v4.16b + ext v0.16b, v0.16b, v0.16b, #12 + ushr v5.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + add v0.4s, v0.4s, v21.4s + orr v4.16b, v4.16b, v5.16b + ext v2.16b, v2.16b, v2.16b, #8 + add v0.4s, v0.4s, v4.4s + eor v2.16b, v0.16b, v2.16b + ext v3.16b, v3.16b, v3.16b, #4 + add v0.4s, v0.4s, v1.4s + rev32 v1.8h, v2.8h + add v2.4s, v3.4s, v1.4s + eor v3.16b, v2.16b, v4.16b + ushr v4.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + orr v3.16b, v3.16b, v4.16b + add v0.4s, v0.4s, v3.4s + eor v1.16b, v0.16b, v1.16b + ushr v4.4s, 
v1.4s, #8 + shl v1.4s, v1.4s, #24 + orr v1.16b, v1.16b, v4.16b + add v2.4s, v1.4s, v2.4s + eor v3.16b, v2.16b, v3.16b + ushr v4.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + ext v0.16b, v0.16b, v0.16b, #4 + ext v1.16b, v1.16b, v1.16b, #8 + ext v2.16b, v2.16b, v2.16b, #12 + orr v3.16b, v3.16b, v4.16b + eor v0.16b, v2.16b, v0.16b + eor v3.16b, v3.16b, v1.16b + stp q0, q3, [x5] + ldr q0, [x0] + eor v0.16b, v0.16b, v2.16b + str q0, [x5, #32] + ldr q0, [x0, #16] + eor v0.16b, v0.16b, v1.16b + str q0, [x5, #48] + ret +.Lfunc_end1: + .size zfs_blake3_compress_xof_sse2, .Lfunc_end1-zfs_blake3_compress_xof_sse2 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI2_0: + .word 0 + .word 1 + .word 2 + .word 3 + .text + .globl zfs_blake3_hash_many_sse2 + .p2align 2 + .type zfs_blake3_hash_many_sse2,@function +zfs_blake3_hash_many_sse2: + .cfi_startproc + stp d15, d14, [sp, #-160]! + stp d13, d12, [sp, #16] + stp d11, d10, [sp, #32] + stp d9, d8, [sp, #48] + stp x29, x30, [sp, #64] + stp x28, x27, [sp, #80] + stp x26, x25, [sp, #96] + stp x24, x23, [sp, #112] + stp x22, x21, [sp, #128] + stp x20, x19, [sp, #144] + mov x29, sp + sub sp, sp, #384 + .cfi_def_cfa w29, 160 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + ldr x26, [x29, #168] + ldrb w27, [x29, #160] + mov w19, w6 + mov x20, x4 + mov x22, x2 + mov x28, x1 + cmp x1, #4 + mov x24, x0 + str x3, [sp, #40] + b.lo .LBB2_8 + adrp x9, .LCPI2_0 + ldr q0, [x9, :lo12:.LCPI2_0] + sbfx w11, w5, #0, #1 + dup v1.4s, w11 + mov w9, #58983 + mov w10, #44677 + and v0.16b, v1.16b, v0.16b + mov w11, #62322 + mov w12, #62778 + orr w8, w7, w19 + movk w9, #27145, lsl #16 + movk w10, #47975, lsl #16 + movk w11, #15470, lsl #16 + str q0, [sp, #16] + orr v0.4s, #128, lsl #24 + movk w12, #42319, lsl #16 + str q0, [sp] +.LBB2_2: + ldr x0, [sp, #40] + mov x13, x0 + ld1r { v20.4s }, [x13], #4 + add x14, x0, #8 + add x15, x0, #12 + add x16, x0, #16 + add x17, x0, #20 + add x18, x0, #24 + add x0, x0, #28 + ld1r { v17.4s }, [x14] + ld1r { v6.4s }, [x15] + ld1r { v8.4s }, [x16] + ld1r { v9.4s }, [x17] + ld1r { v31.4s }, [x18] + ld1r { v26.4s }, [x13] + ld1r { v15.4s }, [x0] + cbz x22, .LBB2_7 + ldr q1, [sp, #16] + dup v0.4s, w20 + ldp x13, x14, [x24] + ldp x15, x16, [x24, #16] + add v1.4s, v0.4s, v1.4s + movi v0.4s, #128, lsl #24 + str q1, [sp, #64] + eor v0.16b, v1.16b, v0.16b + ldr q1, [sp] + lsr x18, x20, #32 + mov x17, xzr + cmgt v0.4s, v1.4s, v0.4s + dup v1.4s, w18 + sub v0.4s, v1.4s, v0.4s + mov w18, w8 + str q0, [sp, #48] +.LBB2_4: + mov w2, #16 + bfi x2, x17, #6, #58 + ldr q1, [x13, x2] + ldr q3, [x14, x2] + ldr q2, [x15, x2] + ldr q4, [x16, x2] + mov w2, #32 + bfi x2, x17, #6, #58 + ldr q5, [x13, x2] + ldr q18, [x14, x2] + ldr q19, [x15, x2] + ldr q23, [x16, x2] + mov w2, #48 + lsl x3, x17, #6 + bfi x2, x17, #6, #58 + add x17, x17, #1 + ldr q0, [x13, x3] + ldr q21, [x14, x3] + ldr q7, [x15, x3] + ldr q16, [x16, x3] + cmp x17, x22 + ldr q13, [x13, x2] + ldr q14, [x14, x2] + ldr q29, [x15, x2] + ldr q10, [x16, x2] + csel w2, w27, wzr, eq + orr w18, w2, w18 + mov x0, xzr + and w18, w18, #0xff + add x3, x3, #256 +.LBB2_5: + ldr x2, [x24, x0] 
+ add x0, x0, #8 + cmp x0, #32 + add x2, x2, x3 + prfm pldl1keep, [x2] + b.ne .LBB2_5 + dup v22.4s, w18 + str q22, [sp, #192] + zip1 v27.4s, v0.4s, v21.4s + zip2 v21.4s, v0.4s, v21.4s + zip1 v0.4s, v7.4s, v16.4s + zip2 v22.4s, v7.4s, v16.4s + zip1 v7.4s, v1.4s, v3.4s + zip1 v25.4s, v2.4s, v4.4s + zip2 v16.4s, v2.4s, v4.4s + zip1 v11.4s, v19.4s, v23.4s + zip2 v12.4s, v19.4s, v23.4s + zip1 v19.4s, v13.4s, v14.4s + zip2 v23.4s, v13.4s, v14.4s + zip1 v13.4s, v29.4s, v10.4s + zip2 v14.4s, v29.4s, v10.4s + add v10.4s, v20.4s, v8.4s + add v2.4s, v26.4s, v9.4s + ext v20.16b, v22.16b, v21.16b, #8 + ext v26.16b, v25.16b, v7.16b, #8 + zip2 v24.4s, v1.4s, v3.4s + add v1.4s, v6.4s, v15.4s + ext v6.16b, v0.16b, v27.16b, #8 + ext v20.16b, v21.16b, v20.16b, #8 + mov v21.d[1], v22.d[0] + ext v22.16b, v7.16b, v26.16b, #8 + mov v7.d[1], v25.d[0] + add v3.4s, v17.4s, v31.4s + str q1, [sp, #144] + ext v1.16b, v27.16b, v6.16b, #8 + mov v6.16b, v7.16b + zip1 v28.4s, v5.4s, v18.4s + stur q1, [x29, #-80] + mov v1.16b, v27.16b + mov v27.16b, v24.16b + add v3.4s, v3.4s, v6.4s + ldr q6, [sp, #64] + ext v29.16b, v16.16b, v24.16b, #8 + mov v1.d[1], v0.d[0] + ext v0.16b, v11.16b, v28.16b, #8 + mov v27.d[1], v16.d[0] + ext v16.16b, v14.16b, v23.16b, #8 + stur q7, [x29, #-144] + ext v7.16b, v24.16b, v29.16b, #8 + ext v29.16b, v28.16b, v0.16b, #8 + ext v0.16b, v23.16b, v16.16b, #8 + mov v23.d[1], v14.d[0] + stp q0, q23, [sp, #80] + add v0.4s, v10.4s, v1.4s + eor v16.16b, v0.16b, v6.16b + ldr q6, [sp, #48] + add v2.4s, v2.4s, v21.4s + mov v28.d[1], v11.d[0] + zip2 v18.4s, v5.4s, v18.4s + eor v10.16b, v2.16b, v6.16b + movi v6.4s, #64 + eor v11.16b, v3.16b, v6.16b + ldr q6, [sp, #144] + dup v17.4s, w9 + ext v30.16b, v12.16b, v18.16b, #8 + rev32 v16.8h, v16.8h + dup v5.4s, w10 + ext v25.16b, v18.16b, v30.16b, #8 + mov v30.16b, v23.16b + mov v23.16b, v1.16b + str q1, [sp, #160] + rev32 v10.8h, v10.8h + add v1.4s, v16.4s, v17.4s + add v17.4s, v6.4s, v27.4s + ldr q6, [sp, #192] + dup v4.4s, w11 + rev32 v11.8h, v11.8h + add v5.4s, v10.4s, v5.4s + eor v8.16b, v1.16b, v8.16b + stur q21, [x29, #-128] + mov v18.d[1], v12.d[0] + add v4.4s, v11.4s, v4.4s + eor v9.16b, v5.16b, v9.16b + ushr v12.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + ldur q21, [x29, #-80] + ext v26.16b, v13.16b, v19.16b, #8 + eor v31.16b, v4.16b, v31.16b + orr v8.16b, v8.16b, v12.16b + ushr v12.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + ext v26.16b, v19.16b, v26.16b, #8 + mov v19.d[1], v13.d[0] + orr v9.16b, v9.16b, v12.16b + ushr v12.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v17.16b, v6.16b + orr v31.16b, v31.16b, v12.16b + dup v12.4s, w12 + rev32 v13.8h, v13.8h + add v12.4s, v13.4s, v12.4s + add v0.4s, v0.4s, v21.4s + eor v14.16b, v12.16b, v15.16b + add v0.4s, v0.4s, v8.4s + add v2.4s, v2.4s, v20.4s + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v16.16b, v0.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v22.4s + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v7.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v14.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v13.16b, v17.16b, v13.16b + add v1.4s, v16.4s, v1.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v13.4s, #8 + shl v13.4s, v13.4s, #24 + eor v8.16b, v1.16b, v8.16b + add v5.4s, v10.4s, v5.4s + orr v13.16b, v13.16b, v15.16b + 
ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v13.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v14.16b, v12.16b, v14.16b + add v0.4s, v0.4s, v28.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #7 + shl v14.4s, v14.4s, #25 + add v0.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v18.4s + orr v14.16b, v14.16b, v15.16b + eor v13.16b, v0.16b, v13.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v19.4s + rev32 v13.8h, v13.8h + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v30.4s + add v4.4s, v4.4s, v13.4s + rev32 v16.8h, v16.8h + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + eor v9.16b, v4.16b, v9.16b + add v12.4s, v12.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v17.16b, v11.16b + mov v24.16b, v7.16b + stur q7, [x29, #-112] + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v1.4s, v10.4s + rev32 v11.8h, v11.8h + mov v7.16b, v26.16b + add v3.4s, v3.4s, v26.4s + ldr q26, [sp, #80] + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v14.16b, v1.16b, v14.16b + add v5.4s, v5.4s, v11.4s + add v0.4s, v0.4s, v29.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v8.16b, v5.16b, v8.16b + add v0.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v25.4s + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v13.16b, v0.16b, v13.16b + add v2.4s, v2.4s, v31.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v13.4s, #8 + shl v13.4s, v13.4s, #24 + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v26.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v17.16b, v11.16b + add v4.4s, v13.4s, v4.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v9.16b, v4.16b, v9.16b + add v12.4s, v16.4s, v12.4s + str q22, [sp, #128] + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v10.4s, v1.4s + ldur q22, [x29, #-128] + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v14.16b, v1.16b, v14.16b + add v5.4s, v11.4s, v5.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #7 + shl v14.4s, v14.4s, #25 + eor v8.16b, v5.16b, v8.16b + mov v6.16b, v18.16b + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + ldur q18, [x29, #-144] + orr v8.16b, v8.16b, v15.16b + add v0.4s, v0.4s, v22.4s + add v0.4s, v0.4s, v8.4s + add v2.4s, v2.4s, v20.4s + eor v16.16b, v0.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v24.4s + rev32 v16.8h, v16.8h + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v18.4s + add v1.4s, v1.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v14.4s + eor v8.16b, v1.16b, v8.16b + add v5.4s, v5.4s, v10.4s + rev32 v11.8h, v11.8h + eor v13.16b, v17.16b, v13.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v4.4s, v11.4s + rev32 v13.8h, v13.8h + orr v8.16b, v8.16b, v15.16b + ushr 
v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v12.4s, v13.4s + add v0.4s, v0.4s, v27.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v14.16b, v12.16b, v14.16b + add v0.4s, v0.4s, v8.4s + add v2.4s, v2.4s, v6.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v16.16b, v0.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v23.4s + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v7.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v14.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v13.16b, v17.16b, v13.16b + add v1.4s, v16.4s, v1.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v13.4s, #8 + shl v13.4s, v13.4s, #24 + eor v8.16b, v1.16b, v8.16b + add v5.4s, v10.4s, v5.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v13.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v14.16b, v12.16b, v14.16b + add v0.4s, v0.4s, v21.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #7 + shl v14.4s, v14.4s, #25 + add v0.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v19.4s + orr v14.16b, v14.16b, v15.16b + eor v13.16b, v0.16b, v13.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v29.4s + str q28, [sp, #112] + rev32 v13.8h, v13.8h + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v26.4s + add v4.4s, v4.4s, v13.4s + rev32 v16.8h, v16.8h + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + ldp q28, q23, [sp, #112] + eor v9.16b, v4.16b, v9.16b + add v12.4s, v12.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v17.16b, v11.16b + ldr q21, [sp, #96] + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v1.4s, v10.4s + rev32 v11.8h, v11.8h + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v14.16b, v1.16b, v14.16b + add v5.4s, v5.4s, v11.4s + add v0.4s, v0.4s, v25.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v8.16b, v5.16b, v8.16b + add v0.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v23.4s + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v13.16b, v0.16b, v13.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v21.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v13.4s, #8 + shl v13.4s, v13.4s, #24 + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v28.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v17.16b, v11.16b + add v4.4s, v13.4s, v4.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v9.16b, v4.16b, v9.16b + add v12.4s, v16.4s, v12.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v10.4s, v1.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, 
v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v14.16b, v1.16b, v14.16b + add v5.4s, v11.4s, v5.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #7 + shl v14.4s, v14.4s, #25 + eor v8.16b, v5.16b, v8.16b + mov v30.16b, v29.16b + mov v29.16b, v25.16b + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + ldur q25, [x29, #-112] + orr v8.16b, v8.16b, v15.16b + add v0.4s, v0.4s, v20.4s + add v0.4s, v0.4s, v8.4s + add v2.4s, v2.4s, v6.4s + eor v16.16b, v0.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v7.4s + rev32 v16.8h, v16.8h + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v25.4s + add v1.4s, v1.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v14.4s + eor v8.16b, v1.16b, v8.16b + add v5.4s, v5.4s, v10.4s + rev32 v11.8h, v11.8h + eor v13.16b, v17.16b, v13.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v4.4s, v11.4s + rev32 v13.8h, v13.8h + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v12.4s, v13.4s + add v0.4s, v0.4s, v18.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v14.16b, v12.16b, v14.16b + add v0.4s, v0.4s, v8.4s + add v2.4s, v2.4s, v19.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v16.16b, v0.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v22.4s + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v21.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v14.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v13.16b, v17.16b, v13.16b + add v1.4s, v16.4s, v1.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v13.4s, #8 + shl v13.4s, v13.4s, #24 + eor v8.16b, v1.16b, v8.16b + add v5.4s, v10.4s, v5.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v13.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v14.16b, v12.16b, v14.16b + add v0.4s, v0.4s, v27.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #7 + shl v14.4s, v14.4s, #25 + add v0.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v30.4s + orr v14.16b, v14.16b, v15.16b + eor v13.16b, v0.16b, v13.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v29.4s + rev32 v13.8h, v13.8h + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v28.4s + add v4.4s, v4.4s, v13.4s + rev32 v16.8h, v16.8h + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + eor v9.16b, v4.16b, v9.16b + add v12.4s, v12.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v17.16b, v11.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v1.4s, v10.4s + rev32 v11.8h, v11.8h + ldr q24, [sp, #160] + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v14.16b, v1.16b, v14.16b + add v5.4s, v5.4s, v11.4s + stur q7, [x29, #-64] + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v14.4s, #12 + shl v14.4s, v14.4s, #20 + eor v8.16b, v5.16b, v8.16b + mov 
v7.16b, v26.16b + add v3.4s, v3.4s, v26.4s + ldur q26, [x29, #-80] + orr v14.16b, v14.16b, v15.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + add v0.4s, v0.4s, v23.4s + orr v8.16b, v8.16b, v15.16b + add v15.4s, v0.4s, v9.4s + add v2.4s, v2.4s, v24.4s + eor v0.16b, v15.16b, v13.16b + add v2.4s, v2.4s, v31.4s + ushr v13.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v14.4s + add v17.4s, v17.4s, v26.4s + orr v0.16b, v0.16b, v13.16b + ushr v13.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + orr v16.16b, v16.16b, v13.16b + ushr v13.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v17.16b, v11.16b + add v4.4s, v0.4s, v4.4s + orr v10.16b, v10.16b, v13.16b + ushr v13.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v9.16b, v4.16b, v9.16b + add v12.4s, v16.4s, v12.4s + orr v11.16b, v11.16b, v13.16b + ushr v13.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v12.16b, v31.16b + orr v9.16b, v9.16b, v13.16b + ushr v13.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + add v1.4s, v10.4s, v1.4s + orr v31.16b, v31.16b, v13.16b + eor v13.16b, v1.16b, v14.16b + add v5.4s, v11.4s, v5.4s + ushr v14.4s, v13.4s, #7 + shl v13.4s, v13.4s, #25 + eor v8.16b, v5.16b, v8.16b + orr v13.16b, v13.16b, v14.16b + ushr v14.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + stur q6, [x29, #-96] + orr v8.16b, v8.16b, v14.16b + add v14.4s, v15.4s, v6.4s + ldur q6, [x29, #-64] + mov v18.16b, v19.16b + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v18.4s + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v21.4s + rev32 v16.8h, v16.8h + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v6.4s + add v1.4s, v1.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v13.4s + eor v8.16b, v1.16b, v8.16b + add v5.4s, v5.4s, v10.4s + rev32 v11.8h, v11.8h + eor v0.16b, v17.16b, v0.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v4.4s, v11.4s + rev32 v0.8h, v0.8h + str q27, [sp, #176] + mov v27.16b, v30.16b + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v12.4s, v0.4s + add v14.4s, v14.4s, v25.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v12.16b, v13.16b + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v27.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #12 + shl v13.4s, v13.4s, #20 + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v20.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v7.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v13.4s + mov v30.16b, v23.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v0.16b, v17.16b, v0.16b + add v1.4s, v16.4s, v1.4s + ldur q23, [x29, #-144] + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + eor v8.16b, v1.16b, v8.16b + add v5.4s, v10.4s, v5.4s + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v0.4s, v12.4s + orr v9.16b, 
v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v13.16b, v12.16b, v13.16b + add v14.4s, v14.4s, v23.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #7 + shl v13.4s, v13.4s, #25 + add v14.4s, v14.4s, v9.4s + add v2.4s, v2.4s, v29.4s + orr v13.16b, v13.16b, v15.16b + eor v0.16b, v14.16b, v0.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v30.4s + rev32 v0.8h, v0.8h + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v13.4s + add v17.4s, v17.4s, v26.4s + add v4.4s, v4.4s, v0.4s + rev32 v16.8h, v16.8h + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + ldur q22, [x29, #-128] + eor v9.16b, v4.16b, v9.16b + add v12.4s, v12.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v17.16b, v11.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v1.4s, v10.4s + rev32 v11.8h, v11.8h + ldr q26, [sp, #176] + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v1.16b, v13.16b + add v5.4s, v5.4s, v11.4s + add v14.4s, v14.4s, v24.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #12 + shl v13.4s, v13.4s, #20 + eor v8.16b, v5.16b, v8.16b + add v14.4s, v14.4s, v9.4s + add v2.4s, v2.4s, v22.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v0.16b, v14.16b, v0.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v28.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v13.4s + add v17.4s, v17.4s, v26.4s + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v17.16b, v11.16b + add v4.4s, v0.4s, v4.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v9.16b, v4.16b, v9.16b + add v12.4s, v16.4s, v12.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v10.4s, v1.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v13.16b, v1.16b, v13.16b + add v5.4s, v11.4s, v5.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #7 + shl v13.4s, v13.4s, #25 + eor v8.16b, v5.16b, v8.16b + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + orr v8.16b, v8.16b, v15.16b + add v14.4s, v14.4s, v18.4s + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v27.4s + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v7.4s + rev32 v16.8h, v16.8h + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v21.4s + add v1.4s, v1.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v13.4s + eor v8.16b, v1.16b, v8.16b + add v5.4s, v5.4s, v10.4s + rev32 v11.8h, v11.8h + eor v0.16b, v17.16b, v0.16b + add v14.4s, v14.4s, v6.4s + ldur q6, [x29, #-96] + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v4.4s, v11.4s + rev32 v0.8h, v0.8h + stur q20, [x29, #-160] + mov v20.16b, v29.16b + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v12.4s, v0.4s + mov v19.16b, v29.16b + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v12.16b, v13.16b + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v20.4s + mov 
v19.16b, v28.16b + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #12 + shl v13.4s, v13.4s, #20 + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v6.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v19.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v13.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v0.16b, v17.16b, v0.16b + add v1.4s, v16.4s, v1.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + eor v8.16b, v1.16b, v8.16b + add v5.4s, v10.4s, v5.4s + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v0.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v13.16b, v12.16b, v13.16b + add v14.4s, v14.4s, v25.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #7 + shl v13.4s, v13.4s, #25 + add v14.4s, v14.4s, v9.4s + add v2.4s, v2.4s, v30.4s + orr v13.16b, v13.16b, v15.16b + eor v0.16b, v14.16b, v0.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v24.4s + rev32 v0.8h, v0.8h + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v13.4s + add v17.4s, v17.4s, v26.4s + mov v29.16b, v27.16b + add v4.4s, v4.4s, v0.4s + rev32 v16.8h, v16.8h + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + ldur q27, [x29, #-160] + eor v9.16b, v4.16b, v9.16b + add v12.4s, v12.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v17.16b, v11.16b + ldur q6, [x29, #-80] + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v1.4s, v10.4s + rev32 v11.8h, v11.8h + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v1.16b, v13.16b + add v5.4s, v5.4s, v11.4s + add v14.4s, v14.4s, v22.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #12 + shl v13.4s, v13.4s, #20 + eor v8.16b, v5.16b, v8.16b + add v14.4s, v14.4s, v9.4s + add v2.4s, v2.4s, v27.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v0.16b, v14.16b, v0.16b + add v2.4s, v2.4s, v31.4s + add v3.4s, v3.4s, v6.4s + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + eor v16.16b, v2.16b, v16.16b + add v3.4s, v3.4s, v13.4s + add v17.4s, v17.4s, v23.4s + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v3.16b, v10.16b + add v17.4s, v17.4s, v8.4s + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + eor v11.16b, v17.16b, v11.16b + add v4.4s, v0.4s, v4.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v11.4s, #8 + shl v11.4s, v11.4s, #24 + eor v9.16b, v4.16b, v9.16b + add v12.4s, v16.4s, v12.4s + orr v11.16b, v11.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v31.16b, v12.16b, v31.16b + add v1.4s, v10.4s, v1.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + eor v13.16b, v1.16b, v13.16b + add v5.4s, v11.4s, v5.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #7 + shl v13.4s, v13.4s, #25 + eor v8.16b, v5.16b, v8.16b + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + 
orr v8.16b, v8.16b, v15.16b + add v14.4s, v14.4s, v29.4s + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v20.4s + mov v28.16b, v7.16b + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + add v3.4s, v3.4s, v19.4s + rev32 v16.8h, v16.8h + eor v10.16b, v2.16b, v10.16b + add v3.4s, v3.4s, v31.4s + add v17.4s, v17.4s, v28.4s + add v1.4s, v1.4s, v16.4s + rev32 v10.8h, v10.8h + eor v11.16b, v3.16b, v11.16b + add v17.4s, v17.4s, v13.4s + eor v8.16b, v1.16b, v8.16b + add v5.4s, v5.4s, v10.4s + rev32 v11.8h, v11.8h + eor v0.16b, v17.16b, v0.16b + ushr v15.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + eor v9.16b, v5.16b, v9.16b + add v4.4s, v4.4s, v11.4s + rev32 v0.8h, v0.8h + orr v8.16b, v8.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v31.16b, v4.16b, v31.16b + add v12.4s, v12.4s, v0.4s + add v14.4s, v14.4s, v21.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + eor v13.16b, v12.16b, v13.16b + add v14.4s, v14.4s, v8.4s + add v2.4s, v2.4s, v30.4s + orr v31.16b, v31.16b, v15.16b + ushr v15.4s, v13.4s, #12 + shl v13.4s, v13.4s, #20 + eor v16.16b, v14.16b, v16.16b + add v2.4s, v2.4s, v9.4s + orr v13.16b, v13.16b, v15.16b + ushr v15.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v10.16b, v2.16b, v10.16b + orr v16.16b, v16.16b, v15.16b + ushr v15.4s, v10.4s, #8 + shl v10.4s, v10.4s, #24 + add v3.4s, v3.4s, v18.4s + orr v10.16b, v10.16b, v15.16b + add v15.4s, v3.4s, v31.4s + eor v3.16b, v15.16b, v11.16b + ushr v11.4s, v3.4s, #8 + shl v3.4s, v3.4s, #24 + orr v11.16b, v3.16b, v11.16b + add v3.4s, v17.4s, v6.4s + add v17.4s, v3.4s, v13.4s + eor v0.16b, v17.16b, v0.16b + ushr v3.4s, v0.4s, #8 + shl v0.4s, v0.4s, #24 + add v1.4s, v16.4s, v1.4s + orr v0.16b, v0.16b, v3.16b + eor v3.16b, v1.16b, v8.16b + ushr v8.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + add v5.4s, v10.4s, v5.4s + orr v8.16b, v3.16b, v8.16b + eor v3.16b, v5.16b, v9.16b + add v4.4s, v11.4s, v4.4s + ushr v9.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + eor v31.16b, v4.16b, v31.16b + mov v7.16b, v23.16b + mov v23.16b, v28.16b + mov v28.16b, v6.16b + orr v3.16b, v3.16b, v9.16b + ushr v9.4s, v31.4s, #7 + shl v31.4s, v31.4s, #25 + ldur q6, [x29, #-64] + orr v31.16b, v31.16b, v9.16b + add v9.4s, v0.4s, v12.4s + eor v12.16b, v9.16b, v13.16b + ushr v13.4s, v12.4s, #7 + shl v12.4s, v12.4s, #25 + orr v12.16b, v12.16b, v13.16b + add v13.4s, v14.4s, v6.4s + add v13.4s, v13.4s, v3.4s + eor v0.16b, v13.16b, v0.16b + add v2.4s, v2.4s, v24.4s + rev32 v14.8h, v0.8h + add v0.4s, v2.4s, v31.4s + add v6.4s, v4.4s, v14.4s + eor v2.16b, v0.16b, v16.16b + eor v3.16b, v6.16b, v3.16b + rev32 v16.8h, v2.8h + ushr v4.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + add v2.4s, v9.4s, v16.4s + orr v4.16b, v3.16b, v4.16b + eor v3.16b, v2.16b, v31.16b + ushr v31.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + orr v3.16b, v3.16b, v31.16b + add v31.4s, v15.4s, v22.4s + add v31.4s, v31.4s, v12.4s + add v17.4s, v17.4s, v7.4s + eor v9.16b, v31.16b, v10.16b + add v17.4s, v17.4s, v8.4s + rev32 v9.8h, v9.8h + eor v11.16b, v17.16b, v11.16b + add v1.4s, v1.4s, v9.4s + rev32 v11.8h, v11.8h + eor v10.16b, v1.16b, v12.16b + add v5.4s, v5.4s, v11.4s + ushr v12.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v8.16b, v5.16b, v8.16b + orr v10.16b, v10.16b, v12.16b + ushr v12.4s, v8.4s, #12 + shl v8.4s, v8.4s, #20 + orr v8.16b, v8.16b, v12.16b + add v12.4s, v13.4s, v27.4s + add v12.4s, v12.4s, v4.4s + eor v13.16b, v12.16b, v14.16b + ldur q14, [x29, #-96] + mov v25.16b, v29.16b + add v29.4s, v12.4s, v20.4s + add v20.4s, v31.4s, v26.4s + add v0.4s, v0.4s, 
v14.4s + add v0.4s, v0.4s, v3.4s + eor v16.16b, v0.16b, v16.16b + add v0.4s, v0.4s, v30.4s + ldur q30, [x29, #-112] + add v20.4s, v20.4s, v10.4s + eor v31.16b, v20.16b, v9.16b + add v20.4s, v20.4s, v28.4s + add v17.4s, v17.4s, v30.4s + add v17.4s, v17.4s, v8.4s + eor v9.16b, v17.16b, v11.16b + ushr v28.4s, v13.4s, #8 + shl v11.4s, v13.4s, #24 + orr v28.16b, v11.16b, v28.16b + ushr v11.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + orr v16.16b, v16.16b, v11.16b + ushr v11.4s, v31.4s, #8 + shl v31.4s, v31.4s, #24 + add v6.4s, v28.4s, v6.4s + orr v31.16b, v31.16b, v11.16b + ushr v11.4s, v9.4s, #8 + shl v9.4s, v9.4s, #24 + add v2.4s, v16.4s, v2.4s + eor v4.16b, v6.16b, v4.16b + orr v9.16b, v9.16b, v11.16b + add v1.4s, v31.4s, v1.4s + eor v3.16b, v2.16b, v3.16b + ushr v11.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + add v5.4s, v9.4s, v5.4s + eor v10.16b, v1.16b, v10.16b + orr v4.16b, v4.16b, v11.16b + ushr v11.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + eor v8.16b, v5.16b, v8.16b + orr v3.16b, v3.16b, v11.16b + ushr v11.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + orr v10.16b, v10.16b, v11.16b + ushr v11.4s, v8.4s, #7 + shl v8.4s, v8.4s, #25 + orr v8.16b, v8.16b, v11.16b + add v29.4s, v29.4s, v8.4s + eor v16.16b, v29.16b, v16.16b + add v0.4s, v0.4s, v4.4s + mov v12.16b, v26.16b + add v17.4s, v17.4s, v19.4s + add v26.4s, v29.4s, v23.4s + eor v29.16b, v0.16b, v31.16b + add v20.4s, v20.4s, v3.4s + rev32 v16.8h, v16.8h + stur q18, [x29, #-176] + mov v18.16b, v27.16b + add v0.4s, v0.4s, v24.4s + eor v27.16b, v20.16b, v9.16b + add v17.4s, v17.4s, v10.4s + rev32 v24.8h, v29.8h + add v1.4s, v1.4s, v16.4s + add v20.4s, v20.4s, v25.4s + eor v25.16b, v17.16b, v28.16b + rev32 v27.8h, v27.8h + add v5.4s, v5.4s, v24.4s + eor v28.16b, v1.16b, v8.16b + rev32 v25.8h, v25.8h + add v6.4s, v6.4s, v27.4s + eor v4.16b, v5.16b, v4.16b + ushr v31.4s, v28.4s, #12 + shl v28.4s, v28.4s, #20 + add v2.4s, v2.4s, v25.4s + eor v3.16b, v6.16b, v3.16b + orr v28.16b, v28.16b, v31.16b + ushr v31.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + eor v29.16b, v2.16b, v10.16b + orr v4.16b, v4.16b, v31.16b + ushr v31.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + add v26.4s, v26.4s, v28.4s + orr v3.16b, v3.16b, v31.16b + ushr v31.4s, v29.4s, #12 + shl v29.4s, v29.4s, #20 + eor v16.16b, v26.16b, v16.16b + add v0.4s, v0.4s, v4.4s + add v17.4s, v17.4s, v12.4s + orr v29.16b, v29.16b, v31.16b + eor v24.16b, v0.16b, v24.16b + add v0.4s, v0.4s, v22.4s + add v20.4s, v20.4s, v3.4s + ushr v22.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + add v23.4s, v26.4s, v21.4s + eor v21.16b, v20.16b, v27.16b + add v17.4s, v17.4s, v29.4s + orr v16.16b, v16.16b, v22.16b + ushr v22.4s, v24.4s, #8 + shl v24.4s, v24.4s, #24 + eor v25.16b, v17.16b, v25.16b + orr v22.16b, v24.16b, v22.16b + ushr v24.4s, v21.4s, #8 + shl v21.4s, v21.4s, #24 + orr v21.16b, v21.16b, v24.16b + ushr v24.4s, v25.4s, #8 + shl v25.4s, v25.4s, #24 + add v1.4s, v16.4s, v1.4s + orr v24.16b, v25.16b, v24.16b + add v5.4s, v22.4s, v5.4s + eor v25.16b, v1.16b, v28.16b + add v6.4s, v21.4s, v6.4s + eor v4.16b, v5.16b, v4.16b + ushr v27.4s, v25.4s, #7 + shl v25.4s, v25.4s, #25 + add v2.4s, v24.4s, v2.4s + eor v3.16b, v6.16b, v3.16b + orr v25.16b, v25.16b, v27.16b + ushr v27.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + ldur q19, [x29, #-176] + eor v26.16b, v2.16b, v29.16b + orr v4.16b, v4.16b, v27.16b + ushr v27.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + orr v3.16b, v3.16b, v27.16b + ushr v27.4s, v26.4s, #7 + shl v26.4s, v26.4s, #25 + add v20.4s, v20.4s, v18.4s + add v17.4s, v17.4s, v30.4s + orr v26.16b, v26.16b, v27.16b + add v0.4s, 
v0.4s, v3.4s + eor v16.16b, v0.16b, v16.16b + add v0.4s, v0.4s, v19.4s + add v19.4s, v20.4s, v26.4s + add v17.4s, v17.4s, v25.4s + eor v20.16b, v19.16b, v22.16b + add v7.4s, v19.4s, v7.4s + eor v19.16b, v17.16b, v21.16b + ldur q21, [x29, #-64] + add v23.4s, v23.4s, v4.4s + eor v24.16b, v23.16b, v24.16b + rev32 v16.8h, v16.8h + add v17.4s, v17.4s, v21.4s + rev32 v21.8h, v24.8h + add v6.4s, v6.4s, v21.4s + rev32 v20.8h, v20.8h + add v2.4s, v2.4s, v16.4s + eor v4.16b, v6.16b, v4.16b + rev32 v19.8h, v19.8h + add v1.4s, v1.4s, v20.4s + eor v3.16b, v2.16b, v3.16b + ushr v24.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + add v5.4s, v5.4s, v19.4s + eor v22.16b, v1.16b, v26.16b + orr v4.16b, v4.16b, v24.16b + ushr v24.4s, v3.4s, #12 + shl v3.4s, v3.4s, #20 + add v18.4s, v23.4s, v14.4s + eor v23.16b, v5.16b, v25.16b + orr v3.16b, v3.16b, v24.16b + ushr v24.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + orr v22.16b, v22.16b, v24.16b + ushr v24.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + orr v23.16b, v23.16b, v24.16b + add v18.4s, v18.4s, v4.4s + add v0.4s, v0.4s, v3.4s + add v24.4s, v17.4s, v23.4s + eor v17.16b, v18.16b, v21.16b + add v7.4s, v7.4s, v22.4s + eor v16.16b, v0.16b, v16.16b + ushr v21.4s, v17.4s, #8 + shl v17.4s, v17.4s, #24 + eor v20.16b, v7.16b, v20.16b + orr v21.16b, v17.16b, v21.16b + ushr v17.4s, v16.4s, #8 + shl v16.4s, v16.4s, #24 + eor v19.16b, v24.16b, v19.16b + orr v16.16b, v16.16b, v17.16b + ushr v17.4s, v20.4s, #8 + shl v20.4s, v20.4s, #24 + orr v25.16b, v20.16b, v17.16b + ushr v17.4s, v19.4s, #8 + shl v19.4s, v19.4s, #24 + orr v19.16b, v19.16b, v17.16b + add v1.4s, v25.4s, v1.4s + eor v22.16b, v1.16b, v22.16b + eor v20.16b, v1.16b, v18.16b + add v1.4s, v19.4s, v5.4s + eor v26.16b, v1.16b, v0.16b + add v0.4s, v21.4s, v6.4s + eor v5.16b, v1.16b, v23.16b + eor v1.16b, v0.16b, v4.16b + eor v17.16b, v0.16b, v7.16b + add v0.4s, v16.4s, v2.4s + eor v2.16b, v0.16b, v3.16b + eor v6.16b, v0.16b, v24.16b + ushr v0.4s, v1.4s, #7 + shl v1.4s, v1.4s, #25 + orr v0.16b, v1.16b, v0.16b + ushr v1.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v1.16b, v2.16b, v1.16b + ushr v2.4s, v22.4s, #7 + shl v3.4s, v22.4s, #25 + orr v2.16b, v3.16b, v2.16b + ushr v3.4s, v5.4s, #7 + shl v4.4s, v5.4s, #25 + orr v3.16b, v4.16b, v3.16b + eor v8.16b, v16.16b, v3.16b + eor v9.16b, v25.16b, v0.16b + eor v31.16b, v1.16b, v19.16b + cmp x17, x22 + eor v15.16b, v2.16b, v21.16b + mov w18, w19 + b.ne .LBB2_4 +.LBB2_7: + zip1 v0.4s, v20.4s, v26.4s + zip2 v1.4s, v20.4s, v26.4s + zip1 v2.4s, v17.4s, v6.4s + zip2 v3.4s, v17.4s, v6.4s + zip1 v4.4s, v8.4s, v9.4s + zip2 v5.4s, v8.4s, v9.4s + zip1 v6.4s, v31.4s, v15.4s + zip2 v7.4s, v31.4s, v15.4s + add x13, x20, #4 + tst w5, #0x1 + sub x28, x28, #4 + zip1 v16.2d, v0.2d, v2.2d + zip2 v0.2d, v0.2d, v2.2d + zip1 v2.2d, v1.2d, v3.2d + zip2 v1.2d, v1.2d, v3.2d + zip1 v3.2d, v4.2d, v6.2d + zip2 v4.2d, v4.2d, v6.2d + zip1 v6.2d, v5.2d, v7.2d + zip2 v5.2d, v5.2d, v7.2d + add x24, x24, #32 + csel x20, x13, x20, ne + cmp x28, #3 + stp q16, q3, [x26] + stp q0, q4, [x26, #32] + stp q2, q6, [x26, #64] + stp q1, q5, [x26, #96] + add x26, x26, #128 + b.hi .LBB2_2 +.LBB2_8: + cbz x28, .LBB2_16 + orr w8, w7, w19 + and x21, x5, #0x1 + stur w8, [x29, #-64] +.LBB2_10: + ldr x8, [sp, #40] + ldr x25, [x24] + ldur w4, [x29, #-64] + ldp q1, q0, [x8] + mov x8, x22 + stp q1, q0, [x29, #-48] +.LBB2_11: + subs x23, x8, #1 + b.eq .LBB2_13 + cbnz x8, .LBB2_14 + b .LBB2_15 +.LBB2_13: + orr w4, w4, w27 +.LBB2_14: + sub x0, x29, #48 + mov w2, #64 + mov x1, x25 + mov x3, x20 + bl zfs_blake3_compress_in_place_sse2 + add 
x25, x25, #64
+ mov x8, x23
+ mov w4, w19
+ b .LBB2_11
+.LBB2_15:
+ ldp q0, q1, [x29, #-48]
+ add x20, x20, x21
+ add x24, x24, #8
+ subs x28, x28, #1
+ stp q0, q1, [x26], #32
+ b.ne .LBB2_10
+.LBB2_16:
+ add sp, sp, #384
+ ldp x20, x19, [sp, #144]
+ ldp x22, x21, [sp, #128]
+ ldp x24, x23, [sp, #112]
+ ldp x26, x25, [sp, #96]
+ ldp x28, x27, [sp, #80]
+ ldp x29, x30, [sp, #64]
+ ldp d9, d8, [sp, #48]
+ ldp d11, d10, [sp, #32]
+ ldp d13, d12, [sp, #16]
+ ldp d15, d14, [sp], #160
+ ret
+.Lfunc_end2:
+ .size zfs_blake3_hash_many_sse2, .Lfunc_end2-zfs_blake3_hash_many_sse2
+ .cfi_endproc
+ .section ".note.GNU-stack","",@progbits
+#endif
diff --git a/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
new file mode 100644
index 000000000000..eb6946400b8a
--- /dev/null
+++ b/module/icp/asm-aarch64/blake3/b3_aarch64_sse41.S
@@ -0,0 +1,2463 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2022 Samuel Neves
+ * Copyright (c) 2022 Tino Reichardt
+ *
+ * This is converted assembly: SSE4.1 -> ARMv8-A
+ * Used tools: SIMDe https://github.com/simd-everywhere/simde
+ */
+
+#if defined(__aarch64__)
+ .text
+ .section .rodata.cst16,"aM",@progbits,16
+ .p2align 4
+.LCPI0_0:
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 1
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 5
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 9
+ .byte 14
+ .byte 15
+ .byte 12
+ .byte 13
+.LCPI0_1:
+ .word 1779033703
+ .word 3144134277
+ .word 1013904242
+ .word 2773480762
+.LCPI0_2:
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 0
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 4
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 8
+ .byte 13
+ .byte 14
+ .byte 15
+ .byte 12
+.LCPI0_3:
+ .byte 0
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 20
+ .byte 21
+ .byte 22
+ .byte 23
+ .byte 8
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 28
+ .byte 29
+ .byte 30
+ .byte 31
+.LCPI0_4:
+ .byte 0
+ .byte 1
+ .byte 2
+ .byte 3
+ .byte 4
+ .byte 5
+ .byte 6
+ .byte 7
+ .byte 8
+ .byte 9
+ .byte 10
+ .byte 11
+ .byte 28
+ .byte 29
+ .byte 30
+ .byte 31
+ .text
+ .globl zfs_blake3_compress_in_place_sse41
+ .p2align 2
+ .type zfs_blake3_compress_in_place_sse41,@function
+zfs_blake3_compress_in_place_sse41:
+ .cfi_startproc
+ ldp q7, q6, [x0]
+ ldp q17, q18, [x1]
+ add x12, x1, #32
+ ld2 { v4.4s, v5.4s }, [x12]
+ lsr x10, x3, #32
+ fmov s16, w3
+ adrp x13, .LCPI0_0
+ adrp x11, .LCPI0_1
+ and w8, w2, #0xff
+ mov v16.s[1], w10
+ ldr q0, [x13, :lo12:.LCPI0_0]
+ ldr q20, [x11, :lo12:.LCPI0_1]
+ adrp x11, .LCPI0_4
+ and w9, w4, #0xff
+ ldr q2, [x11, :lo12:.LCPI0_4]
+ mov v16.s[2], w8
+ uzp1 v21.4s, v17.4s, v18.4s
+ add v7.4s, v6.4s, v7.4s
+ adrp x12, .LCPI0_3
+ mov v16.s[3], w9
+ uzp2 v18.4s, v17.4s,
v18.4s + add v7.4s, v7.4s, v21.4s + ext v17.16b, v5.16b, v5.16b, #12 + ldr q3, [x12, :lo12:.LCPI0_3] + ext v24.16b, v4.16b, v4.16b, #12 + eor v16.16b, v7.16b, v16.16b + mov v27.16b, v17.16b + uzp1 v19.4s, v21.4s, v21.4s + ext v25.16b, v21.16b, v21.16b, #12 + zip2 v28.4s, v18.4s, v17.4s + tbl v29.16b, { v16.16b }, v0.16b + mov v27.s[1], v24.s[2] + zip1 v23.2d, v17.2d, v18.2d + ext v19.16b, v19.16b, v21.16b, #8 + add v22.4s, v29.4s, v20.4s + ext v26.16b, v21.16b, v25.16b, #12 + tbl v20.16b, { v23.16b, v24.16b }, v2.16b + zip1 v21.4s, v28.4s, v24.4s + zip1 v23.4s, v24.4s, v28.4s + uzp2 v19.4s, v19.4s, v18.4s + eor v24.16b, v22.16b, v6.16b + ext v25.16b, v20.16b, v20.16b, #12 + ext v6.16b, v23.16b, v21.16b, #8 + add v7.4s, v7.4s, v18.4s + ext v18.16b, v19.16b, v19.16b, #4 + tbl v16.16b, { v26.16b, v27.16b }, v3.16b + uzp1 v21.4s, v20.4s, v25.4s + mov v26.16b, v6.16b + ext v23.16b, v18.16b, v18.16b, #12 + mov v26.s[1], v21.s[2] + adrp x10, .LCPI0_2 + ext v25.16b, v18.16b, v23.16b, #12 + uzp1 v23.4s, v18.4s, v18.4s + ldr q1, [x10, :lo12:.LCPI0_2] + ext v18.16b, v23.16b, v18.16b, #8 + ushr v23.4s, v24.4s, #12 + shl v24.4s, v24.4s, #20 + orr v23.16b, v24.16b, v23.16b + add v7.4s, v7.4s, v23.4s + eor v27.16b, v29.16b, v7.16b + add v4.4s, v7.4s, v4.4s + tbl v7.16b, { v25.16b, v26.16b }, v3.16b + tbl v26.16b, { v27.16b }, v1.16b + add v22.4s, v22.4s, v26.4s + uzp2 v18.4s, v18.4s, v16.4s + eor v23.16b, v23.16b, v22.16b + ext v5.16b, v18.16b, v18.16b, #4 + ushr v27.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + uzp1 v25.4s, v5.4s, v5.4s + orr v23.16b, v23.16b, v27.16b + ext v28.16b, v4.16b, v4.16b, #12 + ext v4.16b, v25.16b, v5.16b, #8 + ext v25.16b, v26.16b, v26.16b, #8 + add v26.4s, v28.4s, v23.4s + eor v25.16b, v26.16b, v25.16b + ext v22.16b, v22.16b, v22.16b, #4 + tbl v25.16b, { v25.16b }, v0.16b + add v22.4s, v22.4s, v25.4s + eor v23.16b, v23.16b, v22.16b + add v17.4s, v26.4s, v17.4s + ushr v26.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + orr v23.16b, v23.16b, v26.16b + add v17.4s, v17.4s, v23.4s + eor v25.16b, v25.16b, v17.16b + add v17.4s, v17.4s, v19.4s + tbl v19.16b, { v25.16b }, v1.16b + add v22.4s, v22.4s, v19.4s + eor v23.16b, v23.16b, v22.16b + ushr v25.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + ext v17.16b, v17.16b, v17.16b, #4 + orr v23.16b, v23.16b, v25.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v17.4s, v17.4s, v23.4s + eor v19.16b, v17.16b, v19.16b + ext v22.16b, v22.16b, v22.16b, #12 + tbl v19.16b, { v19.16b }, v0.16b + add v22.4s, v22.4s, v19.4s + eor v23.16b, v23.16b, v22.16b + ushr v25.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + add v17.4s, v17.4s, v16.4s + orr v23.16b, v23.16b, v25.16b + add v17.4s, v17.4s, v23.4s + ext v25.16b, v17.16b, v17.16b, #12 + eor v17.16b, v19.16b, v17.16b + tbl v17.16b, { v17.16b }, v1.16b + add v19.4s, v22.4s, v17.4s + eor v22.16b, v23.16b, v19.16b + add v25.4s, v25.4s, v21.4s + zip1 v20.2d, v6.2d, v16.2d + ushr v23.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + zip2 v24.4s, v16.4s, v6.4s + tbl v26.16b, { v20.16b, v21.16b }, v2.16b + orr v22.16b, v22.16b, v23.16b + zip1 v16.4s, v24.4s, v21.4s + zip1 v20.4s, v21.4s, v24.4s + ext v21.16b, v26.16b, v26.16b, #12 + ext v17.16b, v17.16b, v17.16b, #8 + add v25.4s, v25.4s, v22.4s + ext v16.16b, v20.16b, v16.16b, #8 + uzp1 v21.4s, v26.4s, v21.4s + eor v26.16b, v25.16b, v17.16b + ext v19.16b, v19.16b, v19.16b, #4 + tbl v26.16b, { v26.16b }, v0.16b + mov v29.16b, v16.16b + add v19.4s, v19.4s, v26.4s + ext v27.16b, v5.16b, v5.16b, #12 + mov v29.s[1], v21.s[2] + eor v22.16b, v22.16b, v19.16b + ext v28.16b, v5.16b, 
v27.16b, #12 + ushr v27.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + add v6.4s, v25.4s, v6.4s + orr v22.16b, v22.16b, v27.16b + add v6.4s, v6.4s, v22.4s + eor v26.16b, v26.16b, v6.16b + add v6.4s, v6.4s, v18.4s + tbl v18.16b, { v26.16b }, v1.16b + add v19.4s, v19.4s, v18.4s + eor v22.16b, v22.16b, v19.16b + ushr v26.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v22.16b, v22.16b, v26.16b + ext v18.16b, v18.16b, v18.16b, #8 + add v6.4s, v6.4s, v22.4s + eor v18.16b, v6.16b, v18.16b + ext v19.16b, v19.16b, v19.16b, #12 + tbl v18.16b, { v18.16b }, v0.16b + add v19.4s, v19.4s, v18.4s + eor v22.16b, v22.16b, v19.16b + ushr v26.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + add v6.4s, v6.4s, v7.4s + orr v22.16b, v22.16b, v26.16b + add v6.4s, v6.4s, v22.4s + ext v26.16b, v6.16b, v6.16b, #12 + eor v6.16b, v18.16b, v6.16b + uzp2 v4.4s, v4.4s, v7.4s + zip2 v25.4s, v7.4s, v16.4s + add v26.4s, v26.4s, v21.4s + zip1 v20.2d, v16.2d, v7.2d + tbl v6.16b, { v6.16b }, v1.16b + ext v24.16b, v4.16b, v4.16b, #4 + tbl v27.16b, { v20.16b, v21.16b }, v2.16b + zip1 v7.4s, v25.4s, v21.4s + zip1 v20.4s, v21.4s, v25.4s + add v18.4s, v19.4s, v6.4s + uzp1 v5.4s, v24.4s, v24.4s + ext v21.16b, v27.16b, v27.16b, #12 + ext v7.16b, v20.16b, v7.16b, #8 + eor v19.16b, v22.16b, v18.16b + ext v5.16b, v5.16b, v24.16b, #8 + tbl v17.16b, { v28.16b, v29.16b }, v3.16b + uzp1 v21.4s, v27.4s, v21.4s + mov v28.16b, v7.16b + ushr v22.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v23.16b, v24.16b, v24.16b, #12 + uzp2 v5.4s, v5.4s, v17.4s + mov v28.s[1], v21.s[2] + orr v19.16b, v19.16b, v22.16b + ext v27.16b, v24.16b, v23.16b, #12 + ext v23.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #8 + ext v25.16b, v18.16b, v18.16b, #4 + add v18.4s, v26.4s, v19.4s + uzp1 v24.4s, v23.4s, v23.4s + eor v6.16b, v18.16b, v6.16b + ext v24.16b, v24.16b, v23.16b, #8 + add v16.4s, v18.4s, v16.4s + tbl v18.16b, { v27.16b, v28.16b }, v3.16b + tbl v27.16b, { v6.16b }, v0.16b + uzp2 v6.4s, v24.4s, v18.4s + add v24.4s, v25.4s, v27.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v19.16b, v19.16b, v25.16b + add v16.4s, v16.4s, v19.4s + eor v25.16b, v27.16b, v16.16b + add v4.4s, v16.4s, v4.4s + tbl v16.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v16.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v4.16b, v4.16b, v4.16b, #4 + orr v19.16b, v19.16b, v25.16b + ext v16.16b, v16.16b, v16.16b, #8 + add v4.4s, v4.4s, v19.4s + eor v16.16b, v4.16b, v16.16b + ext v24.16b, v24.16b, v24.16b, #12 + tbl v25.16b, { v16.16b }, v0.16b + add v24.4s, v24.4s, v25.4s + eor v16.16b, v19.16b, v24.16b + ushr v19.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + add v4.4s, v4.4s, v17.4s + orr v19.16b, v16.16b, v19.16b + add v27.4s, v4.4s, v19.4s + eor v25.16b, v25.16b, v27.16b + tbl v25.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v25.4s + zip2 v26.4s, v17.4s, v7.4s + ext v4.16b, v27.16b, v27.16b, #12 + eor v19.16b, v19.16b, v24.16b + add v28.4s, v4.4s, v21.4s + zip1 v20.2d, v7.2d, v17.2d + zip1 v4.4s, v26.4s, v21.4s + zip1 v17.4s, v21.4s, v26.4s + ushr v26.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v19.16b, v19.16b, v26.16b + ext v25.16b, v25.16b, v25.16b, #8 + add v27.4s, v28.4s, v19.4s + eor v25.16b, v27.16b, v25.16b + ext v24.16b, v24.16b, v24.16b, #4 + tbl v25.16b, { v25.16b }, v0.16b + add v24.4s, v24.4s, v25.4s + eor v19.16b, v19.16b, v24.16b + add v7.4s, v27.4s, v7.4s + ushr v27.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v19.16b, v19.16b, v27.16b + 
add v7.4s, v7.4s, v19.4s + eor v25.16b, v25.16b, v7.16b + add v5.4s, v7.4s, v5.4s + tbl v7.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v7.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v19.16b, v19.16b, v25.16b + ext v7.16b, v7.16b, v7.16b, #8 + add v5.4s, v5.4s, v19.4s + eor v7.16b, v5.16b, v7.16b + ext v24.16b, v24.16b, v24.16b, #12 + tbl v7.16b, { v7.16b }, v0.16b + add v24.4s, v24.4s, v7.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + tbl v16.16b, { v20.16b, v21.16b }, v2.16b + add v5.4s, v5.4s, v18.4s + orr v19.16b, v19.16b, v25.16b + ext v20.16b, v16.16b, v16.16b, #12 + ext v4.16b, v17.16b, v4.16b, #8 + add v5.4s, v5.4s, v19.4s + uzp1 v21.4s, v16.4s, v20.4s + mov v17.16b, v4.16b + ext v25.16b, v5.16b, v5.16b, #12 + mov v17.s[1], v21.s[2] + add v25.4s, v25.4s, v21.4s + zip1 v20.2d, v4.2d, v18.2d + ext v22.16b, v23.16b, v23.16b, #12 + zip2 v26.4s, v18.4s, v4.4s + tbl v18.16b, { v20.16b, v21.16b }, v2.16b + eor v5.16b, v7.16b, v5.16b + ext v16.16b, v23.16b, v22.16b, #12 + ext v22.16b, v6.16b, v6.16b, #4 + zip1 v27.4s, v26.4s, v21.4s + zip1 v20.4s, v21.4s, v26.4s + ext v21.16b, v18.16b, v18.16b, #12 + tbl v5.16b, { v5.16b }, v1.16b + ext v20.16b, v20.16b, v27.16b, #8 + uzp1 v27.4s, v18.4s, v21.4s + uzp1 v18.4s, v22.4s, v22.4s + add v21.4s, v24.4s, v5.4s + ext v18.16b, v18.16b, v22.16b, #8 + eor v19.16b, v19.16b, v21.16b + tbl v7.16b, { v16.16b, v17.16b }, v3.16b + uzp2 v18.4s, v18.4s, v17.4s + zip2 v16.4s, v16.4s, v20.4s + ushr v17.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v17.16b, v19.16b, v17.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v19.4s, v25.4s, v17.4s + eor v5.16b, v19.16b, v5.16b + ext v21.16b, v21.16b, v21.16b, #4 + tbl v5.16b, { v5.16b }, v0.16b + add v4.4s, v19.4s, v4.4s + add v19.4s, v21.4s, v5.4s + eor v17.16b, v17.16b, v19.16b + ushr v21.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + orr v17.16b, v17.16b, v21.16b + add v4.4s, v4.4s, v17.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + add v4.4s, v4.4s, v6.4s + add v6.4s, v19.4s, v5.4s + eor v17.16b, v17.16b, v6.16b + ushr v19.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + ext v4.16b, v4.16b, v4.16b, #4 + orr v17.16b, v17.16b, v19.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v4.4s, v17.4s + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #12 + tbl v5.16b, { v5.16b }, v0.16b + add v6.4s, v6.4s, v5.4s + eor v17.16b, v17.16b, v6.16b + ushr v19.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v4.4s, v4.4s, v7.4s + orr v17.16b, v17.16b, v19.16b + add v4.4s, v4.4s, v17.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + mov v29.16b, v20.16b + ext v4.16b, v4.16b, v4.16b, #12 + add v6.4s, v6.4s, v5.4s + mov v29.s[1], v27.s[2] + add v4.4s, v4.4s, v27.4s + zip1 v26.2d, v20.2d, v7.2d + zip1 v7.4s, v16.4s, v27.4s + zip1 v16.4s, v27.4s, v16.4s + eor v17.16b, v17.16b, v6.16b + ext v7.16b, v16.16b, v7.16b, #8 + ushr v16.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + orr v16.16b, v17.16b, v16.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v4.4s, v16.4s + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #4 + tbl v5.16b, { v5.16b }, v0.16b + add v6.4s, v6.4s, v5.4s + eor v16.16b, v16.16b, v6.16b + ushr v17.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + add v4.4s, v4.4s, v20.4s + orr v16.16b, v16.16b, v17.16b + add v4.4s, v4.4s, v16.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + add v6.4s, v6.4s, v5.4s + eor v16.16b, v16.16b, v6.16b + add v4.4s, v4.4s, 
v18.4s + ushr v17.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + ext v23.16b, v22.16b, v22.16b, #12 + ext v4.16b, v4.16b, v4.16b, #4 + orr v16.16b, v16.16b, v17.16b + ext v28.16b, v22.16b, v23.16b, #12 + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v16.4s, v4.4s + tbl v3.16b, { v28.16b, v29.16b }, v3.16b + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #12 + add v3.4s, v4.4s, v3.4s + tbl v4.16b, { v5.16b }, v0.16b + add v5.4s, v6.4s, v4.4s + eor v6.16b, v16.16b, v5.16b + ushr v16.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + orr v6.16b, v6.16b, v16.16b + tbl v2.16b, { v26.16b, v27.16b }, v2.16b + add v3.4s, v3.4s, v6.4s + ext v19.16b, v2.16b, v2.16b, #12 + eor v4.16b, v4.16b, v3.16b + uzp1 v2.4s, v2.4s, v19.4s + ext v3.16b, v3.16b, v3.16b, #12 + tbl v4.16b, { v4.16b }, v1.16b + add v2.4s, v3.4s, v2.4s + add v3.4s, v5.4s, v4.4s + eor v5.16b, v6.16b, v3.16b + ushr v6.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v6.16b + ext v4.16b, v4.16b, v4.16b, #8 + add v2.4s, v2.4s, v5.4s + eor v4.16b, v2.16b, v4.16b + ext v3.16b, v3.16b, v3.16b, #4 + tbl v0.16b, { v4.16b }, v0.16b + add v3.4s, v3.4s, v0.4s + eor v4.16b, v5.16b, v3.16b + ushr v5.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + add v2.4s, v2.4s, v7.4s + orr v4.16b, v4.16b, v5.16b + add v2.4s, v2.4s, v4.4s + eor v0.16b, v0.16b, v2.16b + tbl v0.16b, { v0.16b }, v1.16b + add v1.4s, v3.4s, v0.4s + eor v3.16b, v4.16b, v1.16b + ext v2.16b, v2.16b, v2.16b, #4 + ext v1.16b, v1.16b, v1.16b, #12 + ushr v4.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + ext v0.16b, v0.16b, v0.16b, #8 + eor v1.16b, v2.16b, v1.16b + orr v2.16b, v3.16b, v4.16b + eor v0.16b, v2.16b, v0.16b + stp q1, q0, [x0] + ret +.Lfunc_end0: + .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-zfs_blake3_compress_in_place_sse41 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI1_0: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI1_1: + .word 1779033703 + .word 3144134277 + .word 1013904242 + .word 2773480762 +.LCPI1_2: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 +.LCPI1_3: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 20 + .byte 21 + .byte 22 + .byte 23 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 28 + .byte 29 + .byte 30 + .byte 31 +.LCPI1_4: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 28 + .byte 29 + .byte 30 + .byte 31 + .text + .globl zfs_blake3_compress_xof_sse41 + .p2align 2 + .type zfs_blake3_compress_xof_sse41,@function +zfs_blake3_compress_xof_sse41: + .cfi_startproc + ldp q7, q6, [x0] + ldp q17, q18, [x1] + add x12, x1, #32 + ld2 { v4.4s, v5.4s }, [x12] + lsr x10, x3, #32 + fmov s16, w3 + adrp x13, .LCPI1_0 + adrp x11, .LCPI1_1 + and w8, w2, #0xff + mov v16.s[1], w10 + ldr q0, [x13, :lo12:.LCPI1_0] + ldr q20, [x11, :lo12:.LCPI1_1] + adrp x11, .LCPI1_4 + and w9, w4, #0xff + ldr q2, [x11, :lo12:.LCPI1_4] + mov v16.s[2], w8 + uzp1 v21.4s, v17.4s, v18.4s + add v7.4s, v6.4s, v7.4s + adrp x12, .LCPI1_3 + mov v16.s[3], w9 + uzp2 v18.4s, v17.4s, v18.4s + add v7.4s, v7.4s, v21.4s + ext v17.16b, v5.16b, v5.16b, #12 + ldr q3, [x12, :lo12:.LCPI1_3] + ext v24.16b, v4.16b, v4.16b, #12 + eor v16.16b, v7.16b, v16.16b + mov v27.16b, v17.16b + uzp1 v19.4s, v21.4s, v21.4s + ext v25.16b, v21.16b, v21.16b, #12 + zip2 v28.4s, 
v18.4s, v17.4s + tbl v29.16b, { v16.16b }, v0.16b + mov v27.s[1], v24.s[2] + zip1 v23.2d, v17.2d, v18.2d + ext v19.16b, v19.16b, v21.16b, #8 + add v22.4s, v29.4s, v20.4s + ext v26.16b, v21.16b, v25.16b, #12 + tbl v20.16b, { v23.16b, v24.16b }, v2.16b + zip1 v21.4s, v28.4s, v24.4s + zip1 v23.4s, v24.4s, v28.4s + uzp2 v19.4s, v19.4s, v18.4s + eor v24.16b, v22.16b, v6.16b + ext v25.16b, v20.16b, v20.16b, #12 + ext v6.16b, v23.16b, v21.16b, #8 + add v7.4s, v7.4s, v18.4s + ext v18.16b, v19.16b, v19.16b, #4 + tbl v16.16b, { v26.16b, v27.16b }, v3.16b + uzp1 v21.4s, v20.4s, v25.4s + mov v26.16b, v6.16b + ext v23.16b, v18.16b, v18.16b, #12 + mov v26.s[1], v21.s[2] + adrp x10, .LCPI1_2 + ext v25.16b, v18.16b, v23.16b, #12 + uzp1 v23.4s, v18.4s, v18.4s + ldr q1, [x10, :lo12:.LCPI1_2] + ext v18.16b, v23.16b, v18.16b, #8 + ushr v23.4s, v24.4s, #12 + shl v24.4s, v24.4s, #20 + orr v23.16b, v24.16b, v23.16b + add v7.4s, v7.4s, v23.4s + eor v27.16b, v29.16b, v7.16b + add v4.4s, v7.4s, v4.4s + tbl v7.16b, { v25.16b, v26.16b }, v3.16b + tbl v26.16b, { v27.16b }, v1.16b + add v22.4s, v22.4s, v26.4s + uzp2 v18.4s, v18.4s, v16.4s + eor v23.16b, v23.16b, v22.16b + ext v5.16b, v18.16b, v18.16b, #4 + ushr v27.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + uzp1 v25.4s, v5.4s, v5.4s + orr v23.16b, v23.16b, v27.16b + ext v28.16b, v4.16b, v4.16b, #12 + ext v4.16b, v25.16b, v5.16b, #8 + ext v25.16b, v26.16b, v26.16b, #8 + add v26.4s, v28.4s, v23.4s + eor v25.16b, v26.16b, v25.16b + ext v22.16b, v22.16b, v22.16b, #4 + tbl v25.16b, { v25.16b }, v0.16b + add v22.4s, v22.4s, v25.4s + eor v23.16b, v23.16b, v22.16b + add v17.4s, v26.4s, v17.4s + ushr v26.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + orr v23.16b, v23.16b, v26.16b + add v17.4s, v17.4s, v23.4s + eor v25.16b, v25.16b, v17.16b + add v17.4s, v17.4s, v19.4s + tbl v19.16b, { v25.16b }, v1.16b + add v22.4s, v22.4s, v19.4s + eor v23.16b, v23.16b, v22.16b + ushr v25.4s, v23.4s, #7 + shl v23.4s, v23.4s, #25 + ext v17.16b, v17.16b, v17.16b, #4 + orr v23.16b, v23.16b, v25.16b + ext v19.16b, v19.16b, v19.16b, #8 + add v17.4s, v17.4s, v23.4s + eor v19.16b, v17.16b, v19.16b + ext v22.16b, v22.16b, v22.16b, #12 + tbl v19.16b, { v19.16b }, v0.16b + add v22.4s, v22.4s, v19.4s + eor v23.16b, v23.16b, v22.16b + ushr v25.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + add v17.4s, v17.4s, v16.4s + orr v23.16b, v23.16b, v25.16b + add v17.4s, v17.4s, v23.4s + ext v25.16b, v17.16b, v17.16b, #12 + eor v17.16b, v19.16b, v17.16b + tbl v17.16b, { v17.16b }, v1.16b + add v19.4s, v22.4s, v17.4s + eor v22.16b, v23.16b, v19.16b + add v25.4s, v25.4s, v21.4s + zip1 v20.2d, v6.2d, v16.2d + ushr v23.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + zip2 v24.4s, v16.4s, v6.4s + tbl v26.16b, { v20.16b, v21.16b }, v2.16b + orr v22.16b, v22.16b, v23.16b + zip1 v16.4s, v24.4s, v21.4s + zip1 v20.4s, v21.4s, v24.4s + ext v21.16b, v26.16b, v26.16b, #12 + ext v17.16b, v17.16b, v17.16b, #8 + add v25.4s, v25.4s, v22.4s + ext v16.16b, v20.16b, v16.16b, #8 + uzp1 v21.4s, v26.4s, v21.4s + eor v26.16b, v25.16b, v17.16b + ext v19.16b, v19.16b, v19.16b, #4 + tbl v26.16b, { v26.16b }, v0.16b + mov v29.16b, v16.16b + add v19.4s, v19.4s, v26.4s + ext v27.16b, v5.16b, v5.16b, #12 + mov v29.s[1], v21.s[2] + eor v22.16b, v22.16b, v19.16b + ext v28.16b, v5.16b, v27.16b, #12 + ushr v27.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + add v6.4s, v25.4s, v6.4s + orr v22.16b, v22.16b, v27.16b + add v6.4s, v6.4s, v22.4s + eor v26.16b, v26.16b, v6.16b + add v6.4s, v6.4s, v18.4s + tbl v18.16b, { v26.16b }, v1.16b + add v19.4s, v19.4s, v18.4s + eor 
v22.16b, v22.16b, v19.16b + ushr v26.4s, v22.4s, #7 + shl v22.4s, v22.4s, #25 + ext v6.16b, v6.16b, v6.16b, #4 + orr v22.16b, v22.16b, v26.16b + ext v18.16b, v18.16b, v18.16b, #8 + add v6.4s, v6.4s, v22.4s + eor v18.16b, v6.16b, v18.16b + ext v19.16b, v19.16b, v19.16b, #12 + tbl v18.16b, { v18.16b }, v0.16b + add v19.4s, v19.4s, v18.4s + eor v22.16b, v22.16b, v19.16b + ushr v26.4s, v22.4s, #12 + shl v22.4s, v22.4s, #20 + add v6.4s, v6.4s, v7.4s + orr v22.16b, v22.16b, v26.16b + add v6.4s, v6.4s, v22.4s + ext v26.16b, v6.16b, v6.16b, #12 + eor v6.16b, v18.16b, v6.16b + uzp2 v4.4s, v4.4s, v7.4s + zip2 v25.4s, v7.4s, v16.4s + add v26.4s, v26.4s, v21.4s + zip1 v20.2d, v16.2d, v7.2d + tbl v6.16b, { v6.16b }, v1.16b + ext v24.16b, v4.16b, v4.16b, #4 + tbl v27.16b, { v20.16b, v21.16b }, v2.16b + zip1 v7.4s, v25.4s, v21.4s + zip1 v20.4s, v21.4s, v25.4s + add v18.4s, v19.4s, v6.4s + uzp1 v5.4s, v24.4s, v24.4s + ext v21.16b, v27.16b, v27.16b, #12 + ext v7.16b, v20.16b, v7.16b, #8 + eor v19.16b, v22.16b, v18.16b + ext v5.16b, v5.16b, v24.16b, #8 + tbl v17.16b, { v28.16b, v29.16b }, v3.16b + uzp1 v21.4s, v27.4s, v21.4s + mov v28.16b, v7.16b + ushr v22.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v23.16b, v24.16b, v24.16b, #12 + uzp2 v5.4s, v5.4s, v17.4s + mov v28.s[1], v21.s[2] + orr v19.16b, v19.16b, v22.16b + ext v27.16b, v24.16b, v23.16b, #12 + ext v23.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #8 + ext v25.16b, v18.16b, v18.16b, #4 + add v18.4s, v26.4s, v19.4s + uzp1 v24.4s, v23.4s, v23.4s + eor v6.16b, v18.16b, v6.16b + ext v24.16b, v24.16b, v23.16b, #8 + add v16.4s, v18.4s, v16.4s + tbl v18.16b, { v27.16b, v28.16b }, v3.16b + tbl v27.16b, { v6.16b }, v0.16b + uzp2 v6.4s, v24.4s, v18.4s + add v24.4s, v25.4s, v27.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v19.16b, v19.16b, v25.16b + add v16.4s, v16.4s, v19.4s + eor v25.16b, v27.16b, v16.16b + add v4.4s, v16.4s, v4.4s + tbl v16.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v16.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v4.16b, v4.16b, v4.16b, #4 + orr v19.16b, v19.16b, v25.16b + ext v16.16b, v16.16b, v16.16b, #8 + add v4.4s, v4.4s, v19.4s + eor v16.16b, v4.16b, v16.16b + ext v24.16b, v24.16b, v24.16b, #12 + tbl v25.16b, { v16.16b }, v0.16b + add v24.4s, v24.4s, v25.4s + eor v16.16b, v19.16b, v24.16b + ushr v19.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + add v4.4s, v4.4s, v17.4s + orr v19.16b, v16.16b, v19.16b + add v27.4s, v4.4s, v19.4s + eor v25.16b, v25.16b, v27.16b + tbl v25.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v25.4s + zip2 v26.4s, v17.4s, v7.4s + ext v4.16b, v27.16b, v27.16b, #12 + eor v19.16b, v19.16b, v24.16b + add v28.4s, v4.4s, v21.4s + zip1 v20.2d, v7.2d, v17.2d + zip1 v4.4s, v26.4s, v21.4s + zip1 v17.4s, v21.4s, v26.4s + ushr v26.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v19.16b, v19.16b, v26.16b + ext v25.16b, v25.16b, v25.16b, #8 + add v27.4s, v28.4s, v19.4s + eor v25.16b, v27.16b, v25.16b + ext v24.16b, v24.16b, v24.16b, #4 + tbl v25.16b, { v25.16b }, v0.16b + add v24.4s, v24.4s, v25.4s + eor v19.16b, v19.16b, v24.16b + add v7.4s, v27.4s, v7.4s + ushr v27.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v19.16b, v19.16b, v27.16b + add v7.4s, v7.4s, v19.4s + eor v25.16b, v25.16b, v7.16b + add v5.4s, v7.4s, v5.4s + tbl v7.16b, { v25.16b }, v1.16b + add v24.4s, v24.4s, v7.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + ext v5.16b, v5.16b, v5.16b, #4 + orr v19.16b, 
v19.16b, v25.16b + ext v7.16b, v7.16b, v7.16b, #8 + add v5.4s, v5.4s, v19.4s + eor v7.16b, v5.16b, v7.16b + ext v24.16b, v24.16b, v24.16b, #12 + tbl v7.16b, { v7.16b }, v0.16b + add v24.4s, v24.4s, v7.4s + eor v19.16b, v19.16b, v24.16b + ushr v25.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + tbl v16.16b, { v20.16b, v21.16b }, v2.16b + add v5.4s, v5.4s, v18.4s + orr v19.16b, v19.16b, v25.16b + ext v20.16b, v16.16b, v16.16b, #12 + ext v4.16b, v17.16b, v4.16b, #8 + add v5.4s, v5.4s, v19.4s + uzp1 v21.4s, v16.4s, v20.4s + mov v17.16b, v4.16b + ext v25.16b, v5.16b, v5.16b, #12 + mov v17.s[1], v21.s[2] + add v25.4s, v25.4s, v21.4s + zip1 v20.2d, v4.2d, v18.2d + ext v22.16b, v23.16b, v23.16b, #12 + zip2 v26.4s, v18.4s, v4.4s + tbl v18.16b, { v20.16b, v21.16b }, v2.16b + eor v5.16b, v7.16b, v5.16b + ext v16.16b, v23.16b, v22.16b, #12 + ext v22.16b, v6.16b, v6.16b, #4 + zip1 v27.4s, v26.4s, v21.4s + zip1 v20.4s, v21.4s, v26.4s + ext v21.16b, v18.16b, v18.16b, #12 + tbl v5.16b, { v5.16b }, v1.16b + ext v20.16b, v20.16b, v27.16b, #8 + uzp1 v27.4s, v18.4s, v21.4s + uzp1 v18.4s, v22.4s, v22.4s + add v21.4s, v24.4s, v5.4s + ext v18.16b, v18.16b, v22.16b, #8 + eor v19.16b, v19.16b, v21.16b + tbl v7.16b, { v16.16b, v17.16b }, v3.16b + uzp2 v18.4s, v18.4s, v17.4s + zip2 v16.4s, v16.4s, v20.4s + ushr v17.4s, v19.4s, #7 + shl v19.4s, v19.4s, #25 + orr v17.16b, v19.16b, v17.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v19.4s, v25.4s, v17.4s + eor v5.16b, v19.16b, v5.16b + ext v21.16b, v21.16b, v21.16b, #4 + tbl v5.16b, { v5.16b }, v0.16b + add v4.4s, v19.4s, v4.4s + add v19.4s, v21.4s, v5.4s + eor v17.16b, v17.16b, v19.16b + ushr v21.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + orr v17.16b, v17.16b, v21.16b + add v4.4s, v4.4s, v17.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + add v4.4s, v4.4s, v6.4s + add v6.4s, v19.4s, v5.4s + eor v17.16b, v17.16b, v6.16b + ushr v19.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + ext v4.16b, v4.16b, v4.16b, #4 + orr v17.16b, v17.16b, v19.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v4.4s, v17.4s + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #12 + tbl v5.16b, { v5.16b }, v0.16b + add v6.4s, v6.4s, v5.4s + eor v17.16b, v17.16b, v6.16b + ushr v19.4s, v17.4s, #12 + shl v17.4s, v17.4s, #20 + add v4.4s, v4.4s, v7.4s + orr v17.16b, v17.16b, v19.16b + add v4.4s, v4.4s, v17.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + mov v29.16b, v20.16b + ext v4.16b, v4.16b, v4.16b, #12 + add v6.4s, v6.4s, v5.4s + mov v29.s[1], v27.s[2] + add v4.4s, v4.4s, v27.4s + zip1 v26.2d, v20.2d, v7.2d + zip1 v7.4s, v16.4s, v27.4s + zip1 v16.4s, v27.4s, v16.4s + eor v17.16b, v17.16b, v6.16b + ext v7.16b, v16.16b, v7.16b, #8 + ushr v16.4s, v17.4s, #7 + shl v17.4s, v17.4s, #25 + orr v16.16b, v17.16b, v16.16b + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v4.4s, v16.4s + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #4 + tbl v5.16b, { v5.16b }, v0.16b + add v6.4s, v6.4s, v5.4s + eor v16.16b, v16.16b, v6.16b + ushr v17.4s, v16.4s, #12 + shl v16.4s, v16.4s, #20 + add v4.4s, v4.4s, v20.4s + orr v16.16b, v16.16b, v17.16b + add v4.4s, v4.4s, v16.4s + eor v5.16b, v5.16b, v4.16b + tbl v5.16b, { v5.16b }, v1.16b + add v6.4s, v6.4s, v5.4s + eor v16.16b, v16.16b, v6.16b + add v4.4s, v4.4s, v18.4s + ushr v17.4s, v16.4s, #7 + shl v16.4s, v16.4s, #25 + ext v23.16b, v22.16b, v22.16b, #12 + ext v4.16b, v4.16b, v4.16b, #4 + orr v16.16b, v16.16b, v17.16b + ext v28.16b, v22.16b, v23.16b, #12 + ext v5.16b, v5.16b, v5.16b, #8 + add v4.4s, v16.4s, v4.4s + tbl v3.16b, { 
v28.16b, v29.16b }, v3.16b + eor v5.16b, v4.16b, v5.16b + ext v6.16b, v6.16b, v6.16b, #12 + add v3.4s, v4.4s, v3.4s + tbl v4.16b, { v5.16b }, v0.16b + add v5.4s, v6.4s, v4.4s + eor v6.16b, v16.16b, v5.16b + ushr v16.4s, v6.4s, #12 + shl v6.4s, v6.4s, #20 + orr v6.16b, v6.16b, v16.16b + tbl v2.16b, { v26.16b, v27.16b }, v2.16b + add v3.4s, v3.4s, v6.4s + ext v19.16b, v2.16b, v2.16b, #12 + eor v4.16b, v4.16b, v3.16b + uzp1 v2.4s, v2.4s, v19.4s + ext v3.16b, v3.16b, v3.16b, #12 + tbl v4.16b, { v4.16b }, v1.16b + add v2.4s, v3.4s, v2.4s + add v3.4s, v5.4s, v4.4s + eor v5.16b, v6.16b, v3.16b + ushr v6.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v5.16b, v5.16b, v6.16b + ext v4.16b, v4.16b, v4.16b, #8 + add v2.4s, v2.4s, v5.4s + eor v4.16b, v2.16b, v4.16b + ext v3.16b, v3.16b, v3.16b, #4 + tbl v0.16b, { v4.16b }, v0.16b + add v3.4s, v3.4s, v0.4s + eor v4.16b, v5.16b, v3.16b + ushr v5.4s, v4.4s, #12 + shl v4.4s, v4.4s, #20 + add v2.4s, v2.4s, v7.4s + orr v4.16b, v4.16b, v5.16b + add v2.4s, v2.4s, v4.4s + eor v0.16b, v0.16b, v2.16b + tbl v0.16b, { v0.16b }, v1.16b + add v1.4s, v3.4s, v0.4s + eor v3.16b, v4.16b, v1.16b + ushr v4.4s, v3.4s, #7 + shl v3.4s, v3.4s, #25 + ext v2.16b, v2.16b, v2.16b, #4 + ext v0.16b, v0.16b, v0.16b, #8 + ext v1.16b, v1.16b, v1.16b, #12 + orr v3.16b, v3.16b, v4.16b + eor v2.16b, v2.16b, v1.16b + eor v3.16b, v3.16b, v0.16b + stp q2, q3, [x5] + ldr q2, [x0] + eor v1.16b, v2.16b, v1.16b + str q1, [x5, #32] + ldr q1, [x0, #16] + eor v0.16b, v1.16b, v0.16b + str q0, [x5, #48] + ret +.Lfunc_end1: + .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-zfs_blake3_compress_xof_sse41 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI2_0: + .word 0 + .word 1 + .word 2 + .word 3 +.LCPI2_1: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI2_2: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 + .text + .globl zfs_blake3_hash_many_sse41 + .p2align 2 + .type zfs_blake3_hash_many_sse41,@function +zfs_blake3_hash_many_sse41: + .cfi_startproc + stp d15, d14, [sp, #-160]! 
+ stp d13, d12, [sp, #16] + stp d11, d10, [sp, #32] + stp d9, d8, [sp, #48] + stp x29, x30, [sp, #64] + stp x28, x27, [sp, #80] + stp x26, x25, [sp, #96] + stp x24, x23, [sp, #112] + stp x22, x21, [sp, #128] + stp x20, x19, [sp, #144] + mov x29, sp + sub sp, sp, #448 + .cfi_def_cfa w29, 160 + .cfi_offset w19, -8 + .cfi_offset w20, -16 + .cfi_offset w21, -24 + .cfi_offset w22, -32 + .cfi_offset w23, -40 + .cfi_offset w24, -48 + .cfi_offset w25, -56 + .cfi_offset w26, -64 + .cfi_offset w27, -72 + .cfi_offset w28, -80 + .cfi_offset w30, -88 + .cfi_offset w29, -96 + .cfi_offset b8, -104 + .cfi_offset b9, -112 + .cfi_offset b10, -120 + .cfi_offset b11, -128 + .cfi_offset b12, -136 + .cfi_offset b13, -144 + .cfi_offset b14, -152 + .cfi_offset b15, -160 + ldr x26, [x29, #168] + ldrb w27, [x29, #160] + mov w19, w6 + mov x20, x4 + mov x22, x2 + mov x28, x1 + cmp x1, #4 + mov x24, x0 + str x3, [sp, #40] + b.lo .LBB2_8 + adrp x11, .LCPI2_0 + ldr q0, [x11, :lo12:.LCPI2_0] + sbfx w13, w5, #0, #1 + dup v1.4s, w13 + mov w10, #58983 + mov w11, #44677 + mov w12, #62322 + and v0.16b, v1.16b, v0.16b + mov w13, #62778 + orr w8, w7, w19 + adrp x9, .LCPI2_1 + movk w10, #27145, lsl #16 + movk w11, #47975, lsl #16 + movk w12, #15470, lsl #16 + movk w13, #42319, lsl #16 + str q0, [sp, #16] + orr v0.4s, #128, lsl #24 + adrp x14, .LCPI2_2 + str q0, [sp] +.LBB2_2: + ldr x2, [sp, #40] + mov x15, x2 + ld1r { v7.4s }, [x15], #4 + add x16, x2, #8 + add x17, x2, #12 + add x18, x2, #16 + add x0, x2, #20 + add x3, x2, #24 + add x2, x2, #28 + ld1r { v6.4s }, [x16] + ld1r { v17.4s }, [x17] + ld1r { v10.4s }, [x18] + ld1r { v11.4s }, [x0] + ld1r { v19.4s }, [x3] + ld1r { v18.4s }, [x15] + ld1r { v16.4s }, [x2] + cbz x22, .LBB2_7 + ldr q1, [sp, #16] + dup v0.4s, w20 + ldp x15, x16, [x24] + ldp x17, x18, [x24, #16] + add v1.4s, v0.4s, v1.4s + movi v0.4s, #128, lsl #24 + str q1, [sp, #64] + eor v0.16b, v1.16b, v0.16b + ldr q1, [sp] + lsr x2, x20, #32 + mov x0, xzr + mov w6, w8 + cmgt v0.4s, v1.4s, v0.4s + dup v1.4s, w2 + sub v0.4s, v1.4s, v0.4s + str q0, [sp, #48] +.LBB2_4: + mov w4, #16 + stp q16, q17, [sp, #192] + bfi x4, x0, #6, #58 + ldr q1, [x15, x4] + ldr q3, [x16, x4] + ldr q2, [x17, x4] + ldr q4, [x18, x4] + mov w4, #32 + bfi x4, x0, #6, #58 + ldr q5, [x15, x4] + ldr q20, [x16, x4] + ldr q21, [x17, x4] + ldr q22, [x18, x4] + mov w4, #48 + lsl x3, x0, #6 + bfi x4, x0, #6, #58 + add x0, x0, #1 + ldr q0, [x15, x3] + ldr q23, [x16, x3] + ldr q16, [x17, x3] + ldr q17, [x18, x3] + cmp x0, x22 + ldr q25, [x15, x4] + ldr q14, [x16, x4] + ldr q28, [x17, x4] + ldr q31, [x18, x4] + csel w4, w27, wzr, eq + orr w4, w4, w6 + mov x2, xzr + and w6, w4, #0xff + add x3, x3, #256 +.LBB2_5: + ldr x4, [x24, x2] + add x2, x2, #8 + cmp x2, #32 + add x4, x4, x3 + prfm pldl1keep, [x4] + b.ne .LBB2_5 + zip1 v29.4s, v0.4s, v23.4s + zip2 v23.4s, v0.4s, v23.4s + zip1 v0.4s, v16.4s, v17.4s + zip2 v24.4s, v16.4s, v17.4s + zip1 v9.4s, v1.4s, v3.4s + zip2 v26.4s, v1.4s, v3.4s + zip1 v27.4s, v2.4s, v4.4s + zip2 v17.4s, v2.4s, v4.4s + zip1 v12.4s, v21.4s, v22.4s + zip2 v13.4s, v21.4s, v22.4s + add v2.4s, v7.4s, v10.4s + add v1.4s, v18.4s, v11.4s + ext v7.16b, v0.16b, v29.16b, #8 + ext v22.16b, v24.16b, v23.16b, #8 + zip1 v30.4s, v5.4s, v20.4s + zip2 v20.4s, v5.4s, v20.4s + stp q1, q2, [sp, #112] + ext v2.16b, v29.16b, v7.16b, #8 + mov v29.d[1], v0.d[0] + ext v18.16b, v23.16b, v22.16b, #8 + mov v23.d[1], v24.d[0] + zip1 v21.4s, v25.4s, v14.4s + zip2 v4.4s, v25.4s, v14.4s + zip1 v14.4s, v28.4s, v31.4s + zip2 v15.4s, v28.4s, v31.4s + add v8.4s, v6.4s, v19.4s 
+ ext v28.16b, v27.16b, v9.16b, #8 + ext v31.16b, v17.16b, v26.16b, #8 + stur q2, [x29, #-208] + mov v7.16b, v29.16b + ext v0.16b, v12.16b, v30.16b, #8 + stp q23, q29, [x29, #-80] + mov v2.16b, v19.16b + ext v19.16b, v13.16b, v20.16b, #8 + mov v29.16b, v9.16b + ext v25.16b, v9.16b, v28.16b, #8 + mov v29.d[1], v27.d[0] + ext v24.16b, v26.16b, v31.16b, #8 + mov v26.d[1], v17.d[0] + ext v17.16b, v15.16b, v4.16b, #8 + ext v27.16b, v30.16b, v0.16b, #8 + ext v0.16b, v20.16b, v19.16b, #8 + stp q0, q25, [sp, #80] + ext v0.16b, v4.16b, v17.16b, #8 + str q0, [sp, #224] + ldr q0, [sp, #128] + mov v6.16b, v23.16b + mov v22.16b, v4.16b + ldr q16, [x9, :lo12:.LCPI2_1] + add v17.4s, v0.4s, v7.4s + ldr q0, [sp, #112] + mov v30.d[1], v12.d[0] + add v7.4s, v8.4s, v29.4s + mov v20.d[1], v13.d[0] + add v4.4s, v0.4s, v6.4s + ldr q0, [sp, #64] + dup v3.4s, w12 + ext v28.16b, v14.16b, v21.16b, #8 + dup v1.4s, w10 + eor v19.16b, v17.16b, v0.16b + ldr q0, [sp, #48] + ext v23.16b, v21.16b, v28.16b, #8 + mov v21.d[1], v14.d[0] + tbl v14.16b, { v19.16b }, v16.16b + eor v12.16b, v4.16b, v0.16b + movi v0.4s, #64 + eor v13.16b, v7.16b, v0.16b + tbl v13.16b, { v13.16b }, v16.16b + add v6.4s, v13.4s, v3.4s + dup v5.4s, w11 + tbl v12.16b, { v12.16b }, v16.16b + add v1.4s, v14.4s, v1.4s + eor v9.16b, v6.16b, v2.16b + ldp q2, q0, [sp, #192] + add v5.4s, v12.4s, v5.4s + eor v19.16b, v1.16b, v10.16b + eor v10.16b, v5.16b, v11.16b + ushr v11.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v11.16b, v19.16b, v11.16b + ushr v19.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + mov v22.d[1], v15.d[0] + orr v10.16b, v10.16b, v19.16b + ushr v19.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + add v15.4s, v0.4s, v2.4s + orr v9.16b, v9.16b, v19.16b + dup v19.4s, w6 + add v15.4s, v15.4s, v26.4s + eor v19.16b, v15.16b, v19.16b + tbl v3.16b, { v19.16b }, v16.16b + dup v19.4s, w13 + add v8.4s, v3.4s, v19.4s + ldur q31, [x29, #-208] + eor v19.16b, v8.16b, v2.16b + ushr v0.4s, v19.4s, #12 + shl v19.4s, v19.4s, #20 + orr v2.16b, v19.16b, v0.16b + ldr q19, [x14, :lo12:.LCPI2_2] + add v17.4s, v17.4s, v31.4s + add v17.4s, v17.4s, v11.4s + eor v14.16b, v14.16b, v17.16b + tbl v14.16b, { v14.16b }, v19.16b + add v1.4s, v1.4s, v14.4s + eor v11.16b, v1.16b, v11.16b + add v4.4s, v4.4s, v18.4s + ushr v0.4s, v11.4s, #7 + shl v11.4s, v11.4s, #25 + add v4.4s, v4.4s, v10.4s + orr v0.16b, v11.16b, v0.16b + eor v11.16b, v12.16b, v4.16b + tbl v11.16b, { v11.16b }, v19.16b + add v5.4s, v5.4s, v11.4s + eor v10.16b, v5.16b, v10.16b + add v7.4s, v7.4s, v25.4s + ushr v12.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + add v7.4s, v7.4s, v9.4s + orr v10.16b, v10.16b, v12.16b + eor v12.16b, v13.16b, v7.16b + tbl v12.16b, { v12.16b }, v19.16b + add v6.4s, v6.4s, v12.4s + eor v9.16b, v6.16b, v9.16b + ushr v13.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + orr v9.16b, v9.16b, v13.16b + add v13.4s, v15.4s, v24.4s + add v13.4s, v13.4s, v2.4s + eor v3.16b, v3.16b, v13.16b + tbl v3.16b, { v3.16b }, v19.16b + add v8.4s, v8.4s, v3.4s + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v30.4s + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v20.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v21.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v22.4s + mov v28.16b, v26.16b + stur q26, [x29, #-112] + mov v26.16b, v18.16b + mov v18.16b, v24.16b + stur q24, [x29, #-160] + add v6.4s, v6.4s, v3.4s + mov v24.16b, v20.16b + tbl v14.16b, 
{ v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + ldr q20, [sp, #80] + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v13.16b + stp q30, q22, [x29, #-192] + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + mov v30.16b, v27.16b + add v17.4s, v17.4s, v27.4s + ldr q27, [sp, #224] + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v20.4s + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v23.4s + orr v0.16b, v0.16b, v15.16b + tbl v3.16b, { v3.16b }, v19.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v27.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v13.16b + stur q21, [x29, #-144] + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + ldur q21, [x29, #-80] + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v0.16b, v5.16b, v0.16b + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v15.16b + add v17.4s, v17.4s, v21.4s + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v26.4s + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v18.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v29.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + eor v3.16b, v3.16b, v13.16b + ldur q22, [x29, #-64] + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v16.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + add v17.4s, v17.4s, v28.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v24.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v22.4s + orr v2.16b, v2.16b, v15.16b + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v23.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + eor v3.16b, v3.16b, v13.16b + ldur q22, [x29, #-144] + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, 
#25 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v19.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v31.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v22.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v30.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v27.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + ldr q27, [sp, #96] + mov v21.16b, v26.16b + stur q26, [x29, #-96] + mov v28.16b, v31.16b + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v13.16b + ldp q31, q26, [x29, #-192] + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + add v17.4s, v17.4s, v20.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v27.4s + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v26.4s + orr v0.16b, v0.16b, v15.16b + tbl v3.16b, { v3.16b }, v19.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v31.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v13.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v0.16b, v5.16b, v0.16b + mov v18.16b, v24.16b + mov v24.16b, v20.16b + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + ldur q20, [x29, #-160] + orr v0.16b, v0.16b, v15.16b + add v17.4s, v17.4s, v21.4s + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v18.4s + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v23.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v20.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + eor v3.16b, v3.16b, v13.16b + ldur q25, [x29, #-80] + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v16.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + 
add v17.4s, v17.4s, v29.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v22.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v25.4s + orr v2.16b, v2.16b, v15.16b + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v26.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + ldur q25, [x29, #-112] + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + eor v3.16b, v3.16b, v13.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v19.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v25.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v30.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v24.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v31.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + ldur q25, [x29, #-64] + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v13.16b + ldr q31, [sp, #224] + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + add v17.4s, v17.4s, v27.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v25.4s + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v31.4s + orr v0.16b, v0.16b, v15.16b + tbl v3.16b, { v3.16b }, v19.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v28.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v13.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v0.16b, v5.16b, v0.16b + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v15.16b + add v17.4s, v17.4s, v18.4s + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v22.4s + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, 
v7.4s, v26.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v23.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + mov v21.16b, v29.16b + stur q29, [x29, #-128] + mov v29.16b, v30.16b + mov v30.16b, v27.16b + mov v27.16b, v18.16b + str q18, [sp, #176] + eor v0.16b, v0.16b, v1.16b + mov v18.16b, v22.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + eor v3.16b, v3.16b, v13.16b + ldur q22, [x29, #-96] + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v16.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + add v17.4s, v17.4s, v20.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v29.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v22.4s + orr v2.16b, v2.16b, v15.16b + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v31.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + eor v3.16b, v3.16b, v13.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v19.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v21.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v24.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v30.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v28.4s + add v6.4s, v6.4s, v3.4s + mov v22.16b, v24.16b + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + ldur q24, [x29, #-80] + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + mov v21.16b, v30.16b + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v13.16b + ldur q30, [x29, #-192] + mov v20.16b, v29.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + ldur q29, [x29, #-112] + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + add v17.4s, v17.4s, v25.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v24.4s + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v30.4s + orr v0.16b, v0.16b, v15.16b + tbl v3.16b, { v3.16b }, v19.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, 
v13.4s, v29.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v13.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v0.16b, v5.16b, v0.16b + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v15.16b + add v17.4s, v17.4s, v18.4s + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v20.4s + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v31.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v26.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + eor v3.16b, v3.16b, v13.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v16.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + add v17.4s, v17.4s, v23.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v22.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v27.4s + orr v2.16b, v2.16b, v15.16b + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v30.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + ldur q27, [x29, #-160] + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + eor v3.16b, v3.16b, v13.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v19.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v27.4s + mov v28.16b, v25.16b + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v21.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v28.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v29.4s + mov v25.16b, v31.16b + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + ldur q31, [x29, #-96] + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v13.16b + ldur q28, [x29, #-208] + mov v18.16b, v20.16b + str q20, [sp, #144] + ushr v15.4s, v10.4s, 
#12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + ldur q20, [x29, #-128] + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + add v17.4s, v17.4s, v24.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v31.4s + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v28.4s + orr v0.16b, v0.16b, v15.16b + tbl v3.16b, { v3.16b }, v19.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v13.4s, v13.4s, v20.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v0.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v13.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v1.16b, v2.16b + add v5.4s, v5.4s, v12.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + eor v0.16b, v5.16b, v0.16b + orr v2.16b, v2.16b, v15.16b + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v15.16b + add v17.4s, v17.4s, v18.4s + add v17.4s, v17.4s, v0.4s + add v4.4s, v4.4s, v22.4s + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v30.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v25.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v16.16b + eor v3.16b, v3.16b, v13.16b + add v17.4s, v17.4s, v26.4s + mov v26.16b, v21.16b + add v4.4s, v4.4s, v21.4s + ldur q21, [x29, #-144] + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v16.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v8.16b, v2.16b + add v17.4s, v17.4s, v0.4s + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + eor v14.16b, v14.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v7.4s, v7.4s, v21.4s + orr v2.16b, v2.16b, v15.16b + tbl v14.16b, { v14.16b }, v19.16b + eor v11.16b, v11.16b, v4.16b + add v7.4s, v7.4s, v9.4s + add v13.4s, v13.4s, v28.4s + add v1.4s, v1.4s, v14.4s + tbl v11.16b, { v11.16b }, v19.16b + eor v12.16b, v12.16b, v7.16b + add v13.4s, v13.4s, v2.4s + str q23, [sp, #160] + eor v0.16b, v0.16b, v1.16b + add v5.4s, v5.4s, v11.4s + tbl v12.16b, { v12.16b }, v19.16b + eor v3.16b, v3.16b, v13.16b + add v17.4s, v17.4s, v23.4s + ldur q23, [x29, #-64] + ushr v15.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v10.16b, v5.16b, v10.16b + add v6.4s, v6.4s, v12.4s + tbl v3.16b, { v3.16b }, v19.16b + orr v0.16b, v0.16b, v15.16b + ushr v15.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + eor v9.16b, v6.16b, v9.16b + add v8.4s, v8.4s, v3.4s + orr v10.16b, v10.16b, v15.16b + ushr 
v15.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v2.16b, v8.16b, v2.16b + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v23.4s + orr v2.16b, v2.16b, v15.16b + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v24.4s + tbl v3.16b, { v3.16b }, v16.16b + eor v14.16b, v14.16b, v4.16b + add v7.4s, v7.4s, v2.4s + add v6.4s, v6.4s, v3.4s + tbl v14.16b, { v14.16b }, v16.16b + eor v11.16b, v11.16b, v7.16b + add v13.4s, v13.4s, v20.4s + eor v10.16b, v6.16b, v10.16b + add v8.4s, v8.4s, v14.4s + tbl v11.16b, { v11.16b }, v16.16b + add v13.4s, v13.4s, v0.4s + ldr q20, [sp, #176] + ushr v15.4s, v10.4s, #12 + shl v10.4s, v10.4s, #20 + eor v9.16b, v8.16b, v9.16b + add v1.4s, v1.4s, v11.4s + eor v12.16b, v12.16b, v13.16b + orr v10.16b, v10.16b, v15.16b + ushr v15.4s, v9.4s, #12 + shl v9.4s, v9.4s, #20 + eor v2.16b, v1.16b, v2.16b + tbl v12.16b, { v12.16b }, v16.16b + orr v9.16b, v9.16b, v15.16b + ushr v15.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + add v5.4s, v5.4s, v12.4s + add v17.4s, v17.4s, v31.4s + orr v2.16b, v2.16b, v15.16b + eor v0.16b, v5.16b, v0.16b + add v17.4s, v17.4s, v10.4s + add v4.4s, v4.4s, v20.4s + add v7.4s, v7.4s, v29.4s + ushr v15.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v3.16b, v3.16b, v17.16b + add v4.4s, v4.4s, v9.4s + add v7.4s, v7.4s, v2.4s + orr v0.16b, v0.16b, v15.16b + mov v15.16b, v31.16b + add v17.4s, v17.4s, v22.4s + eor v31.16b, v14.16b, v4.16b + eor v22.16b, v11.16b, v7.16b + add v11.4s, v13.4s, v27.4s + tbl v3.16b, { v3.16b }, v19.16b + add v11.4s, v11.4s, v0.4s + tbl v31.16b, { v31.16b }, v19.16b + add v6.4s, v6.4s, v3.4s + eor v12.16b, v12.16b, v11.16b + tbl v22.16b, { v22.16b }, v19.16b + add v8.4s, v8.4s, v31.4s + eor v10.16b, v6.16b, v10.16b + add v30.4s, v11.4s, v30.4s + tbl v11.16b, { v12.16b }, v19.16b + add v1.4s, v1.4s, v22.4s + eor v9.16b, v8.16b, v9.16b + ushr v12.4s, v10.4s, #7 + shl v10.4s, v10.4s, #25 + add v5.4s, v5.4s, v11.4s + eor v2.16b, v1.16b, v2.16b + orr v10.16b, v10.16b, v12.16b + ushr v12.4s, v9.4s, #7 + shl v9.4s, v9.4s, #25 + eor v0.16b, v5.16b, v0.16b + orr v9.16b, v9.16b, v12.16b + ushr v12.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v2.16b, v2.16b, v12.16b + ushr v12.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v12.16b + add v4.4s, v4.4s, v26.4s + add v17.4s, v17.4s, v0.4s + add v7.4s, v7.4s, v28.4s + mov v18.16b, v27.16b + eor v31.16b, v31.16b, v17.16b + add v4.4s, v4.4s, v10.4s + add v27.4s, v30.4s, v2.4s + eor v22.16b, v22.16b, v4.16b + add v7.4s, v7.4s, v9.4s + eor v3.16b, v3.16b, v27.16b + add v26.4s, v27.4s, v29.4s + tbl v27.16b, { v31.16b }, v16.16b + eor v28.16b, v11.16b, v7.16b + tbl v22.16b, { v22.16b }, v16.16b + add v1.4s, v1.4s, v27.4s + add v4.4s, v4.4s, v23.4s + ldr q23, [sp, #144] + tbl v28.16b, { v28.16b }, v16.16b + tbl v3.16b, { v3.16b }, v16.16b + add v5.4s, v5.4s, v22.4s + eor v0.16b, v0.16b, v1.16b + add v6.4s, v6.4s, v28.4s + add v29.4s, v8.4s, v3.4s + eor v30.16b, v5.16b, v10.16b + ushr v8.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + eor v31.16b, v6.16b, v9.16b + orr v0.16b, v0.16b, v8.16b + ushr v8.4s, v30.4s, #12 + shl v30.4s, v30.4s, #20 + eor v2.16b, v29.16b, v2.16b + orr v30.16b, v30.16b, v8.16b + ushr v8.4s, v31.4s, #12 + shl v31.4s, v31.4s, #20 + add v17.4s, v17.4s, v25.4s + add v7.4s, v7.4s, v23.4s + orr v31.16b, v31.16b, v8.16b + ushr v8.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + ldur q23, [x29, #-176] + orr v2.16b, v2.16b, v8.16b + add v17.4s, v17.4s, v0.4s + eor v27.16b, v27.16b, v17.16b + add 
v4.4s, v4.4s, v30.4s + add v25.4s, v26.4s, v2.4s + eor v22.16b, v22.16b, v4.16b + add v4.4s, v4.4s, v24.4s + add v7.4s, v7.4s, v31.4s + eor v3.16b, v3.16b, v25.16b + add v24.4s, v25.4s, v18.4s + tbl v25.16b, { v27.16b }, v19.16b + add v17.4s, v17.4s, v23.4s + eor v23.16b, v28.16b, v7.16b + tbl v22.16b, { v22.16b }, v19.16b + add v1.4s, v1.4s, v25.4s + tbl v23.16b, { v23.16b }, v19.16b + tbl v3.16b, { v3.16b }, v19.16b + add v5.4s, v5.4s, v22.4s + eor v0.16b, v0.16b, v1.16b + add v6.4s, v6.4s, v23.4s + add v26.4s, v29.4s, v3.4s + eor v27.16b, v5.16b, v30.16b + ushr v29.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + eor v28.16b, v6.16b, v31.16b + orr v0.16b, v0.16b, v29.16b + ushr v29.4s, v27.4s, #7 + shl v27.4s, v27.4s, #25 + eor v2.16b, v26.16b, v2.16b + orr v27.16b, v27.16b, v29.16b + ushr v29.4s, v28.4s, #7 + shl v28.4s, v28.4s, #25 + ldur q18, [x29, #-128] + orr v28.16b, v28.16b, v29.16b + ushr v29.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + add v7.4s, v7.4s, v15.4s + orr v2.16b, v2.16b, v29.16b + add v17.4s, v17.4s, v27.4s + add v4.4s, v4.4s, v28.4s + add v7.4s, v7.4s, v2.4s + eor v3.16b, v3.16b, v17.16b + add v17.4s, v17.4s, v20.4s + eor v20.16b, v25.16b, v4.16b + add v4.4s, v4.4s, v21.4s + eor v21.16b, v22.16b, v7.16b + add v7.4s, v7.4s, v18.4s + add v18.4s, v24.4s, v0.4s + eor v22.16b, v23.16b, v18.16b + ldr q23, [sp, #160] + tbl v3.16b, { v3.16b }, v16.16b + tbl v20.16b, { v20.16b }, v16.16b + add v6.4s, v6.4s, v3.4s + add v18.4s, v18.4s, v23.4s + tbl v21.16b, { v21.16b }, v16.16b + tbl v16.16b, { v22.16b }, v16.16b + add v22.4s, v26.4s, v20.4s + eor v23.16b, v6.16b, v27.16b + add v1.4s, v1.4s, v21.4s + eor v24.16b, v22.16b, v28.16b + ushr v25.4s, v23.4s, #12 + shl v23.4s, v23.4s, #20 + add v5.4s, v5.4s, v16.4s + eor v2.16b, v1.16b, v2.16b + orr v23.16b, v23.16b, v25.16b + ushr v25.4s, v24.4s, #12 + shl v24.4s, v24.4s, #20 + eor v0.16b, v5.16b, v0.16b + orr v24.16b, v24.16b, v25.16b + ushr v25.4s, v2.4s, #12 + shl v2.4s, v2.4s, #20 + orr v2.16b, v2.16b, v25.16b + ushr v25.4s, v0.4s, #12 + shl v0.4s, v0.4s, #20 + orr v0.16b, v0.16b, v25.16b + add v25.4s, v7.4s, v2.4s + add v26.4s, v18.4s, v0.4s + eor v18.16b, v21.16b, v25.16b + add v17.4s, v17.4s, v23.4s + add v4.4s, v4.4s, v24.4s + eor v16.16b, v16.16b, v26.16b + tbl v21.16b, { v18.16b }, v19.16b + eor v3.16b, v3.16b, v17.16b + eor v7.16b, v20.16b, v4.16b + tbl v16.16b, { v16.16b }, v19.16b + add v1.4s, v1.4s, v21.4s + tbl v3.16b, { v3.16b }, v19.16b + tbl v20.16b, { v7.16b }, v19.16b + eor v2.16b, v1.16b, v2.16b + eor v7.16b, v1.16b, v17.16b + add v1.4s, v5.4s, v16.4s + eor v0.16b, v1.16b, v0.16b + eor v18.16b, v1.16b, v4.16b + add v1.4s, v6.4s, v3.4s + eor v4.16b, v1.16b, v23.16b + eor v6.16b, v25.16b, v1.16b + add v1.4s, v22.4s, v20.4s + eor v5.16b, v1.16b, v24.16b + eor v17.16b, v26.16b, v1.16b + ushr v1.4s, v4.4s, #7 + shl v4.4s, v4.4s, #25 + orr v1.16b, v4.16b, v1.16b + ushr v4.4s, v5.4s, #7 + shl v5.4s, v5.4s, #25 + orr v4.16b, v5.16b, v4.16b + ushr v5.4s, v2.4s, #7 + shl v2.4s, v2.4s, #25 + orr v2.16b, v2.16b, v5.16b + ushr v5.4s, v0.4s, #7 + shl v0.4s, v0.4s, #25 + orr v0.16b, v0.16b, v5.16b + eor v10.16b, v0.16b, v20.16b + eor v11.16b, v1.16b, v21.16b + eor v19.16b, v4.16b, v16.16b + cmp x0, x22 + eor v16.16b, v2.16b, v3.16b + mov w6, w19 + b.ne .LBB2_4 +.LBB2_7: + zip1 v0.4s, v7.4s, v18.4s + zip2 v1.4s, v7.4s, v18.4s + zip1 v2.4s, v6.4s, v17.4s + zip2 v3.4s, v6.4s, v17.4s + zip1 v4.4s, v10.4s, v11.4s + zip2 v5.4s, v10.4s, v11.4s + zip1 v6.4s, v19.4s, v16.4s + zip2 v7.4s, v19.4s, v16.4s + add x15, x20, #4 + tst w5, #0x1 + sub x28, 
x28, #4 + zip1 v16.2d, v0.2d, v2.2d + zip2 v0.2d, v0.2d, v2.2d + zip1 v2.2d, v1.2d, v3.2d + zip2 v1.2d, v1.2d, v3.2d + zip1 v3.2d, v4.2d, v6.2d + zip2 v4.2d, v4.2d, v6.2d + zip1 v6.2d, v5.2d, v7.2d + zip2 v5.2d, v5.2d, v7.2d + add x24, x24, #32 + csel x20, x15, x20, ne + cmp x28, #3 + stp q16, q3, [x26] + stp q0, q4, [x26, #32] + stp q2, q6, [x26, #64] + stp q1, q5, [x26, #96] + add x26, x26, #128 + b.hi .LBB2_2 +.LBB2_8: + cbz x28, .LBB2_16 + orr w8, w7, w19 + and x21, x5, #0x1 + stur w8, [x29, #-64] +.LBB2_10: + ldr x8, [sp, #40] + ldr x25, [x24] + ldur w4, [x29, #-64] + ldp q1, q0, [x8] + mov x8, x22 + stp q1, q0, [x29, #-48] +.LBB2_11: + subs x23, x8, #1 + b.eq .LBB2_13 + cbnz x8, .LBB2_14 + b .LBB2_15 +.LBB2_13: + orr w4, w4, w27 +.LBB2_14: + sub x0, x29, #48 + mov w2, #64 + mov x1, x25 + mov x3, x20 + bl zfs_blake3_compress_in_place_sse41 + add x25, x25, #64 + mov x8, x23 + mov w4, w19 + b .LBB2_11 +.LBB2_15: + ldp q0, q1, [x29, #-48] + add x20, x20, x21 + add x24, x24, #8 + subs x28, x28, #1 + stp q0, q1, [x26], #32 + b.ne .LBB2_10 +.LBB2_16: + add sp, sp, #448 + ldp x20, x19, [sp, #144] + ldp x22, x21, [sp, #128] + ldp x24, x23, [sp, #112] + ldp x26, x25, [sp, #96] + ldp x28, x27, [sp, #80] + ldp x29, x30, [sp, #64] + ldp d9, d8, [sp, #48] + ldp d11, d10, [sp, #32] + ldp d13, d12, [sp, #16] + ldp d15, d14, [sp], #160 + ret +.Lfunc_end2: + .size zfs_blake3_hash_many_sse41, .Lfunc_end2-zfs_blake3_hash_many_sse41 + .cfi_endproc + .section ".note.GNU-stack","",@progbits +#endif diff --git a/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S new file mode 100644 index 000000000000..9deba202fde8 --- /dev/null +++ b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse2.S @@ -0,0 +1,2823 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2022 Samuel Neves and Matthew Krupcale + * Copyright (c) 2022 Tino Reichardt + * + * This is converted assembly: SSE2 -> POWER8 PPC64 Little Endian + * Used tools: SIMDe https://github.com/simd-everywhere/simde + */ + +#if (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + .text + .abiversion 2 + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI0_0: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI0_1: + .long 1779033703 + .long 3144134277 + .long 1013904242 + .long 2773480762 +.LCPI0_2: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_3: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_4: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_5: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_6: + .short 1 + .short 2 + .short 4 + .short 8 + .short 16 + .short 32 + .short 64 + .short 128 +.LCPI0_7: + .short 0 + .short 0 + .short 4 + .short 8 + .short 0 + .short 0 + .short 64 + .short 128 +.LCPI0_8: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI0_9: + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 64 + .short 128 +.LCPI0_10: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 7 + .byte 6 + .byte 5 + .byte 4 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_11: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_12: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI0_13: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_14: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .text + .globl zfs_blake3_compress_in_place_sse2 + .p2align 2 + .type zfs_blake3_compress_in_place_sse2,@function +zfs_blake3_compress_in_place_sse2: +.Lfunc_begin0: + .cfi_startproc +.Lfunc_gep0: + addis 2, 12, .TOC.-.Lfunc_gep0@ha + addi 2, 2, .TOC.-.Lfunc_gep0@l +.Lfunc_lep0: + .localentry zfs_blake3_compress_in_place_sse2, .Lfunc_lep0-.Lfunc_gep0 + li 8, -64 + mtvsrd 35, 5 + li 5, 16 + lfdx 0, 0, 4 + vspltisw 12, 9 + stxvd2x 60, 1, 8 + li 8, 
-48 + mtvsrd 36, 7 + lfd 2, 16(4) + stxvd2x 61, 1, 8 + li 8, -32 + lfd 1, 8(4) + mtvsrwz 37, 6 + rldicl 6, 6, 32, 32 + addis 7, 2, .LCPI0_2@toc@ha + stxvd2x 62, 1, 8 + li 8, -16 + addi 7, 7, .LCPI0_2@toc@l + stxvd2x 63, 1, 8 + li 8, 0 + lvx 9, 0, 7 + li 7, 48 + mtvsrd 34, 8 + xxmrghd 32, 1, 0 + lxvd2x 0, 0, 3 + lxvd2x 1, 3, 5 + lfd 3, 24(4) + addis 8, 2, .LCPI0_5@toc@ha + vmrghb 3, 2, 3 + addi 8, 8, .LCPI0_5@toc@l + vmrghb 4, 2, 4 + vspltb 2, 2, 7 + xxmrghd 33, 3, 2 + vpkudum 7, 1, 0 + vmrglh 3, 2, 3 + vmrglh 2, 2, 4 + mtvsrwz 36, 6 + addis 6, 2, .LCPI0_0@toc@ha + addi 6, 6, .LCPI0_0@toc@l + vperm 10, 1, 0, 9 + vmrghw 4, 4, 5 + xxswapd 37, 1 + lxvd2x 1, 4, 7 + addis 7, 2, .LCPI0_8@toc@ha + addi 7, 7, .LCPI0_8@toc@l + vmrglw 2, 2, 3 + xxswapd 35, 0 + xxswapd 41, 1 + xxspltd 62, 42, 1 + vadduwm 3, 7, 3 + vadduwm 6, 3, 5 + xxmrgld 36, 34, 36 + lvx 2, 0, 6 + addis 6, 2, .LCPI0_1@toc@ha + addi 6, 6, .LCPI0_1@toc@l + xxlxor 35, 38, 36 + lvx 4, 0, 6 + li 6, 32 + lxvd2x 0, 4, 6 + addis 4, 2, .LCPI0_3@toc@ha + addis 6, 2, .LCPI0_7@toc@ha + vperm 8, 3, 3, 2 + vspltisw 3, 10 + addi 4, 4, .LCPI0_3@toc@l + addi 6, 6, .LCPI0_7@toc@l + vadduwm 3, 3, 3 + vadduwm 11, 8, 4 + xxlxor 36, 43, 37 + vadduwm 5, 6, 10 + vrlw 0, 4, 3 + vspltisw 4, 12 + vadduwm 4, 4, 4 + vadduwm 1, 0, 5 + xxlxor 37, 33, 40 + xxswapd 40, 0 + vrlw 6, 5, 4 + vspltisw 5, -16 + vpkudum 13, 9, 8 + vsubuwm 5, 12, 5 + lvx 12, 0, 4 + addis 4, 2, .LCPI0_4@toc@ha + addi 4, 4, .LCPI0_4@toc@l + vadduwm 11, 6, 11 + xxswapd 0, 38 + vadduwm 1, 1, 13 + xxsldwi 50, 45, 45, 1 + xxlxor 32, 43, 32 + xxsldwi 43, 43, 43, 3 + xxsldwi 33, 33, 33, 1 + vperm 12, 8, 9, 12 + vrlw 0, 0, 5 + vadduwm 1, 0, 1 + xxlxor 38, 33, 0 + vadduwm 1, 1, 12 + vperm 6, 6, 6, 2 + vadduwm 15, 6, 11 + lvx 11, 0, 4 + addis 4, 2, .LCPI0_6@toc@ha + addi 4, 4, .LCPI0_6@toc@l + xxlxor 32, 47, 32 + lvx 17, 0, 4 + addis 4, 2, .LCPI0_9@toc@ha + vperm 14, 10, 7, 11 + addi 4, 4, .LCPI0_9@toc@l + vrlw 0, 0, 3 + vadduwm 1, 0, 1 + xxlxor 38, 33, 38 + vrlw 6, 6, 4 + vadduwm 8, 6, 15 + xxswapd 0, 38 + lvx 6, 0, 8 + xxlxor 32, 40, 32 + xxsldwi 40, 40, 40, 1 + vperm 13, 12, 18, 6 + vrlw 9, 0, 5 + vadduwm 0, 1, 14 + lvx 1, 0, 7 + xxsldwi 46, 46, 46, 3 + xxsldwi 32, 32, 32, 3 + vperm 7, 7, 7, 1 + vadduwm 15, 9, 0 + xxlxor 32, 47, 0 + vperm 16, 0, 0, 2 + lvx 0, 0, 6 + addis 6, 2, .LCPI0_10@toc@ha + vcmpequh 0, 0, 17 + vadduwm 19, 16, 8 + xxlxor 40, 51, 41 + xxsel 45, 39, 45, 32 + vrlw 31, 8, 3 + lvx 8, 0, 4 + addis 4, 2, .LCPI0_11@toc@ha + addi 4, 4, .LCPI0_11@toc@l + vcmpequh 7, 8, 17 + vadduwm 8, 15, 13 + vadduwm 15, 31, 8 + lvx 8, 0, 4 + addi 4, 6, .LCPI0_10@toc@l + lvx 17, 0, 4 + addis 4, 2, .LCPI0_12@toc@ha + xxlxor 41, 47, 48 + xxsldwi 47, 47, 47, 1 + addi 4, 4, .LCPI0_12@toc@l + xxlnor 48, 39, 39 + vrlw 29, 9, 4 + vperm 9, 16, 16, 8 + xxland 48, 50, 39 + vperm 17, 30, 12, 17 + vperm 16, 16, 16, 8 + vmrghw 12, 12, 10 + lvx 10, 0, 4 + addis 4, 2, .LCPI0_13@toc@ha + vadduwm 19, 29, 19 + addi 4, 4, .LCPI0_13@toc@l + xxlxor 63, 51, 63 + xxsldwi 51, 51, 51, 3 + xxland 0, 49, 41 + vrlw 17, 31, 5 + xxlor 48, 0, 48 + xxswapd 0, 61 + vperm 18, 12, 18, 10 + vadduwm 15, 15, 16 + xxland 60, 48, 39 + vadduwm 15, 17, 15 + vperm 28, 28, 28, 8 + xxlxor 63, 47, 0 + vadduwm 15, 15, 18 + vperm 31, 31, 31, 2 + vperm 30, 18, 16, 6 + vadduwm 19, 31, 19 + xxlxor 44, 51, 49 + vrlw 12, 12, 3 + vadduwm 15, 12, 15 + xxlxor 49, 47, 63 + vperm 31, 13, 14, 11 + vrlw 17, 17, 4 + vperm 14, 14, 14, 1 + vadduwm 15, 15, 31 + vadduwm 19, 17, 19 + xxswapd 0, 49 + xxsldwi 47, 47, 47, 3 + xxsel 46, 46, 62, 32 + xxlxor 44, 51, 44 + 
xxsldwi 51, 51, 51, 1 + vrlw 12, 12, 5 + vadduwm 15, 12, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 19, 17, 19 + xxlxor 44, 51, 44 + vrlw 29, 12, 3 + vadduwm 12, 15, 14 + vadduwm 15, 29, 12 + lvx 12, 0, 4 + addis 4, 2, .LCPI0_14@toc@ha + addi 4, 4, .LCPI0_14@toc@l + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + vperm 30, 13, 18, 12 + vrlw 17, 17, 4 + vmrghw 13, 18, 13 + xxland 0, 62, 41 + vadduwm 19, 17, 19 + vperm 16, 13, 16, 10 + xxlxor 61, 51, 61 + xxsldwi 50, 51, 51, 3 + xxsldwi 51, 63, 63, 3 + vrlw 30, 29, 5 + xxlor 61, 60, 0 + xxswapd 0, 49 + vperm 31, 14, 19, 11 + vadduwm 15, 15, 29 + vperm 19, 19, 19, 1 + vadduwm 15, 30, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 16 + vperm 17, 17, 17, 2 + vadduwm 18, 17, 18 + xxlxor 45, 50, 62 + vperm 30, 16, 29, 6 + vrlw 13, 13, 3 + vadduwm 15, 13, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 31 + xxsldwi 63, 63, 63, 3 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 18, 17, 18 + xxswapd 0, 49 + xxlxor 45, 50, 45 + xxsldwi 50, 50, 50, 1 + vrlw 13, 13, 5 + vadduwm 15, 13, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 18, 17, 18 + xxlxor 45, 50, 45 + vrlw 28, 13, 3 + xxsel 45, 51, 62, 32 + xxland 51, 61, 39 + vperm 30, 14, 16, 12 + vadduwm 15, 15, 13 + vperm 19, 19, 19, 8 + vmrghw 14, 16, 14 + vadduwm 15, 28, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 62, 41 + vrlw 17, 17, 4 + xxlor 51, 51, 0 + vadduwm 15, 15, 19 + vadduwm 18, 17, 18 + xxswapd 0, 49 + xxlxor 60, 50, 60 + xxsldwi 48, 50, 50, 3 + vperm 18, 14, 29, 10 + vrlw 30, 28, 5 + vperm 29, 18, 19, 6 + vadduwm 15, 30, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 18 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 46, 48, 62 + vperm 30, 13, 31, 11 + vrlw 14, 14, 3 + vperm 31, 31, 31, 1 + vadduwm 15, 14, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 30 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 16, 17, 16 + xxswapd 0, 49 + xxlxor 46, 48, 46 + xxsldwi 48, 48, 48, 1 + vrlw 14, 14, 5 + vadduwm 15, 14, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 46, 48, 46 + vrlw 28, 14, 3 + xxsel 46, 63, 61, 32 + xxland 63, 51, 39 + vperm 29, 13, 18, 12 + vadduwm 15, 15, 14 + vperm 31, 31, 31, 8 + vmrghw 13, 18, 13 + vadduwm 15, 28, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 61, 41 + vrlw 17, 17, 4 + xxlor 63, 63, 0 + vperm 13, 13, 19, 10 + xxsldwi 51, 62, 62, 3 + vadduwm 15, 15, 31 + vperm 30, 14, 19, 11 + vadduwm 16, 17, 16 + xxswapd 0, 49 + xxlxor 60, 48, 60 + xxsldwi 48, 48, 48, 3 + vrlw 29, 28, 5 + vadduwm 15, 29, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 13 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 50, 48, 61 + vrlw 18, 18, 3 + vadduwm 15, 18, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 30 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 11, 17, 16 + xxswapd 0, 49 + xxlxor 48, 43, 50 + xxsldwi 43, 43, 43, 1 + vperm 18, 19, 19, 1 + vrlw 16, 16, 5 + vperm 19, 13, 31, 6 + vadduwm 15, 16, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 29, 17, 11 + xxlxor 43, 61, 48 + vrlw 16, 11, 3 + xxsel 43, 50, 51, 32 + xxland 50, 63, 39 + vperm 19, 14, 13, 12 + vadduwm 15, 15, 11 + vperm 18, 18, 18, 8 + vmrghw 13, 13, 14 + vadduwm 15, 16, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 51, 41 + lvx 19, 0, 4 + vrlw 17, 17, 4 + xxlor 50, 50, 0 + vperm 13, 13, 31, 10 + xxsldwi 63, 62, 62, 3 + vadduwm 15, 15, 18 + vperm 19, 11, 31, 19 + vadduwm 29, 17, 29 + xxswapd 0, 49 + vperm 1, 31, 31, 1 + xxlxor 48, 61, 48 + xxsldwi 46, 61, 61, 3 + vperm 6, 13, 18, 6 + vrlw 16, 16, 5 + xxsel 32, 33, 38, 32 + xxland 38, 50, 39 + vadduwm 15, 16, 
15 + vperm 7, 11, 13, 12 + xxlxor 49, 47, 0 + vadduwm 15, 15, 13 + vperm 17, 17, 17, 2 + vperm 6, 6, 6, 8 + vadduwm 14, 17, 14 + xxlxor 48, 46, 48 + vrlw 16, 16, 3 + vadduwm 15, 16, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 3 + vrlw 17, 17, 4 + vadduwm 15, 15, 19 + vadduwm 14, 17, 14 + xxswapd 0, 49 + xxlxor 48, 46, 48 + xxsldwi 46, 46, 46, 1 + vrlw 16, 16, 5 + vadduwm 15, 16, 15 + xxlxor 49, 47, 0 + vadduwm 0, 15, 0 + vperm 17, 17, 17, 2 + xxland 0, 39, 41 + xxlor 38, 38, 0 + vadduwm 14, 17, 14 + xxlxor 48, 46, 48 + vrlw 16, 16, 3 + vadduwm 0, 16, 0 + xxlxor 33, 32, 49 + xxsldwi 32, 32, 32, 1 + vrlw 1, 1, 4 + vadduwm 0, 0, 6 + vadduwm 8, 1, 14 + xxswapd 0, 33 + xxlxor 44, 40, 48 + xxsldwi 38, 40, 40, 3 + vrlw 7, 12, 5 + vadduwm 0, 7, 0 + xxlxor 33, 32, 0 + vperm 2, 1, 1, 2 + vmrghw 1, 13, 11 + vadduwm 6, 2, 6 + vperm 1, 1, 18, 10 + xxlxor 39, 38, 39 + vrlw 3, 7, 3 + vadduwm 0, 0, 1 + vadduwm 0, 3, 0 + xxlxor 34, 32, 34 + xxsldwi 0, 32, 32, 3 + vrlw 2, 2, 4 + vadduwm 4, 2, 6 + xxswapd 2, 34 + xxlxor 35, 36, 35 + xxsldwi 1, 36, 36, 1 + vrlw 3, 3, 5 + xxlxor 0, 1, 0 + xxswapd 0, 0 + xxlxor 1, 35, 2 + stxvd2x 0, 0, 3 + xxswapd 1, 1 + stxvd2x 1, 3, 5 + li 3, -16 + lxvd2x 63, 1, 3 + li 3, -32 + lxvd2x 62, 1, 3 + li 3, -48 + lxvd2x 61, 1, 3 + li 3, -64 + lxvd2x 60, 1, 3 + blr + .long 0 + .quad 0 +.Lfunc_end0: + .size zfs_blake3_compress_in_place_sse2, .Lfunc_end0-.Lfunc_begin0 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI1_0: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI1_1: + .long 1779033703 + .long 3144134277 + .long 1013904242 + .long 2773480762 +.LCPI1_2: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_3: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_4: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_5: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_6: + .short 1 + .short 2 + .short 4 + .short 8 + .short 16 + .short 32 + .short 64 + .short 128 +.LCPI1_7: + .short 0 + .short 0 + .short 4 + .short 8 + .short 0 + .short 0 + .short 64 + .short 128 +.LCPI1_8: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI1_9: + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 0 + .short 64 + .short 128 +.LCPI1_10: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 7 + .byte 6 + .byte 5 + .byte 4 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_11: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_12: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI1_13: + .byte 31 + .byte 30 + 
.byte 29 + .byte 28 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_14: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .text + .globl zfs_blake3_compress_xof_sse2 + .p2align 2 + .type zfs_blake3_compress_xof_sse2,@function +zfs_blake3_compress_xof_sse2: +.Lfunc_begin1: + .cfi_startproc +.Lfunc_gep1: + addis 2, 12, .TOC.-.Lfunc_gep1@ha + addi 2, 2, .TOC.-.Lfunc_gep1@l +.Lfunc_lep1: + .localentry zfs_blake3_compress_xof_sse2, .Lfunc_lep1-.Lfunc_gep1 + li 9, -80 + mtvsrd 35, 5 + li 5, 16 + lfdx 0, 0, 4 + addis 10, 2, .LCPI1_2@toc@ha + vspltisw 12, 9 + std 30, -16(1) + addis 12, 2, .LCPI1_8@toc@ha + addis 30, 2, .LCPI1_5@toc@ha + addis 11, 2, .LCPI1_7@toc@ha + stxvd2x 60, 1, 9 + li 9, -64 + mtvsrd 36, 7 + lfd 2, 16(4) + addi 10, 10, .LCPI1_2@toc@l + addi 12, 12, .LCPI1_8@toc@l + addi 11, 11, .LCPI1_7@toc@l + stxvd2x 61, 1, 9 + li 9, -48 + lfd 3, 24(4) + mtvsrwz 37, 6 + rldicl 6, 6, 32, 32 + lvx 9, 0, 10 + stxvd2x 62, 1, 9 + li 9, -32 + li 10, 32 + stxvd2x 63, 1, 9 + li 9, 0 + mtvsrd 34, 9 + xxmrghd 33, 3, 2 + lfd 1, 8(4) + vmrghb 3, 2, 3 + vmrghb 4, 2, 4 + vspltb 2, 2, 7 + xxmrghd 32, 1, 0 + lxvd2x 0, 0, 3 + lxvd2x 1, 3, 5 + vpkudum 7, 1, 0 + vmrglh 3, 2, 3 + vmrglh 2, 2, 4 + mtvsrwz 36, 6 + addis 6, 2, .LCPI1_0@toc@ha + addi 6, 6, .LCPI1_0@toc@l + vperm 10, 1, 0, 9 + vmrghw 4, 4, 5 + xxswapd 37, 1 + vmrglw 2, 2, 3 + xxswapd 35, 0 + lxvd2x 0, 4, 10 + xxspltd 62, 42, 1 + vadduwm 3, 7, 3 + vadduwm 6, 3, 5 + xxmrgld 36, 34, 36 + lvx 2, 0, 6 + addis 6, 2, .LCPI1_1@toc@ha + addi 6, 6, .LCPI1_1@toc@l + xxlxor 35, 38, 36 + lvx 4, 0, 6 + li 6, 48 + lxvd2x 1, 4, 6 + addis 4, 2, .LCPI1_3@toc@ha + vperm 8, 3, 3, 2 + vspltisw 3, 10 + addi 4, 4, .LCPI1_3@toc@l + xxswapd 41, 1 + vadduwm 3, 3, 3 + vadduwm 11, 8, 4 + xxlxor 36, 43, 37 + vadduwm 5, 6, 10 + vrlw 0, 4, 3 + vspltisw 4, 12 + vadduwm 4, 4, 4 + vadduwm 1, 0, 5 + xxlxor 37, 33, 40 + xxswapd 40, 0 + vrlw 6, 5, 4 + vspltisw 5, -16 + vpkudum 13, 9, 8 + vsubuwm 5, 12, 5 + lvx 12, 0, 4 + addis 4, 2, .LCPI1_4@toc@ha + addi 4, 4, .LCPI1_4@toc@l + vadduwm 11, 6, 11 + xxswapd 0, 38 + vadduwm 1, 1, 13 + xxsldwi 50, 45, 45, 1 + xxlxor 32, 43, 32 + xxsldwi 43, 43, 43, 3 + xxsldwi 33, 33, 33, 1 + vperm 12, 8, 9, 12 + vrlw 0, 0, 5 + vadduwm 1, 0, 1 + xxlxor 38, 33, 0 + vadduwm 1, 1, 12 + vperm 6, 6, 6, 2 + vadduwm 15, 6, 11 + lvx 11, 0, 4 + addis 4, 2, .LCPI1_6@toc@ha + addi 4, 4, .LCPI1_6@toc@l + xxlxor 32, 47, 32 + lvx 17, 0, 4 + addi 4, 30, .LCPI1_5@toc@l + vperm 14, 10, 7, 11 + vrlw 0, 0, 3 + vadduwm 1, 0, 1 + xxlxor 38, 33, 38 + vrlw 6, 6, 4 + vadduwm 8, 6, 15 + xxswapd 0, 38 + lvx 6, 0, 4 + addis 4, 2, .LCPI1_9@toc@ha + addi 4, 4, .LCPI1_9@toc@l + xxlxor 32, 40, 32 + xxsldwi 40, 40, 40, 1 + vperm 13, 12, 18, 6 + vrlw 9, 0, 5 + vadduwm 0, 1, 14 + lvx 1, 0, 12 + xxsldwi 46, 46, 46, 3 + xxsldwi 32, 32, 32, 3 + vperm 7, 7, 7, 1 + vadduwm 15, 9, 0 + xxlxor 32, 47, 0 + vperm 16, 0, 0, 2 + lvx 0, 0, 11 + addis 11, 2, .LCPI1_10@toc@ha + vcmpequh 0, 0, 17 + vadduwm 19, 16, 8 + xxlxor 40, 51, 41 + xxsel 45, 39, 45, 32 + vrlw 31, 8, 3 + lvx 8, 0, 4 + addis 4, 2, .LCPI1_11@toc@ha + addi 4, 4, .LCPI1_11@toc@l + vcmpequh 7, 8, 17 + vadduwm 8, 15, 13 + vadduwm 15, 31, 8 + lvx 8, 0, 4 + addi 4, 11, .LCPI1_10@toc@l + lvx 17, 0, 4 + addis 4, 2, .LCPI1_12@toc@ha + xxlxor 41, 47, 48 + xxsldwi 47, 47, 47, 1 + addi 4, 4, .LCPI1_12@toc@l + xxlnor 48, 39, 39 + vrlw 29, 
9, 4 + vperm 9, 16, 16, 8 + xxland 48, 50, 39 + vperm 17, 30, 12, 17 + vperm 16, 16, 16, 8 + vmrghw 12, 12, 10 + lvx 10, 0, 4 + addis 4, 2, .LCPI1_13@toc@ha + vadduwm 19, 29, 19 + addi 4, 4, .LCPI1_13@toc@l + xxlxor 63, 51, 63 + xxsldwi 51, 51, 51, 3 + xxland 0, 49, 41 + vrlw 17, 31, 5 + xxlor 48, 0, 48 + xxswapd 0, 61 + vperm 18, 12, 18, 10 + vadduwm 15, 15, 16 + xxland 60, 48, 39 + vadduwm 15, 17, 15 + vperm 28, 28, 28, 8 + xxlxor 63, 47, 0 + vadduwm 15, 15, 18 + vperm 31, 31, 31, 2 + vperm 30, 18, 16, 6 + vadduwm 19, 31, 19 + xxlxor 44, 51, 49 + vrlw 12, 12, 3 + vadduwm 15, 12, 15 + xxlxor 49, 47, 63 + vperm 31, 13, 14, 11 + vrlw 17, 17, 4 + vperm 14, 14, 14, 1 + vadduwm 15, 15, 31 + vadduwm 19, 17, 19 + xxswapd 0, 49 + xxsldwi 47, 47, 47, 3 + xxsel 46, 46, 62, 32 + xxlxor 44, 51, 44 + xxsldwi 51, 51, 51, 1 + vrlw 12, 12, 5 + vadduwm 15, 12, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 19, 17, 19 + xxlxor 44, 51, 44 + vrlw 29, 12, 3 + vadduwm 12, 15, 14 + vadduwm 15, 29, 12 + lvx 12, 0, 4 + addis 4, 2, .LCPI1_14@toc@ha + addi 4, 4, .LCPI1_14@toc@l + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + vperm 30, 13, 18, 12 + vrlw 17, 17, 4 + vmrghw 13, 18, 13 + xxland 0, 62, 41 + vadduwm 19, 17, 19 + vperm 16, 13, 16, 10 + xxlxor 61, 51, 61 + xxsldwi 50, 51, 51, 3 + xxsldwi 51, 63, 63, 3 + vrlw 30, 29, 5 + xxlor 61, 60, 0 + xxswapd 0, 49 + vperm 31, 14, 19, 11 + vadduwm 15, 15, 29 + vperm 19, 19, 19, 1 + vadduwm 15, 30, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 16 + vperm 17, 17, 17, 2 + vadduwm 18, 17, 18 + xxlxor 45, 50, 62 + vperm 30, 16, 29, 6 + vrlw 13, 13, 3 + vadduwm 15, 13, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 31 + xxsldwi 63, 63, 63, 3 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 18, 17, 18 + xxswapd 0, 49 + xxlxor 45, 50, 45 + xxsldwi 50, 50, 50, 1 + vrlw 13, 13, 5 + vadduwm 15, 13, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 18, 17, 18 + xxlxor 45, 50, 45 + vrlw 28, 13, 3 + xxsel 45, 51, 62, 32 + xxland 51, 61, 39 + vperm 30, 14, 16, 12 + vadduwm 15, 15, 13 + vperm 19, 19, 19, 8 + vmrghw 14, 16, 14 + vadduwm 15, 28, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 62, 41 + vrlw 17, 17, 4 + xxlor 51, 51, 0 + vadduwm 15, 15, 19 + vadduwm 18, 17, 18 + xxswapd 0, 49 + xxlxor 60, 50, 60 + xxsldwi 48, 50, 50, 3 + vperm 18, 14, 29, 10 + vrlw 30, 28, 5 + vperm 29, 18, 19, 6 + vadduwm 15, 30, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 18 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 46, 48, 62 + vperm 30, 13, 31, 11 + vrlw 14, 14, 3 + vperm 31, 31, 31, 1 + vadduwm 15, 14, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 30 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 16, 17, 16 + xxswapd 0, 49 + xxlxor 46, 48, 46 + xxsldwi 48, 48, 48, 1 + vrlw 14, 14, 5 + vadduwm 15, 14, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 46, 48, 46 + vrlw 28, 14, 3 + xxsel 46, 63, 61, 32 + xxland 63, 51, 39 + vperm 29, 13, 18, 12 + vadduwm 15, 15, 14 + vperm 31, 31, 31, 8 + vmrghw 13, 18, 13 + vadduwm 15, 28, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 61, 41 + vrlw 17, 17, 4 + xxlor 63, 63, 0 + vperm 13, 13, 19, 10 + xxsldwi 51, 62, 62, 3 + vadduwm 15, 15, 31 + vperm 30, 14, 19, 11 + vadduwm 16, 17, 16 + xxswapd 0, 49 + xxlxor 60, 48, 60 + xxsldwi 48, 48, 48, 3 + vrlw 29, 28, 5 + vadduwm 15, 29, 15 + xxlxor 49, 47, 0 + vadduwm 15, 15, 13 + vperm 17, 17, 17, 2 + vadduwm 16, 17, 16 + xxlxor 50, 48, 61 + vrlw 18, 18, 3 + vadduwm 15, 18, 15 + xxlxor 49, 47, 49 + vadduwm 15, 15, 30 + vrlw 17, 17, 4 + xxsldwi 47, 47, 47, 3 + vadduwm 11, 17, 
16 + xxswapd 0, 49 + xxlxor 48, 43, 50 + xxsldwi 43, 43, 43, 1 + vperm 18, 19, 19, 1 + vrlw 16, 16, 5 + vperm 19, 13, 31, 6 + vadduwm 15, 16, 15 + xxlxor 49, 47, 0 + vperm 17, 17, 17, 2 + vadduwm 29, 17, 11 + xxlxor 43, 61, 48 + vrlw 16, 11, 3 + xxsel 43, 50, 51, 32 + xxland 50, 63, 39 + vperm 19, 14, 13, 12 + vadduwm 15, 15, 11 + vperm 18, 18, 18, 8 + vmrghw 13, 13, 14 + vadduwm 15, 16, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 1 + xxland 0, 51, 41 + lvx 19, 0, 4 + vrlw 17, 17, 4 + xxlor 50, 50, 0 + vperm 13, 13, 31, 10 + xxsldwi 63, 62, 62, 3 + vadduwm 15, 15, 18 + vperm 19, 11, 31, 19 + vadduwm 29, 17, 29 + xxswapd 0, 49 + vperm 1, 31, 31, 1 + xxlxor 48, 61, 48 + xxsldwi 46, 61, 61, 3 + vperm 6, 13, 18, 6 + vrlw 16, 16, 5 + xxsel 32, 33, 38, 32 + xxland 38, 50, 39 + vadduwm 15, 16, 15 + vperm 7, 11, 13, 12 + xxlxor 49, 47, 0 + vadduwm 15, 15, 13 + vperm 17, 17, 17, 2 + vperm 6, 6, 6, 8 + vadduwm 14, 17, 14 + xxlxor 48, 46, 48 + vrlw 16, 16, 3 + vadduwm 15, 16, 15 + xxlxor 49, 47, 49 + xxsldwi 47, 47, 47, 3 + vrlw 17, 17, 4 + vadduwm 15, 15, 19 + vadduwm 14, 17, 14 + xxswapd 0, 49 + xxlxor 48, 46, 48 + xxsldwi 46, 46, 46, 1 + vrlw 16, 16, 5 + vadduwm 15, 16, 15 + xxlxor 49, 47, 0 + vadduwm 0, 15, 0 + vperm 17, 17, 17, 2 + xxland 0, 39, 41 + xxlor 38, 38, 0 + vadduwm 14, 17, 14 + xxlxor 48, 46, 48 + vrlw 16, 16, 3 + vadduwm 0, 16, 0 + xxlxor 33, 32, 49 + xxsldwi 32, 32, 32, 1 + vrlw 1, 1, 4 + vadduwm 0, 0, 6 + vadduwm 8, 1, 14 + xxswapd 0, 33 + xxlxor 44, 40, 48 + xxsldwi 38, 40, 40, 3 + vrlw 7, 12, 5 + vadduwm 0, 7, 0 + xxlxor 33, 32, 0 + vperm 2, 1, 1, 2 + vmrghw 1, 13, 11 + vadduwm 6, 2, 6 + vperm 1, 1, 18, 10 + xxlxor 39, 38, 39 + vrlw 3, 7, 3 + vadduwm 0, 0, 1 + vadduwm 0, 3, 0 + xxlxor 34, 32, 34 + xxsldwi 0, 32, 32, 3 + vrlw 2, 2, 4 + vadduwm 4, 2, 6 + xxswapd 2, 34 + xxlxor 35, 36, 35 + xxsldwi 1, 36, 36, 1 + vrlw 3, 3, 5 + xxlxor 0, 1, 0 + xxswapd 0, 0 + xxlxor 3, 35, 2 + stxvd2x 0, 0, 8 + xxswapd 3, 3 + stxvd2x 3, 8, 5 + lfdx 0, 0, 3 + lfd 3, 8(3) + xxmrghd 34, 3, 0 + xxlxor 0, 1, 34 + xxswapd 0, 0 + stxvd2x 0, 8, 10 + lfd 0, 16(3) + lfd 1, 24(3) + li 3, -32 + xxmrghd 34, 1, 0 + xxlxor 0, 2, 34 + xxswapd 0, 0 + stxvd2x 0, 8, 6 + lxvd2x 63, 1, 3 + li 3, -48 + ld 30, -16(1) + lxvd2x 62, 1, 3 + li 3, -64 + lxvd2x 61, 1, 3 + li 3, -80 + lxvd2x 60, 1, 3 + blr + .long 0 + .quad 0 +.Lfunc_end1: + .size zfs_blake3_compress_xof_sse2, .Lfunc_end1-.Lfunc_begin1 + .cfi_endproc + + .globl zfs_blake3_hash_many_sse2 + .p2align 2 + .type zfs_blake3_hash_many_sse2,@function +zfs_blake3_hash_many_sse2: +.Lfunc_begin2: + .cfi_startproc +.Lfunc_gep2: + addis 2, 12, .TOC.-.Lfunc_gep2@ha + addi 2, 2, .TOC.-.Lfunc_gep2@l +.Lfunc_lep2: + .localentry zfs_blake3_hash_many_sse2, .Lfunc_lep2-.Lfunc_gep2 + mfocrf 12, 32 + mflr 0 + std 0, 16(1) + stw 12, 8(1) + stdu 1, -256(1) + .cfi_def_cfa_offset 256 + .cfi_offset lr, 16 + .cfi_offset r17, -120 + .cfi_offset r18, -112 + .cfi_offset r19, -104 + .cfi_offset r20, -96 + .cfi_offset r21, -88 + .cfi_offset r22, -80 + .cfi_offset r23, -72 + .cfi_offset r24, -64 + .cfi_offset r25, -56 + .cfi_offset r26, -48 + .cfi_offset r27, -40 + .cfi_offset r28, -32 + .cfi_offset r29, -24 + .cfi_offset r30, -16 + .cfi_offset cr2, 8 + std 26, 208(1) + mr 26, 4 + cmpldi 1, 4, 4 + andi. 
4, 8, 1 + std 18, 144(1) + std 19, 152(1) + crmove 8, 1 + ld 19, 360(1) + lwz 18, 352(1) + std 24, 192(1) + std 25, 200(1) + std 27, 216(1) + std 28, 224(1) + mr 24, 10 + mr 28, 6 + mr 27, 5 + mr 25, 3 + std 29, 232(1) + std 30, 240(1) + mr 30, 9 + mr 29, 7 + std 17, 136(1) + std 20, 160(1) + std 21, 168(1) + std 22, 176(1) + std 23, 184(1) + blt 1, .LBB2_3 + li 3, 0 + li 4, 1 + clrldi 23, 30, 32 + isel 22, 4, 3, 8 + clrldi 21, 24, 32 + clrldi 20, 18, 32 +.LBB2_2: + mr 3, 25 + mr 4, 27 + mr 5, 28 + mr 6, 29 + mr 7, 22 + mr 8, 23 + mr 9, 21 + mr 10, 20 + std 19, 32(1) + bl blake3_hash4_sse2 + addi 26, 26, -4 + addi 3, 29, 4 + addi 25, 25, 32 + addi 19, 19, 128 + cmpldi 26, 3 + isel 29, 3, 29, 8 + bgt 0, .LBB2_2 +.LBB2_3: + cmpldi 26, 0 + beq 0, .LBB2_11 + li 3, 0 + li 4, 1 + or 21, 24, 30 + li 20, 16 + addi 24, 1, 96 + isel 22, 4, 3, 8 +.LBB2_5: + lxvd2x 0, 28, 20 + ld 23, 0(25) + mr 17, 27 + mr 3, 21 + stxvd2x 0, 24, 20 + lxvd2x 0, 0, 28 + stxvd2x 0, 0, 24 +.LBB2_6: + cmpldi 17, 1 + beq 0, .LBB2_8 + cmpldi 17, 0 + bne 0, .LBB2_9 + b .LBB2_10 +.LBB2_8: + or 3, 3, 18 +.LBB2_9: + clrldi 7, 3, 56 + mr 3, 24 + mr 4, 23 + li 5, 64 + mr 6, 29 + bl zfs_blake3_compress_in_place_sse2 + addi 23, 23, 64 + addi 17, 17, -1 + mr 3, 30 + b .LBB2_6 +.LBB2_10: + lxvd2x 0, 24, 20 + addi 26, 26, -1 + add 29, 29, 22 + addi 25, 25, 8 + cmpldi 26, 0 + stxvd2x 0, 19, 20 + lxvd2x 0, 0, 24 + stxvd2x 0, 0, 19 + addi 19, 19, 32 + bne 0, .LBB2_5 +.LBB2_11: + ld 30, 240(1) + ld 29, 232(1) + ld 28, 224(1) + ld 27, 216(1) + ld 26, 208(1) + ld 25, 200(1) + ld 24, 192(1) + ld 23, 184(1) + ld 22, 176(1) + ld 21, 168(1) + ld 20, 160(1) + ld 19, 152(1) + ld 18, 144(1) + ld 17, 136(1) + addi 1, 1, 256 + ld 0, 16(1) + lwz 12, 8(1) + mtocrf 32, 12 + mtlr 0 + blr + .long 0 + .quad 0 +.Lfunc_end2: + .size zfs_blake3_hash_many_sse2, .Lfunc_end2-.Lfunc_begin2 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI3_0: + .quad 4294967296 + .quad 12884901890 +.LCPI3_1: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI3_2: + .long 1779033703 + .long 1779033703 + .long 1779033703 + .long 1779033703 +.LCPI3_3: + .long 3144134277 + .long 3144134277 + .long 3144134277 + .long 3144134277 +.LCPI3_4: + .long 1013904242 + .long 1013904242 + .long 1013904242 + .long 1013904242 +.LCPI3_5: + .long 2773480762 + .long 2773480762 + .long 2773480762 + .long 2773480762 + .text + .p2align 2 + .type blake3_hash4_sse2,@function +blake3_hash4_sse2: +.Lfunc_begin3: + .cfi_startproc +.Lfunc_gep3: + addis 2, 12, .TOC.-.Lfunc_gep3@ha + addi 2, 2, .TOC.-.Lfunc_gep3@l +.Lfunc_lep3: + .localentry blake3_hash4_sse2, .Lfunc_lep3-.Lfunc_gep3 + stdu 1, -400(1) + .cfi_def_cfa_offset 400 + .cfi_offset r22, -152 + .cfi_offset r23, -144 + .cfi_offset r24, -136 + .cfi_offset r25, -128 + .cfi_offset r26, -120 + .cfi_offset r27, -112 + .cfi_offset r28, -104 + .cfi_offset r29, -96 + .cfi_offset r30, -88 + .cfi_offset f23, -72 + .cfi_offset f24, -64 + .cfi_offset f25, -56 + .cfi_offset f26, -48 + .cfi_offset f27, -40 + .cfi_offset f28, -32 + .cfi_offset f29, -24 + .cfi_offset f30, -16 + .cfi_offset f31, -8 + .cfi_offset v20, -352 + .cfi_offset v21, -336 + .cfi_offset v22, -320 + .cfi_offset v23, -304 + .cfi_offset v24, -288 + .cfi_offset v25, -272 + .cfi_offset v26, -256 + .cfi_offset v27, -240 + .cfi_offset v28, -224 + .cfi_offset v29, -208 + .cfi_offset v30, -192 + .cfi_offset v31, -176 + li 11, 48 + li 0, 8 + std 30, 
312(1) + li 30, 12 + li 12, 4 + lfiwzx 0, 0, 5 + stxvd2x 52, 1, 11 + li 11, 64 + lfiwzx 2, 5, 0 + li 0, 20 + lfiwzx 3, 5, 30 + stxvd2x 53, 1, 11 + li 11, 80 + li 30, 24 + lfiwzx 4, 5, 0 + li 0, 28 + stxvd2x 54, 1, 11 + li 11, 96 + lfiwzx 1, 5, 12 + lfiwzx 6, 5, 30 + xxspltw 45, 0, 1 + cmpldi 4, 0 + std 22, 248(1) + stxvd2x 55, 1, 11 + li 11, 112 + lfiwzx 7, 5, 0 + xxspltw 40, 2, 1 + std 23, 256(1) + xxspltw 38, 3, 1 + xxspltw 50, 4, 1 + std 24, 264(1) + std 25, 272(1) + std 26, 280(1) + xxspltw 54, 7, 1 + std 27, 288(1) + std 28, 296(1) + std 29, 304(1) + stxvd2x 56, 1, 11 + li 11, 128 + stfd 23, 328(1) + stxvd2x 57, 1, 11 + li 11, 144 + stfd 24, 336(1) + stxvd2x 58, 1, 11 + li 11, 160 + stfd 25, 344(1) + stxvd2x 59, 1, 11 + li 11, 176 + xxspltw 59, 1, 1 + stxvd2x 60, 1, 11 + li 11, 192 + stfd 26, 352(1) + stxvd2x 61, 1, 11 + li 11, 208 + stfd 27, 360(1) + stxvd2x 62, 1, 11 + li 11, 224 + xxspltw 62, 6, 1 + stxvd2x 63, 1, 11 + li 11, 16 + stfd 28, 368(1) + lfiwzx 5, 5, 11 + ld 5, 432(1) + stfd 29, 376(1) + stfd 30, 384(1) + stfd 31, 392(1) + xxspltw 61, 5, 1 + beq 0, .LBB3_5 + addis 30, 2, .LCPI3_0@toc@ha + neg 7, 7 + xxleqv 34, 34, 34 + addis 28, 2, .LCPI3_2@toc@ha + addis 27, 2, .LCPI3_3@toc@ha + addis 26, 2, .LCPI3_4@toc@ha + addis 25, 2, .LCPI3_5@toc@ha + ld 29, 24(3) + addi 0, 30, .LCPI3_0@toc@l + mtfprwz 1, 7 + addis 7, 2, .LCPI3_1@toc@ha + ld 30, 16(3) + lxvd2x 0, 0, 0 + mtfprwz 2, 6 + rldicl 6, 6, 32, 32 + addi 0, 7, .LCPI3_1@toc@l + ld 7, 8(3) + vslw 2, 2, 2 + lvx 5, 0, 0 + addi 0, 28, .LCPI3_2@toc@l + addi 28, 27, .LCPI3_3@toc@l + addi 27, 26, .LCPI3_4@toc@l + addi 26, 25, .LCPI3_5@toc@l + or 25, 9, 8 + li 9, 0 + xxspltw 36, 2, 1 + xxswapd 35, 0 + xxspltw 0, 1, 1 + xxland 35, 0, 35 + mtfprwz 0, 6 + ld 6, 0(3) + addi 3, 3, -8 + vadduwm 4, 3, 4 + xxlor 35, 35, 34 + xxlxor 34, 36, 34 + xxlor 9, 36, 36 + vspltisw 4, 4 + vcmpgtsw 2, 3, 2 + xxspltw 35, 0, 1 + xxlor 10, 36, 36 + vsubuwm 2, 3, 2 + xxlor 11, 34, 34 + lvx 2, 0, 0 + li 0, 32 + xxlor 12, 34, 34 + lvx 2, 0, 28 + li 28, 48 + xxlor 13, 34, 34 + lvx 2, 0, 27 + li 27, 0 + xxlor 31, 34, 34 + lvx 2, 0, 26 + xxlor 30, 34, 34 +.LBB3_2: + mr 26, 27 + addi 27, 27, 1 + xxlor 28, 40, 40 + cmpld 27, 4 + sldi 26, 26, 6 + xxlor 24, 45, 45 + iseleq 24, 10, 9 + add 23, 6, 26 + add 22, 30, 26 + lxvd2x 0, 6, 26 + lxvd2x 1, 7, 26 + or 25, 24, 25 + add 24, 7, 26 + lxvd2x 2, 30, 26 + lxvd2x 3, 29, 26 + xxlor 29, 38, 38 + lxvd2x 4, 23, 11 + lxvd2x 6, 24, 11 + clrlwi 25, 25, 24 + lxvd2x 7, 22, 11 + lxvd2x 8, 23, 0 + mtfprd 5, 25 + add 25, 29, 26 + xxswapd 34, 0 + lxvd2x 0, 25, 11 + xxswapd 36, 1 + xxswapd 33, 2 + lxvd2x 1, 24, 0 + lxvd2x 2, 22, 0 + xxswapd 39, 3 + xxswapd 32, 4 + lxvd2x 3, 25, 0 + lxvd2x 4, 23, 28 + xxswapd 49, 6 + xxswapd 51, 7 + lxvd2x 6, 24, 28 + xxswapd 58, 8 + lxvd2x 7, 22, 28 + lxvd2x 8, 25, 28 + xxswapd 60, 0 + mr 25, 3 + xxswapd 57, 1 + xxswapd 53, 2 + xxswapd 52, 3 + xxswapd 56, 4 + xxswapd 55, 6 + xxswapd 0, 5 + xxswapd 40, 7 + xxswapd 41, 8 + mtctr 12 +.LBB3_3: + ldu 24, 8(25) + add 24, 24, 26 + addi 24, 24, 256 + dcbt 0, 24 + bdnz .LBB3_3 + vmrgew 3, 4, 2 + vspltisw 31, 9 + mr 25, 8 + vmrglw 10, 4, 2 + vspltisw 14, 10 + vmrghw 6, 4, 2 + xxspltw 0, 0, 3 + vmrgew 4, 17, 0 + vmrglw 11, 17, 0 + vmrghw 16, 17, 0 + vmrgew 0, 25, 26 + vmrgew 13, 7, 1 + vmrglw 2, 7, 1 + vmrghw 7, 7, 1 + xxlor 25, 36, 36 + vmrgew 4, 28, 19 + xxlor 26, 32, 32 + vmrglw 0, 25, 26 + vmrglw 1, 28, 19 + xxmrgld 47, 34, 42 + xxlor 44, 28, 28 + vmrghw 25, 25, 26 + xxlor 23, 36, 36 + vmrghw 4, 28, 19 + vspltisw 19, -16 + xxlor 5, 32, 32 + vmrgew 0, 20, 21 
+ xxmrgld 34, 33, 43 + vmrglw 28, 20, 21 + vmrghw 21, 20, 21 + vmrglw 20, 23, 24 + vmrghw 26, 23, 24 + vmrglw 17, 9, 8 + xxlor 8, 32, 32 + vmrgew 0, 23, 24 + xxmrgld 56, 39, 38 + vmrgew 23, 9, 8 + xxlor 33, 24, 24 + xxlor 2, 34, 34 + vadduwm 11, 15, 1 + xxmrgld 33, 36, 48 + xxlor 6, 47, 47 + xxlor 27, 32, 32 + vmrghw 0, 9, 8 + vspltisw 9, 12 + vsubuwm 8, 31, 19 + xxmrgld 51, 23, 25 + vadduwm 31, 2, 12 + xxlor 34, 10, 10 + vadduwm 10, 14, 14 + vslw 15, 2, 2 + xxlor 34, 29, 29 + vadduwm 14, 24, 27 + xxlor 24, 48, 48 + vadduwm 16, 1, 2 + xxmrgld 34, 45, 35 + vadduwm 31, 31, 30 + xxmrghd 36, 36, 24 + vadduwm 11, 11, 29 + vadduwm 14, 14, 18 + vadduwm 13, 16, 22 + xxlxor 47, 63, 47 + xxlor 1, 9, 9 + xxlor 1, 11, 11 + xxlxor 48, 43, 9 + vadduwm 11, 11, 2 + xxlor 7, 34, 34 + xxmrghd 34, 39, 38 + xxlxor 39, 46, 11 + xxlor 1, 50, 50 + xxlxor 50, 45, 0 + vperm 15, 15, 15, 5 + vperm 16, 16, 16, 5 + vperm 7, 7, 7, 5 + vperm 18, 18, 18, 5 + xxlor 4, 33, 33 + xxlor 33, 31, 31 + vadduwm 14, 14, 2 + xxlor 3, 34, 34 + xxlor 34, 12, 12 + xxlor 35, 13, 13 + vadduwm 6, 15, 1 + xxlor 33, 30, 30 + vadduwm 2, 16, 2 + vadduwm 3, 7, 3 + vadduwm 12, 18, 1 + xxlxor 59, 34, 61 + xxlxor 61, 35, 1 + xxlxor 33, 38, 62 + xxlxor 62, 44, 54 + vrlw 22, 27, 10 + vrlw 29, 29, 10 + vrlw 1, 1, 10 + vrlw 30, 30, 10 + vadduwm 31, 31, 19 + vadduwm 13, 13, 4 + vadduwm 11, 22, 11 + vadduwm 14, 29, 14 + vadduwm 31, 1, 31 + vadduwm 13, 30, 13 + vadduwm 9, 9, 9 + xxlor 1, 36, 36 + xxlxor 48, 43, 48 + xxlxor 36, 46, 39 + xxmrgld 39, 60, 5 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 28, 4, 9 + xxmrgld 36, 53, 57 + vrlw 15, 15, 9 + xxmrghd 57, 53, 57 + vrlw 18, 18, 9 + vadduwm 14, 14, 4 + xxlor 0, 36, 36 + xxmrgld 36, 49, 52 + vadduwm 2, 16, 2 + xxmrgld 49, 8, 26 + vadduwm 3, 28, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 54, 34, 54 + xxlxor 61, 35, 61 + xxlxor 33, 38, 33 + xxlxor 62, 44, 62 + vrlw 29, 29, 8 + vrlw 20, 1, 8 + xxmrgld 33, 55, 27 + vrlw 30, 30, 8 + vrlw 22, 22, 8 + vadduwm 11, 11, 7 + xxlor 5, 39, 39 + xxmrgld 39, 32, 58 + vadduwm 31, 31, 4 + vadduwm 11, 29, 11 + vadduwm 13, 13, 7 + vadduwm 14, 20, 14 + vadduwm 31, 30, 31 + vadduwm 13, 22, 13 + xxlor 28, 36, 36 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 60 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vadduwm 11, 11, 17 + vmr 28, 17 + xxmrghd 49, 32, 58 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 21, 4, 2 + vadduwm 3, 15, 3 + xxlxor 34, 38, 61 + xxlxor 61, 44, 52 + xxlxor 62, 53, 62 + xxlxor 54, 35, 54 + vrlw 20, 2, 10 + vrlw 29, 29, 10 + vrlw 0, 30, 10 + vrlw 30, 22, 10 + vadduwm 14, 14, 25 + vadduwm 31, 31, 1 + vadduwm 13, 13, 17 + vadduwm 11, 20, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vadduwm 11, 11, 24 + xxlor 8, 56, 56 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 21 + vadduwm 3, 15, 3 + xxlxor 55, 38, 52 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 56, 32 + vrlw 30, 30, 8 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + xxlor 25, 51, 51 + vmr 26, 17 + xxlor 49, 3, 3 + xxlor 52, 1, 1 + xxlor 51, 2, 2 + vadduwm 14, 14, 17 + vadduwm 31, 31, 20 + vadduwm 13, 13, 19 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 
15, 15, 15, 5 + vperm 18, 18, 18, 5 + xxlor 29, 39, 39 + xxlor 59, 4, 4 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 30, 30, 10 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + xxlor 53, 0, 0 + xxlor 39, 6, 6 + vadduwm 11, 11, 27 + vadduwm 14, 14, 21 + vadduwm 31, 31, 7 + vadduwm 13, 13, 1 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + xxlor 34, 7, 7 + vadduwm 31, 31, 28 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vrlw 30, 30, 8 + vadduwm 11, 11, 2 + xxlor 34, 28, 28 + vadduwm 13, 13, 26 + vadduwm 14, 14, 2 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + xxlor 2, 58, 58 + xxlor 39, 25, 25 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 32, 56, 32 + xxlxor 62, 35, 62 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vrlw 30, 30, 10 + xxlor 54, 29, 29 + xxlor 58, 5, 5 + vadduwm 11, 11, 25 + vadduwm 14, 14, 7 + vadduwm 31, 31, 22 + vadduwm 13, 13, 26 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vadduwm 11, 11, 17 + vadduwm 14, 14, 21 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 56, 32 + vrlw 30, 30, 8 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vadduwm 31, 31, 1 + vadduwm 13, 13, 20 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + xxlor 0, 33, 33 + xxlor 33, 8, 8 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 30, 30, 10 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vadduwm 11, 11, 19 + vadduwm 14, 14, 2 + vadduwm 31, 31, 1 + vadduwm 13, 13, 22 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + vadduwm 11, 11, 27 + vadduwm 14, 14, 28 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vrlw 30, 30, 8 + vadduwm 31, 31, 25 + vadduwm 13, 13, 26 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + xxlor 3, 7, 7 
+ vadduwm 11, 11, 7 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 32, 56, 32 + xxlxor 62, 35, 62 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vrlw 30, 30, 10 + xxlor 33, 6, 6 + xxlor 58, 2, 2 + xxlor 39, 3, 3 + vadduwm 14, 14, 1 + vadduwm 31, 31, 26 + vadduwm 13, 13, 7 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + xxlor 52, 0, 0 + vadduwm 11, 11, 21 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 56, 32 + vrlw 30, 30, 8 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vadduwm 14, 14, 2 + vadduwm 31, 31, 22 + vadduwm 13, 13, 20 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + xxlor 7, 49, 49 + vmr 17, 2 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 30, 30, 10 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + xxlor 54, 1, 1 + xxlor 34, 7, 7 + vadduwm 11, 11, 22 + vadduwm 14, 14, 28 + vadduwm 31, 31, 2 + vadduwm 13, 13, 26 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + xxlor 59, 25, 25 + vadduwm 11, 11, 19 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vrlw 30, 30, 8 + vadduwm 14, 14, 25 + vadduwm 31, 31, 27 + vadduwm 13, 13, 7 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vmr 2, 19 + xxlor 0, 7, 7 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 32, 56, 32 + xxlxor 62, 35, 62 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vrlw 30, 30, 10 + xxlor 1, 51, 51 + xxlor 7, 39, 39 + xxlor 51, 8, 8 + xxlor 39, 5, 5 + xxlor 34, 4, 4 + vadduwm 11, 11, 1 + vadduwm 14, 14, 19 + vadduwm 31, 31, 7 + vadduwm 13, 13, 2 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + xxlor 2, 53, 53 + vmr 21, 28 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 56, 32 + vrlw 30, 30, 8 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + xxlor 53, 29, 29 + vadduwm 11, 11, 17 + vadduwm 14, 14, 28 + vadduwm 31, 31, 26 + vadduwm 13, 13, 21 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 
4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + vadduwm 11, 11, 20 + xxlor 5, 52, 52 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 30, 30, 10 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + xxlor 52, 2, 2 + vadduwm 14, 14, 25 + vadduwm 31, 31, 20 + vadduwm 13, 13, 7 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + vadduwm 11, 11, 22 + vadduwm 14, 14, 27 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vrlw 30, 30, 8 + vadduwm 31, 31, 1 + vadduwm 13, 13, 2 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + xxlor 3, 29, 29 + xxlor 4, 49, 49 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 32, 56, 32 + xxlxor 62, 35, 62 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vrlw 30, 30, 10 + vmr 17, 28 + xxlor 2, 54, 54 + xxlor 3, 34, 34 + xxlor 34, 8, 8 + xxlor 51, 0, 0 + xxlor 60, 7, 7 + xxlor 54, 1, 1 + vadduwm 11, 11, 2 + vadduwm 14, 14, 19 + vadduwm 31, 31, 28 + vadduwm 13, 13, 22 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vadduwm 11, 11, 17 + vadduwm 14, 14, 25 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 56, 32 + vrlw 30, 30, 8 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vadduwm 31, 31, 7 + vadduwm 13, 13, 26 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + xxlor 6, 39, 39 + xxlor 39, 4, 4 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 30, 30, 10 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vadduwm 11, 11, 21 + vadduwm 14, 14, 27 + vadduwm 31, 31, 7 + vadduwm 13, 13, 28 + vadduwm 11, 30, 11 + vadduwm 14, 23, 14 + vadduwm 31, 29, 31 + vadduwm 13, 0, 13 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 63, 47 + xxlxor 50, 45, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + xxlor 0, 49, 49 + xxlor 49, 5, 5 + vadduwm 24, 16, 24 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 62, 56, 62 + xxlxor 55, 35, 55 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 23, 23, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + vrlw 30, 30, 8 + vadduwm 11, 11, 17 + vadduwm 14, 14, 1 + vadduwm 31, 31, 2 + vadduwm 13, 13, 22 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 
+ vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + xxlor 34, 3, 3 + xxlor 49, 2, 2 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 24, 4, 24 + vadduwm 3, 15, 3 + xxlxor 55, 38, 55 + xxlxor 61, 44, 61 + xxlxor 32, 56, 32 + xxlxor 62, 35, 62 + vrlw 23, 23, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + vrlw 30, 30, 10 + vadduwm 11, 11, 19 + vadduwm 14, 14, 20 + vadduwm 31, 31, 2 + vadduwm 13, 13, 17 + vadduwm 11, 23, 11 + vadduwm 14, 29, 14 + vadduwm 31, 0, 31 + vadduwm 13, 30, 13 + xxlxor 50, 43, 50 + xxlxor 48, 46, 48 + xxlxor 36, 63, 36 + xxlxor 47, 45, 47 + vrlw 18, 18, 9 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vadduwm 14, 14, 27 + vadduwm 11, 11, 25 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 27, 4, 24 + vadduwm 3, 15, 3 + xxlxor 57, 38, 55 + xxlxor 61, 44, 61 + xxlxor 62, 35, 62 + xxlxor 32, 59, 32 + xxlor 39, 7, 7 + vrlw 30, 30, 8 + vrlw 25, 25, 8 + vrlw 29, 29, 8 + vrlw 0, 0, 8 + xxlor 1, 58, 58 + vmr 26, 19 + vadduwm 19, 31, 7 + xxlor 39, 6, 6 + vadduwm 11, 30, 11 + vadduwm 7, 13, 7 + vadduwm 13, 25, 14 + vadduwm 14, 29, 19 + vadduwm 7, 0, 7 + xxlxor 48, 43, 48 + xxlxor 36, 45, 36 + xxlxor 47, 46, 47 + xxlxor 50, 39, 50 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + vperm 18, 18, 18, 5 + xxlor 51, 1, 1 + vadduwm 13, 13, 1 + vadduwm 11, 11, 19 + vadduwm 19, 16, 27 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 63, 51, 62 + xxlxor 62, 35, 57 + xxlxor 61, 38, 61 + xxlxor 32, 44, 32 + vrlw 31, 31, 10 + vrlw 30, 30, 10 + vrlw 29, 29, 10 + vrlw 0, 0, 10 + xxlor 33, 0, 0 + vadduwm 7, 7, 2 + vadduwm 14, 14, 1 + vadduwm 11, 31, 11 + vadduwm 13, 30, 13 + vadduwm 14, 29, 14 + vadduwm 7, 0, 7 + xxlxor 48, 43, 48 + xxlxor 36, 45, 36 + xxlxor 47, 46, 47 + xxlxor 50, 39, 50 + vrlw 16, 16, 9 + vrlw 4, 4, 9 + vrlw 15, 15, 9 + vrlw 18, 18, 9 + xxlor 60, 8, 8 + vadduwm 1, 11, 21 + vadduwm 11, 13, 28 + vadduwm 13, 16, 19 + vadduwm 3, 4, 3 + vadduwm 6, 15, 6 + vadduwm 12, 18, 12 + xxlxor 51, 45, 63 + xxlxor 63, 35, 62 + xxlxor 62, 38, 61 + xxlxor 32, 44, 32 + vrlw 31, 31, 8 + vrlw 30, 30, 8 + vrlw 0, 0, 8 + vrlw 19, 19, 8 + vadduwm 14, 14, 26 + vadduwm 7, 7, 17 + vadduwm 1, 31, 1 + vadduwm 11, 30, 11 + vadduwm 14, 0, 14 + vadduwm 7, 19, 7 + xxlxor 50, 33, 50 + xxlxor 48, 43, 48 + xxlxor 36, 46, 36 + xxlxor 47, 39, 47 + vperm 18, 18, 18, 5 + vperm 16, 16, 16, 5 + vperm 4, 4, 4, 5 + vperm 15, 15, 15, 5 + xxlor 34, 4, 4 + vadduwm 14, 14, 22 + vadduwm 6, 18, 6 + vadduwm 12, 16, 12 + vadduwm 13, 4, 13 + vadduwm 3, 15, 3 + xxlxor 49, 38, 63 + xxlxor 63, 44, 62 + xxlxor 32, 45, 32 + xxlxor 51, 35, 51 + vrlw 17, 17, 10 + vrlw 31, 31, 10 + vrlw 0, 0, 10 + vrlw 10, 19, 10 + vadduwm 11, 11, 2 + xxlor 34, 5, 5 + vadduwm 1, 1, 20 + vadduwm 2, 7, 2 + vadduwm 7, 31, 11 + vadduwm 11, 0, 14 + vadduwm 2, 10, 2 + vadduwm 1, 17, 1 + xxlxor 36, 43, 36 + xxlxor 46, 34, 47 + vrlw 4, 4, 9 + vrlw 14, 14, 9 + xxlxor 47, 33, 50 + xxlxor 48, 39, 48 + vrlw 15, 15, 9 + vrlw 9, 16, 9 + vadduwm 13, 4, 13 + vadduwm 3, 14, 3 + xxlxor 32, 45, 32 + xxlxor 45, 45, 33 + xxlxor 33, 35, 42 + xxlxor 59, 35, 39 + vadduwm 3, 15, 6 + vadduwm 6, 9, 12 + xxlxor 39, 35, 49 + xxlxor 42, 38, 63 + vrlw 1, 1, 8 + vrlw 7, 7, 8 + vrlw 10, 10, 8 + vrlw 0, 0, 8 + xxlxor 40, 35, 43 + xxlxor 38, 38, 34 + xxlxor 61, 33, 41 + xxlxor 50, 39, 36 + xxlxor 62, 42, 46 + xxlxor 54, 32, 47 + bne 0, .LBB3_2 +.LBB3_5: + vmrglw 2, 27, 13 + li 3, 32 + li 4, 48 + vmrglw 4, 6, 8 + vmrglw 0, 18, 29 + vmrglw 1, 22, 30 + vmrghw 3, 27, 13 + vmrghw 5, 6, 8 + vmrghw 6, 18, 29 + vmrghw 7, 22, 30 + xxmrgld 40, 36, 34 + xxmrghd 34, 36, 34 + 
xxmrgld 41, 33, 32 + xxswapd 0, 40 + xxmrgld 36, 37, 35 + xxmrghd 35, 37, 35 + xxmrghd 37, 33, 32 + xxswapd 1, 41 + xxmrgld 32, 39, 38 + xxmrghd 33, 39, 38 + xxswapd 2, 34 + xxswapd 4, 36 + xxswapd 3, 37 + stxvd2x 0, 0, 5 + xxswapd 5, 32 + stxvd2x 1, 5, 11 + xxswapd 0, 35 + xxswapd 1, 33 + stxvd2x 2, 5, 3 + li 3, 64 + stxvd2x 3, 5, 4 + li 4, 80 + stxvd2x 4, 5, 3 + li 3, 96 + stxvd2x 5, 5, 4 + li 4, 112 + stxvd2x 0, 5, 3 + stxvd2x 1, 5, 4 + li 3, 224 + lxvd2x 63, 1, 3 + li 3, 208 + lfd 31, 392(1) + ld 30, 312(1) + ld 29, 304(1) + lxvd2x 62, 1, 3 + li 3, 192 + lfd 30, 384(1) + ld 28, 296(1) + ld 27, 288(1) + lxvd2x 61, 1, 3 + li 3, 176 + lfd 29, 376(1) + ld 26, 280(1) + ld 25, 272(1) + lxvd2x 60, 1, 3 + li 3, 160 + lfd 28, 368(1) + ld 24, 264(1) + ld 23, 256(1) + lxvd2x 59, 1, 3 + li 3, 144 + lfd 27, 360(1) + ld 22, 248(1) + lxvd2x 58, 1, 3 + li 3, 128 + lfd 26, 352(1) + lxvd2x 57, 1, 3 + li 3, 112 + lfd 25, 344(1) + lxvd2x 56, 1, 3 + li 3, 96 + lfd 24, 336(1) + lxvd2x 55, 1, 3 + li 3, 80 + lfd 23, 328(1) + lxvd2x 54, 1, 3 + li 3, 64 + lxvd2x 53, 1, 3 + li 3, 48 + lxvd2x 52, 1, 3 + addi 1, 1, 400 + blr + .long 0 + .quad 0 +.Lfunc_end3: + .size blake3_hash4_sse2, .Lfunc_end3-.Lfunc_begin3 + .cfi_endproc + .section ".note.GNU-stack","",@progbits +#endif diff --git a/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S new file mode 100644 index 000000000000..a8b2627f12b0 --- /dev/null +++ b/module/icp/asm-ppc64/blake3/b3_ppc64le_sse41.S @@ -0,0 +1,3064 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2022 Samuel Neves + * Copyright (c) 2022 Tino Reichardt + * + * This is converted assembly: SSE4.1 -> POWER8 PPC64 Little Endian + * Used tools: SIMDe https://github.com/simd-everywhere/simde + */ + +#if (defined(__PPC64__) && defined(__LITTLE_ENDIAN__)) + .text + .abiversion 2 + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI0_0: + .byte 31 + .byte 14 + .byte 13 + .byte 12 + .byte 30 + .byte 10 + .byte 9 + .byte 8 + .byte 29 + .byte 6 + .byte 5 + .byte 4 + .byte 28 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_1: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI0_2: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI0_3: + .long 1779033703 + .long 3144134277 + .long 1013904242 + .long 2773480762 +.LCPI0_4: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_5: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 +.LCPI0_6: + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 26 + .byte 25 + .byte 24 + .byte 27 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 18 + .byte 17 + .byte 16 + .byte 19 +.LCPI0_7: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_8: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_9: + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_10: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 31 + .byte 31 + .byte 31 +.LCPI0_11: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI0_12: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI0_13: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI0_14: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .text + .globl zfs_blake3_compress_in_place_sse41 + .p2align 2 + .type zfs_blake3_compress_in_place_sse41,@function +zfs_blake3_compress_in_place_sse41: +.Lfunc_begin0: + .cfi_startproc +.Lfunc_gep0: + addis 2, 12, .TOC.-.Lfunc_gep0@ha + addi 
2, 2, .TOC.-.Lfunc_gep0@l +.Lfunc_lep0: + .localentry zfs_blake3_compress_in_place_sse41, .Lfunc_lep0-.Lfunc_gep0 + li 8, -64 + mtvsrd 34, 5 + li 5, 16 + lfdx 0, 0, 4 + vspltisw 13, -16 + stxvd2x 60, 1, 8 + li 8, -48 + mtvsrd 35, 7 + lfd 2, 16(4) + lfd 3, 24(4) + addis 7, 2, .LCPI0_0@toc@ha + stxvd2x 61, 1, 8 + li 8, -32 + mtvsrwz 36, 6 + rldicl 6, 6, 32, 32 + stxvd2x 62, 1, 8 + li 8, -16 + vmrghb 2, 3, 2 + stxvd2x 63, 1, 8 + mtvsrwz 35, 6 + addi 6, 7, .LCPI0_0@toc@l + addis 7, 2, .LCPI0_2@toc@ha + lfd 1, 8(4) + xxmrghd 32, 3, 2 + lvx 6, 0, 6 + xxlxor 33, 33, 33 + addis 6, 2, .LCPI0_1@toc@ha + addi 7, 7, .LCPI0_2@toc@l + vmrghw 3, 3, 4 + addi 6, 6, .LCPI0_1@toc@l + vspltisw 14, 9 + xxmrghd 37, 1, 0 + lxvd2x 0, 0, 3 + lxvd2x 1, 3, 5 + vperm 2, 1, 2, 6 + vpkudum 9, 0, 5 + xxswapd 36, 0 + xxswapd 38, 1 + xxmrgld 34, 34, 35 + lvx 3, 0, 7 + addis 7, 2, .LCPI0_4@toc@ha + addi 7, 7, .LCPI0_4@toc@l + vadduwm 4, 9, 4 + lvx 11, 0, 7 + addis 7, 2, .LCPI0_6@toc@ha + addi 7, 7, .LCPI0_6@toc@l + vadduwm 7, 4, 6 + lvx 4, 0, 6 + addis 6, 2, .LCPI0_3@toc@ha + addi 6, 6, .LCPI0_3@toc@l + vperm 11, 0, 5, 11 + lvx 0, 0, 7 + li 7, 48 + xxlxor 40, 39, 34 + lvx 10, 0, 6 + addis 6, 2, .LCPI0_5@toc@ha + lxvd2x 1, 4, 7 + vcmpgtsb 2, 1, 4 + addi 6, 6, .LCPI0_5@toc@l + vperm 4, 8, 8, 3 + vspltisw 8, 10 + xxlandc 44, 36, 34 + vadduwm 4, 8, 8 + vadduwm 8, 12, 10 + xxlxor 37, 40, 38 + vrlw 6, 5, 4 + vadduwm 5, 7, 11 + vadduwm 7, 6, 5 + lvx 5, 0, 6 + li 6, 32 + lxvd2x 0, 4, 6 + addis 4, 2, .LCPI0_7@toc@ha + addis 6, 2, .LCPI0_9@toc@ha + xxlxor 42, 39, 44 + xxswapd 44, 1 + addi 4, 4, .LCPI0_7@toc@l + addi 6, 6, .LCPI0_9@toc@l + vcmpgtsb 5, 1, 5 + vperm 1, 10, 10, 0 + xxswapd 42, 0 + vpkudum 16, 12, 10 + xxlandc 47, 33, 37 + vsubuwm 1, 14, 13 + lvx 14, 0, 4 + addis 4, 2, .LCPI0_8@toc@ha + vadduwm 8, 15, 8 + xxswapd 45, 47 + addi 4, 4, .LCPI0_8@toc@l + vadduwm 7, 7, 16 + xxsldwi 48, 48, 48, 1 + xxlxor 38, 40, 38 + xxsldwi 40, 40, 40, 3 + xxsldwi 39, 39, 39, 1 + vperm 14, 10, 12, 14 + vrlw 6, 6, 1 + vadduwm 7, 6, 7 + xxlxor 45, 39, 45 + vperm 13, 13, 13, 3 + xxlandc 45, 45, 34 + vadduwm 8, 13, 8 + xxlxor 38, 40, 38 + vrlw 10, 6, 4 + vadduwm 6, 7, 14 + vadduwm 7, 10, 6 + xxlxor 38, 39, 45 + vperm 12, 6, 6, 0 + lvx 6, 0, 4 + addis 4, 2, .LCPI0_10@toc@ha + addi 4, 4, .LCPI0_10@toc@l + vperm 13, 11, 9, 6 + xxlandc 44, 44, 37 + vadduwm 15, 12, 8 + vadduwm 7, 7, 13 + xxsldwi 45, 45, 45, 3 + xxlxor 40, 47, 42 + xxsldwi 47, 47, 47, 1 + xxsldwi 39, 39, 39, 3 + vrlw 10, 8, 1 + xxswapd 40, 44 + vadduwm 17, 10, 7 + lvx 7, 0, 4 + addis 4, 2, .LCPI0_11@toc@ha + addi 4, 4, .LCPI0_11@toc@l + xxlxor 44, 49, 40 + lvx 8, 0, 6 + vperm 18, 9, 9, 7 + lvx 9, 0, 4 + addis 4, 2, .LCPI0_12@toc@ha + vperm 12, 12, 12, 3 + addi 4, 4, .LCPI0_12@toc@l + vperm 19, 14, 16, 8 + xxlandc 63, 44, 34 + vperm 12, 19, 18, 9 + vadduwm 15, 31, 15 + xxlxor 42, 47, 42 + vrlw 18, 10, 4 + vadduwm 10, 17, 12 + vadduwm 17, 18, 10 + xxlxor 42, 49, 63 + xxmrgld 63, 43, 46 + xxsldwi 49, 49, 49, 1 + vmrghw 14, 14, 11 + vperm 19, 10, 10, 0 + lvx 10, 0, 4 + addis 4, 2, .LCPI0_13@toc@ha + addi 4, 4, .LCPI0_13@toc@l + lvx 11, 0, 4 + addis 4, 2, .LCPI0_14@toc@ha + vperm 31, 16, 31, 10 + addi 4, 4, .LCPI0_14@toc@l + vperm 14, 14, 16, 11 + xxlandc 51, 51, 37 + vadduwm 15, 19, 15 + xxswapd 51, 51 + vadduwm 17, 17, 31 + xxlxor 50, 47, 50 + xxsldwi 47, 47, 47, 3 + vperm 30, 14, 31, 8 + vrlw 18, 18, 1 + vadduwm 17, 18, 17 + xxlxor 51, 49, 51 + vadduwm 17, 17, 14 + vperm 19, 19, 19, 3 + xxlandc 51, 51, 34 + vadduwm 15, 19, 15 + xxlxor 48, 47, 50 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + 
xxlxor 50, 49, 51 + vperm 19, 12, 13, 6 + vperm 18, 18, 18, 0 + vperm 13, 13, 13, 7 + vadduwm 17, 17, 19 + xxlandc 50, 50, 37 + xxsldwi 49, 49, 49, 3 + vperm 13, 30, 13, 9 + vadduwm 15, 18, 15 + xxswapd 50, 50 + xxmrgld 62, 44, 46 + vmrghw 12, 14, 12 + xxlxor 48, 47, 48 + xxsldwi 47, 47, 47, 1 + vrlw 16, 16, 1 + vperm 30, 31, 30, 10 + vperm 12, 12, 31, 11 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + vadduwm 17, 17, 13 + vperm 18, 18, 18, 3 + vperm 31, 12, 30, 8 + xxlandc 50, 50, 34 + vadduwm 15, 18, 15 + xxlxor 48, 47, 48 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + xxsldwi 49, 49, 49, 1 + vperm 18, 18, 18, 0 + vadduwm 17, 17, 30 + xxlandc 50, 50, 37 + vadduwm 15, 18, 15 + xxswapd 50, 50 + xxlxor 48, 47, 48 + xxsldwi 46, 47, 47, 3 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + vadduwm 17, 17, 12 + vperm 18, 18, 18, 3 + xxlandc 47, 50, 34 + xxsldwi 50, 51, 51, 3 + vadduwm 14, 15, 14 + vperm 19, 13, 18, 6 + xxlxor 48, 46, 48 + vperm 18, 18, 18, 7 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vadduwm 17, 17, 19 + vperm 15, 15, 15, 0 + xxsldwi 49, 49, 49, 3 + xxlandc 47, 47, 37 + vadduwm 14, 15, 14 + xxswapd 47, 47 + xxlxor 48, 46, 48 + xxsldwi 46, 46, 46, 1 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vperm 15, 15, 15, 3 + xxlandc 47, 47, 34 + vadduwm 29, 15, 14 + vperm 14, 31, 18, 9 + xxmrgld 50, 45, 44 + xxlxor 48, 61, 48 + vmrghw 12, 12, 13 + vrlw 16, 16, 4 + vperm 18, 30, 18, 10 + vadduwm 17, 17, 14 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + xxsldwi 49, 49, 49, 1 + vperm 15, 15, 15, 0 + vadduwm 17, 17, 18 + xxlandc 47, 47, 37 + vadduwm 31, 15, 29 + xxswapd 47, 47 + xxlxor 48, 63, 48 + xxsldwi 45, 63, 63, 3 + vperm 31, 12, 30, 11 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vperm 15, 15, 15, 3 + xxlandc 47, 47, 34 + vadduwm 13, 15, 13 + xxlxor 44, 45, 48 + vadduwm 16, 17, 31 + xxsldwi 49, 51, 51, 3 + vrlw 12, 12, 4 + vperm 19, 14, 17, 6 + vadduwm 16, 12, 16 + xxlxor 47, 48, 47 + vperm 15, 15, 15, 0 + xxlandc 47, 47, 37 + vadduwm 13, 15, 13 + xxswapd 47, 47 + xxlxor 44, 45, 44 + xxsldwi 45, 45, 45, 1 + vrlw 30, 12, 1 + vadduwm 12, 16, 19 + xxsldwi 44, 44, 44, 3 + vadduwm 16, 30, 12 + xxlxor 44, 48, 47 + vperm 15, 17, 17, 7 + vperm 12, 12, 12, 3 + vperm 17, 31, 18, 8 + xxlandc 61, 44, 34 + vperm 12, 17, 15, 9 + vadduwm 13, 29, 13 + xxlxor 47, 45, 62 + xxmrgld 62, 46, 63 + vmrghw 14, 31, 14 + vrlw 15, 15, 4 + vadduwm 16, 16, 12 + vperm 30, 18, 30, 10 + vperm 14, 14, 18, 11 + xxsldwi 50, 51, 51, 3 + vadduwm 16, 15, 16 + xxlxor 49, 48, 61 + xxsldwi 48, 48, 48, 1 + vperm 19, 12, 18, 6 + vperm 17, 17, 17, 0 + vadduwm 16, 16, 30 + xxmrgld 60, 44, 46 + vmrghw 12, 14, 12 + vperm 28, 30, 28, 10 + xxlandc 49, 49, 37 + vadduwm 13, 17, 13 + xxswapd 49, 49 + vperm 12, 12, 30, 11 + xxlxor 47, 45, 47 + xxsldwi 45, 45, 45, 3 + vrlw 15, 15, 1 + vperm 8, 12, 28, 8 + vadduwm 16, 15, 16 + xxlxor 49, 48, 49 + vadduwm 16, 16, 14 + vperm 17, 17, 17, 3 + xxlandc 49, 49, 34 + vadduwm 13, 17, 13 + xxlxor 47, 45, 47 + vrlw 15, 15, 4 + vadduwm 16, 15, 16 + xxlxor 49, 48, 49 + vperm 17, 17, 17, 0 + xxlandc 49, 49, 37 + vadduwm 31, 17, 13 + xxlxor 45, 63, 47 + vrlw 15, 13, 1 + vadduwm 13, 16, 19 + xxswapd 48, 49 + xxsldwi 51, 51, 51, 3 + xxsldwi 45, 45, 45, 3 + vadduwm 17, 15, 13 + xxlxor 45, 49, 48 + lvx 16, 0, 4 + vperm 29, 13, 13, 3 + vperm 13, 18, 18, 7 + xxsldwi 50, 63, 63, 1 + vperm 16, 14, 30, 16 + vperm 7, 19, 19, 7 + xxlandc 63, 61, 34 + vadduwm 18, 31, 18 + vperm 29, 16, 13, 9 + xxlxor 47, 50, 47 + vperm 6, 16, 19, 6 + vrlw 15, 15, 4 + vperm 
7, 8, 7, 9 + vadduwm 17, 17, 29 + xxmrgld 41, 61, 44 + vadduwm 17, 15, 17 + vperm 9, 28, 9, 10 + xxlxor 63, 49, 63 + xxsldwi 49, 49, 49, 1 + vperm 31, 31, 31, 0 + vadduwm 17, 17, 28 + xxlandc 63, 63, 37 + vadduwm 18, 31, 18 + xxswapd 63, 63 + xxlxor 47, 50, 47 + xxsldwi 46, 50, 50, 3 + vrlw 15, 15, 1 + vadduwm 17, 15, 17 + xxlxor 63, 49, 63 + vadduwm 17, 17, 12 + vperm 31, 31, 31, 3 + xxlandc 50, 63, 34 + vadduwm 14, 18, 14 + xxlxor 47, 46, 47 + vrlw 15, 15, 4 + vadduwm 17, 15, 17 + xxlxor 50, 49, 50 + vadduwm 6, 17, 6 + vperm 18, 18, 18, 0 + xxsldwi 38, 38, 38, 3 + xxlandc 50, 50, 37 + vadduwm 14, 18, 14 + xxswapd 48, 50 + xxlxor 47, 46, 47 + xxsldwi 46, 46, 46, 1 + vrlw 15, 15, 1 + vadduwm 6, 15, 6 + xxlxor 48, 38, 48 + vadduwm 6, 6, 7 + vperm 16, 16, 16, 3 + xxlandc 48, 48, 34 + vadduwm 14, 16, 14 + xxlxor 40, 46, 47 + vrlw 8, 8, 4 + vadduwm 6, 8, 6 + xxlxor 39, 38, 48 + xxsldwi 38, 38, 38, 1 + vperm 7, 7, 7, 0 + vadduwm 6, 6, 9 + xxlandc 39, 39, 37 + vadduwm 14, 7, 14 + xxswapd 39, 39 + xxlxor 40, 46, 40 + xxsldwi 41, 46, 46, 3 + vrlw 8, 8, 1 + vadduwm 6, 8, 6 + xxlxor 39, 38, 39 + vperm 3, 7, 7, 3 + vmrghw 7, 12, 13 + xxlandc 34, 35, 34 + vperm 7, 7, 28, 11 + vadduwm 3, 2, 9 + xxlxor 40, 35, 40 + vrlw 4, 8, 4 + vadduwm 6, 6, 7 + vadduwm 6, 4, 6 + xxlxor 34, 38, 34 + xxsldwi 0, 38, 38, 3 + vperm 2, 2, 2, 0 + xxlandc 34, 34, 37 + vadduwm 3, 2, 3 + xxswapd 34, 34 + xxlxor 36, 35, 36 + xxsldwi 1, 35, 35, 1 + vrlw 4, 4, 1 + xxlxor 0, 1, 0 + xxswapd 0, 0 + xxlxor 1, 36, 34 + stxvd2x 0, 0, 3 + xxswapd 1, 1 + stxvd2x 1, 3, 5 + li 3, -16 + lxvd2x 63, 1, 3 + li 3, -32 + lxvd2x 62, 1, 3 + li 3, -48 + lxvd2x 61, 1, 3 + li 3, -64 + lxvd2x 60, 1, 3 + blr + .long 0 + .quad 0 +.Lfunc_end0: + .size zfs_blake3_compress_in_place_sse41, .Lfunc_end0-.Lfunc_begin0 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI1_0: + .byte 31 + .byte 14 + .byte 13 + .byte 12 + .byte 30 + .byte 10 + .byte 9 + .byte 8 + .byte 29 + .byte 6 + .byte 5 + .byte 4 + .byte 28 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_1: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI1_2: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI1_3: + .long 1779033703 + .long 3144134277 + .long 1013904242 + .long 2773480762 +.LCPI1_4: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_5: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 +.LCPI1_6: + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 26 + .byte 25 + .byte 24 + .byte 27 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 18 + .byte 17 + .byte 16 + .byte 19 +.LCPI1_7: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_8: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_9: + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 3 + .byte 2 + 
.byte 1 + .byte 0 +.LCPI1_10: + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 31 + .byte 31 + .byte 31 +.LCPI1_11: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 +.LCPI1_12: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 27 + .byte 26 + .byte 25 + .byte 24 +.LCPI1_13: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 31 + .byte 30 + .byte 29 + .byte 28 +.LCPI1_14: + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .text + .globl zfs_blake3_compress_xof_sse41 + .p2align 2 + .type zfs_blake3_compress_xof_sse41,@function +zfs_blake3_compress_xof_sse41: +.Lfunc_begin1: + .cfi_startproc +.Lfunc_gep1: + addis 2, 12, .TOC.-.Lfunc_gep1@ha + addi 2, 2, .TOC.-.Lfunc_gep1@l +.Lfunc_lep1: + .localentry zfs_blake3_compress_xof_sse41, .Lfunc_lep1-.Lfunc_gep1 + li 9, -64 + mtvsrd 34, 5 + li 5, 16 + lfdx 0, 0, 4 + vspltisw 13, -16 + addis 11, 2, .LCPI1_9@toc@ha + stxvd2x 60, 1, 9 + li 9, -48 + mtvsrd 35, 7 + lfd 1, 8(4) + lfd 2, 16(4) + addis 7, 2, .LCPI1_0@toc@ha + stxvd2x 61, 1, 9 + li 9, -32 + mtvsrwz 36, 6 + rldicl 6, 6, 32, 32 + stxvd2x 62, 1, 9 + li 9, -16 + vmrghb 2, 3, 2 + stxvd2x 63, 1, 9 + mtvsrwz 35, 6 + addi 6, 7, .LCPI1_0@toc@l + addis 7, 2, .LCPI1_2@toc@ha + lfd 3, 24(4) + xxmrghd 37, 1, 0 + lvx 6, 0, 6 + xxlxor 33, 33, 33 + lxvd2x 0, 0, 3 + addis 6, 2, .LCPI1_1@toc@ha + addi 7, 7, .LCPI1_2@toc@l + vmrghw 3, 3, 4 + lxvd2x 1, 3, 5 + addi 6, 6, .LCPI1_1@toc@l + vspltisw 14, 9 + xxmrghd 32, 3, 2 + xxswapd 36, 0 + vperm 2, 1, 2, 6 + xxswapd 38, 1 + vpkudum 9, 0, 5 + xxmrgld 34, 34, 35 + lvx 3, 0, 7 + addis 7, 2, .LCPI1_4@toc@ha + addi 7, 7, .LCPI1_4@toc@l + vadduwm 4, 9, 4 + lvx 11, 0, 7 + addis 7, 2, .LCPI1_6@toc@ha + addi 7, 7, .LCPI1_6@toc@l + vadduwm 7, 4, 6 + lvx 4, 0, 6 + addis 6, 2, .LCPI1_3@toc@ha + addi 6, 6, .LCPI1_3@toc@l + vperm 11, 0, 5, 11 + lvx 0, 0, 7 + li 7, 32 + xxlxor 40, 39, 34 + lvx 10, 0, 6 + addis 6, 2, .LCPI1_5@toc@ha + lxvd2x 0, 4, 7 + vcmpgtsb 2, 1, 4 + addi 6, 6, .LCPI1_5@toc@l + vperm 4, 8, 8, 3 + vspltisw 8, 10 + xxlandc 44, 36, 34 + vadduwm 4, 8, 8 + vadduwm 8, 12, 10 + xxlxor 37, 40, 38 + vrlw 6, 5, 4 + vadduwm 5, 7, 11 + vadduwm 7, 6, 5 + lvx 5, 0, 6 + li 6, 48 + lxvd2x 1, 4, 6 + addis 4, 2, .LCPI1_7@toc@ha + xxlxor 42, 39, 44 + addi 4, 4, .LCPI1_7@toc@l + vcmpgtsb 5, 1, 5 + vperm 1, 10, 10, 0 + xxswapd 42, 0 + xxswapd 44, 1 + vpkudum 16, 12, 10 + xxlandc 47, 33, 37 + vsubuwm 1, 14, 13 + lvx 14, 0, 4 + addis 4, 2, .LCPI1_8@toc@ha + vadduwm 8, 15, 8 + xxswapd 45, 47 + addi 4, 4, .LCPI1_8@toc@l + xxlxor 38, 40, 38 + xxsldwi 40, 40, 40, 3 + vadduwm 7, 7, 16 + xxsldwi 48, 48, 48, 1 + vrlw 6, 6, 1 + xxsldwi 39, 39, 39, 1 + vperm 14, 10, 12, 14 + vadduwm 7, 6, 7 + xxlxor 45, 39, 45 + vperm 13, 13, 13, 3 + xxlandc 45, 45, 34 + vadduwm 8, 13, 8 + xxlxor 38, 40, 38 + vrlw 10, 6, 4 + vadduwm 6, 7, 14 + vadduwm 7, 10, 6 + xxlxor 38, 39, 45 + vperm 12, 6, 6, 0 + lvx 6, 0, 4 + addis 4, 2, .LCPI1_10@toc@ha + addi 4, 4, .LCPI1_10@toc@l + vperm 13, 11, 9, 6 + xxlandc 44, 44, 37 + vadduwm 15, 12, 8 + vadduwm 7, 7, 13 + xxsldwi 45, 45, 45, 3 + xxlxor 40, 47, 42 + 
xxsldwi 47, 47, 47, 1 + xxsldwi 39, 39, 39, 3 + vrlw 10, 8, 1 + xxswapd 40, 44 + vadduwm 17, 10, 7 + lvx 7, 0, 4 + addi 4, 11, .LCPI1_9@toc@l + xxlxor 44, 49, 40 + lvx 8, 0, 4 + addis 4, 2, .LCPI1_11@toc@ha + vperm 18, 9, 9, 7 + addi 4, 4, .LCPI1_11@toc@l + vperm 12, 12, 12, 3 + lvx 9, 0, 4 + addis 4, 2, .LCPI1_12@toc@ha + vperm 19, 14, 16, 8 + addi 4, 4, .LCPI1_12@toc@l + xxlandc 63, 44, 34 + vperm 12, 19, 18, 9 + vadduwm 15, 31, 15 + xxlxor 42, 47, 42 + vrlw 18, 10, 4 + vadduwm 10, 17, 12 + vadduwm 17, 18, 10 + xxlxor 42, 49, 63 + xxmrgld 63, 43, 46 + xxsldwi 49, 49, 49, 1 + vmrghw 14, 14, 11 + vperm 19, 10, 10, 0 + lvx 10, 0, 4 + addis 4, 2, .LCPI1_13@toc@ha + addi 4, 4, .LCPI1_13@toc@l + lvx 11, 0, 4 + addis 4, 2, .LCPI1_14@toc@ha + vperm 31, 16, 31, 10 + addi 4, 4, .LCPI1_14@toc@l + vperm 14, 14, 16, 11 + xxlandc 51, 51, 37 + vadduwm 15, 19, 15 + xxswapd 51, 51 + vadduwm 17, 17, 31 + xxlxor 50, 47, 50 + xxsldwi 47, 47, 47, 3 + vperm 30, 14, 31, 8 + vrlw 18, 18, 1 + vadduwm 17, 18, 17 + xxlxor 51, 49, 51 + vadduwm 17, 17, 14 + vperm 19, 19, 19, 3 + xxlandc 51, 51, 34 + vadduwm 15, 19, 15 + xxlxor 48, 47, 50 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 50, 49, 51 + vperm 19, 12, 13, 6 + vperm 18, 18, 18, 0 + vperm 13, 13, 13, 7 + vadduwm 17, 17, 19 + xxlandc 50, 50, 37 + xxsldwi 49, 49, 49, 3 + vperm 13, 30, 13, 9 + vadduwm 15, 18, 15 + xxswapd 50, 50 + xxmrgld 62, 44, 46 + vmrghw 12, 14, 12 + xxlxor 48, 47, 48 + xxsldwi 47, 47, 47, 1 + vrlw 16, 16, 1 + vperm 30, 31, 30, 10 + vperm 12, 12, 31, 11 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + vadduwm 17, 17, 13 + vperm 18, 18, 18, 3 + vperm 31, 12, 30, 8 + xxlandc 50, 50, 34 + vadduwm 15, 18, 15 + xxlxor 48, 47, 48 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + xxsldwi 49, 49, 49, 1 + vperm 18, 18, 18, 0 + vadduwm 17, 17, 30 + xxlandc 50, 50, 37 + vadduwm 15, 18, 15 + xxswapd 50, 50 + xxlxor 48, 47, 48 + xxsldwi 46, 47, 47, 3 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 50, 49, 50 + vadduwm 17, 17, 12 + vperm 18, 18, 18, 3 + xxlandc 47, 50, 34 + xxsldwi 50, 51, 51, 3 + vadduwm 14, 15, 14 + vperm 19, 13, 18, 6 + xxlxor 48, 46, 48 + vperm 18, 18, 18, 7 + vrlw 16, 16, 4 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vadduwm 17, 17, 19 + vperm 15, 15, 15, 0 + xxsldwi 49, 49, 49, 3 + xxlandc 47, 47, 37 + vadduwm 14, 15, 14 + xxswapd 47, 47 + xxlxor 48, 46, 48 + xxsldwi 46, 46, 46, 1 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vperm 15, 15, 15, 3 + xxlandc 47, 47, 34 + vadduwm 29, 15, 14 + vperm 14, 31, 18, 9 + xxmrgld 50, 45, 44 + xxlxor 48, 61, 48 + vmrghw 12, 12, 13 + vrlw 16, 16, 4 + vperm 18, 30, 18, 10 + vadduwm 17, 17, 14 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + xxsldwi 49, 49, 49, 1 + vperm 15, 15, 15, 0 + vadduwm 17, 17, 18 + xxlandc 47, 47, 37 + vadduwm 31, 15, 29 + xxswapd 47, 47 + xxlxor 48, 63, 48 + xxsldwi 45, 63, 63, 3 + vperm 31, 12, 30, 11 + vrlw 16, 16, 1 + vadduwm 17, 16, 17 + xxlxor 47, 49, 47 + vperm 15, 15, 15, 3 + xxlandc 47, 47, 34 + vadduwm 13, 15, 13 + xxlxor 44, 45, 48 + vadduwm 16, 17, 31 + xxsldwi 49, 51, 51, 3 + vrlw 12, 12, 4 + vperm 19, 14, 17, 6 + vadduwm 16, 12, 16 + xxlxor 47, 48, 47 + vperm 15, 15, 15, 0 + xxlandc 47, 47, 37 + vadduwm 13, 15, 13 + xxswapd 47, 47 + xxlxor 44, 45, 44 + xxsldwi 45, 45, 45, 1 + vrlw 30, 12, 1 + vadduwm 12, 16, 19 + xxsldwi 44, 44, 44, 3 + vadduwm 16, 30, 12 + xxlxor 44, 48, 47 + vperm 15, 17, 17, 7 + vperm 12, 12, 12, 3 + vperm 17, 31, 18, 8 + xxlandc 61, 44, 34 + vperm 12, 17, 15, 9 + vadduwm 13, 29, 13 + xxlxor 47, 45, 62 + xxmrgld 62, 46, 63 + 
vmrghw 14, 31, 14 + vrlw 15, 15, 4 + vadduwm 16, 16, 12 + vperm 30, 18, 30, 10 + vperm 14, 14, 18, 11 + xxsldwi 50, 51, 51, 3 + vadduwm 16, 15, 16 + xxlxor 49, 48, 61 + xxsldwi 48, 48, 48, 1 + vperm 19, 12, 18, 6 + vperm 17, 17, 17, 0 + vadduwm 16, 16, 30 + xxmrgld 60, 44, 46 + vmrghw 12, 14, 12 + vperm 28, 30, 28, 10 + xxlandc 49, 49, 37 + vadduwm 13, 17, 13 + xxswapd 49, 49 + vperm 12, 12, 30, 11 + xxlxor 47, 45, 47 + xxsldwi 45, 45, 45, 3 + vrlw 15, 15, 1 + vperm 8, 12, 28, 8 + vadduwm 16, 15, 16 + xxlxor 49, 48, 49 + vadduwm 16, 16, 14 + vperm 17, 17, 17, 3 + xxlandc 49, 49, 34 + vadduwm 13, 17, 13 + xxlxor 47, 45, 47 + vrlw 15, 15, 4 + vadduwm 16, 15, 16 + xxlxor 49, 48, 49 + vperm 17, 17, 17, 0 + xxlandc 49, 49, 37 + vadduwm 31, 17, 13 + xxlxor 45, 63, 47 + vrlw 15, 13, 1 + vadduwm 13, 16, 19 + xxswapd 48, 49 + xxsldwi 51, 51, 51, 3 + xxsldwi 45, 45, 45, 3 + vadduwm 17, 15, 13 + xxlxor 45, 49, 48 + lvx 16, 0, 4 + vperm 29, 13, 13, 3 + vperm 13, 18, 18, 7 + xxsldwi 50, 63, 63, 1 + vperm 16, 14, 30, 16 + vperm 7, 19, 19, 7 + xxlandc 63, 61, 34 + vadduwm 18, 31, 18 + vperm 29, 16, 13, 9 + xxlxor 47, 50, 47 + vperm 6, 16, 19, 6 + vrlw 15, 15, 4 + vperm 7, 8, 7, 9 + vadduwm 17, 17, 29 + xxmrgld 41, 61, 44 + vadduwm 17, 15, 17 + vperm 9, 28, 9, 10 + xxlxor 63, 49, 63 + xxsldwi 49, 49, 49, 1 + vperm 31, 31, 31, 0 + vadduwm 17, 17, 28 + xxlandc 63, 63, 37 + vadduwm 18, 31, 18 + xxswapd 63, 63 + xxlxor 47, 50, 47 + xxsldwi 46, 50, 50, 3 + vrlw 15, 15, 1 + vadduwm 17, 15, 17 + xxlxor 63, 49, 63 + vadduwm 17, 17, 12 + vperm 31, 31, 31, 3 + xxlandc 50, 63, 34 + vadduwm 14, 18, 14 + xxlxor 47, 46, 47 + vrlw 15, 15, 4 + vadduwm 17, 15, 17 + xxlxor 50, 49, 50 + vadduwm 6, 17, 6 + vperm 18, 18, 18, 0 + xxsldwi 38, 38, 38, 3 + xxlandc 50, 50, 37 + vadduwm 14, 18, 14 + xxswapd 48, 50 + xxlxor 47, 46, 47 + xxsldwi 46, 46, 46, 1 + vrlw 15, 15, 1 + vadduwm 6, 15, 6 + xxlxor 48, 38, 48 + vadduwm 6, 6, 7 + vperm 16, 16, 16, 3 + xxlandc 48, 48, 34 + vadduwm 14, 16, 14 + xxlxor 40, 46, 47 + vrlw 8, 8, 4 + vadduwm 6, 8, 6 + xxlxor 39, 38, 48 + xxsldwi 38, 38, 38, 1 + vperm 7, 7, 7, 0 + vadduwm 6, 6, 9 + xxlandc 39, 39, 37 + vadduwm 14, 7, 14 + xxswapd 39, 39 + xxlxor 40, 46, 40 + xxsldwi 41, 46, 46, 3 + vrlw 8, 8, 1 + vadduwm 6, 8, 6 + xxlxor 39, 38, 39 + vperm 3, 7, 7, 3 + vmrghw 7, 12, 13 + xxlandc 34, 35, 34 + vperm 7, 7, 28, 11 + vadduwm 3, 2, 9 + xxlxor 40, 35, 40 + vrlw 4, 8, 4 + vadduwm 6, 6, 7 + vadduwm 6, 4, 6 + xxlxor 34, 38, 34 + xxsldwi 0, 38, 38, 3 + vperm 2, 2, 2, 0 + xxlandc 34, 34, 37 + vadduwm 3, 2, 3 + xxswapd 34, 34 + xxlxor 36, 35, 36 + xxsldwi 1, 35, 35, 1 + vrlw 4, 4, 1 + xxlxor 0, 1, 0 + xxswapd 0, 0 + xxlxor 2, 36, 34 + stxvd2x 0, 0, 8 + xxswapd 2, 2 + stxvd2x 2, 8, 5 + lfdx 0, 0, 3 + lfd 2, 8(3) + xxmrghd 35, 2, 0 + xxlxor 0, 1, 35 + xxswapd 0, 0 + stxvd2x 0, 8, 7 + lfd 0, 16(3) + lfd 1, 24(3) + li 3, -16 + xxmrghd 35, 1, 0 + xxlxor 0, 34, 35 + xxswapd 0, 0 + stxvd2x 0, 8, 6 + lxvd2x 63, 1, 3 + li 3, -32 + lxvd2x 62, 1, 3 + li 3, -48 + lxvd2x 61, 1, 3 + li 3, -64 + lxvd2x 60, 1, 3 + blr + .long 0 + .quad 0 +.Lfunc_end1: + .size zfs_blake3_compress_xof_sse41, .Lfunc_end1-.Lfunc_begin1 + .cfi_endproc + + .globl zfs_blake3_hash_many_sse41 + .p2align 2 + .type zfs_blake3_hash_many_sse41,@function +zfs_blake3_hash_many_sse41: +.Lfunc_begin2: + .cfi_startproc +.Lfunc_gep2: + addis 2, 12, .TOC.-.Lfunc_gep2@ha + addi 2, 2, .TOC.-.Lfunc_gep2@l +.Lfunc_lep2: + .localentry zfs_blake3_hash_many_sse41, .Lfunc_lep2-.Lfunc_gep2 + mfocrf 12, 32 + mflr 0 + std 0, 16(1) + stw 12, 8(1) + stdu 1, -256(1) 
+ .cfi_def_cfa_offset 256 + .cfi_offset lr, 16 + .cfi_offset r17, -120 + .cfi_offset r18, -112 + .cfi_offset r19, -104 + .cfi_offset r20, -96 + .cfi_offset r21, -88 + .cfi_offset r22, -80 + .cfi_offset r23, -72 + .cfi_offset r24, -64 + .cfi_offset r25, -56 + .cfi_offset r26, -48 + .cfi_offset r27, -40 + .cfi_offset r28, -32 + .cfi_offset r29, -24 + .cfi_offset r30, -16 + .cfi_offset cr2, 8 + std 26, 208(1) + mr 26, 4 + cmpldi 1, 4, 4 + andi. 4, 8, 1 + std 18, 144(1) + std 19, 152(1) + crmove 8, 1 + ld 19, 360(1) + lwz 18, 352(1) + std 24, 192(1) + std 25, 200(1) + std 27, 216(1) + std 28, 224(1) + mr 24, 10 + mr 28, 6 + mr 27, 5 + mr 25, 3 + std 29, 232(1) + std 30, 240(1) + mr 30, 9 + mr 29, 7 + std 17, 136(1) + std 20, 160(1) + std 21, 168(1) + std 22, 176(1) + std 23, 184(1) + blt 1, .LBB2_3 + li 3, 0 + li 4, 1 + clrldi 23, 30, 32 + isel 22, 4, 3, 8 + clrldi 21, 24, 32 + clrldi 20, 18, 32 +.LBB2_2: + mr 3, 25 + mr 4, 27 + mr 5, 28 + mr 6, 29 + mr 7, 22 + mr 8, 23 + mr 9, 21 + mr 10, 20 + std 19, 32(1) + bl blake3_hash4_sse41 + addi 26, 26, -4 + addi 3, 29, 4 + addi 25, 25, 32 + addi 19, 19, 128 + cmpldi 26, 3 + isel 29, 3, 29, 8 + bgt 0, .LBB2_2 +.LBB2_3: + cmpldi 26, 0 + beq 0, .LBB2_11 + li 3, 0 + li 4, 1 + or 21, 24, 30 + li 20, 16 + addi 24, 1, 96 + isel 22, 4, 3, 8 +.LBB2_5: + lxvd2x 0, 28, 20 + ld 23, 0(25) + mr 17, 27 + mr 3, 21 + stxvd2x 0, 24, 20 + lxvd2x 0, 0, 28 + stxvd2x 0, 0, 24 +.LBB2_6: + cmpldi 17, 1 + beq 0, .LBB2_8 + cmpldi 17, 0 + bne 0, .LBB2_9 + b .LBB2_10 +.LBB2_8: + or 3, 3, 18 +.LBB2_9: + clrldi 7, 3, 56 + mr 3, 24 + mr 4, 23 + li 5, 64 + mr 6, 29 + bl zfs_blake3_compress_in_place_sse41 + addi 23, 23, 64 + addi 17, 17, -1 + mr 3, 30 + b .LBB2_6 +.LBB2_10: + lxvd2x 0, 24, 20 + addi 26, 26, -1 + add 29, 29, 22 + addi 25, 25, 8 + cmpldi 26, 0 + stxvd2x 0, 19, 20 + lxvd2x 0, 0, 24 + stxvd2x 0, 0, 19 + addi 19, 19, 32 + bne 0, .LBB2_5 +.LBB2_11: + ld 30, 240(1) + ld 29, 232(1) + ld 28, 224(1) + ld 27, 216(1) + ld 26, 208(1) + ld 25, 200(1) + ld 24, 192(1) + ld 23, 184(1) + ld 22, 176(1) + ld 21, 168(1) + ld 20, 160(1) + ld 19, 152(1) + ld 18, 144(1) + ld 17, 136(1) + addi 1, 1, 256 + ld 0, 16(1) + lwz 12, 8(1) + mtocrf 32, 12 + mtlr 0 + blr + .long 0 + .quad 0 +.Lfunc_end2: + .size zfs_blake3_hash_many_sse41, .Lfunc_end2-.Lfunc_begin2 + .cfi_endproc + + .section .rodata.cst16,"aM",@progbits,16 + .p2align 4 +.LCPI3_0: + .quad 4294967296 + .quad 12884901890 +.LCPI3_1: + .byte 2 + .byte 3 + .byte 0 + .byte 1 + .byte 6 + .byte 7 + .byte 4 + .byte 5 + .byte 10 + .byte 11 + .byte 8 + .byte 9 + .byte 14 + .byte 15 + .byte 12 + .byte 13 +.LCPI3_2: + .byte 1 + .byte 2 + .byte 3 + .byte 0 + .byte 5 + .byte 6 + .byte 7 + .byte 4 + .byte 9 + .byte 10 + .byte 11 + .byte 8 + .byte 13 + .byte 14 + .byte 15 + .byte 12 +.LCPI3_3: + .byte 29 + .byte 28 + .byte 31 + .byte 30 + .byte 25 + .byte 24 + .byte 27 + .byte 26 + .byte 21 + .byte 20 + .byte 23 + .byte 22 + .byte 17 + .byte 16 + .byte 19 + .byte 18 +.LCPI3_4: + .long 1779033703 + .long 1779033703 + .long 1779033703 + .long 1779033703 +.LCPI3_5: + .long 3144134277 + .long 3144134277 + .long 3144134277 + .long 3144134277 +.LCPI3_6: + .long 1013904242 + .long 1013904242 + .long 1013904242 + .long 1013904242 +.LCPI3_7: + .long 2773480762 + .long 2773480762 + .long 2773480762 + .long 2773480762 +.LCPI3_8: + .byte 30 + .byte 29 + .byte 28 + .byte 31 + .byte 26 + .byte 25 + .byte 24 + .byte 27 + .byte 22 + .byte 21 + .byte 20 + .byte 23 + .byte 18 + .byte 17 + .byte 16 + .byte 19 + .text + .p2align 2 + .type blake3_hash4_sse41,@function 
+blake3_hash4_sse41: +.Lfunc_begin3: + .cfi_startproc +.Lfunc_gep3: + addis 2, 12, .TOC.-.Lfunc_gep3@ha + addi 2, 2, .TOC.-.Lfunc_gep3@l +.Lfunc_lep3: + .localentry blake3_hash4_sse41, .Lfunc_lep3-.Lfunc_gep3 + stdu 1, -416(1) + .cfi_def_cfa_offset 416 + .cfi_offset r22, -176 + .cfi_offset r23, -168 + .cfi_offset r24, -160 + .cfi_offset r25, -152 + .cfi_offset r26, -144 + .cfi_offset r27, -136 + .cfi_offset r28, -128 + .cfi_offset r29, -120 + .cfi_offset r30, -112 + .cfi_offset f20, -96 + .cfi_offset f21, -88 + .cfi_offset f22, -80 + .cfi_offset f23, -72 + .cfi_offset f24, -64 + .cfi_offset f25, -56 + .cfi_offset f26, -48 + .cfi_offset f27, -40 + .cfi_offset f28, -32 + .cfi_offset f29, -24 + .cfi_offset f30, -16 + .cfi_offset f31, -8 + .cfi_offset v20, -368 + .cfi_offset v21, -352 + .cfi_offset v22, -336 + .cfi_offset v23, -320 + .cfi_offset v24, -304 + .cfi_offset v25, -288 + .cfi_offset v26, -272 + .cfi_offset v27, -256 + .cfi_offset v28, -240 + .cfi_offset v29, -224 + .cfi_offset v30, -208 + .cfi_offset v31, -192 + li 11, 48 + li 0, 8 + std 30, 304(1) + li 30, 12 + li 12, 4 + lfiwzx 0, 0, 5 + stxvd2x 52, 1, 11 + li 11, 64 + lfiwzx 2, 5, 0 + li 0, 20 + lfiwzx 3, 5, 30 + stxvd2x 53, 1, 11 + li 11, 80 + li 30, 24 + lfiwzx 4, 5, 0 + li 0, 28 + stxvd2x 54, 1, 11 + li 11, 96 + lfiwzx 1, 5, 12 + lfiwzx 6, 5, 30 + xxspltw 47, 0, 1 + cmpldi 4, 0 + std 22, 240(1) + stxvd2x 55, 1, 11 + li 11, 112 + lfiwzx 7, 5, 0 + xxspltw 40, 2, 1 + std 23, 248(1) + xxspltw 39, 3, 1 + std 24, 256(1) + std 25, 264(1) + xxspltw 51, 1, 1 + xxspltw 43, 6, 1 + std 26, 272(1) + xxspltw 41, 7, 1 + std 27, 280(1) + std 28, 288(1) + std 29, 296(1) + stxvd2x 56, 1, 11 + li 11, 128 + stfd 20, 320(1) + stxvd2x 57, 1, 11 + li 11, 144 + stfd 21, 328(1) + stxvd2x 58, 1, 11 + li 11, 160 + stfd 22, 336(1) + stxvd2x 59, 1, 11 + li 11, 176 + stfd 23, 344(1) + stxvd2x 60, 1, 11 + li 11, 192 + stfd 24, 352(1) + stxvd2x 61, 1, 11 + li 11, 208 + stfd 25, 360(1) + stxvd2x 62, 1, 11 + li 11, 224 + stfd 26, 368(1) + stxvd2x 63, 1, 11 + li 11, 16 + xxspltw 63, 4, 1 + lfiwzx 5, 5, 11 + ld 5, 448(1) + stfd 27, 376(1) + stfd 28, 384(1) + stfd 29, 392(1) + stfd 30, 400(1) + stfd 31, 408(1) + xxspltw 50, 5, 1 + beq 0, .LBB3_5 + addis 30, 2, .LCPI3_0@toc@ha + neg 7, 7 + xxleqv 34, 34, 34 + addis 28, 2, .LCPI3_5@toc@ha + addis 27, 2, .LCPI3_6@toc@ha + addis 26, 2, .LCPI3_7@toc@ha + addis 29, 2, .LCPI3_4@toc@ha + addis 25, 2, .LCPI3_8@toc@ha + addi 0, 30, .LCPI3_0@toc@l + mtfprwz 2, 7 + addis 7, 2, .LCPI3_1@toc@ha + addis 30, 2, .LCPI3_3@toc@ha + addi 24, 29, .LCPI3_4@toc@l + ld 29, 24(3) + lxvd2x 1, 0, 0 + mtfprwz 0, 6 + rldicl 6, 6, 32, 32 + addi 0, 30, .LCPI3_3@toc@l + ld 30, 16(3) + xxspltw 2, 2, 1 + vslw 2, 2, 2 + xxspltw 37, 0, 1 + mtfprwz 0, 6 + addi 6, 7, .LCPI3_1@toc@l + addis 7, 2, .LCPI3_2@toc@ha + xxswapd 35, 1 + xxlxor 36, 36, 36 + xxspltw 33, 0, 1 + xxland 35, 2, 35 + vadduwm 0, 3, 5 + lvx 5, 0, 6 + addi 6, 7, .LCPI3_2@toc@l + ld 7, 8(3) + xxlor 35, 35, 34 + xxlxor 34, 32, 34 + xxlor 9, 32, 32 + lvx 0, 0, 6 + ld 6, 0(3) + addi 3, 3, -8 + vcmpgtsw 2, 3, 2 + lvx 3, 0, 0 + addi 0, 28, .LCPI3_5@toc@l + addi 28, 27, .LCPI3_6@toc@l + addi 27, 26, .LCPI3_7@toc@l + addi 26, 25, .LCPI3_8@toc@l + or 25, 9, 8 + li 9, 0 + vcmpgtsb 5, 4, 5 + vcmpgtsb 0, 4, 0 + xxlor 11, 35, 35 + lvx 3, 0, 24 + xxlor 12, 35, 35 + vsubuwm 2, 1, 2 + xxlnor 10, 37, 37 + xxlor 13, 34, 34 + lvx 2, 0, 0 + li 0, 32 + xxlnor 31, 32, 32 + xxlor 30, 34, 34 + lvx 2, 0, 28 + li 28, 48 + xxlor 29, 34, 34 + lvx 2, 0, 27 + li 27, 0 + xxlor 28, 34, 34 + lvx 2, 0, 26 + xxlor 27, 
34, 34 +.LBB3_2: + mr 26, 27 + addi 27, 27, 1 + xxlor 23, 39, 39 + cmpld 27, 4 + sldi 26, 26, 6 + xxlor 24, 40, 40 + iseleq 24, 10, 9 + add 23, 6, 26 + add 22, 30, 26 + lxvd2x 0, 6, 26 + lxvd2x 1, 7, 26 + or 25, 24, 25 + add 24, 7, 26 + lxvd2x 2, 30, 26 + lxvd2x 3, 29, 26 + xxlor 26, 47, 47 + lxvd2x 4, 23, 11 + lxvd2x 6, 24, 11 + clrlwi 25, 25, 24 + xxlor 25, 51, 51 + lxvd2x 7, 22, 11 + lxvd2x 8, 23, 0 + mtfprd 5, 25 + add 25, 29, 26 + xxswapd 34, 0 + lxvd2x 0, 25, 11 + xxswapd 38, 1 + xxswapd 32, 2 + lxvd2x 1, 24, 0 + lxvd2x 2, 22, 0 + xxswapd 40, 3 + xxswapd 39, 4 + lxvd2x 3, 25, 0 + lxvd2x 4, 23, 28 + xxswapd 60, 6 + xxswapd 47, 7 + lxvd2x 6, 24, 28 + xxswapd 57, 8 + lxvd2x 7, 22, 28 + lxvd2x 8, 25, 28 + xxswapd 58, 0 + mr 25, 3 + xxswapd 53, 1 + xxswapd 56, 2 + xxswapd 52, 3 + xxswapd 55, 4 + xxswapd 54, 6 + xxswapd 0, 5 + xxswapd 42, 7 + xxswapd 48, 8 + mtctr 12 +.LBB3_3: + ldu 24, 8(25) + add 24, 24, 26 + addi 24, 24, 256 + dcbt 0, 24 + bdnz .LBB3_3 + vmrgew 4, 28, 7 + vspltisw 14, 9 + mr 25, 8 + vmrgew 27, 6, 2 + vspltisw 17, 4 + vmrglw 12, 6, 2 + vspltisw 19, 10 + vmrghw 30, 6, 2 + xxspltw 0, 0, 3 + vmrglw 2, 8, 0 + vmrghw 13, 8, 0 + xxlor 7, 36, 36 + vmrgew 4, 21, 25 + vmrglw 29, 28, 7 + vmrghw 1, 28, 7 + vmrglw 28, 26, 15 + xxmrgld 37, 34, 44 + vmrgew 7, 26, 15 + vmrghw 15, 26, 15 + xxlor 21, 36, 36 + vmrglw 4, 21, 25 + vmrghw 21, 21, 25 + vmrglw 25, 20, 24 + xxmrgld 34, 60, 61 + vmrghw 26, 20, 24 + xxlor 38, 26, 26 + vmrgew 3, 8, 0 + xxlor 5, 36, 36 + vmrgew 4, 20, 24 + vspltisw 24, -16 + vmrglw 20, 22, 23 + xxmrgld 57, 57, 5 + vmrglw 8, 16, 10 + vmrghw 0, 16, 10 + vadduwm 12, 19, 19 + xxlor 8, 37, 37 + xxlor 20, 36, 36 + vmrgew 4, 22, 23 + vmrghw 23, 22, 23 + xxmrgld 40, 40, 52 + vmrgew 22, 16, 10 + vsubuwm 10, 14, 24 + vslw 14, 17, 17 + vadduwm 17, 5, 6 + xxmrgld 37, 47, 33 + xxlor 22, 36, 36 + xxmrgld 36, 45, 62 + xxlor 38, 25, 25 + xxlor 2, 34, 34 + vadduwm 19, 4, 6 + xxmrgld 38, 39, 7 + xxlor 3, 36, 36 + xxmrghd 39, 47, 33 + xxlor 36, 24, 24 + xxmrgld 33, 58, 53 + vadduwm 17, 17, 18 + vadduwm 29, 2, 4 + xxmrgld 36, 35, 59 + xxlor 34, 23, 23 + xxmrghd 35, 45, 62 + xxlor 1, 9, 9 + vadduwm 28, 5, 2 + xxlor 1, 13, 13 + vadduwm 19, 19, 31 + vadduwm 24, 29, 11 + vadduwm 28, 28, 9 + xxlxor 61, 49, 9 + xxlor 1, 41, 41 + xxlor 41, 11, 11 + xxlxor 34, 51, 13 + vperm 29, 29, 29, 9 + xxlxor 46, 56, 46 + vperm 2, 2, 2, 9 + xxlxor 59, 60, 0 + vperm 14, 14, 14, 9 + vperm 30, 27, 27, 9 + vadduwm 19, 19, 3 + xxlor 4, 35, 35 + xxland 61, 61, 10 + xxlor 35, 12, 12 + xxland 34, 34, 10 + vadduwm 27, 29, 3 + xxlor 35, 30, 30 + vadduwm 17, 17, 4 + xxlor 26, 36, 36 + xxland 46, 46, 10 + vadduwm 3, 2, 3 + xxlor 36, 29, 29 + xxland 62, 62, 10 + xxlxor 45, 59, 50 + xxlxor 50, 35, 63 + vadduwm 31, 14, 4 + xxlor 36, 28, 28 + xxlor 6, 37, 37 + vadduwm 16, 30, 4 + xxlxor 43, 63, 43 + xxlxor 37, 48, 1 + vrlw 4, 13, 12 + vrlw 18, 18, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vadduwm 15, 24, 6 + vadduwm 28, 28, 7 + vadduwm 17, 4, 17 + vadduwm 19, 18, 19 + vadduwm 15, 11, 15 + vadduwm 28, 5, 28 + xxlor 25, 38, 38 + xxlxor 61, 49, 61 + xxlxor 34, 51, 34 + xxlxor 46, 47, 46 + xxlxor 62, 60, 62 + xxlor 38, 27, 27 + vadduwm 19, 19, 1 + vperm 29, 29, 29, 6 + vperm 2, 2, 2, 6 + vperm 24, 14, 14, 6 + vperm 30, 30, 30, 6 + xxlor 5, 33, 33 + vadduwm 17, 17, 25 + xxland 61, 61, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + xxland 62, 62, 31 + vadduwm 27, 29, 27 + vadduwm 3, 2, 3 + vadduwm 31, 24, 31 + vadduwm 16, 30, 16 + xxlxor 36, 59, 36 + xxlxor 50, 35, 50 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 18, 10 + 
xxmrgld 50, 32, 55 + vrlw 11, 11, 10 + xxmrghd 55, 32, 55 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 15, 15, 8 + vadduwm 28, 28, 18 + vadduwm 17, 1, 17 + vadduwm 19, 11, 19 + vadduwm 15, 5, 15 + vadduwm 28, 4, 28 + xxlor 7, 57, 57 + xxlxor 62, 49, 62 + xxlxor 61, 51, 61 + xxlxor 57, 47, 34 + xxlxor 34, 60, 56 + vperm 24, 30, 30, 9 + xxmrgld 62, 20, 21 + vperm 29, 29, 29, 9 + vperm 25, 25, 25, 9 + vperm 2, 2, 2, 9 + vmr 14, 8 + xxmrghd 40, 58, 53 + xxmrgld 58, 54, 22 + vadduwm 17, 17, 30 + xxland 56, 56, 10 + vadduwm 21, 19, 8 + xxland 61, 61, 10 + xxland 51, 57, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + vadduwm 0, 15, 26 + vadduwm 15, 28, 23 + vadduwm 17, 1, 17 + vadduwm 28, 11, 21 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vmr 13, 8 + xxlor 53, 3, 3 + xxland 56, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 59, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + xxlor 52, 4, 4 + xxlor 40, 2, 2 + vadduwm 17, 17, 21 + vadduwm 28, 28, 20 + vadduwm 0, 0, 7 + vadduwm 15, 15, 8 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 24, 24, 24, 9 + vmr 25, 26 + xxlor 3, 39, 39 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 56, 56, 10 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + xxlor 54, 6, 6 + xxlor 58, 5, 5 + xxlor 39, 8, 8 + vadduwm 17, 17, 22 + vadduwm 28, 28, 26 + vadduwm 0, 0, 7 + vadduwm 15, 15, 25 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 24, 24, 24, 6 + xxlor 39, 26, 26 + vadduwm 28, 28, 14 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 17, 17, 7 + vadduwm 0, 0, 30 + vadduwm 15, 15, 23 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 9 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + xxlor 24, 55, 55 + vadduwm 17, 17, 13 + xxland 56, 56, 10 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + vmr 23, 13 + xxlor 45, 25, 25 + xxlor 39, 
7, 7 + vadduwm 28, 28, 13 + vadduwm 0, 0, 18 + vadduwm 15, 15, 7 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + xxlor 2, 46, 46 + xxlor 46, 3, 3 + xxland 56, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 59, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vadduwm 17, 17, 20 + vadduwm 28, 28, 26 + vadduwm 0, 0, 25 + vadduwm 15, 15, 14 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 24, 24, 24, 9 + xxlor 52, 2, 2 + vadduwm 17, 17, 8 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 56, 56, 10 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vadduwm 28, 28, 20 + vadduwm 0, 0, 21 + vadduwm 15, 15, 18 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 24, 24, 24, 6 + vadduwm 17, 17, 22 + vadduwm 28, 28, 30 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 0, 0, 23 + vadduwm 15, 15, 7 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 9 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + xxlor 5, 4, 4 + xxlor 4, 58, 58 + xxland 56, 56, 10 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + xxlor 39, 8, 8 + xxlor 54, 24, 24 + xxlor 58, 26, 26 + vadduwm 17, 17, 13 + vadduwm 28, 28, 7 + vadduwm 0, 0, 22 + vadduwm 15, 15, 26 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + xxlor 3, 53, 53 + xxlor 53, 4, 4 + xxland 56, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 59, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vadduwm 17, 17, 21 + vadduwm 28, 28, 20 + vadduwm 0, 0, 18 + vadduwm 15, 15, 25 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 
47, 56 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 24, 24, 24, 9 + xxlor 2, 55, 55 + vmr 23, 18 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 56, 56, 10 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + xxlor 50, 5, 5 + vadduwm 17, 17, 14 + vadduwm 28, 28, 30 + vadduwm 0, 0, 18 + vadduwm 15, 15, 22 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 24, 24, 24, 6 + xxlor 25, 40, 40 + vmr 8, 13 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + xxlor 45, 25, 25 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 17, 17, 13 + xxlor 45, 2, 2 + vadduwm 0, 0, 8 + vadduwm 28, 28, 13 + vadduwm 15, 15, 26 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 9 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + xxlor 4, 57, 57 + xxlor 26, 46, 46 + xxland 56, 56, 10 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + xxlor 8, 62, 62 + xxlor 57, 3, 3 + xxlor 46, 7, 7 + xxlor 62, 6, 6 + vadduwm 17, 17, 7 + vadduwm 28, 28, 25 + vadduwm 0, 0, 14 + vadduwm 15, 15, 30 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vadduwm 17, 17, 20 + xxlor 3, 52, 52 + xxland 56, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 59, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + xxlor 52, 8, 8 + vadduwm 0, 0, 22 + vadduwm 28, 28, 20 + vadduwm 15, 15, 23 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 24, 24, 24, 9 + xxlor 6, 55, 55 + xxlor 55, 4, 4 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 56, 56, 10 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vadduwm 17, 17, 23 + vadduwm 28, 28, 13 + vadduwm 0, 0, 21 + vadduwm 15, 15, 14 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 24, 24, 24, 6 + xxlor 
4, 53, 53 + xxlor 53, 26, 26 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 17, 17, 21 + vadduwm 28, 28, 8 + vadduwm 0, 0, 7 + vadduwm 15, 15, 30 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 9 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + xxlor 5, 25, 25 + xxlor 2, 58, 58 + xxland 56, 56, 10 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + vmr 22, 26 + vadduwm 0, 0, 26 + xxlor 58, 5, 5 + vadduwm 17, 17, 25 + vadduwm 28, 28, 18 + vadduwm 15, 15, 26 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + xxlor 7, 24, 24 + xxlor 8, 57, 57 + xxland 56, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 59, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + xxlor 57, 7, 7 + vadduwm 17, 17, 20 + vadduwm 28, 28, 13 + vadduwm 0, 0, 14 + vadduwm 15, 15, 25 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 24, 24, 24, 9 + xxlor 5, 52, 52 + xxlor 23, 45, 45 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 56, 56, 10 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + xxlor 52, 6, 6 + vadduwm 28, 28, 8 + vmr 13, 8 + xxlor 40, 3, 3 + vadduwm 17, 17, 20 + vadduwm 0, 0, 8 + vadduwm 15, 15, 22 + vadduwm 17, 4, 17 + vadduwm 28, 1, 28 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 49, 61 + xxlxor 51, 60, 51 + xxlxor 34, 32, 34 + xxlxor 56, 47, 56 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 24, 24, 24, 6 + xxlor 25, 39, 39 + vmr 7, 30 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 56, 56, 31 + vadduwm 27, 29, 27 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 24, 16 + xxlxor 36, 59, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vmr 30, 18 + xxlor 24, 46, 46 + xxlor 46, 25, 25 + xxlor 50, 8, 8 + vadduwm 17, 17, 23 + vadduwm 28, 28, 14 + vadduwm 0, 0, 18 + vadduwm 15, 15, 26 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 9 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + xxlor 6, 58, 58 + xxlor 58, 4, 4 + xxland 56, 56, 10 + xxland 
61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + vadduwm 31, 24, 31 + vadduwm 16, 29, 16 + vadduwm 27, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 37, 59, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + vadduwm 17, 17, 30 + vadduwm 28, 28, 26 + vadduwm 0, 0, 7 + vadduwm 15, 15, 21 + vadduwm 17, 1, 17 + vadduwm 28, 11, 28 + vadduwm 0, 5, 0 + vadduwm 15, 4, 15 + xxlxor 56, 49, 56 + xxlxor 61, 60, 61 + xxlxor 51, 32, 51 + xxlxor 34, 47, 34 + vperm 24, 24, 24, 6 + vperm 29, 29, 29, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + xxlor 40, 23, 23 + vadduwm 13, 28, 13 + vadduwm 8, 17, 8 + xxland 49, 56, 31 + xxland 61, 61, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + vadduwm 31, 17, 31 + vadduwm 16, 29, 16 + vadduwm 28, 19, 27 + vadduwm 3, 2, 3 + xxlxor 33, 63, 33 + xxlxor 43, 48, 43 + xxlxor 36, 35, 36 + xxlxor 37, 60, 37 + vrlw 4, 4, 10 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + xxlor 2, 55, 55 + vmr 23, 30 + xxlor 62, 24, 24 + vadduwm 0, 0, 22 + vadduwm 15, 15, 30 + vadduwm 8, 4, 8 + vadduwm 13, 1, 13 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 61, 40, 61 + xxlxor 51, 45, 51 + xxlxor 34, 32, 34 + xxlxor 49, 47, 49 + vperm 29, 29, 29, 9 + vperm 19, 19, 19, 9 + vperm 2, 2, 2, 9 + vperm 17, 17, 17, 9 + vadduwm 13, 13, 14 + xxlor 46, 5, 5 + xxland 61, 61, 10 + xxland 51, 51, 10 + xxland 34, 34, 10 + xxland 49, 49, 10 + vadduwm 28, 29, 28 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 17, 16 + xxlxor 36, 60, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 4, 4, 12 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vadduwm 8, 8, 25 + vadduwm 0, 0, 14 + vadduwm 15, 15, 7 + vadduwm 8, 4, 8 + vadduwm 13, 1, 13 + vadduwm 0, 11, 0 + vadduwm 15, 5, 15 + xxlxor 62, 40, 61 + xxlxor 51, 45, 51 + xxlxor 34, 32, 34 + xxlxor 49, 47, 49 + vperm 30, 30, 30, 6 + vperm 19, 19, 19, 6 + vperm 2, 2, 2, 6 + vperm 17, 17, 17, 6 + vadduwm 29, 8, 20 + vadduwm 8, 13, 18 + xxland 45, 62, 31 + xxland 51, 51, 31 + xxland 34, 34, 31 + xxland 49, 49, 31 + vadduwm 30, 13, 28 + vadduwm 3, 19, 3 + vadduwm 31, 2, 31 + vadduwm 16, 17, 16 + xxlxor 36, 62, 36 + xxlxor 33, 35, 33 + xxlxor 43, 63, 43 + xxlxor 37, 48, 37 + vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + vrlw 4, 4, 10 + vadduwm 0, 0, 23 + vadduwm 7, 15, 21 + vadduwm 29, 1, 29 + vadduwm 8, 11, 8 + vadduwm 0, 5, 0 + vadduwm 7, 4, 7 + xxlxor 47, 61, 49 + xxlxor 45, 40, 45 + xxlxor 49, 32, 51 + xxlxor 34, 39, 34 + vperm 15, 15, 15, 9 + vperm 13, 13, 13, 9 + vperm 17, 17, 17, 9 + vperm 2, 2, 2, 9 + xxlor 46, 3, 3 + vadduwm 9, 29, 26 + vadduwm 8, 8, 14 + xxland 46, 47, 10 + xxland 45, 45, 10 + xxland 47, 49, 10 + xxland 34, 34, 10 + vadduwm 17, 14, 31 + vadduwm 16, 13, 16 + vadduwm 18, 15, 30 + vadduwm 3, 2, 3 + xxlxor 33, 49, 33 + xxlxor 43, 48, 43 + xxlxor 37, 50, 37 + xxlxor 36, 35, 36 + vrlw 1, 1, 12 + vrlw 11, 11, 12 + vrlw 5, 5, 12 + vrlw 4, 4, 12 + xxlor 44, 6, 6 + xxlor 0, 10, 10 + vadduwm 0, 0, 12 + xxlor 44, 2, 2 + vadduwm 9, 1, 9 + vadduwm 7, 7, 12 + vadduwm 8, 11, 8 + vadduwm 7, 4, 7 + vadduwm 0, 5, 0 + xxlxor 34, 39, 34 + xxlxor 44, 32, 47 + vperm 2, 2, 2, 6 + xxlxor 46, 41, 46 + xxlxor 45, 40, 45 + vperm 12, 12, 12, 6 + vperm 14, 14, 14, 6 + vperm 13, 13, 13, 6 + xxland 34, 34, 31 + xxlor 1, 31, 31 + vadduwm 3, 2, 3 + xxland 44, 44, 31 + xxlxor 36, 35, 36 + xxlxor 51, 35, 40 + xxland 35, 46, 31 + xxland 38, 45, 31 + vadduwm 15, 12, 18 + vadduwm 8, 3, 17 + vadduwm 13, 6, 16 + xxlxor 37, 47, 37 + xxlxor 33, 40, 33 + xxlxor 43, 45, 43 + vrlw 4, 4, 10 
+ vrlw 1, 1, 10 + vrlw 11, 11, 10 + vrlw 5, 5, 10 + xxlxor 47, 47, 41 + xxlxor 40, 40, 32 + xxlxor 39, 45, 39 + xxlxor 50, 36, 38 + xxlxor 63, 33, 44 + xxlxor 43, 43, 34 + xxlxor 41, 37, 35 + bne 0, .LBB3_2 +.LBB3_5: + vmrglw 2, 19, 15 + li 3, 32 + li 4, 48 + vmrglw 4, 7, 8 + vmrglw 0, 31, 18 + vmrglw 1, 9, 11 + vmrghw 3, 19, 15 + vmrghw 5, 7, 8 + vmrghw 6, 31, 18 + vmrghw 7, 9, 11 + xxmrgld 40, 36, 34 + xxmrghd 34, 36, 34 + xxmrgld 41, 33, 32 + xxswapd 0, 40 + xxmrgld 36, 37, 35 + xxmrghd 35, 37, 35 + xxmrghd 37, 33, 32 + xxswapd 1, 41 + xxmrgld 32, 39, 38 + xxmrghd 33, 39, 38 + xxswapd 2, 34 + xxswapd 4, 36 + xxswapd 3, 37 + stxvd2x 0, 0, 5 + xxswapd 5, 32 + stxvd2x 1, 5, 11 + xxswapd 0, 35 + xxswapd 1, 33 + stxvd2x 2, 5, 3 + li 3, 64 + stxvd2x 3, 5, 4 + li 4, 80 + stxvd2x 4, 5, 3 + li 3, 96 + stxvd2x 5, 5, 4 + li 4, 112 + stxvd2x 0, 5, 3 + stxvd2x 1, 5, 4 + li 3, 224 + lxvd2x 63, 1, 3 + li 3, 208 + lfd 31, 408(1) + ld 30, 304(1) + ld 29, 296(1) + lxvd2x 62, 1, 3 + li 3, 192 + lfd 30, 400(1) + ld 28, 288(1) + ld 27, 280(1) + lxvd2x 61, 1, 3 + li 3, 176 + lfd 29, 392(1) + ld 26, 272(1) + ld 25, 264(1) + lxvd2x 60, 1, 3 + li 3, 160 + lfd 28, 384(1) + ld 24, 256(1) + ld 23, 248(1) + lxvd2x 59, 1, 3 + li 3, 144 + lfd 27, 376(1) + ld 22, 240(1) + lxvd2x 58, 1, 3 + li 3, 128 + lfd 26, 368(1) + lxvd2x 57, 1, 3 + li 3, 112 + lfd 25, 360(1) + lxvd2x 56, 1, 3 + li 3, 96 + lfd 24, 352(1) + lxvd2x 55, 1, 3 + li 3, 80 + lfd 23, 344(1) + lxvd2x 54, 1, 3 + li 3, 64 + lfd 22, 336(1) + lxvd2x 53, 1, 3 + li 3, 48 + lfd 21, 328(1) + lxvd2x 52, 1, 3 + lfd 20, 320(1) + addi 1, 1, 416 + blr + .long 0 + .quad 0 +.Lfunc_end3: + .size blake3_hash4_sse41, .Lfunc_end3-.Lfunc_begin3 + .cfi_endproc + .section ".note.GNU-stack","",@progbits +#endif diff --git a/module/icp/asm-x86_64/blake3/blake3_avx2.S b/module/icp/asm-x86_64/blake3/blake3_avx2.S new file mode 100644 index 000000000000..b15d8fc7744e --- /dev/null +++ b/module/icp/asm-x86_64/blake3/blake3_avx2.S @@ -0,0 +1,1845 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves + * Copyright (c) 2022 Tino Reichardt + */ + +#if defined(HAVE_AVX2) + +#define _ASM +#include <sys/asm_linkage.h> + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global zfs_blake3_hash_many_avx2 +.text + +.type zfs_blake3_hash_many_avx2,@function +.p2align 6 +zfs_blake3_hash_many_avx2: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 680 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + vmovd xmm0, r9d + vpbroadcastd ymm0, xmm0 + vmovdqa ymmword ptr [rsp+0x280], ymm0 + vpand ymm1, ymm0, ymmword ptr [ADD0+rip] + vpand ymm2, ymm0, ymmword ptr [ADD1+rip] + vmovdqa ymmword ptr [rsp+0x220], ymm2 + vmovd xmm2, r8d + vpbroadcastd ymm2, xmm2 + vpaddd ymm2, ymm2, ymm1 + vmovdqa ymmword ptr [rsp+0x240], ymm2 + vpxor ymm1, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm2, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm1, ymm2 + shr r8, 32 + vmovd xmm3, r8d + vpbroadcastd ymm3, xmm3 + vpsubd ymm3, ymm3, ymm2 + vmovdqa ymmword ptr [rsp+0x260], ymm3 + shl rdx, 6 + mov qword ptr [rsp+0x2A0], rdx + cmp rsi, 8 + jc 3f +2: + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x2A0] + cmove eax, ebx + mov dword ptr [rsp+0x200], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x20], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x40], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x60], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10,
ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x80], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0xA0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0xC0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0xE0], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x100], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x120], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x140], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x160], ymm11 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm8, ymm12, ymm14, 136 + vmovaps ymmword ptr [rsp+0x180], ymm8 + vshufps ymm9, ymm12, ymm14, 221 + vmovaps ymmword ptr [rsp+0x1A0], ymm9 + vshufps ymm10, ymm13, ymm15, 136 + vmovaps ymmword ptr [rsp+0x1C0], ymm10 + vshufps ymm11, ymm13, ymm15, 221 + vmovaps ymmword ptr [rsp+0x1E0], ymm11 + vpbroadcastd ymm15, dword ptr [rsp+0x200] + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm0, ymmword ptr [rsp+0x240] + vpxor ymm13, ymm1, ymmword ptr [rsp+0x260] + vpxor ymm14, ymm2, ymmword ptr [BLAKE3_BLOCK_LEN+rip] + vpxor ymm15, ymm3, ymm15 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [BLAKE3_IV_0+rip] + vpaddd ymm9, ymm13, ymmword ptr [BLAKE3_IV_1+rip] + vpaddd ymm10, ymm14, ymmword ptr [BLAKE3_IV_2+rip] + vpaddd ymm11, ymm15, ymmword ptr [BLAKE3_IV_3+rip] + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + 
vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x100] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr 
[rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xE0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x20] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, 
ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1A0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xC0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x160] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, 
ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xA0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1C0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor 
ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x80] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xA0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x180] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x120] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x1E0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1C0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, 
ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x140] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0xE0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x40] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x60] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x120] + vpaddd ymm1, ymm1, ymmword ptr 
[rsp+0x160] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x100] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1E0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x180] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x20] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1A0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x40] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x80] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x60] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x140] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0xC0] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, 
ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x160] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0xA0] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x20] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x100] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1E0] + vpaddd ymm1, ymm1, ymmword ptr [rsp] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x120] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xC0] + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxor ymm12, ymm12, ymm0 + vpxor ymm13, ymm13, ymm1 + vpxor ymm14, ymm14, ymm2 + vpxor ymm15, ymm15, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpshufb ymm15, ymm15, ymm8 + vpaddd ymm8, ymm12, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxor ymm4, ymm4, ymm8 + vpxor ymm5, ymm5, ymm9 + vpxor ymm6, ymm6, ymm10 + vpxor ymm7, ymm7, ymm11 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x1C0] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x40] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x60] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0xE0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT16+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + 
vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vmovdqa ymmword ptr [rsp+0x200], ymm8 + vpsrld ymm8, ymm5, 12 + vpslld ymm5, ymm5, 20 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 12 + vpslld ymm6, ymm6, 20 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 12 + vpslld ymm7, ymm7, 20 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 12 + vpslld ymm4, ymm4, 20 + vpor ymm4, ymm4, ymm8 + vpaddd ymm0, ymm0, ymmword ptr [rsp+0x140] + vpaddd ymm1, ymm1, ymmword ptr [rsp+0x180] + vpaddd ymm2, ymm2, ymmword ptr [rsp+0x80] + vpaddd ymm3, ymm3, ymmword ptr [rsp+0x1A0] + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxor ymm15, ymm15, ymm0 + vpxor ymm12, ymm12, ymm1 + vpxor ymm13, ymm13, ymm2 + vpxor ymm14, ymm14, ymm3 + vbroadcasti128 ymm8, xmmword ptr [ROT8+rip] + vpshufb ymm15, ymm15, ymm8 + vpshufb ymm12, ymm12, ymm8 + vpshufb ymm13, ymm13, ymm8 + vpshufb ymm14, ymm14, ymm8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm13, ymmword ptr [rsp+0x200] + vpaddd ymm9, ymm9, ymm14 + vpxor ymm5, ymm5, ymm10 + vpxor ymm6, ymm6, ymm11 + vpxor ymm7, ymm7, ymm8 + vpxor ymm4, ymm4, ymm9 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpsrld ymm8, ymm5, 7 + vpslld ymm5, ymm5, 25 + vpor ymm5, ymm5, ymm8 + vpsrld ymm8, ymm6, 7 + vpslld ymm6, ymm6, 25 + vpor ymm6, ymm6, ymm8 + vpsrld ymm8, ymm7, 7 + vpslld ymm7, ymm7, 25 + vpor ymm7, ymm7, ymm8 + vpsrld ymm8, ymm4, 7 + vpslld ymm4, ymm4, 25 + vpor ymm4, ymm4, ymm8 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp+0x220] + vpaddd ymm1, ymm0, ymmword ptr [rsp+0x240] + vmovdqa ymmword ptr [rsp+0x240], ymm1 + vpxor ymm0, ymm0, ymmword ptr [CMP_MSB_MASK+rip] + vpxor ymm2, ymm1, ymmword ptr [CMP_MSB_MASK+rip] + vpcmpgtd ymm2, ymm0, ymm2 + vmovdqa ymm0, ymmword ptr [rsp+0x260] + vpsubd ymm2, ymm0, ymm2 + vmovdqa ymmword ptr 
[rsp+0x260], ymm2 + add rdi, 64 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + sub rsi, 8 + cmp rsi, 8 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x2A0] + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + test rsi, 0x4 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovdqa ymm8, ymm0 + vmovdqa ymm9, ymm1 + vbroadcasti128 ymm12, xmmword ptr [rsp+0x240] + vbroadcasti128 ymm13, xmmword ptr [rsp+0x260] + vpunpckldq ymm14, ymm12, ymm13 + vpunpckhdq ymm15, ymm12, ymm13 + vpermq ymm14, ymm14, 0x50 + vpermq ymm15, ymm15, 0x50 + vbroadcasti128 ymm12, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + vpblendd ymm14, ymm14, ymm12, 0x44 + vpblendd ymm15, ymm15, ymm12, 0x44 + vmovdqa ymmword ptr [rsp], ymm14 + vmovdqa ymmword ptr [rsp+0x20], ymm15 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vmovups ymm2, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm2, ymm3, 136 + vshufps ymm5, ymm2, ymm3, 221 + vmovups ymm2, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm2, ymm2, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm3, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm3, ymm3, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm2, ymm3, 136 + vshufps ymm7, ymm2, ymm3, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + vmovups ymm10, ymmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x40], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x30], 0x01 + vshufps ymm12, ymm10, ymm11, 136 + vshufps ymm13, ymm10, ymm11, 221 + vmovups ymm10, ymmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r11+rdx-0x20], 0x01 + vmovups ymm11, ymmword ptr [r10+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r11+rdx-0x10], 0x01 + vshufps ymm14, ymm10, ymm11, 136 + vshufps ymm15, ymm10, ymm11, 221 + vpshufd ymm14, ymm14, 0x93 + vpshufd ymm15, ymm15, 0x93 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + vpbroadcastd ymm2, dword ptr [rsp+0x200] + vmovdqa ymm3, ymmword ptr [rsp] + vmovdqa ymm11, ymmword ptr [rsp+0x20] + vpblendd ymm3, ymm3, ymm2, 0x88 + vpblendd ymm11, ymm11, ymm2, 0x88 + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa ymm10, ymm2 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm8, ymm8, ymm12 + vmovdqa ymmword ptr [rsp+0x40], ymm4 + nop + vmovdqa ymmword ptr [rsp+0x60], ymm12 + nop + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm0, ymm0, ymm1 + vpaddd 
ymm8, ymm8, ymm9 + vmovdqa ymmword ptr [rsp+0x80], ymm5 + vmovdqa ymmword ptr [rsp+0xA0], ymm13 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm8, ymm8, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpshufd ymm10, ymm10, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm8, ymm8, ymm14 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT16+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 12 + vpslld ymm9, ymm9, 20 + vpor ymm9, ymm9, ymm4 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm8, ymm8, ymm15 + vpaddd ymm0, ymm0, ymm1 + vpaddd ymm8, ymm8, ymm9 + vpxor ymm3, ymm3, ymm0 + vpxor ymm11, ymm11, ymm8 + vbroadcasti128 ymm4, xmmword ptr [ROT8+rip] + vpshufb ymm3, ymm3, ymm4 + vpshufb ymm11, ymm11, ymm4 + vpaddd ymm2, ymm2, ymm3 + vpaddd ymm10, ymm10, ymm11 + vpxor ymm1, ymm1, ymm2 + vpxor ymm9, ymm9, ymm10 + vpsrld ymm4, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm4 + vpsrld ymm4, ymm9, 7 + vpslld ymm9, ymm9, 25 + vpor ymm9, ymm9, ymm4 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm8, ymm8, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm11, ymm11, 0x4E + vpshufd ymm2, ymm2, 0x93 + vpshufd ymm10, ymm10, 0x93 + dec al + je 9f + vmovdqa ymm4, ymmword ptr [rsp+0x40] + vmovdqa ymm5, ymmword ptr [rsp+0x80] + vshufps ymm12, ymm4, ymm5, 214 + vpshufd ymm13, ymm4, 0x0F + vpshufd ymm4, ymm12, 0x39 + vshufps ymm12, ymm6, ymm7, 250 + vpblendd ymm13, ymm13, ymm12, 0xAA + vpunpcklqdq ymm12, ymm7, ymm5 + vpblendd ymm12, ymm12, ymm6, 0x88 + vpshufd ymm12, ymm12, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymmword ptr [rsp+0x40], ymm13 + vmovdqa ymmword ptr [rsp+0x80], ymm12 + vmovdqa ymm12, ymmword ptr [rsp+0x60] + vmovdqa ymm13, ymmword ptr [rsp+0xA0] + vshufps ymm5, ymm12, ymm13, 214 + vpshufd ymm6, ymm12, 0x0F + vpshufd ymm12, ymm5, 0x39 + vshufps ymm5, ymm14, ymm15, 250 + vpblendd ymm6, ymm6, ymm5, 0xAA + vpunpcklqdq ymm5, ymm15, ymm13 + vpblendd ymm5, ymm5, ymm14, 0x88 + vpshufd ymm5, ymm5, 0x78 + vpunpckhdq ymm13, ymm13, ymm15 + vpunpckldq ymm14, ymm14, ymm13 + vpshufd ymm15, ymm14, 0x1E + vmovdqa ymm13, ymm6 + vmovdqa ymm14, ymm5 + vmovdqa ymm5, ymmword ptr [rsp+0x40] + vmovdqa ymm6, ymmword ptr [rsp+0x80] + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + vpxor ymm8, ymm8, ymm10 + vpxor ymm9, ymm9, ymm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqu xmmword ptr [rbx+0x40], xmm8 + vmovdqu xmmword ptr [rbx+0x50], xmm9 + vextracti128 xmmword ptr [rbx+0x60], ymm8, 0x01 + vextracti128 xmmword ptr [rbx+0x70], ymm9, 0x01 + vmovaps xmm8, xmmword ptr [rsp+0x280] + vmovaps xmm0, xmmword ptr [rsp+0x240] + vmovaps xmm1, xmmword ptr [rsp+0x250] + vmovaps 
xmm2, xmmword ptr [rsp+0x260] + vmovaps xmm3, xmmword ptr [rsp+0x270] + vblendvps xmm0, xmm0, xmm1, xmm8 + vblendvps xmm2, xmm2, xmm3, xmm8 + vmovaps xmmword ptr [rsp+0x240], xmm0 + vmovaps xmmword ptr [rsp+0x260], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test rsi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp+0x240] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x244] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x264], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + vbroadcasti128 ymm14, xmmword ptr [ROT16+rip] + vbroadcasti128 ymm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x200], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x200] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm14 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 12 + vpslld ymm1, ymm1, 20 + vpor ymm1, ymm1, ymm8 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxor ymm3, ymm3, ymm0 + vpshufb ymm3, ymm3, ymm15 + vpaddd ymm2, ymm2, ymm3 + vpxor ymm1, ymm1, ymm2 + vpsrld ymm8, ymm1, 7 + vpslld ymm1, ymm1, 25 + vpor ymm1, ymm1, ymm8 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovaps ymm8, ymmword ptr 
[rsp+0x280] + vmovaps ymm0, ymmword ptr [rsp+0x240] + vmovups ymm1, ymmword ptr [rsp+0x248] + vmovaps ymm2, ymmword ptr [rsp+0x260] + vmovups ymm3, ymmword ptr [rsp+0x268] + vblendvps ymm0, ymm0, ymm1, ymm8 + vblendvps ymm2, ymm2, ymm3, ymm8 + vmovaps ymmword ptr [rsp+0x240], ymm0 + vmovaps ymmword ptr [rsp+0x260], ymm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test rsi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm3, dword ptr [rsp+0x240] + vpinsrd xmm3, xmm3, dword ptr [rsp+0x260], 1 + vpinsrd xmm13, xmm3, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm14, xmmword ptr [ROT16+rip] + vmovdqa xmm15, xmmword ptr [ROT8+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vmovdqa xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovdqa xmm3, xmm13 + vpinsrd xmm3, xmm3, eax, 3 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm14 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 12 + vpslld xmm1, xmm1, 20 + vpor xmm1, xmm1, xmm8 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxor xmm3, xmm3, xmm0 + vpshufb xmm3, xmm3, xmm15 + vpaddd xmm2, xmm2, xmm3 + vpxor xmm1, xmm1, xmm2 + vpsrld xmm8, xmm1, 7 + vpslld xmm1, xmm1, 25 + vpor xmm1, xmm1, xmm8 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.size zfs_blake3_hash_many_avx2, . 
- zfs_blake3_hash_many_avx2 + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif + +.p2align 6 +ADD0: + .long 0, 1, 2, 3, 4, 5, 6, 7 +ADD1: + .long 8, 8, 8, 8, 8, 8, 8, 8 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 + .long 0x00000040, 0x00000040, 0x00000040, 0x00000040 +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A +#endif /* HAVE_AVX2 */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/icp/asm-x86_64/blake3/blake3_avx512.S b/module/icp/asm-x86_64/blake3/blake3_avx512.S new file mode 100644 index 000000000000..d02c5e7ec92f --- /dev/null +++ b/module/icp/asm-x86_64/blake3/blake3_avx512.S @@ -0,0 +1,2618 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves + * Copyright (c) 2022 Tino Reichardt + */ + +#if defined(HAVE_AVX512F) && defined(HAVE_AVX512VL) + +#define _ASM +#include <sys/asm_linkage.h> + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include(<cet.h>) +#include <cet.h> +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global zfs_blake3_hash_many_avx512 +.global zfs_blake3_compress_in_place_avx512 +.global zfs_blake3_compress_xof_avx512 +.text + +.type zfs_blake3_hash_many_avx512,@function +.type zfs_blake3_compress_xof_avx512,@function +.type zfs_blake3_compress_in_place_avx512,@function + +.p2align 6 +zfs_blake3_hash_many_avx512: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 144 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9 + kmovw k1, r9d + vmovd xmm0, r8d + vpbroadcastd ymm0, xmm0 + shr r8, 32 + vmovd xmm1, r8d + vpbroadcastd ymm1, xmm1 + vmovdqa ymm4, ymm1 + vmovdqa ymm5, ymm1 + vpaddd ymm2, ymm0, ymmword ptr [ADD0+rip] + vpaddd ymm3, ymm0, ymmword ptr [ADD0+32+rip] + vpcmpltud k2, ymm2, ymm0 + vpcmpltud k3, ymm3, ymm0 + vpaddd ymm4 {k2}, ymm4, dword ptr [ADD1+rip] {1to8} + vpaddd ymm5 {k3}, ymm5, dword ptr [ADD1+rip] {1to8} + knotw k2, k1 + vmovdqa32 ymm2 {k2}, ymm0 + vmovdqa32 ymm3 {k2}, ymm0 + vmovdqa32 ymm4 {k2}, ymm1 + vmovdqa32 ymm5 {k2}, ymm1 + vmovdqa ymmword ptr [rsp], ymm2 + vmovdqa ymmword ptr [rsp+0x1*0x20], ymm3 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm4 + vmovdqa ymmword ptr [rsp+0x3*0x20], ymm5 + shl rdx, 6 + mov qword ptr [rsp+0x80], rdx + cmp rsi, 16 + jc 3f +2: + vpbroadcastd zmm0, dword ptr [rcx] + vpbroadcastd zmm1, dword ptr [rcx+0x1*0x4] + vpbroadcastd zmm2, dword ptr [rcx+0x2*0x4] + vpbroadcastd zmm3, dword ptr [rcx+0x3*0x4] + vpbroadcastd zmm4, dword ptr [rcx+0x4*0x4] + vpbroadcastd zmm5, dword ptr [rcx+0x5*0x4] + vpbroadcastd zmm6, dword ptr [rcx+0x6*0x4] + vpbroadcastd zmm7, dword ptr [rcx+0x7*0x4] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +.p2align 5 +9: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm8, zmm16, zmm17 + vpunpckhqdq zmm9, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm10, zmm18, zmm19 + vpunpckhqdq zmm11, zmm18, zmm19 + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr
[rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm16, ymmword ptr [rdx+r8-0x2*0x20] + vinserti32x8 zmm16, zmm16, ymmword ptr [rdx+r12-0x2*0x20], 0x01 + vmovdqu32 ymm17, ymmword ptr [rdx+r9-0x2*0x20] + vinserti32x8 zmm17, zmm17, ymmword ptr [rdx+r13-0x2*0x20], 0x01 + vpunpcklqdq zmm12, zmm16, zmm17 + vpunpckhqdq zmm13, zmm16, zmm17 + vmovdqu32 ymm18, ymmword ptr [rdx+r10-0x2*0x20] + vinserti32x8 zmm18, zmm18, ymmword ptr [rdx+r14-0x2*0x20], 0x01 + vmovdqu32 ymm19, ymmword ptr [rdx+r11-0x2*0x20] + vinserti32x8 zmm19, zmm19, ymmword ptr [rdx+r15-0x2*0x20], 0x01 + vpunpcklqdq zmm14, zmm18, zmm19 + vpunpckhqdq zmm15, zmm18, zmm19 + vmovdqa32 zmm27, zmmword ptr [INDEX0+rip] + vmovdqa32 zmm31, zmmword ptr [INDEX1+rip] + vshufps zmm16, zmm8, zmm10, 136 + vshufps zmm17, zmm12, zmm14, 136 + vmovdqa32 zmm20, zmm16 + vpermt2d zmm16, zmm27, zmm17 + vpermt2d zmm20, zmm31, zmm17 + vshufps zmm17, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm21, zmm17 + vpermt2d zmm17, zmm27, zmm30 + vpermt2d zmm21, zmm31, zmm30 + vshufps zmm18, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm22, zmm18 + vpermt2d zmm18, zmm27, zmm8 + vpermt2d zmm22, zmm31, zmm8 + vshufps zmm19, zmm9, zmm11, 221 + vshufps zmm8, zmm13, zmm15, 221 + vmovdqa32 zmm23, zmm19 + vpermt2d zmm19, zmm27, zmm8 + vpermt2d zmm23, zmm31, zmm8 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x40] + mov r13, qword ptr [rdi+0x48] + mov r14, qword ptr [rdi+0x50] + mov r15, qword ptr [rdi+0x58] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm8, zmm24, zmm25 + vpunpckhqdq zmm9, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm10, zmm24, zmm25 + vpunpckhqdq zmm11, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + mov r8, qword ptr [rdi+0x20] + mov r9, qword ptr [rdi+0x28] + mov r10, qword ptr [rdi+0x30] + mov r11, qword ptr [rdi+0x38] + mov r12, qword ptr [rdi+0x60] + mov r13, qword ptr [rdi+0x68] + mov r14, qword ptr [rdi+0x70] + mov r15, qword ptr [rdi+0x78] + vmovdqu32 ymm24, ymmword ptr [r8+rdx-0x1*0x20] + vinserti32x8 zmm24, zmm24, ymmword ptr [r12+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r9+rdx-0x1*0x20] + vinserti32x8 zmm25, zmm25, ymmword ptr [r13+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm12, zmm24, zmm25 + vpunpckhqdq zmm13, zmm24, zmm25 + vmovdqu32 ymm24, ymmword ptr [r10+rdx-0x1*0x20] + vinserti32x8 zmm24, zmm24, ymmword ptr [r14+rdx-0x1*0x20], 0x01 + vmovdqu32 ymm25, ymmword ptr [r11+rdx-0x1*0x20] + vinserti32x8 zmm25, zmm25, ymmword ptr [r15+rdx-0x1*0x20], 0x01 + vpunpcklqdq zmm14, zmm24, zmm25 + vpunpckhqdq zmm15, zmm24, zmm25 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r12+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r13+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r14+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + prefetcht0 [r15+rdx+0x80] + vshufps zmm24, zmm8, zmm10, 136 + 
vshufps zmm30, zmm12, zmm14, 136 + vmovdqa32 zmm28, zmm24 + vpermt2d zmm24, zmm27, zmm30 + vpermt2d zmm28, zmm31, zmm30 + vshufps zmm25, zmm8, zmm10, 221 + vshufps zmm30, zmm12, zmm14, 221 + vmovdqa32 zmm29, zmm25 + vpermt2d zmm25, zmm27, zmm30 + vpermt2d zmm29, zmm31, zmm30 + vshufps zmm26, zmm9, zmm11, 136 + vshufps zmm8, zmm13, zmm15, 136 + vmovdqa32 zmm30, zmm26 + vpermt2d zmm26, zmm27, zmm8 + vpermt2d zmm30, zmm31, zmm8 + vshufps zmm8, zmm9, zmm11, 221 + vshufps zmm10, zmm13, zmm15, 221 + vpermi2d zmm27, zmm8, zmm10 + vpermi2d zmm31, zmm8, zmm10 + vpbroadcastd zmm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd zmm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd zmm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd zmm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa32 zmm12, zmmword ptr [rsp] + vmovdqa32 zmm13, zmmword ptr [rsp+0x1*0x40] + vpbroadcastd zmm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd zmm15, dword ptr [rsp+0x22*0x4] + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm24 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 
8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm23 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm17 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm29 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, 
zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm22 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm27 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm21 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm30 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, 
zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm20 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm21 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm16 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm28 + vpaddd zmm1, zmm1, zmm25 + vpaddd zmm2, zmm2, zmm31 + vpaddd zmm3, zmm3, zmm30 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm26 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, 
zmm7, 7 + vpaddd zmm0, zmm0, zmm23 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm16 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm18 + vpaddd zmm1, zmm1, zmm19 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm25 + vpaddd zmm1, zmm1, zmm27 + vpaddd zmm2, zmm2, zmm24 + vpaddd zmm3, zmm3, zmm31 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm28 + vpaddd zmm3, zmm3, zmm17 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm29 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm18 + vpaddd zmm3, zmm3, zmm20 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 
+ vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm19 + vpaddd zmm1, zmm1, zmm26 + vpaddd zmm2, zmm2, zmm22 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpaddd zmm0, zmm0, zmm27 + vpaddd zmm1, zmm1, zmm21 + vpaddd zmm2, zmm2, zmm17 + vpaddd zmm3, zmm3, zmm24 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vprord zmm15, zmm15, 16 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 12 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vpaddd zmm0, zmm0, zmm31 + vpaddd zmm1, zmm1, zmm16 + vpaddd zmm2, zmm2, zmm25 + vpaddd zmm3, zmm3, zmm22 + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm1, zmm1, zmm5 + vpaddd zmm2, zmm2, zmm6 + vpaddd zmm3, zmm3, zmm7 + vpxord zmm12, zmm12, zmm0 + vpxord zmm13, zmm13, zmm1 + vpxord zmm14, zmm14, zmm2 + vpxord zmm15, zmm15, zmm3 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vprord zmm15, zmm15, 8 + vpaddd zmm8, zmm8, zmm12 + vpaddd zmm9, zmm9, zmm13 + vpaddd zmm10, zmm10, zmm14 + vpaddd zmm11, zmm11, zmm15 + vpxord zmm4, zmm4, zmm8 + vpxord zmm5, zmm5, zmm9 + vpxord zmm6, zmm6, zmm10 + vpxord zmm7, zmm7, zmm11 + vprord zmm4, zmm4, 7 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vpaddd zmm0, zmm0, zmm30 + vpaddd zmm1, zmm1, zmm18 + vpaddd zmm2, zmm2, zmm19 + vpaddd zmm3, zmm3, zmm23 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 16 + vprord zmm12, zmm12, 16 + vprord zmm13, zmm13, 16 + vprord zmm14, zmm14, 16 + vpaddd zmm10, zmm10, zmm15 + vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 12 + vprord zmm6, zmm6, 12 + vprord zmm7, zmm7, 12 + vprord zmm4, zmm4, 12 + vpaddd zmm0, zmm0, zmm26 + vpaddd zmm1, zmm1, zmm28 + vpaddd zmm2, zmm2, zmm20 + vpaddd zmm3, zmm3, zmm29 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm1, zmm1, zmm6 + vpaddd zmm2, zmm2, zmm7 + vpaddd zmm3, zmm3, zmm4 + vpxord zmm15, zmm15, zmm0 + vpxord zmm12, zmm12, zmm1 + vpxord zmm13, zmm13, zmm2 + vpxord zmm14, zmm14, zmm3 + vprord zmm15, zmm15, 8 + vprord zmm12, zmm12, 8 + vprord zmm13, zmm13, 8 + vprord zmm14, zmm14, 8 + vpaddd zmm10, zmm10, zmm15 + 
vpaddd zmm11, zmm11, zmm12 + vpaddd zmm8, zmm8, zmm13 + vpaddd zmm9, zmm9, zmm14 + vpxord zmm5, zmm5, zmm10 + vpxord zmm6, zmm6, zmm11 + vpxord zmm7, zmm7, zmm8 + vpxord zmm4, zmm4, zmm9 + vprord zmm5, zmm5, 7 + vprord zmm6, zmm6, 7 + vprord zmm7, zmm7, 7 + vprord zmm4, zmm4, 7 + vpxord zmm0, zmm0, zmm8 + vpxord zmm1, zmm1, zmm9 + vpxord zmm2, zmm2, zmm10 + vpxord zmm3, zmm3, zmm11 + vpxord zmm4, zmm4, zmm12 + vpxord zmm5, zmm5, zmm13 + vpxord zmm6, zmm6, zmm14 + vpxord zmm7, zmm7, zmm15 + movzx eax, byte ptr [rbp+0x38] + jne 9b + mov rbx, qword ptr [rbp+0x50] + vpunpckldq zmm16, zmm0, zmm1 + vpunpckhdq zmm17, zmm0, zmm1 + vpunpckldq zmm18, zmm2, zmm3 + vpunpckhdq zmm19, zmm2, zmm3 + vpunpckldq zmm20, zmm4, zmm5 + vpunpckhdq zmm21, zmm4, zmm5 + vpunpckldq zmm22, zmm6, zmm7 + vpunpckhdq zmm23, zmm6, zmm7 + vpunpcklqdq zmm0, zmm16, zmm18 + vpunpckhqdq zmm1, zmm16, zmm18 + vpunpcklqdq zmm2, zmm17, zmm19 + vpunpckhqdq zmm3, zmm17, zmm19 + vpunpcklqdq zmm4, zmm20, zmm22 + vpunpckhqdq zmm5, zmm20, zmm22 + vpunpcklqdq zmm6, zmm21, zmm23 + vpunpckhqdq zmm7, zmm21, zmm23 + vshufi32x4 zmm16, zmm0, zmm4, 0x88 + vshufi32x4 zmm17, zmm1, zmm5, 0x88 + vshufi32x4 zmm18, zmm2, zmm6, 0x88 + vshufi32x4 zmm19, zmm3, zmm7, 0x88 + vshufi32x4 zmm20, zmm0, zmm4, 0xDD + vshufi32x4 zmm21, zmm1, zmm5, 0xDD + vshufi32x4 zmm22, zmm2, zmm6, 0xDD + vshufi32x4 zmm23, zmm3, zmm7, 0xDD + vshufi32x4 zmm0, zmm16, zmm17, 0x88 + vshufi32x4 zmm1, zmm18, zmm19, 0x88 + vshufi32x4 zmm2, zmm20, zmm21, 0x88 + vshufi32x4 zmm3, zmm22, zmm23, 0x88 + vshufi32x4 zmm4, zmm16, zmm17, 0xDD + vshufi32x4 zmm5, zmm18, zmm19, 0xDD + vshufi32x4 zmm6, zmm20, zmm21, 0xDD + vshufi32x4 zmm7, zmm22, zmm23, 0xDD + vmovdqu32 zmmword ptr [rbx], zmm0 + vmovdqu32 zmmword ptr [rbx+0x1*0x40], zmm1 + vmovdqu32 zmmword ptr [rbx+0x2*0x40], zmm2 + vmovdqu32 zmmword ptr [rbx+0x3*0x40], zmm3 + vmovdqu32 zmmword ptr [rbx+0x4*0x40], zmm4 + vmovdqu32 zmmword ptr [rbx+0x5*0x40], zmm5 + vmovdqu32 zmmword ptr [rbx+0x6*0x40], zmm6 + vmovdqu32 zmmword ptr [rbx+0x7*0x40], zmm7 + vmovdqa32 zmm0, zmmword ptr [rsp] + vmovdqa32 zmm1, zmmword ptr [rsp+0x1*0x40] + vmovdqa32 zmm2, zmm0 + vpaddd zmm2{k1}, zmm0, dword ptr [ADD16+rip] {1to16} + vpcmpltud k2, zmm2, zmm0 + vpaddd zmm1 {k2}, zmm1, dword ptr [ADD1+rip] {1to16} + vmovdqa32 zmmword ptr [rsp], zmm2 + vmovdqa32 zmmword ptr [rsp+0x1*0x40], zmm1 + add rdi, 128 + add rbx, 512 + mov qword ptr [rbp+0x50], rbx + sub rsi, 16 + cmp rsi, 16 + jnc 2b + test rsi, rsi + jnz 3f +4: + vzeroupper + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 6 +3: + test esi, 0x8 + je 3f + vpbroadcastd ymm0, dword ptr [rcx] + vpbroadcastd ymm1, dword ptr [rcx+0x4] + vpbroadcastd ymm2, dword ptr [rcx+0x8] + vpbroadcastd ymm3, dword ptr [rcx+0xC] + vpbroadcastd ymm4, dword ptr [rcx+0x10] + vpbroadcastd ymm5, dword ptr [rcx+0x14] + vpbroadcastd ymm6, dword ptr [rcx+0x18] + vpbroadcastd ymm7, dword ptr [rcx+0x1C] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov r12, qword ptr [rdi+0x20] + mov r13, qword ptr [rdi+0x28] + mov r14, qword ptr [rdi+0x30] + mov r15, qword ptr [rdi+0x38] + movzx eax, byte ptr [rbp+0x38] + movzx ebx, byte ptr [rbp+0x40] + or eax, ebx + xor edx, edx +2: + movzx ebx, byte ptr [rbp+0x48] + or ebx, eax + add rdx, 64 + cmp rdx, qword ptr [rsp+0x80] + cmove eax, ebx + mov dword ptr [rsp+0x88], eax + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x40], 0x01 + vmovups xmm9, 
xmmword ptr [r9+rdx-0x40] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x40], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x40] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x40], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x40] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x40], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm16, ymm12, ymm14, 136 + vshufps ymm17, ymm12, ymm14, 221 + vshufps ymm18, ymm13, ymm15, 136 + vshufps ymm19, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x30] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x30], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x30], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x30] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x30], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x30] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x30], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm20, ymm12, ymm14, 136 + vshufps ymm21, ymm12, ymm14, 221 + vshufps ymm22, ymm13, ymm15, 136 + vshufps ymm23, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x20], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x20] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x20], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x20] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x20], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x20] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x20], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm24, ymm12, ymm14, 136 + vshufps ymm25, ymm12, ymm14, 221 + vshufps ymm26, ymm13, ymm15, 136 + vshufps ymm27, ymm13, ymm15, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x10] + vinsertf128 ymm8, ymm8, xmmword ptr [r12+rdx-0x10], 0x01 + vmovups xmm9, xmmword ptr [r9+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r13+rdx-0x10], 0x01 + vunpcklpd ymm12, ymm8, ymm9 + vunpckhpd ymm13, ymm8, ymm9 + vmovups xmm10, xmmword ptr [r10+rdx-0x10] + vinsertf128 ymm10, ymm10, xmmword ptr [r14+rdx-0x10], 0x01 + vmovups xmm11, xmmword ptr [r11+rdx-0x10] + vinsertf128 ymm11, ymm11, xmmword ptr [r15+rdx-0x10], 0x01 + vunpcklpd ymm14, ymm10, ymm11 + vunpckhpd ymm15, ymm10, ymm11 + vshufps ymm28, ymm12, ymm14, 136 + vshufps ymm29, ymm12, ymm14, 221 + vshufps ymm30, ymm13, ymm15, 136 + vshufps ymm31, ymm13, ymm15, 221 + vpbroadcastd ymm8, dword ptr [BLAKE3_IV_0+rip] + vpbroadcastd ymm9, dword ptr [BLAKE3_IV_1+rip] + vpbroadcastd ymm10, dword ptr [BLAKE3_IV_2+rip] + vpbroadcastd ymm11, dword ptr [BLAKE3_IV_3+rip] + vmovdqa ymm12, ymmword ptr [rsp] + vmovdqa ymm13, ymmword ptr [rsp+0x40] + vpbroadcastd ymm14, dword ptr [BLAKE3_BLOCK_LEN+rip] + vpbroadcastd ymm15, dword ptr [rsp+0x88] + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + 
vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm24 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm23 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + 
vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm17 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm29 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm22 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm27 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, 
ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm21 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm30 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm20 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm21 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm16 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, 
ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm28 + vpaddd ymm1, ymm1, ymm25 + vpaddd ymm2, ymm2, ymm31 + vpaddd ymm3, ymm3, ymm30 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm27 + vpaddd ymm2, ymm2, ymm26 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm23 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm16 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm18 + vpaddd ymm1, ymm1, ymm19 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm25 + vpaddd ymm1, 
ymm1, ymm27 + vpaddd ymm2, ymm2, ymm24 + vpaddd ymm3, ymm3, ymm31 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm28 + vpaddd ymm3, ymm3, ymm17 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm29 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm18 + vpaddd ymm3, ymm3, ymm20 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm19 + vpaddd ymm1, ymm1, ymm26 + vpaddd ymm2, ymm2, ymm22 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpaddd ymm0, ymm0, ymm27 + vpaddd ymm1, ymm1, ymm21 + vpaddd ymm2, ymm2, ymm17 + vpaddd ymm3, ymm3, ymm24 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vprord ymm15, ymm15, 16 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, 
ymm11 + vprord ymm4, ymm4, 12 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vpaddd ymm0, ymm0, ymm31 + vpaddd ymm1, ymm1, ymm16 + vpaddd ymm2, ymm2, ymm25 + vpaddd ymm3, ymm3, ymm22 + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm1, ymm1, ymm5 + vpaddd ymm2, ymm2, ymm6 + vpaddd ymm3, ymm3, ymm7 + vpxord ymm12, ymm12, ymm0 + vpxord ymm13, ymm13, ymm1 + vpxord ymm14, ymm14, ymm2 + vpxord ymm15, ymm15, ymm3 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vprord ymm15, ymm15, 8 + vpaddd ymm8, ymm8, ymm12 + vpaddd ymm9, ymm9, ymm13 + vpaddd ymm10, ymm10, ymm14 + vpaddd ymm11, ymm11, ymm15 + vpxord ymm4, ymm4, ymm8 + vpxord ymm5, ymm5, ymm9 + vpxord ymm6, ymm6, ymm10 + vpxord ymm7, ymm7, ymm11 + vprord ymm4, ymm4, 7 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vpaddd ymm0, ymm0, ymm30 + vpaddd ymm1, ymm1, ymm18 + vpaddd ymm2, ymm2, ymm19 + vpaddd ymm3, ymm3, ymm23 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 16 + vprord ymm12, ymm12, 16 + vprord ymm13, ymm13, 16 + vprord ymm14, ymm14, 16 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 12 + vprord ymm6, ymm6, 12 + vprord ymm7, ymm7, 12 + vprord ymm4, ymm4, 12 + vpaddd ymm0, ymm0, ymm26 + vpaddd ymm1, ymm1, ymm28 + vpaddd ymm2, ymm2, ymm20 + vpaddd ymm3, ymm3, ymm29 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm1, ymm1, ymm6 + vpaddd ymm2, ymm2, ymm7 + vpaddd ymm3, ymm3, ymm4 + vpxord ymm15, ymm15, ymm0 + vpxord ymm12, ymm12, ymm1 + vpxord ymm13, ymm13, ymm2 + vpxord ymm14, ymm14, ymm3 + vprord ymm15, ymm15, 8 + vprord ymm12, ymm12, 8 + vprord ymm13, ymm13, 8 + vprord ymm14, ymm14, 8 + vpaddd ymm10, ymm10, ymm15 + vpaddd ymm11, ymm11, ymm12 + vpaddd ymm8, ymm8, ymm13 + vpaddd ymm9, ymm9, ymm14 + vpxord ymm5, ymm5, ymm10 + vpxord ymm6, ymm6, ymm11 + vpxord ymm7, ymm7, ymm8 + vpxord ymm4, ymm4, ymm9 + vprord ymm5, ymm5, 7 + vprord ymm6, ymm6, 7 + vprord ymm7, ymm7, 7 + vprord ymm4, ymm4, 7 + vpxor ymm0, ymm0, ymm8 + vpxor ymm1, ymm1, ymm9 + vpxor ymm2, ymm2, ymm10 + vpxor ymm3, ymm3, ymm11 + vpxor ymm4, ymm4, ymm12 + vpxor ymm5, ymm5, ymm13 + vpxor ymm6, ymm6, ymm14 + vpxor ymm7, ymm7, ymm15 + movzx eax, byte ptr [rbp+0x38] + jne 2b + mov rbx, qword ptr [rbp+0x50] + vunpcklps ymm8, ymm0, ymm1 + vunpcklps ymm9, ymm2, ymm3 + vunpckhps ymm10, ymm0, ymm1 + vunpcklps ymm11, ymm4, ymm5 + vunpcklps ymm0, ymm6, ymm7 + vshufps ymm12, ymm8, ymm9, 78 + vblendps ymm1, ymm8, ymm12, 0xCC + vshufps ymm8, ymm11, ymm0, 78 + vunpckhps ymm13, ymm2, ymm3 + vblendps ymm2, ymm11, ymm8, 0xCC + vblendps ymm3, ymm12, ymm9, 0xCC + vperm2f128 ymm12, ymm1, ymm2, 0x20 + vmovups ymmword ptr [rbx], ymm12 + vunpckhps ymm14, ymm4, ymm5 + vblendps ymm4, ymm8, ymm0, 0xCC + vunpckhps ymm15, ymm6, ymm7 + vperm2f128 ymm7, ymm3, ymm4, 0x20 + vmovups ymmword ptr [rbx+0x20], ymm7 + vshufps ymm5, ymm10, ymm13, 78 + vblendps ymm6, ymm5, ymm13, 0xCC + vshufps ymm13, ymm14, ymm15, 78 + vblendps ymm10, ymm10, ymm5, 0xCC + vblendps ymm14, ymm14, ymm13, 0xCC + vperm2f128 ymm8, ymm10, ymm14, 0x20 + vmovups ymmword ptr [rbx+0x40], ymm8 + vblendps ymm15, ymm13, ymm15, 0xCC + vperm2f128 ymm13, ymm6, ymm15, 0x20 + vmovups ymmword ptr [rbx+0x60], ymm13 + 
vperm2f128 ymm9, ymm1, ymm2, 0x31 + vperm2f128 ymm11, ymm3, ymm4, 0x31 + vmovups ymmword ptr [rbx+0x80], ymm9 + vperm2f128 ymm14, ymm10, ymm14, 0x31 + vperm2f128 ymm15, ymm6, ymm15, 0x31 + vmovups ymmword ptr [rbx+0xA0], ymm11 + vmovups ymmword ptr [rbx+0xC0], ymm14 + vmovups ymmword ptr [rbx+0xE0], ymm15 + vmovdqa ymm0, ymmword ptr [rsp] + vmovdqa ymm2, ymmword ptr [rsp+0x2*0x20] + vmovdqa32 ymm0 {k1}, ymmword ptr [rsp+0x1*0x20] + vmovdqa32 ymm2 {k1}, ymmword ptr [rsp+0x3*0x20] + vmovdqa ymmword ptr [rsp], ymm0 + vmovdqa ymmword ptr [rsp+0x2*0x20], ymm2 + add rbx, 256 + mov qword ptr [rbp+0x50], rbx + add rdi, 64 + sub rsi, 8 +3: + mov rbx, qword ptr [rbp+0x50] + mov r15, qword ptr [rsp+0x80] + movzx r13, byte ptr [rbp+0x38] + movzx r12, byte ptr [rbp+0x48] + test esi, 0x4 + je 3f + vbroadcasti32x4 zmm0, xmmword ptr [rcx] + vbroadcasti32x4 zmm1, xmmword ptr [rcx+0x1*0x10] + vmovdqa xmm12, xmmword ptr [rsp] + vmovdqa xmm13, xmmword ptr [rsp+0x4*0x10] + vpunpckldq xmm14, xmm12, xmm13 + vpunpckhdq xmm15, xmm12, xmm13 + vpermq ymm14, ymm14, 0xDC + vpermq ymm15, ymm15, 0xDC + vpbroadcastd zmm12, dword ptr [BLAKE3_BLOCK_LEN+rip] + vinserti32x8 zmm13, zmm14, ymm15, 0x01 + mov eax, 17476 + kmovw k2, eax + vpblendmd zmm13 {k2}, zmm13, zmm12 + vbroadcasti32x4 zmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + mov eax, 43690 + kmovw k3, eax + mov eax, 34952 + kmovw k4, eax + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vmovdqa32 zmm2, zmm15 + vpbroadcastd zmm8, dword ptr [rsp+0x22*0x4] + vpblendmd zmm3 {k4}, zmm13, zmm8 + vmovups zmm8, zmmword ptr [r8+rdx-0x1*0x40] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x4*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x4*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x4*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x30] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x3*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x3*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x3*0x10], 0x03 + vshufps zmm4, zmm8, zmm9, 136 + vshufps zmm5, zmm8, zmm9, 221 + vmovups zmm8, zmmword ptr [r8+rdx-0x20] + vinserti32x4 zmm8, zmm8, xmmword ptr [r9+rdx-0x2*0x10], 0x01 + vinserti32x4 zmm8, zmm8, xmmword ptr [r10+rdx-0x2*0x10], 0x02 + vinserti32x4 zmm8, zmm8, xmmword ptr [r11+rdx-0x2*0x10], 0x03 + vmovups zmm9, zmmword ptr [r8+rdx-0x10] + vinserti32x4 zmm9, zmm9, xmmword ptr [r9+rdx-0x1*0x10], 0x01 + vinserti32x4 zmm9, zmm9, xmmword ptr [r10+rdx-0x1*0x10], 0x02 + vinserti32x4 zmm9, zmm9, xmmword ptr [r11+rdx-0x1*0x10], 0x03 + vshufps zmm6, zmm8, zmm9, 136 + vshufps zmm7, zmm8, zmm9, 221 + vpshufd zmm6, zmm6, 0x93 + vpshufd zmm7, zmm7, 0x93 + mov al, 7 +9: + vpaddd zmm0, zmm0, zmm4 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, zmm5 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x93 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x39 + vpaddd zmm0, zmm0, zmm6 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 16 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 12 + vpaddd zmm0, zmm0, 
zmm7 + vpaddd zmm0, zmm0, zmm1 + vpxord zmm3, zmm3, zmm0 + vprord zmm3, zmm3, 8 + vpaddd zmm2, zmm2, zmm3 + vpxord zmm1, zmm1, zmm2 + vprord zmm1, zmm1, 7 + vpshufd zmm0, zmm0, 0x39 + vpshufd zmm3, zmm3, 0x4E + vpshufd zmm2, zmm2, 0x93 + dec al + jz 9f + vshufps zmm8, zmm4, zmm5, 214 + vpshufd zmm9, zmm4, 0x0F + vpshufd zmm4, zmm8, 0x39 + vshufps zmm8, zmm6, zmm7, 250 + vpblendmd zmm9 {k3}, zmm9, zmm8 + vpunpcklqdq zmm8, zmm7, zmm5 + vpblendmd zmm8 {k4}, zmm8, zmm6 + vpshufd zmm8, zmm8, 0x78 + vpunpckhdq zmm5, zmm5, zmm7 + vpunpckldq zmm6, zmm6, zmm5 + vpshufd zmm7, zmm6, 0x1E + vmovdqa32 zmm5, zmm9 + vmovdqa32 zmm6, zmm8 + jmp 9b +9: + vpxord zmm0, zmm0, zmm2 + vpxord zmm1, zmm1, zmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vextracti32x4 xmmword ptr [rbx+0x4*0x10], zmm0, 0x02 + vextracti32x4 xmmword ptr [rbx+0x5*0x10], zmm1, 0x02 + vextracti32x4 xmmword ptr [rbx+0x6*0x10], zmm0, 0x03 + vextracti32x4 xmmword ptr [rbx+0x7*0x10], zmm1, 0x03 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x40] + vmovdqa32 xmm0 {k1}, xmmword ptr [rsp+0x1*0x10] + vmovdqa32 xmm2 {k1}, xmmword ptr [rsp+0x5*0x10] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x40], xmm2 + add rbx, 128 + add rdi, 32 + sub rsi, 4 +3: + test esi, 0x2 + je 3f + vbroadcasti128 ymm0, xmmword ptr [rcx] + vbroadcasti128 ymm1, xmmword ptr [rcx+0x10] + vmovd xmm13, dword ptr [rsp] + vpinsrd xmm13, xmm13, dword ptr [rsp+0x40], 1 + vpinsrd xmm13, xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovd xmm14, dword ptr [rsp+0x4] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x44], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vinserti128 ymm13, ymm13, xmm14, 0x01 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + mov dword ptr [rsp+0x88], eax + vbroadcasti128 ymm2, xmmword ptr [BLAKE3_IV+rip] + vpbroadcastd ymm8, dword ptr [rsp+0x88] + vpblendd ymm3, ymm13, ymm8, 0x88 + vmovups ymm8, ymmword ptr [r8+rdx-0x40] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x40], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x30] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x30], 0x01 + vshufps ymm4, ymm8, ymm9, 136 + vshufps ymm5, ymm8, ymm9, 221 + vmovups ymm8, ymmword ptr [r8+rdx-0x20] + vinsertf128 ymm8, ymm8, xmmword ptr [r9+rdx-0x20], 0x01 + vmovups ymm9, ymmword ptr [r8+rdx-0x10] + vinsertf128 ymm9, ymm9, xmmword ptr [r9+rdx-0x10], 0x01 + vshufps ymm6, ymm8, ymm9, 136 + vshufps ymm7, ymm8, ymm9, 221 + vpshufd ymm6, ymm6, 0x93 + vpshufd ymm7, ymm7, 0x93 + mov al, 7 +9: + vpaddd ymm0, ymm0, ymm4 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm5 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x93 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x39 + vpaddd ymm0, ymm0, ymm6 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 16 + vpaddd ymm2, ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 12 + vpaddd ymm0, ymm0, ymm7 + vpaddd ymm0, ymm0, ymm1 + vpxord ymm3, ymm3, ymm0 + vprord ymm3, ymm3, 8 + vpaddd ymm2, 
ymm2, ymm3 + vpxord ymm1, ymm1, ymm2 + vprord ymm1, ymm1, 7 + vpshufd ymm0, ymm0, 0x39 + vpshufd ymm3, ymm3, 0x4E + vpshufd ymm2, ymm2, 0x93 + dec al + jz 9f + vshufps ymm8, ymm4, ymm5, 214 + vpshufd ymm9, ymm4, 0x0F + vpshufd ymm4, ymm8, 0x39 + vshufps ymm8, ymm6, ymm7, 250 + vpblendd ymm9, ymm9, ymm8, 0xAA + vpunpcklqdq ymm8, ymm7, ymm5 + vpblendd ymm8, ymm8, ymm6, 0x88 + vpshufd ymm8, ymm8, 0x78 + vpunpckhdq ymm5, ymm5, ymm7 + vpunpckldq ymm6, ymm6, ymm5 + vpshufd ymm7, ymm6, 0x1E + vmovdqa ymm5, ymm9 + vmovdqa ymm6, ymm8 + jmp 9b +9: + vpxor ymm0, ymm0, ymm2 + vpxor ymm1, ymm1, ymm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + vextracti128 xmmword ptr [rbx+0x20], ymm0, 0x01 + vextracti128 xmmword ptr [rbx+0x30], ymm1, 0x01 + vmovdqa xmm0, xmmword ptr [rsp] + vmovdqa xmm2, xmmword ptr [rsp+0x4*0x10] + vmovdqu32 xmm0 {k1}, xmmword ptr [rsp+0x8] + vmovdqu32 xmm2 {k1}, xmmword ptr [rsp+0x48] + vmovdqa xmmword ptr [rsp], xmm0 + vmovdqa xmmword ptr [rsp+0x4*0x10], xmm2 + add rbx, 64 + add rdi, 16 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + vmovdqu xmm0, xmmword ptr [rcx] + vmovdqu xmm1, xmmword ptr [rcx+0x10] + vmovd xmm14, dword ptr [rsp] + vpinsrd xmm14, xmm14, dword ptr [rsp+0x40], 1 + vpinsrd xmm14, xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + vmovdqa xmm15, xmmword ptr [BLAKE3_IV+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +.p2align 5 +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + vpinsrd xmm3, xmm14, eax, 3 + vmovdqa xmm2, xmm15 + vmovups xmm8, xmmword ptr [r8+rdx-0x40] + vmovups xmm9, xmmword ptr [r8+rdx-0x30] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [r8+rdx-0x20] + vmovups xmm9, xmmword ptr [r8+rdx-0x10] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + vmovdqu xmmword ptr [rbx], xmm0 + vmovdqu xmmword ptr [rbx+0x10], xmm1 + jmp 4b +.p2align 6 +zfs_blake3_compress_in_place_avx512: + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr 
[rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, xmm8, 0x39 + vshufps xmm8, xmm6, xmm7, 250 + vpblendd xmm9, xmm9, xmm8, 0xAA + vpunpcklqdq xmm8, xmm7, xmm5 + vpblendd xmm8, xmm8, xmm6, 0x88 + vpshufd xmm8, xmm8, 0x78 + vpunpckhdq xmm5, xmm5, xmm7 + vpunpckldq xmm6, xmm6, xmm5 + vpshufd xmm7, xmm6, 0x1E + vmovdqa xmm5, xmm9 + vmovdqa xmm6, xmm8 + jmp 9b +9: + vpxor xmm0, xmm0, xmm2 + vpxor xmm1, xmm1, xmm3 + vmovdqu xmmword ptr [rdi], xmm0 + vmovdqu xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +zfs_blake3_compress_xof_avx512: + _CET_ENDBR + vmovdqu xmm0, xmmword ptr [rdi] + vmovdqu xmm1, xmmword ptr [rdi+0x10] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + vmovq xmm3, rcx + vmovq xmm4, rdx + vpunpcklqdq xmm3, xmm3, xmm4 + vmovaps xmm2, xmmword ptr [BLAKE3_IV+rip] + vmovups xmm8, xmmword ptr [rsi] + vmovups xmm9, xmmword ptr [rsi+0x10] + vshufps xmm4, xmm8, xmm9, 136 + vshufps xmm5, xmm8, xmm9, 221 + vmovups xmm8, xmmword ptr [rsi+0x20] + vmovups xmm9, xmmword ptr [rsi+0x30] + vshufps xmm6, xmm8, xmm9, 136 + vshufps xmm7, xmm8, xmm9, 221 + vpshufd xmm6, xmm6, 0x93 + vpshufd xmm7, xmm7, 0x93 + mov al, 7 +9: + vpaddd xmm0, xmm0, xmm4 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm5 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x93 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x39 + vpaddd xmm0, xmm0, xmm6 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 16 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 12 + vpaddd xmm0, xmm0, xmm7 + vpaddd xmm0, xmm0, xmm1 + vpxord xmm3, xmm3, xmm0 + vprord xmm3, xmm3, 8 + vpaddd xmm2, xmm2, xmm3 + vpxord xmm1, xmm1, xmm2 + vprord xmm1, xmm1, 7 + vpshufd xmm0, xmm0, 0x39 + vpshufd xmm3, xmm3, 0x4E + vpshufd xmm2, xmm2, 0x93 + dec al + jz 9f + vshufps xmm8, xmm4, xmm5, 214 + vpshufd xmm9, xmm4, 0x0F + vpshufd xmm4, 
xmm8, 0x39
+        vshufps xmm8, xmm6, xmm7, 250
+        vpblendd xmm9, xmm9, xmm8, 0xAA
+        vpunpcklqdq xmm8, xmm7, xmm5
+        vpblendd xmm8, xmm8, xmm6, 0x88
+        vpshufd xmm8, xmm8, 0x78
+        vpunpckhdq xmm5, xmm5, xmm7
+        vpunpckldq xmm6, xmm6, xmm5
+        vpshufd xmm7, xmm6, 0x1E
+        vmovdqa xmm5, xmm9
+        vmovdqa xmm6, xmm8
+        jmp 9b
+9:
+        vpxor xmm0, xmm0, xmm2
+        vpxor xmm1, xmm1, xmm3
+        vpxor xmm2, xmm2, [rdi]
+        vpxor xmm3, xmm3, [rdi+0x10]
+        vmovdqu xmmword ptr [r9], xmm0
+        vmovdqu xmmword ptr [r9+0x10], xmm1
+        vmovdqu xmmword ptr [r9+0x20], xmm2
+        vmovdqu xmmword ptr [r9+0x30], xmm3
+        ret
+
+.size zfs_blake3_hash_many_avx512, . - zfs_blake3_hash_many_avx512
+.size zfs_blake3_compress_in_place_avx512, . - zfs_blake3_compress_in_place_avx512
+.size zfs_blake3_compress_xof_avx512, . - zfs_blake3_compress_xof_avx512
+
+#ifdef __APPLE__
+.static_data
+#else
+.section .rodata
+#endif
+
+.p2align 6
+INDEX0:
+        .long 0, 1, 2, 3, 16, 17, 18, 19
+        .long 8, 9, 10, 11, 24, 25, 26, 27
+INDEX1:
+        .long 4, 5, 6, 7, 20, 21, 22, 23
+        .long 12, 13, 14, 15, 28, 29, 30, 31
+ADD0:
+        .long 0, 1, 2, 3, 4, 5, 6, 7
+        .long 8, 9, 10, 11, 12, 13, 14, 15
+ADD1: .long 1
+
+ADD16: .long 16
+BLAKE3_BLOCK_LEN:
+        .long 64
+.p2align 6
+BLAKE3_IV:
+BLAKE3_IV_0:
+        .long 0x6A09E667
+BLAKE3_IV_1:
+        .long 0xBB67AE85
+BLAKE3_IV_2:
+        .long 0x3C6EF372
+BLAKE3_IV_3:
+        .long 0xA54FF53A
+
+#endif /* HAVE_AVX512 */
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/module/icp/asm-x86_64/blake3/blake3_sse2.S b/module/icp/asm-x86_64/blake3/blake3_sse2.S
new file mode 100644
index 000000000000..39d23ee233df
--- /dev/null
+++ b/module/icp/asm-x86_64/blake3/blake3_sse2.S
@@ -0,0 +1,2323 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
+ * Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale
+ * Copyright (c) 2022 Tino Reichardt
+ */
+
+#if defined(HAVE_SSE2)
+
+#define _ASM
+#include <sys/asm_linkage.h>
+
+#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
+#if __has_include(<cet.h>)
+#include <cet.h>
+#endif
+#endif
+
+#if !defined(_CET_ENDBR)
+#define _CET_ENDBR
+#endif
+
+.intel_syntax noprefix
+.global zfs_blake3_hash_many_sse2
+.global zfs_blake3_compress_in_place_sse2
+.global zfs_blake3_compress_xof_sse2
+
+.text
+.type zfs_blake3_hash_many_sse2,@function
+.type zfs_blake3_compress_in_place_sse2,@function
+.type zfs_blake3_compress_xof_sse2,@function
+
+.p2align 6
+zfs_blake3_hash_many_sse2:
+        _CET_ENDBR
+        push r15
+        push r14
+        push r13
+        push r12
+        push rbx
+        push rbp
+        mov rbp, rsp
+        sub rsp, 360
+        and rsp, 0xFFFFFFFFFFFFFFC0
+        neg r9d
+        movd xmm0, r9d
+        pshufd xmm0, xmm0, 0x00
+        movdqa xmmword ptr [rsp+0x130], xmm0
+        movdqa xmm1, xmm0
+        pand xmm1, xmmword ptr [ADD0+rip]
+        pand xmm0, xmmword ptr [ADD1+rip]
+        movdqa xmmword ptr [rsp+0x150], xmm0
+        movd xmm0, r8d
+        pshufd xmm0, xmm0, 0x00
+        paddd xmm0, xmm1
+        movdqa xmmword ptr [rsp+0x110], xmm0
+        pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
+        pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
+        pcmpgtd xmm1, xmm0
+        shr r8, 32
+        movd xmm2, r8d
+        pshufd xmm2, xmm2, 0x00
+        psubd xmm2, xmm1
+        movdqa xmmword ptr [rsp+0x120], xmm2
+        mov rbx, qword ptr [rbp+0x50]
+        mov r15, rdx
+        shl r15, 6
+        movzx r13d, byte ptr [rbp+0x38]
+        movzx r12d, byte ptr [rbp+0x48]
+        cmp rsi, 4
+        jc 3f
+2:
+        movdqu xmm3, xmmword ptr [rcx]
+        pshufd xmm0, xmm3, 0x00
+        pshufd xmm1, xmm3, 0x55
+        pshufd xmm2, xmm3, 0xAA
+        pshufd xmm3, xmm3, 0xFF
+        movdqu xmm7, xmmword ptr [rcx+0x10]
+        pshufd xmm4, xmm7, 0x00
+        pshufd xmm5, xmm7, 0x55
+        pshufd xmm6, xmm7, 0xAA
+        pshufd xmm7, xmm7, 0xFF
+        mov r8, qword ptr [rdi]
+        mov r9, qword ptr [rdi+0x8]
+        mov r10, qword ptr [rdi+0x10]
+        mov r11, qword ptr [rdi+0x18]
+        movzx eax, byte ptr [rbp+0x40]
+        or eax, r13d
+        xor edx, edx
+9:
+        mov r14d, eax
+        or eax, r12d
+        add rdx, 64
+        cmp rdx, r15
+        cmovne eax, r14d
+        movdqu xmm8, xmmword ptr [r8+rdx-0x40]
+        movdqu xmm9, xmmword ptr [r9+rdx-0x40]
+        movdqu xmm10, xmmword ptr [r10+rdx-0x40]
+        movdqu xmm11, xmmword ptr [r11+rdx-0x40]
+        movdqa xmm12, xmm8
+        punpckldq xmm8, xmm9
+        punpckhdq xmm12, xmm9
+        movdqa xmm14, xmm10
+        punpckldq xmm10, xmm11
+        punpckhdq xmm14, xmm11
+        movdqa xmm9, xmm8
+        punpcklqdq xmm8, xmm10
+        punpckhqdq xmm9, xmm10
+        movdqa xmm13, xmm12
+        punpcklqdq xmm12, xmm14
+        punpckhqdq xmm13, xmm14
+        movdqa xmmword ptr [rsp], xmm8
+        movdqa xmmword ptr [rsp+0x10], xmm9
+        movdqa xmmword ptr [rsp+0x20], xmm12
+        movdqa xmmword ptr [rsp+0x30], xmm13
+        movdqu xmm8, xmmword ptr [r8+rdx-0x30]
+        movdqu xmm9, xmmword ptr [r9+rdx-0x30]
+        movdqu xmm10, xmmword ptr [r10+rdx-0x30]
+        movdqu xmm11, xmmword ptr [r11+rdx-0x30]
+        movdqa xmm12, xmm8
+        punpckldq xmm8, xmm9
+        punpckhdq xmm12, xmm9
+        movdqa xmm14, xmm10
+        punpckldq xmm10, xmm11
+        punpckhdq xmm14, xmm11
+        movdqa xmm9, xmm8
+        punpcklqdq xmm8, xmm10
+        punpckhqdq xmm9, xmm10
+        movdqa xmm13, xmm12
+        punpcklqdq xmm12, xmm14
+        punpckhqdq xmm13, xmm14
+        movdqa xmmword ptr [rsp+0x40], xmm8
+        movdqa xmmword ptr [rsp+0x50], xmm9
+        movdqa xmmword ptr [rsp+0x60], xmm12
+        movdqa xmmword ptr
[rsp+0x70], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 
+ psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd 
xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw 
xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 
+ pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + 
psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + 
pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor 
xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 
+ pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + pshuflw xmm15, xmm15, 0xB1 + pshufhw xmm15, xmm15, 0xB1 + pshuflw xmm12, xmm12, 0xB1 + pshufhw xmm12, xmm12, 0xB1 + pshuflw xmm13, xmm13, 0xB1 + pshufhw xmm13, xmm13, 0xB1 + pshuflw xmm14, xmm14, 0xB1 + pshufhw xmm14, xmm14, 0xB1 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmm15 + psrld xmm15, 8 + pslld xmm8, 24 + pxor xmm15, xmm8 + movdqa xmm8, xmm12 + psrld xmm12, 8 + pslld xmm8, 24 + pxor xmm12, xmm8 + movdqa xmm8, xmm13 + psrld xmm13, 8 + pslld xmm8, 24 + pxor xmm13, xmm8 + movdqa xmm8, xmm14 + psrld xmm14, 8 + pslld xmm8, 24 + pxor xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + 
mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + movd xmm13, dword ptr [rsp+0x124] + punpckldq xmm14, xmm13 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + shl rax, 0x20 + or rax, 0x40 + movq xmm3, rax + movdqa xmmword ptr [rsp+0x20], xmm3 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + punpcklqdq xmm3, xmmword ptr [rsp+0x20] + punpcklqdq xmm11, xmmword ptr [rsp+0x20] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + 
movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + pshuflw xmm11, xmm11, 0xB1 + pshufhw xmm11, xmm11, 0xB1 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movdqa xmm13, xmm3 + psrld xmm3, 8 + pslld xmm13, 24 + pxor xmm3, xmm13 + movdqa xmm13, xmm11 + psrld xmm11, 8 + pslld xmm13, 24 + pxor xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm13, xmm12 + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + movdqa xmm13, xmm6 + pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm12, xmm13 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm6, xmm5 + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + movdqa xmmword ptr [rsp+0x30], xmm2 + movdqa xmm2, xmm14 + pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm5, xmm2 + movdqa xmm2, xmmword ptr [rsp+0x30] + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword 
ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + mov eax, dword ptr [rsp+0x130] + neg eax + mov r10d, dword ptr [rsp+0x110+8*rax] + mov r11d, dword ptr [rsp+0x120+8*rax] + mov dword ptr [rsp+0x110], r10d + mov dword ptr [rsp+0x120], r11d + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + movd xmm14, dword ptr [rsp+0x120] + punpckldq xmm13, xmm14 + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl rax, 32 + or rax, 64 + movq xmm12, rax + movdqa xmm3, xmm13 + punpcklqdq xmm3, xmm12 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b + +.p2align 6 +zfs_blake3_compress_in_place_sse2: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl r8, 32 + add rdx, r8 + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr 
[rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 + ret + +.p2align 6 +zfs_blake3_compress_xof_sse2: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshuflw xmm3, xmm3, 0xB1 + pshufhw xmm3, xmm3, 0xB1 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + movdqa xmm14, xmm3 + psrld xmm3, 8 + pslld xmm14, 24 + pxor xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa 
xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip] + pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip] + por xmm9, xmm8 + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + movdqa xmm10, xmm6 + pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip] + pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip] + por xmm8, xmm10 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 + ret + +.size zfs_blake3_hash_many_sse2, . - zfs_blake3_hash_many_sse2 +.size zfs_blake3_compress_in_place_sse2, . - zfs_blake3_compress_in_place_sse2 +.size zfs_blake3_compress_xof_sse2, . - zfs_blake3_compress_xof_sse2 + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 +PBLENDW_0x33_MASK: + .long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xCC_MASK: + .long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF +PBLENDW_0x3F_MASK: + .long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 +PBLENDW_0xC0_MASK: + .long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF + +#endif /* HAVE_SSE2 */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/icp/asm-x86_64/blake3/blake3_sse41.S b/module/icp/asm-x86_64/blake3/blake3_sse41.S new file mode 100644 index 000000000000..1c40236f0628 --- /dev/null +++ b/module/icp/asm-x86_64/blake3/blake3_sse41.S @@ -0,0 +1,2058 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves + * Copyright (c) 2022 Tino Reichardt + */ + +#if defined(HAVE_SSE4_1) + +#define _ASM +#include + +#if defined(__ELF__) && defined(__CET__) && defined(__has_include) +#if __has_include() +#include +#endif +#endif + +#if !defined(_CET_ENDBR) +#define _CET_ENDBR +#endif + +.intel_syntax noprefix +.global zfs_blake3_compress_in_place_sse41 +.global zfs_blake3_compress_xof_sse41 +.global zfs_blake3_hash_many_sse41 + +.text +.type zfs_blake3_hash_many_sse41,@function +.type zfs_blake3_compress_in_place_sse41,@function +.type zfs_blake3_compress_xof_sse41,@function + +.p2align 6 +zfs_blake3_hash_many_sse41: + _CET_ENDBR + push r15 + push r14 + push r13 + push r12 + push rbx + push rbp + mov rbp, rsp + sub rsp, 360 + and rsp, 0xFFFFFFFFFFFFFFC0 + neg r9d + movd xmm0, r9d + pshufd xmm0, xmm0, 0x00 + movdqa xmmword ptr [rsp+0x130], xmm0 + movdqa xmm1, xmm0 + pand xmm1, xmmword ptr [ADD0+rip] + pand xmm0, xmmword ptr [ADD1+rip] + movdqa xmmword ptr [rsp+0x150], xmm0 + movd xmm0, r8d + pshufd xmm0, xmm0, 0x00 + paddd xmm0, xmm1 + movdqa xmmword ptr [rsp+0x110], xmm0 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm1, xmm0 + shr r8, 32 + movd xmm2, r8d + pshufd xmm2, xmm2, 0x00 + psubd xmm2, xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + mov rbx, qword ptr [rbp+0x50] + mov r15, rdx + shl r15, 6 + movzx r13d, byte ptr [rbp+0x38] + movzx r12d, byte ptr [rbp+0x48] + cmp rsi, 4 + jc 3f +2: + movdqu xmm3, xmmword ptr [rcx] + pshufd xmm0, xmm3, 0x00 + pshufd xmm1, xmm3, 0x55 + pshufd xmm2, xmm3, 0xAA + pshufd xmm3, xmm3, 0xFF + movdqu xmm7, xmmword ptr [rcx+0x10] + pshufd xmm4, xmm7, 0x00 + pshufd xmm5, xmm7, 0x55 + pshufd xmm6, xmm7, 0xAA + pshufd xmm7, xmm7, 0xFF + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + mov r10, qword ptr [rdi+0x10] + mov r11, qword ptr [rdi+0x18] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +9: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movdqu xmm8, xmmword ptr [r8+rdx-0x40] + movdqu xmm9, xmmword ptr [r9+rdx-0x40] + movdqu xmm10, xmmword ptr [r10+rdx-0x40] + movdqu xmm11, xmmword ptr [r11+rdx-0x40] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp], xmm8 + movdqa xmmword ptr [rsp+0x10], xmm9 + movdqa xmmword ptr [rsp+0x20], xmm12 + movdqa xmmword ptr [rsp+0x30], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x30] + movdqu xmm9, xmmword ptr [r9+rdx-0x30] + movdqu xmm10, xmmword ptr [r10+rdx-0x30] + movdqu xmm11, xmmword ptr [r11+rdx-0x30] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x40], xmm8 + movdqa xmmword ptr [rsp+0x50], xmm9 + movdqa xmmword ptr [rsp+0x60], xmm12 + movdqa xmmword ptr [rsp+0x70], xmm13 
+ movdqu xmm8, xmmword ptr [r8+rdx-0x20] + movdqu xmm9, xmmword ptr [r9+rdx-0x20] + movdqu xmm10, xmmword ptr [r10+rdx-0x20] + movdqu xmm11, xmmword ptr [r11+rdx-0x20] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0x80], xmm8 + movdqa xmmword ptr [rsp+0x90], xmm9 + movdqa xmmword ptr [rsp+0xA0], xmm12 + movdqa xmmword ptr [rsp+0xB0], xmm13 + movdqu xmm8, xmmword ptr [r8+rdx-0x10] + movdqu xmm9, xmmword ptr [r9+rdx-0x10] + movdqu xmm10, xmmword ptr [r10+rdx-0x10] + movdqu xmm11, xmmword ptr [r11+rdx-0x10] + movdqa xmm12, xmm8 + punpckldq xmm8, xmm9 + punpckhdq xmm12, xmm9 + movdqa xmm14, xmm10 + punpckldq xmm10, xmm11 + punpckhdq xmm14, xmm11 + movdqa xmm9, xmm8 + punpcklqdq xmm8, xmm10 + punpckhqdq xmm9, xmm10 + movdqa xmm13, xmm12 + punpcklqdq xmm12, xmm14 + punpckhqdq xmm13, xmm14 + movdqa xmmword ptr [rsp+0xC0], xmm8 + movdqa xmmword ptr [rsp+0xD0], xmm9 + movdqa xmmword ptr [rsp+0xE0], xmm12 + movdqa xmmword ptr [rsp+0xF0], xmm13 + movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip] + movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip] + movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip] + movdqa xmm12, xmmword ptr [rsp+0x110] + movdqa xmm13, xmmword ptr [rsp+0x120] + movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip] + movd xmm15, eax + pshufd xmm15, xmm15, 0x00 + prefetcht0 [r8+rdx+0x80] + prefetcht0 [r9+rdx+0x80] + prefetcht0 [r10+rdx+0x80] + prefetcht0 [r11+rdx+0x80] + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x80] + 
paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x70] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + 
por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x10] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0xD0] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + 
pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x60] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xB0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x50] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0xE0] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + 
pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x40] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x50] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xC0] + paddd xmm1, xmmword ptr [rsp+0x90] + paddd xmm2, xmmword ptr [rsp+0xF0] + paddd xmm3, xmmword ptr [rsp+0xE0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0xA0] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + 
paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0x70] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x20] + paddd xmm1, xmmword ptr [rsp+0x30] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x90] + paddd xmm1, xmmword ptr [rsp+0xB0] + paddd xmm2, xmmword ptr [rsp+0x80] + paddd xmm3, xmmword ptr [rsp+0xF0] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por 
xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0xC0] + paddd xmm3, xmmword ptr [rsp+0x10] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xD0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x20] + paddd xmm3, xmmword ptr [rsp+0x40] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0x30] + paddd xmm1, xmmword ptr [rsp+0xA0] + paddd xmm2, xmmword ptr [rsp+0x60] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xB0] + paddd xmm1, xmmword ptr [rsp+0x50] + paddd xmm2, xmmword ptr [rsp+0x10] + paddd xmm3, xmmword ptr [rsp+0x80] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + 
movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xF0] + paddd xmm1, xmmword ptr [rsp] + paddd xmm2, xmmword ptr [rsp+0x90] + paddd xmm3, xmmword ptr [rsp+0x60] + paddd xmm0, xmm4 + paddd xmm1, xmm5 + paddd xmm2, xmm6 + paddd xmm3, xmm7 + pxor xmm12, xmm0 + pxor xmm13, xmm1 + pxor xmm14, xmm2 + pxor xmm15, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + pshufb xmm15, xmm8 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm12 + paddd xmm9, xmm13 + paddd xmm10, xmm14 + paddd xmm11, xmm15 + pxor xmm4, xmm8 + pxor xmm5, xmm9 + pxor xmm6, xmm10 + pxor xmm7, xmm11 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + paddd xmm0, xmmword ptr [rsp+0xE0] + paddd xmm1, xmmword ptr [rsp+0x20] + paddd xmm2, xmmword ptr [rsp+0x30] + paddd xmm3, xmmword ptr [rsp+0x70] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT16+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + movdqa xmmword ptr [rsp+0x100], xmm8 + movdqa xmm8, xmm5 + psrld xmm8, 12 + pslld xmm5, 20 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 12 + pslld xmm6, 20 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 12 + pslld xmm7, 20 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 12 + pslld xmm4, 20 + por xmm4, xmm8 + paddd xmm0, xmmword ptr [rsp+0xA0] + paddd xmm1, xmmword ptr [rsp+0xC0] + paddd xmm2, xmmword ptr [rsp+0x40] + paddd xmm3, xmmword ptr [rsp+0xD0] + paddd xmm0, xmm5 + paddd xmm1, xmm6 + paddd xmm2, xmm7 + paddd xmm3, xmm4 + pxor xmm15, xmm0 + pxor xmm12, xmm1 + pxor xmm13, xmm2 + pxor xmm14, xmm3 + movdqa xmm8, xmmword ptr [ROT8+rip] + pshufb xmm15, xmm8 + pshufb xmm12, xmm8 + pshufb xmm13, xmm8 + pshufb xmm14, xmm8 + paddd xmm10, xmm15 + paddd xmm11, xmm12 + movdqa xmm8, xmmword ptr [rsp+0x100] + paddd xmm8, xmm13 + paddd xmm9, xmm14 + pxor xmm5, xmm10 + pxor xmm6, xmm11 + pxor xmm7, xmm8 + pxor xmm4, xmm9 + pxor xmm0, xmm8 + pxor xmm1, xmm9 + pxor xmm2, xmm10 + pxor xmm3, xmm11 + movdqa xmm8, xmm5 + psrld xmm8, 7 + pslld xmm5, 25 + por xmm5, xmm8 + movdqa xmm8, xmm6 + psrld xmm8, 7 + pslld xmm6, 25 + por xmm6, xmm8 + movdqa xmm8, xmm7 + psrld xmm8, 7 + pslld xmm7, 25 + por xmm7, xmm8 + movdqa xmm8, xmm4 + psrld xmm8, 7 + pslld xmm4, 25 + por xmm4, xmm8 + pxor xmm4, xmm12 + pxor xmm5, xmm13 + pxor xmm6, xmm14 + pxor xmm7, xmm15 + mov eax, r13d + jne 9b + movdqa xmm9, xmm0 + punpckldq xmm0, xmm1 + punpckhdq xmm9, xmm1 + movdqa xmm11, xmm2 + punpckldq xmm2, xmm3 + punpckhdq xmm11, xmm3 + movdqa xmm1, xmm0 + punpcklqdq xmm0, xmm2 + punpckhqdq xmm1, xmm2 + movdqa xmm3, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm3, xmm11 + movdqu xmmword ptr [rbx], xmm0 + movdqu xmmword ptr [rbx+0x20], xmm1 + movdqu xmmword ptr [rbx+0x40], xmm9 + movdqu xmmword 
ptr [rbx+0x60], xmm3 + movdqa xmm9, xmm4 + punpckldq xmm4, xmm5 + punpckhdq xmm9, xmm5 + movdqa xmm11, xmm6 + punpckldq xmm6, xmm7 + punpckhdq xmm11, xmm7 + movdqa xmm5, xmm4 + punpcklqdq xmm4, xmm6 + punpckhqdq xmm5, xmm6 + movdqa xmm7, xmm9 + punpcklqdq xmm9, xmm11 + punpckhqdq xmm7, xmm11 + movdqu xmmword ptr [rbx+0x10], xmm4 + movdqu xmmword ptr [rbx+0x30], xmm5 + movdqu xmmword ptr [rbx+0x50], xmm9 + movdqu xmmword ptr [rbx+0x70], xmm7 + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm0, xmm1 + paddd xmm1, xmmword ptr [rsp+0x150] + movdqa xmmword ptr [rsp+0x110], xmm1 + pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip] + pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip] + pcmpgtd xmm0, xmm1 + movdqa xmm1, xmmword ptr [rsp+0x120] + psubd xmm1, xmm0 + movdqa xmmword ptr [rsp+0x120], xmm1 + add rbx, 128 + add rdi, 32 + sub rsi, 4 + cmp rsi, 4 + jnc 2b + test rsi, rsi + jnz 3f +4: + mov rsp, rbp + pop rbp + pop rbx + pop r12 + pop r13 + pop r14 + pop r15 + ret +.p2align 5 +3: + test esi, 0x2 + je 3f + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movaps xmm8, xmm0 + movaps xmm9, xmm1 + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp], xmm13 + movd xmm14, dword ptr [rsp+0x114] + pinsrd xmm14, dword ptr [rsp+0x124], 1 + pinsrd xmm14, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmmword ptr [rsp+0x10], xmm14 + mov r8, qword ptr [rdi] + mov r9, qword ptr [rdi+0x8] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm10, xmm2 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm3, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm3, xmm5, 221 + movaps xmm5, xmm3 + movups xmm6, xmmword ptr [r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm3, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm3, xmm7, 221 + pshufd xmm7, xmm3, 0x93 + movups xmm12, xmmword ptr [r9+rdx-0x40] + movups xmm13, xmmword ptr [r9+rdx-0x30] + movaps xmm11, xmm12 + shufps xmm12, xmm13, 136 + shufps xmm11, xmm13, 221 + movaps xmm13, xmm11 + movups xmm14, xmmword ptr [r9+rdx-0x20] + movups xmm15, xmmword ptr [r9+rdx-0x10] + movaps xmm11, xmm14 + shufps xmm14, xmm15, 136 + pshufd xmm14, xmm14, 0x93 + shufps xmm11, xmm15, 221 + pshufd xmm15, xmm11, 0x93 + movaps xmm3, xmmword ptr [rsp] + movaps xmm11, xmmword ptr [rsp+0x10] + pinsrd xmm3, eax, 3 + pinsrd xmm11, eax, 3 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm8, xmm12 + movaps xmmword ptr [rsp+0x20], xmm4 + movaps xmmword ptr [rsp+0x30], xmm12 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm12, xmmword ptr [ROT16+rip] + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm5 + paddd xmm8, xmm13 + movaps xmmword ptr [rsp+0x40], xmm5 + movaps xmmword ptr [rsp+0x50], xmm13 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + movaps xmm13, xmmword ptr [ROT8+rip] + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + 
pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x93 + pshufd xmm8, xmm8, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x39 + pshufd xmm10, xmm10, 0x39 + paddd xmm0, xmm6 + paddd xmm8, xmm14 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm12 + pshufb xmm11, xmm12 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 20 + psrld xmm4, 12 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 20 + psrld xmm4, 12 + por xmm9, xmm4 + paddd xmm0, xmm7 + paddd xmm8, xmm15 + paddd xmm0, xmm1 + paddd xmm8, xmm9 + pxor xmm3, xmm0 + pxor xmm11, xmm8 + pshufb xmm3, xmm13 + pshufb xmm11, xmm13 + paddd xmm2, xmm3 + paddd xmm10, xmm11 + pxor xmm1, xmm2 + pxor xmm9, xmm10 + movdqa xmm4, xmm1 + pslld xmm1, 25 + psrld xmm4, 7 + por xmm1, xmm4 + movdqa xmm4, xmm9 + pslld xmm9, 25 + psrld xmm4, 7 + por xmm9, xmm4 + pshufd xmm0, xmm0, 0x39 + pshufd xmm8, xmm8, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm11, xmm11, 0x4E + pshufd xmm2, xmm2, 0x93 + pshufd xmm10, xmm10, 0x93 + dec al + je 9f + movdqa xmm12, xmmword ptr [rsp+0x20] + movdqa xmm5, xmmword ptr [rsp+0x40] + pshufd xmm13, xmm12, 0x0F + shufps xmm12, xmm5, 214 + pshufd xmm4, xmm12, 0x39 + movdqa xmm12, xmm6 + shufps xmm12, xmm7, 250 + pblendw xmm13, xmm12, 0xCC + movdqa xmm12, xmm7 + punpcklqdq xmm12, xmm5 + pblendw xmm12, xmm6, 0xC0 + pshufd xmm12, xmm12, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmmword ptr [rsp+0x20], xmm13 + movdqa xmmword ptr [rsp+0x40], xmm12 + movdqa xmm5, xmmword ptr [rsp+0x30] + movdqa xmm13, xmmword ptr [rsp+0x50] + pshufd xmm6, xmm5, 0x0F + shufps xmm5, xmm13, 214 + pshufd xmm12, xmm5, 0x39 + movdqa xmm5, xmm14 + shufps xmm5, xmm15, 250 + pblendw xmm6, xmm5, 0xCC + movdqa xmm5, xmm15 + punpcklqdq xmm5, xmm13 + pblendw xmm5, xmm14, 0xC0 + pshufd xmm5, xmm5, 0x78 + punpckhdq xmm13, xmm15 + punpckldq xmm14, xmm13 + pshufd xmm15, xmm14, 0x1E + movdqa xmm13, xmm6 + movdqa xmm14, xmm5 + movdqa xmm5, xmmword ptr [rsp+0x20] + movdqa xmm6, xmmword ptr [rsp+0x40] + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm8, xmm10 + pxor xmm9, xmm11 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + movups xmmword ptr [rbx+0x20], xmm8 + movups xmmword ptr [rbx+0x30], xmm9 + movdqa xmm0, xmmword ptr [rsp+0x130] + movdqa xmm1, xmmword ptr [rsp+0x110] + movdqa xmm2, xmmword ptr [rsp+0x120] + movdqu xmm3, xmmword ptr [rsp+0x118] + movdqu xmm4, xmmword ptr [rsp+0x128] + blendvps xmm1, xmm3, xmm0 + blendvps xmm2, xmm4, xmm0 + movdqa xmmword ptr [rsp+0x110], xmm1 + movdqa xmmword ptr [rsp+0x120], xmm2 + add rdi, 16 + add rbx, 64 + sub rsi, 2 +3: + test esi, 0x1 + je 4b + movups xmm0, xmmword ptr [rcx] + movups xmm1, xmmword ptr [rcx+0x10] + movd xmm13, dword ptr [rsp+0x110] + pinsrd xmm13, dword ptr [rsp+0x120], 1 + pinsrd xmm13, dword ptr [BLAKE3_BLOCK_LEN+rip], 2 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov r8, qword ptr [rdi] + movzx eax, byte ptr [rbp+0x40] + or eax, r13d + xor edx, edx +2: + mov r14d, eax + or eax, r12d + add rdx, 64 + cmp rdx, r15 + cmovne eax, r14d + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movaps xmm3, xmm13 + pinsrd xmm3, eax, 3 + movups xmm4, xmmword ptr [r8+rdx-0x40] + movups xmm5, xmmword ptr [r8+rdx-0x30] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr 
[r8+rdx-0x20] + movups xmm7, xmmword ptr [r8+rdx-0x10] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + mov eax, r13d + cmp rdx, r15 + jne 2b + movups xmmword ptr [rbx], xmm0 + movups xmmword ptr [rbx+0x10], xmm1 + jmp 4b +.p2align 6 +zfs_blake3_compress_in_place_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + shl r8, 32 + add rdx, r8 + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd 
xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + pxor xmm0, xmm2 + pxor xmm1, xmm3 + movups xmmword ptr [rdi], xmm0 + movups xmmword ptr [rdi+0x10], xmm1 + ret +.p2align 6 +zfs_blake3_compress_xof_sse41: + _CET_ENDBR + movups xmm0, xmmword ptr [rdi] + movups xmm1, xmmword ptr [rdi+0x10] + movaps xmm2, xmmword ptr [BLAKE3_IV+rip] + movzx eax, r8b + movzx edx, dl + shl rax, 32 + add rdx, rax + movq xmm3, rcx + movq xmm4, rdx + punpcklqdq xmm3, xmm4 + movups xmm4, xmmword ptr [rsi] + movups xmm5, xmmword ptr [rsi+0x10] + movaps xmm8, xmm4 + shufps xmm4, xmm5, 136 + shufps xmm8, xmm5, 221 + movaps xmm5, xmm8 + movups xmm6, xmmword ptr [rsi+0x20] + movups xmm7, xmmword ptr [rsi+0x30] + movaps xmm8, xmm6 + shufps xmm6, xmm7, 136 + pshufd xmm6, xmm6, 0x93 + shufps xmm8, xmm7, 221 + pshufd xmm7, xmm8, 0x93 + movaps xmm14, xmmword ptr [ROT8+rip] + movaps xmm15, xmmword ptr [ROT16+rip] + mov al, 7 +9: + paddd xmm0, xmm4 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm5 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x93 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x39 + paddd xmm0, xmm6 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm15 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 20 + psrld xmm11, 12 + por xmm1, xmm11 + paddd xmm0, xmm7 + paddd xmm0, xmm1 + pxor xmm3, xmm0 + pshufb xmm3, xmm14 + paddd xmm2, xmm3 + pxor xmm1, xmm2 + movdqa xmm11, xmm1 + pslld xmm1, 25 + psrld xmm11, 7 + por xmm1, xmm11 + pshufd xmm0, xmm0, 0x39 + pshufd xmm3, xmm3, 0x4E + pshufd xmm2, xmm2, 0x93 + dec al + jz 9f + movdqa xmm8, xmm4 + shufps xmm8, xmm5, 214 + pshufd xmm9, xmm4, 0x0F + pshufd xmm4, xmm8, 0x39 + movdqa xmm8, xmm6 + shufps xmm8, xmm7, 250 + pblendw xmm9, xmm8, 0xCC + movdqa xmm8, xmm7 + punpcklqdq xmm8, xmm5 + pblendw xmm8, xmm6, 0xC0 + pshufd xmm8, xmm8, 0x78 + punpckhdq xmm5, xmm7 + punpckldq xmm6, xmm5 + pshufd xmm7, xmm6, 0x1E + movdqa xmm5, xmm9 + movdqa xmm6, xmm8 + jmp 9b +9: + movdqu xmm4, xmmword ptr [rdi] + movdqu xmm5, xmmword ptr [rdi+0x10] + pxor xmm0, xmm2 + pxor xmm1, xmm3 + pxor xmm2, xmm4 + pxor xmm3, xmm5 + movups xmmword ptr [r9], xmm0 + movups xmmword ptr [r9+0x10], xmm1 + movups xmmword ptr [r9+0x20], xmm2 + movups xmmword ptr [r9+0x30], xmm3 + ret + +.size zfs_blake3_hash_many_sse41, . - zfs_blake3_hash_many_sse41 +.size zfs_blake3_compress_in_place_sse41, . - zfs_blake3_compress_in_place_sse41 +.size zfs_blake3_compress_xof_sse41, . 
- zfs_blake3_compress_xof_sse41 + +#ifdef __APPLE__ +.static_data +#else +.section .rodata +#endif +.p2align 6 +BLAKE3_IV: + .long 0x6A09E667, 0xBB67AE85 + .long 0x3C6EF372, 0xA54FF53A +ROT16: + .byte 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13 +ROT8: + .byte 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12 +ADD0: + .long 0, 1, 2, 3 +ADD1: + .long 4, 4, 4, 4 +BLAKE3_IV_0: + .long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667 +BLAKE3_IV_1: + .long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85 +BLAKE3_IV_2: + .long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372 +BLAKE3_IV_3: + .long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A +BLAKE3_BLOCK_LEN: + .long 64, 64, 64, 64 +CMP_MSB_MASK: + .long 0x80000000, 0x80000000, 0x80000000, 0x80000000 + +#endif /* HAVE_SSE4_1 */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif From e8c7db6b9c8c5e1777562003026335d5be872eea Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Tue, 8 Mar 2022 11:26:42 +0100 Subject: [PATCH 02/10] Add zfs_vsx_available() for PowerPC Add zfs_vsx_available() and update the kfpu_begin() / kfpu_end() calls for PowerPC. Signed-off-by: Tino Reichardt Co-authored-by: Rich Ercolani --- include/os/linux/kernel/linux/simd_powerpc.h | 34 ++++++++++++++++---- lib/libspl/include/sys/simd.h | 18 +++++++++++ module/icp/algs/blake3/blake3_x86-64.c | 4 +++ 3 files changed, 49 insertions(+), 7 deletions(-) diff --git a/include/os/linux/kernel/linux/simd_powerpc.h b/include/os/linux/kernel/linux/simd_powerpc.h index 108cef22f56f..31e51ea20a1d 100644 --- a/include/os/linux/kernel/linux/simd_powerpc.h +++ b/include/os/linux/kernel/linux/simd_powerpc.h @@ -57,25 +57,45 @@ #include #include -#define kfpu_allowed() 1 -#define kfpu_begin() \ - { \ - preempt_disable(); \ - enable_kernel_altivec(); \ - } +#define kfpu_allowed() 1 + #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) #define kfpu_end() \ { \ + disable_kernel_vsx(); \ disable_kernel_altivec(); \ preempt_enable(); \ } +#define kfpu_begin() \ + { \ + preempt_disable(); \ + enable_kernel_altivec(); \ + enable_kernel_vsx(); \ + } #else -/* seems that before 4.5 no-one bothered disabling ... 
*/ +/* seems that before 4.5 no-one bothered */ +#define kfpu_begin() #define kfpu_end() preempt_enable() #endif #define kfpu_init() 0 #define kfpu_fini() ((void) 0) +static inline boolean_t +zfs_vsx_available(void) +{ + boolean_t res; +#if defined(__powerpc64__) + u64 msr; +#else + u32 msr; +#endif + kfpu_begin(); + __asm volatile("mfmsr %0" : "=r"(msr)); + res = (msr & 0x800000) != 0; + kfpu_end(); + return (res); +} + /* * Check if AltiVec instruction set is available */ diff --git a/lib/libspl/include/sys/simd.h b/lib/libspl/include/sys/simd.h index 6ef836c16e5c..6a6d8b7c6191 100644 --- a/lib/libspl/include/sys/simd.h +++ b/lib/libspl/include/sys/simd.h @@ -491,6 +491,24 @@ zfs_altivec_available(void) #endif return (has_altivec); } +static inline boolean_t +zfs_vsx_available(void) +{ + boolean_t has_vsx = B_FALSE; +#if defined(__ALTIVEC__) && !defined(__FreeBSD__) + sighandler_t savesig; + savesig = signal(SIGILL, sigillhandler); + if (setjmp(env)) { + signal(SIGILL, savesig); + has_vsx = B_FALSE; + } else { + __asm__ __volatile__("xssubsp 0,0,0\n"); + signal(SIGILL, savesig); + has_vsx = B_TRUE; + } +#endif + return (has_vsx); +} #else #define kfpu_allowed() 0 diff --git a/module/icp/algs/blake3/blake3_x86-64.c b/module/icp/algs/blake3/blake3_x86-64.c index 8502f3094b5e..48715e2128d2 100644 --- a/module/icp/algs/blake3/blake3_x86-64.c +++ b/module/icp/algs/blake3/blake3_x86-64.c @@ -74,6 +74,8 @@ static boolean_t blake3_is_sse2_supported(void) { #if defined(__x86_64) return (kfpu_allowed() && zfs_sse2_available()); +#elif defined(__PPC64__) + return (kfpu_allowed() && zfs_vsx_available()); #else return (kfpu_allowed()); #endif @@ -138,6 +140,8 @@ static boolean_t blake3_is_sse41_supported(void) { #if defined(__x86_64) return (kfpu_allowed() && zfs_sse4_1_available()); +#elif defined(__PPC64__) + return (kfpu_allowed() && zfs_vsx_available()); #else return (kfpu_allowed()); #endif From 3de61e47d225f1b6f4d71c1f2dac2aa68c2af898 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Thu, 21 Apr 2022 09:48:27 +0200 Subject: [PATCH 03/10] Put checksum benchmarking and BLAKE3 into OpenZFS This commit adds the BLAKE3 hash into the OpenZFS infrastructure and introduces a new benchmarking file called chksum_bench within the kstat interface. It could be used for other checksum hashes as well. 
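As a rough sketch of that point (illustration only, not code from this patch):
adding another hash to the benchmark boils down to filling one more
chksum_stat_t slot, following the pattern chksum_benchmark() in
module/zfs/zfs_chksum.c below uses for edonr, skein, sha256, sha512 and
blake3. The "newhash" callbacks here are placeholders, not functions that
exist anywhere:

        cs = &chksum_stat_data[cbid++];
        cs->init = abd_checksum_newhash_tmpl_init;  /* context setup, may be NULL */
        cs->func = abd_checksum_newhash_native;     /* the checksum callback itself */
        cs->free = abd_checksum_newhash_tmpl_free;  /* context teardown, may be NULL */
        cs->name = "newhash";                       /* reported as "newhash-generic" */
        cs->impl = "generic";
        chksum_benchit(cs);                         /* fills cs->bs1k .. cs->bs4m */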
On Linux you can check the speeds of the checksum functions in this file now:
/proc/spl/kstat/zfs/chksum_bench

On FreeBSD via:
sysctl kstat.zfs.misc.chksum_bench

This is an example output of an i3-1005G1 test system with Debian 11:

implementation      1k      4k     16k     64k    256k      1m      4m
edonr-generic     1196    1602    1761    1749    1762    1759    1751
skein-generic      546     591     608     615     619     612     616
sha256-generic     240     300     316     314     304     285     276
sha512-generic     353     441     467     476     472     467     426
blake3-generic     308     313     313     313     312     313     312
blake3-sse2        402    1289    1423    1446    1432    1458    1413
blake3-sse41       427    1470    1625    1704    1679    1607    1629
blake3-avx2        428    1920    3095    3343    3356    3318    3204
blake3-avx512      473    2687    4905    5836    5844    5643    5374

Output on Debian 5.10.0-10-amd64 system: (Ryzen 7 5800X)

implementation      1k      4k     16k     64k    256k      1m      4m
edonr-generic     1840    2458    2665    2719    2711    2723    2693
skein-generic      870     966     996     992    1003    1005    1009
sha256-generic     415     442     453     455     457     457     457
sha512-generic     608     690     711     718     719     720     721
blake3-generic     301     313     311     309     309     310     310
blake3-sse2        343    1865    2124    2188    2180    2181    2186
blake3-sse41       364    2091    2396    2509    2463    2482    2488
blake3-avx2        365    2590    4399    4971    4915    4802    4764

Output on Debian 5.10.0-9-powerpc64le system: (POWER 9)

implementation      1k      4k     16k     64k    256k      1m      4m
edonr-generic     1213    1703    1889    1918    1957    1902    1907
skein-generic      434     492     520     522     511     525     525
sha256-generic     167     183     187     188     188     187     188
sha512-generic     186     216     222     221     225     224     224
blake3-generic     153     152     154     153     151     153     153
blake3-sse2        391    1170    1366    1406    1428    1426    1414
blake3-sse41       352    1049    1212    1174    1262    1258    1259

Output on Debian 5.10.0-11-arm64 system: (Pi400)

implementation      1k      4k     16k     64k    256k      1m      4m
edonr-generic      487     603     629     639     643     641     641
skein-generic      271     299     303     308     309     309     307
sha256-generic     117     127     128     130     130     129     130
sha512-generic     145     165     170     172     173     174     175
blake3-generic      81      29      71      89      89      89      89
blake3-sse2        112     323     368     379     380     371     374
blake3-sse41       101     315     357     368     369     364     360

Signed-off-by: Tino Reichardt
--- include/Makefile.am | 2 + include/sys/blake3.h | 120 ++++++++++++++ include/sys/zfs_chksum.h | 48 ++++++ include/sys/zfs_ioctl.h | 3 +- include/sys/zio.h | 1 + include/sys/zio_checksum.h | 12 +- module/zfs/blake3_zfs.c | 113 +++++++++++++ module/zfs/spa_misc.c | 3 + module/zfs/zfs_chksum.c | 316 +++++++++++++++++++++++++++++++++++++ module/zfs/zio_checksum.c | 6 + 10 files changed, 622 insertions(+), 2 deletions(-) create mode 100644 include/sys/blake3.h create mode 100644 include/sys/zfs_chksum.h create mode 100644 module/zfs/blake3_zfs.c create mode 100644 module/zfs/zfs_chksum.c diff --git a/include/Makefile.am b/include/Makefile.am index eee989d4a150..1a7f67e9c440 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -23,6 +23,7 @@ COMMON_H = \ sys/avl.h \ sys/avl_impl.h \ sys/bitops.h \ + sys/blake3.h \ sys/blkptr.h \ sys/bplist.h \ sys/bpobj.h \ @@ -117,6 +118,7 @@ COMMON_H = \ sys/zfeature.h \ sys/zfs_acl.h \ sys/zfs_bootenv.h \ + sys/zfs_chksum.h \ sys/zfs_context.h \ sys/zfs_debug.h \ sys/zfs_delay.h \ diff --git a/include/sys/blake3.h b/include/sys/blake3.h new file mode 100644 index 000000000000..e6650372ccda --- /dev/null +++ b/include/sys/blake3.h @@ -0,0 +1,120 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3 + * Copyright (c) 2019-2020 Samuel Neves and Jack O'Connor + * Copyright (c) 2021 Tino Reichardt + */ + +#ifndef BLAKE3_H +#define BLAKE3_H + +#ifdef _KERNEL +#include +#else +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define BLAKE3_KEY_LEN 32 +#define BLAKE3_OUT_LEN 32 +#define BLAKE3_MAX_DEPTH 54 +#define BLAKE3_BLOCK_LEN 64 +#define BLAKE3_CHUNK_LEN 1024 + +/* + * This struct is a private implementation detail. + * It has to be here because it's part of BLAKE3_CTX below. + */ +typedef struct { + uint32_t cv[8]; + uint64_t chunk_counter; + uint8_t buf[BLAKE3_BLOCK_LEN]; + uint8_t buf_len; + uint8_t blocks_compressed; + uint8_t flags; +} blake3_chunk_state_t; + +typedef struct { + uint32_t key[8]; + blake3_chunk_state_t chunk; + uint8_t cv_stack_len; + + /* + * The stack size is MAX_DEPTH + 1 because we do lazy merging. For + * example, with 7 chunks, we have 3 entries in the stack. Adding an + * 8th chunk requires a 4th entry, rather than merging everything down + * to 1, because we don't know whether more input is coming. This is + * different from how the reference implementation does things. + */ + uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; + + /* const blake3_impl_ops_t *ops */ + const void *ops; +} BLAKE3_CTX; + +/* init the context for hash operation */ +void Blake3_Init(BLAKE3_CTX *ctx); + +/* init the context for a MAC and/or tree hash operation */ +void Blake3_InitKeyed(BLAKE3_CTX *ctx, const uint8_t key[BLAKE3_KEY_LEN]); + +/* process the input bytes */ +void Blake3_Update(BLAKE3_CTX *ctx, const void *input, size_t input_len); + +/* finalize the hash computation and output the result */ +void Blake3_Final(const BLAKE3_CTX *ctx, uint8_t *out); + +/* finalize the hash computation and output the result */ +void Blake3_FinalSeek(const BLAKE3_CTX *ctx, uint64_t seek, uint8_t *out, + size_t out_len); + +/* return number of supported implementations */ +extern int blake3_get_impl_count(void); + +/* return id of selected implementation */ +extern int blake3_get_impl_id(void); + +/* return name of selected implementation */ +extern const char *blake3_get_impl_name(void); + +/* setup id as fastest implementation */ +extern void blake3_set_impl_fastest(uint32_t id); + +/* set implementation by id */ +extern void blake3_set_impl_id(uint32_t id); + +/* set implementation by name */ +extern int blake3_set_impl_name(const char *name); + +/* set startup implementation */ +extern void blake3_setup_impl(void); + +#ifdef __cplusplus +} +#endif + +#endif /* BLAKE3_H */ diff --git a/include/sys/zfs_chksum.h b/include/sys/zfs_chksum.h new file mode 100644 index 000000000000..cfd07bd0ffe7 --- /dev/null +++ b/include/sys/zfs_chksum.h @@ -0,0 +1,48 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2021 Tino Reichardt + */ + +#ifndef _ZFS_CHKSUM_H +#define _ZFS_CHKSUM_H + +#ifdef _KERNEL +#include +#else +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Benchmark the chksums of ZFS when the module is loading */ +void chksum_init(void); +void chksum_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_CHKSUM_H */ diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h index 4fb15636ecb8..94522179676a 100644 --- a/include/sys/zfs_ioctl.h +++ b/include/sys/zfs_ioctl.h @@ -124,6 +124,7 @@ typedef enum drr_headertype { * default use of "zfs send" won't encounter the bug mentioned above. */ #define DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS (1 << 27) +#define DMU_BACKUP_FEATURE_BLAKE3 (1 << 28) /* * Mask of all supported backup features @@ -134,7 +135,7 @@ typedef enum drr_headertype { DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_LARGE_DNODE | \ DMU_BACKUP_FEATURE_RAW | DMU_BACKUP_FEATURE_HOLDS | \ DMU_BACKUP_FEATURE_REDACTED | DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS | \ - DMU_BACKUP_FEATURE_ZSTD) + DMU_BACKUP_FEATURE_ZSTD | DMU_BACKUP_FEATURE_BLAKE3) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) diff --git a/include/sys/zio.h b/include/sys/zio.h index 7b78f08787bf..4b624165f8b3 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -89,6 +89,7 @@ enum zio_checksum { ZIO_CHECKSUM_SHA512, ZIO_CHECKSUM_SKEIN, ZIO_CHECKSUM_EDONR, + ZIO_CHECKSUM_BLAKE3, ZIO_CHECKSUM_FUNCTIONS }; diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h index 9a73a626229b..a2ce5081644c 100644 --- a/include/sys/zio_checksum.h +++ b/include/sys/zio_checksum.h @@ -21,7 +21,8 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2014, 2016 by Delphix. All rights reserved. - * Copyright Saso Kiselkov 2013, All rights reserved. + * Copyright (c) 2013 Saso Kiselkov, All rights reserved. + * Copyright (c) 2021 Tino Reichardt */ #ifndef _SYS_ZIO_CHECKSUM_H @@ -107,6 +108,8 @@ _SYS_ZIO_CHECKSUM_H zio_checksum_info_t /* * Checksum routines. 
*/ + +/* SHA2 */ extern zio_checksum_t abd_checksum_SHA256; extern zio_checksum_t abd_checksum_SHA512_native; extern zio_checksum_t abd_checksum_SHA512_byteswap; @@ -123,6 +126,13 @@ extern zio_checksum_t abd_checksum_edonr_byteswap; extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init; extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free; +/* BLAKE3 */ +extern zio_checksum_t abd_checksum_blake3_native; +extern zio_checksum_t abd_checksum_blake3_byteswap; +extern zio_checksum_tmpl_init_t abd_checksum_blake3_tmpl_init; +extern zio_checksum_tmpl_free_t abd_checksum_blake3_tmpl_free; + +/* Fletcher 4 */ _SYS_ZIO_CHECKSUM_H zio_abd_checksum_func_t fletcher_4_abd_ops; extern zio_checksum_t abd_fletcher_4_native; extern zio_checksum_t abd_fletcher_4_byteswap; diff --git a/module/zfs/blake3_zfs.c b/module/zfs/blake3_zfs.c new file mode 100644 index 000000000000..51c455fe7237 --- /dev/null +++ b/module/zfs/blake3_zfs.c @@ -0,0 +1,113 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2022 Tino Reichardt + */ + +#include +#include +#include +#include + +static int +blake3_incremental(void *buf, size_t size, void *arg) +{ + BLAKE3_CTX *ctx = arg; + + Blake3_Update(ctx, buf, size); + + return (0); +} + +/* + * Computes a native 256-bit BLAKE3 MAC checksum. Please note that this + * function requires the presence of a ctx_template that should be allocated + * using abd_checksum_blake3_tmpl_init. + */ +void +abd_checksum_blake3_native(abd_t *abd, uint64_t size, const void *ctx_template, + zio_cksum_t *zcp) +{ + BLAKE3_CTX *ctx; + + ctx = kmem_alloc(sizeof (*ctx), KM_NOSLEEP); + ASSERT(ctx != 0); + ASSERT(ctx_template != 0); + + memcpy(ctx, ctx_template, sizeof (*ctx)); + (void) abd_iterate_func(abd, 0, size, blake3_incremental, ctx); + Blake3_Final(ctx, (uint8_t *)zcp); + + memset(ctx, 0, sizeof (*ctx)); + kmem_free(ctx, sizeof (*ctx)); +} + +/* + * Byteswapped version of abd_checksum_blake3_native. This just invokes + * the native checksum function and byteswaps the resulting checksum (since + * BLAKE3 is internally endian-insensitive). + */ +void +abd_checksum_blake3_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + zio_cksum_t tmp; + + ASSERT(ctx_template != 0); + + abd_checksum_blake3_native(abd, size, ctx_template, &tmp); + zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); + zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); + zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); + zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]); +} + +/* + * Allocates a BLAKE3 MAC template suitable for using in BLAKE3 MAC checksum + * computations and returns a pointer to it. 
+ */ +void * +abd_checksum_blake3_tmpl_init(const zio_cksum_salt_t *salt) +{ + BLAKE3_CTX *ctx; + + ASSERT(sizeof (salt->zcs_bytes) == 32); + + /* init reference object */ + ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); + Blake3_InitKeyed(ctx, salt->zcs_bytes); + + return (ctx); +} + +/* + * Frees a BLAKE3 context template previously allocated using + * zio_checksum_blake3_tmpl_init. + */ +void +abd_checksum_blake3_tmpl_free(void *ctx_template) +{ + BLAKE3_CTX *ctx = ctx_template; + + memset(ctx, 0, sizeof (*ctx)); + kmem_free(ctx, sizeof (*ctx)); +} diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 12aec4a568eb..c57c69bd70e1 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -30,6 +30,7 @@ */ #include +#include #include #include #include @@ -2417,6 +2418,7 @@ spa_init(spa_mode_t mode) vdev_raidz_math_init(); vdev_file_init(); zfs_prop_init(); + chksum_init(); zpool_prop_init(); zpool_feature_init(); spa_config_load(); @@ -2438,6 +2440,7 @@ spa_fini(void) vdev_cache_stat_fini(); vdev_mirror_stat_fini(); vdev_raidz_math_fini(); + chksum_fini(); zil_fini(); dmu_fini(); zio_fini(); diff --git a/module/zfs/zfs_chksum.c b/module/zfs/zfs_chksum.c new file mode 100644 index 000000000000..3ebe08541b0b --- /dev/null +++ b/module/zfs/zfs_chksum.c @@ -0,0 +1,316 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2021 Tino Reichardt + */ + +#include +#include +#include +#include +#include + +#include + +static kstat_t *chksum_kstat = NULL; + +typedef struct { + const char *name; + const char *impl; + uint64_t bs1k; + uint64_t bs4k; + uint64_t bs16k; + uint64_t bs64k; + uint64_t bs256k; + uint64_t bs1m; + uint64_t bs4m; + zio_cksum_salt_t salt; + zio_checksum_t *(func); + zio_checksum_tmpl_init_t *(init); + zio_checksum_tmpl_free_t *(free); +} chksum_stat_t; + +static int chksum_stat_cnt = 0; +static chksum_stat_t *chksum_stat_data = 0; + +/* + * i3-1005G1 test output: + * + * implementation 1k 4k 16k 64k 256k 1m 4m + * fletcher-4 5421 15001 26468 32555 34720 32801 18847 + * edonr-generic 1196 1602 1761 1749 1762 1759 1751 + * skein-generic 546 591 608 615 619 612 616 + * sha256-generic 246 270 274 274 277 275 276 + * sha256-avx 262 296 304 307 307 307 306 + * sha256-sha-ni 769 1072 1172 1220 1219 1232 1228 + * sha256-openssl 240 300 316 314 304 285 276 + * sha512-generic 333 374 385 392 391 393 392 + * sha512-openssl 353 441 467 476 472 467 426 + * sha512-avx 362 444 473 475 479 476 478 + * sha512-avx2 394 500 530 538 543 545 542 + * blake3-generic 308 313 313 313 312 313 312 + * blake3-sse2 402 1289 1423 1446 1432 1458 1413 + * blake3-sse41 427 1470 1625 1704 1679 1607 1629 + * blake3-avx2 428 1920 3095 3343 3356 3318 3204 + * blake3-avx512 473 2687 4905 5836 5844 5643 5374 + */ +static int +chksum_stat_kstat_headers(char *buf, size_t size) +{ + ssize_t off = 0; + + off += snprintf(buf + off, size, "%-23s", "implementation"); + off += snprintf(buf + off, size - off, "%8s", "1k"); + off += snprintf(buf + off, size - off, "%8s", "4k"); + off += snprintf(buf + off, size - off, "%8s", "16k"); + off += snprintf(buf + off, size - off, "%8s", "64k"); + off += snprintf(buf + off, size - off, "%8s", "256k"); + off += snprintf(buf + off, size - off, "%8s", "1m"); + (void) snprintf(buf + off, size - off, "%8s\n", "4m"); + + return (0); +} + +static int +chksum_stat_kstat_data(char *buf, size_t size, void *data) +{ + chksum_stat_t *cs; + ssize_t off = 0; + char b[24]; + + cs = (chksum_stat_t *)data; + snprintf(b, 23, "%s-%s", cs->name, cs->impl); + off += snprintf(buf + off, size - off, "%-23s", b); + off += snprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs1k); + off += snprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs4k); + off += snprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs16k); + off += snprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs64k); + off += snprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs256k); + off += snprintf(buf + off, size - off, "%8llu", + (u_longlong_t)cs->bs1m); + (void) snprintf(buf + off, size - off, "%8llu\n", + (u_longlong_t)cs->bs4m); + + return (0); +} + +static void * +chksum_stat_kstat_addr(kstat_t *ksp, loff_t n) +{ + if (n < chksum_stat_cnt) + ksp->ks_private = (void *)(chksum_stat_data + n); + else + ksp->ks_private = NULL; + + return (ksp->ks_private); +} + +static void +chksum_run(chksum_stat_t *cs, abd_t *abd, void *ctx, int round, + uint64_t *result) +{ + hrtime_t start; + uint64_t run_bw, run_time_ns, run_count = 0, size = 0; + uint32_t l, loops = 0; + zio_cksum_t zcp; + + switch (round) { + case 1: /* 1k */ + size = 1<<10; loops = 128; 
break; + case 2: /* 2k */ + size = 1<<12; loops = 64; break; + case 3: /* 4k */ + size = 1<<14; loops = 32; break; + case 4: /* 16k */ + size = 1<<16; loops = 16; break; + case 5: /* 256k */ + size = 1<<18; loops = 8; break; + case 6: /* 1m */ + size = 1<<20; loops = 4; break; + case 7: /* 4m */ + size = 1<<22; loops = 1; break; + } + + kpreempt_disable(); + start = gethrtime(); + do { + for (l = 0; l < loops; l++, run_count++) + cs->func(abd, size, ctx, &zcp); + + run_time_ns = gethrtime() - start; + } while (run_time_ns < MSEC2NSEC(1)); + kpreempt_enable(); + + run_bw = size * run_count * NANOSEC; + run_bw /= run_time_ns; /* B/s */ + *result = run_bw/1024/1024; /* MiB/s */ +} + +static void +chksum_benchit(chksum_stat_t *cs) +{ + abd_t *abd; + void *ctx = 0; + void *salt = &cs->salt.zcs_bytes; + + /* allocate test memory via default abd interface */ + abd = abd_alloc_linear(1<<22, B_FALSE); + memset(salt, 0, sizeof (cs->salt.zcs_bytes)); + if (cs->init) { + ctx = cs->init(&cs->salt); + } + + chksum_run(cs, abd, ctx, 1, &cs->bs1k); + chksum_run(cs, abd, ctx, 2, &cs->bs4k); + chksum_run(cs, abd, ctx, 3, &cs->bs16k); + chksum_run(cs, abd, ctx, 4, &cs->bs64k); + chksum_run(cs, abd, ctx, 5, &cs->bs256k); + chksum_run(cs, abd, ctx, 6, &cs->bs1m); + chksum_run(cs, abd, ctx, 7, &cs->bs4m); + + /* free up temp memory */ + if (cs->free) { + cs->free(ctx); + } + abd_free(abd); +} + +/* + * Initialize and benchmark all supported implementations. + */ +static void +chksum_benchmark(void) +{ + +#ifndef _KERNEL + /* we need the benchmark only for the kernel module */ + return; +#endif + + chksum_stat_t *cs; + int cbid = 0, id; + uint64_t max = 0; + + /* space for the benchmark times */ + chksum_stat_cnt = 4; + chksum_stat_cnt += blake3_get_impl_count(); + chksum_stat_data = (chksum_stat_t *)kmem_zalloc( + sizeof (chksum_stat_t) * chksum_stat_cnt, KM_SLEEP); + + /* edonr */ + cs = &chksum_stat_data[cbid++]; + cs->init = abd_checksum_edonr_tmpl_init; + cs->func = abd_checksum_edonr_native; + cs->free = abd_checksum_edonr_tmpl_free; + cs->name = "edonr"; + cs->impl = "generic"; + chksum_benchit(cs); + + /* skein */ + cs = &chksum_stat_data[cbid++]; + cs->init = abd_checksum_skein_tmpl_init; + cs->func = abd_checksum_skein_native; + cs->free = abd_checksum_skein_tmpl_free; + cs->name = "skein"; + cs->impl = "generic"; + chksum_benchit(cs); + + /* sha256 */ + cs = &chksum_stat_data[cbid++]; + cs->init = 0; + cs->func = abd_checksum_SHA256; + cs->free = 0; + cs->name = "sha256"; + cs->impl = "generic"; + chksum_benchit(cs); + + /* sha512 */ + cs = &chksum_stat_data[cbid++]; + cs->init = 0; + cs->func = abd_checksum_SHA512_native; + cs->free = 0; + cs->name = "sha512"; + cs->impl = "generic"; + chksum_benchit(cs); + + /* blake3 */ + for (id = 0; id < blake3_get_impl_count(); id++) { + blake3_set_impl_id(id); + cs = &chksum_stat_data[cbid++]; + cs->init = abd_checksum_blake3_tmpl_init; + cs->func = abd_checksum_blake3_native; + cs->free = abd_checksum_blake3_tmpl_free; + cs->name = "blake3"; + cs->impl = blake3_get_impl_name(); + chksum_benchit(cs); + if (cs->bs256k > max) { + max = cs->bs256k; + blake3_set_impl_fastest(id); + } + } +} + +void +chksum_init(void) +{ + + /* Benchmark supported implementations */ + chksum_benchmark(); + + /* Install kstats for all implementations */ + chksum_kstat = kstat_create("zfs", 0, "chksum_bench", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + + if (chksum_kstat != NULL) { + chksum_kstat->ks_data = NULL; + chksum_kstat->ks_ndata = UINT32_MAX; + 
kstat_set_raw_ops(chksum_kstat, + chksum_stat_kstat_headers, + chksum_stat_kstat_data, + chksum_stat_kstat_addr); + kstat_install(chksum_kstat); + } + + /* setup implementations */ + blake3_setup_impl(); +} + +void +chksum_fini(void) +{ + if (chksum_kstat != NULL) { + kstat_delete(chksum_kstat); + chksum_kstat = NULL; + } + + if (chksum_stat_cnt) { + kmem_free(chksum_stat_data, + sizeof (chksum_stat_t) * chksum_stat_cnt); + chksum_stat_cnt = 0; + chksum_stat_data = 0; + } +} diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index d89e5765326f..3c5cdf604100 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -195,6 +195,10 @@ zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, + {{abd_checksum_blake3_native, abd_checksum_blake3_byteswap}, + abd_checksum_blake3_tmpl_init, abd_checksum_blake3_tmpl_free, + ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "blake3"}, }; /* @@ -207,6 +211,8 @@ zio_checksum_to_feature(enum zio_checksum cksum) VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0); switch (cksum) { + case ZIO_CHECKSUM_BLAKE3: + return (SPA_FEATURE_BLAKE3); case ZIO_CHECKSUM_SHA512: return (SPA_FEATURE_SHA512); case ZIO_CHECKSUM_SKEIN: From 348c40e6728a15f444d9bef9c1a958b4e7cda9a1 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Tue, 17 May 2022 18:53:43 +0200 Subject: [PATCH 04/10] Introduce BLAKE3 as new feature for OpenZFS This commit puts BLAKE3 into the needed structures. Signed-off-by: Tino Reichardt --- include/zfeature_common.h | 1 + module/zcommon/zfeature_common.c | 31 +++++++++++++++++++++---------- module/zcommon/zfs_prop.c | 8 ++++++-- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/include/zfeature_common.h b/include/zfeature_common.h index d4d636f9c266..d98345fe6850 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -77,6 +77,7 @@ typedef enum spa_feature { SPA_FEATURE_DRAID, SPA_FEATURE_ZILSAXATTR, SPA_FEATURE_HEAD_ERRLOG, + SPA_FEATURE_BLAKE3, SPA_FEATURES } spa_feature_t; diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index f09389e6d02e..4df09884aa91 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -696,16 +696,15 @@ zpool_feature_init(void) ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); { - - static const spa_feature_t zilsaxattr_deps[] = { - SPA_FEATURE_EXTENSIBLE_DATASET, - SPA_FEATURE_NONE - }; - zfeature_register(SPA_FEATURE_ZILSAXATTR, - "org.openzfs:zilsaxattr", "zilsaxattr", - "Support for xattr=sa extended attribute logging in ZIL.", - ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT, - ZFEATURE_TYPE_BOOLEAN, zilsaxattr_deps, sfeatures); + static const spa_feature_t zilsaxattr_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_ZILSAXATTR, + "org.openzfs:zilsaxattr", "zilsaxattr", + "Support for xattr=sa extended attribute logging in ZIL.", + ZFEATURE_FLAG_PER_DATASET | ZFEATURE_FLAG_READONLY_COMPAT, + ZFEATURE_TYPE_BOOLEAN, zilsaxattr_deps, sfeatures); } zfeature_register(SPA_FEATURE_HEAD_ERRLOG, @@ -714,6 +713,18 @@ zpool_feature_init(void) ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL, sfeatures); + { + static const spa_feature_t blake3_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + 
zfeature_register(SPA_FEATURE_BLAKE3, + "org.openzfs:blake3", "blake3", + "BLAKE3 hash algorithm.", + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, + blake3_deps, sfeatures); + } + zfs_mod_list_supported_free(sfeatures); } diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c index 500d80a33b6b..32475611e11f 100644 --- a/module/zcommon/zfs_prop.c +++ b/module/zcommon/zfs_prop.c @@ -84,6 +84,7 @@ zfs_prop_init(void) { "sha512", ZIO_CHECKSUM_SHA512 }, { "skein", ZIO_CHECKSUM_SKEIN }, { "edonr", ZIO_CHECKSUM_EDONR }, + { "blake3", ZIO_CHECKSUM_BLAKE3 }, { NULL } }; @@ -102,6 +103,9 @@ zfs_prop_init(void) ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY }, { "edonr,verify", ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY }, + { "blake3", ZIO_CHECKSUM_BLAKE3 }, + { "blake3,verify", + ZIO_CHECKSUM_BLAKE3 | ZIO_CHECKSUM_VERIFY }, { NULL } }; @@ -394,12 +398,12 @@ zfs_prop_init(void) ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | fletcher2 | fletcher4 | sha256 | sha512 | skein" - " | edonr", + " | edonr | blake3", "CHECKSUM", checksum_table, sfeatures); zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | verify | sha256[,verify] | sha512[,verify] | " - "skein[,verify] | edonr,verify", + "skein[,verify] | edonr,verify | blake3[,verify]", "DEDUP", dedup_table, sfeatures); zprop_register_index(ZFS_PROP_COMPRESSION, "compression", ZIO_COMPRESS_DEFAULT, PROP_INHERIT, From a9819d62da8ce34a9efd7dfb994bc7a1f2028f41 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Tue, 4 Jan 2022 21:46:47 +0100 Subject: [PATCH 05/10] Fix compiling on FreeBSD This commit fixes this compiling error: error: unexpected token in argument list typedef int bool_t; ^ typedef int enum_t; ^ *** Error code 1 Signed-off-by: Tino Reichardt --- include/os/freebsd/spl/sys/ccompile.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/os/freebsd/spl/sys/ccompile.h b/include/os/freebsd/spl/sys/ccompile.h index a46a3a18be14..90b077a7be4e 100644 --- a/include/os/freebsd/spl/sys/ccompile.h +++ b/include/os/freebsd/spl/sys/ccompile.h @@ -74,10 +74,12 @@ extern "C" { #ifndef LOCORE #ifndef HAVE_RPC_TYPES +#ifndef _KERNEL typedef int bool_t; typedef int enum_t; #endif #endif +#endif #ifndef __cplusplus #define __init From 662e0ac9330afef1ca383d9a5c820de9db0d2ca9 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Tue, 4 Jan 2022 21:47:48 +0100 Subject: [PATCH 06/10] Document BLAKE3 in the manpages Add some notes about BLAKE3 with the manpages. I would also change the OpenZFS documentation located here: https://openzfs.github.io/openzfs-docs/Basic%20Concepts/Checksums.html Signed-off-by: Tino Reichardt Co-authored-by: Rich Ercolani --- man/man7/zfsprops.7 | 7 ++++--- man/man7/zpool-features.7 | 8 ++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/man/man7/zfsprops.7 b/man/man7/zfsprops.7 index 2694938aa206..b1e1ce377fe2 100644 --- a/man/man7/zfsprops.7 +++ b/man/man7/zfsprops.7 @@ -743,7 +743,7 @@ This property is not inherited. .It Xo .Sy checksum Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy fletcher2 Ns | Ns .Sy fletcher4 Ns | Ns Sy sha256 Ns | Ns Sy noparity Ns | Ns -.Sy sha512 Ns | Ns Sy skein Ns | Ns Sy edonr +.Sy sha512 Ns | Ns Sy skein Ns | Ns Sy edonr Ns | Ns Sy blake3 .Xc Controls the checksum used to verify data integrity. The default value is @@ -768,8 +768,9 @@ a recommended practice. 
The .Sy sha512 , .Sy skein , +.Sy edonr , and -.Sy edonr +.Sy blake3 checksum algorithms require enabling the appropriate features on the pool. .Pp Please see @@ -984,7 +985,7 @@ mount options. .It Xo .Sy dedup Ns = Ns Sy off Ns | Ns Sy on Ns | Ns Sy verify Ns | Ns .Sy sha256 Ns Oo , Ns Sy verify Oc Ns | Ns Sy sha512 Ns Oo , Ns Sy verify Oc Ns | Ns Sy skein Ns Oo , Ns Sy verify Oc Ns | Ns -.Sy edonr , Ns Sy verify +.Sy edonr , Ns Sy verify Ns | Ns Sy blake3 Ns Oo , Ns Sy verify Oc Ns .Xc Configures deduplication for a dataset. The default value is diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index b92109c4ac98..df9e64701e37 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -326,6 +326,12 @@ while .Sy freeing is non-zero. . +.feature org.openzfs blake3 no extensible_dataset +This feature enables the use of the BLAKE3 hash algorithm for checksum and dedup. +BLAKE3 is a secure hash algorithm focused on high performance. +.Pp +.checksum-spiel blake3 +. .feature com.delphix bookmarks yes extensible_dataset This feature enables use of the .Nm zfs Cm bookmark @@ -436,6 +442,8 @@ in ZFS, which means that the checksum is pre-seeded with a secret to be checksummed. Thus the produced checksums are unique to a given pool, preventing hash collision attacks on systems with dedup. +.Pp +.checksum-spiel edonr . .feature com.delphix embedded_data no This feature improves the performance and compression ratio of From d27c879d6e0486dd26effb306497b2cc1f9e66cb Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Sat, 14 May 2022 20:08:28 +0200 Subject: [PATCH 07/10] Add BLAKE3 to the ztest command Add the new cryptographic hash function BLAKE3 to the ztest command. The generic implementation is used for reference. Signed-off-by: Tino Reichardt --- cmd/ztest.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/cmd/ztest.c b/cmd/ztest.c index ca05cf26511e..95f6107ff420 100644 --- a/cmd/ztest.c +++ b/cmd/ztest.c @@ -121,6 +121,7 @@ #include #include #include +#include #include #include #include @@ -417,6 +418,7 @@ ztest_func_t ztest_device_removal; ztest_func_t ztest_spa_checkpoint_create_discard; ztest_func_t ztest_initialize; ztest_func_t ztest_trim; +ztest_func_t ztest_blake3; ztest_func_t ztest_fletcher; ztest_func_t ztest_fletcher_incr; ztest_func_t ztest_verify_dnode_bt; @@ -470,6 +472,7 @@ ztest_info_t ztest_info[] = { ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely), ZTI_INIT(ztest_initialize, 1, &zopt_sometimes), ZTI_INIT(ztest_trim, 1, &zopt_sometimes), + ZTI_INIT(ztest_blake3, 1, &zopt_rarely), ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), @@ -6373,6 +6376,92 @@ ztest_reguid(ztest_ds_t *zd, uint64_t id) VERIFY3U(load, ==, spa_load_guid(spa)); } +void +ztest_blake3(ztest_ds_t *zd, uint64_t id) +{ + (void) zd, (void) id; + hrtime_t end = gethrtime() + NANOSEC; + zio_cksum_salt_t salt; + void *salt_ptr = &salt.zcs_bytes; + struct abd *abd_data, *abd_meta; + void *buf, *templ; + int i, *ptr; + uint32_t size; + BLAKE3_CTX ctx; + + size = ztest_random_blocksize(); + buf = umem_alloc(size, UMEM_NOFAIL); + abd_data = abd_alloc(size, B_FALSE); + abd_meta = abd_alloc(size, B_TRUE); + + for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) + *ptr = ztest_random(UINT_MAX); + memset(salt_ptr, 'A', 32); + + abd_copy_from_buf_off(abd_data, buf, 0, size); + abd_copy_from_buf_off(abd_meta, buf, 0, size); 
+ + while (gethrtime() <= end) { + int run_count = 100; + zio_cksum_t zc_ref1, zc_ref2; + zio_cksum_t zc_res1, zc_res2; + + void *ref1 = &zc_ref1; + void *ref2 = &zc_ref2; + void *res1 = &zc_res1; + void *res2 = &zc_res2; + + /* BLAKE3_KEY_LEN = 32 */ + VERIFY0(blake3_set_impl_name("generic")); + templ = abd_checksum_blake3_tmpl_init(&salt); + Blake3_InitKeyed(&ctx, salt_ptr); + Blake3_Update(&ctx, buf, size); + Blake3_Final(&ctx, ref1); + zc_ref2 = zc_ref1; + ZIO_CHECKSUM_BSWAP(&zc_ref2); + abd_checksum_blake3_tmpl_free(templ); + + VERIFY0(blake3_set_impl_name("cycle")); + while (run_count-- > 0) { + + /* Test current implementation */ + Blake3_InitKeyed(&ctx, salt_ptr); + Blake3_Update(&ctx, buf, size); + Blake3_Final(&ctx, res1); + zc_res2 = zc_res1; + ZIO_CHECKSUM_BSWAP(&zc_res2); + + VERIFY0(memcmp(ref1, res1, 32)); + VERIFY0(memcmp(ref2, res2, 32)); + + /* Test ABD - data */ + templ = abd_checksum_blake3_tmpl_init(&salt); + abd_checksum_blake3_native(abd_data, size, + templ, &zc_res1); + abd_checksum_blake3_byteswap(abd_data, size, + templ, &zc_res2); + + VERIFY0(memcmp(ref1, res1, 32)); + VERIFY0(memcmp(ref2, res2, 32)); + + /* Test ABD - metadata */ + abd_checksum_blake3_native(abd_meta, size, + templ, &zc_res1); + abd_checksum_blake3_byteswap(abd_meta, size, + templ, &zc_res2); + abd_checksum_blake3_tmpl_free(templ); + + VERIFY0(memcmp(ref1, res1, 32)); + VERIFY0(memcmp(ref2, res2, 32)); + + } + } + + abd_free(abd_data); + abd_free(abd_meta); + umem_free(buf, size); +} + void ztest_fletcher(ztest_ds_t *zd, uint64_t id) { From 6a4bdd43faa76506415c7413408d4eb07cda0b6e Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Sat, 14 May 2022 20:08:38 +0200 Subject: [PATCH 08/10] Add BLAKE3 to the tests of zfs-tests/tests/functional/checksum Add the needed files for testing the new hash variant within the functional tests. 
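For orientation, the heart of such a known-answer check is small. A minimal
userspace sketch (not part of this patch), assuming the Blake3_* API from
include/sys/blake3.h and the same libicp/libspl link setup the new
blake3_test binary gets below; the expected bytes are the first 32 bytes of
the unkeyed hash for the empty (input_len 0) vector:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/blake3.h>

int
main(void)
{
        /* BLAKE3 of the empty input: first 32 bytes of test vector 0 */
        static const uint8_t expected[BLAKE3_OUT_LEN] = {
                0xaf, 0x13, 0x49, 0xb9, 0xf5, 0xf9, 0xa1, 0xa6,
                0xa0, 0x40, 0x4d, 0xea, 0x36, 0xdc, 0xc9, 0x49,
                0x9b, 0xcb, 0x25, 0xc9, 0xad, 0xc1, 0x12, 0xb7,
                0xcc, 0x9a, 0x93, 0xca, 0xe4, 0x1f, 0x32, 0x62
        };
        uint8_t out[BLAKE3_OUT_LEN];
        BLAKE3_CTX ctx;

        Blake3_Init(&ctx);              /* unkeyed hash, default implementation */
        Blake3_Update(&ctx, "", 0);     /* empty input */
        Blake3_Final(&ctx, out);

        if (memcmp(out, expected, sizeof (expected)) != 0) {
                (void) fprintf(stderr, "blake3 empty-input vector mismatch\n");
                return (1);
        }
        (void) printf("blake3 empty-input vector ok\n");
        return (0);
}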
All supported implementations off the current system get tested against the test vectors of this site: https://github.com/BLAKE3-team/BLAKE3/tree/master/test_vectors Signed-off-by: Tino Reichardt --- tests/runfiles/common.run | 4 +- tests/zfs-tests/cmd/.gitignore | 1 + tests/zfs-tests/cmd/Makefile.am | 6 +- tests/zfs-tests/cmd/checksum/blake3_test.c | 575 ++++++++++++++++++ tests/zfs-tests/include/commands.cfg | 1 + tests/zfs-tests/include/properties.shlib | 2 +- tests/zfs-tests/tests/Makefile.am | 1 + .../tests/functional/checksum/default.cfg | 2 +- .../functional/checksum/run_blake3_test.ksh | 30 + .../cli_root/zfs_set/checksum_001_pos.ksh | 2 +- .../cli_root/zpool_get/zpool_get.cfg | 1 + 11 files changed, 619 insertions(+), 6 deletions(-) create mode 100644 tests/zfs-tests/cmd/checksum/blake3_test.c create mode 100755 tests/zfs-tests/tests/functional/checksum/run_blake3_test.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 4ff46e7af35f..243221598d09 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -113,8 +113,8 @@ tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit', tags = ['functional', 'channel_program', 'synctask_core'] [tests/functional/checksum] -tests = ['run_edonr_test', 'run_sha2_test', 'run_skein_test', 'filetest_001_pos', - 'filetest_002_pos'] +tests = ['run_edonr_test', 'run_sha2_test', 'run_skein_test', 'run_blake3_test', + 'filetest_001_pos', 'filetest_002_pos'] tags = ['functional', 'checksum'] [tests/functional/clean_mirror] diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore index 1830cab76fee..20d1382532bd 100644 --- a/tests/zfs-tests/cmd/.gitignore +++ b/tests/zfs-tests/cmd/.gitignore @@ -42,6 +42,7 @@ /ereports /zfs_diff-socket /dosmode_readonly_write +/blake3_test /edonr_test /skein_test /sha2_test diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index e3c9874dcd54..3c8faf5afbbb 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -98,15 +98,19 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/ereports libzfs.la -scripts_zfs_tests_bin_PROGRAMS += %D%/edonr_test %D%/skein_test %D%/sha2_test +scripts_zfs_tests_bin_PROGRAMS += %D%/edonr_test %D%/skein_test \ + %D%/sha2_test %D%/blake3_test %C%_skein_test_SOURCES = %D%/checksum/skein_test.c %C%_sha2_test_SOURCES = %D%/checksum/sha2_test.c %C%_edonr_test_SOURCES = %D%/checksum/edonr_test.c +%C%_blake3_test_SOURCES = %D%/checksum/blake3_test.c %C%_skein_test_LDADD = \ libicp.la \ + libspl.la \ libspl_assert.la %C%_sha2_test_LDADD = $(%C%_skein_test_LDADD) %C%_edonr_test_LDADD = $(%C%_skein_test_LDADD) +%C%_blake3_test_LDADD = $(%C%_skein_test_LDADD) if BUILD_LINUX diff --git a/tests/zfs-tests/cmd/checksum/blake3_test.c b/tests/zfs-tests/cmd/checksum/blake3_test.c new file mode 100644 index 000000000000..55d268f5f8b7 --- /dev/null +++ b/tests/zfs-tests/cmd/checksum/blake3_test.c @@ -0,0 +1,575 @@ + +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2022 Tino Reichardt + */ + +#include +#include +#include +#include +#include +#include + +/* + * set it to a define for debugging + */ +#undef BLAKE3_DEBUG + +/* + * C version of: + * https://github.com/BLAKE3-team/BLAKE3/tree/master/test_vectors + */ +typedef struct { + /* input length for this entry */ + const int input_len; + + /* hash value */ + const char *hash; + + /* salted hash value */ + const char *shash; +} blake3_test_t; + +/* BLAKE3 is variable here */ +#define TEST_DIGEST_LEN 262 + +/* + * key for the keyed hashing + */ +static const char *salt = "whats the Elvish word for friend"; + +static blake3_test_t TestArray[] = { + { + 0, + "af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262e0" + "0f03e7b69af26b7faaf09fcd333050338ddfe085b8cc869ca98b206c08243a26f5" + "487789e8f660afe6c99ef9e0c52b92e7393024a80459cf91f476f9ffdbda7001c2" + "2e159b402631f277ca96f2defdf1078282314e763699a31c5363165421cce14d", + "92b2b75604ed3c761f9d6f62392c8a9227ad0ea3f09573e783f1498a4ed60d26b1" + "8171a2f22a4b94822c701f107153dba24918c4bae4d2945c20ece13387627d3b73" + "cbf97b797d5e59948c7ef788f54372df45e45e4293c7dc18c1d41144a9758be589" + "60856be1eabbe22c2653190de560ca3b2ac4aa692a9210694254c371e851bc8f", + }, + { + 1, + "2d3adedff11b61f14c886e35afa036736dcd87a74d27b5c1510225d0f592e213c3" + "a6cb8bf623e20cdb535f8d1a5ffb86342d9c0b64aca3bce1d31f60adfa137b358a" + "d4d79f97b47c3d5e79f179df87a3b9776ef8325f8329886ba42f07fb138bb502f4" + "081cbcec3195c5871e6c23e2cc97d3c69a613eba131e5f1351f3f1da786545e5", + "6d7878dfff2f485635d39013278ae14f1454b8c0a3a2d34bc1ab38228a80c95b65" + "68c0490609413006fbd428eb3fd14e7756d90f73a4725fad147f7bf70fd61c4e0c" + "f7074885e92b0e3f125978b4154986d4fb202a3f331a3fb6cf349a3a70e49990f9" + "8fe4289761c8602c4e6ab1138d31d3b62218078b2f3ba9a88e1d08d0dd4cea11", + }, + { + 2, + "7b7015bb92cf0b318037702a6cdd81dee41224f734684c2c122cd6359cb1ee63d8" + "386b22e2ddc05836b7c1bb693d92af006deb5ffbc4c70fb44d0195d0c6f252faac" + "61659ef86523aa16517f87cb5f1340e723756ab65efb2f91964e14391de2a43226" + "3a6faf1d146937b35a33621c12d00be8223a7f1919cec0acd12097ff3ab00ab1", + "5392ddae0e0a69d5f40160462cbd9bd889375082ff224ac9c758802b7a6fd20a9f" + "fbf7efd13e989a6c246f96d3a96b9d279f2c4e63fb0bdff633957acf50ee1a5f65" + "8be144bab0f6f16500dee4aa5967fc2c586d85a04caddec90fffb7633f46a60786" + "024353b9e5cebe277fcd9514217fee2267dcda8f7b31697b7c54fab6a939bf8f", + }, + { + 3, + "e1be4d7a8ab5560aa4199eea339849ba8e293d55ca0a81006726d184519e647f5b" + "49b82f805a538c68915c1ae8035c900fd1d4b13902920fd05e1450822f36de9454" + "b7e9996de4900c8e723512883f93f4345f8a58bfe64ee38d3ad71ab027765d25cd" + "d0e448328a8e7a683b9a6af8b0af94fa09010d9186890b096a08471e4230a134", + "39e67b76b5a007d4921969779fe666da67b5213b096084ab674742f0d5ec62b9b9" + "142d0fab08e1b161efdbb28d18afc64d8f72160c958e53a950cdecf91c1a1bbab1" + "a9c0f01def762a77e2e8545d4dec241e98a89b6db2e9a5b070fc110caae2622690" + "bd7b76c02ab60750a3ea75426a6bb8803c370ffe465f07fb57def95df772c39f", + }, + { + 4, + "f30f5ab28fe047904037f77b6da4fea1e27241c5d132638d8bedce9d40494f328f" + "603ba4564453e06cdcee6cbe728a4519bbe6f0d41e8a14b5b225174a566dbfa61b" + 
"56afb1e452dc08c804f8c3143c9e2cc4a31bb738bf8c1917b55830c6e657972117" + "01dc0b98daa1faeaa6ee9e56ab606ce03a1a881e8f14e87a4acf4646272cfd12", + "7671dde590c95d5ac9616651ff5aa0a27bee5913a348e053b8aa9108917fe07011" + "6c0acff3f0d1fa97ab38d813fd46506089118147d83393019b068a55d646251ecf" + "81105f798d76a10ae413f3d925787d6216a7eb444e510fd56916f1d753a5544ecf" + "0072134a146b2615b42f50c179f56b8fae0788008e3e27c67482349e249cb86a", + }, + { + 5, + "b40b44dfd97e7a84a996a91af8b85188c66c126940ba7aad2e7ae6b385402aa2eb" + "cfdac6c5d32c31209e1f81a454751280db64942ce395104e1e4eaca62607de1c2c" + "a748251754ea5bbe8c20150e7f47efd57012c63b3c6a6632dc1c7cd15f3e1c9999" + "04037d60fac2eb9397f2adbe458d7f264e64f1e73aa927b30988e2aed2f03620", + "73ac69eecf286894d8102018a6fc729f4b1f4247d3703f69bdc6a5fe3e0c84616a" + "b199d1f2f3e53bffb17f0a2209fe8b4f7d4c7bae59c2bc7d01f1ff94c67588cc6b" + "38fa6024886f2c078bfe09b5d9e6584cd6c521c3bb52f4de7687b37117a2dbbec0" + "d59e92fa9a8cc3240d4432f91757aabcae03e87431dac003e7d73574bfdd8218", + }, + { + 6, + "06c4e8ffb6872fad96f9aaca5eee1553eb62aed0ad7198cef42e87f6a616c84461" + "1a30c4e4f37fe2fe23c0883cde5cf7059d88b657c7ed2087e3d210925ede716435" + "d6d5d82597a1e52b9553919e804f5656278bd739880692c94bff2824d8e0b48cac" + "1d24682699e4883389dc4f2faa2eb3b4db6e39debd5061ff3609916f3e07529a", + "82d3199d0013035682cc7f2a399d4c212544376a839aa863a0f4c91220ca7a6dc2" + "ffb3aa05f2631f0fa9ac19b6e97eb7e6669e5ec254799350c8b8d189e880780084" + "2a5383c4d907c932f34490aaf00064de8cdb157357bde37c1504d2960034930887" + "603abc5ccb9f5247f79224baff6120a3c622a46d7b1bcaee02c5025460941256", + }, + { + 7, + "3f8770f387faad08faa9d8414e9f449ac68e6ff0417f673f602a646a891419fe66" + "036ef6e6d1a8f54baa9fed1fc11c77cfb9cff65bae915045027046ebe0c01bf5a9" + "41f3bb0f73791d3fc0b84370f9f30af0cd5b0fc334dd61f70feb60dad785f070fe" + "f1f343ed933b49a5ca0d16a503f599a365a4296739248b28d1a20b0e2cc8975c", + "af0a7ec382aedc0cfd626e49e7628bc7a353a4cb108855541a5651bf64fbb28a7c" + "5035ba0f48a9c73dabb2be0533d02e8fd5d0d5639a18b2803ba6bf527e1d145d5f" + "d6406c437b79bcaad6c7bdf1cf4bd56a893c3eb9510335a7a798548c6753f74617" + "bede88bef924ba4b334f8852476d90b26c5dc4c3668a2519266a562c6c8034a6", + }, + { + 8, + "2351207d04fc16ade43ccab08600939c7c1fa70a5c0aaca76063d04c3228eaeb72" + "5d6d46ceed8f785ab9f2f9b06acfe398c6699c6129da084cb531177445a682894f" + "9685eaf836999221d17c9a64a3a057000524cd2823986db378b074290a1a9b93a2" + "2e135ed2c14c7e20c6d045cd00b903400374126676ea78874d79f2dd7883cf5c", + "be2f5495c61cba1bb348a34948c004045e3bd4dae8f0fe82bf44d0da245a060048" + "eb5e68ce6dea1eb0229e144f578b3aa7e9f4f85febd135df8525e6fe40c6f0340d" + "13dd09b255ccd5112a94238f2be3c0b5b7ecde06580426a93e0708555a265305ab" + "f86d874e34b4995b788e37a823491f25127a502fe0704baa6bfdf04e76c13276", + }, + { + 63, + "e9bc37a594daad83be9470df7f7b3798297c3d834ce80ba85d6e207627b7db7b11" + "97012b1e7d9af4d7cb7bdd1f3bb49a90a9b5dec3ea2bbc6eaebce77f4e470cbf46" + "87093b5352f04e4a4570fba233164e6acc36900e35d185886a827f7ea9bdc1e5c3" + "ce88b095a200e62c10c043b3e9bc6cb9b6ac4dfa51794b02ace9f98779040755", + "bb1eb5d4afa793c1ebdd9fb08def6c36d10096986ae0cfe148cd101170ce37aea0" + "5a63d74a840aecd514f654f080e51ac50fd617d22610d91780fe6b07a26b0847ab" + "b38291058c97474ef6ddd190d30fc318185c09ca1589d2024f0a6f16d45f116783" + "77483fa5c005b2a107cb9943e5da634e7046855eaa888663de55d6471371d55d", + }, + { + 64, + "4eed7141ea4a5cd4b788606bd23f46e212af9cacebacdc7d1f4c6dc7f2511b98fc" + "9cc56cb831ffe33ea8e7e1d1df09b26efd2767670066aa82d023b1dfe8ab1b2b7f" + "bb5b97592d46ffe3e05a6a9b592e2949c74160e4674301bc3f97e04903f8c6cf95" + 
"b863174c33228924cdef7ae47559b10b294acd660666c4538833582b43f82d74", + "ba8ced36f327700d213f120b1a207a3b8c04330528586f414d09f2f7d9ccb7e682" + "44c26010afc3f762615bbac552a1ca909e67c83e2fd5478cf46b9e811efccc93f7" + "7a21b17a152ebaca1695733fdb086e23cd0eb48c41c034d52523fc21236e5d8c92" + "55306e48d52ba40b4dac24256460d56573d1312319afcf3ed39d72d0bfc69acb", + }, + { + 65, + "de1e5fa0be70df6d2be8fffd0e99ceaa8eb6e8c93a63f2d8d1c30ecb6b263dee0e" + "16e0a4749d6811dd1d6d1265c29729b1b75a9ac346cf93f0e1d7296dfcfd4313b3" + "a227faaaaf7757cc95b4e87a49be3b8a270a12020233509b1c3632b3485eef309d" + "0abc4a4a696c9decc6e90454b53b000f456a3f10079072baaf7a981653221f2c", + "c0a4edefa2d2accb9277c371ac12fcdbb52988a86edc54f0716e1591b4326e72d5" + "e795f46a596b02d3d4bfb43abad1e5d19211152722ec1f20fef2cd413e3c22f2fc" + "5da3d73041275be6ede3517b3b9f0fc67ade5956a672b8b75d96cb43294b904149" + "7de92637ed3f2439225e683910cb3ae923374449ca788fb0f9bea92731bc26ad", + }, + { + 127, + "d81293fda863f008c09e92fc382a81f5a0b4a1251cba1634016a0f86a6bd640de3" + "137d477156d1fde56b0cf36f8ef18b44b2d79897bece12227539ac9ae0a5119da4" + "7644d934d26e74dc316145dcb8bb69ac3f2e05c242dd6ee06484fcb0e956dc4435" + "5b452c5e2bbb5e2b66e99f5dd443d0cbcaaafd4beebaed24ae2f8bb672bcef78", + "c64200ae7dfaf35577ac5a9521c47863fb71514a3bcad18819218b818de85818ee" + "7a317aaccc1458f78d6f65f3427ec97d9c0adb0d6dacd4471374b621b7b5f35cd5" + "4663c64dbe0b9e2d95632f84c611313ea5bd90b71ce97b3cf645776f3adc11e27d" + "135cbadb9875c2bf8d3ae6b02f8a0206aba0c35bfe42574011931c9a255ce6dc", + }, + { + 128, + "f17e570564b26578c33bb7f44643f539624b05df1a76c81f30acd548c44b45efa6" + "9faba091427f9c5c4caa873aa07828651f19c55bad85c47d1368b11c6fd99e47ec" + "ba5820a0325984d74fe3e4058494ca12e3f1d3293d0010a9722f7dee64f71246f7" + "5e9361f44cc8e214a100650db1313ff76a9f93ec6e84edb7add1cb4a95019b0c", + "b04fe15577457267ff3b6f3c947d93be581e7e3a4b018679125eaf86f6a628ecd8" + "6bbe0001f10bda47e6077b735016fca8119da11348d93ca302bbd125bde0db2b50" + "edbe728a620bb9d3e6f706286aedea973425c0b9eedf8a38873544cf91badf49ad" + "92a635a93f71ddfcee1eae536c25d1b270956be16588ef1cfef2f1d15f650bd5", + }, + { + 129, + "683aaae9f3c5ba37eaaf072aed0f9e30bac0865137bae68b1fde4ca2aebdcb12f9" + "6ffa7b36dd78ba321be7e842d364a62a42e3746681c8bace18a4a8a79649285c71" + "27bf8febf125be9de39586d251f0d41da20980b70d35e3dac0eee59e468a894fa7" + "e6a07129aaad09855f6ad4801512a116ba2b7841e6cfc99ad77594a8f2d181a7", + "d4a64dae6cdccbac1e5287f54f17c5f985105457c1a2ec1878ebd4b57e20d38f1c" + "9db018541eec241b748f87725665b7b1ace3e0065b29c3bcb232c90e37897fa5aa" + "ee7e1e8a2ecfcd9b51463e42238cfdd7fee1aecb3267fa7f2128079176132a412c" + "d8aaf0791276f6b98ff67359bd8652ef3a203976d5ff1cd41885573487bcd683", + }, + { + 1023, + "10108970eeda3eb932baac1428c7a2163b0e924c9a9e25b35bba72b28f70bd11a1" + "82d27a591b05592b15607500e1e8dd56bc6c7fc063715b7a1d737df5bad3339c56" + "778957d870eb9717b57ea3d9fb68d1b55127bba6a906a4a24bbd5acb2d123a37b2" + "8f9e9a81bbaae360d58f85e5fc9d75f7c370a0cc09b6522d9c8d822f2f28f485", + "c951ecdf03288d0fcc96ee3413563d8a6d3589547f2c2fb36d9786470f1b9d6e89" + "0316d2e6d8b8c25b0a5b2180f94fb1a158ef508c3cde45e2966bd796a696d3e13e" + "fd86259d756387d9becf5c8bf1ce2192b87025152907b6d8cc33d17826d8b7b9bc" + "97e38c3c85108ef09f013e01c229c20a83d9e8efac5b37470da28575fd755a10", + }, + { + 1024, + "42214739f095a406f3fc83deb889744ac00df831c10daa55189b5d121c855af71c" + "f8107265ecdaf8505b95d8fcec83a98a6a96ea5109d2c179c47a387ffbb404756f" + "6eeae7883b446b70ebb144527c2075ab8ab204c0086bb22b7c93d465efc57f8d91" + 
"7f0b385c6df265e77003b85102967486ed57db5c5ca170ba441427ed9afa684e", + "75c46f6f3d9eb4f55ecaaee480db732e6c2105546f1e675003687c31719c7ba4a7" + "8bc838c72852d4f49c864acb7adafe2478e824afe51c8919d06168414c265f298a" + "8094b1ad813a9b8614acabac321f24ce61c5a5346eb519520d38ecc43e89b50002" + "36df0597243e4d2493fd626730e2ba17ac4d8824d09d1a4a8f57b8227778e2de", + }, + { + 1025, + "d00278ae47eb27b34faecf67b4fe263f82d5412916c1ffd97c8cb7fb814b8444f4" + "c4a22b4b399155358a994e52bf255de60035742ec71bd08ac275a1b51cc6bfe332" + "b0ef84b409108cda080e6269ed4b3e2c3f7d722aa4cdc98d16deb554e5627be8f9" + "55c98e1d5f9565a9194cad0c4285f93700062d9595adb992ae68ff12800ab67a", + "357dc55de0c7e382c900fd6e320acc04146be01db6a8ce7210b7189bd664ea6936" + "2396b77fdc0d2634a552970843722066c3c15902ae5097e00ff53f1e116f1cd535" + "2720113a837ab2452cafbde4d54085d9cf5d21ca613071551b25d52e69d6c81123" + "872b6f19cd3bc1333edf0c52b94de23ba772cf82636cff4542540a7738d5b930", + }, + { + 2048, + "e776b6028c7cd22a4d0ba182a8bf62205d2ef576467e838ed6f2529b85fba24a9a" + "60bf80001410ec9eea6698cd537939fad4749edd484cb541aced55cd9bf54764d0" + "63f23f6f1e32e12958ba5cfeb1bf618ad094266d4fc3c968c2088f677454c288c6" + "7ba0dba337b9d91c7e1ba586dc9a5bc2d5e90c14f53a8863ac75655461cea8f9", + "879cf1fa2ea0e79126cb1063617a05b6ad9d0b696d0d757cf053439f60a99dd101" + "73b961cd574288194b23ece278c330fbb8585485e74967f31352a8183aa782b2b2" + "2f26cdcadb61eed1a5bc144b8198fbb0c13abbf8e3192c145d0a5c21633b0ef860" + "54f42809df823389ee40811a5910dcbd1018af31c3b43aa55201ed4edaac74fe", + }, + { + 2049, + "5f4d72f40d7a5f82b15ca2b2e44b1de3c2ef86c426c95c1af0b687952256303096" + "de31d71d74103403822a2e0bc1eb193e7aecc9643a76b7bbc0c9f9c52e8783aae9" + "8764ca468962b5c2ec92f0c74eb5448d519713e09413719431c802f948dd5d9042" + "5a4ecdadece9eb178d80f26efccae630734dff63340285adec2aed3b51073ad3", + "9f29700902f7c86e514ddc4df1e3049f258b2472b6dd5267f61bf13983b78dd5f9" + "a88abfefdfa1e00b418971f2b39c64ca621e8eb37fceac57fd0c8fc8e117d43b81" + "447be22d5d8186f8f5919ba6bcc6846bd7d50726c06d245672c2ad4f61702c6464" + "99ee1173daa061ffe15bf45a631e2946d616a4c345822f1151284712f76b2b0e", + }, + { + 3072, + "b98cb0ff3623be03326b373de6b9095218513e64f1ee2edd2525c7ad1e5cffd29a" + "3f6b0b978d6608335c09dc94ccf682f9951cdfc501bfe47b9c9189a6fc7b404d12" + "0258506341a6d802857322fbd20d3e5dae05b95c88793fa83db1cb08e7d8008d15" + "99b6209d78336e24839724c191b2a52a80448306e0daa84a3fdb566661a37e11", + "044a0e7b172a312dc02a4c9a818c036ffa2776368d7f528268d2e6b5df19177022" + "f302d0529e4174cc507c463671217975e81dab02b8fdeb0d7ccc7568dd22574c78" + "3a76be215441b32e91b9a904be8ea81f7a0afd14bad8ee7c8efc305ace5d3dd61b" + "996febe8da4f56ca0919359a7533216e2999fc87ff7d8f176fbecb3d6f34278b", + }, + { + 3073, + "7124b49501012f81cc7f11ca069ec9226cecb8a2c850cfe644e327d22d3e1cd39a" + "27ae3b79d68d89da9bf25bc27139ae65a324918a5f9b7828181e52cf373c84f35b" + "639b7fccbb985b6f2fa56aea0c18f531203497b8bbd3a07ceb5926f1cab74d14bd" + "66486d9a91eba99059a98bd1cd25876b2af5a76c3e9eed554ed72ea952b603bf", + "68dede9bef00ba89e43f31a6825f4cf433389fedae75c04ee9f0cf16a427c95a96" + "d6da3fe985054d3478865be9a092250839a697bbda74e279e8a9e69f0025e4cfdd" + "d6cfb434b1cd9543aaf97c635d1b451a4386041e4bb100f5e45407cbbc24fa53ea" + "2de3536ccb329e4eb9466ec37093a42cf62b82903c696a93a50b702c80f3c3c5", + }, + { + 4096, + "015094013f57a5277b59d8475c0501042c0b642e531b0a1c8f58d2163229e96902" + "89e9409ddb1b99768eafe1623da896faf7e1114bebeadc1be30829b6f8af707d85" + "c298f4f0ff4d9438aef948335612ae921e76d411c3a9111df62d27eaf871959ae0" + 
"062b5492a0feb98ef3ed4af277f5395172dbe5c311918ea0074ce0036454f620", + "befc660aea2f1718884cd8deb9902811d332f4fc4a38cf7c7300d597a081bfc0bb" + "b64a36edb564e01e4b4aaf3b060092a6b838bea44afebd2deb8298fa562b7b597c" + "757b9df4c911c3ca462e2ac89e9a787357aaf74c3b56d5c07bc93ce899568a3eb1" + "7d9250c20f6c5f6c1e792ec9a2dcb715398d5a6ec6d5c54f586a00403a1af1de", + }, + { + 4097, + "9b4052b38f1c5fc8b1f9ff7ac7b27cd242487b3d890d15c96a1c25b8aa0fb99505" + "f91b0b5600a11251652eacfa9497b31cd3c409ce2e45cfe6c0a016967316c426bd" + "26f619eab5d70af9a418b845c608840390f361630bd497b1ab44019316357c61db" + "e091ce72fc16dc340ac3d6e009e050b3adac4b5b2c92e722cffdc46501531956", + "00df940cd36bb9fa7cbbc3556744e0dbc8191401afe70520ba292ee3ca80abbc60" + "6db4976cfdd266ae0abf667d9481831ff12e0caa268e7d3e57260c0824115a54ce" + "595ccc897786d9dcbf495599cfd90157186a46ec800a6763f1c59e36197e9939e9" + "00809f7077c102f888caaf864b253bc41eea812656d46742e4ea42769f89b83f", + }, + { + 5120, + "9cadc15fed8b5d854562b26a9536d9707cadeda9b143978f319ab34230535833ac" + "c61c8fdc114a2010ce8038c853e121e1544985133fccdd0a2d507e8e615e611e9a" + "0ba4f47915f49e53d721816a9198e8b30f12d20ec3689989175f1bf7a300eee0d9" + "321fad8da232ece6efb8e9fd81b42ad161f6b9550a069e66b11b40487a5f5059", + "2c493e48e9b9bf31e0553a22b23503c0a3388f035cece68eb438d22fa1943e209b" + "4dc9209cd80ce7c1f7c9a744658e7e288465717ae6e56d5463d4f80cdb2ef56495" + "f6a4f5487f69749af0c34c2cdfa857f3056bf8d807336a14d7b89bf62bef2fb54f" + "9af6a546f818dc1e98b9e07f8a5834da50fa28fb5874af91bf06020d1bf0120e", + }, + { + 5121, + "628bd2cb2004694adaab7bbd778a25df25c47b9d4155a55f8fbd79f2fe154cff96" + "adaab0613a6146cdaabe498c3a94e529d3fc1da2bd08edf54ed64d40dcd6777647" + "eac51d8277d70219a9694334a68bc8f0f23e20b0ff70ada6f844542dfa32cd4204" + "ca1846ef76d811cdb296f65e260227f477aa7aa008bac878f72257484f2b6c95", + "6ccf1c34753e7a044db80798ecd0782a8f76f33563accaddbfbb2e0ea4b2d0240d" + "07e63f13667a8d1490e5e04f13eb617aea16a8c8a5aaed1ef6fbde1b0515e3c810" + "50b361af6ead126032998290b563e3caddeaebfab592e155f2e161fb7cba939092" + "133f23f9e65245e58ec23457b78a2e8a125588aad6e07d7f11a85b88d375b72d", + }, + { + 6144, + "3e2e5b74e048f3add6d21faab3f83aa44d3b2278afb83b80b3c35164ebeca2054d" + "742022da6fdda444ebc384b04a54c3ac5839b49da7d39f6d8a9db03deab32aade1" + "56c1c0311e9b3435cde0ddba0dce7b26a376cad121294b689193508dd63151603c" + "6ddb866ad16c2ee41585d1633a2cea093bea714f4c5d6b903522045b20395c83", + "3d6b6d21281d0ade5b2b016ae4034c5dec10ca7e475f90f76eac7138e9bc8f1dc3" + "5754060091dc5caf3efabe0603c60f45e415bb3407db67e6beb3d11cf8e4f79075" + "61f05dace0c15807f4b5f389c841eb114d81a82c02a00b57206b1d11fa6e803486" + "b048a5ce87105a686dee041207e095323dfe172df73deb8c9532066d88f9da7e", + }, + { + 6145, + "f1323a8631446cc50536a9f705ee5cb619424d46887f3c376c695b70e0f0507f18" + "a2cfdd73c6e39dd75ce7c1c6e3ef238fd54465f053b25d21044ccb2093beb01501" + "5532b108313b5829c3621ce324b8e14229091b7c93f32db2e4e63126a377d2a63a" + "3597997d4f1cba59309cb4af240ba70cebff9a23d5e3ff0cdae2cfd54e070022", + "9ac301e9e39e45e3250a7e3b3df701aa0fb6889fbd80eeecf28dbc6300fbc539f3" + "c184ca2f59780e27a576c1d1fb9772e99fd17881d02ac7dfd39675aca918453283" + "ed8c3169085ef4a466b91c1649cc341dfdee60e32231fc34c9c4e0b9a2ba87ca8f" + "372589c744c15fd6f985eec15e98136f25beeb4b13c4e43dc84abcc79cd4646c", + }, + { + 7168, + "61da957ec2499a95d6b8023e2b0e604ec7f6b50e80a9678b89d2628e99ada77a57" + "07c321c83361793b9af62a40f43b523df1c8633cecb4cd14d00bdc79c78fca5165" + "b863893f6d38b02ff7236c5a9a8ad2dba87d24c547cab046c29fc5bc1ed142e1de" + 
"4763613bb162a5a538e6ef05ed05199d751f9eb58d332791b8d73fb74e4fce95", + "b42835e40e9d4a7f42ad8cc04f85a963a76e18198377ed84adddeaecacc6f3fca2" + "f01d5277d69bb681c70fa8d36094f73ec06e452c80d2ff2257ed82e7ba34840098" + "9a65ee8daa7094ae0933e3d2210ac6395c4af24f91c2b590ef87d7788d7066ea3e" + "aebca4c08a4f14b9a27644f99084c3543711b64a070b94f2c9d1d8a90d035d52", + }, + { + 7169, + "a003fc7a51754a9b3c7fae0367ab3d782dccf28855a03d435f8cfe74605e781798" + "a8b20534be1ca9eb2ae2df3fae2ea60e48c6fb0b850b1385b5de0fe460dbe9d9f9" + "b0d8db4435da75c601156df9d047f4ede008732eb17adc05d96180f8a735485228" + "40779e6062d643b79478a6e8dbce68927f36ebf676ffa7d72d5f68f050b119c8", + "ed9b1a922c046fdb3d423ae34e143b05ca1bf28b710432857bf738bcedbfa5113c" + "9e28d72fcbfc020814ce3f5d4fc867f01c8f5b6caf305b3ea8a8ba2da3ab69fabc" + "b438f19ff11f5378ad4484d75c478de425fb8e6ee809b54eec9bdb184315dc8566" + "17c09f5340451bf42fd3270a7b0b6566169f242e533777604c118a6358250f54", + }, + { + 8192, + "aae792484c8efe4f19e2ca7d371d8c467ffb10748d8a5a1ae579948f718a2a635f" + "e51a27db045a567c1ad51be5aa34c01c6651c4d9b5b5ac5d0fd58cf18dd61a4777" + "8566b797a8c67df7b1d60b97b19288d2d877bb2df417ace009dcb0241ca1257d62" + "712b6a4043b4ff33f690d849da91ea3bf711ed583cb7b7a7da2839ba71309bbf", + "dc9637c8845a770b4cbf76b8daec0eebf7dc2eac11498517f08d44c8fc00d58a48" + "34464159dcbc12a0ba0c6d6eb41bac0ed6585cabfe0aca36a375e6c5480c22afdc" + "40785c170f5a6b8a1107dbee282318d00d915ac9ed1143ad40765ec120042ee121" + "cd2baa36250c618adaf9e27260fda2f94dea8fb6f08c04f8f10c78292aa46102", + }, + { + 8193, + "bab6c09cb8ce8cf459261398d2e7aef35700bf488116ceb94a36d0f5f1b7bc3bb2" + "282aa69be089359ea1154b9a9286c4a56af4de975a9aa4a5c497654914d279bea6" + "0bb6d2cf7225a2fa0ff5ef56bbe4b149f3ed15860f78b4e2ad04e158e375c1e0c0" + "b551cd7dfc82f1b155c11b6b3ed51ec9edb30d133653bb5709d1dbd55f4e1ff6", + "954a2a75420c8d6547e3ba5b98d963e6fa6491addc8c023189cc519821b4a1f5f0" + "3228648fd983aef045c2fa8290934b0866b615f585149587dda229903996532883" + "5a2b18f1d63b7e300fc76ff260b571839fe44876a4eae66cbac8c67694411ed7e0" + "9df51068a22c6e67d6d3dd2cca8ff12e3275384006c80f4db68023f24eebba57", + }, + { + 16384, + "f875d6646de28985646f34ee13be9a576fd515f76b5b0a26bb324735041ddde49d" + "764c270176e53e97bdffa58d549073f2c660be0e81293767ed4e4929f9ad34bbb3" + "9a529334c57c4a381ffd2a6d4bfdbf1482651b172aa883cc13408fa67758a3e475" + "03f93f87720a3177325f7823251b85275f64636a8f1d599c2e49722f42e93893", + "9e9fc4eb7cf081ea7c47d1807790ed211bfec56aa25bb7037784c13c4b707b0df9" + "e601b101e4cf63a404dfe50f2e1865bb12edc8fca166579ce0c70dba5a5c0fc960" + "ad6f3772183416a00bd29d4c6e651ea7620bb100c9449858bf14e1ddc9ecd35725" + "581ca5b9160de04060045993d972571c3e8f71e9d0496bfa744656861b169d65", + }, + { + 31744, + "62b6960e1a44bcc1eb1a611a8d6235b6b4b78f32e7abc4fb4c6cdcce94895c4786" + "0cc51f2b0c28a7b77304bd55fe73af663c02d3f52ea053ba43431ca5bab7bfea2f" + "5e9d7121770d88f70ae9649ea713087d1914f7f312147e247f87eb2d4ffef0ac97" + "8bf7b6579d57d533355aa20b8b77b13fd09748728a5cc327a8ec470f4013226f", + "efa53b389ab67c593dba624d898d0f7353ab99e4ac9d42302ee64cbf9939a4193a" + "7258db2d9cd32a7a3ecfce46144114b15c2fcb68a618a976bd74515d47be08b628" + "be420b5e830fade7c080e351a076fbc38641ad80c736c8a18fe3c66ce12f95c61c" + "2462a9770d60d0f77115bbcd3782b593016a4e728d4c06cee4505cb0c08a42ec", + }, + { + 102400, + "bc3e3d41a1146b069abffad3c0d44860cf664390afce4d9661f7902e7943e085e0" + "1c59dab908c04c3342b816941a26d69c2605ebee5ec5291cc55e15b76146e6745f" + "0601156c3596cb75065a9c57f35585a52e1ac70f69131c23d611ce11ee4ab1ec2c" + 
"009012d236648e77be9295dd0426f29b764d65de58eb7d01dd42248204f45f8e", + "1c35d1a5811083fd7119f5d5d1ba027b4d01c0c6c49fb6ff2cf75393ea5db4a7f9" + "dbdd3e1d81dcbca3ba241bb18760f207710b751846faaeb9dff8262710999a59b2" + "aa1aca298a032d94eacfadf1aa192418eb54808db23b56e34213266aa08499a16b" + "354f018fc4967d05f8b9d2ad87a7278337be9693fc638a3bfdbe314574ee6fc4", + }, + { + 0, 0, 0 + } +}; + +#ifdef BLAKE3_DEBUG +#define dprintf printf +#else +#define dprintf(...) +#endif + +static char fmt_tohex(char c); +static size_t fmt_hexdump(char *dest, const char *src, size_t len); + +static char fmt_tohex(char c) { + return ((char)(c >= 10 ? c-10+'a' : c+'0')); +} + +static size_t fmt_hexdump(char *dest, const char *src, size_t len) { + register const unsigned char *s = (const unsigned char *) src; + size_t written = 0, i; + + if (!dest) + return ((len > ((size_t)-1)/2) ? (size_t)-1 : len*2); + for (i = 0; i < len; ++i) { + dest[written] = fmt_tohex(s[i]>>4); + dest[written+1] = fmt_tohex(s[i]&15); + written += 2; + } + + return (written); +} + +int +main(int argc, char *argv[]) +{ + boolean_t failed = B_FALSE; + uint8_t buffer[102400]; + uint64_t cpu_mhz = 0; + int id, i, j; + + if (argc == 2) + cpu_mhz = atoi(argv[1]); + + /* fill test message */ + for (i = 0, j = 0; i < sizeof (buffer); i++, j++) { + if (j == 251) + j = 0; + buffer[i] = (uint8_t)j; + } + + (void) printf("Running algorithm correctness tests:\n"); + for (id = 0; id < blake3_get_impl_count(); id++) { + blake3_set_impl_id(id); + const char *name = blake3_get_impl_name(); + dprintf("Result for BLAKE3-%s:\n", name); + for (i = 0; TestArray[i].hash; i++) { + blake3_test_t *cur = &TestArray[i]; + + BLAKE3_CTX ctx; + uint8_t digest[TEST_DIGEST_LEN]; + char result[TEST_DIGEST_LEN]; + + /* default hashing */ + Blake3_Init(&ctx); + Blake3_Update(&ctx, buffer, cur->input_len); + Blake3_FinalSeek(&ctx, 0, digest, TEST_DIGEST_LEN); + fmt_hexdump(result, (char *)digest, 131); + if (memcmp(result, cur->hash, 131) != 0) + failed = B_TRUE; + + dprintf("HASH-res: %s\n", result); + dprintf("HASH-ref: %s\n", cur->hash); + + /* salted hashing */ + Blake3_InitKeyed(&ctx, (const uint8_t *)salt); + Blake3_Update(&ctx, buffer, cur->input_len); + Blake3_FinalSeek(&ctx, 0, digest, TEST_DIGEST_LEN); + fmt_hexdump(result, (char *)digest, 131); + if (memcmp(result, cur->shash, 131) != 0) + failed = B_TRUE; + + dprintf("SHASH-res: %s\n", result); + dprintf("SHASH-ref: %s\n", cur->shash); + + printf("BLAKE3-%s Message (inlen=%d)\tResult: %s\n", + name, cur->input_len, failed?"FAILED!":"OK"); + } + } + + if (failed) + return (1); + +#define BLAKE3_PERF_TEST(impl, diglen) \ + do { \ + BLAKE3_CTX ctx; \ + uint8_t digest[diglen / 8]; \ + uint8_t block[131072]; \ + uint64_t delta; \ + double cpb = 0; \ + int i; \ + struct timeval start, end; \ + memset(block, 0, sizeof (block)); \ + (void) gettimeofday(&start, NULL); \ + Blake3_Init(&ctx); \ + for (i = 0; i < 8192; i++) \ + Blake3_Update(&ctx, block, sizeof (block)); \ + Blake3_Final(&ctx, digest); \ + (void) gettimeofday(&end, NULL); \ + delta = (end.tv_sec * 1000000llu + end.tv_usec) - \ + (start.tv_sec * 1000000llu + start.tv_usec); \ + if (cpu_mhz != 0) { \ + cpb = (cpu_mhz * 1e6 * ((double)delta / \ + 1000000)) / (8192 * 128 * 1024); \ + } \ + (void) printf("BLAKE3-%s %llu us (%.02f CPB)\n", impl, \ + (u_longlong_t)delta, cpb); \ + } while (0) + + printf("Running performance tests (hashing 1024 MiB of data):\n"); + for (id = 0; id < blake3_get_impl_count(); id++) { + blake3_set_impl_id(id); + const char *name = 
blake3_get_impl_name(); + BLAKE3_PERF_TEST(name, 256); + } + + return (0); +} diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 9dc2b4d0e08b..99430bc10324 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -212,6 +212,7 @@ export ZFSTEST_FILES='badsend zed_fd_spill-zedlet suid_write_to_file cp_files + blake3_test edonr_test skein_test sha2_test diff --git a/tests/zfs-tests/include/properties.shlib b/tests/zfs-tests/include/properties.shlib index ba82f96202b2..14b3f4415b7d 100644 --- a/tests/zfs-tests/include/properties.shlib +++ b/tests/zfs-tests/include/properties.shlib @@ -17,7 +17,7 @@ typeset -a compress_prop_vals=('off' 'lzjb' 'lz4' 'gzip' 'zle' 'zstd') typeset -a checksum_prop_vals=('on' 'off' 'fletcher2' 'fletcher4' 'sha256' - 'noparity' 'sha512' 'skein') + 'noparity' 'sha512' 'skein' 'blake3') if ! is_freebsd; then checksum_prop_vals+=('edonr') fi diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index a91a24d16680..ffc087351e38 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -545,6 +545,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/checksum/cleanup.ksh \ functional/checksum/filetest_001_pos.ksh \ functional/checksum/filetest_002_pos.ksh \ + functional/checksum/run_blake3_test.ksh \ functional/checksum/run_edonr_test.ksh \ functional/checksum/run_sha2_test.ksh \ functional/checksum/run_skein_test.ksh \ diff --git a/tests/zfs-tests/tests/functional/checksum/default.cfg b/tests/zfs-tests/tests/functional/checksum/default.cfg index afb956093d8a..a7e143e75ea3 100644 --- a/tests/zfs-tests/tests/functional/checksum/default.cfg +++ b/tests/zfs-tests/tests/functional/checksum/default.cfg @@ -30,4 +30,4 @@ . $STF_SUITE/include/libtest.shlib -set -A CHECKSUM_TYPES "fletcher2" "fletcher4" "sha256" "sha512" "skein" "edonr" +set -A CHECKSUM_TYPES "fletcher2" "fletcher4" "blake3" "sha256" "sha512" "skein" "edonr" diff --git a/tests/zfs-tests/tests/functional/checksum/run_blake3_test.ksh b/tests/zfs-tests/tests/functional/checksum/run_blake3_test.ksh new file mode 100755 index 000000000000..cf1ca70328e1 --- /dev/null +++ b/tests/zfs-tests/tests/functional/checksum/run_blake3_test.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015, 2016 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# Description: +# Run the tests for the BLAKE3 hash algorithm. +# + +log_assert "Run the tests for the BLAKE3 hash algorithm." + +freq=$(get_cpu_freq) +log_must blake3_test $freq + +log_pass "BLAKE3 tests passed." 
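
To make the output of the performance loop above easier to interpret: it hashes 8192 blocks of 128 KiB (1 GiB in total), and when run_blake3_test.ksh passes the CPU frequency in MHz as the first argument, BLAKE3_PERF_TEST converts the elapsed wall-clock time into a cycles-per-byte figure. The standalone sketch below repeats that arithmetic with hypothetical inputs; the 3000 MHz clock and the 250000 us runtime are assumptions made purely for illustration, not measurements from the patch.

/*
 * Illustrative sketch only, not part of the patch: replays the
 * cycles-per-byte arithmetic of BLAKE3_PERF_TEST with made-up inputs.
 */
#include <stdio.h>

int
main(void)
{
	double cpu_mhz = 3000.0;	/* assumed clock, as passed via argv[1] */
	double delta = 250000.0;	/* assumed elapsed time in microseconds */
	double bytes = 8192.0 * 128.0 * 1024.0;	/* 8192 updates of 128 KiB */

	/* cycles spent = MHz * 1e6 * seconds; divide by bytes hashed */
	double cpb = (cpu_mhz * 1e6 * (delta / 1000000)) / bytes;

	(void) printf("%.02f cycles per byte\n", cpb);
	return (0);
}

With these assumed numbers the sketch prints roughly 0.70 cycles per byte; the real figure depends on the selected BLAKE3 implementation and the machine.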
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh index 27003b21b556..cab7c185e16a 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh @@ -46,7 +46,7 @@ verify_runnable "both" set -A dataset "$TESTPOOL" "$TESTPOOL/$TESTFS" "$TESTPOOL/$TESTVOL" -set -A values "on" "off" "fletcher2" "fletcher4" "sha256" "sha512" "skein" "edonr" "noparity" +set -A values "on" "off" "fletcher2" "fletcher4" "sha256" "sha512" "skein" "edonr" "blake3" "noparity" log_assert "Setting a valid checksum on a file system, volume," \ "it should be successful." diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index 4ea5725e040e..7849ed22634e 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -99,5 +99,6 @@ if is_linux || is_freebsd; then "feature@zstd_compress" "feature@zilsaxattr" "feature@head_errlog" + "feature@blake3" ) fi From b0737cf3095515dec8518c0499c2c235c596aaa6 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Fri, 20 May 2022 19:13:12 +0200 Subject: [PATCH 09/10] Modify the ABI to match the new properties Change libzfs.abi a bit to match the new structure properties. It also adds me to the AUTHORS file. Signed-off-by: Tino Reichardt --- AUTHORS | 1 + lib/libzfs/libzfs.abi | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/AUTHORS b/AUTHORS index aab8bf29c99f..86083ba87715 100644 --- a/AUTHORS +++ b/AUTHORS @@ -285,6 +285,7 @@ CONTRIBUTORS: Tim Connors Tim Crawford Tim Haley + Tino Reichardt Tobin Harding Tom Caputi Tom Matthews diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 8a71da95148e..9f9a2f9071d9 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -583,7 +583,7 @@ - + @@ -4770,8 +4770,8 @@ - - + + @@ -4812,7 +4812,8 @@ - + + From 3b6ae5085ab518f486be0192957a8aadead9bfcd Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Tue, 17 May 2022 17:04:20 +0200 Subject: [PATCH 10/10] Drop #ifdef #undef #endif _KERNEL for checksum/*_test.c Dropping the defines does not cause any issues and isn't needed anymore. Signed-off-by: Tino Reichardt --- tests/zfs-tests/cmd/checksum/edonr_test.c | 3 --- tests/zfs-tests/cmd/checksum/sha2_test.c | 3 --- tests/zfs-tests/cmd/checksum/skein_test.c | 3 --- 3 files changed, 9 deletions(-) diff --git a/tests/zfs-tests/cmd/checksum/edonr_test.c b/tests/zfs-tests/cmd/checksum/edonr_test.c index c6365a4147e6..3a0a48533c53 100644 --- a/tests/zfs-tests/cmd/checksum/edonr_test.c +++ b/tests/zfs-tests/cmd/checksum/edonr_test.c @@ -28,9 +28,6 @@ * gettimeofday due to -D_KERNEL (we can do this since we're actually * running in userspace, but we need -D_KERNEL for the remaining Edon-R code). */ -#ifdef _KERNEL -#undef _KERNEL -#endif #include #include diff --git a/tests/zfs-tests/cmd/checksum/sha2_test.c b/tests/zfs-tests/cmd/checksum/sha2_test.c index dc4173e1059a..bb355311091e 100644 --- a/tests/zfs-tests/cmd/checksum/sha2_test.c +++ b/tests/zfs-tests/cmd/checksum/sha2_test.c @@ -28,9 +28,6 @@ * gettimeofday due to -D_KERNEL (we can do this since we're actually * running in userspace, but we need -D_KERNEL for the remaining SHA2 code). 
*/ -#ifdef _KERNEL -#undef _KERNEL -#endif #include #include diff --git a/tests/zfs-tests/cmd/checksum/skein_test.c b/tests/zfs-tests/cmd/checksum/skein_test.c index 99b47b4532fc..13611c860c42 100644 --- a/tests/zfs-tests/cmd/checksum/skein_test.c +++ b/tests/zfs-tests/cmd/checksum/skein_test.c @@ -28,9 +28,6 @@ * gettimeofday due to -D_KERNEL (we can do this since we're actually * running in userspace, but we need -D_KERNEL for the remaining Skein code). */ -#ifdef _KERNEL -#undef _KERNEL -#endif #include #include
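
For context on the hunks above (an illustrative sketch, not code from the patch): the checksum test programs are built with -D_KERNEL so the in-tree hash implementations compile in their kernel configuration, yet they run as ordinary userspace binaries and call gettimeofday(). The guard being deleted temporarily undefined _KERNEL to keep the userspace prototypes visible; since that is no longer necessary, the patch drops it. A minimal sketch of the removed pattern:

/*
 * Illustrative sketch only, not code from the patch: shows the guard
 * pattern that patch 10/10 removes. The checksum tests are compiled
 * with -D_KERNEL for the in-tree hash code, but run in userspace and
 * call gettimeofday(); the guard used to undefine _KERNEL first.
 */
#ifdef _KERNEL
#undef _KERNEL		/* the lines dropped by this patch */
#endif

#include <sys/time.h>
#include <stdio.h>

int
main(void)
{
	struct timeval tv;

	(void) gettimeofday(&tv, NULL);
	(void) printf("seconds since epoch: %lld\n", (long long)tv.tv_sec);
	return (0);
}

In this self-contained sketch the guard is inert either way, which mirrors the patch's observation that dropping it "does not cause any issues"; whether the same holds for the full test sources depends on the headers they pull in.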