From 47079e5d3cd04a861c3d6a6985c6fc9985f90b7f Mon Sep 17 00:00:00 2001 From: ipl_ci Date: Fri, 19 May 2023 02:15:49 -0700 Subject: [PATCH] WW20'23 source code update --- BUILD.md | 21 +- CHANGELOG.md | 3 + CONST_TIME_EXECUTION_TESTING.md | 4 +- DEPRECATION_NOTES.md | 11 +- OVERVIEW.md | 25 +- include/ippcpdefs.h | 10 +- include/ippversion.h | 4 +- sources/ippcp/crypto_mb/Readme.md | 13 +- .../ippcp/crypto_mb/include/crypto_mb/sm4.h | 8 + .../crypto_mb/include/crypto_mb/version.h | 4 +- .../crypto_mb/include/internal/sm4/sm4_mb.h | 140 ++++- .../src/cmake/dll_export/crypto_mb.defs | 4 + .../dll_export/crypto_mb.linux.lib-export | 4 + .../dll_export/crypto_mb.macosx.lib-export | 4 + .../ippcp/crypto_mb/src/sm4/sm4_ecb_mb16.c | 8 +- .../ippcp/crypto_mb/src/sm4/sm4_setkey_mb16.c | 37 ++ .../crypto_mb/src/sm4/sm4_xts_dec_mb16.c | 59 ++ .../crypto_mb/src/sm4/sm4_xts_enc_mb16.c | 59 ++ .../ippcp/crypto_mb/src/sm4/sm4_xts_mb16.c | 529 ++++++++++++++++++ .../requirements.txt | 2 + 20 files changed, 906 insertions(+), 43 deletions(-) create mode 100644 sources/ippcp/crypto_mb/src/sm4/sm4_xts_dec_mb16.c create mode 100644 sources/ippcp/crypto_mb/src/sm4/sm4_xts_enc_mb16.c create mode 100644 sources/ippcp/crypto_mb/src/sm4/sm4_xts_mb16.c create mode 100644 tools/ipp_custom_library_tool_python/requirements.txt diff --git a/BUILD.md b/BUILD.md index 7177cc40..f6db9f7f 100644 --- a/BUILD.md +++ b/BUILD.md @@ -29,7 +29,7 @@ ### Linux* OS - [Common tools](#common-tools) -- Intel® C++ Compiler Classic 2021.3 for Linux\* OS +- Intel® C++ Compiler Classic 2021.9 for Linux\* OS - GCC 8.3 - GCC 9.1 - GCC 10.1 @@ -39,7 +39,7 @@ - GNU binutils 2.32 ### Windows* OS - [Common tools](#common-tools) -- Intel® C++ Compiler Classic 2021.3 for Windows\* OS +- Intel® C++ Compiler Classic 2021.9 for Windows\* OS - Microsoft Visual C++ Compiler\* version 19.16 provided by Microsoft Visual Studio\* 2017 version 15.9 > **NOTE:** Support for this compiler version will be removed from Intel IPP Cryptography starting 2021.4 release. If you use it for building Intel IPP Cryptography library, please plan on migrating to a newer supported version of Microsoft Visual C++ Compiler\*. - Microsoft Visual C++ Compiler\* version 19.24 provided by Microsoft Visual Studio\* 2019 version 16.4 @@ -47,12 +47,12 @@ > **NOTE:** [CMake\*](https://cmake.org/download) 3.21 or higher is required to build using Microsoft Visual Studio\* 2022. ### macOS* - [Common tools](#common-tools) -- Intel® C++ Compiler Classic 2021.3 for macOS\* +- Intel® C++ Compiler Classic 2021.9 for macOS\* ## Building Intel IPP Cryptography on Linux\* OS The software was validated on: -- Red Hat\* Enterprise Linux\* 7 +- Red Hat\* Enterprise Linux\* 8 To build the Intel IPP Cryptography library on Linux\* OS, complete the following steps: 1. Clone the source code from GitHub\* as follows: @@ -102,7 +102,7 @@ To build the Intel IPP Cryptography library on Linux\* OS, complete the followin The software was validated on: -- Windows Server\* 2016 +- Windows Server\* 2019 To build the Intel IPP Cryptography library on Windows* OS, complete the following steps: @@ -214,7 +214,16 @@ To build the Intel IPP Cryptography library on macOS*, complete the following st `-DPLATFORM_LIST="m7;s8;p8;g9;h9"` - Example for Linux\* OS and the Intel® 64 architecture: - `-DPLATFORM_LIST="w7;n8;y8;e9;l9;n0;k0"` + `-DPLATFORM_LIST="w7;n8;y8;e9;l9;k0"` +- `-DIPPCP_CUSTOM_BUILD=""` - optional, works only if `-DMERGED_BLD:BOOL=off` is set, i.e. only for 1CPU libraries. 
Enables the CPU feature dispatching mask at compile-time based on the provided list. + + - Currently supported by the library custom features dispatching: + 1. Intel® Advanced Encryption Standard New Instructions (Intel® AES-NI) code-path enabling: `IPPCP_AES_ON;IPPCP_CLMUL_ON` + 2. Intel® Advanced Vector Extensions 512 (Intel(R) AVX-512) and vector extensions of Intel(R) AES New Instructions (Intel(R) AES-NI) code-path enabling: `IPPCP_VAES_ON;IPPCP_VCLMUL_ON` + - Example: + `-DPLATFORM_LIST="IPPCP_AES_ON;IPPCP_CLMUL_ON"` - this combination enables Intel® AES-NI in all 1CPU libraries, which contains this code path. + - Example of using a combination of CPU features: + `-DPLATFORM_LIST="IPPCP_AES_ON;IPPCP_CLMUL_ON;IPPCP_VAES_ON;IPPCP_VCLMUL_ON"` - in this combination the highest available feature in each 1CPU library will be enabled (e.g. for `"y8"` it’s Intel® AES-NI and for `"k1"` - Intel AVX-512 VAES) ### Windows\* OS diff --git a/CHANGELOG.md b/CHANGELOG.md index 291c0dd3..93d62b7c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ This is a list of notable changes to Intel(R) IPP Cryptography, in reverse chronological order. +## Intel(R) IPP Cryptography 2021.8 +- Crypto Multi-buffer library was extended with XTS mode of SM4 algorithm. + ## Intel(R) IPP Cryptography 2021.7.1 - Added re-initialization API for AES-GCM context - ippsAES_GCMReinit. The use-case of this function is very specific, please, refer to the documentation for more details. diff --git a/CONST_TIME_EXECUTION_TESTING.md b/CONST_TIME_EXECUTION_TESTING.md index 87e4b181..61868362 100644 --- a/CONST_TIME_EXECUTION_TESTING.md +++ b/CONST_TIME_EXECUTION_TESTING.md @@ -7,14 +7,14 @@ ## General information
- Testing is conducted under Linux for 64-bit Intel® IPP Cryptography built with the following compilers: - Intel® C++ Compiler 19.1 - - Intel® C++ Compiler Classic 2021.3 + - Intel® C++ Compiler Classic 2021.9 - GCC 8.3 - GCC 9.1 - GCC 10.1 - GCC 11.1 - Clang 9.0 - Clang 12.0 -- Tested platforms: w7, n8, y8, e9, l9, n0, k0 (see the supported platforms list [here](./OVERVIEW.md#target-optimization-codes-in-function-names)). +- Tested platforms: w7, n8, y8, e9, l9, k0 (see the supported platforms list [here](./OVERVIEW.md#target-optimization-codes-in-function-names)). - Testing scope described below is guaranteed to pass for **`release`** branches. This is not guaranteed for the **`develop`** branch ([branches description](./OVERVIEW.md#branches-description)) - Information about Pin-Based Constant Execution Checker can be found [here](https://github.com/intel/pin_based_cec) diff --git a/DEPRECATION_NOTES.md b/DEPRECATION_NOTES.md index 6773e7b1..3ef6475c 100644 --- a/DEPRECATION_NOTES.md +++ b/DEPRECATION_NOTES.md @@ -14,7 +14,7 @@ The deprecated API means it is obsolete and will be removed in one of future Int | ippsSHA1Init
ippsSHA224Init
ippsSHA256Init
ippsSHA384Init
ippsSHA512Init
ippsSM3Init
ippsMD5Init | ippsHashInit_rmf \* | | ippsSHA1Duplicate
ippsSHA224Duplicate
ippsSHA256Duplicate
ippsSHA384Duplicate
ippsSHA512Duplicate
ippsSM3Duplicate
ippsMD5Duplicate | ippsHashDuplicate_rmf | | ippsSHA1Pack, ippsSHA1Unpack
ippsSHA224Pack, ippsSHA224Unpack
ippsSHA256Pack, ippsSHA256Unpack
ippsSHA384Pack, ippsSHA384Unpack
ippsSHA512Pack, ippsSHA512Unpack
ippsSM3Pack, ippsSM3Unpack
ippsMD5Pack, ippsMD5Unpack | ippsHashPack_rmf,
ippsHashUnpack_rmf | -| ippsSHA1Update, ippsSHA1GetTag, ippsSHA1Final
ippsSHA224Update, ippsSHA224GetTag, ippsSHA224Final
ippsSHA256Update, ippsSHA256GetTag, ippsSHA256Final
ippsSHA384Update, ippsSHA384GetTag, ippsSHA384Final
ippsSHA512Update, ippsSHA512GetTag, ippsSHA512Final
ippsSM3Update, ippsSM3GetTag, ippsSM3Final
ippsMD5Update, ippsMD5GetTag, ippsSMD5Final | ippsHashUpdate_rmf,
ippsHashGetTag_rmf,
ippsHashFinal_rmf | +| ippsSHA1Update, ippsSHA1GetTag, ippsSHA1Final
ippsSHA224Update, ippsSHA224GetTag, ippsSHA224Final
ippsSHA256Update, ippsSHA256GetTag, ippsSHA256Final
ippsSHA384Update, ippsSHA384GetTag, ippsSHA384Final
ippsSHA512Update, ippsSHA512GetTag, ippsSHA512Final
ippsSM3Update, ippsSM3GetTag, ippsSM3Final
ippsMD5Update, ippsMD5GetTag, ippsMD5Final | ippsHashUpdate_rmf,
ippsHashGetTag_rmf,
ippsHashFinal_rmf | | ippsSHA1MessageDigest
ippsSHA224MessageDigest
ippsSHA256MessageDigest
ippsSHA384MessageDigest
ippsSHA512MessageDigest
ippsSM3MessageDigest
ippsMD5MessageDigest | ippsHashMessage_rmf \* | | ippsHashGetSize | ippsHashGetSize_rmf | | ippsHashInit \*\* | ippsHashInit_rmf \* | @@ -24,7 +24,7 @@ The deprecated API means it is obsolete and will be removed in one of future Int | ippsHashMessage \*\* | ippsHashMessage_rmf \* | >\* To choose hash algorithm, specify [IppsHashMethod parameter](#ippshashalgid-to-ippshashmethod-parameter-map) ->\*\* IppsHashAlgId parameter used in 'ippsHMAC_Init' and in ippsHMAC_Message for choosing hash algorithm is deprecated (see Recommended replacement column for alternative in [IppsHashAlgId to IppsHashMethod parameter map](#ippshashalgid-to-ippshashmethod-parameter-map) +>\*\* IppsHashAlgId parameter used in ippsHMAC_Init and in ippsHMAC_Message for choosing hash algorithm is deprecated (see Recommended replacement column for alternative in [IppsHashAlgId to IppsHashMethod Parameter Map](#ippshashalgid-to-ippshashmethod-parameter-map) ### Keyed HMAC Functionality @@ -37,7 +37,7 @@ The deprecated API means it is obsolete and will be removed in one of future Int | ippsHMAC_Message \*\* | ippsHMAC_Message_rmf \* | >\* To choose hash algorithm, specify [IppsHashMethod parameter](#ippshashalgid-to-ippshashmethod-parameter-map) ->\*\* IppsHashAlgId parameter used in 'ippsHMAC_Init' and in ippsHMAC_Message for choosing hash algorithm is deprecated (see Recommended replacement column for alternative in [IppsHashAlgId to IppsHashMethod parameter map](#ippshashalgid-to-ippshashmethod-parameter-map) +>\*\* IppsHashAlgId parameter used in 'ippsHMAC_Init' and in ippsHMAC_Message for choosing hash algorithm is deprecated (see Recommended replacement column for alternative in [IppsHashAlgId to IppsHashMethod Parameter Map](#ippshashalgid-to-ippshashmethod-parameter-map) ### MGF Functionality @@ -62,7 +62,6 @@ The deprecated API means it is obsolete and will be removed in one of future Int | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | ippsECCPGetSize
ippsECCPGetSizeStd128r1
ippsECCPGetSizeStd128r2
ippsECCPGetSizeStd192r1
ippsECCPGetSizeStd224r1
ippsECCPGetSizeStd256r1
ippsECCPGetSizeStd384r1
ippsECCPGetSizeStd521r1
ippsECCPGetSizeStdSM2 | ippsGFpECGetSize | | ippsECCPInit
ippsECCPInitStd128r1
ippsECCPInitStd128r2
ippsECCPInitStd192r1
ippsECCPInitStd224r1
ippsECCPInitStd256r1
ippsECCPInitStd384r1
ippsECCPInitStd521r1
ippsECCPInitStdSM2 | ippsGFpECInitStd \*
* the ippsGFpECInitStd function provides both initialization
and setup of a standard EC parameter set |
-| ippsECCPGetSize
ippsECCPGetSizeStd128r1
ippsECCPGetSizeStd128r2
ippsECCPGetSizeStd192r1
ippsECCPGetSizeStd224r1
ippsECCPGetSizeStd256r1
ippsECCPGetSizeStd384r1
ippsECCPGetSizeStd521r1
ippsECCPGetSizeStdSM2 | ippsGFpECGetSize | | ippsECCPSet | ippsGFpECSet | | ippsECCPSetStd | ippsGFpECInitStd \*
* the ippsGFpECInitStd function provides both initialization
and setup of a standard EC parameter set |
| ippsECCPSetStd128r1
ippsECCPSetStd128r2
ippsECCPSetStd192r1
ippsECCPSetStd224r1
ippsECCPSetStd256r1
ippsECCPSetStd384r1
ippsECCPSetStd521r1
ippsECCPSetStdSM2 | ippsGFpECInitStd128r1
ippsGFpECInitStd128r2
ippsGFpECInitStd192r1
ippsGFpECInitStd224r1
ippsGFpECInitStd256r1
ippsGFpECInitStd384r1
ippsGFpECInitStd521r1
ippsGFpECInitStdSM2 | @@ -72,9 +71,9 @@ The deprecated API means it is obsolete and will be removed in one of future Int | ippsECCPSharedSecretDH
ippsECCPSharedSecretDHC | ippsGFpECSharedSecretDH
ippsGFpECSharedSecretDHC | | ippsECCPSignDSA
ippsECCPVerifyDSA
ippsECCPSignNR
ippsECCPVerifyNR
ippsECCPSignSM2
ippsECCPVerifySM2 | ippsGFpECSignDSA
ippsGFpECVerifyDSA
ippsGFpECSignNR
ippsGFpECVerifyNR
ippsGFpECSignSM2
ippsGFpECVerifySM2 | -### IppsHashAlgId to IppsHashMethod parameter map +### IppsHashAlgId to IppsHashMethod Parameter Map -| Algorithm | IppsHashAlgId (deprecated) | IppsHashMethod(recommended) | Note | +| Algorithm | IppsHashAlgId (deprecated) | IppsHashMethod (recommended) | Notes | | :--------: | :------------------------: | :---------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------: | | SHA1 | ippsHashAlg_SHA1 | ippsHashMethod_SHA1
ippsHashMethod_SHA1_NI
ippsHashMethod_SHA1_TT | Intel® Secure Hash Algorithm - New Instructions (Intel® SHA-NI) not supported
Only Intel SHA-NI supported
Automatic switch on Intel SHA-NI, if possible (tick-tock) | | SHA224 | ippsHashAlg_SHA224 | ippsHashMethod_SHA224
ippsHashMethod_SHA224_NI
ippsHashMethod_SHA224_TT | Intel SHA-NI not supported
Only Intel SHA-NI supported
Automatic switch on Intel SHA-NI, if possible supported | diff --git a/OVERVIEW.md b/OVERVIEW.md index 8268b7ab..c7254c31 100644 --- a/OVERVIEW.md +++ b/OVERVIEW.md @@ -77,16 +77,17 @@ By default, the dispatcher chooses the most appropriate optimization for the cur #### Target Optimization Codes in Function Names -| IA-32 Intel® architecture | Intel® 64 architecture | Meaning | -| ------------------------- | ---------------------- | ------------------------------------------------------------------------------------ | -| px | mx | Generic code without hardware specific optimizations suitable for any CPU | -| w7 | - | Optimized for processors with Intel® Streaming SIMD Extensions 2 (Intel® SSE2) | -| - | m7 | Optimized for processors with Intel® SSE3 | -| s8 | n8 | Optimized for processors with Supplemental Streaming SIMD Extensions 3 (SSSE3) | -| p8 | y8 | Optimized for processors with Intel® SSE4.2 | -| g9 | e9 | Optimized for processors with Intel® Advanced Vector Extensions (Intel® AVX) | -| h9 | l9 | Optimized for processors with Intel® Advanced Vector Extensions 2 (Intel® AVX2) | -| - | k0 | Optimized for processors with Intel® Advanced Vector Extensions 512 (Intel® AVX-512) | +| IA-32 Intel® architecture | Intel® 64 architecture | Meaning | +| ------------------------- | ---------------------- | ------------------------------------------------------------------------------------------------------------------ | +| px | mx | Generic code without hardware specific optimizations suitable for any CPU | +| w7 | - | Optimized for processors with Intel® Streaming SIMD Extensions 2 (Intel® SSE2) | +| - | m7 | Optimized for processors with Intel® SSE3 | +| s8 | n8 | Optimized for processors with Supplemental Streaming SIMD Extensions 3 (SSSE3) | +| p8 | y8 | Optimized for processors with Intel® SSE4.2 | +| g9 | e9 | Optimized for processors with Intel® Advanced Vector Extensions (Intel® AVX) | +| h9 | l9 | Optimized for processors with Intel® Advanced Vector Extensions 2 (Intel® AVX2) | +| - | k0 | Optimized for processors with Intel® Advanced Vector Extensions 512 (Intel® AVX-512) (formerly codenamed SkyLake) | +| - | k1 | Optimized for processors with Intel® Advanced Vector Extensions 512 (Intel® AVX-512) (formerly codenamed IceLake) | ### CPU Feature Dispatching @@ -104,6 +105,8 @@ List of CPU feature subsets that the library has special optimizations for: - Intel AVX-512 IFMA - Intel AVX-512 GFNI + > **NOTE:** For some features there is also an opportunity to force their dispatching inside the 1CPU libraries manually during the compile time. For more information please, refer to [common for all operating systems CMake build options](./BUILD.md/#common-for-all-operating-systems). + ### How to Avoid Dispatcher in All CPUs Static Library To leave only specific ISA when linking with an [All CPUs Static Library](#all-cpus-library) and drop dispatcher, please refer to [this section](#choosing-specific-isa-from-the-all-cpus-static-library). @@ -202,4 +205,4 @@ To build your own dynamic library containing only the functionality that is nece The tool is located in the `tools/ipp_custom_library_tool_python` directory. -Please refer to the [tool documentation](https://software.intel.com/en-us/ipp-dev-guide-building-a-custom-dll-with-custom-library-tool) for more information. +Please refer to the [tool documentation](https://www.intel.com/content/www/us/en/docs/ipp/developer-guide-oneapi/current/ipp-custom-library-tool.html) for more information. 
diff --git a/include/ippcpdefs.h b/include/ippcpdefs.h index 6282f685..7d004758 100644 --- a/include/ippcpdefs.h +++ b/include/ippcpdefs.h @@ -367,14 +367,16 @@ typedef enum { #ifndef IPPCP_AES_ON #define IPPCP_AES_ON (0) #endif +#ifndef IPPCP_CLMUL_ON +#define IPPCP_CLMUL_ON (0) +#endif #ifndef IPPCP_VAES_ON #define IPPCP_VAES_ON (0) #endif -#ifndef IPPCP_CLMUL_ON -#define IPPCP_CLMUL_ON (0) +#ifndef IPPCP_VCLMUL_ON +#define IPPCP_VCLMUL_ON (0) #endif - -#define IPP_CUSTOM_ENABLED_FEATURES (ippCPUID_AES*IPPCP_AES_ON | ippCPUID_CLMUL*IPPCP_CLMUL_ON | ippCPUID_AVX512VAES*IPPCP_VAES_ON) +#define IPP_CUSTOM_ENABLED_FEATURES (ippCPUID_AES*IPPCP_AES_ON | ippCPUID_CLMUL*IPPCP_CLMUL_ON | ippCPUID_AVX512VAES*IPPCP_VAES_ON | ippCPUID_AVX512VCLMUL*IPPCP_VCLMUL_ON) #endif /* IPP_CUSTOM_CPU_FEATURES__ */ #endif /* !defined(_MERGED_BLD) && defined(IPPCP_CUSTOM_BUILD) */ diff --git a/include/ippversion.h b/include/ippversion.h index 159f25cd..4d269aed 100644 --- a/include/ippversion.h +++ b/include/ippversion.h @@ -28,13 +28,13 @@ #define IPPVERSION_H__ #define IPP_VERSION_MAJOR 2021 -#define IPP_VERSION_MINOR 7 +#define IPP_VERSION_MINOR 8 #define IPP_VERSION_UPDATE 0 // Major interface version #define IPP_INTERFACE_VERSION_MAJOR 11 // Minor interface version -#define IPP_INTERFACE_VERSION_MINOR 5 +#define IPP_INTERFACE_VERSION_MINOR 8 #define IPP_VERSION_STR STR(IPP_VERSION_MAJOR) "." STR(IPP_VERSION_MINOR) "." STR(IPP_VERSION_UPDATE) " (" STR(IPP_INTERFACE_VERSION_MAJOR) "." STR(IPP_INTERFACE_VERSION_MINOR) " )" diff --git a/sources/ippcp/crypto_mb/Readme.md b/sources/ippcp/crypto_mb/Readme.md index 53eef996..f8e22150 100644 --- a/sources/ippcp/crypto_mb/Readme.md +++ b/sources/ippcp/crypto_mb/Readme.md @@ -1,6 +1,9 @@ # Crypto Multi-buffer Library -Currently, the library provides optimized version of RSA, ECDSA, SM3, x25519 multi-buffer algorithms based on Intel® Advanced Vector Extensions 512 (Intel® AVX-512) integer fused multiply-add (IFMA) operations. This CPU feature is introduced with Intel® Microarchitecture Code Named Ice Lake. +Currently, the library provides optimized version of the following algorithms: +1. RSA, ECDSA, ECDH, x25519 multi-buffer algorithms based on Intel® Advanced Vector Extensions 512 (Intel® AVX-512) integer fused multiply-add (IFMA) operations. This CPU feature is introduced with Intel® Microarchitecture Code Named Ice Lake. +2. SM4 based on Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) GFNI instructions. +3. SM3 based on Intel® Advanced Vector Extensions 512 (Intel® AVX-512) instructions. ## Multiple Buffers Processing Overview @@ -24,7 +27,7 @@ This library consists of highly-optimized kernels taking advantage of Intel’s ### Linux* OS -- Intel® C++ Compiler Classic 2021.3 for Linux\* OS +- Intel® C++ Compiler Classic 2021.9 for Linux\* OS - GCC 8.3 - GCC 9.1 - GCC 10.1 @@ -35,14 +38,14 @@ This library consists of highly-optimized kernels taking advantage of Intel’s ### Windows* OS -- Intel® C++ Compiler Classic 2021.3 for Windows\* OS +- Intel® C++ Compiler Classic 2021.9 for Windows\* OS - Microsoft Visual C++ Compiler\* version 19.24 provided by Microsoft Visual Studio\* 2019 version 16.4 - Microsoft Visual C++ Compiler\* version 19.30 provided by Microsoft Visual Studio\* 2022 version 17.0 > **NOTE:** [CMake\*](https://cmake.org/download) 3.21 or higher is required to build using Microsoft Visual Studio\* 2022. 
### macOS* -- Intel® C++ Compiler Classic 2021.3 for macOS\* +- Intel® C++ Compiler Classic 2021.9 for macOS\* ## Installation @@ -83,6 +86,8 @@ You can find the installed files in:    │   ├── exp.h    │   ├── rsa.h    │   ├── sm3.h +   │   ├── sm4_ccm.h +   │   ├── sm4_gcm.h    │   ├── sm4.h    │   ├── status.h | ├── version.h diff --git a/sources/ippcp/crypto_mb/include/crypto_mb/sm4.h b/sources/ippcp/crypto_mb/include/crypto_mb/sm4.h index d8e6bd6b..2eeab8c1 100644 --- a/sources/ippcp/crypto_mb/include/crypto_mb/sm4.h +++ b/sources/ippcp/crypto_mb/include/crypto_mb/sm4.h @@ -28,9 +28,11 @@ #define SM4_ROUNDS (32) /* SM4 number of rounds */ typedef int8u sm4_key[SM4_KEY_SIZE]; +typedef int8u sm4_xts_key[SM4_KEY_SIZE*2]; typedef int32u mbx_sm4_key_schedule[SM4_ROUNDS][SM4_LINES]; EXTERN_C mbx_status16 mbx_sm4_set_key_mb16(mbx_sm4_key_schedule* key_sched, const sm4_key* pa_key[SM4_LINES]); +EXTERN_C mbx_status16 mbx_sm4_xts_set_keys_mb16(mbx_sm4_key_schedule* key_sched1, mbx_sm4_key_schedule* key_sched2, const sm4_xts_key* pa_key[SM4_LINES]); EXTERN_C mbx_status16 mbx_sm4_encrypt_ecb_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], const mbx_sm4_key_schedule* key_sched); EXTERN_C mbx_status16 mbx_sm4_decrypt_ecb_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], const mbx_sm4_key_schedule* key_sched); @@ -47,4 +49,10 @@ EXTERN_C mbx_status16 mbx_sm4_decrypt_ofb_mb16(int8u* pa_out[SM4_LINES], const i EXTERN_C mbx_status16 mbx_sm4_encrypt_cfb128_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], const mbx_sm4_key_schedule* key_sched, const int8u* pa_iv[SM4_LINES]); EXTERN_C mbx_status16 mbx_sm4_decrypt_cfb128_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], const mbx_sm4_key_schedule* key_sched, const int8u* pa_iv[SM4_LINES]); +EXTERN_C mbx_status16 mbx_sm4_xts_encrypt_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], + const mbx_sm4_key_schedule* key_sched1, const mbx_sm4_key_schedule* key_sched2, + const int8u* pa_tweak[SM4_LINES]); +EXTERN_C mbx_status16 mbx_sm4_xts_decrypt_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], + const mbx_sm4_key_schedule* key_sched1, const mbx_sm4_key_schedule* key_sched2, + const int8u* pa_tweak[SM4_LINES]); #endif /* SM4_H */ diff --git a/sources/ippcp/crypto_mb/include/crypto_mb/version.h b/sources/ippcp/crypto_mb/include/crypto_mb/version.h index 0436b297..650cf46f 100644 --- a/sources/ippcp/crypto_mb/include/crypto_mb/version.h +++ b/sources/ippcp/crypto_mb/include/crypto_mb/version.h @@ -24,12 +24,12 @@ #define MBX_LIB_NAME() "crypto_mb" #define MBX_VER_MAJOR 1 #define MBX_VER_MINOR 0 -#define MBX_VER_REV 6 +#define MBX_VER_REV 8 /* major interface version */ #define MBX_INTERFACE_VERSION_MAJOR 11 /* minor interface version */ -#define MBX_INTERFACE_VERSION_MINOR 5 +#define MBX_INTERFACE_VERSION_MINOR 8 typedef struct { int major; /* e.g. 
1 */ diff --git a/sources/ippcp/crypto_mb/include/internal/sm4/sm4_mb.h b/sources/ippcp/crypto_mb/include/internal/sm4/sm4_mb.h index b8b2cb1d..b938cf59 100644 --- a/sources/ippcp/crypto_mb/include/internal/sm4/sm4_mb.h +++ b/sources/ippcp/crypto_mb/include/internal/sm4/sm4_mb.h @@ -180,6 +180,66 @@ static __ALIGN64 const int8u shuf8[] = { 0x09, 0x0A, 0x0B, 0x08, 0x0D, 0x0E, 0x0F, 0x0C, }; +/* For SM4-XTS */ +static __ALIGN64 const int64u xts_poly[] = { + 0x87, 0x87, 0x87, 0x87, 0x87, 0x87, 0x87, 0x87 +}; + +static __ALIGN64 const int8u xts_shuf_mask[] = { + 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +static __ALIGN64 const int64u xts_const_dq3210[] = { + 0, 0, 1, 1, 2, 2, 3, 3 +}; + +static __ALIGN64 const int64u xts_const_dq5678[] = { + 8, 8, 7, 7, 6, 6, 5, 5 +}; + +static __ALIGN64 const int32u xts_full_block_mask[] = { + 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, + 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0 +}; + +static __ALIGN64 const int32u xts_partial_block_mask[] = { + 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, + 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f +}; + +static __ALIGN64 const int32u xts_dw0_7_to_qw_idx[] = { + 0, 0xFF, 1, 0xFF, 2, 0xFF, 3, 0xFF, + 4, 0xFF, 5, 0xFF, 6, 0xFF, 7, 0xFF +}; + +static __ALIGN64 const int32u xts_dw8_15_to_qw_idx[] = { + 8, 0xFF, 9, 0xFF, 10, 0xFF, 11, 0xFF, + 12, 0xFF, 13, 0xFF, 14, 0xFF, 15, 0xFF +}; + +static __ALIGN64 const int64u xts_tweak_permq[] = { + 2, 3, 0, 1, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 4, 5, 2, 3, 0xFF, 0xFF, + 0, 1, 2, 3, 6, 7, 4, 5, + 0, 1, 2, 3, 4, 5, 10, 11 /* for vpermi2q */ +}; + +static __ALIGN64 const int64u xts_next_tweak_permq[] = { + 0, 1, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 4, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 14, 15, 0, 1, 0xFF, 0xFF, 0xFF, 0xFF /* for vpermi2q */ +}; + +static __ALIGN64 const int64u xts_next_tweak_permq_enc[] = { + 2, 3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 4, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 6, 7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, +}; + #define SM4_ONE_ROUND(X0, X1, X2, X3, TMP, RK) { \ /* (Xi+1 ^ Xi+2 ^ Xi+3 ^ rki) */ \ TMP = _mm512_ternarylogic_epi32 (X1, X2, X3, 0x96); \ @@ -241,7 +301,7 @@ static __ALIGN64 const int8u shuf8[] = { #define UPDATE_STREAM_MASK_64(MASK, p_len) MASK = *p_len < (4 * 16) ? (*p_len <= 0 ? 
0 : ((int64u)1 << *p_len) - 1) : (__mmask64)(-1); p_len++; #define SM4_ENC (1) -#define SM4_DEC (2) +#define SM4_DEC (-1) /* // Internal functions @@ -256,6 +316,9 @@ EXTERN_C void sm4_ofb_kernel_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[ EXTERN_C void sm4_cfb128_enc_kernel_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], const int32u* key_sched[SM4_ROUNDS], const int8u* pa_iv[SM4_LINES], __mmask16 mb_mask); EXTERN_C void sm4_cfb128_dec_kernel_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], const int32u* key_sched[SM4_ROUNDS], const int8u* pa_iv[SM4_LINES], __mmask16 mb_mask); EXTERN_C void sm4_set_round_keys_mb16(int32u* key_sched[SM4_ROUNDS], const int8u* pa_inp_key[SM4_LINES], __mmask16 mb_mask); +EXTERN_C void sm4_xts_kernel_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], + const int32u* key_sched1[SM4_ROUNDS], const int32u* key_sched2[SM4_ROUNDS], + const int8u* pa_tweak[SM4_LINES], __mmask16 mb_mask, const int dir); // The transformation based on SM4 sbox algebraic structure, parameters were computed manually __INLINE __m512i sBox512(__m512i block) @@ -448,6 +511,42 @@ __INLINE void TRANSPOSE_16x4_I32_EPI32(__m512i* t0, __m512i* t1, __m512i* t2, __ *t3 = _mm512_unpackhi_epi64(z1, z3); } +__INLINE void TRANSPOSE_16x4_I32_XMM_EPI32(__m512i* t0, __m512i* t1, __m512i* t2, __m512i* t3, const __m128i in[16]) { + // L0 - L3 + __m512i z0 = _mm512_castsi128_si512(in[0]); + __m512i z1 = _mm512_castsi128_si512(in[1]); + __m512i z2 = _mm512_castsi128_si512(in[2]); + __m512i z3 = _mm512_castsi128_si512(in[3]); + + // L4 - L7 + z0 = _mm512_inserti64x2(z0, in[4], 1); + z1 = _mm512_inserti64x2(z1, in[5], 1); + z2 = _mm512_inserti64x2(z2, in[6], 1); + z3 = _mm512_inserti64x2(z3, in[7], 1); + + // L8 - Lb + z0 = _mm512_inserti64x2(z0, in[8], 2); + z1 = _mm512_inserti64x2(z1, in[9], 2); + z2 = _mm512_inserti64x2(z2, in[10], 2); + z3 = _mm512_inserti64x2(z3, in[11], 2); + + // Lc - Lf + *t0 = ENDIANNESS_16x32(_mm512_inserti64x2(z0, in[12], 3)); + *t1 = ENDIANNESS_16x32(_mm512_inserti64x2(z1, in[13], 3)); + *t2 = ENDIANNESS_16x32(_mm512_inserti64x2(z2, in[14], 3)); + *t3 = ENDIANNESS_16x32(_mm512_inserti64x2(z3, in[15], 3)); + + z0 = _mm512_unpacklo_epi32(*t0, *t1); + z1 = _mm512_unpackhi_epi32(*t0, *t1); + z2 = _mm512_unpacklo_epi32(*t2, *t3); + z3 = _mm512_unpackhi_epi32(*t2, *t3); + + *t0 = _mm512_unpacklo_epi64(z0, z2); + *t1 = _mm512_unpackhi_epi64(z0, z2); + *t2 = _mm512_unpacklo_epi64(z1, z3); + *t3 = _mm512_unpackhi_epi64(z1, z3); +} + __INLINE void TRANSPOSE_4x16_I32_EPI32(__m512i* t0, __m512i* t1, __m512i* t2, __m512i* t3, int8u* p_out[16], __mmask16 mb_mask) { #define STORE_RESULT(OUT, store_mask, loc_mb_mask, Ti) \ @@ -493,6 +592,45 @@ __INLINE void TRANSPOSE_4x16_I32_EPI32(__m512i* t0, __m512i* t1, __m512i* t2, __ } +__INLINE void TRANSPOSE_4x16_I32_XMM_EPI32(__m512i* t0, __m512i* t1, __m512i* t2, __m512i* t3, __m128i out[16]) { + + __m512i z0 = _mm512_unpacklo_epi32(*t0, *t1); + __m512i z1 = _mm512_unpackhi_epi32(*t0, *t1); + __m512i z2 = _mm512_unpacklo_epi32(*t2, *t3); + __m512i z3 = _mm512_unpackhi_epi32(*t2, *t3); + + /* Get the right endianness and do (Y0, Y1, Y2, Y3) = R(X32, X33, X34, X35) = (X35, X34, X33, X32) */ + *t0 = CHANGE_ORDER_BLOCKS(_mm512_unpacklo_epi64(z0, z2)); + *t1 = CHANGE_ORDER_BLOCKS(_mm512_unpackhi_epi64(z0, z2)); + *t2 = CHANGE_ORDER_BLOCKS(_mm512_unpacklo_epi64(z1, z3)); + *t3 = CHANGE_ORDER_BLOCKS(_mm512_unpackhi_epi64(z1, z3)); + + 
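+    // At this point each 128-bit lane of t0..t3 holds one complete 16-byte block
+    // (t0: buffers 0,4,8,12; t1: 1,5,9,13; t2: 2,6,10,14; t3: 3,7,11,15),
+    // so the extracts below return one XMM block per buffer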
// L0 - L3 + out[0] = _mm512_extracti64x2_epi64(*t0, 0); + out[1] = _mm512_extracti64x2_epi64(*t1, 0); + out[2] = _mm512_extracti64x2_epi64(*t2, 0); + out[3] = _mm512_extracti64x2_epi64(*t3, 0); + + // L4 - L7 + out[4] = _mm512_extracti64x2_epi64(*t0, 1); + out[5] = _mm512_extracti64x2_epi64(*t1, 1); + out[6] = _mm512_extracti64x2_epi64(*t2, 1); + out[7] = _mm512_extracti64x2_epi64(*t3, 1); + + // L8 - Lb + out[8] = _mm512_extracti64x2_epi64(*t0, 2); + out[9] = _mm512_extracti64x2_epi64(*t1, 2); + out[10] = _mm512_extracti64x2_epi64(*t2, 2); + out[11] = _mm512_extracti64x2_epi64(*t3, 2); + + // Lc - Lf + out[12] = _mm512_extracti64x2_epi64(*t0, 3); + out[13] = _mm512_extracti64x2_epi64(*t1, 3); + out[14] = _mm512_extracti64x2_epi64(*t2, 3); + out[15] = _mm512_extracti64x2_epi64(*t3, 3); + +} + __INLINE void TRANSPOSE_4x16_I32_O128_EPI32(__m512i* t0, __m512i* t1, __m512i* t2, __m512i* t3, __m128i p_out[16], __mmask16 mb_mask) { #define STORE_RESULT(OUT, store_mask, loc_mb_mask, Ti) \ diff --git a/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.defs b/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.defs index 4c52c4bb..1faf7a1e 100644 --- a/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.defs +++ b/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.defs @@ -125,3 +125,7 @@ mbx_sm4_ccm_update_aad_mb16 mbx_sm4_ccm_encrypt_mb16 mbx_sm4_ccm_decrypt_mb16 mbx_sm4_ccm_get_tag_mb16 + +mbx_sm4_xts_set_keys_mb16 +mbx_sm4_xts_encrypt_mb16 +mbx_sm4_xts_decrypt_mb16 diff --git a/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.linux.lib-export b/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.linux.lib-export index dc28cc72..5bbea890 100644 --- a/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.linux.lib-export +++ b/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.linux.lib-export @@ -123,3 +123,7 @@ EXTERN (mbx_sm4_ccm_update_aad_mb16) EXTERN (mbx_sm4_ccm_encrypt_mb16) EXTERN (mbx_sm4_ccm_decrypt_mb16) EXTERN (mbx_sm4_ccm_get_tag_mb16) + +EXTERN (mbx_sm4_xts_set_keys_mb16) +EXTERN (mbx_sm4_xts_encrypt_mb16) +EXTERN (mbx_sm4_xts_decrypt_mb16) diff --git a/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.macosx.lib-export b/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.macosx.lib-export index b48e5fba..b090a284 100644 --- a/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.macosx.lib-export +++ b/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.macosx.lib-export @@ -123,3 +123,7 @@ _mbx_sm4_ccm_update_aad_mb16 _mbx_sm4_ccm_encrypt_mb16 _mbx_sm4_ccm_decrypt_mb16 _mbx_sm4_ccm_get_tag_mb16 + +_mbx_sm4_xts_set_keys_mb16 +_mbx_sm4_xts_encrypt_mb16 +_mbx_sm4_xts_decrypt_mb16 diff --git a/sources/ippcp/crypto_mb/src/sm4/sm4_ecb_mb16.c b/sources/ippcp/crypto_mb/src/sm4/sm4_ecb_mb16.c index d544d431..298c4a84 100644 --- a/sources/ippcp/crypto_mb/src/sm4/sm4_ecb_mb16.c +++ b/sources/ippcp/crypto_mb/src/sm4/sm4_ecb_mb16.c @@ -40,9 +40,7 @@ void sm4_ecb_kernel_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES _mm512_storeu_si512(loc_out, _mm512_loadu_si512(pa_out)); _mm512_storeu_si512(loc_out + 8, _mm512_loadu_si512(pa_out + 8)); - /* Depending on the operation(enc or dec): sign allows to go up and down on the key schedule */ /* p_rk set to the beginning or to the end of the key schedule */ - const int sign = (operation == SM4_ENC) ? 1 : -1; const __m512i* p_rk = (operation == SM4_ENC) ? 
(const __m512i*)key_sched : ((const __m512i*)key_sched + (SM4_ROUNDS - 1)); __ALIGN64 __m512i TMP[20]; @@ -92,8 +90,8 @@ void sm4_ecb_kernel_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); TRANSPOSE_INP_512(TMP[16], TMP[17], TMP[18], TMP[19], TMP[0], TMP[1], TMP[2], TMP[3]); - SM4_KERNEL(TMP, p_rk, sign); - p_rk -= sign*SM4_ROUNDS; + SM4_KERNEL(TMP, p_rk, operation); + p_rk -= operation*SM4_ROUNDS; TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[4], TMP[5], TMP[6], TMP[7]); TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); @@ -149,7 +147,7 @@ void sm4_ecb_kernel_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES /* compute incomplete buffer loading */ sm4_ecb_incomplete_buff_mb16(loc_inp, loc_out, - num_blocks, p_rk, sign, + num_blocks, p_rk, operation, mb_mask, TMP); /* clear local copy of sensitive data */ diff --git a/sources/ippcp/crypto_mb/src/sm4/sm4_setkey_mb16.c b/sources/ippcp/crypto_mb/src/sm4/sm4_setkey_mb16.c index cf2fdea4..e6163321 100644 --- a/sources/ippcp/crypto_mb/src/sm4/sm4_setkey_mb16.c +++ b/sources/ippcp/crypto_mb/src/sm4/sm4_setkey_mb16.c @@ -112,3 +112,40 @@ mbx_status16 mbx_sm4_set_key_mb16(mbx_sm4_key_schedule* key_sched, const sm4_key return status; } + +DLL_PUBLIC +mbx_status16 mbx_sm4_xts_set_keys_mb16(mbx_sm4_key_schedule* key_sched1, + mbx_sm4_key_schedule* key_sched2, + const sm4_xts_key* pa_key[SM4_LINES]) +{ + int buf_no; + mbx_status16 status = 0; + __mmask16 mb_mask = 0xFFFF; + + /* Test input pointers */ + if (NULL == key_sched1 || NULL == key_sched2 || NULL == pa_key) + return MBX_SET_STS16_ALL(MBX_STATUS_NULL_PARAM_ERR); + + /* Don't process buffers with input pointers equal to zero */ + for (buf_no = 0; buf_no < SM4_LINES; buf_no++) { + if (pa_key[buf_no] == NULL) { + status = MBX_SET_STS16(status, buf_no, MBX_STATUS_NULL_PARAM_ERR); + mb_mask &= ~(0x1 << buf_no); + } + } + + if (MBX_IS_ANY_OK_STS16(status)) { + /* Generate round keys for key1 */ + sm4_set_round_keys_mb16((int32u**)key_sched1, (const int8u**)pa_key, mb_mask); + + const sm4_key* pa_key2[SM4_LINES]; + + for (int i = 0; i < SM4_LINES; i++) + pa_key2[i] = (const sm4_key*)&((int8u*)pa_key[i])[16]; + + /* Generate round keys for key2 */ + sm4_set_round_keys_mb16((int32u**)key_sched2, (const int8u**)pa_key2, mb_mask); + } + + return status; +} diff --git a/sources/ippcp/crypto_mb/src/sm4/sm4_xts_dec_mb16.c b/sources/ippcp/crypto_mb/src/sm4/sm4_xts_dec_mb16.c new file mode 100644 index 00000000..9bb03d9b --- /dev/null +++ b/sources/ippcp/crypto_mb/src/sm4/sm4_xts_dec_mb16.c @@ -0,0 +1,59 @@ +/******************************************************************************* +* Copyright (C) 2023 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the 'License'); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an 'AS IS' BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +*******************************************************************************/ + +#include +#include + +#include +#include + +DLL_PUBLIC +mbx_status16 mbx_sm4_xts_decrypt_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], + const mbx_sm4_key_schedule* key_sched1, + const mbx_sm4_key_schedule* key_sched2, + const int8u* pa_tweak[SM4_LINES]) +{ + unsigned buf_no; + mbx_status16 status = 0; + __mmask16 mb_mask = 0xFFFF; + + /* Test input pointers */ + if (NULL == pa_out || NULL == pa_inp || NULL == len || + NULL == key_sched1 || NULL == key_sched2 || NULL == pa_tweak) + return MBX_SET_STS16_ALL(MBX_STATUS_NULL_PARAM_ERR); + + /* Test input data length and input pointers */ + for (buf_no = 0; buf_no < SM4_LINES; buf_no++) { + if (pa_out[buf_no] == NULL || pa_inp[buf_no] == NULL || pa_tweak[buf_no] == NULL) { + status = MBX_SET_STS16(status, buf_no, MBX_STATUS_NULL_PARAM_ERR); + /* Do not process empty buffers */ + mb_mask &= ~(0x1 << buf_no); + } + if (len[buf_no] < SM4_BLOCK_SIZE) { + status = MBX_SET_STS16(status, buf_no, MBX_STATUS_MISMATCH_PARAM_ERR); + /* Do not process non-valid buffers */ + mb_mask &= ~(0x1 << buf_no); + } + } + + if (MBX_IS_ANY_OK_STS16(status)) + sm4_xts_kernel_mb16(pa_out, (const int8u**)pa_inp, (const int*)len, + (const int32u**)key_sched1, (const int32u**)key_sched2, + pa_tweak, mb_mask, SM4_DEC); + + return status; +} diff --git a/sources/ippcp/crypto_mb/src/sm4/sm4_xts_enc_mb16.c b/sources/ippcp/crypto_mb/src/sm4/sm4_xts_enc_mb16.c new file mode 100644 index 00000000..c15e826a --- /dev/null +++ b/sources/ippcp/crypto_mb/src/sm4/sm4_xts_enc_mb16.c @@ -0,0 +1,59 @@ +/******************************************************************************* +* Copyright (C) 2023 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the 'License'); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an 'AS IS' BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +*******************************************************************************/ + +#include +#include + +#include +#include + +DLL_PUBLIC +mbx_status16 mbx_sm4_xts_encrypt_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], + const int len[SM4_LINES], const mbx_sm4_key_schedule* key_sched1, + const mbx_sm4_key_schedule* key_sched2, + const int8u* pa_tweak[SM4_LINES]) +{ + unsigned buf_no; + mbx_status16 status = 0; + __mmask16 mb_mask = 0xFFFF; + + /* Test input pointers */ + if (NULL == pa_out || NULL == pa_inp || NULL == len || + NULL == key_sched1 || NULL == key_sched2 || NULL == pa_tweak) + return MBX_SET_STS16_ALL(MBX_STATUS_NULL_PARAM_ERR); + + /* Test input data length and input pointers */ + for (buf_no = 0; buf_no < SM4_LINES; buf_no++) { + if (pa_out[buf_no] == NULL || pa_inp[buf_no] == NULL || pa_tweak[buf_no] == NULL) { + status = MBX_SET_STS16(status, buf_no, MBX_STATUS_NULL_PARAM_ERR); + /* Do not process empty buffers */ + mb_mask &= ~(0x1 << buf_no); + } + if (len[buf_no] < SM4_BLOCK_SIZE) { + status = MBX_SET_STS16(status, buf_no, MBX_STATUS_MISMATCH_PARAM_ERR); + /* Do not process non-valid buffers */ + mb_mask &= ~(0x1 << buf_no); + } + } + + if (MBX_IS_ANY_OK_STS16(status)) + sm4_xts_kernel_mb16(pa_out, (const int8u**)pa_inp, (const int*)len, + (const int32u**)key_sched1, (const int32u**)key_sched2, + pa_tweak, mb_mask, SM4_ENC); + + return status; +} diff --git a/sources/ippcp/crypto_mb/src/sm4/sm4_xts_mb16.c b/sources/ippcp/crypto_mb/src/sm4/sm4_xts_mb16.c new file mode 100644 index 00000000..02db4078 --- /dev/null +++ b/sources/ippcp/crypto_mb/src/sm4/sm4_xts_mb16.c @@ -0,0 +1,529 @@ +/******************************************************************************* +* Copyright (C) 2023 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the 'License'); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an 'AS IS' BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +*******************************************************************************/ + +#include +#include + +#define FIRST_TWEAKS 1 +#define NEXT_TWEAKS 0 + +/* Generate the next 4 tweaks for a given buffer */ +static void generate_next_4_tweaks(const __m512i *PREV_TWEAK, __m512i *NEXT_TWEAK, + const __m512i z_shuf_mask, const __m512i z_poly, + const int first_tweaks) +{ + __m512i TMP1, TMP2, TMP3, TMP4; + const __mmask8 xor_mask = _cvtu32_mask8(0xAA); + + TMP1 = _mm512_shuffle_epi8(*PREV_TWEAK, z_shuf_mask); + /* + * In case of the first 4 tweaks, the shifts are variable, + * as we are start from tweak 1 in all 128-bit lanes, to construct + * tweaks 1, 2, 3 and 4 + */ + if (first_tweaks) { + const __m512i z_dq3210 = _mm512_loadu_si512(xts_const_dq3210); + const __m512i z_dq5678 = _mm512_loadu_si512(xts_const_dq5678); + + TMP2 = _mm512_sllv_epi64(*PREV_TWEAK, z_dq3210); + TMP3 = _mm512_srlv_epi64(TMP1, z_dq5678); + /* + * For following tweaks, the shifts are constant, + * as we calculate the next 4 tweaks, parting from tweaks N-4, N-3, N-2 and N, + * to construct tweaks N, N+1, N+2, N+3 + */ + } else { + TMP2 = _mm512_slli_epi64(*PREV_TWEAK, 4); + TMP3 = _mm512_srli_epi64(TMP1, 4); + } + TMP4 = _mm512_clmulepi64_epi128(TMP3, z_poly, 0); + TMP2 = _mm512_mask_xor_epi64(TMP2, xor_mask, TMP2, TMP3); + *NEXT_TWEAK = _mm512_xor_epi32(TMP4, TMP2); +} + +/* Prepare the last tweaks for a given buffer, if it has a partial block */ +static void prepare_last_tweaks(__m512i *TWEAK, __m512i *NEXT_TWEAK, + const int operation, int num_remain_full_blocks) +{ + /* + * For the encryption case, we need to prepare the tweak + * for the partial block to be at the beginning of NEXT_TWEAK, + * so depending on the number of remaining full blocks, its position + * will vary, so the permute mask will be different. In case, there are 4 full blocks, + * the newly generated NEXT_TWEAK will be positioned correctly. + */ + if (operation == SM4_ENC) { + if (num_remain_full_blocks == 1) + *NEXT_TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_next_tweak_permq_enc[0]), *TWEAK); + else if (num_remain_full_blocks == 2) + *NEXT_TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_next_tweak_permq_enc[1*8]), *TWEAK); + else if (num_remain_full_blocks == 3) + *NEXT_TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_next_tweak_permq_enc[2*8]), *TWEAK); + /* + * For the decryption case, it is a bit more complicated. + * In case of a partial block, the last two tweaks (the last tweak of the last full block) + * and the tweak of the last block, need to be interchanged. + * TWEAK will have the tweaks for the last FULL blocks and *NEXT_TWEAK, + * as earlier, will have the tweak for the last partial block. 
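+     * (This is the XTS ciphertext-stealing order on decryption: the last full ciphertext
+     * block is processed with the final tweak, while the block built from the trailing
+     * partial ciphertext uses the preceding tweak.)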
+ */ + } else { + if (num_remain_full_blocks == 1) { + *NEXT_TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_next_tweak_permq[0]), *TWEAK); + *TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_tweak_permq[0]), *TWEAK); + } else if (num_remain_full_blocks == 2) { + *NEXT_TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_next_tweak_permq[1*8]), *TWEAK); + *TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_tweak_permq[1*8]), *TWEAK); + } else if (num_remain_full_blocks == 3) { + *NEXT_TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_next_tweak_permq[2*8]), *TWEAK); + *TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_tweak_permq[2*8]), *TWEAK); + } else if (num_remain_full_blocks == 4) { + *NEXT_TWEAK = _mm512_permutex2var_epi64(*NEXT_TWEAK, _mm512_loadu_si512(&xts_next_tweak_permq[3*8]), *TWEAK); + *TWEAK = _mm512_permutex2var_epi64(*TWEAK, _mm512_loadu_si512(&xts_tweak_permq[3*8]), *NEXT_TWEAK); + } + } +} + +static void sm4_xts_mask_kernel_mb16(__m512i* NEXT_TWEAK, const __m512i* p_rk, __m512i loc_len32, + const int8u** loc_inp, int8u** loc_out, + __mmask16 mb_mask, const int operation) +{ + __m512i TMP[20]; + const __m512i z_poly = _mm512_loadu_si512(xts_poly); + const __m512i z_partial_block_mask = _mm512_loadu_si512(xts_partial_block_mask); + const __m512i z_full_block_mask = _mm512_loadu_si512(xts_full_block_mask); + const __m512i z_shuf_mask = _mm512_loadu_si512(xts_shuf_mask); + /* Length in bytes of partial blocks for all buffers */ + const __m512i partial_len32 = _mm512_and_si512(loc_len32, z_partial_block_mask); + /* Length in bytes of full blocks for all buffers */ + loc_len32 = _mm512_and_si512(loc_len32, z_full_block_mask); + + __mmask16 ge_64_mask = _mm512_mask_cmp_epi32_mask(mb_mask, loc_len32, _mm512_set1_epi32(4 * SM4_BLOCK_SIZE), _MM_CMPINT_NLT); + __mmask8 ge_64_mask_0_7 = (__mmask8) ge_64_mask; + __mmask8 ge_64_mask_8_15 = (__mmask8) _kshiftri_mask16(ge_64_mask, 8); + /* Expand 32-bit lengths to 64-bit lengths for 16 buffers */ + const __mmask16 expand_mask = _cvtu32_mask16(0x5555); + __m512i remain_len64_0_7 = _mm512_maskz_permutexvar_epi32(expand_mask, _mm512_loadu_si512(xts_dw0_7_to_qw_idx), loc_len32); + __m512i remain_len64_8_15 = _mm512_maskz_permutexvar_epi32(expand_mask, _mm512_loadu_si512(xts_dw8_15_to_qw_idx), loc_len32); + __m512i processed_len64_0_7; + __m512i processed_len64_8_15; + __m512i TWEAK[SM4_LINES]; + __m512i num_remain_full_blocks = _mm512_srli_epi32(loc_len32, 4); + /* Calculate bitmask of buffers with at least one full block */ + __mmask16 tmp_mask = _mm512_mask_cmp_epi32_mask(mb_mask, loc_len32, _mm512_set1_epi32(0), _MM_CMPINT_NLE); + + /* + * While there is at least one full block in any of the buffer, keep encrypting + * (this loop only handles full blocks, but some buffers will have here + * less than 4 full blocks) + */ + while (tmp_mask) { + /* Mask for data loading */ + __mmask64 stream_mask[16]; + int i; + + int* p_loc_len32 = (int*)&loc_len32; + int* p_num_remain_full_blocks = (int*)&num_remain_full_blocks; + int* p_partial_block = (int*)&partial_len32; + + /* Generate tweaks for next rounds */ + for (i = 0; i < SM4_LINES; i++) { + TWEAK[i] = NEXT_TWEAK[i]; + /* + * If there are at least 4 more full blocks to process, + * at least one more tweak will be needed (for more full blocks or + * for a last partial block) + */ + if (p_num_remain_full_blocks[i] >= 4) + generate_next_4_tweaks(&TWEAK[i], &NEXT_TWEAK[i], z_shuf_mask, z_poly, NEXT_TWEAKS); + + /* If there is a partial block, tweaks need 
to be rearranged depending on cipher direction */ + if ((p_partial_block[i] > 0) & (p_num_remain_full_blocks[i] <= 4)) + prepare_last_tweaks(&TWEAK[i], &NEXT_TWEAK[i], operation, p_num_remain_full_blocks[i]); + } + + num_remain_full_blocks = _mm512_sub_epi32(num_remain_full_blocks, _mm512_set1_epi32(4)); + + /* + * XOR plaintext from each lane with the 4 tweaks and transpose to prepare for encryption. + * Since some buffers will have less than 4 full blocks, + * a bitmask is required to load less than 64 bytes (stream_mask) + */ + UPDATE_STREAM_MASK_64(stream_mask[0], p_loc_len32) + TMP[0] = _mm512_xor_si512(TWEAK[0], _mm512_maskz_loadu_epi8(stream_mask[0], loc_inp[0])); + UPDATE_STREAM_MASK_64(stream_mask[1], p_loc_len32) + TMP[1] = _mm512_xor_si512(TWEAK[1], _mm512_maskz_loadu_epi8(stream_mask[1], loc_inp[1])); + UPDATE_STREAM_MASK_64(stream_mask[2], p_loc_len32) + TMP[2] = _mm512_xor_si512(TWEAK[2], _mm512_maskz_loadu_epi8(stream_mask[2], loc_inp[2])); + UPDATE_STREAM_MASK_64(stream_mask[3], p_loc_len32) + TMP[3] = _mm512_xor_si512(TWEAK[3], _mm512_maskz_loadu_epi8(stream_mask[3], loc_inp[3])); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + + TRANSPOSE_INP_512(TMP[4], TMP[5], TMP[6], TMP[7], TMP[0], TMP[1], TMP[2], TMP[3]); + + UPDATE_STREAM_MASK_64(stream_mask[4], p_loc_len32) + TMP[0] = _mm512_xor_si512(TWEAK[4], _mm512_maskz_loadu_epi8(stream_mask[4], loc_inp[4])); + UPDATE_STREAM_MASK_64(stream_mask[5], p_loc_len32) + TMP[1] = _mm512_xor_si512(TWEAK[5], _mm512_maskz_loadu_epi8(stream_mask[5], loc_inp[5])); + UPDATE_STREAM_MASK_64(stream_mask[6], p_loc_len32) + TMP[2] = _mm512_xor_si512(TWEAK[6], _mm512_maskz_loadu_epi8(stream_mask[6], loc_inp[6])); + UPDATE_STREAM_MASK_64(stream_mask[7], p_loc_len32) + TMP[3] = _mm512_xor_si512(TWEAK[7], _mm512_maskz_loadu_epi8(stream_mask[7], loc_inp[7])); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + + TRANSPOSE_INP_512(TMP[8], TMP[9], TMP[10], TMP[11], TMP[0], TMP[1], TMP[2], TMP[3]); + + UPDATE_STREAM_MASK_64(stream_mask[8], p_loc_len32) + TMP[0] = _mm512_xor_si512(TWEAK[8], _mm512_maskz_loadu_epi8(stream_mask[8], loc_inp[8])); + UPDATE_STREAM_MASK_64(stream_mask[9], p_loc_len32) + TMP[1] = _mm512_xor_si512(TWEAK[9], _mm512_maskz_loadu_epi8(stream_mask[9], loc_inp[9])); + UPDATE_STREAM_MASK_64(stream_mask[10], p_loc_len32) + TMP[2] = _mm512_xor_si512(TWEAK[10], _mm512_maskz_loadu_epi8(stream_mask[10], loc_inp[10])); + UPDATE_STREAM_MASK_64(stream_mask[11], p_loc_len32) + TMP[3] = _mm512_xor_si512(TWEAK[11], _mm512_maskz_loadu_epi8(stream_mask[11], loc_inp[11])); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + + TRANSPOSE_INP_512(TMP[12], TMP[13], TMP[14], TMP[15], TMP[0], TMP[1], TMP[2], TMP[3]); + + UPDATE_STREAM_MASK_64(stream_mask[12], p_loc_len32) + TMP[0] = _mm512_xor_si512(TWEAK[12], _mm512_maskz_loadu_epi8(stream_mask[12], loc_inp[12])); + UPDATE_STREAM_MASK_64(stream_mask[13], p_loc_len32) + TMP[1] = _mm512_xor_si512(TWEAK[13], _mm512_maskz_loadu_epi8(stream_mask[13], loc_inp[13])); + 
UPDATE_STREAM_MASK_64(stream_mask[14], p_loc_len32) + TMP[2] = _mm512_xor_si512(TWEAK[14], _mm512_maskz_loadu_epi8(stream_mask[14], loc_inp[14])); + UPDATE_STREAM_MASK_64(stream_mask[15], p_loc_len32) + TMP[3] = _mm512_xor_si512(TWEAK[15], _mm512_maskz_loadu_epi8(stream_mask[15], loc_inp[15])); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + + TRANSPOSE_INP_512(TMP[16], TMP[17], TMP[18], TMP[19], TMP[0], TMP[1], TMP[2], TMP[3]); + + SM4_KERNEL(TMP, p_rk, operation); + p_rk -= operation*SM4_ROUNDS; + + /* Transpose, XOR with the tweaks again and write data out */ + TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[4], TMP[5], TMP[6], TMP[7]); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + _mm512_mask_storeu_epi8((__m512i*)loc_out[0], stream_mask[0], _mm512_xor_si512(TMP[0], TWEAK[0])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[1], stream_mask[1], _mm512_xor_si512(TMP[1], TWEAK[1])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[2], stream_mask[2], _mm512_xor_si512(TMP[2], TWEAK[2])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[3], stream_mask[3], _mm512_xor_si512(TMP[3], TWEAK[3])); + + TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[8], TMP[9], TMP[10], TMP[11]); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + _mm512_mask_storeu_epi8((__m512i*)loc_out[4], stream_mask[4], _mm512_xor_si512(TMP[0], TWEAK[4])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[5], stream_mask[5], _mm512_xor_si512(TMP[1], TWEAK[5])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[6], stream_mask[6], _mm512_xor_si512(TMP[2], TWEAK[6])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[7], stream_mask[7], _mm512_xor_si512(TMP[3], TWEAK[7])); + + TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[12], TMP[13], TMP[14], TMP[15]); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + _mm512_mask_storeu_epi8((__m512i*)loc_out[8], stream_mask[8], _mm512_xor_si512(TMP[0], TWEAK[8])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[9], stream_mask[9], _mm512_xor_si512(TMP[1], TWEAK[9])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[10], stream_mask[10], _mm512_xor_si512(TMP[2],TWEAK[10])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[11], stream_mask[11], _mm512_xor_si512(TMP[3],TWEAK[11])); + + TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[16], TMP[17], TMP[18], TMP[19]); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + _mm512_mask_storeu_epi8((__m512i*)loc_out[12], stream_mask[12], _mm512_xor_si512(TMP[0], TWEAK[12])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[13], stream_mask[13], _mm512_xor_si512(TMP[1], TWEAK[13])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[14], stream_mask[14], _mm512_xor_si512(TMP[2], TWEAK[14])); + 
_mm512_mask_storeu_epi8((__m512i*)loc_out[15], stream_mask[15], _mm512_xor_si512(TMP[3], TWEAK[15]));
+
+ /* Update input/output pointers to data */
+ processed_len64_0_7 = _mm512_mask_blend_epi64(ge_64_mask_0_7, remain_len64_0_7, _mm512_set1_epi64(4 * SM4_BLOCK_SIZE));
+ processed_len64_8_15 = _mm512_mask_blend_epi64(ge_64_mask_8_15, remain_len64_8_15, _mm512_set1_epi64(4 * SM4_BLOCK_SIZE));
+ M512(loc_inp) = _mm512_add_epi64(_mm512_loadu_si512(loc_inp), processed_len64_0_7);
+ M512(loc_inp + 8) = _mm512_add_epi64(_mm512_loadu_si512(loc_inp + 8), processed_len64_8_15);
+
+ M512(loc_out) = _mm512_add_epi64(_mm512_loadu_si512(loc_out), processed_len64_0_7);
+ M512(loc_out + 8) = _mm512_add_epi64(_mm512_loadu_si512(loc_out + 8), processed_len64_8_15);
+
+ /* Update number of blocks left and processing mask */
+ remain_len64_0_7 = _mm512_sub_epi64(remain_len64_0_7, processed_len64_0_7);
+ remain_len64_8_15 = _mm512_sub_epi64(remain_len64_8_15, processed_len64_8_15);
+ loc_len32 = _mm512_sub_epi32(loc_len32, _mm512_set1_epi32(4 * SM4_BLOCK_SIZE));
+ tmp_mask = _mm512_mask_cmp_epi32_mask(mb_mask, loc_len32, _mm512_set1_epi32(0), _MM_CMPINT_NLE);
+ ge_64_mask_0_7 = _mm512_cmp_epi64_mask(remain_len64_0_7, _mm512_set1_epi64(4 * SM4_BLOCK_SIZE), _MM_CMPINT_NLT);
+ ge_64_mask_8_15 = _mm512_cmp_epi64_mask(remain_len64_8_15, _mm512_set1_epi64(4 * SM4_BLOCK_SIZE), _MM_CMPINT_NLT);
+ }
+
+ /* At this stage, all buffers have at most 15 bytes (a partial block) */
+
+ /* Calculate bitmask of buffers with a partial block */
+ tmp_mask = _mm512_mask_cmp_epi32_mask(mb_mask, partial_len32, _mm512_set1_epi32(0), _MM_CMPINT_NLE);
+
+ if (tmp_mask) {
+ /* Encrypt last plaintext using bytes from previous ciphertext block */
+ __mmask64 stream_mask[16];
+ int* p_loc_len32 = (int*)&partial_len32;
+ __m128i XTMP[16];
+ int i;
+
+ for (i = 0; i < SM4_LINES; i++) {
+ /* Get the right tweak (position tweak in last 16 bytes of ZMM register) */
+ UPDATE_STREAM_MASK_64(stream_mask[i], p_loc_len32);
+ /* Read final bytes of input partial block */
+ XTMP[i] = _mm_maskz_loadu_epi8((__mmask16)stream_mask[i], loc_inp[i]);
+ /*
+ * Read last bytes of previous output block to form 16 bytes
+ * (only if there is a partial block at the end of the buffer)
+ */
+ if (stream_mask[i] == 0)
+ continue;
+ __m128i XOUT = _mm_maskz_loadu_epi8((__mmask16)~stream_mask[i], (loc_out[i] - 16));
+ XTMP[i] = _mm_or_si128(XTMP[i], XOUT);
+ /* Initial XOR of the newly constructed input with the tweak */
+ XTMP[i] = _mm_xor_si128(XTMP[i], _mm512_castsi512_si128(NEXT_TWEAK[i]));
+ }
+
+ /* Encrypt final block from all lanes, compressing the 16 XMMs into 4 ZMMs */
+ TRANSPOSE_16x4_I32_XMM_EPI32(&TMP[0], &TMP[1], &TMP[2], &TMP[3], XTMP);
+ for (i = 0; i < SM4_ROUNDS; i += 4, p_rk += 4*operation)
+ SM4_FOUR_ROUNDS(TMP[0], TMP[1], TMP[2], TMP[3], TMP[4], p_rk, operation);
+
+ p_rk -= operation*SM4_ROUNDS;
+
+ /* Spread out the 4 ZMMs into 16 XMMs */
+ TRANSPOSE_4x16_I32_XMM_EPI32(&TMP[0], &TMP[1], &TMP[2], &TMP[3], XTMP);
+ for (i = 0; i < SM4_LINES; i++) {
+ /* Skip the buffer if there is no partial block left */
+ if (stream_mask[i] == 0)
+ continue;
+ /*
+ * Final XOR of output with tweak (it will always be
+ * at the beginning of NEXT_TWEAK, hence the cast)
+ */
+ XTMP[i] = _mm_xor_si128(XTMP[i], _mm512_castsi512_si128(NEXT_TWEAK[i]));
+ /* Write first bytes of previous output block as the output of the partial block */
+ __m128i XOUT = _mm_maskz_loadu_epi8((__mmask16)stream_mask[i], (loc_out[i] - 16));
+ _mm_mask_storeu_epi8(loc_out[i], (__mmask16)stream_mask[i], XOUT);
+ /* Write last output as the output of the previous block */
+ _mm_storeu_si128((__m128i*)(loc_out[i] - 16), XTMP[i]);
+ }
+ }
+ /* clear local copy of sensitive data */
+ zero_mb8((int64u(*)[8])TMP, sizeof(TMP) / sizeof(TMP[0]));
+}
+
+void sm4_xts_kernel_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES],
+ const int32u* key_sched1[SM4_ROUNDS], const int32u* key_sched2[SM4_ROUNDS],
+ const int8u* pa_tweak[SM4_LINES], __mmask16 mb_mask, const int operation)
+{
+ __ALIGN64 const int8u* loc_inp[SM4_LINES];
+ __ALIGN64 int8u* loc_out[SM4_LINES];
+
+ /* Create the local copy of the input data length in bytes and set it to zero for invalid buffers */
+ __m512i loc_len;
+ loc_len = _mm512_loadu_si512(len);
+ loc_len = _mm512_mask_set1_epi32(loc_len, ~mb_mask, 0);
+
+ /* Local copies of the pointers to input and output buffers */
+ _mm512_storeu_si512((void*)loc_inp, _mm512_loadu_si512(pa_inp));
+ _mm512_storeu_si512((void*)(loc_inp + 8), _mm512_loadu_si512(pa_inp + 8));
+
+ _mm512_storeu_si512(loc_out, _mm512_loadu_si512(pa_out));
+ _mm512_storeu_si512(loc_out + 8, _mm512_loadu_si512(pa_out + 8));
+
+ /* Depending on the operation (enc or dec), the sign determines whether we walk up or down the key schedule;
+ * p_rk1 is set to the beginning or to the end of the key schedule accordingly */
+ const __m512i* p_rk1 = (operation == SM4_ENC) ? (const __m512i*)key_sched1 : ((const __m512i*)key_sched1 + (SM4_ROUNDS - 1));
+ /* Pointer p_rk2 is set to the beginning of the key schedule,
+ * as it always encrypts the tweak, regardless of the direction */
+ const __m512i* p_rk2 = (const __m512i*)key_sched2;
+
+ /* TMP[] - temporary buffer for processing */
+ /* TWEAK - tweak values for current blocks (4 blocks per buffer) */
+ /* NEXT_TWEAK - tweak values for following blocks (4 blocks per buffer) */
+ /* initial_tweak - first tweak for all buffers */
+ __m512i TMP[20];
+ __m512i TWEAK[SM4_LINES];
+ __m512i NEXT_TWEAK[SM4_LINES];
+ __m128i initial_tweak[SM4_LINES];
+ int i;
+
+ const __m512i z_poly = _mm512_loadu_si512(xts_poly);
+ const __m512i z_shuf_mask = _mm512_loadu_si512(xts_shuf_mask);
+
+ /* Encrypt initial tweak */
+ TRANSPOSE_16x4_I32_EPI32(&TMP[0], &TMP[1], &TMP[2], &TMP[3], pa_tweak, mb_mask);
+
+ for (i = 0; i < SM4_ROUNDS; i += 4, p_rk2 += 4)
+ SM4_FOUR_ROUNDS(TMP[0], TMP[1], TMP[2], TMP[3], TMP[4], p_rk2, SM4_ENC);
+
+ p_rk2 -= SM4_ROUNDS;
+
+ TRANSPOSE_4x16_I32_O128_EPI32(&TMP[0], &TMP[1], &TMP[2], &TMP[3], initial_tweak, mb_mask);
+
+ /* Load TWEAK value from valid buffers and generate first 4 values */
+ for (i = 0; i < SM4_LINES; i++) {
+ TWEAK[i] = _mm512_broadcast_i64x2(initial_tweak[i]);
+ generate_next_4_tweaks(&TWEAK[i], &NEXT_TWEAK[i], z_shuf_mask, z_poly, FIRST_TWEAKS);
+ }
+
+ /*
+ * Generate the mask to process 4 full blocks from each buffer.
+ * Buffers with fewer than 5 full blocks are left to sm4_xts_mask_kernel_mb16,
+ * as it is the function that can handle partial blocks.
+ */ + __mmask16 tmp_mask = _mm512_mask_cmp_epi32_mask(mb_mask, loc_len, _mm512_set1_epi32(5 * SM4_BLOCK_SIZE), _MM_CMPINT_NLT); + + /* Go to this loop if all 16 buffers contain at least 5 full blocks each */ + while (tmp_mask == 0xFFFF) { + for (i = 0; i < SM4_LINES; i++) { + TWEAK[i] = NEXT_TWEAK[i]; + + /* Update tweaks for next rounds */ + generate_next_4_tweaks(&TWEAK[i], &NEXT_TWEAK[i], z_shuf_mask, z_poly, NEXT_TWEAKS); + } + + /* XOR plaintext from each lane with the 4 tweaks and transpose to prepare for encryption */ + TMP[0] = _mm512_xor_si512(TWEAK[0], _mm512_loadu_si512(loc_inp[0])); + TMP[1] = _mm512_xor_si512(TWEAK[1], _mm512_loadu_si512(loc_inp[1])); + TMP[2] = _mm512_xor_si512(TWEAK[2], _mm512_loadu_si512(loc_inp[2])); + TMP[3] = _mm512_xor_si512(TWEAK[3], _mm512_loadu_si512(loc_inp[3])); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + + TRANSPOSE_INP_512(TMP[4], TMP[5], TMP[6], TMP[7], TMP[0], TMP[1], TMP[2], TMP[3]); + + TMP[0] = _mm512_xor_si512(TWEAK[4], _mm512_loadu_si512(loc_inp[4])); + TMP[1] = _mm512_xor_si512(TWEAK[5], _mm512_loadu_si512(loc_inp[5])); + TMP[2] = _mm512_xor_si512(TWEAK[6], _mm512_loadu_si512(loc_inp[6])); + TMP[3] = _mm512_xor_si512(TWEAK[7], _mm512_loadu_si512(loc_inp[7])); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + + TRANSPOSE_INP_512(TMP[8], TMP[9], TMP[10], TMP[11], TMP[0], TMP[1], TMP[2], TMP[3]); + + TMP[0] = _mm512_xor_si512(TWEAK[8], _mm512_loadu_si512(loc_inp[8])); + TMP[1] = _mm512_xor_si512(TWEAK[9], _mm512_loadu_si512(loc_inp[9])); + TMP[2] = _mm512_xor_si512(TWEAK[10], _mm512_loadu_si512(loc_inp[10])); + TMP[3] = _mm512_xor_si512(TWEAK[11], _mm512_loadu_si512(loc_inp[11])); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + + TRANSPOSE_INP_512(TMP[12], TMP[13], TMP[14], TMP[15], TMP[0], TMP[1], TMP[2], TMP[3]); + + TMP[0] = _mm512_xor_si512(TWEAK[12], _mm512_loadu_si512(loc_inp[12])); + TMP[1] = _mm512_xor_si512(TWEAK[13], _mm512_loadu_si512(loc_inp[13])); + TMP[2] = _mm512_xor_si512(TWEAK[14], _mm512_loadu_si512(loc_inp[14])); + TMP[3] = _mm512_xor_si512(TWEAK[15], _mm512_loadu_si512(loc_inp[15])); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + + TRANSPOSE_INP_512(TMP[16], TMP[17], TMP[18], TMP[19], TMP[0], TMP[1], TMP[2], TMP[3]); + + SM4_KERNEL(TMP, p_rk1, operation); + p_rk1 -= operation*SM4_ROUNDS; + + /* Transpose, XOR with the tweaks again and write data out */ + TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[4], TMP[5], TMP[6], TMP[7]); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + _mm512_storeu_si512((__m512i*)loc_out[0], _mm512_xor_si512(TMP[0], TWEAK[0])); + _mm512_storeu_si512((__m512i*)loc_out[1], 
_mm512_xor_si512(TMP[1], TWEAK[1]));
+ _mm512_storeu_si512((__m512i*)loc_out[2], _mm512_xor_si512(TMP[2], TWEAK[2]));
+ _mm512_storeu_si512((__m512i*)loc_out[3], _mm512_xor_si512(TMP[3], TWEAK[3]));
+
+ TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[8], TMP[9], TMP[10], TMP[11]);
+ TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes));
+ TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes));
+ TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes));
+ TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes));
+ _mm512_storeu_si512((__m512i*)loc_out[4], _mm512_xor_si512(TMP[0], TWEAK[4]));
+ _mm512_storeu_si512((__m512i*)loc_out[5], _mm512_xor_si512(TMP[1], TWEAK[5]));
+ _mm512_storeu_si512((__m512i*)loc_out[6], _mm512_xor_si512(TMP[2], TWEAK[6]));
+ _mm512_storeu_si512((__m512i*)loc_out[7], _mm512_xor_si512(TMP[3], TWEAK[7]));
+
+ TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[12], TMP[13], TMP[14], TMP[15]);
+ TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes));
+ TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes));
+ TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes));
+ TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes));
+ _mm512_storeu_si512((__m512i*)loc_out[8], _mm512_xor_si512(TMP[0], TWEAK[8]));
+ _mm512_storeu_si512((__m512i*)loc_out[9], _mm512_xor_si512(TMP[1], TWEAK[9]));
+ _mm512_storeu_si512((__m512i*)loc_out[10], _mm512_xor_si512(TMP[2], TWEAK[10]));
+ _mm512_storeu_si512((__m512i*)loc_out[11], _mm512_xor_si512(TMP[3], TWEAK[11]));
+
+ TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[16], TMP[17], TMP[18], TMP[19]);
+ TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes));
+ TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes));
+ TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes));
+ TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes));
+ _mm512_storeu_si512((__m512i*)loc_out[12], _mm512_xor_si512(TMP[0], TWEAK[12]));
+ _mm512_storeu_si512((__m512i*)loc_out[13], _mm512_xor_si512(TMP[1], TWEAK[13]));
+ _mm512_storeu_si512((__m512i*)loc_out[14], _mm512_xor_si512(TMP[2], TWEAK[14]));
+ _mm512_storeu_si512((__m512i*)loc_out[15], _mm512_xor_si512(TMP[3], TWEAK[15]));
+
+ /* Update input/output pointers to data */
+ M512(loc_inp) = _mm512_add_epi64(_mm512_loadu_si512(loc_inp), _mm512_set1_epi64(4 * SM4_BLOCK_SIZE));
+ M512(loc_inp + 8) = _mm512_add_epi64(_mm512_loadu_si512(loc_inp + 8), _mm512_set1_epi64(4 * SM4_BLOCK_SIZE));
+
+ M512(loc_out) = _mm512_add_epi64(_mm512_loadu_si512(loc_out), _mm512_set1_epi64(4 * SM4_BLOCK_SIZE));
+ M512(loc_out + 8) = _mm512_add_epi64(_mm512_loadu_si512(loc_out + 8), _mm512_set1_epi64(4 * SM4_BLOCK_SIZE));
+
+ /* Update number of blocks left and processing mask */
+ loc_len = _mm512_sub_epi32(loc_len, _mm512_set1_epi32(4 * SM4_BLOCK_SIZE));
+ tmp_mask = _mm512_mask_cmp_epi32_mask(mb_mask, loc_len, _mm512_set1_epi32(5 * SM4_BLOCK_SIZE), _MM_CMPINT_NLT);
+ }
+
+ /* Check if we have any data left in any of the buffers */
+ tmp_mask = _mm512_mask_cmp_epi32_mask(mb_mask, loc_len, _mm512_setzero_si512(), _MM_CMPINT_NLE);
+ /*
+ * At this point, at least one buffer has fewer than 5 full blocks,
+ * so dealing with a partial block might be needed.
+ */ + if (tmp_mask) + sm4_xts_mask_kernel_mb16(NEXT_TWEAK, p_rk1, loc_len, loc_inp, loc_out, mb_mask, operation); + + /* clear local copy of sensitive data */ + zero_mb8((int64u(*)[8])TMP, sizeof(TMP) / sizeof(TMP[0])); + zero_mb8((int64u(*)[8])TWEAK, sizeof(TWEAK) / sizeof(TWEAK[0])); +} diff --git a/tools/ipp_custom_library_tool_python/requirements.txt b/tools/ipp_custom_library_tool_python/requirements.txt new file mode 100644 index 00000000..be36e373 --- /dev/null +++ b/tools/ipp_custom_library_tool_python/requirements.txt @@ -0,0 +1,2 @@ +PyQt5==5.15.9 +
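As a side note on the tweak arithmetic used by the SM4-XTS kernels above: generate_next_4_tweaks()/xts_poly implement the standard XTS tweak update, i.e. multiplication of the 128-bit tweak by the primitive element x of GF(2^128) modulo x^128 + x^7 + x^2 + x + 1 (reduction constant 0x87). A minimal scalar sketch of a single-block update is shown below for reference only; the function name and byte-wise loop are illustrative and are not taken from the library sources.

    #include <stdint.h>

    /* Illustrative sketch (not library code): advance one 16-byte XTS tweak to
     * the next block, i.e. multiply it by x in GF(2^128) and reduce it modulo
     * x^128 + x^7 + x^2 + x + 1. The AVX-512 kernels above perform four such
     * multiplications per buffer, across 16 buffers at a time. */
    static void xts_next_tweak(uint8_t tweak[16])
    {
        uint8_t carry = 0;
        for (int i = 0; i < 16; i++) {
            uint8_t next_carry = (uint8_t)(tweak[i] >> 7); /* bit shifted out of this byte */
            tweak[i] = (uint8_t)((tweak[i] << 1) | carry);
            carry = next_carry;
        }
        if (carry)
            tweak[0] ^= 0x87; /* fold the overflow back per the XTS polynomial */
    }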