From 47079e5d3cd04a861c3d6a6985c6fc9985f90b7f Mon Sep 17 00:00:00 2001 From: ipl_ci Date: Fri, 19 May 2023 02:15:49 -0700 Subject: [PATCH] WW20'23 source code update --- BUILD.md | 21 +- CHANGELOG.md | 3 + CONST_TIME_EXECUTION_TESTING.md | 4 +- DEPRECATION_NOTES.md | 11 +- OVERVIEW.md | 25 +- include/ippcpdefs.h | 10 +- include/ippversion.h | 4 +- sources/ippcp/crypto_mb/Readme.md | 13 +- .../ippcp/crypto_mb/include/crypto_mb/sm4.h | 8 + .../crypto_mb/include/crypto_mb/version.h | 4 +- .../crypto_mb/include/internal/sm4/sm4_mb.h | 140 ++++- .../src/cmake/dll_export/crypto_mb.defs | 4 + .../dll_export/crypto_mb.linux.lib-export | 4 + .../dll_export/crypto_mb.macosx.lib-export | 4 + .../ippcp/crypto_mb/src/sm4/sm4_ecb_mb16.c | 8 +- .../ippcp/crypto_mb/src/sm4/sm4_setkey_mb16.c | 37 ++ .../crypto_mb/src/sm4/sm4_xts_dec_mb16.c | 59 ++ .../crypto_mb/src/sm4/sm4_xts_enc_mb16.c | 59 ++ .../ippcp/crypto_mb/src/sm4/sm4_xts_mb16.c | 529 ++++++++++++++++++ .../requirements.txt | 2 + 20 files changed, 906 insertions(+), 43 deletions(-) create mode 100644 sources/ippcp/crypto_mb/src/sm4/sm4_xts_dec_mb16.c create mode 100644 sources/ippcp/crypto_mb/src/sm4/sm4_xts_enc_mb16.c create mode 100644 sources/ippcp/crypto_mb/src/sm4/sm4_xts_mb16.c create mode 100644 tools/ipp_custom_library_tool_python/requirements.txt diff --git a/BUILD.md b/BUILD.md index 7177cc40..f6db9f7f 100644 --- a/BUILD.md +++ b/BUILD.md @@ -29,7 +29,7 @@ ### Linux* OS - [Common tools](#common-tools) -- Intel® C++ Compiler Classic 2021.3 for Linux\* OS +- Intel® C++ Compiler Classic 2021.9 for Linux\* OS - GCC 8.3 - GCC 9.1 - GCC 10.1 @@ -39,7 +39,7 @@ - GNU binutils 2.32 ### Windows* OS - [Common tools](#common-tools) -- Intel® C++ Compiler Classic 2021.3 for Windows\* OS +- Intel® C++ Compiler Classic 2021.9 for Windows\* OS - Microsoft Visual C++ Compiler\* version 19.16 provided by Microsoft Visual Studio\* 2017 version 15.9 > **NOTE:** Support for this compiler version will be removed from Intel IPP Cryptography starting 2021.4 release. If you use it for building Intel IPP Cryptography library, please plan on migrating to a newer supported version of Microsoft Visual C++ Compiler\*. - Microsoft Visual C++ Compiler\* version 19.24 provided by Microsoft Visual Studio\* 2019 version 16.4 @@ -47,12 +47,12 @@ > **NOTE:** [CMake\*](https://cmake.org/download) 3.21 or higher is required to build using Microsoft Visual Studio\* 2022. ### macOS* - [Common tools](#common-tools) -- Intel® C++ Compiler Classic 2021.3 for macOS\* +- Intel® C++ Compiler Classic 2021.9 for macOS\* ## Building Intel IPP Cryptography on Linux\* OS The software was validated on: -- Red Hat\* Enterprise Linux\* 7 +- Red Hat\* Enterprise Linux\* 8 To build the Intel IPP Cryptography library on Linux\* OS, complete the following steps: 1. Clone the source code from GitHub\* as follows: @@ -102,7 +102,7 @@ To build the Intel IPP Cryptography library on Linux\* OS, complete the followin The software was validated on: -- Windows Server\* 2016 +- Windows Server\* 2019 To build the Intel IPP Cryptography library on Windows* OS, complete the following steps: @@ -214,7 +214,16 @@ To build the Intel IPP Cryptography library on macOS*, complete the following st `-DPLATFORM_LIST="m7;s8;p8;g9;h9"` - Example for Linux\* OS and the Intel® 64 architecture: - `-DPLATFORM_LIST="w7;n8;y8;e9;l9;n0;k0"` + `-DPLATFORM_LIST="w7;n8;y8;e9;l9;k0"` +- `-DIPPCP_CUSTOM_BUILD=""` - optional, works only if `-DMERGED_BLD:BOOL=off` is set, i.e. only for 1CPU libraries. 
Enables the CPU feature dispatching mask at compile-time based on the provided list. + + - Currently supported by the library custom features dispatching: + 1. Intel® Advanced Encryption Standard New Instructions (Intel® AES-NI) code-path enabling: `IPPCP_AES_ON;IPPCP_CLMUL_ON` + 2. Intel® Advanced Vector Extensions 512 (Intel(R) AVX-512) and vector extensions of Intel(R) AES New Instructions (Intel(R) AES-NI) code-path enabling: `IPPCP_VAES_ON;IPPCP_VCLMUL_ON` + - Example: + `-DPLATFORM_LIST="IPPCP_AES_ON;IPPCP_CLMUL_ON"` - this combination enables Intel® AES-NI in all 1CPU libraries, which contains this code path. + - Example of using a combination of CPU features: + `-DPLATFORM_LIST="IPPCP_AES_ON;IPPCP_CLMUL_ON;IPPCP_VAES_ON;IPPCP_VCLMUL_ON"` - in this combination the highest available feature in each 1CPU library will be enabled (e.g. for `"y8"` it’s Intel® AES-NI and for `"k1"` - Intel AVX-512 VAES) ### Windows\* OS diff --git a/CHANGELOG.md b/CHANGELOG.md index 291c0dd3..93d62b7c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ This is a list of notable changes to Intel(R) IPP Cryptography, in reverse chronological order. +## Intel(R) IPP Cryptography 2021.8 +- Crypto Multi-buffer library was extended with XTS mode of SM4 algorithm. + ## Intel(R) IPP Cryptography 2021.7.1 - Added re-initialization API for AES-GCM context - ippsAES_GCMReinit. The use-case of this function is very specific, please, refer to the documentation for more details. diff --git a/CONST_TIME_EXECUTION_TESTING.md b/CONST_TIME_EXECUTION_TESTING.md index 87e4b181..61868362 100644 --- a/CONST_TIME_EXECUTION_TESTING.md +++ b/CONST_TIME_EXECUTION_TESTING.md @@ -7,14 +7,14 @@ ## General information
- Testing is conducted under Linux for 64-bit Intel® IPP Cryptography built with the following compilers: - Intel® C++ Compiler 19.1 - - Intel® C++ Compiler Classic 2021.3 + - Intel® C++ Compiler Classic 2021.9 - GCC 8.3 - GCC 9.1 - GCC 10.1 - GCC 11.1 - Clang 9.0 - Clang 12.0 -- Tested platforms: w7, n8, y8, e9, l9, n0, k0 (see the supported platforms list [here](./OVERVIEW.md#target-optimization-codes-in-function-names)). +- Tested platforms: w7, n8, y8, e9, l9, k0 (see the supported platforms list [here](./OVERVIEW.md#target-optimization-codes-in-function-names)). - Testing scope described below is guaranteed to pass for **`release`** branches. This is not guaranteed for the **`develop`** branch ([branches description](./OVERVIEW.md#branches-description)) - Information about Pin-Based Constant Execution Checker can be found [here](https://github.com/intel/pin_based_cec) diff --git a/DEPRECATION_NOTES.md b/DEPRECATION_NOTES.md index 6773e7b1..3ef6475c 100644 --- a/DEPRECATION_NOTES.md +++ b/DEPRECATION_NOTES.md @@ -14,7 +14,7 @@ The deprecated API means it is obsolete and will be removed in one of future Int | ippsSHA1Init
ippsSHA224Init
ippsSHA256Init
ippsSHA384Init
ippsSHA512Init
ippsSM3Init
ippsMD5Init | ippsHashInit_rmf \* | | ippsSHA1Duplicate
ippsSHA224Duplicate
ippsSHA256Duplicate
ippsSHA384Duplicate
ippsSHA512Duplicate
ippsSM3Duplicate
ippsMD5Duplicate | ippsHashDuplicate_rmf | | ippsSHA1Pack, ippsSHA1Unpack
ippsSHA224Pack, ippsSHA224Unpack
ippsSHA256Pack, ippsSHA256Unpack
ippsSHA384Pack, ippsSHA384Unpack
ippsSHA512Pack, ippsSHA512Unpack
ippsSM3Pack, ippsSM3Unpack
ippsMD5Pack, ippsMD5Unpack | ippsHashPack_rmf,
ippsHashUnpack_rmf | -| ippsSHA1Update, ippsSHA1GetTag, ippsSHA1Final
ippsSHA224Update, ippsSHA224GetTag, ippsSHA224Final
ippsSHA256Update, ippsSHA256GetTag, ippsSHA256Final
ippsSHA384Update, ippsSHA384GetTag, ippsSHA384Final
ippsSHA512Update, ippsSHA512GetTag, ippsSHA512Final
ippsSM3Update, ippsSM3GetTag, ippsSM3Final
ippsMD5Update, ippsMD5GetTag, ippsSMD5Final | ippsHashUpdate_rmf,
ippsHashGetTag_rmf,
ippsHashFinal_rmf | +| ippsSHA1Update, ippsSHA1GetTag, ippsSHA1Final
ippsSHA224Update, ippsSHA224GetTag, ippsSHA224Final
ippsSHA256Update, ippsSHA256GetTag, ippsSHA256Final
ippsSHA384Update, ippsSHA384GetTag, ippsSHA384Final
ippsSHA512Update, ippsSHA512GetTag, ippsSHA512Final
ippsSM3Update, ippsSM3GetTag, ippsSM3Final
ippsMD5Update, ippsMD5GetTag, ippsMD5Final | ippsHashUpdate_rmf,
ippsHashGetTag_rmf,
ippsHashFinal_rmf | | ippsSHA1MessageDigest
ippsSHA224MessageDigest
ippsSHA256MessageDigest
ippsSHA384MessageDigest
ippsSHA512MessageDigest
ippsSM3MessageDigest
ippsMD5MessageDigest | ippsHashMessage_rmf \* | | ippsHashGetSize | ippsHashGetSize_rmf | | ippsHashInit \*\* | ippsHashInit_rmf \* | @@ -24,7 +24,7 @@ The deprecated API means it is obsolete and will be removed in one of future Int | ippsHashMessage \*\* | ippsHashMessage_rmf \* | >\* To choose hash algorithm, specify [IppsHashMethod parameter](#ippshashalgid-to-ippshashmethod-parameter-map) ->\*\* IppsHashAlgId parameter used in 'ippsHMAC_Init' and in ippsHMAC_Message for choosing hash algorithm is deprecated (see Recommended replacement column for alternative in [IppsHashAlgId to IppsHashMethod parameter map](#ippshashalgid-to-ippshashmethod-parameter-map) +>\*\* IppsHashAlgId parameter used in ippsHMAC_Init and in ippsHMAC_Message for choosing hash algorithm is deprecated (see Recommended replacement column for alternative in [IppsHashAlgId to IppsHashMethod Parameter Map](#ippshashalgid-to-ippshashmethod-parameter-map) ### Keyed HMAC Functionality @@ -37,7 +37,7 @@ The deprecated API means it is obsolete and will be removed in one of future Int | ippsHMAC_Message \*\* | ippsHMAC_Message_rmf \* | >\* To choose hash algorithm, specify [IppsHashMethod parameter](#ippshashalgid-to-ippshashmethod-parameter-map) ->\*\* IppsHashAlgId parameter used in 'ippsHMAC_Init' and in ippsHMAC_Message for choosing hash algorithm is deprecated (see Recommended replacement column for alternative in [IppsHashAlgId to IppsHashMethod parameter map](#ippshashalgid-to-ippshashmethod-parameter-map) +>\*\* IppsHashAlgId parameter used in 'ippsHMAC_Init' and in ippsHMAC_Message for choosing hash algorithm is deprecated (see Recommended replacement column for alternative in [IppsHashAlgId to IppsHashMethod Parameter Map](#ippshashalgid-to-ippshashmethod-parameter-map) ### MGF Functionality @@ -62,7 +62,6 @@ The deprecated API means it is obsolete and will be removed in one of future Int | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | | ippsECCPGetSize
ippsECCPGetSizeStd128r1
ippsECCPGetSizeStd128r2
ippsECCPGetSizeStd192r1
ippsECCPGetSizeStd224r1
ippsECCPGetSizeStd256r1
ippsECCPGetSizeStd384r1
ippsECCPGetSizeStd521r1
ippsECCPGetSizeStdSM2 | ippsGFpECGetSize | | ippsECCPInit
ippsECCPInitStd128r1
ippsECCPInitStd128r2
ippsECCPInitStd192r1
ippsECCPInitStd224r1
ippsECCPInitStd256r1
ippsECCPInitStd384r1
ippsECCPInitStd521r1
ippsECCPInitStdSM2 | ippsGFpECInitStd \*
* the ippsGFpECInitStd function provides both initialization
and setup of a standard EC parameter set |
-| ippsECCPGetSize
ippsECCPGetSizeStd128r1
ippsECCPGetSizeStd128r2
ippsECCPGetSizeStd192r1
ippsECCPGetSizeStd224r1
ippsECCPGetSizeStd256r1
ippsECCPGetSizeStd384r1
ippsECCPGetSizeStd521r1
ippsECCPGetSizeStdSM2 | ippsGFpECGetSize | | ippsECCPSet | ippsGFpECSet | | ippsECCPSetStd | ippsGFpECInitStd \*
* the ippsGFpECInitStd function provides both initialization
and setup of a standard EC parameter set |
| ippsECCPSetStd128r1
ippsECCPSetStd128r2
ippsECCPSetStd192r1
ippsECCPSetStd224r1
ippsECCPSetStd256r1
ippsECCPSetStd384r1
ippsECCPSetStd521r1
ippsECCPSetStdSM2 | ippsGFpECInitStd128r1
ippsGFpECInitStd128r2
ippsGFpECInitStd192r1
ippsGFpECInitStd224r1
ippsGFpECInitStd256r1
ippsGFpECInitStd384r1
ippsGFpECInitStd521r1
ippsGFpECInitStdSM2 | @@ -72,9 +71,9 @@ The deprecated API means it is obsolete and will be removed in one of future Int | ippsECCPSharedSecretDH
ippsECCPSharedSecretDHC | ippsGFpECSharedSecretDH
ippsGFpECSharedSecretDHC | | ippsECCPSignDSA
ippsECCPVerifyDSA
ippsECCPSignNR
ippsECCPVerifyNR
ippsECCPSignSM2
ippsECCPVerifySM2 | ippsGFpECSignDSA
ippsGFpECVerifyDSA
ippsGFpECSignNR
ippsGFpECVerifyNR
ippsGFpECSignSM2
ippsGFpECVerifySM2 | -### IppsHashAlgId to IppsHashMethod parameter map +### IppsHashAlgId to IppsHashMethod Parameter Map -| Algorithm | IppsHashAlgId (deprecated) | IppsHashMethod(recommended) | Note | +| Algorithm | IppsHashAlgId (deprecated) | IppsHashMethod (recommended) | Notes | | :--------: | :------------------------: | :---------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------: | | SHA1 | ippsHashAlg_SHA1 | ippsHashMethod_SHA1
ippsHashMethod_SHA1_NI
ippsHashMethod_SHA1_TT | Intel® Secure Hash Algorithm - New Instructions (Intel® SHA-NI) not supported
Only Intel SHA-NI supported
Automatic switch on Intel SHA-NI, if possible (tick-tock) | | SHA224 | ippsHashAlg_SHA224 | ippsHashMethod_SHA224
ippsHashMethod_SHA224_NI
ippsHashMethod_SHA224_TT | Intel SHA-NI not supported
Only Intel SHA-NI supported
Automatic switch on Intel SHA-NI, if possible supported | diff --git a/OVERVIEW.md b/OVERVIEW.md index 8268b7ab..c7254c31 100644 --- a/OVERVIEW.md +++ b/OVERVIEW.md @@ -77,16 +77,17 @@ By default, the dispatcher chooses the most appropriate optimization for the cur #### Target Optimization Codes in Function Names -| IA-32 Intel® architecture | Intel® 64 architecture | Meaning | -| ------------------------- | ---------------------- | ------------------------------------------------------------------------------------ | -| px | mx | Generic code without hardware specific optimizations suitable for any CPU | -| w7 | - | Optimized for processors with Intel® Streaming SIMD Extensions 2 (Intel® SSE2) | -| - | m7 | Optimized for processors with Intel® SSE3 | -| s8 | n8 | Optimized for processors with Supplemental Streaming SIMD Extensions 3 (SSSE3) | -| p8 | y8 | Optimized for processors with Intel® SSE4.2 | -| g9 | e9 | Optimized for processors with Intel® Advanced Vector Extensions (Intel® AVX) | -| h9 | l9 | Optimized for processors with Intel® Advanced Vector Extensions 2 (Intel® AVX2) | -| - | k0 | Optimized for processors with Intel® Advanced Vector Extensions 512 (Intel® AVX-512) | +| IA-32 Intel® architecture | Intel® 64 architecture | Meaning | +| ------------------------- | ---------------------- | ------------------------------------------------------------------------------------------------------------------ | +| px | mx | Generic code without hardware specific optimizations suitable for any CPU | +| w7 | - | Optimized for processors with Intel® Streaming SIMD Extensions 2 (Intel® SSE2) | +| - | m7 | Optimized for processors with Intel® SSE3 | +| s8 | n8 | Optimized for processors with Supplemental Streaming SIMD Extensions 3 (SSSE3) | +| p8 | y8 | Optimized for processors with Intel® SSE4.2 | +| g9 | e9 | Optimized for processors with Intel® Advanced Vector Extensions (Intel® AVX) | +| h9 | l9 | Optimized for processors with Intel® Advanced Vector Extensions 2 (Intel® AVX2) | +| - | k0 | Optimized for processors with Intel® Advanced Vector Extensions 512 (Intel® AVX-512) (formerly codenamed SkyLake) | +| - | k1 | Optimized for processors with Intel® Advanced Vector Extensions 512 (Intel® AVX-512) (formerly codenamed IceLake) | ### CPU Feature Dispatching @@ -104,6 +105,8 @@ List of CPU feature subsets that the library has special optimizations for: - Intel AVX-512 IFMA - Intel AVX-512 GFNI + > **NOTE:** For some features there is also an opportunity to force their dispatching inside the 1CPU libraries manually during the compile time. For more information please, refer to [common for all operating systems CMake build options](./BUILD.md/#common-for-all-operating-systems). + ### How to Avoid Dispatcher in All CPUs Static Library To leave only specific ISA when linking with an [All CPUs Static Library](#all-cpus-library) and drop dispatcher, please refer to [this section](#choosing-specific-isa-from-the-all-cpus-static-library). @@ -202,4 +205,4 @@ To build your own dynamic library containing only the functionality that is nece The tool is located in the `tools/ipp_custom_library_tool_python` directory. -Please refer to the [tool documentation](https://software.intel.com/en-us/ipp-dev-guide-building-a-custom-dll-with-custom-library-tool) for more information. +Please refer to the [tool documentation](https://www.intel.com/content/www/us/en/docs/ipp/developer-guide-oneapi/current/ipp-custom-library-tool.html) for more information. 
diff --git a/include/ippcpdefs.h b/include/ippcpdefs.h index 6282f685..7d004758 100644 --- a/include/ippcpdefs.h +++ b/include/ippcpdefs.h @@ -367,14 +367,16 @@ typedef enum { #ifndef IPPCP_AES_ON #define IPPCP_AES_ON (0) #endif +#ifndef IPPCP_CLMUL_ON +#define IPPCP_CLMUL_ON (0) +#endif #ifndef IPPCP_VAES_ON #define IPPCP_VAES_ON (0) #endif -#ifndef IPPCP_CLMUL_ON -#define IPPCP_CLMUL_ON (0) +#ifndef IPPCP_VCLMUL_ON +#define IPPCP_VCLMUL_ON (0) #endif - -#define IPP_CUSTOM_ENABLED_FEATURES (ippCPUID_AES*IPPCP_AES_ON | ippCPUID_CLMUL*IPPCP_CLMUL_ON | ippCPUID_AVX512VAES*IPPCP_VAES_ON) +#define IPP_CUSTOM_ENABLED_FEATURES (ippCPUID_AES*IPPCP_AES_ON | ippCPUID_CLMUL*IPPCP_CLMUL_ON | ippCPUID_AVX512VAES*IPPCP_VAES_ON | ippCPUID_AVX512VCLMUL*IPPCP_VCLMUL_ON) #endif /* IPP_CUSTOM_CPU_FEATURES__ */ #endif /* !defined(_MERGED_BLD) && defined(IPPCP_CUSTOM_BUILD) */ diff --git a/include/ippversion.h b/include/ippversion.h index 159f25cd..4d269aed 100644 --- a/include/ippversion.h +++ b/include/ippversion.h @@ -28,13 +28,13 @@ #define IPPVERSION_H__ #define IPP_VERSION_MAJOR 2021 -#define IPP_VERSION_MINOR 7 +#define IPP_VERSION_MINOR 8 #define IPP_VERSION_UPDATE 0 // Major interface version #define IPP_INTERFACE_VERSION_MAJOR 11 // Minor interface version -#define IPP_INTERFACE_VERSION_MINOR 5 +#define IPP_INTERFACE_VERSION_MINOR 8 #define IPP_VERSION_STR STR(IPP_VERSION_MAJOR) "." STR(IPP_VERSION_MINOR) "." STR(IPP_VERSION_UPDATE) " (" STR(IPP_INTERFACE_VERSION_MAJOR) "." STR(IPP_INTERFACE_VERSION_MINOR) " )" diff --git a/sources/ippcp/crypto_mb/Readme.md b/sources/ippcp/crypto_mb/Readme.md index 53eef996..f8e22150 100644 --- a/sources/ippcp/crypto_mb/Readme.md +++ b/sources/ippcp/crypto_mb/Readme.md @@ -1,6 +1,9 @@ # Crypto Multi-buffer Library -Currently, the library provides optimized version of RSA, ECDSA, SM3, x25519 multi-buffer algorithms based on Intel® Advanced Vector Extensions 512 (Intel® AVX-512) integer fused multiply-add (IFMA) operations. This CPU feature is introduced with Intel® Microarchitecture Code Named Ice Lake. +Currently, the library provides optimized version of the following algorithms: +1. RSA, ECDSA, ECDH, x25519 multi-buffer algorithms based on Intel® Advanced Vector Extensions 512 (Intel® AVX-512) integer fused multiply-add (IFMA) operations. This CPU feature is introduced with Intel® Microarchitecture Code Named Ice Lake. +2. SM4 based on Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) GFNI instructions. +3. SM3 based on Intel® Advanced Vector Extensions 512 (Intel® AVX-512) instructions. ## Multiple Buffers Processing Overview @@ -24,7 +27,7 @@ This library consists of highly-optimized kernels taking advantage of Intel’s ### Linux* OS -- Intel® C++ Compiler Classic 2021.3 for Linux\* OS +- Intel® C++ Compiler Classic 2021.9 for Linux\* OS - GCC 8.3 - GCC 9.1 - GCC 10.1 @@ -35,14 +38,14 @@ This library consists of highly-optimized kernels taking advantage of Intel’s ### Windows* OS -- Intel® C++ Compiler Classic 2021.3 for Windows\* OS +- Intel® C++ Compiler Classic 2021.9 for Windows\* OS - Microsoft Visual C++ Compiler\* version 19.24 provided by Microsoft Visual Studio\* 2019 version 16.4 - Microsoft Visual C++ Compiler\* version 19.30 provided by Microsoft Visual Studio\* 2022 version 17.0 > **NOTE:** [CMake\*](https://cmake.org/download) 3.21 or higher is required to build using Microsoft Visual Studio\* 2022. 
### macOS* -- Intel® C++ Compiler Classic 2021.3 for macOS\* +- Intel® C++ Compiler Classic 2021.9 for macOS\* ## Installation @@ -83,6 +86,8 @@ You can find the installed files in:    │   ├── exp.h    │   ├── rsa.h    │   ├── sm3.h +   │   ├── sm4_ccm.h +   │   ├── sm4_gcm.h    │   ├── sm4.h    │   ├── status.h | ├── version.h diff --git a/sources/ippcp/crypto_mb/include/crypto_mb/sm4.h b/sources/ippcp/crypto_mb/include/crypto_mb/sm4.h index d8e6bd6b..2eeab8c1 100644 --- a/sources/ippcp/crypto_mb/include/crypto_mb/sm4.h +++ b/sources/ippcp/crypto_mb/include/crypto_mb/sm4.h @@ -28,9 +28,11 @@ #define SM4_ROUNDS (32) /* SM4 number of rounds */ typedef int8u sm4_key[SM4_KEY_SIZE]; +typedef int8u sm4_xts_key[SM4_KEY_SIZE*2]; typedef int32u mbx_sm4_key_schedule[SM4_ROUNDS][SM4_LINES]; EXTERN_C mbx_status16 mbx_sm4_set_key_mb16(mbx_sm4_key_schedule* key_sched, const sm4_key* pa_key[SM4_LINES]); +EXTERN_C mbx_status16 mbx_sm4_xts_set_keys_mb16(mbx_sm4_key_schedule* key_sched1, mbx_sm4_key_schedule* key_sched2, const sm4_xts_key* pa_key[SM4_LINES]); EXTERN_C mbx_status16 mbx_sm4_encrypt_ecb_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], const mbx_sm4_key_schedule* key_sched); EXTERN_C mbx_status16 mbx_sm4_decrypt_ecb_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], const mbx_sm4_key_schedule* key_sched); @@ -47,4 +49,10 @@ EXTERN_C mbx_status16 mbx_sm4_decrypt_ofb_mb16(int8u* pa_out[SM4_LINES], const i EXTERN_C mbx_status16 mbx_sm4_encrypt_cfb128_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], const mbx_sm4_key_schedule* key_sched, const int8u* pa_iv[SM4_LINES]); EXTERN_C mbx_status16 mbx_sm4_decrypt_cfb128_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], const mbx_sm4_key_schedule* key_sched, const int8u* pa_iv[SM4_LINES]); +EXTERN_C mbx_status16 mbx_sm4_xts_encrypt_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], + const mbx_sm4_key_schedule* key_sched1, const mbx_sm4_key_schedule* key_sched2, + const int8u* pa_tweak[SM4_LINES]); +EXTERN_C mbx_status16 mbx_sm4_xts_decrypt_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], + const mbx_sm4_key_schedule* key_sched1, const mbx_sm4_key_schedule* key_sched2, + const int8u* pa_tweak[SM4_LINES]); #endif /* SM4_H */ diff --git a/sources/ippcp/crypto_mb/include/crypto_mb/version.h b/sources/ippcp/crypto_mb/include/crypto_mb/version.h index 0436b297..650cf46f 100644 --- a/sources/ippcp/crypto_mb/include/crypto_mb/version.h +++ b/sources/ippcp/crypto_mb/include/crypto_mb/version.h @@ -24,12 +24,12 @@ #define MBX_LIB_NAME() "crypto_mb" #define MBX_VER_MAJOR 1 #define MBX_VER_MINOR 0 -#define MBX_VER_REV 6 +#define MBX_VER_REV 8 /* major interface version */ #define MBX_INTERFACE_VERSION_MAJOR 11 /* minor interface version */ -#define MBX_INTERFACE_VERSION_MINOR 5 +#define MBX_INTERFACE_VERSION_MINOR 8 typedef struct { int major; /* e.g. 
1 */ diff --git a/sources/ippcp/crypto_mb/include/internal/sm4/sm4_mb.h b/sources/ippcp/crypto_mb/include/internal/sm4/sm4_mb.h index b8b2cb1d..b938cf59 100644 --- a/sources/ippcp/crypto_mb/include/internal/sm4/sm4_mb.h +++ b/sources/ippcp/crypto_mb/include/internal/sm4/sm4_mb.h @@ -180,6 +180,66 @@ static __ALIGN64 const int8u shuf8[] = { 0x09, 0x0A, 0x0B, 0x08, 0x0D, 0x0E, 0x0F, 0x0C, }; +/* For SM4-XTS */ +static __ALIGN64 const int64u xts_poly[] = { + 0x87, 0x87, 0x87, 0x87, 0x87, 0x87, 0x87, 0x87 +}; + +static __ALIGN64 const int8u xts_shuf_mask[] = { + 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +static __ALIGN64 const int64u xts_const_dq3210[] = { + 0, 0, 1, 1, 2, 2, 3, 3 +}; + +static __ALIGN64 const int64u xts_const_dq5678[] = { + 8, 8, 7, 7, 6, 6, 5, 5 +}; + +static __ALIGN64 const int32u xts_full_block_mask[] = { + 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, + 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0, 0xfffffff0 +}; + +static __ALIGN64 const int32u xts_partial_block_mask[] = { + 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, + 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f, 0x0000000f +}; + +static __ALIGN64 const int32u xts_dw0_7_to_qw_idx[] = { + 0, 0xFF, 1, 0xFF, 2, 0xFF, 3, 0xFF, + 4, 0xFF, 5, 0xFF, 6, 0xFF, 7, 0xFF +}; + +static __ALIGN64 const int32u xts_dw8_15_to_qw_idx[] = { + 8, 0xFF, 9, 0xFF, 10, 0xFF, 11, 0xFF, + 12, 0xFF, 13, 0xFF, 14, 0xFF, 15, 0xFF +}; + +static __ALIGN64 const int64u xts_tweak_permq[] = { + 2, 3, 0, 1, 0xFF, 0xFF, 0xFF, 0xFF, + 0, 1, 4, 5, 2, 3, 0xFF, 0xFF, + 0, 1, 2, 3, 6, 7, 4, 5, + 0, 1, 2, 3, 4, 5, 10, 11 /* for vpermi2q */ +}; + +static __ALIGN64 const int64u xts_next_tweak_permq[] = { + 0, 1, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 2, 3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 4, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 14, 15, 0, 1, 0xFF, 0xFF, 0xFF, 0xFF /* for vpermi2q */ +}; + +static __ALIGN64 const int64u xts_next_tweak_permq_enc[] = { + 2, 3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 4, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 6, 7, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, +}; + #define SM4_ONE_ROUND(X0, X1, X2, X3, TMP, RK) { \ /* (Xi+1 ^ Xi+2 ^ Xi+3 ^ rki) */ \ TMP = _mm512_ternarylogic_epi32 (X1, X2, X3, 0x96); \ @@ -241,7 +301,7 @@ static __ALIGN64 const int8u shuf8[] = { #define UPDATE_STREAM_MASK_64(MASK, p_len) MASK = *p_len < (4 * 16) ? (*p_len <= 0 ? 
0 : ((int64u)1 << *p_len) - 1) : (__mmask64)(-1); p_len++; #define SM4_ENC (1) -#define SM4_DEC (2) +#define SM4_DEC (-1) /* // Internal functions @@ -256,6 +316,9 @@ EXTERN_C void sm4_ofb_kernel_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[ EXTERN_C void sm4_cfb128_enc_kernel_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], const int32u* key_sched[SM4_ROUNDS], const int8u* pa_iv[SM4_LINES], __mmask16 mb_mask); EXTERN_C void sm4_cfb128_dec_kernel_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], const int32u* key_sched[SM4_ROUNDS], const int8u* pa_iv[SM4_LINES], __mmask16 mb_mask); EXTERN_C void sm4_set_round_keys_mb16(int32u* key_sched[SM4_ROUNDS], const int8u* pa_inp_key[SM4_LINES], __mmask16 mb_mask); +EXTERN_C void sm4_xts_kernel_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], + const int32u* key_sched1[SM4_ROUNDS], const int32u* key_sched2[SM4_ROUNDS], + const int8u* pa_tweak[SM4_LINES], __mmask16 mb_mask, const int dir); // The transformation based on SM4 sbox algebraic structure, parameters were computed manually __INLINE __m512i sBox512(__m512i block) @@ -448,6 +511,42 @@ __INLINE void TRANSPOSE_16x4_I32_EPI32(__m512i* t0, __m512i* t1, __m512i* t2, __ *t3 = _mm512_unpackhi_epi64(z1, z3); } +__INLINE void TRANSPOSE_16x4_I32_XMM_EPI32(__m512i* t0, __m512i* t1, __m512i* t2, __m512i* t3, const __m128i in[16]) { + // L0 - L3 + __m512i z0 = _mm512_castsi128_si512(in[0]); + __m512i z1 = _mm512_castsi128_si512(in[1]); + __m512i z2 = _mm512_castsi128_si512(in[2]); + __m512i z3 = _mm512_castsi128_si512(in[3]); + + // L4 - L7 + z0 = _mm512_inserti64x2(z0, in[4], 1); + z1 = _mm512_inserti64x2(z1, in[5], 1); + z2 = _mm512_inserti64x2(z2, in[6], 1); + z3 = _mm512_inserti64x2(z3, in[7], 1); + + // L8 - Lb + z0 = _mm512_inserti64x2(z0, in[8], 2); + z1 = _mm512_inserti64x2(z1, in[9], 2); + z2 = _mm512_inserti64x2(z2, in[10], 2); + z3 = _mm512_inserti64x2(z3, in[11], 2); + + // Lc - Lf + *t0 = ENDIANNESS_16x32(_mm512_inserti64x2(z0, in[12], 3)); + *t1 = ENDIANNESS_16x32(_mm512_inserti64x2(z1, in[13], 3)); + *t2 = ENDIANNESS_16x32(_mm512_inserti64x2(z2, in[14], 3)); + *t3 = ENDIANNESS_16x32(_mm512_inserti64x2(z3, in[15], 3)); + + z0 = _mm512_unpacklo_epi32(*t0, *t1); + z1 = _mm512_unpackhi_epi32(*t0, *t1); + z2 = _mm512_unpacklo_epi32(*t2, *t3); + z3 = _mm512_unpackhi_epi32(*t2, *t3); + + *t0 = _mm512_unpacklo_epi64(z0, z2); + *t1 = _mm512_unpackhi_epi64(z0, z2); + *t2 = _mm512_unpacklo_epi64(z1, z3); + *t3 = _mm512_unpackhi_epi64(z1, z3); +} + __INLINE void TRANSPOSE_4x16_I32_EPI32(__m512i* t0, __m512i* t1, __m512i* t2, __m512i* t3, int8u* p_out[16], __mmask16 mb_mask) { #define STORE_RESULT(OUT, store_mask, loc_mb_mask, Ti) \ @@ -493,6 +592,45 @@ __INLINE void TRANSPOSE_4x16_I32_EPI32(__m512i* t0, __m512i* t1, __m512i* t2, __ } +__INLINE void TRANSPOSE_4x16_I32_XMM_EPI32(__m512i* t0, __m512i* t1, __m512i* t2, __m512i* t3, __m128i out[16]) { + + __m512i z0 = _mm512_unpacklo_epi32(*t0, *t1); + __m512i z1 = _mm512_unpackhi_epi32(*t0, *t1); + __m512i z2 = _mm512_unpacklo_epi32(*t2, *t3); + __m512i z3 = _mm512_unpackhi_epi32(*t2, *t3); + + /* Get the right endianness and do (Y0, Y1, Y2, Y3) = R(X32, X33, X34, X35) = (X35, X34, X33, X32) */ + *t0 = CHANGE_ORDER_BLOCKS(_mm512_unpacklo_epi64(z0, z2)); + *t1 = CHANGE_ORDER_BLOCKS(_mm512_unpackhi_epi64(z0, z2)); + *t2 = CHANGE_ORDER_BLOCKS(_mm512_unpacklo_epi64(z1, z3)); + *t3 = CHANGE_ORDER_BLOCKS(_mm512_unpackhi_epi64(z1, z3)); + + 
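+    // At this point each 128-bit lane of t0..t3 holds one complete 16-byte block
+    // (t0: buffers 0,4,8,12; t1: 1,5,9,13; t2: 2,6,10,14; t3: 3,7,11,15),
+    // so the extracts below return one XMM block per buffer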
// L0 - L3 + out[0] = _mm512_extracti64x2_epi64(*t0, 0); + out[1] = _mm512_extracti64x2_epi64(*t1, 0); + out[2] = _mm512_extracti64x2_epi64(*t2, 0); + out[3] = _mm512_extracti64x2_epi64(*t3, 0); + + // L4 - L7 + out[4] = _mm512_extracti64x2_epi64(*t0, 1); + out[5] = _mm512_extracti64x2_epi64(*t1, 1); + out[6] = _mm512_extracti64x2_epi64(*t2, 1); + out[7] = _mm512_extracti64x2_epi64(*t3, 1); + + // L8 - Lb + out[8] = _mm512_extracti64x2_epi64(*t0, 2); + out[9] = _mm512_extracti64x2_epi64(*t1, 2); + out[10] = _mm512_extracti64x2_epi64(*t2, 2); + out[11] = _mm512_extracti64x2_epi64(*t3, 2); + + // Lc - Lf + out[12] = _mm512_extracti64x2_epi64(*t0, 3); + out[13] = _mm512_extracti64x2_epi64(*t1, 3); + out[14] = _mm512_extracti64x2_epi64(*t2, 3); + out[15] = _mm512_extracti64x2_epi64(*t3, 3); + +} + __INLINE void TRANSPOSE_4x16_I32_O128_EPI32(__m512i* t0, __m512i* t1, __m512i* t2, __m512i* t3, __m128i p_out[16], __mmask16 mb_mask) { #define STORE_RESULT(OUT, store_mask, loc_mb_mask, Ti) \ diff --git a/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.defs b/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.defs index 4c52c4bb..1faf7a1e 100644 --- a/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.defs +++ b/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.defs @@ -125,3 +125,7 @@ mbx_sm4_ccm_update_aad_mb16 mbx_sm4_ccm_encrypt_mb16 mbx_sm4_ccm_decrypt_mb16 mbx_sm4_ccm_get_tag_mb16 + +mbx_sm4_xts_set_keys_mb16 +mbx_sm4_xts_encrypt_mb16 +mbx_sm4_xts_decrypt_mb16 diff --git a/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.linux.lib-export b/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.linux.lib-export index dc28cc72..5bbea890 100644 --- a/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.linux.lib-export +++ b/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.linux.lib-export @@ -123,3 +123,7 @@ EXTERN (mbx_sm4_ccm_update_aad_mb16) EXTERN (mbx_sm4_ccm_encrypt_mb16) EXTERN (mbx_sm4_ccm_decrypt_mb16) EXTERN (mbx_sm4_ccm_get_tag_mb16) + +EXTERN (mbx_sm4_xts_set_keys_mb16) +EXTERN (mbx_sm4_xts_encrypt_mb16) +EXTERN (mbx_sm4_xts_decrypt_mb16) diff --git a/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.macosx.lib-export b/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.macosx.lib-export index b48e5fba..b090a284 100644 --- a/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.macosx.lib-export +++ b/sources/ippcp/crypto_mb/src/cmake/dll_export/crypto_mb.macosx.lib-export @@ -123,3 +123,7 @@ _mbx_sm4_ccm_update_aad_mb16 _mbx_sm4_ccm_encrypt_mb16 _mbx_sm4_ccm_decrypt_mb16 _mbx_sm4_ccm_get_tag_mb16 + +_mbx_sm4_xts_set_keys_mb16 +_mbx_sm4_xts_encrypt_mb16 +_mbx_sm4_xts_decrypt_mb16 diff --git a/sources/ippcp/crypto_mb/src/sm4/sm4_ecb_mb16.c b/sources/ippcp/crypto_mb/src/sm4/sm4_ecb_mb16.c index d544d431..298c4a84 100644 --- a/sources/ippcp/crypto_mb/src/sm4/sm4_ecb_mb16.c +++ b/sources/ippcp/crypto_mb/src/sm4/sm4_ecb_mb16.c @@ -40,9 +40,7 @@ void sm4_ecb_kernel_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES _mm512_storeu_si512(loc_out, _mm512_loadu_si512(pa_out)); _mm512_storeu_si512(loc_out + 8, _mm512_loadu_si512(pa_out + 8)); - /* Depending on the operation(enc or dec): sign allows to go up and down on the key schedule */ /* p_rk set to the beginning or to the end of the key schedule */ - const int sign = (operation == SM4_ENC) ? 1 : -1; const __m512i* p_rk = (operation == SM4_ENC) ? 
(const __m512i*)key_sched : ((const __m512i*)key_sched + (SM4_ROUNDS - 1)); __ALIGN64 __m512i TMP[20]; @@ -92,8 +90,8 @@ void sm4_ecb_kernel_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); TRANSPOSE_INP_512(TMP[16], TMP[17], TMP[18], TMP[19], TMP[0], TMP[1], TMP[2], TMP[3]); - SM4_KERNEL(TMP, p_rk, sign); - p_rk -= sign*SM4_ROUNDS; + SM4_KERNEL(TMP, p_rk, operation); + p_rk -= operation*SM4_ROUNDS; TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[4], TMP[5], TMP[6], TMP[7]); TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); @@ -149,7 +147,7 @@ void sm4_ecb_kernel_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES /* compute incomplete buffer loading */ sm4_ecb_incomplete_buff_mb16(loc_inp, loc_out, - num_blocks, p_rk, sign, + num_blocks, p_rk, operation, mb_mask, TMP); /* clear local copy of sensitive data */ diff --git a/sources/ippcp/crypto_mb/src/sm4/sm4_setkey_mb16.c b/sources/ippcp/crypto_mb/src/sm4/sm4_setkey_mb16.c index cf2fdea4..e6163321 100644 --- a/sources/ippcp/crypto_mb/src/sm4/sm4_setkey_mb16.c +++ b/sources/ippcp/crypto_mb/src/sm4/sm4_setkey_mb16.c @@ -112,3 +112,40 @@ mbx_status16 mbx_sm4_set_key_mb16(mbx_sm4_key_schedule* key_sched, const sm4_key return status; } + +DLL_PUBLIC +mbx_status16 mbx_sm4_xts_set_keys_mb16(mbx_sm4_key_schedule* key_sched1, + mbx_sm4_key_schedule* key_sched2, + const sm4_xts_key* pa_key[SM4_LINES]) +{ + int buf_no; + mbx_status16 status = 0; + __mmask16 mb_mask = 0xFFFF; + + /* Test input pointers */ + if (NULL == key_sched1 || NULL == key_sched2 || NULL == pa_key) + return MBX_SET_STS16_ALL(MBX_STATUS_NULL_PARAM_ERR); + + /* Don't process buffers with input pointers equal to zero */ + for (buf_no = 0; buf_no < SM4_LINES; buf_no++) { + if (pa_key[buf_no] == NULL) { + status = MBX_SET_STS16(status, buf_no, MBX_STATUS_NULL_PARAM_ERR); + mb_mask &= ~(0x1 << buf_no); + } + } + + if (MBX_IS_ANY_OK_STS16(status)) { + /* Generate round keys for key1 */ + sm4_set_round_keys_mb16((int32u**)key_sched1, (const int8u**)pa_key, mb_mask); + + const sm4_key* pa_key2[SM4_LINES]; + + for (int i = 0; i < SM4_LINES; i++) + pa_key2[i] = (const sm4_key*)&((int8u*)pa_key[i])[16]; + + /* Generate round keys for key2 */ + sm4_set_round_keys_mb16((int32u**)key_sched2, (const int8u**)pa_key2, mb_mask); + } + + return status; +} diff --git a/sources/ippcp/crypto_mb/src/sm4/sm4_xts_dec_mb16.c b/sources/ippcp/crypto_mb/src/sm4/sm4_xts_dec_mb16.c new file mode 100644 index 00000000..9bb03d9b --- /dev/null +++ b/sources/ippcp/crypto_mb/src/sm4/sm4_xts_dec_mb16.c @@ -0,0 +1,59 @@ +/******************************************************************************* +* Copyright (C) 2023 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the 'License'); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an 'AS IS' BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +*******************************************************************************/ + +#include +#include + +#include +#include + +DLL_PUBLIC +mbx_status16 mbx_sm4_xts_decrypt_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES], + const mbx_sm4_key_schedule* key_sched1, + const mbx_sm4_key_schedule* key_sched2, + const int8u* pa_tweak[SM4_LINES]) +{ + unsigned buf_no; + mbx_status16 status = 0; + __mmask16 mb_mask = 0xFFFF; + + /* Test input pointers */ + if (NULL == pa_out || NULL == pa_inp || NULL == len || + NULL == key_sched1 || NULL == key_sched2 || NULL == pa_tweak) + return MBX_SET_STS16_ALL(MBX_STATUS_NULL_PARAM_ERR); + + /* Test input data length and input pointers */ + for (buf_no = 0; buf_no < SM4_LINES; buf_no++) { + if (pa_out[buf_no] == NULL || pa_inp[buf_no] == NULL || pa_tweak[buf_no] == NULL) { + status = MBX_SET_STS16(status, buf_no, MBX_STATUS_NULL_PARAM_ERR); + /* Do not process empty buffers */ + mb_mask &= ~(0x1 << buf_no); + } + if (len[buf_no] < SM4_BLOCK_SIZE) { + status = MBX_SET_STS16(status, buf_no, MBX_STATUS_MISMATCH_PARAM_ERR); + /* Do not process non-valid buffers */ + mb_mask &= ~(0x1 << buf_no); + } + } + + if (MBX_IS_ANY_OK_STS16(status)) + sm4_xts_kernel_mb16(pa_out, (const int8u**)pa_inp, (const int*)len, + (const int32u**)key_sched1, (const int32u**)key_sched2, + pa_tweak, mb_mask, SM4_DEC); + + return status; +} diff --git a/sources/ippcp/crypto_mb/src/sm4/sm4_xts_enc_mb16.c b/sources/ippcp/crypto_mb/src/sm4/sm4_xts_enc_mb16.c new file mode 100644 index 00000000..c15e826a --- /dev/null +++ b/sources/ippcp/crypto_mb/src/sm4/sm4_xts_enc_mb16.c @@ -0,0 +1,59 @@ +/******************************************************************************* +* Copyright (C) 2023 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the 'License'); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an 'AS IS' BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +*******************************************************************************/ + +#include +#include + +#include +#include + +DLL_PUBLIC +mbx_status16 mbx_sm4_xts_encrypt_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], + const int len[SM4_LINES], const mbx_sm4_key_schedule* key_sched1, + const mbx_sm4_key_schedule* key_sched2, + const int8u* pa_tweak[SM4_LINES]) +{ + unsigned buf_no; + mbx_status16 status = 0; + __mmask16 mb_mask = 0xFFFF; + + /* Test input pointers */ + if (NULL == pa_out || NULL == pa_inp || NULL == len || + NULL == key_sched1 || NULL == key_sched2 || NULL == pa_tweak) + return MBX_SET_STS16_ALL(MBX_STATUS_NULL_PARAM_ERR); + + /* Test input data length and input pointers */ + for (buf_no = 0; buf_no < SM4_LINES; buf_no++) { + if (pa_out[buf_no] == NULL || pa_inp[buf_no] == NULL || pa_tweak[buf_no] == NULL) { + status = MBX_SET_STS16(status, buf_no, MBX_STATUS_NULL_PARAM_ERR); + /* Do not process empty buffers */ + mb_mask &= ~(0x1 << buf_no); + } + if (len[buf_no] < SM4_BLOCK_SIZE) { + status = MBX_SET_STS16(status, buf_no, MBX_STATUS_MISMATCH_PARAM_ERR); + /* Do not process non-valid buffers */ + mb_mask &= ~(0x1 << buf_no); + } + } + + if (MBX_IS_ANY_OK_STS16(status)) + sm4_xts_kernel_mb16(pa_out, (const int8u**)pa_inp, (const int*)len, + (const int32u**)key_sched1, (const int32u**)key_sched2, + pa_tweak, mb_mask, SM4_ENC); + + return status; +} diff --git a/sources/ippcp/crypto_mb/src/sm4/sm4_xts_mb16.c b/sources/ippcp/crypto_mb/src/sm4/sm4_xts_mb16.c new file mode 100644 index 00000000..02db4078 --- /dev/null +++ b/sources/ippcp/crypto_mb/src/sm4/sm4_xts_mb16.c @@ -0,0 +1,529 @@ +/******************************************************************************* +* Copyright (C) 2023 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the 'License'); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, +* software distributed under the License is distributed on an 'AS IS' BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions +* and limitations under the License. 
+* +*******************************************************************************/ + +#include +#include + +#define FIRST_TWEAKS 1 +#define NEXT_TWEAKS 0 + +/* Generate the next 4 tweaks for a given buffer */ +static void generate_next_4_tweaks(const __m512i *PREV_TWEAK, __m512i *NEXT_TWEAK, + const __m512i z_shuf_mask, const __m512i z_poly, + const int first_tweaks) +{ + __m512i TMP1, TMP2, TMP3, TMP4; + const __mmask8 xor_mask = _cvtu32_mask8(0xAA); + + TMP1 = _mm512_shuffle_epi8(*PREV_TWEAK, z_shuf_mask); + /* + * In case of the first 4 tweaks, the shifts are variable, + * as we are start from tweak 1 in all 128-bit lanes, to construct + * tweaks 1, 2, 3 and 4 + */ + if (first_tweaks) { + const __m512i z_dq3210 = _mm512_loadu_si512(xts_const_dq3210); + const __m512i z_dq5678 = _mm512_loadu_si512(xts_const_dq5678); + + TMP2 = _mm512_sllv_epi64(*PREV_TWEAK, z_dq3210); + TMP3 = _mm512_srlv_epi64(TMP1, z_dq5678); + /* + * For following tweaks, the shifts are constant, + * as we calculate the next 4 tweaks, parting from tweaks N-4, N-3, N-2 and N, + * to construct tweaks N, N+1, N+2, N+3 + */ + } else { + TMP2 = _mm512_slli_epi64(*PREV_TWEAK, 4); + TMP3 = _mm512_srli_epi64(TMP1, 4); + } + TMP4 = _mm512_clmulepi64_epi128(TMP3, z_poly, 0); + TMP2 = _mm512_mask_xor_epi64(TMP2, xor_mask, TMP2, TMP3); + *NEXT_TWEAK = _mm512_xor_epi32(TMP4, TMP2); +} + +/* Prepare the last tweaks for a given buffer, if it has a partial block */ +static void prepare_last_tweaks(__m512i *TWEAK, __m512i *NEXT_TWEAK, + const int operation, int num_remain_full_blocks) +{ + /* + * For the encryption case, we need to prepare the tweak + * for the partial block to be at the beginning of NEXT_TWEAK, + * so depending on the number of remaining full blocks, its position + * will vary, so the permute mask will be different. In case, there are 4 full blocks, + * the newly generated NEXT_TWEAK will be positioned correctly. + */ + if (operation == SM4_ENC) { + if (num_remain_full_blocks == 1) + *NEXT_TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_next_tweak_permq_enc[0]), *TWEAK); + else if (num_remain_full_blocks == 2) + *NEXT_TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_next_tweak_permq_enc[1*8]), *TWEAK); + else if (num_remain_full_blocks == 3) + *NEXT_TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_next_tweak_permq_enc[2*8]), *TWEAK); + /* + * For the decryption case, it is a bit more complicated. + * In case of a partial block, the last two tweaks (the last tweak of the last full block) + * and the tweak of the last block, need to be interchanged. + * TWEAK will have the tweaks for the last FULL blocks and *NEXT_TWEAK, + * as earlier, will have the tweak for the last partial block. 
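+     * (This is the XTS ciphertext-stealing order on decryption: the last full ciphertext
+     * block is processed with the final tweak, while the block built from the trailing
+     * partial ciphertext uses the preceding tweak.)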
+ */ + } else { + if (num_remain_full_blocks == 1) { + *NEXT_TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_next_tweak_permq[0]), *TWEAK); + *TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_tweak_permq[0]), *TWEAK); + } else if (num_remain_full_blocks == 2) { + *NEXT_TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_next_tweak_permq[1*8]), *TWEAK); + *TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_tweak_permq[1*8]), *TWEAK); + } else if (num_remain_full_blocks == 3) { + *NEXT_TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_next_tweak_permq[2*8]), *TWEAK); + *TWEAK = _mm512_permutexvar_epi64(_mm512_loadu_si512(&xts_tweak_permq[2*8]), *TWEAK); + } else if (num_remain_full_blocks == 4) { + *NEXT_TWEAK = _mm512_permutex2var_epi64(*NEXT_TWEAK, _mm512_loadu_si512(&xts_next_tweak_permq[3*8]), *TWEAK); + *TWEAK = _mm512_permutex2var_epi64(*TWEAK, _mm512_loadu_si512(&xts_tweak_permq[3*8]), *NEXT_TWEAK); + } + } +} + +static void sm4_xts_mask_kernel_mb16(__m512i* NEXT_TWEAK, const __m512i* p_rk, __m512i loc_len32, + const int8u** loc_inp, int8u** loc_out, + __mmask16 mb_mask, const int operation) +{ + __m512i TMP[20]; + const __m512i z_poly = _mm512_loadu_si512(xts_poly); + const __m512i z_partial_block_mask = _mm512_loadu_si512(xts_partial_block_mask); + const __m512i z_full_block_mask = _mm512_loadu_si512(xts_full_block_mask); + const __m512i z_shuf_mask = _mm512_loadu_si512(xts_shuf_mask); + /* Length in bytes of partial blocks for all buffers */ + const __m512i partial_len32 = _mm512_and_si512(loc_len32, z_partial_block_mask); + /* Length in bytes of full blocks for all buffers */ + loc_len32 = _mm512_and_si512(loc_len32, z_full_block_mask); + + __mmask16 ge_64_mask = _mm512_mask_cmp_epi32_mask(mb_mask, loc_len32, _mm512_set1_epi32(4 * SM4_BLOCK_SIZE), _MM_CMPINT_NLT); + __mmask8 ge_64_mask_0_7 = (__mmask8) ge_64_mask; + __mmask8 ge_64_mask_8_15 = (__mmask8) _kshiftri_mask16(ge_64_mask, 8); + /* Expand 32-bit lengths to 64-bit lengths for 16 buffers */ + const __mmask16 expand_mask = _cvtu32_mask16(0x5555); + __m512i remain_len64_0_7 = _mm512_maskz_permutexvar_epi32(expand_mask, _mm512_loadu_si512(xts_dw0_7_to_qw_idx), loc_len32); + __m512i remain_len64_8_15 = _mm512_maskz_permutexvar_epi32(expand_mask, _mm512_loadu_si512(xts_dw8_15_to_qw_idx), loc_len32); + __m512i processed_len64_0_7; + __m512i processed_len64_8_15; + __m512i TWEAK[SM4_LINES]; + __m512i num_remain_full_blocks = _mm512_srli_epi32(loc_len32, 4); + /* Calculate bitmask of buffers with at least one full block */ + __mmask16 tmp_mask = _mm512_mask_cmp_epi32_mask(mb_mask, loc_len32, _mm512_set1_epi32(0), _MM_CMPINT_NLE); + + /* + * While there is at least one full block in any of the buffer, keep encrypting + * (this loop only handles full blocks, but some buffers will have here + * less than 4 full blocks) + */ + while (tmp_mask) { + /* Mask for data loading */ + __mmask64 stream_mask[16]; + int i; + + int* p_loc_len32 = (int*)&loc_len32; + int* p_num_remain_full_blocks = (int*)&num_remain_full_blocks; + int* p_partial_block = (int*)&partial_len32; + + /* Generate tweaks for next rounds */ + for (i = 0; i < SM4_LINES; i++) { + TWEAK[i] = NEXT_TWEAK[i]; + /* + * If there are at least 4 more full blocks to process, + * at least one more tweak will be needed (for more full blocks or + * for a last partial block) + */ + if (p_num_remain_full_blocks[i] >= 4) + generate_next_4_tweaks(&TWEAK[i], &NEXT_TWEAK[i], z_shuf_mask, z_poly, NEXT_TWEAKS); + + /* If there is a partial block, tweaks need 
to be rearranged depending on cipher direction */ + if ((p_partial_block[i] > 0) & (p_num_remain_full_blocks[i] <= 4)) + prepare_last_tweaks(&TWEAK[i], &NEXT_TWEAK[i], operation, p_num_remain_full_blocks[i]); + } + + num_remain_full_blocks = _mm512_sub_epi32(num_remain_full_blocks, _mm512_set1_epi32(4)); + + /* + * XOR plaintext from each lane with the 4 tweaks and transpose to prepare for encryption. + * Since some buffers will have less than 4 full blocks, + * a bitmask is required to load less than 64 bytes (stream_mask) + */ + UPDATE_STREAM_MASK_64(stream_mask[0], p_loc_len32) + TMP[0] = _mm512_xor_si512(TWEAK[0], _mm512_maskz_loadu_epi8(stream_mask[0], loc_inp[0])); + UPDATE_STREAM_MASK_64(stream_mask[1], p_loc_len32) + TMP[1] = _mm512_xor_si512(TWEAK[1], _mm512_maskz_loadu_epi8(stream_mask[1], loc_inp[1])); + UPDATE_STREAM_MASK_64(stream_mask[2], p_loc_len32) + TMP[2] = _mm512_xor_si512(TWEAK[2], _mm512_maskz_loadu_epi8(stream_mask[2], loc_inp[2])); + UPDATE_STREAM_MASK_64(stream_mask[3], p_loc_len32) + TMP[3] = _mm512_xor_si512(TWEAK[3], _mm512_maskz_loadu_epi8(stream_mask[3], loc_inp[3])); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + + TRANSPOSE_INP_512(TMP[4], TMP[5], TMP[6], TMP[7], TMP[0], TMP[1], TMP[2], TMP[3]); + + UPDATE_STREAM_MASK_64(stream_mask[4], p_loc_len32) + TMP[0] = _mm512_xor_si512(TWEAK[4], _mm512_maskz_loadu_epi8(stream_mask[4], loc_inp[4])); + UPDATE_STREAM_MASK_64(stream_mask[5], p_loc_len32) + TMP[1] = _mm512_xor_si512(TWEAK[5], _mm512_maskz_loadu_epi8(stream_mask[5], loc_inp[5])); + UPDATE_STREAM_MASK_64(stream_mask[6], p_loc_len32) + TMP[2] = _mm512_xor_si512(TWEAK[6], _mm512_maskz_loadu_epi8(stream_mask[6], loc_inp[6])); + UPDATE_STREAM_MASK_64(stream_mask[7], p_loc_len32) + TMP[3] = _mm512_xor_si512(TWEAK[7], _mm512_maskz_loadu_epi8(stream_mask[7], loc_inp[7])); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + + TRANSPOSE_INP_512(TMP[8], TMP[9], TMP[10], TMP[11], TMP[0], TMP[1], TMP[2], TMP[3]); + + UPDATE_STREAM_MASK_64(stream_mask[8], p_loc_len32) + TMP[0] = _mm512_xor_si512(TWEAK[8], _mm512_maskz_loadu_epi8(stream_mask[8], loc_inp[8])); + UPDATE_STREAM_MASK_64(stream_mask[9], p_loc_len32) + TMP[1] = _mm512_xor_si512(TWEAK[9], _mm512_maskz_loadu_epi8(stream_mask[9], loc_inp[9])); + UPDATE_STREAM_MASK_64(stream_mask[10], p_loc_len32) + TMP[2] = _mm512_xor_si512(TWEAK[10], _mm512_maskz_loadu_epi8(stream_mask[10], loc_inp[10])); + UPDATE_STREAM_MASK_64(stream_mask[11], p_loc_len32) + TMP[3] = _mm512_xor_si512(TWEAK[11], _mm512_maskz_loadu_epi8(stream_mask[11], loc_inp[11])); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + + TRANSPOSE_INP_512(TMP[12], TMP[13], TMP[14], TMP[15], TMP[0], TMP[1], TMP[2], TMP[3]); + + UPDATE_STREAM_MASK_64(stream_mask[12], p_loc_len32) + TMP[0] = _mm512_xor_si512(TWEAK[12], _mm512_maskz_loadu_epi8(stream_mask[12], loc_inp[12])); + UPDATE_STREAM_MASK_64(stream_mask[13], p_loc_len32) + TMP[1] = _mm512_xor_si512(TWEAK[13], _mm512_maskz_loadu_epi8(stream_mask[13], loc_inp[13])); + 
UPDATE_STREAM_MASK_64(stream_mask[14], p_loc_len32) + TMP[2] = _mm512_xor_si512(TWEAK[14], _mm512_maskz_loadu_epi8(stream_mask[14], loc_inp[14])); + UPDATE_STREAM_MASK_64(stream_mask[15], p_loc_len32) + TMP[3] = _mm512_xor_si512(TWEAK[15], _mm512_maskz_loadu_epi8(stream_mask[15], loc_inp[15])); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + + TRANSPOSE_INP_512(TMP[16], TMP[17], TMP[18], TMP[19], TMP[0], TMP[1], TMP[2], TMP[3]); + + SM4_KERNEL(TMP, p_rk, operation); + p_rk -= operation*SM4_ROUNDS; + + /* Transpose, XOR with the tweaks again and write data out */ + TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[4], TMP[5], TMP[6], TMP[7]); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + _mm512_mask_storeu_epi8((__m512i*)loc_out[0], stream_mask[0], _mm512_xor_si512(TMP[0], TWEAK[0])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[1], stream_mask[1], _mm512_xor_si512(TMP[1], TWEAK[1])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[2], stream_mask[2], _mm512_xor_si512(TMP[2], TWEAK[2])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[3], stream_mask[3], _mm512_xor_si512(TMP[3], TWEAK[3])); + + TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[8], TMP[9], TMP[10], TMP[11]); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + _mm512_mask_storeu_epi8((__m512i*)loc_out[4], stream_mask[4], _mm512_xor_si512(TMP[0], TWEAK[4])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[5], stream_mask[5], _mm512_xor_si512(TMP[1], TWEAK[5])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[6], stream_mask[6], _mm512_xor_si512(TMP[2], TWEAK[6])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[7], stream_mask[7], _mm512_xor_si512(TMP[3], TWEAK[7])); + + TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[12], TMP[13], TMP[14], TMP[15]); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + _mm512_mask_storeu_epi8((__m512i*)loc_out[8], stream_mask[8], _mm512_xor_si512(TMP[0], TWEAK[8])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[9], stream_mask[9], _mm512_xor_si512(TMP[1], TWEAK[9])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[10], stream_mask[10], _mm512_xor_si512(TMP[2],TWEAK[10])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[11], stream_mask[11], _mm512_xor_si512(TMP[3],TWEAK[11])); + + TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[16], TMP[17], TMP[18], TMP[19]); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + _mm512_mask_storeu_epi8((__m512i*)loc_out[12], stream_mask[12], _mm512_xor_si512(TMP[0], TWEAK[12])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[13], stream_mask[13], _mm512_xor_si512(TMP[1], TWEAK[13])); + _mm512_mask_storeu_epi8((__m512i*)loc_out[14], stream_mask[14], _mm512_xor_si512(TMP[2], TWEAK[14])); + 
_mm512_mask_storeu_epi8((__m512i*)loc_out[15], stream_mask[15], _mm512_xor_si512(TMP[3], TWEAK[15]));
+
+ /* Update input/output pointers to data */
+ processed_len64_0_7 = _mm512_mask_blend_epi64(ge_64_mask_0_7, remain_len64_0_7, _mm512_set1_epi64(4 * SM4_BLOCK_SIZE));
+ processed_len64_8_15 = _mm512_mask_blend_epi64(ge_64_mask_8_15, remain_len64_8_15, _mm512_set1_epi64(4 * SM4_BLOCK_SIZE));
+ M512(loc_inp) = _mm512_add_epi64(_mm512_loadu_si512(loc_inp), processed_len64_0_7);
+ M512(loc_inp + 8) = _mm512_add_epi64(_mm512_loadu_si512(loc_inp + 8), processed_len64_8_15);
+
+ M512(loc_out) = _mm512_add_epi64(_mm512_loadu_si512(loc_out), processed_len64_0_7);
+ M512(loc_out + 8) = _mm512_add_epi64(_mm512_loadu_si512(loc_out + 8), processed_len64_8_15);
+
+ /* Update number of blocks left and processing mask */
+ remain_len64_0_7 = _mm512_sub_epi64(remain_len64_0_7, processed_len64_0_7);
+ remain_len64_8_15 = _mm512_sub_epi64(remain_len64_8_15, processed_len64_8_15);
+ loc_len32 = _mm512_sub_epi32(loc_len32, _mm512_set1_epi32(4 * SM4_BLOCK_SIZE));
+ tmp_mask = _mm512_mask_cmp_epi32_mask(mb_mask, loc_len32, _mm512_set1_epi32(0), _MM_CMPINT_NLE);
+ ge_64_mask_0_7 = _mm512_cmp_epi64_mask(remain_len64_0_7, _mm512_set1_epi64(4 * SM4_BLOCK_SIZE), _MM_CMPINT_NLT);
+ ge_64_mask_8_15 = _mm512_cmp_epi64_mask(remain_len64_8_15, _mm512_set1_epi64(4 * SM4_BLOCK_SIZE), _MM_CMPINT_NLT);
+ }
+
+ /* At this stage, all buffers have at most 15 bytes (a partial block) */
+
+ /* Calculate bitmask of buffers with a partial block */
+ tmp_mask = _mm512_mask_cmp_epi32_mask(mb_mask, partial_len32, _mm512_set1_epi32(0), _MM_CMPINT_NLE);
+
+ if (tmp_mask) {
+ /* Encrypt last plaintext using bytes from previous ciphertext block */
+ __mmask64 stream_mask[16];
+ int* p_loc_len32 = (int*)&partial_len32;
+ __m128i XTMP[16];
+ int i;
+
+ for (i = 0; i < SM4_LINES; i++) {
+ /* Get the right tweak (position tweak in last 16 bytes of ZMM register) */
+ UPDATE_STREAM_MASK_64(stream_mask[i], p_loc_len32);
+ /* Read final bytes of input partial block */
+ XTMP[i] = _mm_maskz_loadu_epi8((__mmask16)stream_mask[i], loc_inp[i]);
+ /*
+ * Read last bytes of previous output block to form 16 bytes
+ * (only if there is a partial block at the end of the buffer)
+ */
+ if (stream_mask[i] == 0)
+ continue;
+ __m128i XOUT = _mm_maskz_loadu_epi8((__mmask16)~stream_mask[i], (loc_out[i] - 16));
+ XTMP[i] = _mm_or_si128(XTMP[i], XOUT);
+ /* Initial XOR of the newly constructed input with the tweak */
+ XTMP[i] = _mm_xor_si128(XTMP[i], _mm512_castsi512_si128(NEXT_TWEAK[i]));
+ }
+
+ /* Encrypt final block from all lanes, compressing the 16 XMMs into 4 ZMMs */
+ TRANSPOSE_16x4_I32_XMM_EPI32(&TMP[0], &TMP[1], &TMP[2], &TMP[3], XTMP);
+ for (i = 0; i < SM4_ROUNDS; i += 4, p_rk += 4*operation)
+ SM4_FOUR_ROUNDS(TMP[0], TMP[1], TMP[2], TMP[3], TMP[4], p_rk, operation);
+
+ p_rk -= operation*SM4_ROUNDS;
+
+ /* Spread out the 4 ZMMs into 16 XMMs */
+ TRANSPOSE_4x16_I32_XMM_EPI32(&TMP[0], &TMP[1], &TMP[2], &TMP[3], XTMP);
+ for (i = 0; i < SM4_LINES; i++) {
+ /* Skip the buffer if there is no partial block left */
+ if (stream_mask[i] == 0)
+ continue;
+ /*
+ * Final XOR of output with tweak (it will always be
+ * at the beginning of NEXT_TWEAK, hence the cast)
+ */
+ XTMP[i] = _mm_xor_si128(XTMP[i], _mm512_castsi512_si128(NEXT_TWEAK[i]));
+ /* Write first bytes of previous output block as the output of the partial block */
+ __m128i XOUT = _mm_maskz_loadu_epi8((__mmask16)stream_mask[i], (loc_out[i] - 16));
+ _mm_mask_storeu_epi8(loc_out[i], (__mmask16)stream_mask[i], XOUT);
+ /* Write last output as the output of the previous block */
+ _mm_storeu_si128((__m128i*)(loc_out[i] - 16), XTMP[i]);
+ }
+ }
+ /* clear local copy of sensitive data */
+ zero_mb8((int64u(*)[8])TMP, sizeof(TMP) / sizeof(TMP[0]));
+}
+
+void sm4_xts_kernel_mb16(int8u* pa_out[SM4_LINES], const int8u* pa_inp[SM4_LINES], const int len[SM4_LINES],
+ const int32u* key_sched1[SM4_ROUNDS], const int32u* key_sched2[SM4_ROUNDS],
+ const int8u* pa_tweak[SM4_LINES], __mmask16 mb_mask, const int operation)
+{
+ __ALIGN64 const int8u* loc_inp[SM4_LINES];
+ __ALIGN64 int8u* loc_out[SM4_LINES];
+
+ /* Create the local copy of the input data length in bytes and set it to zero for invalid buffers */
+ __m512i loc_len;
+ loc_len = _mm512_loadu_si512(len);
+ loc_len = _mm512_mask_set1_epi32(loc_len, ~mb_mask, 0);
+
+ /* Local copies of the pointers to input and output buffers */
+ _mm512_storeu_si512((void*)loc_inp, _mm512_loadu_si512(pa_inp));
+ _mm512_storeu_si512((void*)(loc_inp + 8), _mm512_loadu_si512(pa_inp + 8));
+
+ _mm512_storeu_si512(loc_out, _mm512_loadu_si512(pa_out));
+ _mm512_storeu_si512(loc_out + 8, _mm512_loadu_si512(pa_out + 8));
+
+ /* Depending on the operation (enc or dec), the sign determines whether we walk up or down the key schedule;
+ * p_rk1 is set to the beginning or to the end of the key schedule accordingly */
+ const __m512i* p_rk1 = (operation == SM4_ENC) ? (const __m512i*)key_sched1 : ((const __m512i*)key_sched1 + (SM4_ROUNDS - 1));
+ /* Pointer p_rk2 is set to the beginning of the key schedule,
+ * as it always encrypts the tweak, regardless of the direction */
+ const __m512i* p_rk2 = (const __m512i*)key_sched2;
+
+ /* TMP[] - temporary buffer for processing */
+ /* TWEAK - tweak values for current blocks (4 blocks per buffer) */
+ /* NEXT_TWEAK - tweak values for following blocks (4 blocks per buffer) */
+ /* initial_tweak - first tweak for all buffers */
+ __m512i TMP[20];
+ __m512i TWEAK[SM4_LINES];
+ __m512i NEXT_TWEAK[SM4_LINES];
+ __m128i initial_tweak[SM4_LINES];
+ int i;
+
+ const __m512i z_poly = _mm512_loadu_si512(xts_poly);
+ const __m512i z_shuf_mask = _mm512_loadu_si512(xts_shuf_mask);
+
+ /* Encrypt initial tweak */
+ TRANSPOSE_16x4_I32_EPI32(&TMP[0], &TMP[1], &TMP[2], &TMP[3], pa_tweak, mb_mask);
+
+ for (i = 0; i < SM4_ROUNDS; i += 4, p_rk2 += 4)
+ SM4_FOUR_ROUNDS(TMP[0], TMP[1], TMP[2], TMP[3], TMP[4], p_rk2, SM4_ENC);
+
+ p_rk2 -= SM4_ROUNDS;
+
+ TRANSPOSE_4x16_I32_O128_EPI32(&TMP[0], &TMP[1], &TMP[2], &TMP[3], initial_tweak, mb_mask);
+
+ /* Load TWEAK value from valid buffers and generate first 4 values */
+ for (i = 0; i < SM4_LINES; i++) {
+ TWEAK[i] = _mm512_broadcast_i64x2(initial_tweak[i]);
+ generate_next_4_tweaks(&TWEAK[i], &NEXT_TWEAK[i], z_shuf_mask, z_poly, FIRST_TWEAKS);
+ }
+
+ /*
+ * Generate the mask to process 4 full blocks from each buffer.
+ * Buffers with fewer than 5 full blocks are left to sm4_xts_mask_kernel_mb16,
+ * as it is the function that can handle partial blocks.
+ */ + __mmask16 tmp_mask = _mm512_mask_cmp_epi32_mask(mb_mask, loc_len, _mm512_set1_epi32(5 * SM4_BLOCK_SIZE), _MM_CMPINT_NLT); + + /* Go to this loop if all 16 buffers contain at least 5 full blocks each */ + while (tmp_mask == 0xFFFF) { + for (i = 0; i < SM4_LINES; i++) { + TWEAK[i] = NEXT_TWEAK[i]; + + /* Update tweaks for next rounds */ + generate_next_4_tweaks(&TWEAK[i], &NEXT_TWEAK[i], z_shuf_mask, z_poly, NEXT_TWEAKS); + } + + /* XOR plaintext from each lane with the 4 tweaks and transpose to prepare for encryption */ + TMP[0] = _mm512_xor_si512(TWEAK[0], _mm512_loadu_si512(loc_inp[0])); + TMP[1] = _mm512_xor_si512(TWEAK[1], _mm512_loadu_si512(loc_inp[1])); + TMP[2] = _mm512_xor_si512(TWEAK[2], _mm512_loadu_si512(loc_inp[2])); + TMP[3] = _mm512_xor_si512(TWEAK[3], _mm512_loadu_si512(loc_inp[3])); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + + TRANSPOSE_INP_512(TMP[4], TMP[5], TMP[6], TMP[7], TMP[0], TMP[1], TMP[2], TMP[3]); + + TMP[0] = _mm512_xor_si512(TWEAK[4], _mm512_loadu_si512(loc_inp[4])); + TMP[1] = _mm512_xor_si512(TWEAK[5], _mm512_loadu_si512(loc_inp[5])); + TMP[2] = _mm512_xor_si512(TWEAK[6], _mm512_loadu_si512(loc_inp[6])); + TMP[3] = _mm512_xor_si512(TWEAK[7], _mm512_loadu_si512(loc_inp[7])); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + + TRANSPOSE_INP_512(TMP[8], TMP[9], TMP[10], TMP[11], TMP[0], TMP[1], TMP[2], TMP[3]); + + TMP[0] = _mm512_xor_si512(TWEAK[8], _mm512_loadu_si512(loc_inp[8])); + TMP[1] = _mm512_xor_si512(TWEAK[9], _mm512_loadu_si512(loc_inp[9])); + TMP[2] = _mm512_xor_si512(TWEAK[10], _mm512_loadu_si512(loc_inp[10])); + TMP[3] = _mm512_xor_si512(TWEAK[11], _mm512_loadu_si512(loc_inp[11])); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + + TRANSPOSE_INP_512(TMP[12], TMP[13], TMP[14], TMP[15], TMP[0], TMP[1], TMP[2], TMP[3]); + + TMP[0] = _mm512_xor_si512(TWEAK[12], _mm512_loadu_si512(loc_inp[12])); + TMP[1] = _mm512_xor_si512(TWEAK[13], _mm512_loadu_si512(loc_inp[13])); + TMP[2] = _mm512_xor_si512(TWEAK[14], _mm512_loadu_si512(loc_inp[14])); + TMP[3] = _mm512_xor_si512(TWEAK[15], _mm512_loadu_si512(loc_inp[15])); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + + TRANSPOSE_INP_512(TMP[16], TMP[17], TMP[18], TMP[19], TMP[0], TMP[1], TMP[2], TMP[3]); + + SM4_KERNEL(TMP, p_rk1, operation); + p_rk1 -= operation*SM4_ROUNDS; + + /* Transpose, XOR with the tweaks again and write data out */ + TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[4], TMP[5], TMP[6], TMP[7]); + TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes)); + TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes)); + TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes)); + TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes)); + _mm512_storeu_si512((__m512i*)loc_out[0], _mm512_xor_si512(TMP[0], TWEAK[0])); + _mm512_storeu_si512((__m512i*)loc_out[1], 
_mm512_xor_si512(TMP[1], TWEAK[1]));
+ _mm512_storeu_si512((__m512i*)loc_out[2], _mm512_xor_si512(TMP[2], TWEAK[2]));
+ _mm512_storeu_si512((__m512i*)loc_out[3], _mm512_xor_si512(TMP[3], TWEAK[3]));
+
+ TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[8], TMP[9], TMP[10], TMP[11]);
+ TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes));
+ TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes));
+ TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes));
+ TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes));
+ _mm512_storeu_si512((__m512i*)loc_out[4], _mm512_xor_si512(TMP[0], TWEAK[4]));
+ _mm512_storeu_si512((__m512i*)loc_out[5], _mm512_xor_si512(TMP[1], TWEAK[5]));
+ _mm512_storeu_si512((__m512i*)loc_out[6], _mm512_xor_si512(TMP[2], TWEAK[6]));
+ _mm512_storeu_si512((__m512i*)loc_out[7], _mm512_xor_si512(TMP[3], TWEAK[7]));
+
+ TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[12], TMP[13], TMP[14], TMP[15]);
+ TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes));
+ TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes));
+ TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes));
+ TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes));
+ _mm512_storeu_si512((__m512i*)loc_out[8], _mm512_xor_si512(TMP[0], TWEAK[8]));
+ _mm512_storeu_si512((__m512i*)loc_out[9], _mm512_xor_si512(TMP[1], TWEAK[9]));
+ _mm512_storeu_si512((__m512i*)loc_out[10], _mm512_xor_si512(TMP[2], TWEAK[10]));
+ _mm512_storeu_si512((__m512i*)loc_out[11], _mm512_xor_si512(TMP[3], TWEAK[11]));
+
+ TRANSPOSE_OUT_512(TMP[0], TMP[1], TMP[2], TMP[3], TMP[16], TMP[17], TMP[18], TMP[19]);
+ TMP[0] = _mm512_shuffle_epi8(TMP[0], M512(swapBytes));
+ TMP[1] = _mm512_shuffle_epi8(TMP[1], M512(swapBytes));
+ TMP[2] = _mm512_shuffle_epi8(TMP[2], M512(swapBytes));
+ TMP[3] = _mm512_shuffle_epi8(TMP[3], M512(swapBytes));
+ _mm512_storeu_si512((__m512i*)loc_out[12], _mm512_xor_si512(TMP[0], TWEAK[12]));
+ _mm512_storeu_si512((__m512i*)loc_out[13], _mm512_xor_si512(TMP[1], TWEAK[13]));
+ _mm512_storeu_si512((__m512i*)loc_out[14], _mm512_xor_si512(TMP[2], TWEAK[14]));
+ _mm512_storeu_si512((__m512i*)loc_out[15], _mm512_xor_si512(TMP[3], TWEAK[15]));
+
+ /* Update input/output pointers to data */
+ M512(loc_inp) = _mm512_add_epi64(_mm512_loadu_si512(loc_inp), _mm512_set1_epi64(4 * SM4_BLOCK_SIZE));
+ M512(loc_inp + 8) = _mm512_add_epi64(_mm512_loadu_si512(loc_inp + 8), _mm512_set1_epi64(4 * SM4_BLOCK_SIZE));
+
+ M512(loc_out) = _mm512_add_epi64(_mm512_loadu_si512(loc_out), _mm512_set1_epi64(4 * SM4_BLOCK_SIZE));
+ M512(loc_out + 8) = _mm512_add_epi64(_mm512_loadu_si512(loc_out + 8), _mm512_set1_epi64(4 * SM4_BLOCK_SIZE));
+
+ /* Update number of blocks left and processing mask */
+ loc_len = _mm512_sub_epi32(loc_len, _mm512_set1_epi32(4 * SM4_BLOCK_SIZE));
+ tmp_mask = _mm512_mask_cmp_epi32_mask(mb_mask, loc_len, _mm512_set1_epi32(5 * SM4_BLOCK_SIZE), _MM_CMPINT_NLT);
+ }
+
+ /* Check if we have any data left in any of the buffers */
+ tmp_mask = _mm512_mask_cmp_epi32_mask(mb_mask, loc_len, _mm512_setzero_si512(), _MM_CMPINT_NLE);
+ /*
+ * At this point, at least one buffer has fewer than 5 full blocks,
+ * so dealing with a partial block might be needed.
+ */ + if (tmp_mask) + sm4_xts_mask_kernel_mb16(NEXT_TWEAK, p_rk1, loc_len, loc_inp, loc_out, mb_mask, operation); + + /* clear local copy of sensitive data */ + zero_mb8((int64u(*)[8])TMP, sizeof(TMP) / sizeof(TMP[0])); + zero_mb8((int64u(*)[8])TWEAK, sizeof(TWEAK) / sizeof(TWEAK[0])); +} diff --git a/tools/ipp_custom_library_tool_python/requirements.txt b/tools/ipp_custom_library_tool_python/requirements.txt new file mode 100644 index 00000000..be36e373 --- /dev/null +++ b/tools/ipp_custom_library_tool_python/requirements.txt @@ -0,0 +1,2 @@ +PyQt5==5.15.9 +
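As a side note on the tweak arithmetic used by the SM4-XTS kernels above: generate_next_4_tweaks()/xts_poly implement the standard XTS tweak update, i.e. multiplication of the 128-bit tweak by the primitive element x of GF(2^128) modulo x^128 + x^7 + x^2 + x + 1 (reduction constant 0x87). A minimal scalar sketch of a single-block update is shown below for reference only; the function name and byte-wise loop are illustrative and are not taken from the library sources.

    #include <stdint.h>

    /* Illustrative sketch (not library code): advance one 16-byte XTS tweak to
     * the next block, i.e. multiply it by x in GF(2^128) and reduce it modulo
     * x^128 + x^7 + x^2 + x + 1. The AVX-512 kernels above perform four such
     * multiplications per buffer, across 16 buffers at a time. */
    static void xts_next_tweak(uint8_t tweak[16])
    {
        uint8_t carry = 0;
        for (int i = 0; i < 16; i++) {
            uint8_t next_carry = (uint8_t)(tweak[i] >> 7); /* bit shifted out of this byte */
            tweak[i] = (uint8_t)((tweak[i] << 1) | carry);
            carry = next_carry;
        }
        if (carry)
            tweak[0] ^= 0x87; /* fold the overflow back per the XTS polynomial */
    }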