diff --git a/Common/include/CConfig.hpp b/Common/include/CConfig.hpp index 16ed7cadc19..f76b9225491 100644 --- a/Common/include/CConfig.hpp +++ b/Common/include/CConfig.hpp @@ -43,8 +43,8 @@ #include #include -#include "./option_structure.hpp" -#include "./toolboxes/C2DContainer.hpp" +#include "option_structure.hpp" +#include "containers/container_decorators.hpp" #ifdef HAVE_CGNS #include "cgnslib.h" @@ -419,6 +419,7 @@ class CConfig { su2double *RK_Alpha_Step; /*!< \brief Runge-Kutta beta coefficients. */ unsigned short nQuasiNewtonSamples; /*!< \brief Number of samples used in quasi-Newton solution methods. */ + bool UseVectorization; /*!< \brief Whether to use vectorized numerics schemes. */ unsigned short nMGLevels; /*!< \brief Number of multigrid levels (coarse levels). */ unsigned short nCFL; /*!< \brief Number of CFL, one for each multigrid level. */ @@ -591,10 +592,10 @@ class CConfig { *Kappa_AdjFlow, /*!< \brief Numerical dissipation coefficients for the adjoint flow equations. */ *Kappa_Heat; /*!< \brief Numerical dissipation coefficients for the (fvm) heat equation. */ su2double* FFD_Axis; /*!< \brief Numerical dissipation coefficients for the adjoint equations. */ - su2double Kappa_1st_AdjFlow, /*!< \brief JST 1st order dissipation coefficient for adjoint flow equations (coarse multigrid levels). */ + su2double Kappa_1st_AdjFlow, /*!< \brief Lax 1st order dissipation coefficient for adjoint flow equations (coarse multigrid levels). */ Kappa_2nd_AdjFlow, /*!< \brief JST 2nd order dissipation coefficient for adjoint flow equations. */ Kappa_4th_AdjFlow, /*!< \brief JST 4th order dissipation coefficient for adjoint flow equations. */ - Kappa_1st_Flow, /*!< \brief JST 1st order dissipation coefficient for flow equations (coarse multigrid levels). */ + Kappa_1st_Flow, /*!< \brief Lax 1st order dissipation coefficient for flow equations (coarse multigrid levels). */ Kappa_2nd_Flow, /*!< \brief JST 2nd order dissipation coefficient for flow equations. */ Kappa_4th_Flow, /*!< \brief JST 4th order dissipation coefficient for flow equations. */ Kappa_2nd_Heat, /*!< \brief 2nd order dissipation coefficient for heat equation. */ @@ -1164,7 +1165,7 @@ class CConfig { ionization; /*!< \brief Flag for determining if free electron gas is in the mixture. */ string GasModel, /*!< \brief Gas Model. */ *Wall_Catalytic; /*!< \brief Pointer to catalytic walls. */ - + /*! * \brief Set the default values of config options not set in the config file using another config object. * \param config - Config object to use the default values from. @@ -4114,6 +4115,11 @@ class CConfig { */ unsigned short GetnQuasiNewtonSamples(void) const { return nQuasiNewtonSamples; } + /*! + * \brief Get whether to use vectorized numerics (if available). + */ + bool GetUseVectorization(void) const { return UseVectorization; } + /*! * \brief Get the relaxation coefficient of the linear solver for the implicit formulation. * \return relaxation coefficient of the linear solver for the implicit formulation. @@ -4509,7 +4515,7 @@ class CConfig { * during the computation. * \return Kind of center convective numerical scheme for the flow equations. */ - unsigned short GetKind_Centered_Flow(void) const { return Kind_Centered_Flow; } + ENUM_CENTERED GetKind_Centered_Flow(void) const { return static_cast(Kind_Centered_Flow); } /*! * \brief Get the kind of center convective numerical scheme for the plasma equations. diff --git a/Common/include/CMultiGridQueue.hpp b/Common/include/CMultiGridQueue.hpp index 42f14a598c1..369685d86c5 100644 --- a/Common/include/CMultiGridQueue.hpp +++ b/Common/include/CMultiGridQueue.hpp @@ -29,7 +29,7 @@ #pragma once #include -#include "toolboxes/CFastFindAndEraseQueue.hpp" +#include "containers/CFastFindAndEraseQueue.hpp" #include "geometry/CGeometry.hpp" using namespace std; diff --git a/Common/include/basic_types/ad_structure.hpp b/Common/include/basic_types/ad_structure.hpp index acf46861300..f38d54876da 100644 --- a/Common/include/basic_types/ad_structure.hpp +++ b/Common/include/basic_types/ad_structure.hpp @@ -353,8 +353,7 @@ namespace AD{ /*--- Base case for parameter pack expansion. ---*/ FORCEINLINE void SetPreaccIn() {} - template::value,bool>::type = 0> + template::value> = 0> FORCEINLINE void SetPreaccIn(const T& data, Ts&&... moreData) { if (!PreaccActive) return; if (data.isActive()) @@ -385,6 +384,18 @@ namespace AD{ } } + template + FORCEINLINE void SetPreaccIn(const T& data, const int size_x, const int size_y, const int size_z) { + if (!PreaccActive) return; + for (int i = 0; i < size_x; i++) { + for (int j = 0; j < size_y; j++) { + for (int k = 0; k < size_z; k++) { + if (data[i][j][k].isActive()) PreaccHelper.addInput(data[i][j][k]); + } + } + } + } + FORCEINLINE void StartPreacc() { if (globalTape.isActive() && PreaccEnabled) { PreaccHelper.start(); @@ -395,8 +406,7 @@ namespace AD{ /*--- Base case for parameter pack expansion. ---*/ FORCEINLINE void SetPreaccOut() {} - template::value,bool>::type = 0> + template::value> = 0> FORCEINLINE void SetPreaccOut(T& data, Ts&&... moreData) { if (!PreaccActive) return; if (data.isActive()) diff --git a/Common/include/basic_types/datatype_structure.hpp b/Common/include/basic_types/datatype_structure.hpp index a6b94544319..00f5a7a31bb 100644 --- a/Common/include/basic_types/datatype_structure.hpp +++ b/Common/include/basic_types/datatype_structure.hpp @@ -40,6 +40,17 @@ #define FORCEINLINE inline #endif +#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER) +#define NEVERINLINE inline __attribute__((noinline)) +#else +#define NEVERINLINE inline +#endif + +/*--- Convenience SFINAE typedef to conditionally + * enable/disable function template overloads. ---*/ +template +using su2enable_if = typename std::enable_if::type; + /*--- Depending on the datatype defined during the configuration, * include the correct definition, and create the main typedef. ---*/ @@ -194,10 +205,10 @@ namespace SU2_TYPE { /*--- Special handling of the sprintf routine for non-primitive types. ---*/ /*--- Pass-through for built-in types. ---*/ - template::value,bool>::type = 0> + template::value> = 0> FORCEINLINE const T& _printGetValue(const T& val) {return val;} /*--- Overload for expressions of active types. ---*/ - template::value,bool>::type = 0> + template::value> = 0> FORCEINLINE passivedouble _printGetValue(const T& val) { return val.getValue(); } /*! diff --git a/Common/include/toolboxes/C2DContainer.hpp b/Common/include/containers/C2DContainer.hpp similarity index 78% rename from Common/include/toolboxes/C2DContainer.hpp rename to Common/include/containers/C2DContainer.hpp index e14bef89365..fb3d71fc7ff 100644 --- a/Common/include/toolboxes/C2DContainer.hpp +++ b/Common/include/containers/C2DContainer.hpp @@ -27,8 +27,9 @@ #pragma once -#include "allocation_toolbox.hpp" +#include "../toolboxes/allocation_toolbox.hpp" #include "../basic_types/datatype_structure.hpp" +#include "../parallelization/vectorization.hpp" #include #include @@ -134,7 +135,9 @@ class AccessorImpl #define UNIV_ACCESSORS \ bool empty() const noexcept {return size()==0;} \ Scalar_t* data() noexcept {return m_data;} \ - const Scalar_t* data() const noexcept {return m_data;} + const Scalar_t* data() const noexcept {return m_data;} \ + const Scalar_t* begin() const noexcept {return data();} \ + const Scalar_t* end() const noexcept {return data()+size();} /*! * Operator (,) gives pointwise access, operator [] returns a pointer to the @@ -177,8 +180,7 @@ class AccessorImpl } /*! - * Vectors do not provide operator [] as it is redundant - * since operator () already returns by reference. + * Vectors provide both [] and () with the same behavior. */ #define VECTOR_ACCESSORS(M,ROWMAJOR) \ UNIV_ACCESSORS \ @@ -193,6 +195,18 @@ class AccessorImpl } \ \ const Scalar_t& operator() (const Index_t i) const noexcept \ + { \ + assert(i>=0 && i=0 && i=0 && i + class CInnerIterGather { + private: + static_assert(std::is_integral::value,""); + enum {Size = IndexSIMD_t::Size}; + IndexSIMD_t m_offsets; + const Index m_increment; + const Scalar* const m_data; + public: + CInnerIterGather() = delete; + + FORCEINLINE CInnerIterGather(const Scalar* data, Index increment, IndexSIMD_t offsets) noexcept : + m_offsets(offsets), + m_increment(increment), + m_data(data) { + } + + FORCEINLINE simd::Array operator* () const noexcept { + return simd::Array(m_data, m_offsets); + } + + FORCEINLINE CInnerIterGather operator++(int) noexcept { + auto ret = *this; m_offsets += m_increment; return ret; + } + }; private: /*! @@ -379,11 +451,10 @@ class C2DContainer : size_t m_resize(Index_t rows, Index_t cols) noexcept { /*--- fully static, no allocation needed ---*/ - if(StaticRows!=DynamicSize && StaticCols!=DynamicSize) - return StaticRows*StaticCols; + if(StaticSize!=DynamicSize) return StaticSize; /*--- dynamic row vector, swap size specification ---*/ - if(StaticRows==1 && StaticCols==DynamicSize) {cols = rows; rows = 1;} + if(StaticRows==1 && IsVector) {cols = rows; rows = 1;} /*--- assert a static size is not being asked to change ---*/ if(StaticRows!=DynamicSize) assert(rows==StaticRows && "A static size was asked to change."); @@ -483,148 +554,69 @@ class C2DContainer : { for(size_t i=0; i using su2vector = C2DContainer; -template using su2matrix = C2DContainer; - -using su2activevector = su2vector; -using su2activematrix = su2matrix; - -using su2passivevector = su2vector; -using su2passivematrix = su2matrix; - -/*! - * \class CVectorOfMatrix - * \brief This contrived container is used to store small matrices in a contiguous manner - * but still present the "su2double**" interface to the outside world. - * The "interface" part should be replaced by something more efficient, e.g. a "matrix view". - */ -struct CVectorOfMatrix { - su2activevector storage; - su2matrix interface; - unsigned long M, N; - - CVectorOfMatrix() = default; - - CVectorOfMatrix(unsigned long length, unsigned long rows, unsigned long cols, su2double value = 0.0) { - resize(length, rows, cols, value); - } - - void resize(unsigned long length, unsigned long rows, unsigned long cols, su2double value = 0.0) { - M = rows; - N = cols; - storage.resize(length*rows*cols) = value; - interface.resize(length,rows); - - for(unsigned long i=0; i -struct C2DDummyLastView -{ - using Index = typename T::Index; - using Scalar = typename T::Scalar; - - T& data; - - C2DDummyLastView() = delete; - - C2DDummyLastView(T& ref) : data(ref) {} - - template::value, bool>::type = 0> - Scalar& operator() (Index i, Index) noexcept + /*! + * \brief Get a scalar iterator to the inner dimension of the container. + */ + FORCEINLINE CInnerIter innerIter(Index_t row) const noexcept { - return data(i); + return CInnerIter(&m_data[IsRowMajor? row*cols() : row], IsRowMajor? 1 : rows()); } - const Scalar& operator() (Index i, Index) const noexcept + /*! + * \brief Get a SIMD gather iterator to the inner dimension of the container. + */ + template + FORCEINLINE CInnerIterGather > innerIter(simd::Array row) const noexcept { - return data(i); + return CInnerIterGather >(m_data, IsRowMajor? 1 : rows(), IsRowMajor? row*cols() : row); } -}; - -/*! - * \class C3DDummyMiddleView - * \brief Helper class, adds dummy middle dimension to a reference of a - * matrix object making it a dummy 3D array. - * \note The constness of the object is derived from the template type, but - * we allways keep a reference, never a copy of the associated matrix. - */ -template -struct C3DDummyMiddleView -{ - using Index = typename T::Index; - using Scalar = typename T::Scalar; - - T& data; - C3DDummyMiddleView() = delete; - - C3DDummyMiddleView(T& ref) : data(ref) {} - - template::value, bool>::type = 0> - Scalar& operator() (Index i, Index, Index k) noexcept + /*! + * \brief Return copy of data in a static size container. + * \param[in] row - Row of the matrix. + * \param[in] start - Starting column to copy the data (amount determined by container size). + */ + template + FORCEINLINE StaticContainer get(Index_t row, Index_t start = 0) const noexcept { - return data(i,k); + constexpr size_t Size = StaticContainer::StaticSize; + static_assert(Size, "This method requires a static output type."); + assert(Size <= cols()-start); + StaticContainer ret; + SU2_OMP_SIMD + for (size_t i=0; i + FORCEINLINE StaticContainer get(simd::Array row, Index_t start = 0) const noexcept { - return data(i,k); + constexpr size_t Size = StaticContainer::StaticSize; + static_assert(Size, "This method requires a static output type."); + assert(Size <= cols()-start); + StaticContainer ret; + for (size_t k=0; k -struct C3DContainerDecorator { - using Scalar = typename Storage::Scalar; - using Index = typename Storage::Index; - - Storage storage; - Index M, N; - - C3DContainerDecorator() = default; - - C3DContainerDecorator(Index length, Index rows, Index cols, Scalar value = 0) { - resize(length, rows, cols, value); - } - - void resize(Index length, Index rows, Index cols, Scalar value = 0) { - M = rows; - N = cols; - storage.resize(length*rows*cols) = value; - } +template using su2vector = C2DContainer; +template using su2matrix = C2DContainer; - Scalar& operator() (Index i, Index j, Index k) { return storage(i*M*N + j*N + k); } - const Scalar& operator() (Index i, Index j, Index k) const { return storage(i*M*N + j*N + k); } -}; +using su2activevector = su2vector; +using su2activematrix = su2matrix; -/* Define an alias for a 3D int matrix, we use su2vector to store the integers contiguously - * and the container decorator to create the access semantics we want. */ -using C3DIntMatrix = C3DContainerDecorator >; -using C3DDoubleMatrix = C3DContainerDecorator >; +using su2passivevector = su2vector; +using su2passivematrix = su2matrix; diff --git a/Common/include/toolboxes/CFastFindAndEraseQueue.hpp b/Common/include/containers/CFastFindAndEraseQueue.hpp similarity index 100% rename from Common/include/toolboxes/CFastFindAndEraseQueue.hpp rename to Common/include/containers/CFastFindAndEraseQueue.hpp diff --git a/Common/include/toolboxes/CVertexMap.hpp b/Common/include/containers/CVertexMap.hpp similarity index 100% rename from Common/include/toolboxes/CVertexMap.hpp rename to Common/include/containers/CVertexMap.hpp diff --git a/Common/include/containers/container_decorators.hpp b/Common/include/containers/container_decorators.hpp new file mode 100644 index 00000000000..32834c5c7d8 --- /dev/null +++ b/Common/include/containers/container_decorators.hpp @@ -0,0 +1,207 @@ +/*! + * \file container_decorators.hpp + * \brief Collection of small classes that decorate C2DContainer to + * augment its functionality, e.g. give it extra dimensions. + * \author P. Gomes + * \version 7.0.6 "Blackbird" + * + * SU2 Project Website: https://su2code.github.io + * + * The SU2 Project is maintained by the SU2 Foundation + * (http://su2foundation.org) + * + * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md) + * + * SU2 is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * SU2 is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with SU2. If not, see . + */ + +#pragma once + +#include "C2DContainer.hpp" + +/*! + * \class C3DContainerDecorator + * \brief Decorate a matrix type (Storage) with 3 dimensions. + */ +template +class C3DContainerDecorator { + static_assert(!Storage::IsVector, "Storage type must be a matrix."); + static_assert(Storage::IsRowMajor, "Storage type must be row major."); +public: + using Scalar = typename Storage::Scalar; + using Index = typename Storage::Index; + static constexpr bool IsRowMajor = true; + static constexpr bool IsColumnMajor = false; + + using CInnerIter = typename Storage::CInnerIter; + template + using CInnerIterGather = typename Storage::template CInnerIterGather >; + +private: + Storage m_storage; + Index m_innerSz; + +public: + C3DContainerDecorator() = default; + + C3DContainerDecorator(Index length, Index rows, Index cols, Scalar value = 0) noexcept { + resize(length, rows, cols, value); + } + + void resize(Index length, Index rows, Index cols, Scalar value = 0) noexcept { + m_innerSz = cols; + m_storage.resize(length, rows*cols) = value; + } + + /*! + * \brief Container sizes. + */ + Index size() const noexcept { return m_storage.size(); } + Index length() const noexcept { return m_storage.rows(); } + Index rows() const noexcept { return m_storage.cols() / m_innerSz; } + Index cols() const noexcept { return m_innerSz; } + + /*! + * \brief Element-wise access. + */ + Scalar& operator() (Index i, Index j, Index k) noexcept { return m_storage(i, j*m_innerSz + k); } + const Scalar& operator() (Index i, Index j, Index k) const noexcept { return m_storage(i, j*m_innerSz + k); } + + /*! + * \brief Get a scalar iterator to the inner-most dimension of the container. + */ + FORCEINLINE CInnerIter innerIter(Index i, Index j) const noexcept { + return CInnerIter(&m_storage(i, j*m_innerSz), 1); + } + + /*! + * \brief Get a SIMD gather iterator to the inner-most dimension of the container. + */ + template + FORCEINLINE CInnerIterGather innerIter(simd::Array i, Index j) const noexcept { + return CInnerIterGather(m_storage.data(), 1, i*m_storage.cols() + j*m_innerSz); + } + + /*! + * \brief Return copy of data in a static size container (see C2DContainer::get). + * \param[in] i - Outer index. + * \param[in] j - Starting middle index for the copy (amount determined by container size). + */ + template + FORCEINLINE StaticContainer get(Int i, Index j = 0) const noexcept { + return m_storage.template get(i, j*m_innerSz); + } +}; + +/*! + * \brief Some typedefs for the + */ +using C3DIntMatrix = C3DContainerDecorator >; +using C3DDoubleMatrix = C3DContainerDecorator; + +/*! + * \class CVectorOfMatrix + * \brief This contrived container is used to store small matrices in a contiguous manner + * but still present the "su2double**" interface to the outside world. + * The "interface" part should be replaced by something more efficient, e.g. a "matrix view". + */ +class CVectorOfMatrix: public C3DDoubleMatrix { +private: + su2matrix interface; + +public: + CVectorOfMatrix() = default; + + CVectorOfMatrix(Index length, Index rows, Index cols, Scalar value = 0) noexcept { + resize(length, rows, cols, value); + } + + void resize(Index length, Index rows, Index cols, Scalar value = 0) noexcept { + C3DDoubleMatrix::resize(length, rows, cols, value); + interface.resize(length,rows); + for(Index i=0; i +struct C2DDummyLastView +{ + static_assert(T::IsVector, "This class decorates vectors."); + using Index = typename T::Index; + using Scalar = typename T::Scalar; + + T& data; + + C2DDummyLastView() = delete; + + C2DDummyLastView(T& ref) : data(ref) {} + + template::value> = 0> + Scalar& operator() (Index i, Index) noexcept + { + return data(i); + } + + const Scalar& operator() (Index i, Index) const noexcept + { + return data(i); + } +}; + +/*! + * \class C3DDummyMiddleView + * \brief Helper class, adds dummy middle dimension to a reference of a + * matrix object making it a dummy 3D array. + * \note The constness of the object is derived from the template type, but + * we allways keep a reference, never a copy of the associated matrix. + */ +template +struct C3DDummyMiddleView +{ + static_assert(!T::IsVector, "This class decorates matrices."); + using Index = typename T::Index; + using Scalar = typename T::Scalar; + + T& data; + + C3DDummyMiddleView() = delete; + + C3DDummyMiddleView(T& ref) : data(ref) {} + + template::value> = 0> + Scalar& operator() (Index i, Index, Index k) noexcept + { + return data(i,k); + } + + const Scalar& operator() (Index i, Index, Index k) const noexcept + { + return data(i,k); + } +}; diff --git a/Common/include/geometry/CPhysicalGeometry.hpp b/Common/include/geometry/CPhysicalGeometry.hpp index bd0934e9329..8cfbd7ae381 100644 --- a/Common/include/geometry/CPhysicalGeometry.hpp +++ b/Common/include/geometry/CPhysicalGeometry.hpp @@ -29,7 +29,7 @@ #include "CGeometry.hpp" #include "meshreader/CMeshReaderFVM.hpp" -#include "../toolboxes/C2DContainer.hpp" +#include "../containers/C2DContainer.hpp" /*! diff --git a/Common/include/geometry/dual_grid/CEdge.hpp b/Common/include/geometry/dual_grid/CEdge.hpp index 394c428709b..bdddbfff357 100644 --- a/Common/include/geometry/dual_grid/CEdge.hpp +++ b/Common/include/geometry/dual_grid/CEdge.hpp @@ -27,7 +27,7 @@ #pragma once -#include "../../toolboxes/C2DContainer.hpp" +#include "../../containers/C2DContainer.hpp" /*! * \class CEdge @@ -35,12 +35,13 @@ * \author F. Palacios */ class CEdge { - static_assert(su2activematrix::Storage == StorageType::RowMajor, "Needed to return normal as pointer."); - + static_assert(su2activematrix::IsRowMajor, "Needed to return normal as pointer."); private: - su2matrix Nodes; /*!< \brief Vector to store the node indices of the edge. */ - su2activematrix Normal; /*!< \brief Normal (area) of the edge. */ - su2activematrix Coord_CG; /*!< \brief Center-of-gravity (mid point) of the edge. */ + using Index = unsigned long; + using NodeArray = C2DContainer; + NodeArray Nodes; /*!< \brief Vector to store the node indices of the edge. */ + su2activematrix Normal; /*!< \brief Normal (area) of the edge. */ + su2activematrix Coord_CG; /*!< \brief Center-of-gravity (mid point) of the edge. */ public: enum NodePosition : unsigned long {LEFT = 0, RIGHT = 1}; @@ -84,6 +85,14 @@ class CEdge { */ inline unsigned long GetNode(unsigned long iEdge, unsigned long iNode) const { return Nodes(iEdge,iNode); } + /*! + * \brief SIMD version of GetNode, iNode returned for multiple contiguous iEdges + */ + template + FORCEINLINE simd::Array GetNode(simd::Array iEdge, unsigned long iNode) const { + return simd::Array(&Nodes(iEdge[0],iNode)); + } + /*! * \brief Set the node indices of an edge. * \param[in] iEdge - Edge index. @@ -168,6 +177,11 @@ class CEdge { */ inline const su2double* GetNormal(unsigned long iEdge) const { return Normal[iEdge]; } + /*! + * \brief Get the entire matrix of edge normals. + */ + inline const su2activematrix& GetNormal() const { return Normal; } + /*! * \brief Initialize normal vector to 0. */ diff --git a/Common/include/geometry/dual_grid/CPoint.hpp b/Common/include/geometry/dual_grid/CPoint.hpp index b211baaa401..82654c34b65 100644 --- a/Common/include/geometry/dual_grid/CPoint.hpp +++ b/Common/include/geometry/dual_grid/CPoint.hpp @@ -28,7 +28,8 @@ #pragma once -#include "../../toolboxes/C2DContainer.hpp" +#include "../../containers/C2DContainer.hpp" +#include "../../containers/container_decorators.hpp" #include "../../toolboxes/graph_toolbox.hpp" #include @@ -137,6 +138,11 @@ class CPoint { */ inline su2double *GetCoord(unsigned long iPoint) { return Coord[iPoint]; } + /*! + * \brief Get the entire matrix of coordinates of the control volumes. + */ + inline const su2activematrix& GetCoord() const { return Coord; } + /*! * \brief Set the coordinates for the control volume. * \param[in] iPoint - Index of the point. @@ -189,6 +195,11 @@ class CPoint { */ inline unsigned long GetElem(unsigned long iPoint, unsigned long nelem) const { return Elem.getInnerIdx(iPoint,nelem); } + /*! + * \brief Get inner iterator to loop over neighbor elements. + */ + inline CCompressedSparsePatternL::CInnerIter GetElems(unsigned long iPoint) const { return Elem.getInnerIter(iPoint); } + /*! * \brief Set the points that compose the control volume. * \param[in] pointsMatrix - List of lists with the neighbor points connected to each point. @@ -220,6 +231,11 @@ class CPoint { */ inline unsigned long GetPoint(unsigned long iPoint, unsigned long npoint) const { return Point.getInnerIdx(iPoint,npoint); } + /*! + * \brief Get inner iterator to loop over neighbor points. + */ + inline CCompressedSparsePatternUL::CInnerIter GetPoints(unsigned long iPoint) const { return Point.getInnerIter(iPoint); } + /*! * \brief Set the edges that compose the control volume. * \param[in] iPoint - Index of the point. @@ -236,6 +252,11 @@ class CPoint { */ inline long GetEdge(unsigned long iPoint, unsigned long nedge) const { return Edge.getInnerIdx(iPoint,nedge); } + /*! + * \brief Get inner iterator to loop over neighbor edges. + */ + inline CCompressedSparsePatternL::CInnerIter GetEdges(unsigned long iPoint) const { return Edge.getInnerIter(iPoint); } + /*! * \brief Set the boundary vertex that compose the control volume. * \param[in] iPoint - Index of the point. diff --git a/Common/include/geometry/elements/CGaussVariable.hpp b/Common/include/geometry/elements/CGaussVariable.hpp index 7a2eba01f1a..8b93224de57 100644 --- a/Common/include/geometry/elements/CGaussVariable.hpp +++ b/Common/include/geometry/elements/CGaussVariable.hpp @@ -6,7 +6,7 @@ * * SU2 Project Website: https://su2code.github.io * - * The SU2 Project is maintained by the SU2 Foundation + * The SU2 Project is maintained by the SU2 Foundation * (http://su2foundation.org) * * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md) @@ -27,7 +27,7 @@ #pragma once -#include "../../toolboxes/C2DContainer.hpp" +#include "../../containers/C2DContainer.hpp" /*! * \class CGaussVariable diff --git a/Common/include/interface_interpolation/CInterpolator.hpp b/Common/include/interface_interpolation/CInterpolator.hpp index c0c6a71cc5d..6cd7c402aff 100644 --- a/Common/include/interface_interpolation/CInterpolator.hpp +++ b/Common/include/interface_interpolation/CInterpolator.hpp @@ -27,7 +27,7 @@ #pragma once #include "../../include/basic_types/datatype_structure.hpp" -#include "../../include/toolboxes/C2DContainer.hpp" +#include "../../include/containers/C2DContainer.hpp" #include class CConfig; diff --git a/Common/include/interface_interpolation/CRadialBasisFunction.hpp b/Common/include/interface_interpolation/CRadialBasisFunction.hpp index ced01abf60c..6f39475fd36 100644 --- a/Common/include/interface_interpolation/CRadialBasisFunction.hpp +++ b/Common/include/interface_interpolation/CRadialBasisFunction.hpp @@ -28,14 +28,13 @@ #include "CInterpolator.hpp" #include "../option_structure.hpp" -#include "../toolboxes/C2DContainer.hpp" +#include "../containers/C2DContainer.hpp" /*! * \brief Radial basis function interpolation. */ class CRadialBasisFunction final : public CInterpolator { - static_assert(su2passivematrix::Storage == StorageType::RowMajor, - "This class relies on row major storage throughout."); + static_assert(su2passivematrix::IsRowMajor, "This class relies on row major storage throughout."); private: unsigned long MinDonors = 0, AvgDonors = 0, MaxDonors = 0; passivedouble Density = 0.0, AvgCorrection = 0.0, MaxCorrection = 0.0; diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp index 14e9e94542f..977c580ea58 100644 --- a/Common/include/linear_algebra/CSysMatrix.hpp +++ b/Common/include/linear_algebra/CSysMatrix.hpp @@ -2,7 +2,7 @@ * \file CSysMatrix.hpp * \brief Declaration of the block-sparse matrix class. * The implemtation is in CSysMatrix.cpp. - * \author F. Palacios, A. Bueno, T. Economon + * \author F. Palacios, A. Bueno, T. Economon, P. Gomes * \version 7.0.6 "Blackbird" * * SU2 Project Website: https://su2code.github.io @@ -30,11 +30,13 @@ #include "../../include/mpi_structure.hpp" #include "../../include/omp_structure.hpp" +#include "../../include/parallelization/vectorization.hpp" #include "CSysVector.hpp" #include "CPastixWrapper.hpp" #include #include +#include using namespace std; @@ -54,6 +56,20 @@ using namespace std; #if defined(__INTEL_COMPILER) && defined(MKL_DIRECT_CALL_SEQ) && !defined(CODI_REVERSE_TYPE) #define USE_MKL_LAPACK #endif +template +struct mkl_jit_wrapper { + using gemm_t = dgemm_jit_kernel_t; + template + static void create_gemm(Ts&&... args) { mkl_jit_create_dgemm(args...); } + static gemm_t get_gemm(void* jitter) { return mkl_jit_get_dgemm_ptr(jitter); } +}; +template<> +struct mkl_jit_wrapper { + using gemm_t = sgemm_jit_kernel_t; + template + static void create_gemm(Ts&&... args) { mkl_jit_create_sgemm(args...); } + static gemm_t get_gemm(void* jitter) { return mkl_jit_get_sgemm_ptr(jitter); } +}; #else #warning The current version of MKL does not support JIT gemm kernels #endif @@ -65,7 +81,6 @@ class CGeometry; /*! * \class CSysMatrix * \brief Main class for defining block-compressed-row-storage sparse matrices. - * \author A. Bueno, F. Palacios, P. Gomes */ template class CSysMatrix { @@ -115,13 +130,7 @@ class CSysMatrix { mutable vector > LineletVector; /*!< \brief Solution and RHS of the tri-diag system (working memory). */ #ifdef USE_MKL -#ifndef USE_MIXED_PRECISION - /*--- Double precision kernels. ---*/ - using gemm_t = dgemm_jit_kernel_t; -#else - /*--- Single precision kernels. ---*/ - using gemm_t = sgemm_jit_kernel_t; -#endif + using gemm_t = typename mkl_jit_wrapper::gemm_t; void * MatrixMatrixProductJitter; /*!< \brief Jitter handle for MKL JIT based GEMM. */ gemm_t MatrixMatrixProductKernel; /*!< \brief MKL JIT based GEMM kernel. */ void * MatrixVectorProductJitterBetaZero; /*!< \brief Jitter handle for MKL JIT based GEMV. */ @@ -444,7 +453,7 @@ class CSysMatrix { * \param[in] alpha - Scale factor. */ template::value,bool>::type = 0> + su2enable_if::value> = 0> inline void SetBlock(unsigned long block_i, unsigned long block_j, const OtherType *val_block, OtherType alpha = 1.0) { @@ -463,8 +472,7 @@ class CSysMatrix { * \param[in] val_block - Block to set to A(i, j). * \param[in] alpha - Scale factor. */ - template::value,bool>::type = 0> + template::value> = 0> inline void AddBlock(unsigned long block_i, unsigned long block_j, const OtherType *val_block, OtherType alpha = 1.0) { SetBlock(block_i, block_j, val_block, alpha); @@ -518,18 +526,17 @@ class CSysMatrix { /*! * \brief Update 4 blocks ii, ij, ji, jj (add to i* sub from j*). - * \note The template parameter Sign, can be used create a "subtractive" - * update i.e. subtract from row i and add to row j instead. - * This method assumes an FVM-type sparse pattern. + * \note This method assumes an FVM-type sparse pattern. * \param[in] edge - Index of edge that connects iPoint and jPoint. * \param[in] iPoint - Row to which we add the blocks. * \param[in] jPoint - Row from which we subtract the blocks. * \param[in] block_i - Adds to ii, subs from ji. * \param[in] block_j - Adds to ij, subs from jj. + * \param[in] scale - Scale blocks during update (axpy type op). */ - template + template inline void UpdateBlocks(unsigned long iEdge, unsigned long iPoint, unsigned long jPoint, - const OtherType* const* block_i, const OtherType* const* block_j) { + const MatrixType& block_i, const MatrixType& block_j, OtherType scale = 1) { ScalarType *bii = &matrix[dia_ptr[iPoint]*nVar*nEqn]; ScalarType *bjj = &matrix[dia_ptr[jPoint]*nVar*nEqn]; @@ -540,10 +547,10 @@ class CSysMatrix { for (iVar = 0; iVar < nVar; iVar++) { for (jVar = 0; jVar < nEqn; jVar++) { - bii[offset] += PassiveAssign(block_i[iVar][jVar]) * Sign; - bij[offset] += PassiveAssign(block_j[iVar][jVar]) * Sign; - bji[offset] -= PassiveAssign(block_i[iVar][jVar]) * Sign; - bjj[offset] -= PassiveAssign(block_j[iVar][jVar]) * Sign; + bii[offset] += PassiveAssign(block_i[iVar][jVar] * scale); + bij[offset] += PassiveAssign(block_j[iVar][jVar] * scale); + bji[offset] -= PassiveAssign(block_i[iVar][jVar] * scale); + bjj[offset] -= PassiveAssign(block_j[iVar][jVar] * scale); ++offset; } } @@ -552,25 +559,72 @@ class CSysMatrix { /*! * \brief Short-hand for the "subtractive" version (sub from i* add to j*) of UpdateBlocks. */ - template + template inline void UpdateBlocksSub(unsigned long iEdge, unsigned long iPoint, unsigned long jPoint, - const OtherType* const* block_i, const OtherType* const* block_j) { - UpdateBlocks(iEdge, iPoint, jPoint, block_i, block_j); + const MatrixType& block_i, const MatrixType& block_j) { + UpdateBlocks(iEdge, iPoint, jPoint, block_i, block_j, -1); + } + + /*! + * \brief SIMD version, does the update for multiple edges and points. + * \note Nothing is updated if the mask is 0. + */ + template + FORCEINLINE void UpdateBlocks(simd::Array iEdge, simd::Array iPoint, simd::Array jPoint, + const MatTypeSIMD& block_i, const MatTypeSIMD& block_j, simd::Array mask = 1) { + + static_assert(MatTypeSIMD::StaticSize, "This method requires static size blocks."); + static_assert(MatTypeSIMD::IsRowMajor, "Block storage is not compatible with matrix."); + constexpr size_t blkSz = MatTypeSIMD::StaticSize; + assert(blkSz == nVar*nEqn); + + /*--- "Transpose" the blocks, scale, and possibly convert types, + * giving the compiler the chance to vectorize all of these. ---*/ + ScalarType blk_i[N][blkSz], blk_j[N][blkSz]; + + for (size_t i=0; i - inline void SetBlocks(unsigned long iEdge, const OtherType* const* block_i, const OtherType* const* block_j) { + template + inline void SetBlocks(unsigned long iEdge, const MatrixType& block_i, + const MatrixType& block_j, OtherType scale = 1) { ScalarType *bij = &matrix[edge_ptr(iEdge,0)*nVar*nEqn]; ScalarType *bji = &matrix[edge_ptr(iEdge,1)*nVar*nEqn]; @@ -579,8 +633,8 @@ class CSysMatrix { for (iVar = 0; iVar < nVar; iVar++) { for (jVar = 0; jVar < nEqn; jVar++) { - bij[offset] = (Overwrite? ScalarType(0) : bij[offset]) + PassiveAssign(block_j[iVar][jVar]) * Sign; - bji[offset] = (Overwrite? ScalarType(0) : bji[offset]) - PassiveAssign(block_i[iVar][jVar]) * Sign; + bij[offset] = (Overwrite? ScalarType(0) : bij[offset]) + PassiveAssign(block_j[iVar][jVar] * scale); + bji[offset] = (Overwrite? ScalarType(0) : bji[offset]) - PassiveAssign(block_i[iVar][jVar] * scale); ++offset; } } @@ -589,17 +643,61 @@ class CSysMatrix { /*! * \brief Short-hand for the "additive overwrite" version of SetBlocks. */ - template - inline void UpdateBlocks(unsigned long iEdge, const OtherType* const* block_i, const OtherType* const* block_j) { - SetBlocks(iEdge, block_i, block_j); + template + inline void UpdateBlocks(unsigned long iEdge, const MatrixType& block_i, + const MatrixType& block_j, OtherType scale = 1) { + SetBlocks(iEdge, block_i, block_j, scale); } /*! * \brief Short-hand for the "subtractive" version (sub from i* add to j*) of SetBlocks. */ - template - inline void UpdateBlocksSub(unsigned long iEdge, const OtherType* const* block_i, const OtherType* const* block_j) { - SetBlocks(iEdge, block_i, block_j); + template + inline void UpdateBlocksSub(unsigned long iEdge, const MatrixType& block_i, const MatrixType& block_j) { + SetBlocks(iEdge, block_i, block_j, -1); + } + + /*! + * \brief SIMD version, does the update for multiple edges. + * \note Nothing is updated if the mask is 0. + */ + template + FORCEINLINE void SetBlocks(simd::Array iEdge, const MatTypeSIMD& block_i, + const MatTypeSIMD& block_j, simd::Array mask = 1) { + + static_assert(MatTypeSIMD::StaticSize, "This method requires static size blocks."); + static_assert(MatTypeSIMD::IsRowMajor, "Block storage is not compatible with matrix."); + constexpr size_t blkSz = MatTypeSIMD::StaticSize; + assert(blkSz == nVar*nEqn); + + /*--- "Transpose" the blocks, scale, and possibly convert types, + * giving the compiler the chance to vectorize all of these. ---*/ + ScalarType blk_i[N][blkSz], blk_j[N][blkSz]; + + for (size_t i=0; i