diff --git a/Common/include/CConfig.hpp b/Common/include/CConfig.hpp
index 16ed7cadc19..f76b9225491 100644
--- a/Common/include/CConfig.hpp
+++ b/Common/include/CConfig.hpp
@@ -43,8 +43,8 @@
 #include <map>
 #include <assert.h>
 
-#include "./option_structure.hpp"
-#include "./toolboxes/C2DContainer.hpp"
+#include "option_structure.hpp"
+#include "containers/container_decorators.hpp"
 
 #ifdef HAVE_CGNS
 #include "cgnslib.h"
@@ -419,6 +419,7 @@ class CConfig {
   su2double *RK_Alpha_Step;                 /*!< \brief Runge-Kutta beta coefficients. */
 
   unsigned short nQuasiNewtonSamples;  /*!< \brief Number of samples used in quasi-Newton solution methods. */
+  bool UseVectorization;       /*!< \brief Whether to use vectorized numerics schemes. */
 
   unsigned short nMGLevels;    /*!< \brief Number of multigrid levels (coarse levels). */
   unsigned short nCFL;         /*!< \brief Number of CFL, one for each multigrid level. */
@@ -591,10 +592,10 @@ class CConfig {
   *Kappa_AdjFlow,                  /*!< \brief Numerical dissipation coefficients for the adjoint flow equations. */
   *Kappa_Heat;                     /*!< \brief Numerical dissipation coefficients for the (fvm) heat equation. */
   su2double* FFD_Axis;          /*!< \brief Numerical dissipation coefficients for the adjoint equations. */
-  su2double Kappa_1st_AdjFlow,  /*!< \brief JST 1st order dissipation coefficient for adjoint flow equations (coarse multigrid levels). */
+  su2double Kappa_1st_AdjFlow,  /*!< \brief Lax 1st order dissipation coefficient for adjoint flow equations (coarse multigrid levels). */
   Kappa_2nd_AdjFlow,            /*!< \brief JST 2nd order dissipation coefficient for adjoint flow equations. */
   Kappa_4th_AdjFlow,            /*!< \brief JST 4th order dissipation coefficient for adjoint flow equations. */
-  Kappa_1st_Flow,           /*!< \brief JST 1st order dissipation coefficient for flow equations (coarse multigrid levels). */
+  Kappa_1st_Flow,           /*!< \brief Lax 1st order dissipation coefficient for flow equations (coarse multigrid levels). */
   Kappa_2nd_Flow,           /*!< \brief JST 2nd order dissipation coefficient for flow equations. */
   Kappa_4th_Flow,           /*!< \brief JST 4th order dissipation coefficient for flow equations. */
   Kappa_2nd_Heat,           /*!< \brief 2nd order dissipation coefficient for heat equation. */
@@ -1164,7 +1165,7 @@ class CConfig {
   ionization;                               /*!< \brief Flag for determining if free electron gas is in the mixture. */
   string GasModel,                          /*!< \brief Gas Model. */
   *Wall_Catalytic;                          /*!< \brief Pointer to catalytic walls. */
-  
+
   /*!
    * \brief Set the default values of config options not set in the config file using another config object.
    * \param config - Config object to use the default values from.
@@ -4114,6 +4115,11 @@ class CConfig {
    */
   unsigned short GetnQuasiNewtonSamples(void) const { return nQuasiNewtonSamples; }
 
+  /*!
+   * \brief Get whether to use vectorized numerics (if available).
+   */
+  bool GetUseVectorization(void) const { return UseVectorization; }
+
   /*!
    * \brief Get the relaxation coefficient of the linear solver for the implicit formulation.
    * \return relaxation coefficient of the linear solver for the implicit formulation.
@@ -4509,7 +4515,7 @@ class CConfig {
    *       during the computation.
    * \return Kind of center convective numerical scheme for the flow equations.
    */
-  unsigned short GetKind_Centered_Flow(void) const { return Kind_Centered_Flow; }
+  ENUM_CENTERED GetKind_Centered_Flow(void) const { return static_cast<ENUM_CENTERED>(Kind_Centered_Flow); }
 
   /*!
    * \brief Get the kind of center convective numerical scheme for the plasma equations.
diff --git a/Common/include/CMultiGridQueue.hpp b/Common/include/CMultiGridQueue.hpp
index 42f14a598c1..369685d86c5 100644
--- a/Common/include/CMultiGridQueue.hpp
+++ b/Common/include/CMultiGridQueue.hpp
@@ -29,7 +29,7 @@
 #pragma once
 
 #include <vector>
-#include "toolboxes/CFastFindAndEraseQueue.hpp"
+#include "containers/CFastFindAndEraseQueue.hpp"
 #include "geometry/CGeometry.hpp"
 
 using namespace std;
diff --git a/Common/include/basic_types/ad_structure.hpp b/Common/include/basic_types/ad_structure.hpp
index acf46861300..f38d54876da 100644
--- a/Common/include/basic_types/ad_structure.hpp
+++ b/Common/include/basic_types/ad_structure.hpp
@@ -353,8 +353,7 @@ namespace AD{
   /*--- Base case for parameter pack expansion. ---*/
   FORCEINLINE void SetPreaccIn() {}
 
-  template<class T, class... Ts,
-           typename std::enable_if<std::is_same<T,su2double>::value,bool>::type = 0>
+  template<class T, class... Ts, su2enable_if<std::is_same<T,su2double>::value> = 0>
   FORCEINLINE void SetPreaccIn(const T& data, Ts&&... moreData) {
     if (!PreaccActive) return;
     if (data.isActive())
@@ -385,6 +384,18 @@ namespace AD{
     }
   }
 
+  template<class T>
+  FORCEINLINE void SetPreaccIn(const T& data, const int size_x, const int size_y, const int size_z) {
+    if (!PreaccActive) return;
+    for (int i = 0; i < size_x; i++) {
+      for (int j = 0; j < size_y; j++) {
+        for (int k = 0; k < size_z; k++) {
+          if (data[i][j][k].isActive()) PreaccHelper.addInput(data[i][j][k]);
+        }
+      }
+    }
+  }
+
   FORCEINLINE void StartPreacc() {
     if (globalTape.isActive() && PreaccEnabled) {
       PreaccHelper.start();
@@ -395,8 +406,7 @@ namespace AD{
   /*--- Base case for parameter pack expansion. ---*/
   FORCEINLINE void SetPreaccOut() {}
 
-  template<class T, class... Ts,
-           typename std::enable_if<std::is_same<T,su2double>::value,bool>::type = 0>
+  template<class T, class... Ts, su2enable_if<std::is_same<T,su2double>::value> = 0>
   FORCEINLINE void SetPreaccOut(T& data, Ts&&... moreData) {
     if (!PreaccActive) return;
     if (data.isActive())
diff --git a/Common/include/basic_types/datatype_structure.hpp b/Common/include/basic_types/datatype_structure.hpp
index a6b94544319..00f5a7a31bb 100644
--- a/Common/include/basic_types/datatype_structure.hpp
+++ b/Common/include/basic_types/datatype_structure.hpp
@@ -40,6 +40,17 @@
 #define FORCEINLINE inline
 #endif
 
+#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
+#define NEVERINLINE inline __attribute__((noinline))
+#else
+#define NEVERINLINE inline
+#endif
+
+/*--- Convenience SFINAE typedef to conditionally
+ * enable/disable function template overloads. ---*/
+template<bool condition>
+using su2enable_if = typename std::enable_if<condition,bool>::type;
+
 /*--- Depending on the datatype defined during the configuration,
  * include the correct definition, and create the main typedef. ---*/
 
@@ -194,10 +205,10 @@ namespace SU2_TYPE {
 
   /*--- Special handling of the sprintf routine for non-primitive types. ---*/
   /*--- Pass-through for built-in types. ---*/
-  template<class T, typename std::enable_if<std::is_trivial<T>::value,bool>::type = 0>
+  template<class T, su2enable_if<std::is_trivial<T>::value> = 0>
   FORCEINLINE const T& _printGetValue(const T& val) {return val;}
   /*--- Overload for expressions of active types. ---*/
-  template<class T, typename std::enable_if<!std::is_trivial<T>::value,bool>::type = 0>
+  template<class T, su2enable_if<!std::is_trivial<T>::value> = 0>
   FORCEINLINE passivedouble _printGetValue(const T& val) { return val.getValue(); }
 
   /*!
diff --git a/Common/include/toolboxes/C2DContainer.hpp b/Common/include/containers/C2DContainer.hpp
similarity index 78%
rename from Common/include/toolboxes/C2DContainer.hpp
rename to Common/include/containers/C2DContainer.hpp
index e14bef89365..fb3d71fc7ff 100644
--- a/Common/include/toolboxes/C2DContainer.hpp
+++ b/Common/include/containers/C2DContainer.hpp
@@ -27,8 +27,9 @@
 
 #pragma once
 
-#include "allocation_toolbox.hpp"
+#include "../toolboxes/allocation_toolbox.hpp"
 #include "../basic_types/datatype_structure.hpp"
+#include "../parallelization/vectorization.hpp"
 
 #include <utility>
 #include <type_traits>
@@ -134,7 +135,9 @@ class AccessorImpl
 #define UNIV_ACCESSORS                                                  \
   bool empty() const noexcept {return size()==0;}                       \
   Scalar_t* data() noexcept {return m_data;}                            \
-  const Scalar_t* data() const noexcept {return m_data;}
+  const Scalar_t* data() const noexcept {return m_data;}                \
+  const Scalar_t* begin() const noexcept {return data();}               \
+  const Scalar_t* end() const noexcept {return data()+size();}
 
   /*!
    * Operator (,) gives pointwise access, operator [] returns a pointer to the
@@ -177,8 +180,7 @@ class AccessorImpl
   }
 
   /*!
-   * Vectors do not provide operator [] as it is redundant
-   * since operator () already returns by reference.
+   * Vectors provide both [] and () with the same behavior.
    */
 #define VECTOR_ACCESSORS(M,ROWMAJOR)                                    \
   UNIV_ACCESSORS                                                        \
@@ -193,6 +195,18 @@ class AccessorImpl
   }                                                                     \
                                                                         \
   const Scalar_t& operator() (const Index_t i) const noexcept           \
+  {                                                                     \
+    assert(i>=0 && i<M);                                                \
+    return m_data[i];                                                   \
+  }                                                                     \
+                                                                        \
+  Scalar_t& operator[] (const Index_t i) noexcept                       \
+  {                                                                     \
+    assert(i>=0 && i<M);                                                \
+    return m_data[i];                                                   \
+  }                                                                     \
+                                                                        \
+  const Scalar_t& operator[] (const Index_t i) const noexcept           \
   {                                                                     \
     assert(i>=0 && i<M);                                                \
     return m_data[i];                                                   \
@@ -368,9 +382,67 @@ class C2DContainer :
   using Base::m_allocate;
 public:
   using Base::size;
+  using Base::rows;
+  using Base::cols;
   using Index = Index_t;
   using Scalar = Scalar_t;
   static constexpr StorageType Storage = Store;
+  static constexpr bool IsVector = (StaticRows==1) || (StaticCols==1);
+  static constexpr bool IsRowMajor = (Store==StorageType::RowMajor);
+  static constexpr bool IsColumnMajor = (Store==StorageType::ColumnMajor);
+  static constexpr size_t StaticSize = StaticRows*StaticCols;
+
+  /*!
+   * \brief Scalar iterator to the inner dimension of the container, read-only.
+   */
+  class CInnerIter {
+   private:
+    const Index m_increment;
+    const Scalar* m_ptr;
+   public:
+    CInnerIter() = delete;
+
+    FORCEINLINE CInnerIter(const Scalar* ptr, Index increment) noexcept :
+      m_increment(increment),
+      m_ptr(ptr) {
+    }
+
+    FORCEINLINE Scalar operator* () const noexcept { return *m_ptr; }
+
+    FORCEINLINE CInnerIter operator++(int) noexcept {
+      auto ret = *this; m_ptr += m_increment; return ret;
+    }
+  };
+
+  /*!
+   * \brief SIMD iterator to the inner dimension of the container,
+   * read-only, generic non-contiguous access.
+   */
+  template<class IndexSIMD_t>
+  class CInnerIterGather {
+   private:
+    static_assert(std::is_integral<typename IndexSIMD_t::Scalar>::value,"");
+    enum {Size = IndexSIMD_t::Size};
+    IndexSIMD_t m_offsets;
+    const Index m_increment;
+    const Scalar* const m_data;
+   public:
+    CInnerIterGather() = delete;
+
+    FORCEINLINE CInnerIterGather(const Scalar* data, Index increment, IndexSIMD_t offsets) noexcept :
+      m_offsets(offsets),
+      m_increment(increment),
+      m_data(data) {
+    }
+
+    FORCEINLINE simd::Array<Scalar,Size> operator* () const noexcept {
+      return simd::Array<Scalar,Size>(m_data, m_offsets);
+    }
+
+    FORCEINLINE CInnerIterGather operator++(int) noexcept {
+      auto ret = *this; m_offsets += m_increment; return ret;
+    }
+  };
 
 private:
   /*!
@@ -379,11 +451,10 @@ class C2DContainer :
   size_t m_resize(Index_t rows, Index_t cols) noexcept
   {
     /*--- fully static, no allocation needed ---*/
-    if(StaticRows!=DynamicSize && StaticCols!=DynamicSize)
-      return StaticRows*StaticCols;
+    if(StaticSize!=DynamicSize) return StaticSize;
 
     /*--- dynamic row vector, swap size specification ---*/
-    if(StaticRows==1 && StaticCols==DynamicSize) {cols = rows; rows = 1;}
+    if(StaticRows==1 && IsVector) {cols = rows; rows = 1;}
 
     /*--- assert a static size is not being asked to change ---*/
     if(StaticRows!=DynamicSize) assert(rows==StaticRows && "A static size was asked to change.");
@@ -483,148 +554,69 @@ class C2DContainer :
   {
     for(size_t i=0; i<size(); ++i) m_data[i] = value;
   }
-};
-
-/*!
- * \brief Useful typedefs with default template parameters
- */
-template<class T> using su2vector = C2DContainer<unsigned long, T, StorageType::ColumnMajor, 64, DynamicSize, 1>;
-template<class T> using su2matrix = C2DContainer<unsigned long, T, StorageType::RowMajor,    64, DynamicSize, DynamicSize>;
-
-using su2activevector = su2vector<su2double>;
-using su2activematrix = su2matrix<su2double>;
-
-using su2passivevector = su2vector<passivedouble>;
-using su2passivematrix = su2matrix<passivedouble>;
-
-/*!
- * \class CVectorOfMatrix
- * \brief This contrived container is used to store small matrices in a contiguous manner
- *        but still present the "su2double**" interface to the outside world.
- *        The "interface" part should be replaced by something more efficient, e.g. a "matrix view".
- */
-struct CVectorOfMatrix {
-  su2activevector storage;
-  su2matrix<su2double*> interface;
-  unsigned long M, N;
-
-  CVectorOfMatrix() = default;
-
-  CVectorOfMatrix(unsigned long length, unsigned long rows, unsigned long cols, su2double value = 0.0) {
-    resize(length, rows, cols, value);
-  }
-
-  void resize(unsigned long length, unsigned long rows, unsigned long cols, su2double value = 0.0) {
-    M = rows;
-    N = cols;
-    storage.resize(length*rows*cols) = value;
-    interface.resize(length,rows);
-
-    for(unsigned long i=0; i<length; ++i)
-      for(unsigned long j=0; j<rows; ++j)
-        interface(i,j) = &(*this)(i,j,0);
-  }
 
-  su2double& operator() (unsigned long i, unsigned long j, unsigned long k) { return storage(i*M*N + j*N + k); }
-  const su2double& operator() (unsigned long i, unsigned long j, unsigned long k) const { return storage(i*M*N + j*N + k); }
-
-  su2double** operator[] (unsigned long i) { return interface[i]; }
-  const su2double* const* operator[] (unsigned long i) const { return interface[i]; }
-};
-
-/*!
- * \class C2DDummyLastView
- * \brief Helper class, adds dummy trailing dimension to a reference of a
- *        vector object making it a dummy matrix.
- * \note The constness of the object is derived from the template type, but
- *       we allways keep a reference, never a copy of the associated vector.
- */
-template<class T>
-struct C2DDummyLastView
-{
-  using Index = typename T::Index;
-  using Scalar = typename T::Scalar;
-
-  T& data;
-
-  C2DDummyLastView() = delete;
-
-  C2DDummyLastView(T& ref) : data(ref) {}
-
-  template<class U = T,
-           typename std::enable_if<!std::is_const<U>::value, bool>::type = 0>
-  Scalar& operator() (Index i, Index) noexcept
+  /*!
+   * \brief Get a scalar iterator to the inner dimension of the container.
+   */
+  FORCEINLINE CInnerIter innerIter(Index_t row) const noexcept
   {
-    return data(i);
+    return CInnerIter(&m_data[IsRowMajor? row*cols() : row], IsRowMajor? 1 : rows());
   }
 
-  const Scalar& operator() (Index i, Index) const noexcept
+  /*!
+   * \brief Get a SIMD gather iterator to the inner dimension of the container.
+   */
+  template<class T, size_t N>
+  FORCEINLINE CInnerIterGather<simd::Array<T,N> > innerIter(simd::Array<T,N> row) const noexcept
   {
-    return data(i);
+    return CInnerIterGather<simd::Array<T,N> >(m_data, IsRowMajor? 1 : rows(), IsRowMajor? row*cols() : row);
   }
-};
-
-/*!
- * \class C3DDummyMiddleView
- * \brief Helper class, adds dummy middle dimension to a reference of a
- *        matrix object making it a dummy 3D array.
- * \note The constness of the object is derived from the template type, but
- *       we allways keep a reference, never a copy of the associated matrix.
- */
-template<class T>
-struct C3DDummyMiddleView
-{
-  using Index = typename T::Index;
-  using Scalar = typename T::Scalar;
-
-  T& data;
 
-  C3DDummyMiddleView() = delete;
-
-  C3DDummyMiddleView(T& ref) : data(ref) {}
-
-  template<class U = T,
-           typename std::enable_if<!std::is_const<U>::value, bool>::type = 0>
-  Scalar& operator() (Index i, Index, Index k) noexcept
+  /*!
+   * \brief Return copy of data in a static size container (StaticContainer::StaticSize entries are copied).
+   * \param[in] row - Row of the matrix.
+   * \param[in] start - Starting column to copy the data (amount determined by container size).
+   */
+  template<class StaticContainer>
+  FORCEINLINE StaticContainer get(Index_t row, Index_t start = 0) const noexcept
   {
-    return data(i,k);
+    constexpr size_t Size = StaticContainer::StaticSize;
+    static_assert(Size, "This method requires a static output type.");
+    assert(start+Size <= cols()); /*--- Sum form, "cols()-start" misbehaves when start > cols(). ---*/
+    StaticContainer ret;
+    SU2_OMP_SIMD
+    for (size_t i=0; i<Size; ++i) /*--- Copy entry (row, start+i), indexed according to the storage order. ---*/
+      ret.data()[i] = m_data[IsRowMajor? row*cols()+i+start : row+(i+start)*rows()];
+    return ret;
   }
 
-  const Scalar& operator() (Index i, Index, Index k) const noexcept
+  /*!
+   * \brief Return copy of data in a static size container, SIMD version (one row index per SIMD lane).
+   */
+  template<class StaticContainer, class T, size_t N>
+  FORCEINLINE StaticContainer get(simd::Array<T,N> row, Index_t start = 0) const noexcept
   {
-    return data(i,k);
+    constexpr size_t Size = StaticContainer::StaticSize;
+    static_assert(Size, "This method requires a static output type.");
+    assert(start+Size <= cols()); /*--- Sum form, "cols()-start" misbehaves when start > cols(). ---*/
+    StaticContainer ret;
+    for (size_t k=0; k<N; ++k) { /*--- Lane k of the output is filled from row[k] of this container. ---*/
+      SU2_OMP_SIMD
+      for (size_t i=0; i<Size; ++i)
+        ret.data()[i][k] = m_data[IsRowMajor? row[k]*cols()+i+start : row[k]+(i+start)*rows()];
+    }
+    return ret;
   }
 };
 
 /*!
- * \class C3DContainerDecorator
- * \brief Decorate a vector type (Storage) with 3 dimensions. *
+ * \brief Useful typedefs with default template parameters
  */
-template<class Storage>
-struct C3DContainerDecorator {
-  using Scalar = typename Storage::Scalar;
-  using Index = typename Storage::Index;
-
-  Storage storage;
-  Index M, N;
-
-  C3DContainerDecorator() = default;
-
-  C3DContainerDecorator(Index length, Index rows, Index cols, Scalar value = 0) {
-    resize(length, rows, cols, value);
-  }
-
-  void resize(Index length, Index rows, Index cols, Scalar value = 0) {
-    M = rows;
-    N = cols;
-    storage.resize(length*rows*cols) = value;
-  }
+template<class T> using su2vector = C2DContainer<unsigned long, T, StorageType::ColumnMajor, 64, DynamicSize, 1>;
+template<class T> using su2matrix = C2DContainer<unsigned long, T, StorageType::RowMajor, 64, DynamicSize, DynamicSize>;
 
-  Scalar& operator() (Index i, Index j, Index k) { return storage(i*M*N + j*N + k); }
-  const Scalar& operator() (Index i, Index j, Index k) const { return storage(i*M*N + j*N + k); }
-};
+using su2activevector = su2vector<su2double>;
+using su2activematrix = su2matrix<su2double>;
 
-/* Define an alias for a 3D int matrix, we use su2vector to store the integers contiguously
- * and the container decorator to create the access semantics we want. */
-using C3DIntMatrix = C3DContainerDecorator<su2vector<int> >;
-using C3DDoubleMatrix = C3DContainerDecorator<su2vector<double> >;
+using su2passivevector = su2vector<passivedouble>;
+using su2passivematrix = su2matrix<passivedouble>;
diff --git a/Common/include/toolboxes/CFastFindAndEraseQueue.hpp b/Common/include/containers/CFastFindAndEraseQueue.hpp
similarity index 100%
rename from Common/include/toolboxes/CFastFindAndEraseQueue.hpp
rename to Common/include/containers/CFastFindAndEraseQueue.hpp
diff --git a/Common/include/toolboxes/CVertexMap.hpp b/Common/include/containers/CVertexMap.hpp
similarity index 100%
rename from Common/include/toolboxes/CVertexMap.hpp
rename to Common/include/containers/CVertexMap.hpp
diff --git a/Common/include/containers/container_decorators.hpp b/Common/include/containers/container_decorators.hpp
new file mode 100644
index 00000000000..32834c5c7d8
--- /dev/null
+++ b/Common/include/containers/container_decorators.hpp
@@ -0,0 +1,207 @@
+/*!
+ * \file container_decorators.hpp
+ * \brief Collection of small classes that decorate C2DContainer to
+ * augment its functionality, e.g. give it extra dimensions.
+ * \author P. Gomes
+ * \version 7.0.6 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "C2DContainer.hpp"
+
+/*!
+ * \class C3DContainerDecorator
+ * \brief Decorate a matrix type (Storage) with 3 dimensions, entry (i,j,k) is stored at (i, j*cols+k) of the matrix.
+ */
+template<class Storage>
+class C3DContainerDecorator {
+  static_assert(!Storage::IsVector, "Storage type must be a matrix.");
+  static_assert(Storage::IsRowMajor, "Storage type must be row major.");
+public:
+  using Scalar = typename Storage::Scalar;
+  using Index = typename Storage::Index;
+  static constexpr bool IsRowMajor = true;
+  static constexpr bool IsColumnMajor = false;
+
+  using CInnerIter = typename Storage::CInnerIter;
+  template<class T, size_t N>
+  using CInnerIterGather = typename Storage::template CInnerIterGather<simd::Array<T,N> >;
+
+private:
+  Storage m_storage;    /*!< \brief Matrix with "length" rows and "rows*cols" columns holding all the entries. */
+  Index m_innerSz = 0;  /*!< \brief Size of the inner-most dimension (cols), 0 until resize is called. */
+
+public:
+  C3DContainerDecorator() = default;
+
+  C3DContainerDecorator(Index length, Index rows, Index cols, Scalar value = 0) noexcept {
+    resize(length, rows, cols, value);
+  }
+
+  void resize(Index length, Index rows, Index cols, Scalar value = 0) noexcept {
+    m_innerSz = cols;
+    m_storage.resize(length, rows*cols) = value; /*--- Resize the matrix, then assign "value" to it. ---*/
+  }
+
+  /*!
+   * \brief Container sizes, rows() is 0 while the inner dimension is 0 (instead of dividing by zero).
+   */
+  Index size() const noexcept { return m_storage.size(); }
+  Index length() const noexcept { return m_storage.rows(); }
+  Index rows() const noexcept { return m_innerSz ? m_storage.cols() / m_innerSz : 0; }
+  Index cols() const noexcept { return m_innerSz; }
+
+  /*!
+   * \brief Element-wise access.
+   */
+  Scalar& operator() (Index i, Index j, Index k) noexcept { return m_storage(i, j*m_innerSz + k); }
+  const Scalar& operator() (Index i, Index j, Index k) const noexcept { return m_storage(i, j*m_innerSz + k); }
+
+  /*!
+   * \brief Get a scalar iterator to the inner-most dimension of the container.
+   */
+  FORCEINLINE CInnerIter innerIter(Index i, Index j) const noexcept {
+    return CInnerIter(&m_storage(i, j*m_innerSz), 1); /*--- Starts at entry (i,j,0) and advances by 1. ---*/
+  }
+
+  /*!
+   * \brief Get a SIMD gather iterator to the inner-most dimension of the container.
+   */
+  template<class T, size_t N>
+  FORCEINLINE CInnerIterGather<T,N> innerIter(simd::Array<T,N> i, Index j) const noexcept {
+    return CInnerIterGather<T,N>(m_storage.data(), 1, i*m_storage.cols() + j*m_innerSz);
+  }
+
+  /*!
+   * \brief Return copy of data in a static size container (see C2DContainer::get).
+   * \param[in] i - Outer index.
+   * \param[in] j - Starting middle index for the copy (amount determined by container size).
+   */
+  template<class StaticContainer, class Int>
+  FORCEINLINE StaticContainer get(Int i, Index j = 0) const noexcept {
+    return m_storage.template get<StaticContainer>(i, j*m_innerSz);
+  }
+};
+
+/*!
+ * \brief Some typedefs for the 3D container decorator.
+ */
+using C3DIntMatrix = C3DContainerDecorator<su2matrix<unsigned long> >;
+using C3DDoubleMatrix = C3DContainerDecorator<su2activematrix>;
+
+/*!
+ * \class CVectorOfMatrix
+ * \brief This contrived container is used to store small matrices in a contiguous manner
+ *        but still present the "su2double**" interface to the outside world.
+ *        The "interface" part should be replaced by something more efficient, e.g. a "matrix view".
+ */
+class CVectorOfMatrix: public C3DDoubleMatrix {
+private:
+  su2matrix<Scalar*> interface;  /*!< \brief Entry (i,j) holds the address of element (i,j,0) of the 3D storage. */
+
+public:
+  CVectorOfMatrix() = default;
+
+  CVectorOfMatrix(Index length, Index rows, Index cols, Scalar value = 0) noexcept {
+    resize(length, rows, cols, value);
+  }
+
+  void resize(Index length, Index rows, Index cols, Scalar value = 0) noexcept {
+    C3DDoubleMatrix::resize(length, rows, cols, value);
+    interface.resize(length,rows);
+    for(Index i=0; i<length; ++i)  /*--- Rebuild the pointer table after the storage was resized. ---*/
+      for(Index j=0; j<rows; ++j)
+        interface(i,j) = &(*this)(i,j,0);
+  }
+
+  /*!
+   * \brief Matrix-wise access, returns row i of the pointer table built in resize.
+   */
+  Scalar** operator[] (Index i) noexcept { return interface[i]; }
+  const Scalar* const* operator[] (Index i) const noexcept { return interface[i]; }
+};
+
+/*!
+ * \class C2DDummyLastView
+ * \brief Helper class, adds dummy trailing dimension to a reference of a
+ *        vector object making it a dummy matrix.
+ * \note The constness of the object is derived from the template type, but
+ *       we always keep a reference, never a copy of the associated vector.
+ */
+template<class T>
+struct C2DDummyLastView
+{
+  static_assert(T::IsVector, "This class decorates vectors.");
+  using Index = typename T::Index;
+  using Scalar = typename T::Scalar;
+
+  T& data;  /*!< \brief Reference to the decorated vector (not a copy). */
+
+  C2DDummyLastView() = delete;
+
+  C2DDummyLastView(T& ref) : data(ref) {}
+
+  template<class U = T, su2enable_if<!std::is_const<U>::value> = 0>
+  Scalar& operator() (Index i, Index) noexcept  /*--- Mutable access, only enabled for non-const T. ---*/
+  {
+    return data(i);  /*--- The second (dummy) index is ignored. ---*/
+  }
+
+  const Scalar& operator() (Index i, Index) const noexcept
+  {
+    return data(i);  /*--- The second (dummy) index is ignored. ---*/
+  }
+};
+
+/*!
+ * \class C3DDummyMiddleView
+ * \brief Helper class, adds dummy middle dimension to a reference of a
+ *        matrix object making it a dummy 3D array.
+ * \note The constness of the object is derived from the template type, but
+ *       we always keep a reference, never a copy of the associated matrix.
+ */
+template<class T>
+struct C3DDummyMiddleView
+{
+  static_assert(!T::IsVector, "This class decorates matrices.");
+  using Index = typename T::Index;
+  using Scalar = typename T::Scalar;
+
+  T& data;  /*!< \brief Reference to the decorated matrix (not a copy). */
+
+  C3DDummyMiddleView() = delete;
+
+  C3DDummyMiddleView(T& ref) : data(ref) {}
+
+  template<class U = T, su2enable_if<!std::is_const<U>::value> = 0>
+  Scalar& operator() (Index i, Index, Index k) noexcept  /*--- Mutable access, only enabled for non-const T. ---*/
+  {
+    return data(i,k);  /*--- The middle (dummy) index is ignored. ---*/
+  }
+
+  const Scalar& operator() (Index i, Index, Index k) const noexcept
+  {
+    return data(i,k);  /*--- The middle (dummy) index is ignored. ---*/
+  }
+};
diff --git a/Common/include/geometry/CPhysicalGeometry.hpp b/Common/include/geometry/CPhysicalGeometry.hpp
index bd0934e9329..8cfbd7ae381 100644
--- a/Common/include/geometry/CPhysicalGeometry.hpp
+++ b/Common/include/geometry/CPhysicalGeometry.hpp
@@ -29,7 +29,7 @@
 
 #include "CGeometry.hpp"
 #include "meshreader/CMeshReaderFVM.hpp"
-#include "../toolboxes/C2DContainer.hpp"
+#include "../containers/C2DContainer.hpp"
 
 
 /*!
diff --git a/Common/include/geometry/dual_grid/CEdge.hpp b/Common/include/geometry/dual_grid/CEdge.hpp
index 394c428709b..bdddbfff357 100644
--- a/Common/include/geometry/dual_grid/CEdge.hpp
+++ b/Common/include/geometry/dual_grid/CEdge.hpp
@@ -27,7 +27,7 @@
 
 #pragma once
 
-#include "../../toolboxes/C2DContainer.hpp"
+#include "../../containers/C2DContainer.hpp"
 
 /*!
  * \class CEdge
@@ -35,12 +35,13 @@
  * \author F. Palacios
  */
 class CEdge {
-  static_assert(su2activematrix::Storage == StorageType::RowMajor, "Needed to return normal as pointer.");
-
+  static_assert(su2activematrix::IsRowMajor, "Needed to return normal as pointer.");
 private:
-  su2matrix<unsigned long> Nodes; /*!< \brief Vector to store the node indices of the edge. */
-  su2activematrix Normal;         /*!< \brief Normal (area) of the edge. */
-  su2activematrix Coord_CG;       /*!< \brief Center-of-gravity (mid point) of the edge. */
+  using Index = unsigned long;
+  using NodeArray = C2DContainer<Index, Index, StorageType::ColumnMajor, 64, DynamicSize, 2>;
+  NodeArray Nodes;           /*!< \brief Vector to store the node indices of the edge. */
+  su2activematrix Normal;    /*!< \brief Normal (area) of the edge. */
+  su2activematrix Coord_CG;  /*!< \brief Center-of-gravity (mid point) of the edge. */
 
 public:
   enum NodePosition : unsigned long {LEFT = 0, RIGHT = 1};
@@ -84,6 +85,14 @@ class CEdge {
    */
   inline unsigned long GetNode(unsigned long iEdge, unsigned long iNode) const { return Nodes(iEdge,iNode); }
 
+  /*!
+   * \brief SIMD version of GetNode, iNode returned for multiple contiguous iEdges
+   */
+  template<class T, size_t N>
+  FORCEINLINE simd::Array<T,N> GetNode(simd::Array<T,N> iEdge, unsigned long iNode) const {
+    return simd::Array<T,N>(&Nodes(iEdge[0],iNode));
+  }
+
   /*!
    * \brief Set the node indices of an edge.
    * \param[in] iEdge - Edge index.
@@ -168,6 +177,11 @@ class CEdge {
    */
   inline const su2double* GetNormal(unsigned long iEdge) const { return Normal[iEdge]; }
 
+  /*!
+   * \brief Get the entire matrix of edge normals.
+   */
+  inline const su2activematrix& GetNormal() const { return Normal; }
+
   /*!
    * \brief Initialize normal vector to 0.
    */
diff --git a/Common/include/geometry/dual_grid/CPoint.hpp b/Common/include/geometry/dual_grid/CPoint.hpp
index b211baaa401..82654c34b65 100644
--- a/Common/include/geometry/dual_grid/CPoint.hpp
+++ b/Common/include/geometry/dual_grid/CPoint.hpp
@@ -28,7 +28,8 @@
 
 #pragma once
 
-#include "../../toolboxes/C2DContainer.hpp"
+#include "../../containers/C2DContainer.hpp"
+#include "../../containers/container_decorators.hpp"
 #include "../../toolboxes/graph_toolbox.hpp"
 #include <vector>
 
@@ -137,6 +138,11 @@ class CPoint {
    */
   inline su2double *GetCoord(unsigned long iPoint) { return Coord[iPoint]; }
 
+  /*!
+   * \brief Get the entire matrix of coordinates of the control volumes.
+   */
+  inline const su2activematrix& GetCoord() const { return Coord; }
+
   /*!
    * \brief Set the coordinates for the control volume.
    * \param[in] iPoint - Index of the point.
@@ -189,6 +195,11 @@ class CPoint {
    */
   inline unsigned long GetElem(unsigned long iPoint, unsigned long nelem) const { return Elem.getInnerIdx(iPoint,nelem); }
 
+  /*!
+   * \brief Get inner iterator to loop over neighbor elements.
+   */
+  inline CCompressedSparsePatternL::CInnerIter GetElems(unsigned long iPoint) const { return Elem.getInnerIter(iPoint); }
+
   /*!
    * \brief Set the points that compose the control volume.
    * \param[in] pointsMatrix - List of lists with the neighbor points connected to each point.
@@ -220,6 +231,11 @@ class CPoint {
    */
   inline unsigned long GetPoint(unsigned long iPoint, unsigned long npoint) const { return Point.getInnerIdx(iPoint,npoint); }
 
+  /*!
+   * \brief Get inner iterator to loop over neighbor points.
+   */
+  inline CCompressedSparsePatternUL::CInnerIter GetPoints(unsigned long iPoint) const { return Point.getInnerIter(iPoint); }
+
   /*!
    * \brief Set the edges that compose the control volume.
    * \param[in] iPoint - Index of the point.
@@ -236,6 +252,11 @@ class CPoint {
    */
   inline long GetEdge(unsigned long iPoint, unsigned long nedge) const { return Edge.getInnerIdx(iPoint,nedge); }
 
+  /*!
+   * \brief Get inner iterator to loop over neighbor edges.
+   */
+  inline CCompressedSparsePatternL::CInnerIter GetEdges(unsigned long iPoint) const { return Edge.getInnerIter(iPoint); }
+
   /*!
    * \brief Set the boundary vertex that compose the control volume.
    * \param[in] iPoint - Index of the point.
diff --git a/Common/include/geometry/elements/CGaussVariable.hpp b/Common/include/geometry/elements/CGaussVariable.hpp
index 7a2eba01f1a..8b93224de57 100644
--- a/Common/include/geometry/elements/CGaussVariable.hpp
+++ b/Common/include/geometry/elements/CGaussVariable.hpp
@@ -6,7 +6,7 @@
  *
  * SU2 Project Website: https://su2code.github.io
  *
- * The SU2 Project is maintained by the SU2 Foundation 
+ * The SU2 Project is maintained by the SU2 Foundation
  * (http://su2foundation.org)
  *
  * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
@@ -27,7 +27,7 @@
 
 #pragma once
 
-#include "../../toolboxes/C2DContainer.hpp"
+#include "../../containers/C2DContainer.hpp"
 
 /*!
  * \class CGaussVariable
diff --git a/Common/include/interface_interpolation/CInterpolator.hpp b/Common/include/interface_interpolation/CInterpolator.hpp
index c0c6a71cc5d..6cd7c402aff 100644
--- a/Common/include/interface_interpolation/CInterpolator.hpp
+++ b/Common/include/interface_interpolation/CInterpolator.hpp
@@ -27,7 +27,7 @@
 #pragma once
 
 #include "../../include/basic_types/datatype_structure.hpp"
-#include "../../include/toolboxes/C2DContainer.hpp"
+#include "../../include/containers/C2DContainer.hpp"
 #include <vector>
 
 class CConfig;
diff --git a/Common/include/interface_interpolation/CRadialBasisFunction.hpp b/Common/include/interface_interpolation/CRadialBasisFunction.hpp
index ced01abf60c..6f39475fd36 100644
--- a/Common/include/interface_interpolation/CRadialBasisFunction.hpp
+++ b/Common/include/interface_interpolation/CRadialBasisFunction.hpp
@@ -28,14 +28,13 @@
 
 #include "CInterpolator.hpp"
 #include "../option_structure.hpp"
-#include "../toolboxes/C2DContainer.hpp"
+#include "../containers/C2DContainer.hpp"
 
 /*!
  * \brief Radial basis function interpolation.
  */
 class CRadialBasisFunction final : public CInterpolator {
-  static_assert(su2passivematrix::Storage == StorageType::RowMajor,
-                "This class relies on row major storage throughout.");
+  static_assert(su2passivematrix::IsRowMajor, "This class relies on row major storage throughout.");
 private:
   unsigned long MinDonors = 0, AvgDonors = 0, MaxDonors = 0;
   passivedouble Density = 0.0, AvgCorrection = 0.0, MaxCorrection = 0.0;
diff --git a/Common/include/linear_algebra/CSysMatrix.hpp b/Common/include/linear_algebra/CSysMatrix.hpp
index 14e9e94542f..977c580ea58 100644
--- a/Common/include/linear_algebra/CSysMatrix.hpp
+++ b/Common/include/linear_algebra/CSysMatrix.hpp
@@ -2,7 +2,7 @@
  * \file CSysMatrix.hpp
  * \brief Declaration of the block-sparse matrix class.
  *        The implemtation is in <i>CSysMatrix.cpp</i>.
- * \author F. Palacios, A. Bueno, T. Economon
+ * \author F. Palacios, A. Bueno, T. Economon, P. Gomes
  * \version 7.0.6 "Blackbird"
  *
  * SU2 Project Website: https://su2code.github.io
@@ -30,11 +30,13 @@
 
 #include "../../include/mpi_structure.hpp"
 #include "../../include/omp_structure.hpp"
+#include "../../include/parallelization/vectorization.hpp"
 #include "CSysVector.hpp"
 #include "CPastixWrapper.hpp"
 
 #include <cstdlib>
 #include <vector>
+#include <cassert>
 
 using namespace std;
 
@@ -54,6 +56,20 @@ using namespace std;
 #if defined(__INTEL_COMPILER) && defined(MKL_DIRECT_CALL_SEQ) && !defined(CODI_REVERSE_TYPE)
   #define USE_MKL_LAPACK
 #endif
+template<class T>
+struct mkl_jit_wrapper {
+  using gemm_t = dgemm_jit_kernel_t;
+  template<class... Ts>
+  static void create_gemm(Ts&&... args) { mkl_jit_create_dgemm(args...); }
+  static gemm_t get_gemm(void* jitter) { return mkl_jit_get_dgemm_ptr(jitter); }
+};
+template<>
+struct mkl_jit_wrapper<float> {
+  using gemm_t = sgemm_jit_kernel_t;
+  template<class... Ts>
+  static void create_gemm(Ts&&... args) { mkl_jit_create_sgemm(args...); }
+  static gemm_t get_gemm(void* jitter) { return mkl_jit_get_sgemm_ptr(jitter); }
+};
 #else
   #warning The current version of MKL does not support JIT gemm kernels
 #endif
@@ -65,7 +81,6 @@ class CGeometry;
 /*!
  * \class CSysMatrix
  * \brief Main class for defining block-compressed-row-storage sparse matrices.
- * \author A. Bueno, F. Palacios, P. Gomes
  */
 template<class ScalarType>
 class CSysMatrix {
@@ -115,13 +130,7 @@ class CSysMatrix {
   mutable vector<vector<ScalarType> > LineletVector;       /*!< \brief Solution and RHS of the tri-diag system (working memory). */
 
 #ifdef USE_MKL
-#ifndef USE_MIXED_PRECISION
-  /*--- Double precision kernels. ---*/
-  using gemm_t = dgemm_jit_kernel_t;
-#else
-  /*--- Single precision kernels. ---*/
-  using gemm_t = sgemm_jit_kernel_t;
-#endif
+  using gemm_t = typename mkl_jit_wrapper<ScalarType>::gemm_t;
   void * MatrixMatrixProductJitter;              /*!< \brief Jitter handle for MKL JIT based GEMM. */
   gemm_t MatrixMatrixProductKernel;              /*!< \brief MKL JIT based GEMM kernel. */
   void * MatrixVectorProductJitterBetaZero;      /*!< \brief Jitter handle for MKL JIT based GEMV. */
@@ -444,7 +453,7 @@ class CSysMatrix {
    * \param[in] alpha - Scale factor.
    */
   template<class OtherType, bool Overwrite = true,
-           typename enable_if<!is_pointer<OtherType>::value,bool>::type = 0>
+           su2enable_if<!is_pointer<OtherType>::value> = 0>
   inline void SetBlock(unsigned long block_i, unsigned long block_j,
                        const OtherType *val_block, OtherType alpha = 1.0) {
 
@@ -463,8 +472,7 @@ class CSysMatrix {
    * \param[in] val_block - Block to set to A(i, j).
    * \param[in] alpha - Scale factor.
    */
-  template<class OtherType,
-           typename enable_if<!is_pointer<OtherType>::value,bool>::type = 0>
+  template<class OtherType, su2enable_if<!is_pointer<OtherType>::value> = 0>
   inline void AddBlock(unsigned long block_i, unsigned long block_j,
                        const OtherType *val_block, OtherType alpha = 1.0) {
     SetBlock<OtherType,false>(block_i, block_j, val_block, alpha);
@@ -518,18 +526,17 @@ class CSysMatrix {
 
   /*!
    * \brief Update 4 blocks ii, ij, ji, jj (add to i* sub from j*).
-   * \note The template parameter Sign, can be used create a "subtractive"
-   *       update i.e. subtract from row i and add to row j instead.
-   *       This method assumes an FVM-type sparse pattern.
+   * \note This method assumes an FVM-type sparse pattern.
    * \param[in] edge - Index of edge that connects iPoint and jPoint.
    * \param[in] iPoint - Row to which we add the blocks.
    * \param[in] jPoint - Row from which we subtract the blocks.
    * \param[in] block_i - Adds to ii, subs from ji.
    * \param[in] block_j - Adds to ij, subs from jj.
+   * \param[in] scale - Scale blocks during update (axpy type op).
    */
-  template<class OtherType, int Sign = 1>
+  template<class MatrixType, class OtherType = ScalarType>
   inline void UpdateBlocks(unsigned long iEdge, unsigned long iPoint, unsigned long jPoint,
-                           const OtherType* const* block_i, const OtherType* const* block_j) {
+                           const MatrixType& block_i, const MatrixType& block_j, OtherType scale = 1) {
 
     ScalarType *bii = &matrix[dia_ptr[iPoint]*nVar*nEqn];
     ScalarType *bjj = &matrix[dia_ptr[jPoint]*nVar*nEqn];
@@ -540,10 +547,10 @@ class CSysMatrix {
 
     for (iVar = 0; iVar < nVar; iVar++) {
       for (jVar = 0; jVar < nEqn; jVar++) {
-        bii[offset] += PassiveAssign(block_i[iVar][jVar]) * Sign;
-        bij[offset] += PassiveAssign(block_j[iVar][jVar]) * Sign;
-        bji[offset] -= PassiveAssign(block_i[iVar][jVar]) * Sign;
-        bjj[offset] -= PassiveAssign(block_j[iVar][jVar]) * Sign;
+        bii[offset] += PassiveAssign(block_i[iVar][jVar] * scale);
+        bij[offset] += PassiveAssign(block_j[iVar][jVar] * scale);
+        bji[offset] -= PassiveAssign(block_i[iVar][jVar] * scale);
+        bjj[offset] -= PassiveAssign(block_j[iVar][jVar] * scale);
         ++offset;
       }
     }
@@ -552,25 +559,72 @@ class CSysMatrix {
   /*!
    * \brief Short-hand for the "subtractive" version (sub from i* add to j*) of UpdateBlocks.
    */
-  template<class OtherType>
+  template<class MatrixType>
   inline void UpdateBlocksSub(unsigned long iEdge, unsigned long iPoint, unsigned long jPoint,
-                              const OtherType* const* block_i, const OtherType* const* block_j) {
-    UpdateBlocks<OtherType,-1>(iEdge, iPoint, jPoint, block_i, block_j);
+                              const MatrixType& block_i, const MatrixType& block_j) {
+    UpdateBlocks<MatrixType,ScalarType>(iEdge, iPoint, jPoint, block_i, block_j, -1);
+  }
+
+  /*!
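+   * \brief SIMD version, does the update for multiple edges and points (one edge per SIMD lane).
+   * \note Lanes whose mask is 0 are skipped. Blocks ii and jj are accumulated (scaled by the mask), blocks ij and ji are overwritten.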
+   */
+  template<class MatTypeSIMD, size_t N, class I, class F = ScalarType>
+  FORCEINLINE void UpdateBlocks(simd::Array<I,N> iEdge, simd::Array<I,N> iPoint, simd::Array<I,N> jPoint,
+                                const MatTypeSIMD& block_i, const MatTypeSIMD& block_j, simd::Array<F,N> mask = 1) {
+
+    static_assert(MatTypeSIMD::StaticSize, "This method requires static size blocks.");
+    static_assert(MatTypeSIMD::IsRowMajor, "Block storage is not compatible with matrix.");
+    constexpr size_t blkSz = MatTypeSIMD::StaticSize;
+    assert(blkSz == nVar*nEqn);
+
+    /*--- "Transpose" the blocks, scale, and possibly convert types,
+     * giving the compiler the chance to vectorize all of these. ---*/
+    ScalarType blk_i[N][blkSz], blk_j[N][blkSz];
+
+    for (size_t i=0; i<blkSz; ++i) {
+      SU2_OMP_SIMD
+      for (size_t k=0; k<N; ++k) {
+        blk_i[k][i] = PassiveAssign(-mask[k] * block_i.data()[i][k]);
+        blk_j[k][i] = PassiveAssign(mask[k] * block_j.data()[i][k]);
+      }
+    }
+
+    /*--- Update one by one skipping if mask is 0. ---*/
+    for (size_t k=0; k<N; ++k) {
+      if (mask[k]==0) continue;
+
+      /*--- Fetch the blocks. ---*/
+      auto bii = &matrix[dia_ptr[iPoint[k]]*blkSz];
+      auto bjj = &matrix[dia_ptr[jPoint[k]]*blkSz];
+      auto bij = &matrix[edge_ptr(iEdge[k],0)*blkSz];
+      auto bji = &matrix[edge_ptr(iEdge[k],1)*blkSz];
+
+      /*--- Update, block i was negated during transpose in the
+       * hope the assignments below become non-temporal stores. ---*/
+      SU2_OMP_SIMD
+      for (size_t i=0; i<blkSz; ++i) {
+        bii[i] -= blk_i[k][i];
+        bjj[i] -= blk_j[k][i];
+        bij[i] = blk_j[k][i];
+        bji[i] = blk_i[k][i];
+      }
+    }
   }
 
   /*!
    * \brief Sets 2 blocks ij and ji (add to i* sub from j*) associated with
    *        one edge of an FVM-type sparse pattern.
-   * \note The template parameter Sign, can be used create a "subtractive"
-   *       update i.e. subtract from row i and add to row j instead.
-   *       The parameter Overwrite allows completely writing over the
+   * \note The parameter Overwrite allows completely writing over the
    *       current values held by the matrix (true), or updating them (false).
    * \param[in] edge - Index of edge that connects iPoint and jPoint.
    * \param[in] block_i - Subs from ji.
    * \param[in] block_j - Adds to ij.
+   * \param[in] scale - Scale blocks during update (axpy type op).
    */
-  template<class OtherType, int Sign = 1, bool Overwrite = true>
-  inline void SetBlocks(unsigned long iEdge, const OtherType* const* block_i, const OtherType* const* block_j) {
+  template<class MatrixType, class OtherType = ScalarType, bool Overwrite = true>
+  inline void SetBlocks(unsigned long iEdge, const MatrixType& block_i,
+                        const MatrixType& block_j, OtherType scale = 1) {
 
     ScalarType *bij = &matrix[edge_ptr(iEdge,0)*nVar*nEqn];
     ScalarType *bji = &matrix[edge_ptr(iEdge,1)*nVar*nEqn];
@@ -579,8 +633,8 @@ class CSysMatrix {
 
     for (iVar = 0; iVar < nVar; iVar++) {
       for (jVar = 0; jVar < nEqn; jVar++) {
-        bij[offset] = (Overwrite? ScalarType(0) : bij[offset]) + PassiveAssign(block_j[iVar][jVar]) * Sign;
-        bji[offset] = (Overwrite? ScalarType(0) : bji[offset]) - PassiveAssign(block_i[iVar][jVar]) * Sign;
+        bij[offset] = (Overwrite? ScalarType(0) : bij[offset]) + PassiveAssign(block_j[iVar][jVar] * scale);
+        bji[offset] = (Overwrite? ScalarType(0) : bji[offset]) - PassiveAssign(block_i[iVar][jVar] * scale);
         ++offset;
       }
     }
@@ -589,17 +643,61 @@ class CSysMatrix {
   /*!
    * \brief Short-hand for the "additive overwrite" version of SetBlocks.
    */
-  template<class OtherType>
-  inline void UpdateBlocks(unsigned long iEdge, const OtherType* const* block_i, const OtherType* const* block_j) {
-    SetBlocks<OtherType,1,false>(iEdge, block_i, block_j);
+  template<class MatrixType, class OtherType = ScalarType>
+  inline void UpdateBlocks(unsigned long iEdge, const MatrixType& block_i,
+                           const MatrixType& block_j, OtherType scale = 1) {
+    SetBlocks<MatrixType,OtherType,false>(iEdge, block_i, block_j, scale);
   }
 
   /*!
    * \brief Short-hand for the "subtractive" version (sub from i* add to j*) of SetBlocks.
    */
-  template<class OtherType>
-  inline void UpdateBlocksSub(unsigned long iEdge, const OtherType* const* block_i, const OtherType* const* block_j) {
-    SetBlocks<OtherType,-1,false>(iEdge, block_i, block_j);
+  template<class MatrixType>
+  inline void UpdateBlocksSub(unsigned long iEdge, const MatrixType& block_i, const MatrixType& block_j) {
+    SetBlocks<MatrixType,ScalarType,false>(iEdge, block_i, block_j, -1);
+  }
+
+  /*!
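+   * \brief SIMD version, sets blocks ij and ji for multiple edges (one edge per SIMD lane).
+   * \note Lanes whose mask is 0 are skipped, for the others the blocks are scaled by the mask and overwrite the matrix values.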
+   */
+  template<class MatTypeSIMD, size_t N, class I, class F = ScalarType>
+  FORCEINLINE void SetBlocks(simd::Array<I,N> iEdge, const MatTypeSIMD& block_i,
+                             const MatTypeSIMD& block_j, simd::Array<F,N> mask = 1) {
+
+    static_assert(MatTypeSIMD::StaticSize, "This method requires static size blocks.");
+    static_assert(MatTypeSIMD::IsRowMajor, "Block storage is not compatible with matrix.");
+    constexpr size_t blkSz = MatTypeSIMD::StaticSize;
+    assert(blkSz == nVar*nEqn);
+
+    /*--- "Transpose" the blocks, scale, and possibly convert types,
+     * giving the compiler the chance to vectorize all of these. ---*/
+    ScalarType blk_i[N][blkSz], blk_j[N][blkSz];
+
+    for (size_t i=0; i<blkSz; ++i) {
+      SU2_OMP_SIMD
+      for (size_t k=0; k<N; ++k) {
+        blk_i[k][i] = PassiveAssign(-mask[k] * block_i.data()[i][k]);
+        blk_j[k][i] = PassiveAssign(mask[k] * block_j.data()[i][k]);
+      }
+    }
+
+    /*--- Update one by one skipping if mask is 0. ---*/
+    for (size_t k=0; k<N; ++k) {
+      if (mask[k]==0) continue;
+
+      /*--- Fetch the blocks. ---*/
+      auto bij = &matrix[edge_ptr(iEdge[k],0)*blkSz];
+      auto bji = &matrix[edge_ptr(iEdge[k],1)*blkSz];
+
+      /*--- Update, block i was negated during transpose in the
+       * hope the assignments below become non-temporal stores. ---*/
+      SU2_OMP_SIMD
+      for (size_t i=0; i<blkSz; ++i) {
+        bij[i] = blk_j[k][i];
+        bji[i] = blk_i[k][i];
+      }
+    }
   }
 
   /*!
diff --git a/Common/include/linear_algebra/CSysMatrix.inl b/Common/include/linear_algebra/CSysMatrix.inl
index a2f2093a13e..6b61a49562a 100644
--- a/Common/include/linear_algebra/CSysMatrix.inl
+++ b/Common/include/linear_algebra/CSysMatrix.inl
@@ -4,7 +4,7 @@
  * \note These are the "private" inlines, they are not needed outside
  *       of the .cpp file and so they are hidden to avoid triggering
  *       recompilation of other units when changes are made here.
- * \author F. Palacios, A. Bueno, T. Economon
+ * \author F. Palacios, A. Bueno, T. Economon, P. Gomes
  * \version 7.0.6 "Blackbird"
  *
  * SU2 Project Website: https://su2code.github.io
@@ -186,31 +186,41 @@ FORCEINLINE void CSysMatrix<ScalarType>::InverseDiagonalBlock_ILUMatrix(unsigned
   MatrixInverse(block, invBlock);
 }
 
+template<class ScalarType>
+FORCEINLINE void CSysMatrix<ScalarType>::RowProduct(const CSysVector<ScalarType> & vec,
+                                                    unsigned long row_i, ScalarType *prod) const {
+  for (auto iVar = 0ul; iVar < nVar; iVar++)
+    prod[iVar] = 0.0;
+
+  for (auto index = row_ptr[row_i]; index < row_ptr[row_i+1]; index++) {
+    auto col_j = col_ind[index];
+    MatrixVectorProductAdd(&matrix[index*nVar*nEqn], &vec[col_j*nEqn], prod);
+  }
+}
+
 template<class ScalarType>
 FORCEINLINE void CSysMatrix<ScalarType>::UpperProduct(const CSysVector<ScalarType> & vec, unsigned long row_i,
                                                       unsigned long col_ub, ScalarType *prod) const {
-  unsigned long iVar, index, col_j;
-
-  for (iVar = 0; iVar < nVar; iVar++) prod[iVar] = 0.0;
+  for (auto iVar = 0ul; iVar < nVar; iVar++)
+    prod[iVar] = 0.0;
 
-  for (index = dia_ptr[row_i]+1; index < row_ptr[row_i+1]; index++) {
-    col_j = col_ind[index];
+  for (auto index = dia_ptr[row_i]+1; index < row_ptr[row_i+1]; index++) {
+    auto col_j = col_ind[index];
     if (col_j < col_ub)
-      MatrixVectorProductAdd(&matrix[index*nVar*nVar], &vec[col_j*nVar], prod);
+      MatrixVectorProductAdd(&matrix[index*nVar*nEqn], &vec[col_j*nEqn], prod);
   }
 }
 
 template<class ScalarType>
 FORCEINLINE void CSysMatrix<ScalarType>::LowerProduct(const CSysVector<ScalarType> & vec, unsigned long row_i,
                                                       unsigned long col_lb, ScalarType *prod) const {
-  unsigned long iVar, index, col_j;
-
-  for (iVar = 0; iVar < nVar; iVar++) prod[iVar] = 0.0;
+  for (auto iVar = 0ul; iVar < nVar; iVar++)
+    prod[iVar] = 0.0;
 
-  for (index = row_ptr[row_i]; index < dia_ptr[row_i]; index++) {
-    col_j = col_ind[index];
+  for (auto index = row_ptr[row_i]; index < dia_ptr[row_i]; index++) {
+    auto col_j = col_ind[index];
     if (col_j >= col_lb)
-      MatrixVectorProductAdd(&matrix[index*nVar*nVar], &vec[col_j*nVar], prod);
+      MatrixVectorProductAdd(&matrix[index*nVar*nEqn], &vec[col_j*nEqn], prod);
   }
 }
 
@@ -218,5 +228,5 @@ template<class ScalarType>
 FORCEINLINE void CSysMatrix<ScalarType>::DiagonalProduct(const CSysVector<ScalarType> & vec,
                                                          unsigned long row_i, ScalarType *prod) const {
 
-  MatrixVectorProduct(&matrix[dia_ptr[row_i]*nVar*nVar], &vec[row_i*nVar], prod);
+  MatrixVectorProduct(&matrix[dia_ptr[row_i]*nVar*nEqn], &vec[row_i*nEqn], prod);
 }
diff --git a/Common/include/linear_algebra/CSysSolve.hpp b/Common/include/linear_algebra/CSysSolve.hpp
index daf7df8350e..c44da41d5d1 100644
--- a/Common/include/linear_algebra/CSysSolve.hpp
+++ b/Common/include/linear_algebra/CSysSolve.hpp
@@ -2,7 +2,7 @@
  * \file CSysSolve.hpp
  * \brief Headers for the classes related to linear solvers (CG, FGMRES, etc)
  *        The subroutines and functions are in the <i>CSysSolve.cpp</i> file.
- * \author J. Hicken, F. Palacios, T. Economon
+ * \author J. Hicken, F. Palacios, T. Economon, P. Gomes
  * \version 7.0.6 "Blackbird"
  *
  * SU2 Project Website: https://su2code.github.io
@@ -26,10 +26,9 @@
  * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
  */
 
-
 #pragma once
 
-#include "../mpi_structure.hpp"
+#include "../containers/C2DContainer.hpp"
 
 #include <cmath>
 #include <vector>
@@ -46,8 +45,6 @@ template<class T> class CSysMatrix;
 template<class T> class CMatrixVectorProduct;
 template<class T> class CPreconditioner;
 
-using namespace std;
-
 /*--- Relative tolerance, target residual is tol*||b-Ax||,
  *    Absolute tolerance, target residual is tol*||b||. ---*/
 enum class LinearToleranceType {RELATIVE, ABSOLUTE};
@@ -55,7 +52,6 @@ enum class LinearToleranceType {RELATIVE, ABSOLUTE};
 /*!
  * \class CSysSolve
  * \brief Class for solving linear systems using classical and Krylov-subspace iterative methods
- * \author J. Hicken.
  *
  * The individual solvers could be stand-alone subroutines, but by
  * creating CSysSolve objects we can more easily assign different
@@ -70,14 +66,15 @@ template<class ScalarType>
 class CSysSolve {
 
 public:
-  /*--- Some typedefs for simplicity ---*/
-  typedef CSysVector<ScalarType> VectorType;
-  typedef CSysMatrix<ScalarType> MatrixType;
-  typedef CMatrixVectorProduct<ScalarType> ProductType;
-  typedef CPreconditioner<ScalarType> PrecondType;
+  /*--- Some aliases for simplicity. ---*/
+  using Scalar = ScalarType;
+  using VectorType = CSysVector<ScalarType>;
+  using MatrixType = CSysMatrix<ScalarType>;
+  using ProductType = CMatrixVectorProduct<ScalarType>;
+  using PrecondType = CPreconditioner<ScalarType>;
 
 private:
-
+  const ScalarType eps;      /*!< \brief Machine epsilon used in this class. */
   bool mesh_deform;          /*!< \brief Operate in mesh deformation mode, changes the source of solver options. */
   ScalarType Residual=1e-20; /*!< \brief Residual at the end of a call to Solve or Solve_b. */
   unsigned long Iterations=0;/*!< \brief Iterations done in Solve or Solve_b. */
@@ -95,8 +92,8 @@ class CSysSolve {
   mutable VectorType r_0;    /*!< \brief The "arbitrary" vector in BCGSTAB. */
   mutable VectorType v;      /*!< \brief BCGSTAB "v" vector (v = A * M^-1 * p). */
 
-  mutable vector<VectorType> W;  /*!< \brief Large matrix used by FGMRES, w^i+1 = A * z^i. */
-  mutable vector<VectorType> Z;  /*!< \brief Large matrix used by FGMRES, preconditioned W. */
+  mutable std::vector<VectorType> W;  /*!< \brief Large matrix used by FGMRES, w^i+1 = A * z^i. */
+  mutable std::vector<VectorType> Z;  /*!< \brief Large matrix used by FGMRES, preconditioned W. */
 
   VectorType  LinSysSol_tmp;        /*!< \brief Temporary used when it is necessary to interface between active and passive types. */
   VectorType  LinSysRes_tmp;        /*!< \brief Temporary used when it is necessary to interface between active and passive types. */
@@ -151,8 +148,8 @@ class CSysSolve {
    * \pre the upper Hessenberg matrix has been transformed into a
    * triangular matrix.
    */
-  void SolveReduced(int n, const vector<vector<ScalarType> > & Hsbg,
-                    const vector<ScalarType> & rhs, vector<ScalarType> & x) const;
+  void SolveReduced(int n, const su2matrix<ScalarType>& Hsbg,
+                    const su2vector<ScalarType>& rhs, su2vector<ScalarType>& x) const;
 
   /*!
    * \brief Modified Gram-Schmidt orthogonalization
@@ -171,7 +168,7 @@ class CSysSolve {
    * vector is kept in nrm0 and updated after operating with each vector
    *
    */
-  void ModGramSchmidt(int i, vector<vector<ScalarType> > & Hsbg, vector<VectorType> & w) const;
+  void ModGramSchmidt(int i, su2matrix<ScalarType>& Hsbg, std::vector<VectorType> & w) const;
 
   /*!
    * \brief writes header information for a CSysSolve residual history
@@ -181,7 +178,7 @@ class CSysSolve {
    *
    * \pre the ostream object os should be open
    */
-  void WriteHeader(string solver, ScalarType restol, ScalarType resinit) const;
+  void WriteHeader(std::string solver, ScalarType restol, ScalarType resinit) const;
 
   /*!
    * \brief writes residual convergence data for one iteration to a stream
@@ -198,7 +195,7 @@ class CSysSolve {
    * \param[in] iter - current iteration
    * \param[in] res - the residual norm
    */
-  void WriteFinalResidual(string solver, unsigned long iter, ScalarType res) const;
+  void WriteFinalResidual(std::string solver, unsigned long iter, ScalarType res) const;
 
   /*!
    * \brief writes the convergence warning
@@ -209,17 +206,77 @@ class CSysSolve {
   void WriteWarning(ScalarType res_calc, ScalarType res_true, ScalarType tol) const;
 
   /*!
-   * \brief Used by Solve for compatibility between passive and active CSysVector, see specializations.
+   * \brief Used by Solve for compatibility between passive and active CSysVector.
+   * \note Same type specialization, temporary variables are not required.
+   * \param[in] LinSysRes - Linear system residual
+   * \param[in,out] LinSysSol - Linear system solution
+   */
+  template<class OtherType, su2enable_if<std::is_same<ScalarType,OtherType>::value> = 0>
+  void HandleTemporariesIn(const CSysVector<OtherType>& LinSysRes, CSysVector<OtherType>& LinSysSol) {
+
+    /*--- Set the pointers. ---*/
+    SU2_OMP_MASTER {
+      LinSysRes_ptr = &LinSysRes;
+      LinSysSol_ptr = &LinSysSol;
+    }
+    SU2_OMP_BARRIER
+  }
+
+  /*!
+   * \brief Used by Solve for compatibility between passive and active CSysVector.
+   * \note Different type specialization, copy data into temporary solution and residual vectors.
    * \param[in] LinSysRes - Linear system residual
    * \param[in,out] LinSysSol - Linear system solution
    */
-  void HandleTemporariesIn(const CSysVector<su2double> & LinSysRes, CSysVector<su2double> & LinSysSol);
+  template<class OtherType, su2enable_if<!std::is_same<ScalarType,OtherType>::value> = 0>
+  void HandleTemporariesIn(const CSysVector<OtherType>& LinSysRes, CSysVector<OtherType>& LinSysSol) {
+
+    /*--- Copy data, the solution is also copied as it serves as initial condition. ---*/
+    LinSysRes_tmp.PassiveCopy(LinSysRes);
+    LinSysSol_tmp.PassiveCopy(LinSysSol);
+
+    /*--- Set the pointers. ---*/
+    SU2_OMP_MASTER {
+      LinSysRes_ptr = &LinSysRes_tmp;
+      LinSysSol_ptr = &LinSysSol_tmp;
+    }
+    SU2_OMP_BARRIER
+  }
+
+  /*!
+   * \brief Used by Solve for compatibility between passive and active CSysVector.
+   * \note Same type specialization, temporary variables are not required.
+   * \param[out] LinSysSol - Linear system solution
+   */
+  template<class OtherType, su2enable_if<std::is_same<ScalarType,OtherType>::value> = 0>
+  void HandleTemporariesOut(CSysVector<OtherType>& LinSysSol) {
+
+    /*--- Reset the pointers. ---*/
+    SU2_OMP_MASTER {
+      LinSysRes_ptr = nullptr;
+      LinSysSol_ptr = nullptr;
+    }
+    SU2_OMP_BARRIER
+  }
 
   /*!
-   * \brief Used by Solve for compatibility between passive and active CSysVector, see specializations.
+   * \brief Used by Solve for compatibility between passive and active CSysVector.
+   * \note Different type specialization, copy data from the temporary solution vector.
    * \param[out] LinSysSol - Linear system solution
    */
-  void HandleTemporariesOut(CSysVector<su2double> & LinSysSol);
+  template<class OtherType, su2enable_if<!std::is_same<ScalarType,OtherType>::value> = 0>
+  void HandleTemporariesOut(CSysVector<OtherType>& LinSysSol) {
+
+    /*--- Copy data, only the temporary solution needs to be copied. ---*/
+    LinSysSol.PassiveCopy(LinSysSol_tmp);
+
+    /*--- Reset the pointers. ---*/
+    SU2_OMP_MASTER {
+      LinSysRes_ptr = nullptr;
+      LinSysSol_ptr = nullptr;
+    }
+    SU2_OMP_BARRIER
+  }
 
 public:
 
diff --git a/Common/include/linear_algebra/CSysVector.hpp b/Common/include/linear_algebra/CSysVector.hpp
index 08c3fdaf0b6..d36559fae3e 100644
--- a/Common/include/linear_algebra/CSysVector.hpp
+++ b/Common/include/linear_algebra/CSysVector.hpp
@@ -1,8 +1,8 @@
 /*!
  * \file CSysVector.hpp
- * \brief Declararion of the vector class used in the solution of
- *        large, distributed, sparse linear systems.
- * \author F. Palacios, J. Hicken, T. Economon
+ * \brief Declaration and inlines of the vector class used in the
+ * solution of large, distributed, sparse linear systems.
+ * \author P. Gomes, F. Palacios, J. Hicken, T. Economon
  * \version 7.0.6 "Blackbird"
  *
  * SU2 Project Website: https://su2code.github.io
@@ -28,115 +28,129 @@
 
 #pragma once
 
-#include <cmath>
-#include <cstdlib>
+#include "../mpi_structure.hpp"
+#include "../omp_structure.hpp"
+#include "../parallelization/vectorization.hpp"
+#include "vector_expressions.hpp"
 
 /*!
- * \class CSysVector
- * \brief Class for holding and manipulating vectors needed by linear solvers
- * \author J. Hicken.
- *
- * We could use the STL vector as a base class here, but this gives us
- * more flexibility with the underlying data (e.g. we may decide to
- * use a block storage scheme rather than a continuous storage scheme).
+ * \brief OpenMP worksharing construct used in CSysVector for loops.
+ * \note The loop will only run in parallel if methods are called from a
+ * parallel region (if not the results will still be correct).
+ * Static schedule to reduce overhead, chunk size determined at initialization.
+ * "nowait" clause is safe when calling CSysVector methods after each other
+ * as the loop size is the same. Methods of other classes that operate on a
+ * CSysVector and do not have the same work scheduling must use a
+ * SU2_OMP_BARRIER before using the vector.
  */
-template<class ScalarType>
-class CSysVector {
+#ifdef HAVE_OMP
+#ifdef HAVE_OMP_SIMD
+#define CSYSVEC_PARFOR SU2_OMP(for simd schedule(static,omp_chunk_size) nowait)
+#else
+#define CSYSVEC_PARFOR SU2_OMP(for schedule(static,omp_chunk_size) nowait)
+#endif
+#else
+#define CSYSVEC_PARFOR SU2_OMP_SIMD
+#endif
 
-private:
-  enum { OMP_MAX_SIZE = 4096 };   /*!< \brief Maximum chunk size used in parallel for loops. */
+/*!
+ * \class CSysVector
+ * \brief Class for holding and manipulating vectors needed by linear solvers.
+ */
+template <class ScalarType>
+class CSysVector : public VecExpr::CVecExpr<CSysVector<ScalarType>, ScalarType> {
+ private:
+  enum { OMP_MAX_SIZE = 4096 }; /*!< \brief Maximum chunk size used in parallel for loops. */
 
-  unsigned long omp_chunk_size;   /*!< \brief Static chunk size used in loop, determined at initialization. */
-  ScalarType* vec_val;            /*!< \brief storage for the element values, 64 byte aligned (do not use normal new/delete) */
-  unsigned long nElm;             /*!< \brief total number of elements (or number elements on this processor) */
-  unsigned long nElmDomain;       /*!< \brief total number of elements (or number elements on this processor without Ghost cells) */
-  unsigned long nVar;             /*!< \brief number of elements in a block */
-  mutable ScalarType dotRes;      /*!< \brief result of dot product. to perform a reduction with OpenMP the
-                                              variable needs to be declared outside the parallel region */
+  unsigned long omp_chunk_size = OMP_MAX_SIZE; /*!< \brief Static chunk size used in loops. */
+  ScalarType* vec_val = nullptr;               /*!< \brief Storage, 64 byte aligned (do not use normal new/delete). */
+  unsigned long nElm = 0;          /*!< \brief Total number of elements (or number elements on this processor). */
+  unsigned long nElmDomain = 0;    /*!< \brief Total number of elements without Ghost cells. */
+  unsigned long nVar = 0;          /*!< \brief Number of elements in a block. */
+  mutable ScalarType dotRes = 0.0; /*!< \brief Result of dot product. To perform a reduction with OpenMP the
+                                               variable needs to be declared outside the parallel region. */
 
   /*!
    * \brief Generic initialization from a scalar or array.
    * \note If val==nullptr vec_val is not initialized, only allocated.
-   * \param[in] numBlk - number of blocks locally
-   * \param[in] numBlkDomain - number of blocks locally (without ghost cells)
-   * \param[in] numVar - number of variables in each block
-   * \param[in] val - default value for elements
-   * \param[in] valIsArray - if true val is treated as array
-   */
-  void Initialize(unsigned long numBlk, unsigned long numBlkDomain, unsigned long numVar,
-                  const ScalarType* val, bool valIsArray);
+   * \param[in] numBlk - Number of blocks locally.
+   * \param[in] numBlkDomain - Number of blocks locally (without ghost cells).
+   * \param[in] numVar - Number of variables in each block.
+   * \param[in] val - Default value for elements.
+   * \param[in] valIsArray - If true val is treated as array.
+   * \param[in] errorIfParallel - Throw error if within parallel region (all ctors except the default one do this).
+   */
+  void Initialize(unsigned long numBlk, unsigned long numBlkDomain, unsigned long numVar, const ScalarType* val,
+                  bool valIsArray, bool errorIfParallel = true);
+
+  /*!
+   * \brief Helper to unpack (transpose) and scale a SIMD input block: out[k][i] = mask[k] * in[i][k].
+   */
+  template <size_t N, size_t nVar, class VecTypeSIMD, class F>
+  FORCEINLINE static void UnpackBlock(const VecTypeSIMD& in, simd::Array<F, N> mask, ScalarType out[][nVar]) {
+    static_assert(VecTypeSIMD::StaticSize, "This method requires static size vectors.");
+    for (size_t i = 0; i < nVar; ++i) {
+      SU2_OMP_SIMD
+      for (size_t k = 0; k < N; ++k) out[k][i] = mask[k] * in[i][k];
+    }
+  }
 
-public:
+ public:
+  static constexpr bool StoreAsRef = true; /*!< \brief Required by CVecExpr. */
 
   /*!
-   * \brief default constructor of the class.
+   * \brief Default constructor of the class.
    */
-  CSysVector(void);
+  CSysVector() = default;
 
   /*!
-   * \brief constructor of the class.
-   * \param[in] size - number of elements locally
-   * \param[in] val - default value for elements
+   * \brief Destructor
    */
-  CSysVector(unsigned long size, ScalarType val = 0.0) {
-    nElm = 0; vec_val = nullptr;
-    Initialize(size, size, 1, &val, false);
-  }
+  ~CSysVector();
 
   /*!
-   * \brief constructor of the class.
-   * \param[in] numBlk - number of blocks locally
-   * \param[in] numBlkDomain - number of blocks locally (without g cells)
-   * \param[in] numVar - number of variables in each block
-   * \param[in] val - default value for elements
+   * \brief Construct from size and value.
+   * \param[in] size - Number of elements locally.
+   * \param[in] val - Default value for elements.
+   */
+  CSysVector(unsigned long size, ScalarType val = 0.0) { Initialize(size, size, 1, &val, false); }
+
+  /*!
+   * \brief Construct from size and value (block version).
+   * \param[in] numBlk - Number of blocks locally.
+   * \param[in] numBlkDomain - Number of blocks locally (without ghost cells).
+   * \param[in] numVar - Number of variables in each block.
+   * \param[in] val - Default value for elements.
    */
   CSysVector(unsigned long numBlk, unsigned long numBlkDomain, unsigned long numVar, ScalarType val = 0.0) {
-    nElm = 0; vec_val = nullptr;
     Initialize(numBlk, numBlkDomain, numVar, &val, false);
   }
 
   /*!
-   * \brief constructor from array
-   * \param[in] size - number of elements locally
-   * \param[in] u_array - vector stored as array being copied
+   * \brief Construct from array.
+   * \param[in] size - Number of elements locally.
+   * \param[in] u_array - Vector stored as array being copied.
    */
-  explicit CSysVector(unsigned long size, const ScalarType* u_array) {
-    nElm = 0; vec_val = nullptr;
-    Initialize(size, size, 1, u_array, true);
-  }
+  explicit CSysVector(unsigned long size, const ScalarType* u_array) { Initialize(size, size, 1, u_array, true); }
 
   /*!
-   * \brief constructor from array
+   * \brief Constructor from array (block version).
    * \param[in] numBlk - number of blocks locally
    * \param[in] numBlkDomain - number of blocks locally (without g cells)
    * \param[in] numVar - number of variables in each block
    * \param[in] u_array - vector stored as array being copied
    */
-  explicit CSysVector(unsigned long numBlk, unsigned long numBlkDomain, unsigned long numVar, const ScalarType* u_array) {
-    nElm = 0; vec_val = nullptr;
+  explicit CSysVector(unsigned long numBlk, unsigned long numBlkDomain, unsigned long numVar,
+                      const ScalarType* u_array) {
     Initialize(numBlk, numBlkDomain, numVar, u_array, true);
   }
 
   /*!
-   * \brief copy constructor of the class.
-   * \param[in] u - CSysVector that is being copied
-   */
-  CSysVector(const CSysVector & u) {
-    nElm = 0; vec_val = nullptr;
-    Initialize(u.GetNBlk(), u.GetNBlkDomain(), u.nVar, u.vec_val, true);
-  }
-
-  /*!
-   * \brief Set our values (resizing if required) by copying from other, the derivative information is lost.
-   * \param[in] other - source CSysVector
+   * \brief Copy constructor of the class.
+   * \note Not defined for expressions because we do not know their sizes.
+   * \param[in] u - Vector being copied.
    */
-  template<class T>
-  void PassiveCopy(const CSysVector<T>& other);
-
-  /*!
-   * \brief class destructor
-   */
-  ~CSysVector();
+  CSysVector(const CSysVector& u) { Initialize(u.GetNBlk(), u.GetNBlkDomain(), u.nVar, u.vec_val, true); }
 
   /*!
    * \brief Initialize the class with a scalar.
@@ -146,7 +160,7 @@ class CSysVector {
    * \param[in] val - default value for elements
    */
   void Initialize(unsigned long numBlk, unsigned long numBlkDomain, unsigned long numVar, ScalarType val = 0.0) {
-    Initialize(numBlk, numBlkDomain, numVar, &val, false);
+    Initialize(numBlk, numBlkDomain, numVar, &val, false, false);
   }
 
   /*!
@@ -158,7 +172,27 @@ class CSysVector {
    * \param[in] ptr - pointer to data with which to initialize the vector
    */
   void Initialize(unsigned long numBlk, unsigned long numBlkDomain, unsigned long numVar, const ScalarType* ptr) {
-    Initialize(numBlk, numBlkDomain, numVar, ptr, true);
+    Initialize(numBlk, numBlkDomain, numVar, ptr, true, false);
+  }
+
+  /*!
+   * \brief Set our values (resizing if required) by copying from other, the derivative information is lost.
+   * \param[in] other - Source CSysVector, its scalar type (T) may differ from ours.
+   */
+  template <class T>
+  void PassiveCopy(const CSysVector<T>& other) {
+    /*--- This is a method and not the overload of an operator to make sure who
+     * calls it knows the consequence to the derivative information (lost) ---*/
+
+    /*--- Check for self-assignment (compared as void* because T may differ), otherwise perform deep copy. ---*/
+    if (static_cast<const void*>(this) == static_cast<const void*>(&other)) return;
+
+    SU2_OMP_MASTER
+    Initialize(other.GetNBlk(), other.GetNBlkDomain(), other.GetNVar(), nullptr, true, false);
+    SU2_OMP_BARRIER
+
+    CSYSVEC_PARFOR
+    for (auto i = 0ul; i < nElm; i++) vec_val[i] = SU2_TYPE::GetValue(other[i]);
+  }
 
   /*!
@@ -179,191 +213,234 @@ class CSysVector {
   /*!
    * \brief return the number of blocks (typically number of nodes locally)
    */
-  inline unsigned long GetNBlk() const { return nElm/nVar; }
+  inline unsigned long GetNBlk() const { return nElm / nVar; }
 
   /*!
    * \brief return the number of blocks (typically number of nodes locally)
    */
-  inline unsigned long GetNBlkDomain() const { return nElmDomain/nVar; }
+  inline unsigned long GetNBlkDomain() const { return nElmDomain / nVar; }
 
   /*!
-   * \brief set calling CSysVector to scaling of another CSysVector
-   * \param[in] a - scalar factor for x
-   * \param[in] x - CSysVector that is being scaled
+   * \brief Access operator with assignment permitted.
+   * \param[in] i - Local index to access.
+   * \return Value at position i.
    */
-  void Equals_AX(ScalarType a, const CSysVector & x);
+  inline ScalarType& operator[](unsigned long i) { return vec_val[i]; }
+  inline const ScalarType& operator[](unsigned long i) const { return vec_val[i]; }
 
   /*!
-   * \brief adds a scaled CSysVector to calling CSysVector
-   * \param[in] a - scalar factor for x
-   * \param[in] x - CSysVector that is being scaled
+   * \brief Iterators for range for loops.
    */
-  void Plus_AX(ScalarType a, const CSysVector & x);
+  inline const ScalarType* begin() const { return vec_val; }
+  inline const ScalarType* end() const { return vec_val + nElm; }
 
   /*!
-   * \brief general linear combination of two CSysVectors
-   * \param[in] a - scalar factor for x
-   * \param[in] x - first CSysVector in linear combination
-   * \param[in] b - scalar factor for y
-   * \param[in] y - second CSysVector in linear combination
+   * \brief Access operator with assignment permitted (block version).
+   * \param[in] iPoint - Index of block.
+   * \param[in] iVar - Index of variable.
+   * \return Value at position (iPoint, iVar).
    */
-  void Equals_AX_Plus_BY(ScalarType a, const CSysVector & x, ScalarType b, const CSysVector & y);
+  inline ScalarType& operator()(unsigned long iPoint, unsigned long iVar) { return vec_val[iPoint * nVar + iVar]; }
+  inline const ScalarType& operator()(unsigned long iPoint, unsigned long iVar) const {
+    return vec_val[iPoint * nVar + iVar];
+  }
 
   /*!
-   * \brief assignment operator with deep copy
-   * \param[in] u - CSysVector whose values are being assigned
+   * \brief Assignment operator from another vector.
+   * \note Does not resize as it is meant for use in parallel.
+   * \param[in] other - Another vector.
    */
-  CSysVector & operator=(const CSysVector & u);
+  CSysVector& operator=(const CSysVector& other) {
+    CSYSVEC_PARFOR
+    for (auto i = 0ul; i < nElm; ++i) vec_val[i] = other.vec_val[i];
+    return *this;
+  }
 
   /*!
-   * \brief CSysVector=su2double assignment operator
-   * \param[in] val - value assigned to each element of CSysVector
-   */
-  CSysVector & operator=(ScalarType val);
+   * \brief Compound assignment operations with scalars and expressions.
+   * \param[in] val/expr - Scalar value or expression.
+   */
+#define MAKE_COMPOUND(OP)                                                 \
+  CSysVector& operator OP(ScalarType val) {                               \
+    CSYSVEC_PARFOR                                                        \
+    for (auto i = 0ul; i < nElm; ++i) vec_val[i] OP val;                  \
+    return *this;                                                         \
+  }                                                                       \
+  template <class T>                                                      \
+  CSysVector& operator OP(const VecExpr::CVecExpr<T, ScalarType>& expr) { \
+    CSYSVEC_PARFOR                                                        \
+    for (auto i = 0ul; i < nElm; ++i) vec_val[i] OP expr.derived()[i];    \
+    return *this;                                                         \
+  }
+  MAKE_COMPOUND(=)
+  MAKE_COMPOUND(+=)
+  MAKE_COMPOUND(-=)
+  MAKE_COMPOUND(*=)
+  MAKE_COMPOUND(/=)
+#undef MAKE_COMPOUND
 
   /*!
    * \brief Sets to zero all the entries of the vector.
    */
-  inline void SetValZero(void) { *this = ScalarType(0.0); }
-
-  /*!
-   * \brief compound addition-assignment operator
-   * \param[in] u - CSysVector being added to calling object
-   */
-  CSysVector & operator+=(const CSysVector & u);
-
-  /*!
-   * \brief compound subtraction-assignment operator
-   * \param[in] u - CSysVector being subtracted from calling object
-   */
-  CSysVector & operator-=(const CSysVector & u);
-
-  /*!
-   * \brief compound scalar multiplication-assignment operator
-   * \param[in] val - value to multiply calling object by
-   */
-  CSysVector & operator*=(ScalarType val);
-
-  /*!
-   * \brief compound scalar division-assignment operator
-   * \param[in] val - value to divide elements of calling object by
-   */
-  CSysVector & operator/=(ScalarType val);
-
-  /*!
-   * \brief Dot product between "this" and another vector
-   * \param[in] u - Another vector.
-   * \return result of dot product
-   */
-  ScalarType dot(const CSysVector & u) const;
+  inline void SetValZero(void) { *this = ScalarType(0); }
+
+  /*!
+   * \brief Dot product between "this" and an expression.
+   * \param[in] expr - Expression, only its first nElmDomain entries are used.
+   * \return Result of dot product (summed over MPI ranks if built with MPI and nElm != nElmDomain).
+   */
+  template <class T>
+  ScalarType dot(const VecExpr::CVecExpr<T, ScalarType>& expr) const {
+    /*--- All threads get the same "view" of the vectors and shared variable (each one resets it). ---*/
+    SU2_OMP_BARRIER
+    dotRes = 0.0;
+    SU2_OMP_BARRIER
+
+    /*--- Local dot product for each thread. ---*/
+    ScalarType sum = 0.0;
+
+    CSYSVEC_PARFOR
+    for (auto i = 0ul; i < nElmDomain; ++i) {
+      sum += vec_val[i] * expr.derived()[i];
+    }
+
+    /*--- Update shared variable with "our" partial sum. ---*/
+    atomicAdd(sum, dotRes);
+
+#ifdef HAVE_MPI
+    /*--- Reduce across all mpi ranks, only master thread communicates.
+     * The nElm condition is to allow vectors to also be used locally. ---*/
+    if (nElm != nElmDomain) {
+      SU2_OMP_BARRIER
+      SU2_OMP_MASTER {
+        sum = dotRes; /*--- Reuse "sum" as the send buffer. ---*/
+        const auto mpi_type = (sizeof(ScalarType) < sizeof(double)) ? MPI_FLOAT : MPI_DOUBLE;
+        SelectMPIWrapper<ScalarType>::W::Allreduce(&sum, &dotRes, 1, mpi_type, MPI_SUM, MPI_COMM_WORLD);
+      }
+    }
+#endif
+    /*--- Make view of result consistent across threads. ---*/
+    SU2_OMP_BARRIER
+
+    return dotRes;
+  }
 
   /*!
-   * \brief squared L2 norm of the vector (via dot with self)
-   * \return squared L2 norm
+   * \brief Squared L2 norm of the vector (via dot with self).
+   * \return Squared L2 norm.
    */
   inline ScalarType squaredNorm() const { return dot(*this); }
 
   /*!
-   * \brief L2 norm of the vector
-   * \return L2 norm
+   * \brief L2 norm of the vector.
+   * \return L2 norm.
    */
   inline ScalarType norm() const { return sqrt(squaredNorm()); }
 
   /*!
-   * \brief indexing operator with assignment permitted
-   * \param[in] i = local index to access
+   * \brief Get pointer to a block.
+   * \param[in] iPoint - Index of block.
+   * \return Pointer to start of block.
    */
-  inline ScalarType & operator[](unsigned long i) { return vec_val[i]; }
+  inline ScalarType* GetBlock(unsigned long iPoint) { return &vec_val[iPoint * nVar]; }
+  inline const ScalarType* GetBlock(unsigned long iPoint) const { return &vec_val[iPoint * nVar]; }
 
   /*!
-   * \brief indexing operator with assignment not permitted
-   * \param[in] i = local index to access
+   * \brief Set the values to zero for one block.
+   * \param[in] iPoint - Index of the block being set to zero.
    */
-  inline const ScalarType & operator[](unsigned long i) const { return vec_val[i]; }
-
-  /*!
-   * \brief copies the contents of the calling CSysVector into an array
-   * \param[out] u_array - array into which information is being copied
-   * \pre u_array must be allocated and have the same size as CSysVector
-   */
-  void CopyToArray(ScalarType* u_array) const;
-
-  /*!
-   * \brief Subtract val_residual to the residual.
-   * \param[in] val_ipoint - index of the point where subtract the residual.
-   * \param[in] val_residual - Value to subtract to the residual.
-   */
-  inline void SubtractBlock(unsigned long val_ipoint, const ScalarType *val_residual) {
-    for (auto iVar = 0ul; iVar < nVar; iVar++)
-      vec_val[val_ipoint*nVar+iVar] -= val_residual[iVar];
+  inline void SetBlock_Zero(unsigned long iPoint) {
+    for (auto iVar = 0ul; iVar < nVar; iVar++) vec_val[iPoint * nVar + iVar] = 0.0;
   }
 
   /*!
-   * \brief Add val_residual to the residual.
-   * \param[in] val_ipoint - index of the point where add the residual.
-   * \param[in] val_residual - Value to add to the residual.
+   * \brief Set "block" to the vector.
+   * \note Template param Overwrite can be set to false to update existing values.
+   * \param[in] iPoint - index of the point where set the residual.
+   * \param[in] block - Value to set to the residual.
+   * \param[in] alpha - Scale factor (axpy-type operation).
    */
-  inline void AddBlock(unsigned long val_ipoint, const ScalarType *val_residual) {
-    for (auto iVar = 0ul; iVar < nVar; iVar++)
-      vec_val[val_ipoint*nVar+iVar] += val_residual[iVar];
+  template <class VectorType, bool Overwrite = true>
+  FORCEINLINE void SetBlock(unsigned long iPoint, const VectorType& block, ScalarType alpha = 1) {
+    if (Overwrite) {
+      for (auto i = 0ul; i < nVar; ++i) vec_val[iPoint * nVar + i] = alpha * block[i];
+    } else {
+      for (auto i = 0ul; i < nVar; ++i) vec_val[iPoint * nVar + i] += alpha * block[i];
+    }
   }
 
   /*!
-   * \brief Set val_residual to the residual.
-   * \param[in] val_ipoint - index of the point where set the residual.
-   * \param[in] val_var - inde of the residual to be set.
-   * \param[in] val_residual - Value to set to the residual.
+   * \brief Add "block" to the vector, see SetBlock.
    */
-  inline void SetBlock(unsigned long val_ipoint, unsigned long val_var, ScalarType val_residual) {
-    vec_val[val_ipoint*nVar+val_var] = val_residual;
+  template <class VectorType>
+  FORCEINLINE void AddBlock(unsigned long iPoint, const VectorType& block, ScalarType alpha = 1) {
+    SetBlock<VectorType, false>(iPoint, block, alpha);
   }
 
   /*!
-   * \brief Set val_residual to the residual.
-   * \param[in] val_ipoint - index of the point where set the residual.
-   * \param[in] val_residual - Value to set to the residual.
+   * \brief Subtract "block" from the vector, see AddBlock.
    */
-  inline void SetBlock(unsigned long val_ipoint, const ScalarType *val_residual) {
-    for (auto iVar = 0ul; iVar < nVar; iVar++)
-      vec_val[val_ipoint*nVar+iVar] = val_residual[iVar];
+  template <class VectorType>
+  FORCEINLINE void SubtractBlock(unsigned long iPoint, const VectorType& block) {
+    AddBlock(iPoint, block, -1);
   }
 
   /*!
-   * \brief Set the residual to zero.
-   * \param[in] val_ipoint - index of the point where set the residual.
+   * \brief Add to iPoint, subtract from jPoint.
    */
-  inline void SetBlock_Zero(unsigned long val_ipoint) {
-    for (auto iVar = 0ul; iVar < nVar; iVar++)
-      vec_val[val_ipoint*nVar+iVar] = 0.0;
+  template <class VectorType>
+  FORCEINLINE void UpdateBlocks(unsigned long iPoint, unsigned long jPoint, const VectorType& block,
+                                ScalarType alpha = 1) {
+    AddBlock(iPoint, block, alpha);
+    AddBlock(jPoint, block, -alpha);
   }
 
   /*!
-   * \brief Set the velocity residual to zero.
-   * \param[in] val_ipoint - index of the point where set the residual.
-   * \param[in] val_var - inde of the residual to be set.
-   */
-  inline void SetBlock_Zero(unsigned long val_ipoint, unsigned long val_var) {
-    vec_val[val_ipoint*nVar+val_var] = 0.0;
+   * \brief Vectorized version of SetBlock, sets multiple iPoint's.
+   * \param[in] iPoint - SIMD integer, the positions to update.
+   * \param[in] vector - Vector of SIMD scalars.
+   * \param[in] mask - Optional scale factor (axpy type operation).
+   * \note Nothing is updated if the mask is 0.
+   */
+  template <size_t N, class T, class VecTypeSIMD, class F = ScalarType>
+  FORCEINLINE void SetBlock(simd::Array<T, N> iPoint, const VecTypeSIMD& vector, simd::Array<F, N> mask = 1) {
+    /*--- "Transpose" and scale input vector. ---*/
+    constexpr size_t nVar = VecTypeSIMD::StaticSize;
+    assert(nVar == this->nVar);
+    ScalarType vec[N][nVar];
+    UnpackBlock(vector, mask, vec);
+
+    /*--- Update one by one skipping if mask is 0. ---*/
+    for (size_t k = 0; k < N; ++k) {
+      if (mask[k] == 0) continue;
+      SU2_OMP_SIMD
+      for (size_t i = 0; i < nVar; ++i) vec_val[iPoint[k] * nVar + i] = vec[k][i];
+    }
   }
 
   /*!
-   * \brief Get the value of the residual.
-   * \param[in] val_ipoint - index of the point where set the residual.
-   * \return Pointer to the residual.
-   */
-  inline ScalarType *GetBlock(unsigned long val_ipoint) { return &vec_val[val_ipoint*nVar]; }
-
-  /*!
-   * \brief Get the value of the residual.
-   * \param[in] val_ipoint - index of the point where set the residual.
-   * \param[in] val_var - inde of the residual to be set.
-   * \return Value of the residual.
-   */
-  inline const ScalarType& operator() (unsigned long val_ipoint, unsigned long val_var) const {
-    return vec_val[val_ipoint*nVar+val_var];
-  }
-  inline ScalarType& operator() (unsigned long val_ipoint, unsigned long val_var) {
-    return vec_val[val_ipoint*nVar+val_var];
+   * \brief Vectorized version of UpdateBlocks, updates multiple i/jPoint's.
+   * \note See SIMD overload of SetBlock.
+   */
+  template <size_t N, class T, class VecTypeSIMD, class F = ScalarType>
+  FORCEINLINE void UpdateBlocks(simd::Array<T, N> iPoint, simd::Array<T, N> jPoint, const VecTypeSIMD& vector,
+                                simd::Array<F, N> mask = 1) {
+    /*--- "Transpose" and scale input vector. ---*/
+    constexpr size_t nVar = VecTypeSIMD::StaticSize;
+    assert(nVar == this->nVar);
+    ScalarType vec[N][nVar];
+    UnpackBlock(vector, mask, vec);
+
+    /*--- Update one by one skipping if mask is 0. ---*/
+    for (size_t k = 0; k < N; ++k) {
+      if (mask[k] == 0) continue;
+      SU2_OMP_SIMD
+      for (size_t i = 0; i < nVar; ++i) {
+        vec_val[iPoint[k] * nVar + i] += vec[k][i];
+        vec_val[jPoint[k] * nVar + i] -= vec[k][i];
+      }
+    }
   }
-
 };
+
+#undef CSYSVEC_PARFOR
diff --git a/Common/include/linear_algebra/vector_expressions.hpp b/Common/include/linear_algebra/vector_expressions.hpp
new file mode 100644
index 00000000000..e82d1f383a7
--- /dev/null
+++ b/Common/include/linear_algebra/vector_expressions.hpp
@@ -0,0 +1,210 @@
+/*!
+ * \file vector_expressions.hpp
+ * \brief Expression templates for vector types with coefficient-wise operations.
+ * \author P. Gomes
+ * \version 7.0.6 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../basic_types/datatype_structure.hpp"
+#include <type_traits>
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <cmath>
+
+namespace VecExpr {
+
+/*!
+ * \brief Base vector expression class.
+ * \tparam Derived - The class that inherits from this one to use the expressions.
+ * \tparam Scalar - Associated scalar type, prevents implicit conversions between exprs.
+ * \note Derived classes must implement operator[], and at least operator= with
+ * expressions (that is when they are evaluated). They must also contain a constexpr
+ * boolean "StoreAsRef", indicating whether to store them by value (false) or by
+ * reference (true), when composing expressions.
+ * Expression classes need to be stored by value to allow nested expressions to
+ * propagate correctly (i.e. if "Scalar" has its own expression templates).
+ * Vector classes should be stored by reference to avoid copies, especially if they
+ * allocate memory dynamically.
+ */
+template<class Derived, class Scalar>
+class CVecExpr {
+public:
+  /*!
+   * \brief Cast the expression to Derived, usually to allow evaluation via operator[].
+   */
+  FORCEINLINE const Derived& derived() const { return static_cast<const Derived&>(*this); }
+
+  // Allowed from C++14, allows nested expression propagation without
+  // manually calling derived() on the expression being evaluated.
+  //FORCEINLINE auto operator[] (size_t i) const { return derived()[i]; }
+};
+
+/*!
+ * \brief Expression class to broadcast a scalar value. Allows implementing
+ * "vector-scalar" operations re-using "vector-vector" expressions.
+ */
+template<class Scalar>
+class Bcast : public CVecExpr<Bcast<Scalar>, Scalar> {
+  Scalar x;
+public:
+  static constexpr bool StoreAsRef = false;
+  FORCEINLINE Bcast(const Scalar& x_) : x(x_) {}
+  FORCEINLINE const Scalar& operator[] (size_t) const { return x; }
+};
+
+/*!
+ * \brief std::decay_t from C++14, used to allow implicit conversions
+ * between scalar types, e.g. "CVecExpr<U,double>" + "int/double/etc.".
+ */
+template<class T> using decay_t = typename std::decay<T>::type;
+
+/*! \brief std::remove_reference_t from C++14, removes references from some type. */
+template<class T> using remove_reference_t = typename std::remove_reference<T>::type;
+
+/*! \brief Mechanism to conditionally (based on "StoreAsRef") add lvalue reference to a type. */
+template<class T, bool> struct add_lref_if { using type = remove_reference_t<T>; };
+template<class T> struct add_lref_if<T,true> { using type = remove_reference_t<T> &; };
+template<class T> using store_t = typename add_lref_if<T,T::StoreAsRef>::type;
+
+/*--- Namespace from which the math function implementations come. ---*/
+
+#if defined(CODI_REVERSE_TYPE) || defined(CODI_FORWARD_TYPE)
+namespace math = ::codi;
+#else
+namespace math = ::std;
+#endif
+
+/*--- Macro to simplify auto return type deduction in C++11, operator[] needs
+ * it to allow inner expressions to propagate as the outer is evaluated.  ---*/
+
+#define RETURNS(...) ->decltype(__VA_ARGS__) { return __VA_ARGS__; }
+
+/*--- Macro to create expression classes (EXPR) and overloads (FUN) for unary
+ * functions, based on their coefficient-wise implementation (IMPL). ---*/
+
+#define MAKE_UNARY_FUN(FUN, EXPR, IMPL)                                       \
+/*!--- Expression class. ---*/                                                \
+template<class U, class Scalar>                                               \
+class EXPR : public CVecExpr<EXPR<U,Scalar>, Scalar> {                        \
+  store_t<const U> u;                                                         \
+public:                                                                       \
+  static constexpr bool StoreAsRef = false;                                   \
+  FORCEINLINE EXPR(const U& u_) : u(u_) {}                                    \
+  FORCEINLINE auto operator[] (size_t i) const RETURNS( IMPL(u[i]) )          \
+};                                                                            \
+/*!--- Function overload, returns an expression object. ---*/                 \
+template<class U, class S>                                                    \
+FORCEINLINE auto FUN(const CVecExpr<U,S>& u) RETURNS( EXPR<U,S>(u.derived()) )
+
+#define sign_impl(x) Scalar(1-2*(x<0))
+MAKE_UNARY_FUN(operator-, minus_, -)
+MAKE_UNARY_FUN(abs, abs_, math::abs)
+MAKE_UNARY_FUN(sqrt, sqrt_, math::sqrt)
+MAKE_UNARY_FUN(sign, sign_, sign_impl)
+#undef sign_impl
+
+#undef MAKE_UNARY_FUN
+
+/*--- Macro to create expressions and overloads for binary functions. ---*/
+
+#define MAKE_BINARY_FUN(FUN, EXPR, IMPL)                                      \
+/*!--- Expression class. ---*/                                                \
+template<class U, class V, class Scalar>                                      \
+class EXPR : public CVecExpr<EXPR<U,V,Scalar>, Scalar> {                      \
+  store_t<const U> u;                                                         \
+  store_t<const V> v;                                                         \
+public:                                                                       \
+  static constexpr bool StoreAsRef = false;                                   \
+  FORCEINLINE EXPR(const U& u_, const V& v_) : u(u_), v(v_) {}                \
+  FORCEINLINE auto operator[] (size_t i) const RETURNS( IMPL(u[i], v[i]) )    \
+};                                                                            \
+/*!--- Vector with vector function overload. ---*/                            \
+template<class U, class V, class S>                                           \
+FORCEINLINE auto FUN(const CVecExpr<U,S>& u, const CVecExpr<V,S>& v)          \
+  RETURNS( EXPR<U,V,S>(u.derived(), v.derived())                              \
+)                                                                             \
+/*!--- Vector with scalar function overload. ---*/                            \
+template<class U, class S>                                                    \
+FORCEINLINE auto FUN(const CVecExpr<U,S>& u, decay_t<S> v)                    \
+  RETURNS( EXPR<U,Bcast<S>,S>(u.derived(), Bcast<S>(v))                       \
+)                                                                             \
+/*!--- Scalar with vector function overload. ---*/                            \
+template<class S, class V>                                                    \
+FORCEINLINE auto FUN(decay_t<S> u, const CVecExpr<V,S>& v)                    \
+  RETURNS( EXPR<Bcast<S>,V,S>(Bcast<S>(u), v.derived())                       \
+)                                                                             \
+
+/*--- std::max/min have issues (maybe because they return by reference). ---*/
+
+#define max_impl(a,b) a<b? Scalar(b) : Scalar(a)
+#define min_impl(a,b) b<a? Scalar(b) : Scalar(a)
+MAKE_BINARY_FUN(max, max_, max_impl)
+MAKE_BINARY_FUN(min, min_, min_impl)
+MAKE_BINARY_FUN(pow, pow_, math::pow)
+#undef max_impl
+#undef min_impl
+
+/*--- std::plus and co. were tried, the code was horrendous (due to the forced
+ * conversion between different types) and creating functions for these ops
+ * requires a lot of boilerplate (template args, auto return, etc.). ---*/
+
+#define add_impl(a,b) a+b
+#define sub_impl(a,b) a-b
+#define mul_impl(a,b) a*b
+#define div_impl(a,b) a/b
+MAKE_BINARY_FUN(operator+, add_, add_impl)
+MAKE_BINARY_FUN(operator-, sub_, sub_impl)
+MAKE_BINARY_FUN(operator*, mul_, mul_impl)
+MAKE_BINARY_FUN(operator/, div_, div_impl)
+#undef add_impl
+#undef sub_impl
+#undef mul_impl
+#undef div_impl
+
+/*--- Relational operators need to be cast to the scalar type to allow vectorization. ---*/
+
+#define le_impl(a,b) Scalar(a<=b)
+#define ge_impl(a,b) Scalar(a>=b)
+#define eq_impl(a,b) Scalar(a==b)
+#define ne_impl(a,b) Scalar(a!=b)
+#define lt_impl(a,b) Scalar(a<b)
+#define gt_impl(a,b) Scalar(a>b)
+MAKE_BINARY_FUN(operator<=, le_, le_impl)
+MAKE_BINARY_FUN(operator>=, ge_, ge_impl)
+MAKE_BINARY_FUN(operator==, eq_, eq_impl)
+MAKE_BINARY_FUN(operator!=, ne_, ne_impl)
+MAKE_BINARY_FUN(operator<, lt_, lt_impl)
+MAKE_BINARY_FUN(operator>, gt_, gt_impl)
+#undef le_impl
+#undef ge_impl
+#undef eq_impl
+#undef ne_impl
+#undef lt_impl
+#undef gt_impl
+
+#undef MAKE_BINARY_FUN
+
+} // end namespace
diff --git a/Common/include/mpi_structure.inl b/Common/include/mpi_structure.inl
index 6ecc766e0bd..5673d73148e 100644
--- a/Common/include/mpi_structure.inl
+++ b/Common/include/mpi_structure.inl
@@ -24,12 +24,15 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
  */
-#include "mpi_structure.hpp"
+
 #pragma once
 
+#include "mpi_structure.hpp"
+#include "omp_structure.hpp"
+
 #ifdef HAVE_MPI
 
-inline void CBaseMPIWrapper::Error(std::string ErrorMsg, std::string FunctionName){
+NEVERINLINE void CBaseMPIWrapper::Error(std::string ErrorMsg, std::string FunctionName){
 
   /* Set MinRankError to Rank, as the error message is called on this rank. */
   MinRankError = Rank;
@@ -514,13 +517,8 @@ inline void CMediMPIWrapper::Waitany(int nrequests, Request *request,
 }
 #endif
 #else // HAVE_MPI
-#ifdef _OPENMP
-#include <omp.h>
-#else
-#include <ctime>
-#endif
 
-inline void CBaseMPIWrapper::Error(std::string ErrorMsg, std::string FunctionName){
+NEVERINLINE void CBaseMPIWrapper::Error(std::string ErrorMsg, std::string FunctionName){
   if (Rank == 0){
     std::cout << std::endl << std::endl;
     std::cout << "Error in \"" << FunctionName << "\": " << std::endl;
@@ -693,11 +691,6 @@ inline void CBaseMPIWrapper::CopyData(void *sendbuf, void *recvbuf, int size, Da
   }
 }
 
-inline passivedouble CBaseMPIWrapper::Wtime(void) {
-#ifdef _OPENMP
-  return omp_get_wtime();
-#else
-  return passivedouble(clock()) / CLOCKS_PER_SEC;
-#endif
-}
+inline passivedouble CBaseMPIWrapper::Wtime(void) { return omp_get_wtime(); }
+
 #endif
diff --git a/Common/include/omp_structure.hpp b/Common/include/omp_structure.hpp
index 5ed0fbbceb2..fb5343c0d41 100644
--- a/Common/include/omp_structure.hpp
+++ b/Common/include/omp_structure.hpp
@@ -38,8 +38,7 @@
 
 #pragma once
 
-#include <type_traits>
-#include <cstdlib>
+#include "basic_types/datatype_structure.hpp"
 
 #if defined(_MSC_VER)
 #define PRAGMIZE(X) __pragma(X)
@@ -57,6 +56,7 @@
 #define SU2_OMP(ARGS) PRAGMIZE(omp ARGS)
 
 #else // Compile without OpenMP
+#include <ctime>
 
 /*--- Disable pragmas to quiet compilation warnings. ---*/
 #define SU2_OMP(ARGS)
@@ -64,12 +64,12 @@
 /*!
  * \brief Maximum number of threads available.
  */
-inline constexpr int omp_get_max_threads(void) {return 1;}
+inline constexpr int omp_get_max_threads() {return 1;}
 
 /*!
  * \brief Number of threads in current team.
  */
-inline constexpr int omp_get_num_threads(void) {return 1;}
+inline constexpr int omp_get_num_threads() {return 1;}
 
 /*!
  * \brief Set the maximum number of threads.
@@ -79,7 +79,17 @@ inline void omp_set_num_threads(int) { }
 /*!
  * \brief Index of current thread, akin to MPI rank.
  */
-inline constexpr int omp_get_thread_num(void) {return 0;}
+inline constexpr int omp_get_thread_num() {return 0;}
+
+/*!
+ * \brief Returns true if inside a parallel section (never the case in this build without OpenMP).
+ */
+inline constexpr bool omp_in_parallel() {return false;}
+
+/*!
+ * \brief Stand-in for the OpenMP wall time; note that clock() measures processor time, not elapsed time.
+ */
+inline passivedouble omp_get_wtime() {return passivedouble(clock()) / CLOCKS_PER_SEC;}
 
 /*!
  * \brief Dummy lock type and associated functions.
@@ -189,15 +199,13 @@ void parallelSet(size_t size, T val, U* dst)
  * \param[in] rhs - Local variable being added to the shared one.
  * \param[in,out] lhs - Shared variable being updated.
  */
-template<class T,
-         typename std::enable_if<!std::is_arithmetic<T>::value,bool>::type = 0>
+template<class T, su2enable_if<!std::is_arithmetic<T>::value> = 0>
 inline void atomicAdd(T rhs, T& lhs)
 {
   SU2_OMP_CRITICAL
   lhs += rhs;
 }
-template<class T,
-         typename std::enable_if<std::is_arithmetic<T>::value,bool>::type = 0>
+template<class T, su2enable_if<std::is_arithmetic<T>::value> = 0>
 inline void atomicAdd(T rhs, T& lhs)
 {
   SU2_OMP_ATOMIC
diff --git a/Common/include/option_structure.hpp b/Common/include/option_structure.hpp
index 4d988c21d96..1c9bb1d0e62 100644
--- a/Common/include/option_structure.hpp
+++ b/Common/include/option_structure.hpp
@@ -359,7 +359,7 @@ enum ENUM_INLET_SPANWISEINTERPOLATION {
   LINEAR_1D = 1,
   AKIMA_1D = 2,
 };
-static const map<string, ENUM_INLET_SPANWISEINTERPOLATION> Inlet_SpanwiseInterpolation_Map = {
+static const MapType<string, ENUM_INLET_SPANWISEINTERPOLATION> Inlet_SpanwiseInterpolation_Map = {
   MakePair("NONE", NO_INTERPOLATION)
   MakePair("LINEAR_1D",LINEAR_1D)
   MakePair("AKIMA_1D",AKIMA_1D)
@@ -372,7 +372,7 @@ enum ENUM_INLET_INTERPOLATIONTYPE {
   VR_VTHETA = 0,
   ALPHA_PHI = 1,
 };
-static const map<string, ENUM_INLET_INTERPOLATIONTYPE> Inlet_SpanwiseInterpolationType_Map = {
+static const MapType<string, ENUM_INLET_INTERPOLATIONTYPE> Inlet_SpanwiseInterpolationType_Map = {
   MakePair("VR_VTHETA",VR_VTHETA)
   MakePair("ALPHA_PHI",ALPHA_PHI)
 };
@@ -754,12 +754,14 @@ enum ENUM_CENTERED {
   NO_CENTERED = 0,    /*!< \brief No centered scheme is used. */
   JST = 1,            /*!< \brief Jameson-Smith-Turkel centered numerical method. */
   LAX = 2,            /*!< \brief Lax-Friedrich centered numerical method. */
+  JST_MAT = 3,        /*!< \brief JST with matrix dissipation. */
   JST_KE = 4          /*!< \brief Kinetic Energy preserving Jameson-Smith-Turkel centered numerical method. */
 };
 static const MapType<string, ENUM_CENTERED> Centered_Map = {
   MakePair("NONE", NO_CENTERED)
   MakePair("JST", JST)
   MakePair("JST_KE", JST_KE)
+  MakePair("JST_MAT", JST_MAT)
   MakePair("LAX-FRIEDRICH", LAX)
 };
 
@@ -964,11 +966,11 @@ static const MapType<string, ENUM_HYBRIDRANSLES> HybridRANSLES_Map = {
  * \brief Types of Roe Low Dissipation Schemes
  */
 enum ENUM_ROELOWDISS {
-    NO_ROELOWDISS = 0, /*!< \brief No Roe Low Dissipation model. */
-    FD            = 1, /*!< \brief Numerical Blending based on DDES's F_d function */
-    NTS           = 2, /*!< \brief Numerical Blending of Travin and Shur. */
-    NTS_DUCROS    = 3, /*!< \brief Numerical Blending of Travin and Shur + Ducros' Shock Sensor. */
-    FD_DUCROS     = 4  /*!< \brief Numerical Blending based on DDES's F_d function + Ducros' Shock Sensor */
+  NO_ROELOWDISS = 0, /*!< \brief No Roe Low Dissipation model. */
+  FD            = 1, /*!< \brief Numerical Blending based on DDES's F_d function */
+  NTS           = 2, /*!< \brief Numerical Blending of Travin and Shur. */
+  NTS_DUCROS    = 3, /*!< \brief Numerical Blending of Travin and Shur + Ducros' Shock Sensor. */
+  FD_DUCROS     = 4  /*!< \brief Numerical Blending based on DDES's F_d function + Ducros' Shock Sensor */
 };
 static const MapType<string, ENUM_ROELOWDISS> RoeLowDiss_Map = {
   MakePair("NONE", NO_ROELOWDISS)
diff --git a/Common/include/parallelization/special_vectorization.hpp b/Common/include/parallelization/special_vectorization.hpp
new file mode 100644
index 00000000000..8b6f7f50872
--- /dev/null
+++ b/Common/include/parallelization/special_vectorization.hpp
@@ -0,0 +1,170 @@
+/*!
+ * \file special_vectorization.hpp
+ * \brief Code generator header to create specializations of simd::Array.
+ * \author P. Gomes
+ * \version 7.0.6 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+// no #pragma once, header needs to be included once per specialization.
+
+/*!
+ * \brief Symbols that need to be defined before including this header:
+ * \param[in] ARRAY_T - The desired specialization of simd::Array.
+ * \param[in] SCALAR_T - Scalar type.
+ * \param[in] REGISTER_T - Intrinsic type.
+ * \param[in] SIZE_TAG - Dummy object associated with the simd size.
+ * \param[in] ARRAY_BOILERPLATE - Generates the general ctors, access, etc.
+ * \note On top of that, the intrinsic functions must be wrapped to
+ * strip their type / size characteristics, e.g. _mm256_add_pd -> add_p,
+ * overload resolution will do the rest. The first four symbols are
+ * undefined once we are done using them.
+ */
+template<>
+class ARRAY_T {
+#define FOREACH SU2_OMP_SIMD for(size_t k=0; k<Size; ++k)
+  template<class F, class S>
+  FORCEINLINE static S second(F, S s) { return s; } /*!< \brief Returns its second argument. */
+public:
+  using Scalar = SCALAR_T;
+  using Register = REGISTER_T;
+  enum : size_t {Align = alignof(Register)};
+  enum : size_t {Size = sizeof(Register) / sizeof(Scalar)};
+
+  /*--- The infamous union "hack": x_ gives element access to reg (not strictly ISO C++, relies on compiler support). ---*/
+  union {
+    Register reg;
+    Scalar x_[Size];
+  };
+
+  /*--- Same basic construction operations as the general case (generated by the macro). ---*/
+
+  ARRAY_BOILERPLATE
+
+  /*--- Special construction using the "register type" directly, copies also go through the register. ---*/
+
+  FORCEINLINE Array(Register y) { reg = y; }
+  FORCEINLINE Array(const Array& other) { reg = other.reg; }
+
+  /*--- Specialized construction primitives: load/store use the unaligned wrappers, loada/storea the aligned ones. ---*/
+
+  FORCEINLINE void bcast(Scalar x) { reg = set1_p(SIZE_TAG, x); }
+  FORCEINLINE void load(const Scalar* ptr) { reg = loadu_p(SIZE_TAG, ptr); }
+  FORCEINLINE void loada(const Scalar* ptr) { reg = load_p(SIZE_TAG, ptr); }
+  FORCEINLINE void store(Scalar* ptr) const { storeu_p(ptr, reg); }
+  FORCEINLINE void storea(Scalar* ptr) const { store_p(ptr, reg); }
+  FORCEINLINE void stream(Scalar* ptr) const { stream_p(ptr, reg); }
+  template<class T>
+  FORCEINLINE void gather(const Scalar* begin, const T& offsets) { FOREACH x_[k] = begin[offsets[k]]; }
+
+  /*--- Compound assignment operators, plain assignment uses "second" to fit the same macro. ---*/
+
+#define MAKE_COMPOUND(OP,IMPL)\
+  FORCEINLINE Array& operator OP (Scalar x) { reg = IMPL(reg, set1_p(SIZE_TAG, x)); return *this; }\
+  FORCEINLINE Array& operator OP (const Array& other) { reg = IMPL(reg, other.reg); return *this; }
+  MAKE_COMPOUND(=, second)
+  MAKE_COMPOUND(+=, add_p)
+  MAKE_COMPOUND(-=, sub_p)
+  MAKE_COMPOUND(*=, mul_p)
+  MAKE_COMPOUND(/=, div_p)
+#undef MAKE_COMPOUND
+
+#undef FOREACH
+};
+
+/*!
+ * Overloads implemented with intrinsics: NAME is the operator or function
+ * being defined, IMPL the wrapped intrinsic that is applied to the register(s).
+ */
+#define MAKE_UNARY_FUN(NAME,IMPL) \
+FORCEINLINE ARRAY_T NAME(const ARRAY_T& arg) { return ARRAY_T(IMPL(arg.reg)); }
+
+MAKE_UNARY_FUN(operator-, neg_p)
+MAKE_UNARY_FUN(sqrt, sqrt_p)
+MAKE_UNARY_FUN(abs, abs_p)
+MAKE_UNARY_FUN(sign, sign_p)
+
+#undef MAKE_UNARY_FUN
+
+#define MAKE_BINARY_FUN(NAME,IMPL) /* array-array, array-scalar and scalar-array overloads */ \
+FORCEINLINE ARRAY_T NAME (const ARRAY_T& lhs, const ARRAY_T& rhs) {                          \
+  return ARRAY_T(IMPL(lhs.reg, rhs.reg));                                                     \
+}                                                                                             \
+FORCEINLINE ARRAY_T NAME (const ARRAY_T& lhs, SCALAR_T s) {                                   \
+  return ARRAY_T(IMPL(lhs.reg, set1_p(SIZE_TAG, s)));                                         \
+}                                                                                             \
+FORCEINLINE ARRAY_T NAME (SCALAR_T s, const ARRAY_T& rhs) {                                   \
+  return ARRAY_T(IMPL(set1_p(SIZE_TAG, s), rhs.reg));                                         \
+}
+
+MAKE_BINARY_FUN(operator+, add_p)
+MAKE_BINARY_FUN(operator-, sub_p)
+MAKE_BINARY_FUN(operator*, mul_p)
+MAKE_BINARY_FUN(operator/, div_p)
+MAKE_BINARY_FUN(operator<, lt_p)
+MAKE_BINARY_FUN(operator>, gt_p)
+MAKE_BINARY_FUN(operator==, eq_p)
+MAKE_BINARY_FUN(operator!=, ne_p)
+MAKE_BINARY_FUN(operator<=, le_p)
+MAKE_BINARY_FUN(operator>=, ge_p)
+MAKE_BINARY_FUN(max, max_p)
+MAKE_BINARY_FUN(min, min_p)
+
+#undef MAKE_BINARY_FUN
+
+/*!
+ * Compatibility mode overloads, element-wise implementation (no intrinsic is wrapped for these).
+ */
+#define FOREACH SU2_OMP_SIMD for(size_t k=0; k<ARRAY_T::Size; ++k)
+
+/*--- Functions of one (array) argument, none is generated at the moment. ---*/
+
+#define MAKE_UNARY_FUN(NAME,IMPL)                         \
+FORCEINLINE ARRAY_T NAME(const ARRAY_T& x) {              \
+  ARRAY_T res; FOREACH res[k] = IMPL(x[k]); return res;   \
+}
+
+#undef MAKE_UNARY_FUN
+
+/*--- Functions of two arguments, with arrays and scalars, IMPL is qualified to avoid calling the overload itself. ---*/
+
+#define MAKE_BINARY_FUN(NAME,IMPL)                                \
+FORCEINLINE ARRAY_T NAME(const ARRAY_T& a, const ARRAY_T& b) {    \
+  ARRAY_T res; FOREACH res[k] = IMPL(a[k], b[k]); return res;     \
+}                                                                 \
+FORCEINLINE ARRAY_T NAME(const ARRAY_T& a, SCALAR_T b) {          \
+  ARRAY_T res; FOREACH res[k] = IMPL(a[k], b); return res;        \
+}                                                                 \
+FORCEINLINE ARRAY_T NAME(SCALAR_T b, const ARRAY_T& a) {          \
+  ARRAY_T res; FOREACH res[k] = IMPL(b, a[k]); return res;        \
+}
+
+MAKE_BINARY_FUN(pow, std::pow)
+
+#undef MAKE_BINARY_FUN
+
+#undef FOREACH
+
+#undef ARRAY_T
+#undef SCALAR_T
+#undef REGISTER_T
+#undef SIZE_TAG
diff --git a/Common/include/parallelization/vectorization.hpp b/Common/include/parallelization/vectorization.hpp
new file mode 100644
index 00000000000..bf7a3eaeb17
--- /dev/null
+++ b/Common/include/parallelization/vectorization.hpp
@@ -0,0 +1,304 @@
+/*!
+ * \file vectorization.hpp
+ * \brief Implementation of a portable SIMD type.
+ * \author P. Gomes
+ * \version 7.0.6 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../linear_algebra/vector_expressions.hpp"
+#include "../omp_structure.hpp"
+#include <initializer_list>
+#include <algorithm>
+#include <cmath>
+
+namespace simd {
+
+using namespace VecExpr;
+
+/*--- Detect preferred SIMD size (bytes). This only covers x86 architectures. ---*/
+#if defined(__AVX512F__)
+constexpr size_t PREFERRED_SIZE = 64;
+#elif defined(__AVX__)
+constexpr size_t PREFERRED_SIZE = 32;
+#elif defined(__SSE2__)
+constexpr size_t PREFERRED_SIZE = 16;
+#else
+constexpr size_t PREFERRED_SIZE = 8;
+#endif
+
+/*!
+ * \brief Convert the SIMD size (bytes) to a length (num elems), su2double uses the size of passivedouble.
+ */
+template<class T>
+constexpr size_t preferredLen() { return PREFERRED_SIZE / sizeof(T); }
+template<>
+constexpr size_t preferredLen<su2double>() { return PREFERRED_SIZE / sizeof(passivedouble); }
+
+/*!
+ * \class Array
+ * \brief A simple SIMD type relying on implicit vectorization, i.e. done by
+ * the compiler, explicitly vectorized specializations are defined after.
+ * \note This class gets its math operator overloads from CVecExpr, the
+ * specializations do not use expression templates, IF YOU NEED A NEW FUNCTION,
+ * define it both in vector_expressions.hpp and in special_vectorization.hpp.
+ */
+template<class Scalar_t, size_t N = preferredLen<Scalar_t>()>
+class Array : public CVecExpr<Array<Scalar_t,N>, Scalar_t> {
+#define FOREACH for(size_t k=0; k<N; ++k)
+  static_assert(N > 0, "Invalid SIMD size");
+public:
+  using Scalar = Scalar_t;
+  enum : size_t {Size = N}; /*!< \brief Number of elements. */
+  enum : size_t {Align = Size*sizeof(Scalar)}; /*!< \brief Alignment of the storage (bytes), the size of the whole array. */
+  static constexpr bool StoreAsRef = true;
+
+private:
+  alignas(Align) Scalar x_[N];
+
+public:
+#define ARRAY_BOILERPLATE                                                     \
+  /*!--- Access elements (no bounds checking) ---*/                           \
+  FORCEINLINE Scalar& operator[] (size_t k) { return x_[k]; }                 \
+  FORCEINLINE const Scalar& operator[] (size_t k) const { return x_[k]; }     \
+  /*!--- Constructors (initializer lists must hold at least Size values) ---*/ \
+  FORCEINLINE Array() = default;                                              \
+  FORCEINLINE Array(Scalar x) { bcast(x); }                                   \
+  FORCEINLINE Array(std::initializer_list<Scalar> vals) {                     \
+    auto it = vals.begin(); FOREACH { x_[k] = *it; ++it; }                    \
+  }                                                                           \
+  FORCEINLINE Array(Scalar x0, Scalar dx) { FOREACH x_[k] = x0 + k*dx; }      \
+  FORCEINLINE Array(const Scalar* ptr) { load(ptr); }                         \
+  template<class T>                                                           \
+  FORCEINLINE Array(const Scalar* beg, const T& off) { gather(beg,off); }     \
+  /*!--- Reduction operations (over the elements of the array) ---*/          \
+  FORCEINLINE Scalar sum() const { Scalar s(0); FOREACH s+=x_[k]; return s; } \
+  FORCEINLINE Scalar dot(const Array& other) const {                          \
+    Scalar s(0); FOREACH s += x_[k] * other[k]; return s;                     \
+  }
+
+#if defined(CODI_REVERSE_TYPE) || defined(CODI_FORWARD_TYPE)
+  /*--- These are not very nice but without them it would not be
+   * possible to assign literals to Arrays of active types. ---*/
+  template<class U = Scalar, su2enable_if<std::is_same<U,su2double>::value> = 0>
+  FORCEINLINE Array(passivedouble x) { bcast(x); }
+  template<class U = Scalar, su2enable_if<std::is_same<U,su2double>::value> = 0>
+  FORCEINLINE Array& operator= (passivedouble x) { bcast(x); return *this; }
+#endif
+
+  ARRAY_BOILERPLATE
+
+  /*! \brief Construct from an expression, which is evaluated element by element. */
+  template<class U>
+  FORCEINLINE Array(const CVecExpr<U,Scalar>& expr) {
+    FOREACH x_[k] = expr.derived()[k];
+  }
+
+  /*--- Implementation of the construction primitives, the aligned and stream variants forward to load/store. ---*/
+
+  FORCEINLINE void bcast(Scalar x) { FOREACH x_[k] = x; }
+  FORCEINLINE void load(const Scalar* ptr) { FOREACH x_[k] = ptr[k]; }
+  FORCEINLINE void loada(const Scalar* ptr) { load(ptr); }
+  FORCEINLINE void store(Scalar* ptr) const { FOREACH ptr[k] = x_[k]; }
+  FORCEINLINE void storea(Scalar* ptr) const { store(ptr); }
+  FORCEINLINE void stream(Scalar* ptr) const { store(ptr); }
+  template<class T>
+  FORCEINLINE void gather(const Scalar* begin, const T& offsets) { FOREACH x_[k] = begin[offsets[k]]; }
+
+  /*--- Compound assignment operators, with a scalar (applied to every element) or with an expression. ---*/
+
+#define MAKE_COMPOUND(OP)                                                         \
+  FORCEINLINE Array& operator OP (Scalar x) { FOREACH x_[k] OP x; return *this; } \
+  template<class U>                                                               \
+  FORCEINLINE Array& operator OP (const CVecExpr<U,Scalar>& expr) {               \
+    FOREACH x_[k] OP expr.derived()[k]; return *this;                             \
+  }
+  MAKE_COMPOUND(=)
+  MAKE_COMPOUND(+=)
+  MAKE_COMPOUND(-=)
+  MAKE_COMPOUND(*=)
+  MAKE_COMPOUND(/=)
+#undef MAKE_COMPOUND
+
+#undef FOREACH
+};
+
+/*--- Explicit vectorization specializations, see e.g.
+ * https://software.intel.com/sites/landingpage/IntrinsicsGuide/
+ * for documentation on the "_mm*" functions. ---*/
+
+/*--- Size tags for overload resolution of some wrapper functions. ---*/
+namespace SizeTag {
+  struct TWO {};
+  struct FOUR {};
+  struct EIGHT {};
+  struct SIXTEEN {};
+}
+
+/*--- Constants for bitwise implementations. ---*/
+/*--- abs forces the sign bit to 0 ("x" & 0b0111...). ---*/
+constexpr auto abs_mask_d = 0x7FFFFFFFFFFFFFFFL;
+/*--- negation flips the sign bit ("x" ^ 0b1000...). ---*/
+constexpr auto sign_mask_d = 0x8000000000000000L;
+} // namespace simd (closed so that the intrinsics header is not included inside of it)
+#ifdef __SSE2__
+#include "x86intrin.h"
+#endif
+namespace simd {
+#ifdef __SSE2__ // Create specialization for array of 2 doubles (this should be always available).
+#define ARRAY_T Array<double,2>
+#define SCALAR_T double
+#define REGISTER_T __m128d
+#define SIZE_TAG SizeTag::TWO()
+
+static const __m128d abs_mask_2d = _mm_castsi128_pd(_mm_set1_epi64x(abs_mask_d));
+static const __m128d sign_mask_2d = _mm_castsi128_pd(_mm_set1_epi64x(sign_mask_d));
+static const __m128d ones_2d = _mm_set1_pd(1);
+
+FORCEINLINE __m128d set1_p(SizeTag::TWO, double p) { return _mm_set1_pd(p); }
+FORCEINLINE __m128d load_p(SizeTag::TWO, const double* p) { return _mm_load_pd(p); }
+FORCEINLINE __m128d loadu_p(SizeTag::TWO, const double* p) { return _mm_loadu_pd(p); }
+FORCEINLINE void store_p(double* p, __m128d x) { _mm_store_pd(p,x); }
+FORCEINLINE void storeu_p(double* p, __m128d x) { _mm_storeu_pd(p,x); }
+FORCEINLINE void stream_p(double* p, __m128d x) { _mm_stream_pd(p,x); }
+
+FORCEINLINE __m128d add_p(__m128d a, __m128d b) { return _mm_add_pd(a,b); }
+FORCEINLINE __m128d sub_p(__m128d a, __m128d b) { return _mm_sub_pd(a,b); }
+FORCEINLINE __m128d mul_p(__m128d a, __m128d b) { return _mm_mul_pd(a,b); }
+FORCEINLINE __m128d div_p(__m128d a, __m128d b) { return _mm_div_pd(a,b); }
+FORCEINLINE __m128d max_p(__m128d a, __m128d b) { return _mm_max_pd(a,b); }
+FORCEINLINE __m128d min_p(__m128d a, __m128d b) { return _mm_min_pd(a,b); }
+
+FORCEINLINE __m128d eq_p(__m128d a, __m128d b) { return _mm_and_pd(ones_2d, _mm_cmpeq_pd(a,b)); }
+FORCEINLINE __m128d lt_p(__m128d a, __m128d b) { return _mm_and_pd(ones_2d, _mm_cmplt_pd(a,b)); }
+FORCEINLINE __m128d le_p(__m128d a, __m128d b) { return _mm_and_pd(ones_2d, _mm_cmple_pd(a,b)); }
+FORCEINLINE __m128d ne_p(__m128d a, __m128d b) { return _mm_and_pd(ones_2d, _mm_cmpneq_pd(a,b)); }
+FORCEINLINE __m128d ge_p(__m128d a, __m128d b) { return _mm_and_pd(ones_2d, _mm_cmpge_pd(a,b)); }
+FORCEINLINE __m128d gt_p(__m128d a, __m128d b) { return _mm_and_pd(ones_2d, _mm_cmpgt_pd(a,b)); }
+
+FORCEINLINE __m128d sqrt_p(__m128d x) { return _mm_sqrt_pd(x); }
+FORCEINLINE __m128d abs_p(__m128d x) { return _mm_and_pd(x, abs_mask_2d); }
+FORCEINLINE __m128d neg_p(__m128d x) { return _mm_xor_pd(x, sign_mask_2d); }
+FORCEINLINE __m128d sign_p(__m128d x) { return _mm_or_pd(ones_2d, _mm_and_pd(x, sign_mask_2d)); }
+
+/*--- Generate specialization based on the defines
+ * and functions above by including the header. ---*/
+
+#include "special_vectorization.hpp"
+
+#endif // __SSE2__
+
+#ifdef __AVX__
+/*!
+ * Create specialization for array of 4 doubles.
+ */
+#define ARRAY_T Array<double,4>
+#define SCALAR_T double
+#define REGISTER_T __m256d
+#define SIZE_TAG SizeTag::FOUR()
+
+static const __m256d abs_mask_4d = _mm256_castsi256_pd(_mm256_set1_epi64x(abs_mask_d));
+static const __m256d sign_mask_4d = _mm256_castsi256_pd(_mm256_set1_epi64x(sign_mask_d));
+static const __m256d ones_4d = _mm256_set1_pd(1);
+
+FORCEINLINE __m256d set1_p(SizeTag::FOUR, double p) { return _mm256_set1_pd(p); }
+FORCEINLINE __m256d load_p(SizeTag::FOUR, const double* p) { return _mm256_load_pd(p); }
+FORCEINLINE __m256d loadu_p(SizeTag::FOUR, const double* p) { return _mm256_loadu_pd(p); }
+FORCEINLINE void store_p(double* p, __m256d x) { _mm256_store_pd(p,x); }
+FORCEINLINE void storeu_p(double* p, __m256d x) { _mm256_storeu_pd(p,x); }
+FORCEINLINE void stream_p(double* p, __m256d x) { _mm256_stream_pd(p,x); }
+
+FORCEINLINE __m256d add_p(__m256d a, __m256d b) { return _mm256_add_pd(a,b); }
+FORCEINLINE __m256d sub_p(__m256d a, __m256d b) { return _mm256_sub_pd(a,b); }
+FORCEINLINE __m256d mul_p(__m256d a, __m256d b) { return _mm256_mul_pd(a,b); }
+FORCEINLINE __m256d div_p(__m256d a, __m256d b) { return _mm256_div_pd(a,b); }
+FORCEINLINE __m256d max_p(__m256d a, __m256d b) { return _mm256_max_pd(a,b); }
+FORCEINLINE __m256d min_p(__m256d a, __m256d b) { return _mm256_min_pd(a,b); }
+
+FORCEINLINE __m256d eq_p(__m256d a, __m256d b) { return _mm256_and_pd(ones_4d, _mm256_cmp_pd(a,b,_CMP_EQ_OQ)); } // mask & 1.0 -> 1.0 or 0.0
+FORCEINLINE __m256d lt_p(__m256d a, __m256d b) { return _mm256_and_pd(ones_4d, _mm256_cmp_pd(a,b,_CMP_LT_OS)); }
+FORCEINLINE __m256d le_p(__m256d a, __m256d b) { return _mm256_and_pd(ones_4d, _mm256_cmp_pd(a,b,_CMP_LE_OS)); }
+FORCEINLINE __m256d ne_p(__m256d a, __m256d b) { return _mm256_and_pd(ones_4d, _mm256_cmp_pd(a,b,_CMP_NEQ_UQ)); }
+FORCEINLINE __m256d ge_p(__m256d a, __m256d b) { return _mm256_and_pd(ones_4d, _mm256_cmp_pd(a,b,_CMP_GE_OS)); }
+FORCEINLINE __m256d gt_p(__m256d a, __m256d b) { return _mm256_and_pd(ones_4d, _mm256_cmp_pd(a,b,_CMP_GT_OS)); }
+
+FORCEINLINE __m256d sqrt_p(__m256d x) { return _mm256_sqrt_pd(x); }
+FORCEINLINE __m256d abs_p(__m256d x) { return _mm256_and_pd(x, abs_mask_4d); }
+FORCEINLINE __m256d neg_p(__m256d x) { return _mm256_xor_pd(x, sign_mask_4d); }
+FORCEINLINE __m256d sign_p(__m256d x) { return _mm256_or_pd(ones_4d, _mm256_and_pd(x, sign_mask_4d)); }
+
+#include "special_vectorization.hpp"
+
+#endif // __AVX__
+
+#ifdef __AVX512F__
+/*!
+ * Create specialization for array of 8 doubles.
+ */
+#define ARRAY_T Array<double,8>
+#define SCALAR_T double
+#define REGISTER_T __m512d
+#define SIZE_TAG SizeTag::EIGHT()
+
+static const __m512d abs_mask_8d = _mm512_castsi512_pd(_mm512_set1_epi64(abs_mask_d));
+static const __m512d sign_mask_8d = _mm512_castsi512_pd(_mm512_set1_epi64(sign_mask_d));
+static const __m512d ones_8d = _mm512_set1_pd(1);
+
+FORCEINLINE __m512d set1_p(SizeTag::EIGHT, double p) { return _mm512_set1_pd(p); }
+FORCEINLINE __m512d load_p(SizeTag::EIGHT, const double* p) { return _mm512_load_pd(p); }
+FORCEINLINE __m512d loadu_p(SizeTag::EIGHT, const double* p) { return _mm512_loadu_pd(p); }
+FORCEINLINE void store_p(double* p, __m512d x) { _mm512_store_pd(p,x); }
+FORCEINLINE void storeu_p(double* p, __m512d x) { _mm512_storeu_pd(p,x); }
+FORCEINLINE void stream_p(double* p, __m512d x) { _mm512_stream_pd(p,x); }
+
+FORCEINLINE __m512d add_p(__m512d a, __m512d b) { return _mm512_add_pd(a,b); }
+FORCEINLINE __m512d sub_p(__m512d a, __m512d b) { return _mm512_sub_pd(a,b); }
+FORCEINLINE __m512d mul_p(__m512d a, __m512d b) { return _mm512_mul_pd(a,b); }
+FORCEINLINE __m512d div_p(__m512d a, __m512d b) { return _mm512_div_pd(a,b); }
+FORCEINLINE __m512d max_p(__m512d a, __m512d b) { return _mm512_max_pd(a,b); }
+FORCEINLINE __m512d min_p(__m512d a, __m512d b) { return _mm512_min_pd(a,b); }
+
+template<int opCode> // opCode is a _CMP_* predicate, the result is 1.0 where it holds and 0.0 elsewhere.
+FORCEINLINE __m512d cmp_p(__m512d a, __m512d b) {
+  return _mm512_mask_blend_pd(_mm512_cmp_pd_mask(a,b,opCode), _mm512_setzero_pd(), ones_8d);
+}
+FORCEINLINE __m512d eq_p(__m512d a, __m512d b) { return cmp_p<_CMP_EQ_OQ>(a,b); }
+FORCEINLINE __m512d lt_p(__m512d a, __m512d b) { return cmp_p<_CMP_LT_OS>(a,b); }
+FORCEINLINE __m512d le_p(__m512d a, __m512d b) { return cmp_p<_CMP_LE_OS>(a,b); }
+FORCEINLINE __m512d ne_p(__m512d a, __m512d b) { return cmp_p<_CMP_NEQ_UQ>(a,b); }
+FORCEINLINE __m512d ge_p(__m512d a, __m512d b) { return cmp_p<_CMP_GE_OS>(a,b); }
+FORCEINLINE __m512d gt_p(__m512d a, __m512d b) { return cmp_p<_CMP_GT_OS>(a,b); }
+
+FORCEINLINE __m512d sqrt_p(__m512d x) { return _mm512_sqrt_pd(x); }
+FORCEINLINE __m512d abs_p(__m512d x) { return _mm512_and_pd(x, abs_mask_8d); }
+FORCEINLINE __m512d neg_p(__m512d x) { return _mm512_xor_pd(x, sign_mask_8d); }
+FORCEINLINE __m512d sign_p(__m512d x) { return _mm512_or_pd(ones_8d, _mm512_and_pd(x, sign_mask_8d)); }
+
+#include "special_vectorization.hpp"
+
+#endif // __AVX512F__
+
+#undef ARRAY_BOILERPLATE
+
+} // namespace
diff --git a/Common/include/toolboxes/CSymmetricMatrix.hpp b/Common/include/toolboxes/CSymmetricMatrix.hpp
index c9fe3842f2f..d599ae9cbb7 100644
--- a/Common/include/toolboxes/CSymmetricMatrix.hpp
+++ b/Common/include/toolboxes/CSymmetricMatrix.hpp
@@ -27,7 +27,7 @@
 #pragma once
 
 #include <vector>
-#include "C2DContainer.hpp"
+#include "../containers/C2DContainer.hpp"
 
 /*!
  * \brief The matrix is symmetric but full storage is used as that gives much better
@@ -35,8 +35,7 @@
  * with LAPACK to use optimized matrix inversion and multiplication routines.
  */
 class CSymmetricMatrix {
-  static_assert(su2passivematrix::Storage == StorageType::RowMajor,
-                "Row major storage is assumed for LAPACK.");
+  static_assert(su2passivematrix::IsRowMajor, "Row major storage is assumed for LAPACK.");
 private:
   su2passivematrix mat;
 
diff --git a/Common/include/toolboxes/graph_toolbox.hpp b/Common/include/toolboxes/graph_toolbox.hpp
index 5e7003b620b..95bd4291b72 100644
--- a/Common/include/toolboxes/graph_toolbox.hpp
+++ b/Common/include/toolboxes/graph_toolbox.hpp
@@ -27,7 +27,7 @@
 
 #pragma once
 
-#include "C2DContainer.hpp"
+#include "../containers/C2DContainer.hpp"
 #include "../omp_structure.hpp"
 
 #include <set>
@@ -65,6 +65,22 @@ class CCompressedSparsePattern {
 public:
   using IndexType = Index_t;
 
+  /*!
+   * \brief Read-only [begin, end) view of inner indices, it allows traversing them with range for loops.
+   */
+  struct CInnerIter {
+    const IndexType* const m_first = nullptr; /*!< \brief Pointer returned by begin(). */
+    const IndexType* const m_last = nullptr;  /*!< \brief Pointer returned by end(). */
+
+    CInnerIter(const IndexType* first, const IndexType* last) : m_first(first), m_last(last) {}
+
+    const IndexType* begin() const { return m_first; }
+    const IndexType* end() const { return m_last; }
+  };
+
+  /*!
+   * \brief Default construction.
+   */
   CCompressedSparsePattern() = default;
 
   /*!
@@ -219,6 +235,15 @@ class CCompressedSparsePattern {
     return m_innerIdx(m_outerPtr(iOuterIdx) + iNonZero);
   }
 
+  /*!
+   * \param[in] iOuterIdx - Outer index.
+   * \return Range of the inner indices of iOuterIdx, to use in range for loops.
+   */
+  inline CInnerIter getInnerIter(Index_t iOuterIdx) const {
+    const auto* inner = m_innerIdx.data();
+    return CInnerIter(inner+m_outerPtr(iOuterIdx), inner+m_outerPtr(iOuterIdx+1));
+  }
+
   /*!
    * \param[in] iOuterIdx - Outer index (row/col).
    * \param[in] iInnerIdx - Inner index (col/row).
@@ -386,20 +411,16 @@ CCompressedSparsePattern<Index_t> buildCSRPattern(Geometry_t& geometry,
         if(type == ConnectivityType::FiniteVolume)
         {
           /*--- For FVM we know the neighbors of point j directly. ---*/
-          for(unsigned short iNeigh = 0; iNeigh < geometry.nodes->GetnPoint(jPoint); ++iNeigh)
-          {
-            Index_t kPoint = geometry.nodes->GetPoint(jPoint, iNeigh);
-
+          for(Index_t kPoint : geometry.nodes->GetPoints(jPoint))
             if(neighbors.count(kPoint) == 0) // no duplication
               newNeighbors.insert(kPoint);
-          }
         }
         else // FiniteElement
         {
           /*--- For FEM we need the nodes of all elements that contain point j. ---*/
-          for(unsigned short iNeigh = 0; iNeigh < geometry.nodes->GetnElem(jPoint); ++iNeigh)
+          for(auto iElem : geometry.nodes->GetElems(jPoint))
           {
-            auto elem = geometry.elem[geometry.nodes->GetElem(jPoint, iNeigh)];
+            auto elem = geometry.elem[iElem];
 
             for(unsigned short iNode = 0; iNode < elem->GetnNodes(); ++iNode)
             {
diff --git a/Common/src/CConfig.cpp b/Common/src/CConfig.cpp
index be6cf1f2fd5..1c0aa7e6b55 100644
--- a/Common/src/CConfig.cpp
+++ b/Common/src/CConfig.cpp
@@ -1599,6 +1599,8 @@ void CConfig::SetConfig_Options() {
 
   /* DESCRIPTION: Number of samples for quasi-Newton methods. */
   addUnsignedShortOption("QUASI_NEWTON_NUM_SAMPLES", nQuasiNewtonSamples, 0);
+  /* DESCRIPTION: Whether to use vectorized numerical schemes, less robust against transients. */
+  addBoolOption("USE_VECTORIZATION", UseVectorization, false);
 
   /*!\par CONFIG_CATEGORY: Time-marching \ingroup Config*/
   /*--- Options related to time-marching ---*/
@@ -2832,14 +2834,14 @@ void CConfig::SetConfig_Parsing(char case_filename[MAX_STRING_SIZE]) {
 
 }
 
-  void CConfig::SetConfig_Parsing(istream& config_buffer){
+void CConfig::SetConfig_Parsing(istream& config_buffer){
 
   string text_line, option_name;
   vector<string> option_value;
 
   string errorString;
 
-  int  err_count = 0;  // How many errors have we found in the config file
+  int err_count = 0;  // How many errors have we found in the config file
   int max_err_count = 30; // Maximum number of errors to print before stopping
   int line_count = 1;
 
@@ -6080,21 +6082,16 @@ void CConfig::SetOutput(unsigned short val_software, unsigned short val_izone) {
         (Kind_Solver == DISC_ADJ_EULER) || (Kind_Solver == DISC_ADJ_NAVIER_STOKES) || (Kind_Solver == DISC_ADJ_RANS) ) {
 
       if (Kind_ConvNumScheme_Flow == SPACE_CENTERED) {
-        if (Kind_Centered_Flow == JST) {
-          cout << "Jameson-Schmidt-Turkel scheme (2nd order in space) for the flow inviscid terms."<< endl;
-          cout << "JST viscous coefficients (2nd & 4th): " << Kappa_2nd_Flow << ", " << Kappa_4th_Flow <<"." << endl;
-          cout << "The method includes a grid stretching correction (p = 0.3)."<< endl;
-        }
-        if (Kind_Centered_Flow == JST_KE) {
-          cout << "Jameson-Schmidt-Turkel scheme (2nd order in space) for the flow inviscid terms."<< endl;
-          cout << "JST viscous coefficients (2nd & 4th): " << Kappa_2nd_Flow << ", " << Kappa_4th_Flow << "." << endl;
-          cout << "The method includes a grid stretching correction (p = 0.3)."<< endl;
-        }
         if (Kind_Centered_Flow == LAX) {
           cout << "Lax-Friedrich scheme (1st order in space) for the flow inviscid terms."<< endl;
           cout << "Lax viscous coefficients (1st): " << Kappa_1st_Flow << "." << endl;
           cout << "First order integration." << endl;
         }
+        else {
+          cout << "Jameson-Schmidt-Turkel scheme (2nd order in space) for the flow inviscid terms."<< endl;
+          cout << "JST viscous coefficients (2nd & 4th): " << Kappa_2nd_Flow << ", " << Kappa_4th_Flow << "." << endl;
+          cout << "The method includes a grid stretching correction (p = 0.3)."<< endl;
+        }
       }
 
       if (Kind_ConvNumScheme_Flow == SPACE_UPWIND) {
diff --git a/Common/src/CMultiGridQueue.cpp b/Common/src/CMultiGridQueue.cpp
index 168478d11aa..0a51e5d26d9 100644
--- a/Common/src/CMultiGridQueue.cpp
+++ b/Common/src/CMultiGridQueue.cpp
@@ -186,9 +186,7 @@ void CMultiGridQueue::Update(unsigned long updatePoint, CGeometry *fineGrid) {
 
   RemoveCV(updatePoint);
 
-  for (auto iNode = 0u; iNode < fineGrid->nodes->GetnPoint(updatePoint); ++iNode) {
-    const auto jPoint = fineGrid->nodes->GetPoint(updatePoint,iNode);
+  for (auto jPoint : fineGrid->nodes->GetPoints(updatePoint))
     if (!fineGrid->nodes->GetAgglomerate(jPoint))
       IncrPriorityCV(jPoint);
-  }
 }
diff --git a/Common/src/geometry/CGeometry.cpp b/Common/src/geometry/CGeometry.cpp
index a6b3ae8c41e..42270aca716 100644
--- a/Common/src/geometry/CGeometry.cpp
+++ b/Common/src/geometry/CGeometry.cpp
@@ -1404,10 +1404,8 @@ long CGeometry::FindEdge(unsigned long first_point, unsigned long second_point)
 
 bool CGeometry::CheckEdge(unsigned long first_point, unsigned long second_point) const {
 
-  for (unsigned short iNode = 0; iNode < nodes->GetnPoint(first_point); iNode++) {
-    auto iPoint = nodes->GetPoint(first_point, iNode);
+  for (auto iPoint : nodes->GetPoints(first_point))
     if (iPoint == second_point) return true;
-  }
   return false;
 }
 
@@ -1434,8 +1432,7 @@ void CGeometry::SetEdges(void) {
   edges = new CEdge(nEdge,nDim);
 
   for (auto iPoint = 0ul; iPoint < nPoint; iPoint++) {
-    for (auto iNode = 0u; iNode < nodes->GetnPoint(iPoint); iNode++) {
-      auto jPoint = nodes->GetPoint(iPoint, iNode);
+    for (auto jPoint : nodes->GetPoints(iPoint)) {
       if (iPoint < jPoint) {
         auto iEdge = FindEdge(iPoint, jPoint);
         edges->SetNodes(iEdge, iPoint, jPoint);
diff --git a/Common/src/geometry/CMultiGridGeometry.cpp b/Common/src/geometry/CMultiGridGeometry.cpp
index a08dbe2dd16..f8d80a25968 100644
--- a/Common/src/geometry/CMultiGridGeometry.cpp
+++ b/Common/src/geometry/CMultiGridGeometry.cpp
@@ -40,7 +40,7 @@ CMultiGridGeometry::CMultiGridGeometry(CGeometry **geometry, CConfig *config_con
 
   /*--- Local variables ---*/
 
-  unsigned long iPoint, Index_CoarseCV, CVPoint, iElem, iVertex, jPoint, iteration, nVertexS, nVertexR,
+  unsigned long iPoint, Index_CoarseCV, iElem, iVertex, iteration, nVertexS, nVertexR,
                 nBufferS_Vector, nBufferR_Vector, iParent, jVertex,Local_nPointCoarse, Local_nPointFine, Global_nPointCoarse, Global_nPointFine,
                 *Buffer_Receive_Parent = nullptr, *Buffer_Send_Parent = nullptr, *Buffer_Receive_Children = nullptr, *Buffer_Send_Children = nullptr,
                 *Parent_Remote = nullptr,         *Children_Remote = nullptr,    *Parent_Local = nullptr,            *Children_Local = nullptr;
@@ -155,9 +155,7 @@ CMultiGridGeometry::CMultiGridGeometry(CGeometry **geometry, CConfig *config_con
 
           /*--- Now we do a sweep over all the nodes that surround the seed point ---*/
 
-          for (iNode = 0; iNode < fine_grid->nodes->GetnPoint(iPoint); iNode ++) {
-
-            CVPoint = fine_grid->nodes->GetPoint(iPoint, iNode);
+          for (auto CVPoint : fine_grid->nodes->GetPoints(iPoint)) {
 
             /*--- The new point can be agglomerated ---*/
 
@@ -182,9 +180,7 @@ CMultiGridGeometry::CMultiGridGeometry(CGeometry **geometry, CConfig *config_con
 
           /*--- Now we do a sweep over all the indirect nodes that can be added ---*/
 
-          for (iNode = 0; iNode < Suitable_Indirect_Neighbors.size(); iNode ++) {
-
-            CVPoint = Suitable_Indirect_Neighbors[iNode];
+          for (auto CVPoint : Suitable_Indirect_Neighbors) {
 
             /*--- The new point can be agglomerated ---*/
 
@@ -250,8 +246,7 @@ CMultiGridGeometry::CMultiGridGeometry(CGeometry **geometry, CConfig *config_con
       /*--- Count the number of agglomerated neighbors, and modify the queue ---*/
 
       priority = 0;
-      for (iNode = 0; iNode < fine_grid->nodes->GetnPoint(iPoint); iNode ++) {
-        jPoint = fine_grid->nodes->GetPoint(iPoint, iNode);
+      for (auto jPoint : fine_grid->nodes->GetPoints(iPoint)) {
         if (fine_grid->nodes->GetAgglomerate(jPoint) == true) priority++;
       }
       MGQueue_InnerCV.MoveCV(iPoint, priority);
@@ -290,9 +285,7 @@ CMultiGridGeometry::CMultiGridGeometry(CGeometry **geometry, CConfig *config_con
 
       /*--- Now we do a sweep over all the nodes that surround the seed point ---*/
 
-      for (iNode = 0; iNode < fine_grid->nodes->GetnPoint(iPoint); iNode ++) {
-
-        CVPoint = fine_grid->nodes->GetPoint(iPoint, iNode);
+      for (auto CVPoint : fine_grid->nodes->GetPoints(iPoint)) {
 
         /*--- Determine if the CVPoint can be agglomerated ---*/
 
@@ -326,9 +319,7 @@ CMultiGridGeometry::CMultiGridGeometry(CGeometry **geometry, CConfig *config_con
 
       /*--- Now we do a sweep over all the indirect nodes that can be added ---*/
 
-      for (iNode = 0; iNode < Suitable_Indirect_Neighbors.size(); iNode ++) {
-
-        CVPoint = Suitable_Indirect_Neighbors[iNode];
+      for (auto CVPoint : Suitable_Indirect_Neighbors) {
 
         /*--- The new point can be agglomerated ---*/
 
@@ -392,7 +383,7 @@ CMultiGridGeometry::CMultiGridGeometry(CGeometry **geometry, CConfig *config_con
 
   /*--- Check that there are no hanging nodes ---*/
 
-  unsigned long iFinePoint, iFinePoint_Neighbor, iCoarsePoint, iCoarsePoint_Complete;
+  unsigned long iFinePoint, iCoarsePoint, iCoarsePoint_Complete;
   unsigned short iChildren;
 
   /*--- Find the point surrounding a point ---*/
@@ -403,8 +394,7 @@ CMultiGridGeometry::CMultiGridGeometry(CGeometry **geometry, CConfig *config_con
     for (iCoarsePoint = 0; iCoarsePoint < nPointDomain; iCoarsePoint ++) {
       for (iChildren = 0; iChildren <  nodes->GetnChildren_CV(iCoarsePoint); iChildren ++) {
         iFinePoint = nodes->GetChildren_CV(iCoarsePoint, iChildren);
-        for (iNode = 0; iNode < fine_grid->nodes->GetnPoint(iFinePoint); iNode ++) {
-          iFinePoint_Neighbor = fine_grid->nodes->GetPoint(iFinePoint, iNode);
+        for (auto iFinePoint_Neighbor : fine_grid->nodes->GetPoints(iFinePoint)) {
           iParent = fine_grid->nodes->GetParent_CV(iFinePoint_Neighbor);
           if (iParent != iCoarsePoint) {
             auto End = points[iCoarsePoint].end();
@@ -787,8 +777,7 @@ bool CMultiGridGeometry::GeometricalCheck(unsigned long iPoint, CGeometry *fine_
 void CMultiGridGeometry::SetSuitableNeighbors(vector<unsigned long> *Suitable_Indirect_Neighbors, unsigned long iPoint,
                                               unsigned long Index_CoarseCV, CGeometry *fine_grid) {
 
-  unsigned long jPoint, kPoint, lPoint;
-  unsigned short iNode, jNode, iNeighbor, jNeighbor, kNode;
+  unsigned short iNeighbor, jNeighbor;
   bool SecondNeighborSeed, ThirdNeighborSeed;
   vector<unsigned long>::iterator it;
 
@@ -796,20 +785,16 @@ void CMultiGridGeometry::SetSuitableNeighbors(vector<unsigned long> *Suitable_In
 
   vector<unsigned long> First_Neighbor_Points;
   First_Neighbor_Points.push_back(iPoint);
-  for (iNode = 0; iNode < fine_grid->nodes->GetnPoint(iPoint); iNode ++) {
-    jPoint = fine_grid->nodes->GetPoint(iPoint, iNode);
+  for (auto jPoint : fine_grid->nodes->GetPoints(iPoint))
     First_Neighbor_Points.push_back(jPoint);
-  }
 
   /*--- Create a list with the second neighbors, without first, and seed neighbors ---*/
 
   vector<unsigned long> Second_Neighbor_Points, Second_Origin_Points, Suitable_Second_Neighbors;
 
-  for (iNode = 0; iNode < fine_grid->nodes->GetnPoint(iPoint); iNode ++) {
-    jPoint = fine_grid->nodes->GetPoint(iPoint, iNode);
+  for (auto jPoint : fine_grid->nodes->GetPoints(iPoint)) {
 
-    for (jNode = 0; jNode < fine_grid->nodes->GetnPoint(jPoint); jNode ++) {
-      kPoint = fine_grid->nodes->GetPoint(jPoint, jNode);
+    for (auto kPoint : fine_grid->nodes->GetPoints(jPoint)) {
 
       /*--- Check that the second neighbor do not belong to the first neighbor or the seed ---*/
 
@@ -865,11 +850,8 @@ void CMultiGridGeometry::SetSuitableNeighbors(vector<unsigned long> *Suitable_In
 
   vector<unsigned long> Third_Neighbor_Points, Third_Origin_Points;
 
-  for (jNode = 0; jNode < Suitable_Second_Neighbors.size(); jNode ++) {
-    kPoint = Suitable_Second_Neighbors[jNode];
-
-    for (kNode = 0; kNode < fine_grid->nodes->GetnPoint(kPoint); kNode ++) {
-      lPoint = fine_grid->nodes->GetPoint(kPoint, kNode);
+  for (auto kPoint : Suitable_Second_Neighbors) {
+    for (auto lPoint : fine_grid->nodes->GetPoints(kPoint)) {
 
       /*--- Check that the third neighbor do not belong to the first neighbors or the seed ---*/
 
@@ -922,8 +904,8 @@ void CMultiGridGeometry::SetSuitableNeighbors(vector<unsigned long> *Suitable_In
 
 void CMultiGridGeometry::SetPoint_Connectivity(CGeometry *fine_grid) {
 
-  unsigned long iFinePoint, iFinePoint_Neighbor, iParent, iCoarsePoint;
-  unsigned short iChildren, iNode;
+  unsigned long iFinePoint, iParent, iCoarsePoint;
+  unsigned short iChildren;
 
   /*--- Set the point surrounding a point ---*/
 
@@ -932,8 +914,7 @@ void CMultiGridGeometry::SetPoint_Connectivity(CGeometry *fine_grid) {
   for (iCoarsePoint = 0; iCoarsePoint < nPoint; iCoarsePoint ++) {
     for (iChildren = 0; iChildren <  nodes->GetnChildren_CV(iCoarsePoint); iChildren ++) {
       iFinePoint = nodes->GetChildren_CV(iCoarsePoint, iChildren);
-      for (iNode = 0; iNode < fine_grid->nodes->GetnPoint(iFinePoint); iNode ++) {
-        iFinePoint_Neighbor = fine_grid->nodes->GetPoint(iFinePoint, iNode);
+      for (auto iFinePoint_Neighbor : fine_grid->nodes->GetPoints(iFinePoint)) {
         iParent = fine_grid->nodes->GetParent_CV(iFinePoint_Neighbor);
         if (iParent != iCoarsePoint) {
           auto End = points[iCoarsePoint].end();
@@ -1101,9 +1082,9 @@ void CMultiGridGeometry::SetControlVolume(CConfig *config, CGeometry *fine_grid,
 
   SU2_OMP_MASTER {
 
-  unsigned long iFinePoint, iFinePoint_Neighbor, iCoarsePoint, iEdge, iParent;
+  unsigned long iFinePoint, iCoarsePoint, iEdge, iParent;
   long FineEdge, CoarseEdge;
-  unsigned short iChildren, iNode, iDim;
+  unsigned short iChildren, iDim;
   bool change_face_orientation;
   su2double Coarse_Volume, Area;
 
@@ -1127,8 +1108,7 @@ void CMultiGridGeometry::SetControlVolume(CConfig *config, CGeometry *fine_grid,
     for (iChildren = 0; iChildren < nodes->GetnChildren_CV(iCoarsePoint); iChildren ++) {
       iFinePoint = nodes->GetChildren_CV(iCoarsePoint, iChildren);
 
-      for (iNode = 0; iNode < fine_grid->nodes->GetnPoint(iFinePoint); iNode ++) {
-        iFinePoint_Neighbor = fine_grid->nodes->GetPoint(iFinePoint, iNode);
+      for (auto iFinePoint_Neighbor : fine_grid->nodes->GetPoints(iFinePoint)) {
         iParent = fine_grid->nodes->GetParent_CV(iFinePoint_Neighbor);
         if ((iParent != iCoarsePoint) && (iParent < iCoarsePoint)) {
 
@@ -1365,12 +1345,10 @@ void CMultiGridGeometry::FindNormal_Neighbor(CConfig *config) {
 
           /*--- Compute closest normal neighbor ---*/
           su2double cos_max, scalar_prod, norm_vect, norm_Normal, cos_alpha, diff_coord;
-          unsigned long Point_Normal = 0, jPoint;
-          unsigned short iNeigh;
+          unsigned long Point_Normal = 0;
           su2double *Normal = vertex[iMarker][iVertex]->GetNormal();
           cos_max = -1.0;
-          for (iNeigh = 0; iNeigh < nodes->GetnPoint(iPoint); iNeigh++) {
-            jPoint = nodes->GetPoint(iPoint, iNeigh);
+          for (auto jPoint : nodes->GetPoints(iPoint)) {
             scalar_prod = 0.0; norm_vect = 0.0; norm_Normal = 0.0;
             for (iDim = 0; iDim < nDim; iDim++) {
               diff_coord = nodes->GetCoord(jPoint, iDim)-nodes->GetCoord(iPoint, iDim);
diff --git a/Common/src/geometry/dual_grid/CEdge.cpp b/Common/src/geometry/dual_grid/CEdge.cpp
index 0e960c68445..060932dec3b 100644
--- a/Common/src/geometry/dual_grid/CEdge.cpp
+++ b/Common/src/geometry/dual_grid/CEdge.cpp
@@ -27,14 +27,16 @@
 
 #include "../../../include/geometry/dual_grid/CEdge.hpp"
 #include "../../../include/toolboxes/geometry_toolbox.hpp"
+#include "../../../include/omp_structure.hpp"
 
 using namespace GeometryToolbox;
 
-
-CEdge::CEdge(unsigned long nEdge, unsigned long nDim) :
-  Nodes(nEdge,2), Normal(nEdge,nDim), Coord_CG(nEdge,nDim) {
-  Normal = su2double(0.0);
-  Coord_CG = su2double(0.0);
+CEdge::CEdge(unsigned long nEdge, unsigned long nDim) {
+  /*--- Allocate with padding: row counts come from nextMultiple(nEdge, preferred SIMD length), and every entry (padding included) is set to zero. ---*/
+  const auto nEdgeSIMD = nextMultiple(nEdge, simd::preferredLen<su2double>());
+  Nodes.resize(nEdgeSIMD,2) = 0;
+  Normal.resize(nEdgeSIMD,nDim) = su2double(0.0);
+  Coord_CG.resize(nEdgeSIMD,nDim) = su2double(0.0);
 }
 
 void CEdge::SetZeroValues(void) {
diff --git a/Common/src/linear_algebra/CSysMatrix.cpp b/Common/src/linear_algebra/CSysMatrix.cpp
index 54fc016e5c6..aaff51d514a 100644
--- a/Common/src/linear_algebra/CSysMatrix.cpp
+++ b/Common/src/linear_algebra/CSysMatrix.cpp
@@ -1,7 +1,7 @@
 /*!
  * \file CSysMatrix.cpp
  * \brief Implementation of the sparse matrix class.
- * \author F. Palacios, A. Bueno, T. Economon
+ * \author F. Palacios, A. Bueno, T. Economon, P. Gomes
  * \version 7.0.6 "Blackbird"
  *
  * SU2 Project Website: https://su2code.github.io
@@ -112,7 +112,8 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
   /*--- Type of preconditioner the matrix will be asked to build. ---*/
   auto prec = config->GetKind_Linear_Solver_Prec();
 
-  if (!EdgeConnect && !config->GetStructuralProblem()) {
+  if ((!EdgeConnect && !config->GetStructuralProblem()) ||
+      (config->GetKind_SU2() == SU2_DEF) || (config->GetKind_SU2() == SU2_DOT)) {
     /*--- FEM-type connectivity in non-structural context implies mesh deformation. ---*/
     prec = config->GetKind_Deform_Linear_Solver_Prec();
   }
@@ -201,34 +202,26 @@ void CSysMatrix<ScalarType>::Initialize(unsigned long npoint, unsigned long npoi
   /*--- Generate MKL Kernels ---*/
 
 #ifdef USE_MKL
-#ifndef USE_MIXED_PRECISION
-  /*--- Double precision kernels. ---*/
-  #define CREATE_GEMM mkl_jit_create_dgemm
-  #define GET_GEMM_PTR mkl_jit_get_dgemm_ptr
-#else
-  /*--- Single precision kernels. ---*/
-  #define CREATE_GEMM mkl_jit_create_sgemm
-  #define GET_GEMM_PTR mkl_jit_get_sgemm_ptr
-#endif
-  CREATE_GEMM(&MatrixMatrixProductJitter, MKL_ROW_MAJOR,
-              MKL_NOTRANS, MKL_NOTRANS, nVar, nVar, nVar, 1.0, nVar, nVar, 0.0, nVar);
-  MatrixMatrixProductKernel = GET_GEMM_PTR(MatrixMatrixProductJitter);
-
-  CREATE_GEMM(&MatrixVectorProductJitterBetaZero, MKL_COL_MAJOR,
-              MKL_NOTRANS, MKL_NOTRANS, 1, nVar, nEqn, 1.0, 1, nEqn, 0.0, 1);
-  MatrixVectorProductKernelBetaZero = GET_GEMM_PTR(MatrixVectorProductJitterBetaZero);
-
-  CREATE_GEMM(&MatrixVectorProductJitterBetaOne, MKL_COL_MAJOR,
-              MKL_NOTRANS, MKL_NOTRANS, 1, nVar, nEqn, 1.0, 1, nEqn, 1.0, 1);
-  MatrixVectorProductKernelBetaOne = GET_GEMM_PTR(MatrixVectorProductJitterBetaOne);
-
-  CREATE_GEMM(&MatrixVectorProductJitterAlphaMinusOne, MKL_COL_MAJOR,
-              MKL_NOTRANS, MKL_NOTRANS, 1, nVar, nEqn, -1.0, 1, nEqn, 1.0, 1);
-  MatrixVectorProductKernelAlphaMinusOne = GET_GEMM_PTR(MatrixVectorProductJitterAlphaMinusOne);
-
-  CREATE_GEMM(&MatrixVectorProductTranspJitterBetaOne, MKL_COL_MAJOR,
-              MKL_NOTRANS, MKL_NOTRANS, nEqn, 1, nVar, 1.0, nEqn, nVar, 1.0, nEqn);
-  MatrixVectorProductTranspKernelBetaOne = GET_GEMM_PTR(MatrixVectorProductTranspJitterBetaOne);
+  using mkl = mkl_jit_wrapper<ScalarType>;
+  mkl::create_gemm(&MatrixMatrixProductJitter, MKL_ROW_MAJOR, MKL_NOTRANS,
+                   MKL_NOTRANS, nVar, nVar, nVar, 1.0, nVar, nVar, 0.0, nVar);
+  MatrixMatrixProductKernel = mkl::get_gemm(MatrixMatrixProductJitter);
+
+  mkl::create_gemm(&MatrixVectorProductJitterBetaZero, MKL_COL_MAJOR,
+                   MKL_NOTRANS, MKL_NOTRANS, 1, nVar, nEqn, 1.0, 1, nEqn, 0.0, 1);
+  MatrixVectorProductKernelBetaZero = mkl::get_gemm(MatrixVectorProductJitterBetaZero);
+
+  mkl::create_gemm(&MatrixVectorProductJitterBetaOne, MKL_COL_MAJOR,
+                   MKL_NOTRANS, MKL_NOTRANS, 1, nVar, nEqn, 1.0, 1, nEqn, 1.0, 1);
+  MatrixVectorProductKernelBetaOne = mkl::get_gemm(MatrixVectorProductJitterBetaOne);
+
+  mkl::create_gemm(&MatrixVectorProductJitterAlphaMinusOne, MKL_COL_MAJOR,
+                   MKL_NOTRANS, MKL_NOTRANS, 1, nVar, nEqn, -1.0, 1, nEqn, 1.0, 1);
+  MatrixVectorProductKernelAlphaMinusOne = mkl::get_gemm(MatrixVectorProductJitterAlphaMinusOne);
+
+  mkl::create_gemm(&MatrixVectorProductTranspJitterBetaOne, MKL_COL_MAJOR,
+                   MKL_NOTRANS, MKL_NOTRANS, nEqn, 1, nVar, 1.0, nEqn, nVar, 1.0, nEqn);
+  MatrixVectorProductTranspKernelBetaOne = mkl::get_gemm(MatrixVectorProductTranspJitterBetaOne);
 #endif
 
 }
@@ -514,9 +507,12 @@ void CSysMatrix<ScalarType>::CompleteComms(CSysVector<OtherType> & x,
 
 template<class ScalarType>
 void CSysMatrix<ScalarType>::SetValZero() {
-  SU2_OMP_FOR_STAT(omp_light_size)
-  for (auto index = 0ul; index < nnz*nVar*nEqn; index++)
-    matrix[index] = 0.0;
+  const auto size = nnz*nVar*nEqn; // total number of scalar entries to clear
+  const auto chunk = roundUpDiv(size,omp_get_num_threads()); // split over the current team (1 thread when called serially)
+  const auto begin = min<size_t>(chunk * omp_get_thread_num(), size); // clamp so that size-begin cannot wrap around
+  const auto mySize = min<size_t>(chunk, size-begin) * sizeof(ScalarType); // last chunk(s) may be short or empty
+  memset(&matrix[begin], 0, mySize);
+  SU2_OMP_BARRIER
 }
 
 template<class ScalarType>
@@ -618,31 +614,15 @@ void CSysMatrix<ScalarType>::MatrixInverse(ScalarType *matrix, ScalarType *inver
 template<class ScalarType>
 void CSysMatrix<ScalarType>::DeleteValsRowi(unsigned long i) {
 
-  unsigned long block_i = i/nVar;
-  unsigned long row = i - block_i*nVar;
-  unsigned long index, iVar;
+  const auto block_i = i/nVar;
+  const auto row = i%nVar;
 
-  for (index = row_ptr[block_i]; index < row_ptr[block_i+1]; index++) {
-    for (iVar = 0; iVar < nVar; iVar++)
+  for (auto index = row_ptr[block_i]; index < row_ptr[block_i+1]; index++) {
+    for (auto iVar = 0u; iVar < nVar; iVar++)
       matrix[index*nVar*nVar+row*nVar+iVar] = 0.0; // Delete row values in the block
     if (col_ind[index] == block_i)
       matrix[index*nVar*nVar+row*nVar+row] = 1.0; // Set 1 to the diagonal element
   }
-
-}
-
-template<class ScalarType>
-void CSysMatrix<ScalarType>::RowProduct(const CSysVector<ScalarType> & vec,
-                                        unsigned long row_i, ScalarType *prod) const {
-  unsigned long iVar, index, col_j;
-
-  for (iVar = 0; iVar < nVar; iVar++) prod[iVar] = 0.0;
-
-  for (index = row_ptr[row_i]; index < row_ptr[row_i+1]; index++) {
-    col_j = col_ind[index];
-    MatrixVectorProductAdd(&matrix[index*nVar*nVar], &vec[col_j*nVar], prod);
-  }
-
 }
 
 template<class ScalarType>
@@ -669,14 +649,7 @@ void CSysMatrix<ScalarType>::MatrixVectorProduct(const CSysVector<ScalarType> &
 
   SU2_OMP_FOR_DYN(omp_heavy_size)
   for (auto row_i = 0ul; row_i < nPointDomain; row_i++) {
-    auto prod_begin = row_i*nVar; // offset to beginning of block row_i
-    for(auto iVar = 0ul; iVar < nVar; iVar++)
-      prod[prod_begin+iVar] = 0.0;
-    for (auto index = row_ptr[row_i]; index < row_ptr[row_i+1]; index++) {
-      auto vec_begin = col_ind[index]*nEqn; // offset to beginning of block col_ind[index]
-      auto mat_begin = index*nVar*nEqn; // offset to beginning of matrix block[row_i][col_ind[indx]]
-      MatrixVectorProductAdd(&matrix[mat_begin], &vec[vec_begin], &prod[prod_begin]);
-    }
+    RowProduct(vec, row_i, &prod[row_i*nVar]);
   }
 
   /*--- MPI Parallelization. ---*/
@@ -1008,9 +981,7 @@ unsigned long CSysMatrix<ScalarType>::BuildLineletPreconditioner(CGeometry *geom
 
   nLinelet = 0;
   for (iMarker = 0; iMarker < config->GetnMarker_All(); iMarker++) {
-    if ((config->GetMarker_All_KindBC(iMarker) == HEAT_FLUX              ) ||
-        (config->GetMarker_All_KindBC(iMarker) == ISOTHERMAL             ) ||
-        (config->GetMarker_All_KindBC(iMarker) == EULER_WALL             ) ||
+    if (config->GetSolid_Wall(iMarker) ||
         (config->GetMarker_All_KindBC(iMarker) == DISPLACEMENT_BOUNDARY)) {
       nLinelet += geometry->nVertex[iMarker];
     }
@@ -1029,9 +1000,7 @@ unsigned long CSysMatrix<ScalarType>::BuildLineletPreconditioner(CGeometry *geom
     iLinelet = 0;
 
     for (iMarker = 0; iMarker < config->GetnMarker_All(); iMarker++) {
-      if ((config->GetMarker_All_KindBC(iMarker) == HEAT_FLUX              ) ||
-          (config->GetMarker_All_KindBC(iMarker) == ISOTHERMAL             ) ||
-          (config->GetMarker_All_KindBC(iMarker) == EULER_WALL             ) ||
+      if (config->GetSolid_Wall(iMarker) ||
           (config->GetMarker_All_KindBC(iMarker) == DISPLACEMENT_BOUNDARY))
       {
         for (iVertex = 0; iVertex < geometry->nVertex[iMarker]; iVertex++) {
diff --git a/Common/src/linear_algebra/CSysSolve.cpp b/Common/src/linear_algebra/CSysSolve.cpp
index 7f5a9db1524..1070ad4989f 100644
--- a/Common/src/linear_algebra/CSysSolve.cpp
+++ b/Common/src/linear_algebra/CSysSolve.cpp
@@ -1,7 +1,7 @@
 /*!
  * \file CSysSolve.cpp
  * \brief Main classes required for solving linear systems of equations
- * \author J. Hicken, F. Palacios, T. Economon
+ * \author J. Hicken, F. Palacios, T. Economon, P. Gomes
  * \version 7.0.6 "Blackbird"
  *
  * SU2 Project Website: https://su2code.github.io
@@ -37,20 +37,27 @@
 
 #include <limits>
 
-/*!< \brief machine epsilon */
-#ifndef USE_MIXED_PRECISION
-const passivedouble eps = numeric_limits<passivedouble>::epsilon();
-#else
-const passivedouble eps = 1e-12;
-#endif
+/*!
+ * \brief Threshold used in CSysSolve to decide if the linear system is already solved,
+ * machine epsilon of passivedouble for any type except float, which uses a fixed 1e-12.
+ */
+namespace {
+  template<class T>
+  constexpr T linSolEpsilon() { return numeric_limits<passivedouble>::epsilon(); }
+  template<>
+  constexpr float linSolEpsilon<float>() { return 1e-12; }
+}
 
 template<class ScalarType>
-CSysSolve<ScalarType>::CSysSolve(const bool mesh_deform_mode) : cg_ready(false), bcg_ready(false),
-                                                                gmres_ready(false), smooth_ready(false) {
-  mesh_deform = mesh_deform_mode;
-  LinSysRes_ptr = nullptr;
-  LinSysSol_ptr = nullptr;
-  Residual = 0.0;
+CSysSolve<ScalarType>::CSysSolve(const bool mesh_deform_mode) :
+  eps(linSolEpsilon<ScalarType>()),
+  mesh_deform(mesh_deform_mode),
+  cg_ready(false),
+  bcg_ready(false),
+  gmres_ready(false),
+  smooth_ready(false),
+  LinSysSol_ptr(nullptr),
+  LinSysRes_ptr(nullptr) {
 }
 
 template<class ScalarType>
@@ -92,23 +99,23 @@ void CSysSolve<ScalarType>::GenerateGivens(ScalarType & dx, ScalarType & dy, Sca
 }
 
 template<class ScalarType>
-void CSysSolve<ScalarType>::SolveReduced(int n, const vector<vector<ScalarType> > & Hsbg,
-                                         const vector<ScalarType> & rhs, vector<ScalarType> & x) const {
+void CSysSolve<ScalarType>::SolveReduced(int n, const su2matrix<ScalarType>& Hsbg,
+                                         const su2vector<ScalarType>& rhs, su2vector<ScalarType>& x) const {
   // initialize...
   for (int i = 0; i < n; i++)
     x[i] = rhs[i];
   // ... and backsolve
   for (int i = n-1; i >= 0; i--) {
-    x[i] /= Hsbg[i][i];
+    x[i] /= Hsbg(i,i);
     for (int j = i-1; j >= 0; j--) {
-      x[j] -= Hsbg[j][i]*x[i];
+      x[j] -= Hsbg(j,i)*x[i];
     }
   }
 }
 
 template<class ScalarType>
-void CSysSolve<ScalarType>::ModGramSchmidt(int i, vector<vector<ScalarType> > & Hsbg,
-                                           vector<CSysVector<ScalarType> > & w) const {
+void CSysSolve<ScalarType>::ModGramSchmidt(int i, su2matrix<ScalarType>& Hsbg,
+                                           vector<CSysVector<ScalarType> >& w) const {
 
   /*--- Parameter for reorthonormalization ---*/
 
@@ -132,28 +139,28 @@ void CSysSolve<ScalarType>::ModGramSchmidt(int i, vector<vector<ScalarType> > &
 
   for (int k = 0; k < i+1; k++) {
     ScalarType prod = w[i+1].dot(w[k]);
-    Hsbg[k][i] = prod;
-    w[i+1].Plus_AX(-prod, w[k]);
+    Hsbg(k,i) = prod;
+    w[i+1] -= prod * w[k];
 
     /*--- Check if reorthogonalization is necessary ---*/
 
     if (prod*prod > thr) {
       prod = w[i+1].dot(w[k]);
-      Hsbg[k][i] += prod;
-      w[i+1].Plus_AX(-prod, w[k]);
+      Hsbg(k,i) += prod;
+      w[i+1] -= prod * w[k];
     }
 
     /*--- Update the norm and check its size ---*/
 
-    nrm -= Hsbg[k][i]*Hsbg[k][i];
-    if (nrm < 0.0) nrm = 0.0;
+    nrm -= pow(Hsbg(k,i),2);
+    nrm = max<ScalarType>(nrm, 0.0);
     thr = nrm*reorth;
   }
 
   /*--- Test the resulting vector ---*/
 
   nrm = w[i+1].norm();
-  Hsbg[i+1][i] = nrm;
+  Hsbg(i+1,i) = nrm;
 
   /*--- Scale the resulting vector ---*/
 
@@ -212,8 +219,7 @@ unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType> &
 
   if (!cg_ready) {
     SU2_OMP_BARRIER
-    SU2_OMP_MASTER
-    {
+    SU2_OMP_MASTER {
       auto nVar = b.GetNVar();
       auto nBlk = b.GetNBlk();
       auto nBlkDomain = b.GetNBlkDomain();
@@ -231,7 +237,7 @@ unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType> &
   /*--- Calculate the initial residual, compute norm, and check if system is already solved ---*/
 
   mat_vec(x, A_x);
-  r = b; r -= A_x;
+  r = b - A_x;
 
   /*--- Only compute the residuals in full communication mode. ---*/
 
@@ -251,17 +257,16 @@ unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType> &
 
     /*--- Output header information including initial residual ---*/
 
-    if ((monitoring) && (master)) {
+    if (monitoring && master) {
       WriteHeader("CG", tol, norm_r);
       WriteHistory(i, norm_r/norm0);
     }
 
   }
 
-  ScalarType alpha, beta, r_dot_z, r_dot_z_old;
   precond(r, z);
   p = z;
-  r_dot_z = r.dot(z);
+  ScalarType r_dot_z = r.dot(z);
 
   /*---  Loop over all search directions ---*/
 
@@ -273,12 +278,12 @@ unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType> &
 
     /*--- Calculate step-length alpha ---*/
 
-    alpha = r_dot_z / A_x.dot(p);
+    ScalarType alpha = r_dot_z / A_x.dot(p);
 
     /*--- Update solution and residual: ---*/
 
-    x.Plus_AX(alpha, p);
-    r.Plus_AX(-alpha, A_x);
+    x += alpha * p;
+    r -= alpha * A_x;
 
     /*--- Only compute the residuals in full communication mode. ---*/
 
@@ -295,16 +300,15 @@ unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType> &
 
     precond(r, z);
 
-    /*--- Calculate Gram-Schmidt coefficient beta,
-     beta = dotProd(r_{i+1}, z_{i+1}) / dotProd(r_{i}, z_{i}) ---*/
+    /*--- Calculate Gram-Schmidt coefficient, beta = (r_{i+1}, z_{i+1}) / (r_{i}, z_{i}) ---*/
 
-    r_dot_z_old = r_dot_z;
+    ScalarType beta = r_dot_z;
     r_dot_z = r.dot(z);
-    beta = r_dot_z / r_dot_z_old;
+    beta = r_dot_z / beta;
 
-    /*--- Gram-Schmidt orthogonalization; p = beta *p + z ---*/
+    /*--- Gram-Schmidt orthogonalization. ---*/
 
-    p.Equals_AX_Plus_BY(beta, p, 1.0, z);
+    p = beta*p + z;
 
   }
 
@@ -315,7 +319,7 @@ unsigned long CSysSolve<ScalarType>::CG_LinSolver(const CSysVector<ScalarType> &
     if (master) WriteFinalResidual("CG", i, norm_r/norm0);
 
     mat_vec(x, A_x);
-    r = b; r -= A_x;
+    r = b - A_x;
     ScalarType true_res = r.norm();
 
     if (fabs(true_res - norm_r) > tol*10.0) {
@@ -356,10 +360,11 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
 
   if (!gmres_ready) {
     SU2_OMP_BARRIER
-    SU2_OMP_MASTER
-    {
-      W.resize(m+1, x);
-      Z.resize(m+1, x);
+    SU2_OMP_MASTER {
+      W.resize(m+1);
+      Z.resize(m+1);
+      for (auto& w : W) w.Initialize(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr);
+      for (auto& z : Z) z.Initialize(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr);
       gmres_ready = true;
     }
     SU2_OMP_BARRIER
@@ -369,11 +374,13 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
    on its own thread, since calculations on these arrays are based on dot products
    (reduced across all threads and ranks) all threads do the same computations. ---*/
 
-  vector<ScalarType> g(m+1, 0.0);
-  vector<ScalarType> sn(m+1, 0.0);
-  vector<ScalarType> cs(m+1, 0.0);
-  vector<ScalarType> y(m, 0.0);
-  vector<vector<ScalarType> > H(m+1, vector<ScalarType>(m, 0.0));
+  su2vector<ScalarType> g(m+1), sn(m+1), cs(m+1), y(m);
+  g = ScalarType(0);
+  sn = ScalarType(0);
+  cs = ScalarType(0);
+  y = ScalarType(0);
+  su2matrix<ScalarType> H(m+1, m);
+  H = ScalarType(0);
 
   /*--- Calculate the norm of the rhs vector. ---*/
 
@@ -459,7 +466,7 @@ unsigned long CSysSolve<ScalarType>::FGMRES_LinSolver(const CSysVector<ScalarTyp
 
   SolveReduced(i, H, g, y);
   for (unsigned long k = 0; k < i; k++) {
-    x.Plus_AX(y[k], Z[k]);
+    x += y[k] * Z[k];
   }
 
   /*---  Recalculate final (neg.) residual (this should be optional) ---*/
@@ -505,8 +512,7 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
 
   if (!bcg_ready) {
     SU2_OMP_BARRIER
-    SU2_OMP_MASTER
-    {
+    SU2_OMP_MASTER {
       auto nVar = b.GetNVar();
       auto nBlk = b.GetNBlk();
       auto nBlkDomain = b.GetNBlkDomain();
@@ -526,7 +532,7 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
   /*--- Calculate the initial residual, compute norm, and check if system is already solved ---*/
 
   mat_vec(x, A_x);
-  r = b; r -= A_x;
+  r = b - A_x;
 
   /*--- Only compute the residuals in full communication mode. ---*/
 
@@ -555,7 +561,7 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
 
   /*--- Initialization ---*/
 
-  ScalarType alpha = 1.0, beta = 1.0, omega = 1.0, rho = 1.0, rho_prime = 1.0;
+  ScalarType alpha = 1.0, omega = 1.0, rho = 1.0, rho_prime = 1.0;
   p = ScalarType(0.0); v = ScalarType(0.0); r_0 = r;
 
   /*--- Loop over all search directions ---*/
@@ -572,13 +578,11 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
 
     /*--- Compute beta ---*/
 
-    beta = (rho / rho_prime) * (alpha /omega);
+    ScalarType beta = (rho / rho_prime) * (alpha /omega);
 
-    /*--- p_{i} = r_{i-1} + beta * p_{i-1} - beta * omega * v_{i-1} ---*/
+    /*--- Update p ---*/
 
-    ScalarType beta_omega = -beta*omega;
-    p.Equals_AX_Plus_BY(beta, p, beta_omega, v);
-    p += r;
+    p = beta * (p - omega*v) + r;
 
     /*--- Preconditioning step ---*/
 
@@ -590,12 +594,10 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
     ScalarType r_0_v = r_0.dot(v);
     alpha = rho / r_0_v;
 
-    /*--- Update solution and residual: ---*/
+    /*--- Update solution and residual ---*/
 
-    /*--- x_{i-1/2} = x_{i-1} + alpha * z ---*/
-    x.Plus_AX(alpha, z);
-    /*--- r_{i-1/2} = r_{i-1} - alpha * v_{i} ---*/
-    r.Plus_AX(-alpha, v);
+    x += alpha * z;
+    r -= alpha * v;
 
     /*--- Preconditioning step ---*/
 
@@ -608,12 +610,10 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
     if (omega == ScalarType(0)) break;
     omega = A_x.dot(r) / omega;
 
-    /*--- Update solution and residual: ---*/
+    /*--- Update solution and residual ---*/
 
-    /*--- x_{i} = x_{i-1/2} + omega * z ---*/
-    x.Plus_AX(omega, z);
-    /*--- r_{i} = r_{i-1/2} - omega * A * z ---*/
-    r.Plus_AX(-omega, A_x);
+    x += omega * z;
+    r -= omega * A_x;
 
     /*--- Only compute the residuals in full communication mode. ---*/
 
@@ -637,7 +637,7 @@ unsigned long CSysSolve<ScalarType>::BCGSTAB_LinSolver(const CSysVector<ScalarTy
     if (master) WriteFinalResidual("BCGSTAB", i, norm_r/norm0);
 
     mat_vec(x, A_x);
-    r = b; r -= A_x;
+    r = b - A_x;
     ScalarType true_res = r.norm();
 
     if ((fabs(true_res - norm_r) > tol*10.0) && (master)) {
@@ -660,7 +660,7 @@ unsigned long CSysSolve<ScalarType>::Smoother_LinSolver(const CSysVector<ScalarT
   unsigned long i = 0;
 
   /*--- Relaxation factor, see comments inside the loop over the smoothing iterations. ---*/
-  ScalarType omega = SU2_TYPE::GetValue(config->GetLinear_Solver_Smoother_Relaxation());
+  const ScalarType omega = SU2_TYPE::GetValue(config->GetLinear_Solver_Smoother_Relaxation());
 
   if (m < 1) {
     SU2_OMP_MASTER
@@ -672,8 +672,7 @@ unsigned long CSysSolve<ScalarType>::Smoother_LinSolver(const CSysVector<ScalarT
 
   if (!smooth_ready) {
     SU2_OMP_BARRIER
-    SU2_OMP_MASTER
-    {
+    SU2_OMP_MASTER {
       auto nVar = b.GetNVar();
       auto nBlk = b.GetNBlk();
       auto nBlkDomain = b.GetNBlkDomain();
@@ -690,7 +689,7 @@ unsigned long CSysSolve<ScalarType>::Smoother_LinSolver(const CSysVector<ScalarT
   /*--- Compute the initial residual and check if the system is already solved (if in COMM_FULL mode). ---*/
 
   mat_vec(x, A_x);
-  r = b; r -= A_x;
+  r = b - A_x;
 
   /*--- Only compute the residuals in full communication mode. ---*/
 
@@ -739,8 +738,8 @@ unsigned long CSysSolve<ScalarType>::Smoother_LinSolver(const CSysVector<ScalarT
      M^{-1}(b-A*x) which converges if ||I-w*M^{-1}*A|| < 1. Combining this method
      with a Gauss-Seidel preconditioner and w>1 is NOT equivalent to SOR. ---*/
 
-    x.Plus_AX(omega, z);
-    r.Plus_AX(-omega, A_x);
+    x += omega * z;
+    r -= omega * A_x;
 
     /*--- Only compute the residuals in full communication mode. ---*/
     /*--- Check if solution has converged, else output the relative residual if necessary. ---*/
@@ -761,67 +760,6 @@ unsigned long CSysSolve<ScalarType>::Smoother_LinSolver(const CSysVector<ScalarT
   return i;
 }
 
-template<>
-void CSysSolve<su2double>::HandleTemporariesIn(const CSysVector<su2double> & LinSysRes, CSysVector<su2double> & LinSysSol) {
-
-  /*--- When the type is the same the temporaties are not required ---*/
-  /*--- Set the pointers ---*/
-  SU2_OMP_MASTER
-  {
-    LinSysRes_ptr = &LinSysRes;
-    LinSysSol_ptr = &LinSysSol;
-  }
-  SU2_OMP_BARRIER
-}
-
-template<>
-void CSysSolve<su2double>::HandleTemporariesOut(CSysVector<su2double> & LinSysSol) {
-
-  /*--- When the type is the same the temporaties are not required ---*/
-  /*--- Reset the pointers ---*/
-  SU2_OMP_MASTER
-  {
-    LinSysRes_ptr = nullptr;
-    LinSysSol_ptr = nullptr;
-  }
-  SU2_OMP_BARRIER
-}
-
-#if defined(CODI_REVERSE_TYPE) || defined(USE_MIXED_PRECISION)
-template<>
-void CSysSolve<su2mixedfloat>::HandleTemporariesIn(const CSysVector<su2double> & LinSysRes, CSysVector<su2double> & LinSysSol) {
-
-  /*--- When the type is different we need to copy data to the temporaries ---*/
-  /*--- Copy data, the solution is also copied because it serves as initial conditions ---*/
-  LinSysRes_tmp.PassiveCopy(LinSysRes);
-  LinSysSol_tmp.PassiveCopy(LinSysSol);
-
-  /*--- Set the pointers ---*/
-  SU2_OMP_MASTER
-  {
-    LinSysRes_ptr = &LinSysRes_tmp;
-    LinSysSol_ptr = &LinSysSol_tmp;
-  }
-  SU2_OMP_BARRIER
-}
-
-template<>
-void CSysSolve<su2mixedfloat>::HandleTemporariesOut(CSysVector<su2double> & LinSysSol) {
-
-  /*--- When the type is different we need to copy data from the temporaries ---*/
-  /*--- Copy data, only the solution needs to be copied ---*/
-  LinSysSol.PassiveCopy(LinSysSol_tmp);
-
-  /*--- Reset the pointers ---*/
-  SU2_OMP_MASTER
-  {
-    LinSysRes_ptr = nullptr;
-    LinSysSol_ptr = nullptr;
-  }
-  SU2_OMP_BARRIER
-}
-#endif
-
 template<class ScalarType>
 unsigned long CSysSolve<ScalarType>::Solve(CSysMatrix<ScalarType> & Jacobian, const CSysVector<su2double> & LinSysRes,
                                            CSysVector<su2double> & LinSysSol, CGeometry *geometry, const CConfig *config) {
diff --git a/Common/src/linear_algebra/CSysVector.cpp b/Common/src/linear_algebra/CSysVector.cpp
index 412d5cee066..ad60be59c5d 100644
--- a/Common/src/linear_algebra/CSysVector.cpp
+++ b/Common/src/linear_algebra/CSysVector.cpp
@@ -1,7 +1,7 @@
 /*!
  * \file CSysVector.cpp
- * \brief Main classes required for solving linear systems of equations
- * \author F. Palacios, J. Hicken
+ * \brief Implementation and explicit instantiations of CSysVector.
+ * \author P. Gomes, F. Palacios, J. Hicken, T. Economon
  * \version 7.0.6 "Blackbird"
  *
  * SU2 Project Website: https://su2code.github.io
@@ -26,233 +26,50 @@
  */
 
 #include "../../include/linear_algebra/CSysVector.hpp"
-#include "../../include/mpi_structure.hpp"
-#include "../../include/omp_structure.hpp"
 #include "../../include/toolboxes/allocation_toolbox.hpp"
 
-/*!
- * \brief OpenMP worksharing construct used in CSysVector for loops.
- * \note The loop will only run in parallel if methods are called from a
- * parallel region (if not the results will still be correct).
- * Static schedule to reduce overhead, chunk size determined at initialization.
- * "nowait" clause is safe when calling CSysVector methods after each other
- * as the loop size is the same. Methods of other classes that operate on a
- * CSysVector and do not have the same work scheduling must use a
- * SU2_OMP_BARRIER before using the vector.
- */
-#ifdef HAVE_OMP
-#ifdef HAVE_OMP_SIMD
-#define PARALLEL_FOR SU2_OMP(for simd schedule(static,omp_chunk_size) nowait)
-#else
-#define PARALLEL_FOR SU2_OMP(for schedule(static,omp_chunk_size) nowait)
-#endif
-#else
-#define PARALLEL_FOR SU2_OMP_SIMD
-#endif
-
-template<class ScalarType>
-CSysVector<ScalarType>::CSysVector(void) {
-
-  vec_val = nullptr;
-  nElm = 0;
-  nElmDomain = 0;
-  nVar = 0;
-  omp_chunk_size = OMP_MAX_SIZE;
-  dotRes = 0.0;
-}
-
-template<class ScalarType>
-void CSysVector<ScalarType>::Initialize(unsigned long numBlk, unsigned long numBlkDomain,
-                                        unsigned long numVar, const ScalarType* val, bool valIsArray) {
+template <class ScalarType>
+void CSysVector<ScalarType>::Initialize(unsigned long numBlk, unsigned long numBlkDomain, unsigned long numVar,
+                                        const ScalarType* val, bool valIsArray, bool errorIfParallel) {
+  if (errorIfParallel && omp_in_parallel()) {  // this check is only made when the caller requests it
+    assert(false);  // NOTE(review): with assertions enabled this presumably stops before the message below is issued - confirm intent
+    SU2_MPI::Error("If this class were constructed in parallel its operations would be incorrect.", CURRENT_FUNCTION);
+  }
 
-  /*--- Assert that this method is only called by one thread. ---*/
-  assert(omp_get_thread_num()==0 && "Only the master thread is allowed to initialize the vector.");
+  if (omp_get_thread_num())  // non-zero thread number, i.e. the caller is not thread 0
+    SU2_MPI::Error("Only the master thread is allowed to initialize the vector.", CURRENT_FUNCTION);
 
-  if ((nElm != numBlk*numVar) && (vec_val != nullptr)) {
+  if (nElm != numBlk * numVar) {  // size changes: drop the buffer so it is allocated anew below. NOTE(review): vec_val may be nullptr here, assumes aligned_free accepts that - confirm
     MemoryAllocation::aligned_free(vec_val);
     vec_val = nullptr;
   }
 
-  nElm = numBlk*numVar;
-  nElmDomain = numBlkDomain*numVar;
+  nElm = numBlk * numVar;  // total number of scalar entries
+  nElmDomain = numBlkDomain * numVar;  // same count, but using numBlkDomain blocks
   nVar = numVar;
 
   omp_chunk_size = computeStaticChunkSize(nElm, omp_get_max_threads(), OMP_MAX_SIZE);
 
-  if (vec_val == nullptr)
-    vec_val = MemoryAllocation::aligned_alloc<ScalarType>(64, nElm*sizeof(ScalarType));
+  if (vec_val == nullptr) vec_val = MemoryAllocation::aligned_alloc<ScalarType>(64, nElm * sizeof(ScalarType));  // an existing buffer of matching size is reused
 
-  if(val != nullptr) {
-    if(!valIsArray) {
-      for(auto i=0ul; i<nElm; i++) vec_val[i] = *val;
-    }
-    else {
-      for(auto i=0ul; i<nElm; i++) vec_val[i] = val[i];
+  if (val != nullptr) {  // entries are only written here when a value source is given
+    if (!valIsArray) {  // val points to a single value that is copied into every entry
+      for (auto i = 0ul; i < nElm; i++) vec_val[i] = *val;
+    } else {  // val is read as an array of (at least) nElm entries
+      for (auto i = 0ul; i < nElm; i++) vec_val[i] = val[i];
     }
   }
 }
 
-template<class ScalarType>
-template<class T>
-void CSysVector<ScalarType>::PassiveCopy(const CSysVector<T>& other) {
-
-  /*--- This is a method and not the overload of an operator to make sure who
-   calls it knows the consequence to the derivative information (lost) ---*/
-
-  /*--- check if self-assignment, otherwise perform deep copy ---*/
-  if ((const void*)this == (const void*)&other) return;
-
-  SU2_OMP_MASTER
-  Initialize(other.GetNBlk(), other.GetNBlkDomain(), other.GetNVar(), nullptr, true);
-  SU2_OMP_BARRIER
-
-  PARALLEL_FOR
-  for(auto i=0ul; i<nElm; i++)
-    vec_val[i] = SU2_TYPE::GetValue(other[i]);
-}
-
-template<class ScalarType>
+template <class ScalarType>
 CSysVector<ScalarType>::~CSysVector() {
-
   MemoryAllocation::aligned_free(vec_val);
 }
 
-template<class ScalarType>
-void CSysVector<ScalarType>::Equals_AX(ScalarType a, const CSysVector<ScalarType> & x) {
-
-  assert(nElm == x.nElm && "Sizes do not match");
-
-  PARALLEL_FOR
-  for(auto i=0ul; i<nElm; i++) vec_val[i] = a * x.vec_val[i];
-}
-
-template<class ScalarType>
-void CSysVector<ScalarType>::Plus_AX(ScalarType a, const CSysVector<ScalarType> & x) {
-
-  assert(nElm == x.nElm && "Sizes do not match");
-
-  PARALLEL_FOR
-  for(auto i=0ul; i<nElm; i++) vec_val[i] += a * x.vec_val[i];
-}
-
-template<class ScalarType>
-void CSysVector<ScalarType>::Equals_AX_Plus_BY(ScalarType a, const CSysVector<ScalarType> & x,
-                                               ScalarType b, const CSysVector<ScalarType> & y) {
-  assert(nElm == x.nElm && nElm == y.nElm && "Sizes do not match");
-
-  PARALLEL_FOR
-  for(auto i=0ul; i<nElm; i++)
-    vec_val[i] = a * x.vec_val[i] + b * y.vec_val[i];
-}
-
-template<class ScalarType>
-CSysVector<ScalarType> & CSysVector<ScalarType>::operator=(const CSysVector<ScalarType> & u) {
-
-  assert(nElm == u.nElm && "Sizes do not match");
-
-  PARALLEL_FOR
-  for(auto i=0ul; i<nElm; i++) vec_val[i] = u.vec_val[i];
-
-  return *this;
-}
-
-template<class ScalarType>
-CSysVector<ScalarType> & CSysVector<ScalarType>::operator=(ScalarType val) {
-
-  PARALLEL_FOR
-  for(auto i=0ul; i<nElm; i++) vec_val[i] = val;
-
-  return *this;
-}
-
-template<class ScalarType>
-CSysVector<ScalarType> & CSysVector<ScalarType>::operator+=(const CSysVector<ScalarType> & u) {
-
-  assert(nElm == u.nElm && "Sizes do not match");
-
-  PARALLEL_FOR
-  for(auto i=0ul; i<nElm; i++) vec_val[i] += u.vec_val[i];
-
-  return *this;
-}
-
-template<class ScalarType>
-CSysVector<ScalarType> & CSysVector<ScalarType>::operator-=(const CSysVector<ScalarType> & u) {
-
-  assert(nElm == u.nElm && "Sizes do not match");
-
-  PARALLEL_FOR
-  for(auto i=0ul; i<nElm; i++) vec_val[i] -= u.vec_val[i];
-
-  return *this;
-}
-
-template<class ScalarType>
-CSysVector<ScalarType> & CSysVector<ScalarType>::operator*=(ScalarType val) {
-
-  PARALLEL_FOR
-  for(auto i=0ul; i<nElm; i++) vec_val[i] *= val;
-
-  return *this;
-}
-
-template<class ScalarType>
-CSysVector<ScalarType> & CSysVector<ScalarType>::operator/=(ScalarType val) {
-
-  PARALLEL_FOR
-  for(auto i=0ul; i<nElm; i++) vec_val[i] /= val;
-
-  return *this;
-}
-
-template<class ScalarType>
-void CSysVector<ScalarType>::CopyToArray(ScalarType* u_array) const {
-
-  PARALLEL_FOR
-  for(auto i=0ul; i<nElm; i++) u_array[i] = vec_val[i];
-}
-
-template<class ScalarType>
-ScalarType CSysVector<ScalarType>::dot(const CSysVector<ScalarType> & u) const {
-
-  /*--- All threads get the same "view" of the vectors and shared variable. ---*/
-  SU2_OMP_BARRIER
-  dotRes = 0.0;
-  SU2_OMP_BARRIER
-
-  /*--- Local dot product for each thread. ---*/
-  ScalarType sum = 0.0;
-
-  PARALLEL_FOR
-  for(auto i=0ul; i<nElmDomain; ++i)
-    sum += vec_val[i]*u.vec_val[i];
-
-  /*--- Update shared variable with "our" partial sum. ---*/
-  atomicAdd(sum, dotRes);
-
-#ifdef HAVE_MPI
-  /*--- Reduce across all mpi ranks, only master thread communicates. ---*/
-  SU2_OMP_BARRIER
-  SU2_OMP_MASTER
-  {
-    sum = dotRes;
-    const auto mpi_type = (sizeof(ScalarType) < sizeof(double))? MPI_FLOAT : MPI_DOUBLE;
-    SelectMPIWrapper<ScalarType>::W::Allreduce(&sum, &dotRes, 1, mpi_type, MPI_SUM, MPI_COMM_WORLD);
-  }
-#endif
-  /*--- Make view of result consistent across threads. ---*/
-  SU2_OMP_BARRIER
-
-  return dotRes;
-}
-
 /*--- Explicit instantiations ---*/
 /*--- We allways need su2double (regardless if it is passive or active). ---*/
 template class CSysVector<su2double>;
 #if defined(CODI_REVERSE_TYPE) || defined(USE_MIXED_PRECISION)
-/*--- In reverse AD (or with mixed precision) we will also have passive (or float) vectors,
- *    and copy operations between them and active (or double) vectors, respectively. ---*/
+/*--- In reverse AD (or with mixed precision) we will also have passive (or float) vectors. ---*/
 template class CSysVector<su2mixedfloat>;
-template void CSysVector<su2mixedfloat>::PassiveCopy(const CSysVector<su2double>&);
-template void CSysVector<su2double>::PassiveCopy(const CSysVector<su2mixedfloat>&);
 #endif
diff --git a/SU2_CFD/include/gradients/computeGradientsGreenGauss.hpp b/SU2_CFD/include/gradients/computeGradientsGreenGauss.hpp
index cf406774c9f..9c016c8242a 100644
--- a/SU2_CFD/include/gradients/computeGradientsGreenGauss.hpp
+++ b/SU2_CFD/include/gradients/computeGradientsGreenGauss.hpp
@@ -29,9 +29,11 @@
 
 #include "../../../Common/include/omp_structure.hpp"
 
+namespace detail {
 
 /*!
  * \brief Compute the gradient of a field using the Green-Gauss theorem.
+ * \note Template nDim to allow efficient unrolling of inner loops.
  * \note Gradients can be computed only for a contiguous range of variables, defined
  *       by [varBegin, varEnd[ (e.g. 0,1 computes the gradient of the 1st variable).
  *       This can be used, for example, to compute only velocity gradients.
@@ -48,7 +50,7 @@
  * \param[in] varEnd - Index of last variable for which to compute the gradient.
  * \param[out] gradient - Generic object implementing operator (iPoint, iVar, iDim).
  */
-template<class FieldType, class GradientType>
+template<size_t nDim, class FieldType, class GradientType>
 void computeGradientsGreenGauss(CSolver* solver,
                                 MPI_QUANTITIES kindMpiComm,
                                 PERIODIC_QUANTITIES kindPeriodicComm,
@@ -59,14 +61,12 @@ void computeGradientsGreenGauss(CSolver* solver,
                                 size_t varEnd,
                                 GradientType& gradient)
 {
-  size_t nPointDomain = geometry.GetnPointDomain();
-  size_t nDim = geometry.GetnDim();
+  const size_t nPointDomain = geometry.GetnPointDomain();
 
 #ifdef HAVE_OMP
   constexpr size_t OMP_MAX_CHUNK = 512;
 
-  size_t chunkSize = computeStaticChunkSize(nPointDomain,
-                     omp_get_max_threads(), OMP_MAX_CHUNK);
+  const auto chunkSize = computeStaticChunkSize(nPointDomain, omp_get_max_threads(), OMP_MAX_CHUNK);
 #endif
 
   /*--- For each (non-halo) volume integrate over its faces (edges). ---*/
@@ -106,7 +106,7 @@ void computeGradientsGreenGauss(CSolver* solver,
       su2double dir = (iPoint == geometry.edges->GetNode(iEdge,0))? 1.0 : -1.0;
       su2double weight = dir * halfOnVol;
 
-      const su2double* area = geometry.edges->GetNormal(iEdge);
+      const auto area = geometry.edges->GetNormal(iEdge);
       AD::SetPreaccIn(area, nDim);
 
       for (size_t iVar = varBegin; iVar < varEnd; ++iVar)
@@ -150,7 +150,7 @@ void computeGradientsGreenGauss(CSolver* solver,
 
         su2double volume = nodes->GetVolume(iPoint) + nodes->GetPeriodicVolume(iPoint);
 
-        const su2double* area = geometry.vertex[iMarker][iVertex]->GetNormal();
+        const auto area = geometry.vertex[iMarker][iVertex]->GetNormal();
 
         for (size_t iVar = varBegin; iVar < varEnd; iVar++)
         {
@@ -181,3 +181,32 @@ void computeGradientsGreenGauss(CSolver* solver,
   solver->CompleteComms(&geometry, &config, kindMpiComm);
 
 }
+} // end namespace
+
+/*!
+ * \brief Runtime dispatch on geometry.GetnDim() to the 2D or 3D instantiation of detail::computeGradientsGreenGauss (arguments are forwarded unchanged).
+ */
+template<class FieldType, class GradientType>
+void computeGradientsGreenGauss(CSolver* solver,
+                                MPI_QUANTITIES kindMpiComm,
+                                PERIODIC_QUANTITIES kindPeriodicComm,
+                                CGeometry& geometry,
+                                const CConfig& config,
+                                const FieldType& field,
+                                size_t varBegin,
+                                size_t varEnd,
+                                GradientType& gradient) {
+  switch (geometry.GetnDim()) {
+  case 2:
+    detail::computeGradientsGreenGauss<2>(solver, kindMpiComm, kindPeriodicComm, geometry,
+                                          config, field, varBegin, varEnd, gradient);
+    break;
+  case 3:
+    detail::computeGradientsGreenGauss<3>(solver, kindMpiComm, kindPeriodicComm, geometry,
+                                          config, field, varBegin, varEnd, gradient);
+    break;
+  default: /*--- Reached for any value other than 2 or 3 (also smaller ones), hence the neutral wording. ---*/
+    SU2_MPI::Error("Unsupported number of dimensions to compute gradients (only 2D and 3D).", CURRENT_FUNCTION);
+    break;
+  }
+}
diff --git a/SU2_CFD/include/gradients/computeGradientsLeastSquares.hpp b/SU2_CFD/include/gradients/computeGradientsLeastSquares.hpp
index 6c118f9293c..1aeb01604d1 100644
--- a/SU2_CFD/include/gradients/computeGradientsLeastSquares.hpp
+++ b/SU2_CFD/include/gradients/computeGradientsLeastSquares.hpp
@@ -27,7 +27,123 @@
  */
 
 #include "../../../Common/include/omp_structure.hpp"
+#include "../../../Common/include/toolboxes/geometry_toolbox.hpp"
 
+namespace detail {
+
+/*!
+ * \brief Solve the least-squares problem for one point: gradient(iPoint,iVar,:) holds the rhs "c" on entry and S*c on exit.
+ * \note "nDim" (2 or 3) fixes array sizes and loop bounds at compile time. "periodic" selects the AD preaccumulation
+ *       scope: true = started and ended here around S, false = only ended here with the gradient as output.
+ */
+template<size_t nDim, bool periodic, class GradientType, class RMatrixType>
+FORCEINLINE void solveLeastSquares(size_t iPoint,
+                                   size_t varBegin,
+                                   size_t varEnd,
+                                   const RMatrixType& Rmatrix,
+                                   GradientType& gradient)
+{
+  const auto eps = pow(std::numeric_limits<passivedouble>::epsilon(),2);
+
+  /*--- Entries of upper triangular matrix R. ---*/
+
+  su2double r11 = Rmatrix(iPoint,0,0);
+  su2double r12 = Rmatrix(iPoint,0,1);
+  su2double r22 = Rmatrix(iPoint,1,1);
+  su2double r13 = 0.0, r23 = 0.0, r33 = 1.0;
+
+  if (periodic) {
+    AD::StartPreacc();
+    AD::SetPreaccIn(r11);
+    AD::SetPreaccIn(r12);
+    AD::SetPreaccIn(r22);
+  }
+  /*--- Arguments of sqrt are clamped from below by eps, which also keeps the divisors r11 and r22 positive. ---*/
+  r11 = sqrt(max(r11, eps));
+  r12 /= r11;
+  r22 = sqrt(max(r22 - r12*r12, eps));
+
+  if (nDim == 3) {
+    r13 = Rmatrix(iPoint,0,2);
+    r33 = Rmatrix(iPoint,2,2);
+    const auto r23_a = Rmatrix(iPoint,1,2);
+    const auto r23_b = Rmatrix(iPoint,2,1);
+
+    if (periodic) {
+      AD::SetPreaccIn(r13);
+      AD::SetPreaccIn(r23_a);
+      AD::SetPreaccIn(r23_b);
+      AD::SetPreaccIn(r33);
+    }
+
+    r13 /= r11;
+    r23 = r23_a/r22 - r23_b*r12/(r11*r22);
+    r33 = sqrt(max(r33 - r23*r23 - r13*r13, eps));
+  }
+
+  /*--- Compute determinant (squared), r33 stays 1 in 2D. Plain multiplication instead of pow(x,2). ---*/
+  const su2double detR = r11*r22*r33;
+  const su2double detR2 = detR*detR;
+
+  /*--- S matrix := inv(R)*transpose(inv(R)), symmetric so only the upper triangle is stored. ---*/
+
+  su2double Smatrix[nDim][nDim] = {{0.0}};
+
+  /*--- Detect singular matrix, in that case S stays zero and so does the resulting gradient. ---*/
+
+  if (detR2 > eps) {
+    if (nDim == 2) {
+      Smatrix[0][0] = (r12*r12+r22*r22)/detR2;
+      Smatrix[0][1] = -r11*r12/detR2;
+      Smatrix[1][1] = r11*r11/detR2;
+    }
+    else {
+      su2double z11 = r22*r33;
+      su2double z12 =-r12*r33;
+      su2double z13 = r12*r23-r13*r22;
+      su2double z22 = r11*r33;
+      su2double z23 =-r11*r23;
+      su2double z33 = r11*r22;
+
+      Smatrix[0][0] = (z11*z11+z12*z12+z13*z13)/detR2;
+      Smatrix[0][1] = (z12*z22+z13*z23)/detR2;
+      Smatrix[0][2] = (z13*z33)/detR2;
+      Smatrix[1][1] = (z22*z22+z23*z23)/detR2;
+      Smatrix[1][2] = (z23*z33)/detR2;
+      Smatrix[2][2] = (z33*z33)/detR2;
+    }
+  }
+
+  if (periodic) {
+    /*--- Stop preacc here as gradient is in/out. ---*/
+    for (size_t iDim = 0; iDim < nDim; ++iDim)
+      for (size_t jDim = iDim; jDim < nDim; ++jDim)
+        AD::SetPreaccOut(Smatrix[iDim][jDim]);
+    AD::EndPreacc();
+  }
+
+  /*--- Computation of the gradient: S*c ---*/
+
+  for (size_t iVar = varBegin; iVar < varEnd; ++iVar)
+  {
+    su2double Cvector[nDim] = {0.0};
+    /*--- The min/max indexing reads the stored upper triangle for both halves of the symmetric S. ---*/
+    for (size_t iDim = 0; iDim < nDim; ++iDim)
+      for (size_t jDim = 0; jDim < nDim; ++jDim)
+        Cvector[iDim] += Smatrix[min(iDim,jDim)][max(iDim,jDim)] * gradient(iPoint, iVar, jDim);
+
+    for (size_t iDim = 0; iDim < nDim; ++iDim)
+      gradient(iPoint, iVar, iDim) = Cvector[iDim];
+  }
+
+  if (!periodic) {
+    /*--- Stop preacc here instead as gradient is only out. ---*/
+    for (size_t iVar = varBegin; iVar < varEnd; ++iVar)
+      for (size_t iDim = 0; iDim < nDim; ++iDim)
+        AD::SetPreaccOut(gradient(iPoint, iVar, iDim));
+    AD::EndPreacc();
+  }
+}
 
 /*!
  * \brief Compute the gradient of a field using inverse-distance-weighted or
@@ -45,7 +161,7 @@
  * \param[out] gradient - Generic object implementing operator (iPoint, iVar, iDim).
  * \param[out] Rmatrix - Generic object implementing operator (iPoint, iDim, iDim).
  */
-template<class FieldType, class GradientType, class RMatrixType>
+template<size_t nDim, class FieldType, class GradientType, class RMatrixType>
 void computeGradientsLeastSquares(CSolver* solver,
                                   MPI_QUANTITIES kindMpiComm,
                                   PERIODIC_QUANTITIES kindPeriodicComm,
@@ -58,10 +174,9 @@ void computeGradientsLeastSquares(CSolver* solver,
                                   GradientType& gradient,
                                   RMatrixType& Rmatrix)
 {
-  constexpr size_t MAXNDIM = 3;
+  const bool periodic = (solver != nullptr) && (config.GetnMarker_Periodic() > 0);
 
-  size_t nPointDomain = geometry.GetnPointDomain();
-  size_t nDim = geometry.GetnDim();
+  const size_t nPointDomain = geometry.GetnPointDomain();
 
 #ifdef HAVE_OMP
   constexpr size_t OMP_MAX_CHUNK = 512;
@@ -76,7 +191,7 @@ void computeGradientsLeastSquares(CSolver* solver,
   for (size_t iPoint = 0; iPoint < nPointDomain; ++iPoint)
   {
     auto nodes = geometry.nodes;
-    const su2double* coord_i = nodes->GetCoord(iPoint);
+    const auto coord_i = nodes->GetCoord(iPoint);
 
     AD::StartPreacc();
     AD::SetPreaccIn(coord_i, nDim);
@@ -93,34 +208,24 @@ void computeGradientsLeastSquares(CSolver* solver,
     for (size_t iDim = 0; iDim < nDim; ++iDim)
       for (size_t jDim = 0; jDim < nDim; ++jDim)
         Rmatrix(iPoint, iDim, jDim) = 0.0;
-  
 
-    for (size_t iNeigh = 0; iNeigh < nodes->GetnPoint(iPoint); ++iNeigh)
-    {
-      size_t jPoint = nodes->GetPoint(iPoint,iNeigh);
 
-      const su2double* coord_j = geometry.nodes->GetCoord(jPoint);
+    for (auto jPoint : nodes->GetPoints(iPoint))
+    {
+      const auto coord_j = geometry.nodes->GetCoord(jPoint);
       AD::SetPreaccIn(coord_j, nDim);
 
 
       /*--- Distance vector from iPoint to jPoint ---*/
 
-      su2double dist_ij[MAXNDIM] = {0.0};
-
-      for (size_t iDim = 0; iDim < nDim; ++iDim)
-        dist_ij[iDim] = coord_j[iDim] - coord_i[iDim];
+      su2double dist_ij[nDim] = {0.0};
+      GeometryToolbox::Distance(nDim, coord_j, coord_i, dist_ij);
 
 
       /*--- Compute inverse weight, default 1 (unweighted). ---*/
 
       su2double weight = 1.0;
-
-      if (weighted)
-      {
-        weight = 0.0;
-        for (size_t iDim = 0; iDim < nDim; ++iDim)
-          weight += dist_ij[iDim] * dist_ij[iDim];
-      }
+      if(weighted) weight = GeometryToolbox::SquaredNorm(nDim, dist_ij);
 
       /*--- Sumations for entries of upper triangular matrix R. ---*/
 
@@ -128,17 +233,12 @@ void computeGradientsLeastSquares(CSolver* solver,
       {
         weight = 1.0 / weight;
 
-        Rmatrix(iPoint,0,0) += dist_ij[0]*dist_ij[0]*weight;
-        Rmatrix(iPoint,0,1) += dist_ij[0]*dist_ij[1]*weight;
-        Rmatrix(iPoint,1,1) += dist_ij[1]*dist_ij[1]*weight;
+        for (size_t iDim = 0; iDim < nDim; ++iDim)
+          for (size_t jDim = iDim; jDim < nDim; ++jDim)
+            Rmatrix(iPoint,iDim,jDim) += dist_ij[iDim]*dist_ij[jDim]*weight;
 
         if (nDim == 3)
-        {
-          Rmatrix(iPoint,0,2) += dist_ij[0]*dist_ij[2]*weight;
-          Rmatrix(iPoint,1,2) += dist_ij[1]*dist_ij[2]*weight;
           Rmatrix(iPoint,2,1) += dist_ij[0]*dist_ij[2]*weight;
-          Rmatrix(iPoint,2,2) += dist_ij[2]*dist_ij[2]*weight;
-        }
 
         /*--- Entries of c:= transpose(A)*b ---*/
 
@@ -154,146 +254,42 @@ void computeGradientsLeastSquares(CSolver* solver,
       }
     }
 
-    for (size_t iDim = 0; iDim < nDim; ++iDim)
-      for (size_t jDim = 0; jDim < nDim; ++jDim)
-        AD::SetPreaccOut(Rmatrix(iPoint, iDim, jDim));
+    if (periodic)
+    {
+      /*--- A second loop is required after periodic comms, checkpoint the preacc. ---*/
 
-    for (size_t iVar = varBegin; iVar < varEnd; ++iVar)
       for (size_t iDim = 0; iDim < nDim; ++iDim)
-        AD::SetPreaccOut(gradient(iPoint, iVar, iDim));
+        for (size_t jDim = 0; jDim < nDim; ++jDim)
+          AD::SetPreaccOut(Rmatrix(iPoint, iDim, jDim));
 
-    AD::EndPreacc();
+      for (size_t iVar = varBegin; iVar < varEnd; ++iVar)
+        for (size_t iDim = 0; iDim < nDim; ++iDim)
+          AD::SetPreaccOut(gradient(iPoint, iVar, iDim));
+
+      AD::EndPreacc();
+    }
+    else {
+      /*--- Periodic comms are not needed, solve the LS problem for iPoint. ---*/
+
+      solveLeastSquares<nDim, false>(iPoint, varBegin, varEnd, Rmatrix, gradient);
+    }
   }
 
   /*--- Correct the gradient values across any periodic boundaries. ---*/
 
-  if (solver != nullptr)
+  if (periodic)
   {
     for (size_t iPeriodic = 1; iPeriodic <= config.GetnMarker_Periodic()/2; ++iPeriodic)
     {
       solver->InitiatePeriodicComms(&geometry, &config, iPeriodic, kindPeriodicComm);
       solver->CompletePeriodicComms(&geometry, &config, iPeriodic, kindPeriodicComm);
     }
-  }
-
-
-  /*--- Second loop over points of the grid to compute final gradient. ---*/
-
-  SU2_OMP_FOR_DYN(chunkSize)
-  for (size_t iPoint = 0; iPoint < nPointDomain; ++iPoint)
-  {
-    /*--- Entries of upper triangular matrix R. ---*/
-
-    su2double r11 = Rmatrix(iPoint,0,0);
-    su2double r12 = Rmatrix(iPoint,0,1);
-    su2double r22 = Rmatrix(iPoint,1,1);
-    su2double r13 = 0.0, r23 = 0.0, r23_a = 0.0, r23_b = 0.0, r33 = 0.0;
-
-    AD::StartPreacc();
-    AD::SetPreaccIn(r11);
-    AD::SetPreaccIn(r12);
-    AD::SetPreaccIn(r22);
-
-    if (r11 >= 0.0) r11 = sqrt(r11);
-    if (r11 >= 0.0) r12 /= r11; else r12 = 0.0;
-    su2double tmp = r22-r12*r12;
-    if (tmp >= 0.0) r22 = sqrt(tmp); else r22 = 0.0;
-
-    if (nDim == 3) {
-      r13   = Rmatrix(iPoint,0,2);
-      r23_a = Rmatrix(iPoint,1,2);
-      r23_b = Rmatrix(iPoint,2,1);
-      r33   = Rmatrix(iPoint,2,2);
-
-      AD::SetPreaccIn(r13);
-      AD::SetPreaccIn(r23_a);
-      AD::SetPreaccIn(r23_b);
-      AD::SetPreaccIn(r33);
-
-      if (r11 >= 0.0) r13 /= r11; else r13 = 0.0;
-
-      if ((r22 >= 0.0) && (r11*r22 >= 0.0)) {
-        r23 = r23_a/r22 - r23_b*r12/(r11*r22);
-      } else {
-        r23 = 0.0;
-      }
-
-      tmp = r33 - r23*r23 - r13*r13;
-      if (tmp >= 0.0) r33 = sqrt(tmp); else r33 = 0.0;
-    }
-
-    /*--- Compute determinant ---*/
-
-    su2double detR2 = (r11*r22)*(r11*r22);
-    if (nDim == 3) detR2 *= r33*r33;
-
-    /*--- Detect singular matrices ---*/
-
-    bool singular = false;
-
-    if (detR2 <= EPS) {
-      detR2 = 1.0;
-      singular = true;
-    }
-
-    /*--- S matrix := inv(R)*traspose(inv(R)) ---*/
-
-    su2double Smatrix[MAXNDIM][MAXNDIM];
-
-    if (singular) {
-      for (size_t iDim = 0; iDim < nDim; ++iDim)
-        for (size_t jDim = 0; jDim < nDim; ++jDim)
-          Smatrix[iDim][jDim] = 0.0;
-    }
-    else {
-      if (nDim == 2) {
-        Smatrix[0][0] = (r12*r12+r22*r22)/detR2;
-        Smatrix[0][1] = -r11*r12/detR2;
-        Smatrix[1][0] = Smatrix[0][1];
-        Smatrix[1][1] = r11*r11/detR2;
-      }
-      else {
-        su2double z11 = r22*r33;
-        su2double z12 =-r12*r33;
-        su2double z13 = r12*r23-r13*r22;
-        su2double z22 = r11*r33;
-        su2double z23 =-r11*r23;
-        su2double z33 = r11*r22;
-
-        Smatrix[0][0] = (z11*z11+z12*z12+z13*z13)/detR2;
-        Smatrix[0][1] = (z12*z22+z13*z23)/detR2;
-        Smatrix[0][2] = (z13*z33)/detR2;
-        Smatrix[1][0] = Smatrix[0][1];
-        Smatrix[1][1] = (z22*z22+z23*z23)/detR2;
-        Smatrix[1][2] = (z23*z33)/detR2;
-        Smatrix[2][0] = Smatrix[0][2];
-        Smatrix[2][1] = Smatrix[1][2];
-        Smatrix[2][2] = (z33*z33)/detR2;
-      }
-    }
-
-    for (size_t iDim = 0; iDim < nDim; ++iDim)
-      for (size_t jDim = 0; jDim < nDim; ++jDim)
-        AD::SetPreaccOut(Smatrix[iDim][jDim]);
 
-    AD::EndPreacc();
-
-    /*--- Computation of the gradient: S*c ---*/
-
-    for (size_t iVar = varBegin; iVar < varEnd; ++iVar)
-    {
-      su2double Cvector[MAXNDIM];
-
-      for (size_t iDim = 0; iDim < nDim; ++iDim)
-      {
-        Cvector[iDim] = 0.0;
-        for (size_t jDim = 0; jDim < nDim; ++jDim)
-          Cvector[iDim] += Smatrix[iDim][jDim] * gradient(iPoint, iVar, jDim);
-      }
+    /*--- Second loop over points of the grid to compute final gradient. ---*/
 
-      for (size_t iDim = 0; iDim < nDim; ++iDim)
-        gradient(iPoint, iVar, iDim) = Cvector[iDim];
-    }
+    SU2_OMP_FOR_DYN(chunkSize)
+    for (size_t iPoint = 0; iPoint < nPointDomain; ++iPoint)
+      solveLeastSquares<nDim, true>(iPoint, varBegin, varEnd, Rmatrix, gradient);
   }
 
   /*--- If no solver was provided we do not communicate ---*/
@@ -307,3 +303,34 @@ void computeGradientsLeastSquares(CSolver* solver,
   }
 
 }
+} // end namespace
+
+/*!
+ * \brief Runtime dispatch on geometry.GetnDim() to the 2D or 3D instantiation of detail::computeGradientsLeastSquares (arguments are forwarded unchanged).
+ */
+template<class FieldType, class GradientType, class RMatrixType>
+void computeGradientsLeastSquares(CSolver* solver,
+                                  MPI_QUANTITIES kindMpiComm,
+                                  PERIODIC_QUANTITIES kindPeriodicComm,
+                                  CGeometry& geometry,
+                                  const CConfig& config,
+                                  bool weighted,
+                                  const FieldType& field,
+                                  size_t varBegin,
+                                  size_t varEnd,
+                                  GradientType& gradient,
+                                  RMatrixType& Rmatrix) {
+  switch (geometry.GetnDim()) {
+  case 2:
+    detail::computeGradientsLeastSquares<2>(solver, kindMpiComm, kindPeriodicComm, geometry, config,
+                                            weighted, field, varBegin, varEnd, gradient, Rmatrix);
+    break;
+  case 3:
+    detail::computeGradientsLeastSquares<3>(solver, kindMpiComm, kindPeriodicComm, geometry, config,
+                                            weighted, field, varBegin, varEnd, gradient, Rmatrix);
+    break;
+  default: /*--- Reached for any value other than 2 or 3 (also smaller ones), hence the neutral wording. ---*/
+    SU2_MPI::Error("Unsupported number of dimensions to compute gradients (only 2D and 3D).", CURRENT_FUNCTION);
+    break;
+  }
+}
diff --git a/SU2_CFD/include/limiters/computeLimiters.hpp b/SU2_CFD/include/limiters/computeLimiters.hpp
index 6d4a2df7ad2..9ecf6ebfbc0 100644
--- a/SU2_CFD/include/limiters/computeLimiters.hpp
+++ b/SU2_CFD/include/limiters/computeLimiters.hpp
@@ -50,11 +50,17 @@ void computeLimiters(ENUM_LIMITER LimiterKind,
                      FieldType& fieldMax,
                      FieldType& limiter)
 {
-#define INSTANTIATE(KIND) \
-computeLimiters_impl<FieldType, GradientType, KIND>(solver, kindMpiComm, \
-  kindPeriodicComm1, kindPeriodicComm2, geometry, config, varBegin, \
-  varEnd, field, gradient, fieldMin, fieldMax, limiter)
+  if (geometry.GetnDim() != 2 && geometry.GetnDim() != 3)
+    SU2_MPI::Error("Too many dimensions to compute limiters.", CURRENT_FUNCTION);
 
+#define INSTANTIATE(KIND)\
+if (geometry.GetnDim() == 2) {\
+  computeLimiters_impl<2,KIND>(solver, kindMpiComm, kindPeriodicComm1, kindPeriodicComm2, geometry,\
+                               config, varBegin, varEnd, field, gradient, fieldMin, fieldMax, limiter);\
+} else {\
+  computeLimiters_impl<3,KIND>(solver, kindMpiComm, kindPeriodicComm1, kindPeriodicComm2, geometry,\
+                               config, varBegin, varEnd, field, gradient, fieldMin, fieldMax, limiter);\
+}
   switch (LimiterKind) {
     case NO_LIMITER:
     {
diff --git a/SU2_CFD/include/limiters/computeLimiters_impl.hpp b/SU2_CFD/include/limiters/computeLimiters_impl.hpp
index 0ebf054bf29..84ae551478e 100644
--- a/SU2_CFD/include/limiters/computeLimiters_impl.hpp
+++ b/SU2_CFD/include/limiters/computeLimiters_impl.hpp
@@ -55,11 +55,12 @@
  * \param[out] limiter - Reconstruction limiter for the field.
  *
  * Template parameters:
- * \param FieldType - Generic object with operator (iPoint,iVar)
- * \param GradientType - Generic object with operator (iPoint,iVar,iDim)
+ * \param nDim - Number of dimensions.
  * \param LimiterKind - Used to instantiate the right details class.
+ * \param FieldType - Generic object with operator (iPoint,iVar).
+ * \param GradientType - Generic object with operator (iPoint,iVar,iDim).
  */
-template<class FieldType, class GradientType, ENUM_LIMITER LimiterKind>
+template<size_t nDim, ENUM_LIMITER LimiterKind, class FieldType, class GradientType>
 void computeLimiters_impl(CSolver* solver,
                           MPI_QUANTITIES kindMpiComm,
                           PERIODIC_QUANTITIES kindPeriodicComm1,
@@ -74,28 +75,25 @@ void computeLimiters_impl(CSolver* solver,
                           FieldType& fieldMax,
                           FieldType& limiter)
 {
-  constexpr size_t MAXNDIM = 3;
-  constexpr size_t MAXNVAR = 30;
+  constexpr size_t MAXNVAR = 32;
 
   if (varEnd > MAXNVAR)
     SU2_MPI::Error("Number of variables is too large, increase MAXNVAR.", CURRENT_FUNCTION);
 
-  size_t nPointDomain = geometry.GetnPointDomain();
-  size_t nPoint = geometry.GetnPoint();
-  size_t nDim = geometry.GetnDim();
+  const size_t nPointDomain = geometry.GetnPointDomain();
+  const size_t nPoint = geometry.GetnPoint();
 
   /*--- If we do not have periodicity we can use a
    *    more efficient access pattern to memory. ---*/
 
-  bool periodic = (solver != nullptr) &&
-                  (kindPeriodicComm1 != PERIODIC_NONE) &&
-                  (config.GetnMarker_Periodic() > 0);
+  const bool periodic = (solver != nullptr) &&
+                        (kindPeriodicComm1 != PERIODIC_NONE) &&
+                        (config.GetnMarker_Periodic() > 0);
 
 #ifdef HAVE_OMP
   constexpr size_t OMP_MAX_CHUNK = 512;
 
-  size_t chunkSize = computeStaticChunkSize(nPointDomain,
-                     omp_get_max_threads(), OMP_MAX_CHUNK);
+  const auto chunkSize = computeStaticChunkSize(nPointDomain, omp_get_max_threads(), OMP_MAX_CHUNK);
 #endif
 
   /*--- If limiters are frozen do not record the computation ---*/
@@ -131,7 +129,7 @@ void computeLimiters_impl(CSolver* solver,
   for (size_t iPoint = 0; iPoint < nPointDomain; ++iPoint)
   {
     auto nodes = geometry.nodes;
-    const su2double* coord_i = nodes->GetCoord(iPoint);
+    const auto coord_i = nodes->GetCoord(iPoint);
 
     AD::StartPreacc();
     AD::SetPreaccIn(coord_i, nDim);
@@ -164,16 +162,14 @@ void computeLimiters_impl(CSolver* solver,
 
     /*--- Compute max/min projection and values over direct neighbors. ---*/
 
-    for(size_t iNeigh = 0; iNeigh < nodes->GetnPoint(iPoint); ++iNeigh)
-    {
-      size_t jPoint = nodes->GetPoint(iPoint,iNeigh);
+    for (auto jPoint : geometry.nodes->GetPoints(iPoint)) {
 
-      const su2double* coord_j = geometry.nodes->GetCoord(jPoint);
+      const auto coord_j = geometry.nodes->GetCoord(jPoint);
       AD::SetPreaccIn(coord_j, nDim);
 
       /*--- Distance vector from iPoint to face (middle of the edge). ---*/
 
-      su2double dist_ij[MAXNDIM] = {0.0};
+      su2double dist_ij[nDim] = {0.0};
 
       for(size_t iDim = 0; iDim < nDim; ++iDim)
         dist_ij[iDim] = 0.5 * (coord_j[iDim] - coord_i[iDim]);
diff --git a/SU2_CFD/include/numerics/CNumerics.hpp b/SU2_CFD/include/numerics/CNumerics.hpp
index ceb48087a7d..0b7eb53a311 100644
--- a/SU2_CFD/include/numerics/CNumerics.hpp
+++ b/SU2_CFD/include/numerics/CNumerics.hpp
@@ -253,6 +253,8 @@ class CNumerics {
      * allows discarding the Jacobians when they are not needed.
      */
     operator Vector_t() { return residual; }
+
+    su2double operator[] (unsigned long idx) const { return residual[idx]; } /*!< \brief Read-only access (by value) to entry idx of the residual. */
   };
 
   /*!
diff --git a/SU2_CFD/include/numerics_simd/CNumericsSIMD.cpp b/SU2_CFD/include/numerics_simd/CNumericsSIMD.cpp
new file mode 100644
index 00000000000..4fc1cdb47e5
--- /dev/null
+++ b/SU2_CFD/include/numerics_simd/CNumericsSIMD.cpp
@@ -0,0 +1,90 @@
+﻿/*!
+ * \file CNumericsSIMD.cpp
+ * \brief Vectorized (SIMD) numerics classes.
+ * \note This should be the only cpp for this family of classes
+ * (which are all templates). All compilation takes place here.
+ * \author P. Gomes
+ * \version 7.0.6 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "CNumericsSIMD.hpp"
+#include "flow/convection/roe.hpp"
+#include "flow/convection/centered.hpp"
+#include "flow/diffusion/viscous_fluxes.hpp"
+
+/*!
+ * \brief Generic factory implementation: allocates (with new) the convective scheme selected in the config, using ViscousDecorator as its template argument. Returns nullptr when no case below matches; the returned raw pointer is not stored here, so the caller must delete it.
+ */
+template<class ViscousDecorator>
+CNumericsSIMD* createNumerics(const CConfig& config, int iMesh) {
+  CNumericsSIMD* obj = nullptr; /*--- Stays null if the configured scheme has no case below. ---*/
+  switch (config.GetKind_ConvNumScheme_Flow()) {
+    case SPACE_UPWIND:
+      switch (config.GetKind_Upwind_Flow()) {
+        case ROE:
+          obj = new CRoeScheme<ViscousDecorator>(config, iMesh);
+          break;
+      }
+      break;
+
+    case SPACE_CENTERED:
+      switch ((iMesh==MESH_0)? config.GetKind_Centered_Flow() : LAX) { /*--- Levels other than MESH_0 always use LAX. ---*/
+        case NO_CENTERED:
+          break;
+        case LAX:
+          obj = new CLaxScheme<ViscousDecorator>(config, iMesh);
+          break;
+        case JST:
+          obj = new CJSTScheme<ViscousDecorator>(config, iMesh);
+          break;
+        case JST_KE:
+          obj = new CJSTkeScheme<ViscousDecorator>(config, iMesh);
+          break;
+        case JST_MAT:
+          obj = new CJSTmatScheme<ViscousDecorator>(config, iMesh);
+          break;
+      }
+      break;
+  }
+  return obj;
+}
+
+/*!
+ * \brief This function instantiates both 2D and 3D versions of the implementation in
+ * createNumerics, which in turn instantiates the class templates of the different
+ * numerical methods. The viscous decorator is chosen from config.GetViscous(); the result is nullptr when nDim is neither 2 nor 3, or when createNumerics finds no matching scheme.
+ */
+CNumericsSIMD* CNumericsSIMD::CreateNumerics(const CConfig& config, int nDim, int iMesh) {
+  if ((Double::Size < 4) && (SU2_MPI::GetRank() == MASTER_NODE)) { /*--- Warn (master rank only) when the SIMD array size is below 4. ---*/
+    cout << "WARNING: SU2 was not compiled for an AVX-capable architecture." << endl;
+  }
+  CNumericsSIMD* obj = nullptr; /*--- Remains null for unsupported nDim. ---*/
+  if (config.GetViscous()) {
+    if (nDim == 2) obj = createNumerics<CCompressibleViscousFlux<2> >(config, iMesh);
+    if (nDim == 3) obj = createNumerics<CCompressibleViscousFlux<3> >(config, iMesh);
+  } else {
+    if (nDim == 2) obj = createNumerics<CNoViscousFlux<2> >(config, iMesh);
+    if (nDim == 3) obj = createNumerics<CNoViscousFlux<3> >(config, iMesh);
+  }
+  return obj;
+}
diff --git a/SU2_CFD/include/numerics_simd/CNumericsSIMD.hpp b/SU2_CFD/include/numerics_simd/CNumericsSIMD.hpp
new file mode 100644
index 00000000000..90343b6f540
--- /dev/null
+++ b/SU2_CFD/include/numerics_simd/CNumericsSIMD.hpp
@@ -0,0 +1,99 @@
+﻿/*!
+ * \file CNumericsSIMD.hpp
+ * \brief Vectorized (SIMD) numerics classes.
+ * \author P. Gomes
+ * \version 7.0.6 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../../../Common/include/CConfig.hpp"
+#include "../../../Common/include/parallelization/vectorization.hpp"
+
+/*!
+ * \enum UpdateType
+ * \brief Ways to update vectors and system matrices.
+ * COLORING is the typical i/j update, whereas for REDUCTION
+ * the fluxes are stored and the matrix diagonal is not modified.
+ */
+enum class UpdateType {COLORING, REDUCTION};
+
+/*!
+ * \brief Define Double and Int SIMD types.
+ */
+using Double = simd::Array<su2double>;
+using Int = simd::Array<unsigned long, Double::Size>;
+
+/*--- Forward declare a few classes used in name only by the interface. ---*/
+template<class T> class CSysVector;
+template<class T> class CSysMatrix;
+class CConfig;
+class CGeometry;
+class CVariable;
+
+#ifdef CODI_FORWARD_TYPE
+using SparseMatrixType = CSysMatrix<su2double>;
+#else
+using SparseMatrixType = CSysMatrix<su2mixedfloat>;
+#endif
+
+/*!
+ * \class CNumericsSIMD
+ * \brief Abstract base class that defines the interface of the vectorized numerics (a pure virtual ComputeFlux and a static factory).
+ * \note See CNumericsEmptyDecorator.
+ */
+class CNumericsSIMD {
+public:
+  /*!
+   * \brief Interface for edge flux computation.
+   * \param[in] iEdge - SIMD array with the indices of the edges for flux computation.
+   * \param[in] config - Problem definitions.
+   * \param[in] geometry - Problem geometry.
+   * \param[in] solution - Solution variables.
+   * \param[in] updateType - Type of update done on vector and matrix.
+   * \param[in] updateMask - SIMD array of 1's and 0's, the latter prevent the update.
+   * \param[in,out] vector - Target for the fluxes.
+   * \param[in,out] matrix - Target for the flux Jacobians.
+   * \note The update mask is used to handle "remainder" edges (nEdge mod simdSize).
+   */
+  virtual void ComputeFlux(Int iEdge,
+                           const CConfig& config,
+                           const CGeometry& geometry,
+                           const CVariable& solution,
+                           UpdateType updateType,
+                           Double updateMask,
+                           CSysVector<su2double>& vector,
+                           SparseMatrixType& matrix) const = 0;
+
+  /*! \brief Virtual destructor, so that derived schemes can be deleted through a base pointer. */
+  virtual ~CNumericsSIMD(void) = default;
+
+  /*!
+   * \brief Factory method, returns a pointer to a newly allocated scheme (may be null if no scheme matches, check before use).
+   * \param[in] config - Problem definitions.
+   * \param[in] nDim - 2D or 3D.
+   * \param[in] iMesh - Grid index.
+   */
+  static CNumericsSIMD* CreateNumerics(const CConfig& config, int nDim, int iMesh);
+
+};
diff --git a/SU2_CFD/include/numerics_simd/flow/convection/centered.hpp b/SU2_CFD/include/numerics_simd/flow/convection/centered.hpp
new file mode 100644
index 00000000000..32743685c3e
--- /dev/null
+++ b/SU2_CFD/include/numerics_simd/flow/convection/centered.hpp
@@ -0,0 +1,527 @@
+﻿/*!
+ * \file centered.hpp
+ * \brief Centered convective schemes.
+ * \author P. Gomes, F. Palacios, T. Economon
+ * \version 7.0.6 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../../CNumericsSIMD.hpp"
+#include "../../util.hpp"
+#include "../variables.hpp"
+#include "common.hpp"
+#include "../../../variables/CEulerVariable.hpp"
+#include "../../../../../Common/include/geometry/CGeometry.hpp"
+
+/*!
+ * \class CCenteredBase
+ * \brief Base class for Centered schemes, derived classes implement
+ * the dissipation term in a const "finalizeFlux" method.
+ * \note See CRoeBase for the role of Base. "Derived" is the concrete scheme, reached through a static_cast of "this" in ComputeFlux.
+ */
+template<class Derived, class Base>
+class CCenteredBase : public Base {
+protected:
+  using Base::nDim;
+  static constexpr size_t nVar = CCompressibleConservatives<nDim>::nVar;
+  static constexpr size_t nPrimVar = Max(Base::nPrimVar, nDim+5); /*!< \brief At least nDim+5 primitives are gathered, more if Base asks for them. */
+
+  const su2double gamma; /*!< \brief Value of config.GetGamma(). */
+  const su2double fixFactor; /*!< \brief Value of config.GetCent_Jac_Fix_Factor(), scales the dissipation Jacobians in the derived schemes. */
+  const bool dynamicGrid; /*!< \brief Value of config.GetDynamic_Grid(), enables the grid velocity terms. */
+  const su2double stretchParam = 0.3; /*!< \brief Constant handed to correctedSpectralRadius by the derived schemes that call it. */
+
+  /*!
+   * \brief Constructor, store some constants and forward args to base.
+   */
+  template<class... Ts>
+  CCenteredBase(const CConfig& config, Ts&... args) : Base(config, args...),
+    gamma(config.GetGamma()),
+    fixFactor(config.GetCent_Jac_Fix_Factor()),
+    dynamicGrid(config.GetDynamic_Grid()) {
+  }
+
+  /*!
+   * \brief Special treatment needed to fetch integer data: the neighbor counts are read one entry at a time and returned as a Double.
+   */
+  template<class T, size_t N>
+  FORCEINLINE static Double numNeighbor(simd::Array<T,N> idx, const CGeometry& geometry) {
+    Double n;
+    for (size_t k=0; k<N; ++k) n[k] = geometry.nodes->GetnNeighbor(idx[k]);
+    return n;
+  }
+  FORCEINLINE static Double numNeighbor(unsigned long idx, const CGeometry& geometry) { /*--- Overload for a single (non-SIMD) point index. ---*/
+    return geometry.nodes->GetnNeighbor(idx);
+  }
+
+public:
+  /*!
+   * \brief Implementation of the base centered flux (flux of the averaged state), the dissipation is added by Derived::finalizeFlux.
+   */
+  void ComputeFlux(Int iEdge,
+                   const CConfig& config,
+                   const CGeometry& geometry,
+                   const CVariable& solution_,
+                   UpdateType updateType,
+                   Double updateMask,
+                   CSysVector<su2double>& vector,
+                   SparseMatrixType& matrix) const final {
+
+    /*--- Start preaccumulation, inputs are registered
+     *    automatically in "gatherVariables". ---*/
+    AD::StartPreacc();
+
+    const bool implicit = (config.GetKind_TimeIntScheme() == EULER_IMPLICIT);
+    const auto& solution = static_cast<const CEulerVariable&>(solution_); /*--- NOTE(review): assumes solution_ really is a CEulerVariable (unchecked cast). ---*/
+
+    const auto iPoint = geometry.edges->GetNode(iEdge,0);
+    const auto jPoint = geometry.edges->GetNode(iEdge,1);
+
+    /*--- Geometric properties. ---*/
+
+    const auto normal = gatherVariables<nDim>(iEdge, geometry.edges->GetNormal());
+    const auto area = norm(normal);
+    VectorDbl<nDim> unitNormal;
+    for (size_t iDim = 0; iDim < nDim; ++iDim) {
+      unitNormal(iDim) = normal(iDim) / area;
+    }
+
+    /*--- Primitive variables. ---*/
+
+    CPair<CCompressiblePrimitives<nDim,nPrimVar> > V;
+    V.i.all = gatherVariables<nPrimVar>(iPoint, solution.GetPrimitive());
+    V.j.all = gatherVariables<nPrimVar>(jPoint, solution.GetPrimitive());
+
+    CCompressiblePrimitives<nDim,nPrimVar> avgV; /*--- Arithmetic average of the i and j primitives. ---*/
+    for (size_t iVar = 0; iVar < nPrimVar; ++iVar) {
+      avgV.all(iVar) = 0.5 * (V.i.all(iVar) + V.j.all(iVar));
+    }
+
+    /*--- Compute conservative variables. ---*/
+
+    CPair<CCompressibleConservatives<nDim> > U;
+    U.i = compressibleConservatives(V.i);
+    U.j = compressibleConservatives(V.j);
+
+    auto avgU = compressibleConservatives(avgV);
+
+    VectorDbl<nVar> diffU; /*--- Difference i minus j, used by the dissipation terms. ---*/
+    for (size_t iVar = 0; iVar < nVar-1; ++iVar) {
+      diffU(iVar) = U.i.all(iVar) - U.j.all(iVar);
+    }
+    diffU(nVar-1) = V.i.density()*V.i.enthalpy() - V.j.density()*V.j.enthalpy(); /*--- Last entry uses density*enthalpy instead of the last conservative. ---*/
+
+    /*--- Inviscid fluxes and Jacobians. ---*/
+
+    auto flux = inviscidProjFlux(avgV, avgU, normal);
+
+    MatrixDbl<nVar> jac_i, jac_j; /*--- Only assigned when implicit. ---*/
+    if (implicit) {
+      jac_i = inviscidProjJac(gamma, V.i.velocity(), U.i.energy(), normal, 0.5); /*--- Presumably 0.5 is a scaling for the i/j average, confirm in inviscidProjJac. ---*/
+      jac_j = inviscidProjJac(gamma, V.j.velocity(), U.j.energy(), normal, 0.5);
+    }
+
+    /*--- Grid motion. ---*/
+
+    Double projGridVel = 0.0;
+    if (dynamicGrid) {
+      const auto& gridVel = geometry.nodes->GetGridVel();
+      projGridVel = 0.5*(dot(gatherVariables<nDim>(iPoint,gridVel), normal)+
+                         dot(gatherVariables<nDim>(jPoint,gridVel), normal));
+
+      for (size_t iVar = 0; iVar < nVar; ++iVar) {
+        flux(iVar) -= projGridVel * avgU.all(iVar);
+        if (implicit) {
+          jac_i(iVar,iVar) -= 0.5 * projGridVel;
+          jac_j(iVar,iVar) -= 0.5 * projGridVel;
+        }
+      }
+    }
+
+    const Double projVel = dot(avgV.velocity(), normal) - projGridVel; /*--- Projected velocity relative to the grid (normal is not normalized). ---*/
+
+    /*--- Finalize in derived class (static polymorphism). ---*/
+
+    const auto derived = static_cast<const Derived*>(this);
+
+    derived->finalizeFlux(flux, jac_i, jac_j, implicit, area, projVel, avgV, V,
+                          diffU, iPoint, jPoint, geometry, solution, unitNormal);
+
+    /*--- Add the contributions from the base class (static decorator). ---*/
+
+    Base::viscousTerms(iEdge, iPoint, jPoint, avgV, V, solution_, geometry,
+                       config, area, unitNormal, implicit, flux, jac_i, jac_j);
+
+    /*--- Stop preaccumulation. ---*/
+
+    AD::SetPreaccOut(flux, nVar, Double::Size);
+    AD::EndPreacc();
+
+    /*--- Update the vector and system matrix. ---*/
+
+    updateLinearSystem(iEdge, iPoint, jPoint, implicit, updateType,
+                       updateMask, flux, jac_i, jac_j, vector, matrix);
+  }
+};
+
+/*!
+ * \class CJSTScheme
+ * \brief Classical JST scheme with scalar dissipation (2nd and 4th order terms, coefficients from GetKappa_2nd_Flow / GetKappa_4th_Flow).
+ */
+template<class Decorator>
+class CJSTScheme : public CCenteredBase<CJSTScheme<Decorator>,Decorator> {
+private:
+  using Base = CCenteredBase<CJSTScheme<Decorator>,Decorator>;
+  using Base::nDim;
+  using Base::nVar;
+  using Base::gamma;
+  using Base::fixFactor;
+  using Base::stretchParam;
+  const su2double kappa2; /*!< \brief Value of config.GetKappa_2nd_Flow(). */
+  const su2double kappa4; /*!< \brief Value of config.GetKappa_4th_Flow(). */
+
+public:
+  /*!
+   * \brief Constructor, forward everything to base and store the dissipation coefficients.
+   */
+  template<class... Ts>
+  CJSTScheme(const CConfig& config, Ts&... args) : Base(config, args...),
+    kappa2(config.GetKappa_2nd_Flow()),
+    kappa4(config.GetKappa_4th_Flow()) {
+  }
+
+  /*!
+   * \brief Updates flux and Jacobians with JST dissipation.
+   * \note "Ts" is here just in case other schemes in the family need extra args (here it absorbs the trailing unitNormal passed by the base).
+   */
+  template<class PrimVarType, class... Ts>
+  FORCEINLINE void finalizeFlux(VectorDbl<nVar>& flux,
+                                MatrixDbl<nVar>& jac_i,
+                                MatrixDbl<nVar>& jac_j,
+                                bool implicit,
+                                Double area,
+                                Double projVel,
+                                const PrimVarType& avgV,
+                                const CPair<PrimVarType>& V,
+                                const VectorDbl<nVar>& diffU,
+                                Int iPoint,
+                                Int jPoint,
+                                const CGeometry& geometry,
+                                const CEulerVariable& solution,
+                                Ts&...) const {
+
+    Double lambda = abs(projVel) + avgV.speedSound()*area; /*--- |projected velocity| + speed of sound * area. ---*/
+    lambda = correctedSpectralRadius(iPoint, jPoint, lambda, stretchParam, solution);
+
+    /*--- Compute dissipation coefficients. ---*/
+
+    const auto ni = Base::numNeighbor(iPoint, geometry);
+    const auto nj = Base::numNeighbor(jPoint, geometry);
+    const Double sc2 = 3 * (ni+nj) / (ni*nj);
+    const Double sc4 = 0.25*pow(sc2, 2);
+
+    const auto si = gatherVariables(iPoint, solution.GetSensor());
+    const auto sj = gatherVariables(jPoint, solution.GetSensor());
+    const Double eps2 = kappa2 * 0.5*(si+sj) * sc2;
+    const Double eps4 = max(0.0, kappa4-eps2) * sc4; /*--- The 4th order term vanishes where eps2 exceeds kappa4. ---*/
+
+    /*--- Update flux and Jacobians with dissipation terms. ---*/
+
+    const auto lapl_i = gatherVariables<nVar>(iPoint, solution.GetUndivided_Laplacian());
+    const auto lapl_j = gatherVariables<nVar>(jPoint, solution.GetUndivided_Laplacian());
+
+    for (size_t iVar = 0; iVar < nVar; ++iVar) {
+      flux(iVar) += (eps2*diffU(iVar) - eps4*(lapl_i(iVar)-lapl_j(iVar))) * lambda;
+    }
+
+    if (implicit) {
+      const Double dissip_i = fixFactor * (eps2 + eps4*(ni+1)) * lambda;
+      const Double dissip_j = -fixFactor * (eps2 + eps4*(nj+1)) * lambda; /*--- Opposite sign for the j side. ---*/
+      scalarDissipationJacobian(V.i, gamma, dissip_i, jac_i);
+      scalarDissipationJacobian(V.j, gamma, dissip_j, jac_j);
+    }
+  }
+};
+
+/*!
+ * \class CJSTmatScheme
+ * \brief JST scheme with matrix dissipation (the scalar JST dissipation vector is scaled by P x |Lambda| x P^-1).
+ */
+template<class Decorator>
+class CJSTmatScheme : public CCenteredBase<CJSTmatScheme<Decorator>,Decorator> {
+private:
+  using Base = CCenteredBase<CJSTmatScheme<Decorator>,Decorator>;
+  using Base::nDim;
+  using Base::nVar;
+  using Base::gamma;
+  using Base::fixFactor;
+  const su2double kappa2; /*!< \brief Value of config.GetKappa_2nd_Flow(). */
+  const su2double kappa4; /*!< \brief Value of config.GetKappa_4th_Flow(). */
+  const su2double entropyFix; /*!< \brief Value of config.GetEntropyFix_Coeff(), lower bound for the eigenvalues relative to the largest one. */
+
+public:
+  /*!
+   * \brief Constructor, forward everything to base and store the scheme coefficients.
+   */
+  template<class... Ts>
+  CJSTmatScheme(const CConfig& config, Ts&... args) : Base(config, args...),
+    kappa2(config.GetKappa_2nd_Flow()),
+    kappa4(config.GetKappa_4th_Flow()),
+    entropyFix(config.GetEntropyFix_Coeff()) {
+  }
+
+  /*!
+   * \brief Updates flux and Jacobians with the matrix-scaled JST dissipation.
+   * \note "Ts" is here just in case other schemes in the family need extra args.
+   */
+  template<class PrimVarType, class... Ts>
+  FORCEINLINE void finalizeFlux(VectorDbl<nVar>& flux,
+                                MatrixDbl<nVar>& jac_i,
+                                MatrixDbl<nVar>& jac_j,
+                                bool implicit,
+                                Double area,
+                                Double projVel,
+                                const PrimVarType& avgV,
+                                const CPair<PrimVarType>& V,
+                                const VectorDbl<nVar>& diffU,
+                                Int iPoint,
+                                Int jPoint,
+                                const CGeometry& geometry,
+                                const CEulerVariable& solution,
+                                const VectorDbl<nDim>& unitNormal,
+                                Ts&...) const {
+
+    /*--- Compute scalar dissipation. ---*/
+
+    const auto ni = Base::numNeighbor(iPoint, geometry);
+    const auto nj = Base::numNeighbor(jPoint, geometry);
+    const Double sc2 = 3 * (ni+nj) / (ni*nj);
+    const Double sc4 = 0.25*pow(sc2, 2);
+
+    const auto si = gatherVariables(iPoint, solution.GetSensor());
+    const auto sj = gatherVariables(jPoint, solution.GetSensor());
+    const Double eps2 = kappa2 * 0.5*(si+sj) * sc2;
+    const Double eps4 = max(0.0, kappa4-eps2) * sc4;
+
+    const auto lapl_i = gatherVariables<nVar>(iPoint, solution.GetUndivided_Laplacian());
+    const auto lapl_j = gatherVariables<nVar>(jPoint, solution.GetUndivided_Laplacian());
+
+    VectorDbl<nVar> scalarDissip; /*--- Same dissipation vector as CJSTScheme, before the spectral scaling. ---*/
+    for (size_t iVar = 0; iVar < nVar; ++iVar) {
+      scalarDissip(iVar) = eps2*diffU(iVar) - eps4*(lapl_i(iVar)-lapl_j(iVar));
+    }
+
+    MatrixDbl<nVar> scalarJac; /*--- Only assigned and used when implicit. ---*/
+    if (implicit) {
+      scalarJac = Double(0.0); /*--- Presumably scalarDissipationJacobian accumulates into its last argument, hence the zeroing (confirm). ---*/
+      Double factor = fixFactor * (eps2 + 0.5*eps4*(ni+nj+2));
+      scalarDissipationJacobian(avgV, gamma, factor, scalarJac);
+    }
+
+    /*--- Compute matrix dissipation terms. ---*/
+
+    const auto unitProjVel = dot(avgV.velocity(), unitNormal);
+
+    auto pMat = pMatrix(gamma, avgV.density(), avgV.velocity(),
+                        unitProjVel, avgV.speedSound(), unitNormal);
+
+    auto pMatInv = pMatrixInv(gamma, avgV.density(), avgV.velocity(),
+                              unitProjVel, avgV.speedSound(), unitNormal);
+
+    /*--- Compute limited absolute eigenvalues (times area). ---*/
+
+    VectorDbl<nVar> lambda;
+    for (size_t iDim = 0; iDim < nDim; ++iDim) {
+      lambda(iDim) = projVel;
+    }
+    lambda(nDim) = projVel + avgV.speedSound()*area;
+    lambda(nDim+1) = projVel - avgV.speedSound()*area;
+
+    const Double maxLambda = max(lambda(nDim), -lambda(nDim+1)); /*--- Equals |projVel| + speed of sound * area when the latter is non-negative. ---*/
+
+    for (size_t iVar = 0; iVar < nVar; ++iVar) {
+      lambda(iVar) = max(abs(lambda(iVar)), entropyFix*maxLambda);
+    }
+
+    /*--- Update flux and Jacobians with scaled dissipation terms. ---*/
+
+    for (size_t iVar = 0; iVar < nVar; ++iVar) {
+      for (size_t jVar = 0; jVar < nVar; ++jVar) {
+        /*--- Matrix scaling, P x |S * Lambda| x P^-1. ---*/
+
+        Double scale = 0.0; /*--- Entry (iVar,jVar) of the scaling matrix. ---*/
+        for (size_t kVar = 0; kVar < nVar; ++kVar) {
+          scale += pMat(iVar,kVar) * lambda(kVar) * pMatInv(kVar,jVar);
+        }
+
+        /*--- Update flux and Jacobians. ---*/
+
+        flux(iVar) += scale * scalarDissip(jVar);
+
+        if (implicit) {
+          for (size_t kVar = 0; kVar < nVar; ++kVar) {
+            jac_i(iVar,kVar) += scale * scalarJac(jVar,kVar);
+            jac_j(iVar,kVar) -= scale * scalarJac(jVar,kVar); /*--- Opposite sign for the j side. ---*/
+          }
+        }
+      }
+    }
+  }
+};
+
+/*!
+ * \class CJSTkeScheme
+ * \brief JST scheme without 4th order dissipation (only the sensor-scaled 2nd order term, coefficient from GetKappa_2nd_Flow).
+ */
+template<class Decorator>
+class CJSTkeScheme : public CCenteredBase<CJSTkeScheme<Decorator>,Decorator> {
+private:
+  using Base = CCenteredBase<CJSTkeScheme<Decorator>,Decorator>;
+  using Base::nDim;
+  using Base::nVar;
+  using Base::gamma;
+  using Base::fixFactor;
+  using Base::stretchParam;
+  const su2double kappa2; /*!< \brief Value of config.GetKappa_2nd_Flow(). */
+
+public:
+  /*!
+   * \brief Constructor, forward everything to base and store the dissipation coefficient.
+   */
+  template<class... Ts>
+  CJSTkeScheme(const CConfig& config, Ts&... args) : Base(config, args...),
+    kappa2(config.GetKappa_2nd_Flow()) {
+  }
+
+  /*!
+   * \brief Updates flux and Jacobians with 2nd order dissipation.
+   * \note "Ts" is here just in case other schemes in the family need extra args (here it absorbs the trailing unitNormal passed by the base).
+   */
+  template<class PrimVarType, class... Ts>
+  FORCEINLINE void finalizeFlux(VectorDbl<nVar>& flux,
+                                MatrixDbl<nVar>& jac_i,
+                                MatrixDbl<nVar>& jac_j,
+                                bool implicit,
+                                Double area,
+                                Double projVel,
+                                const PrimVarType& avgV,
+                                const CPair<PrimVarType>& V,
+                                const VectorDbl<nVar>& diffU,
+                                Int iPoint,
+                                Int jPoint,
+                                const CGeometry& geometry,
+                                const CEulerVariable& solution,
+                                Ts&...) const {
+
+    Double lambda = abs(projVel) + avgV.speedSound()*area; /*--- |projected velocity| + speed of sound * area. ---*/
+    lambda = correctedSpectralRadius(iPoint, jPoint, lambda, stretchParam, solution);
+
+    /*--- Compute dissipation coefficient. ---*/
+
+    const auto ni = Base::numNeighbor(iPoint, geometry);
+    const auto nj = Base::numNeighbor(jPoint, geometry);
+    const Double sc2 = 3 * (ni+nj) / (ni*nj);
+
+    const auto si = gatherVariables(iPoint, solution.GetSensor());
+    const auto sj = gatherVariables(jPoint, solution.GetSensor());
+    const Double dissip = kappa2 * 0.5*(si+sj) * sc2 * lambda; /*--- Average sensor times the neighbor-count scaling and spectral radius. ---*/
+
+    /*--- Update flux and Jacobians with dissipation term. ---*/
+
+    for (size_t iVar = 0; iVar < nVar; ++iVar) {
+      flux(iVar) += dissip * diffU(iVar);
+    }
+
+    if (implicit) {
+      scalarDissipationJacobian(V.i, gamma, fixFactor*dissip, jac_i);
+      scalarDissipationJacobian(V.j, gamma, -fixFactor*dissip, jac_j); /*--- Opposite sign for the j side. ---*/
+    }
+  }
+};
+
+/*!
+ * \class CLaxScheme
+ * \brief Lax–Friedrichs 1st order scheme (scalar dissipation without a sensor, coefficient from GetKappa_1st_Flow).
+ */
+template<class Decorator>
+class CLaxScheme : public CCenteredBase<CLaxScheme<Decorator>,Decorator> {
+private:
+  using Base = CCenteredBase<CLaxScheme<Decorator>,Decorator>;
+  using Base::nDim;
+  using Base::nVar;
+  using Base::gamma;
+  using Base::fixFactor;
+  using Base::stretchParam;
+  const su2double kappa0; /*!< \brief Value of config.GetKappa_1st_Flow(). */
+
+public:
+  /*!
+   * \brief Constructor, forward everything to base and store the dissipation coefficient.
+   */
+  template<class... Ts>
+  CLaxScheme(const CConfig& config, Ts&... args) : Base(config, args...),
+    kappa0(config.GetKappa_1st_Flow()) {
+  }
+
+  /*!
+   * \brief Updates flux and Jacobians with 1st order scalar dissipation.
+   * \note "Ts" is here just in case other schemes in the family need extra args (here it absorbs the trailing unitNormal passed by the base).
+   */
+  template<class PrimVarType, class... Ts>
+  FORCEINLINE void finalizeFlux(VectorDbl<nVar>& flux,
+                                MatrixDbl<nVar>& jac_i,
+                                MatrixDbl<nVar>& jac_j,
+                                bool implicit,
+                                Double area,
+                                Double projVel,
+                                const PrimVarType& avgV,
+                                const CPair<PrimVarType>& V,
+                                const VectorDbl<nVar>& diffU,
+                                Int iPoint,
+                                Int jPoint,
+                                const CGeometry& geometry,
+                                const CEulerVariable& solution,
+                                Ts&...) const {
+
+    Double lambda = abs(projVel) + avgV.speedSound()*area; /*--- |projected velocity| + speed of sound * area. ---*/
+    lambda = correctedSpectralRadius(iPoint, jPoint, lambda, stretchParam, solution);
+
+    /*--- Compute dissipation coefficient. ---*/
+
+    const auto ni = Base::numNeighbor(iPoint, geometry);
+    const auto nj = Base::numNeighbor(jPoint, geometry);
+    const Double dissip = kappa0 * nDim * (ni+nj) / (ni*nj) * lambda; /*--- Scaled by nDim here, whereas the JST variants use a fixed 3. ---*/
+
+    /*--- Update flux and Jacobians with dissipation term. ---*/
+
+    for (size_t iVar = 0; iVar < nVar; ++iVar) {
+      flux(iVar) += dissip * diffU(iVar);
+    }
+
+    if (implicit) {
+      scalarDissipationJacobian(V.i, gamma, fixFactor*dissip, jac_i);
+      scalarDissipationJacobian(V.j, gamma, -fixFactor*dissip, jac_j); /*--- Opposite sign for the j side. ---*/
+    }
+  }
+};
diff --git a/SU2_CFD/include/numerics_simd/flow/convection/common.hpp b/SU2_CFD/include/numerics_simd/flow/convection/common.hpp
new file mode 100644
index 00000000000..fe3b1fbf356
--- /dev/null
+++ b/SU2_CFD/include/numerics_simd/flow/convection/common.hpp
@@ -0,0 +1,411 @@
+﻿/*!
+ * \file common.hpp
+ * \brief Common convection-related methods.
+ * \author P. Gomes, F. Palacios, T. Economon
+ * \version 7.0.6 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../../CNumericsSIMD.hpp"
+#include "../../util.hpp"
+#include "../variables.hpp"
+#include "../../../variables/CNSVariable.hpp"
+
+/*!
+ * \brief Unlimited MUSCL update, adds "scale * dot(gradient at iPoint, vector_ij)" to each entry of "vars".
+ */
+template<size_t nVar, size_t nDim, class Gradient_t>
+FORCEINLINE void musclUnlimited(Int iPoint, const VectorDbl<nDim>& vector_ij,
+                                Double scale,
+                                const Gradient_t& gradient,
+                                VectorDbl<nVar>& vars) {
+  auto nodalGrad = gatherVariables<nVar,nDim>(iPoint, gradient);
+  for (size_t k = 0; k < nVar; ++k) {
+    const Double slope = dot(nodalGrad[k], vector_ij);
+    vars(k) += scale * slope;
+  }
+}
+
+/*!
+ * \brief MUSCL update with a point-based limiter, like the unlimited version but each increment
+ *        is also multiplied by the limiter value gathered for the same variable at "iPoint".
+ */
+template<size_t nVar, size_t nDim, class Limiter_t, class Gradient_t>
+FORCEINLINE void musclPointLimited(Int iPoint, const VectorDbl<nDim>& vector_ij,
+                                   Double scale,
+                                   const Limiter_t& limiter,
+                                   const Gradient_t& gradient,
+                                   VectorDbl<nVar>& vars) {
+  auto nodalLim = gatherVariables<nVar>(iPoint, limiter);
+  auto nodalGrad = gatherVariables<nVar,nDim>(iPoint, gradient);
+  for (size_t k = 0; k < nVar; ++k) {
+    vars(k) += nodalLim(k) * scale * dot(nodalGrad[k], vector_ij);
+  }
+}
+
+/*!
+ * \brief MUSCL update of both sides of an edge with an edge-based limiter (used for VAN_ALBADA_EDGE).
+ */
+template<size_t nDim, class VarType, class Gradient_t>
+FORCEINLINE void musclEdgeLimited(Int iPoint,
+                                  Int jPoint,
+                                  const VectorDbl<nDim>& vector_ij,
+                                  const Gradient_t& gradient,
+                                  CPair<VarType>& V) {
+  constexpr size_t nVar = VarType::nVar;
+
+  auto nodalGrad_i = gatherVariables<nVar,nDim>(iPoint, gradient);
+  auto nodalGrad_j = gatherVariables<nVar,nDim>(jPoint, gradient);
+
+  for (size_t k = 0; k < nVar; ++k) {
+    auto slope_i = dot(nodalGrad_i[k], vector_ij);
+    auto slope_j = dot(nodalGrad_j[k], vector_ij);
+    auto jump = V.j.all(k) - V.i.all(k);
+    auto jump2 = pow(jump, 2) + 1e-6;  // The small constant keeps the denominators below positive.
+    /// TODO: Customize the limiter function.
+    auto phi_i = (jump2 + slope_i*jump) / (pow(slope_i,2) + jump2);
+    auto phi_j = (jump2 + slope_j*jump) / (pow(slope_j,2) + jump2);
+    V.i.all(k) += phi_i * 0.5 * slope_i;
+    V.j.all(k) -= phi_j * 0.5 * slope_j;
+  }
+}
+
+/*!
+ * \brief Fetch the primitive variables of points i/j and, if requested, reconstruct them (MUSCL).
+ * \param[in] iPoint, jPoint - Nodes of the edge.
+ * \param[in] muscl - If true, reconstruct, else simply copy the 1st order values.
+ * \param[in] limiterType - Selects unlimited, edge-limited, or (any other value) point-limited reconstruction.
+ * \param[in] V1st - Primitives at the nodes, its first "ReconVarType::nVar" entries are used.
+ * \param[in] vector_ij - Distance vector from i to j.
+ * \param[in] solution - Entire solution container (a derived CVariable), provides gradients and limiters.
+ * \return Pair of (possibly reconstructed) primitive variables.
+ */
+template<class ReconVarType, class PrimVarType, size_t nDim, class VariableType>
+FORCEINLINE CPair<ReconVarType> reconstructPrimitives(Int iPoint, Int jPoint, bool muscl,
+                                                      ENUM_LIMITER limiterType,
+                                                      const CPair<PrimVarType>& V1st,
+                                                      const VectorDbl<nDim>& vector_ij,
+                                                      const VariableType& solution) {
+  static_assert(ReconVarType::nVar <= PrimVarType::nVar,"");
+
+  const auto& gradients = solution.GetGradient_Reconstruction();
+  const auto& limiters = solution.GetLimiter_Primitive();
+
+  /*--- Start from the 1st order values. ---*/
+  CPair<ReconVarType> V;
+  for (size_t k = 0; k < ReconVarType::nVar; ++k) {
+    V.i.all(k) = V1st.i.all(k);
+    V.j.all(k) = V1st.j.all(k);
+  }
+  if (!muscl) return V;
+
+  /*--- Half-edge extrapolation from each side, note the opposite signs of the scale. ---*/
+  if (limiterType == NO_LIMITER) {
+    musclUnlimited(iPoint, vector_ij, 0.5, gradients, V.i.all);
+    musclUnlimited(jPoint, vector_ij,-0.5, gradients, V.j.all);
+  }
+  else if (limiterType == VAN_ALBADA_EDGE) {
+    musclEdgeLimited(iPoint, jPoint, vector_ij, gradients, V);
+  }
+  else {
+    musclPointLimited(iPoint, vector_ij, 0.5, limiters, gradients, V.i.all);
+    musclPointLimited(jPoint, vector_ij,-0.5, limiters, gradients, V.j.all);
+  }
+  /// TODO: Extra reconstruction checks needed.
+  return V;
+}
+
+/*!
+ * \brief Compute and return the P tensor (compressible flow, ideal gas), the "P" of "P x |Lambda| x P^-1" in CRoeScheme.
+ */
+template<size_t nDim, class RandomAccessIterator>
+FORCEINLINE MatrixDbl<nDim+2> pMatrix(Double gamma, Double density, const RandomAccessIterator& velocity,
+                                      Double projVel, Double speedSound, const VectorDbl<nDim>& normal) {
+  MatrixDbl<nDim+2> pMat;
+  const Double vel2 = 0.5*squaredNorm<nDim>(velocity); // NOTE: half of the squared norm, despite the name.
+
+  if (nDim == 2) { // First nDim columns, 2D version.
+    pMat(0,0) = 1.0;
+    pMat(0,1) = 0.0;
+
+    pMat(1,0) = velocity[0];
+    pMat(1,1) = density*normal(1);
+
+    pMat(2,0) = velocity[1];
+    pMat(2,1) = -density*normal(0);
+
+    pMat(3,0) = vel2;
+    pMat(3,1) = density*(velocity[0]*normal(1) - velocity[1]*normal(0));
+  }
+  else { // First nDim columns, any other nDim is handled as 3D (index 2 is accessed).
+    pMat(0,0) = normal(0);
+    pMat(0,1) = normal(1);
+    pMat(0,2) = normal(2);
+
+    pMat(1,0) = velocity[0]*normal(0);
+    pMat(1,1) = velocity[0]*normal(1) - density*normal(2);
+    pMat(1,2) = velocity[0]*normal(2) + density*normal(1);
+
+    pMat(2,0) = velocity[1]*normal(0) + density*normal(2);
+    pMat(2,1) = velocity[1]*normal(1);
+    pMat(2,2) = velocity[1]*normal(2) - density*normal(0);
+
+    pMat(3,0) = velocity[2]*normal(0) - density*normal(1);
+    pMat(3,1) = velocity[2]*normal(1) + density*normal(0);
+    pMat(3,2) = velocity[2]*normal(2);
+
+    pMat(4,0) = vel2*normal(0) + density*(velocity[1]*normal(2) - velocity[2]*normal(1));
+    pMat(4,1) = vel2*normal(1) - density*(velocity[0]*normal(2) - velocity[2]*normal(0));
+    pMat(4,2) = vel2*normal(2) + density*(velocity[0]*normal(1) - velocity[1]*normal(0));
+  }
+
+  /*--- Last two columns (nDim and nDim+1), CRoeBase pairs them with projVel +/- speedSound. ---*/
+
+  const Double rhoOn2 = 0.5*density;
+  const Double rhoOnTwoC = rhoOn2 / speedSound;
+  pMat(0,nDim) = rhoOnTwoC;
+  pMat(0,nDim+1) = rhoOnTwoC;
+
+  for (size_t iDim = 0; iDim < nDim; ++iDim) { // Momentum rows, the two columns differ in the sign of the normal term.
+    pMat(iDim+1,nDim) = rhoOnTwoC * velocity[iDim] + rhoOn2 * normal(iDim);
+    pMat(iDim+1,nDim+1) = rhoOnTwoC * velocity[iDim] - rhoOn2 * normal(iDim);
+  }
+
+  pMat(nDim+1,nDim) = rhoOnTwoC * vel2 + rhoOn2 * (projVel + speedSound/(gamma-1));
+  pMat(nDim+1,nDim+1) = rhoOnTwoC * vel2 - rhoOn2 * (projVel - speedSound/(gamma-1));
+
+  return pMat;
+}
+
+/*!
+ * \brief Compute and return the inverse P tensor (compressible flow, ideal gas), same arguments as "pMatrix".
+ */
+template<size_t nDim, class RandomAccessIterator>
+FORCEINLINE MatrixDbl<nDim+2> pMatrixInv(Double gamma, Double density, const RandomAccessIterator& velocity,
+                                         Double projVel, Double speedSound, const VectorDbl<nDim>& normal) {
+  MatrixDbl<nDim+2> pMatInv;
+
+  const Double c2 = pow(speedSound,2);
+  const Double vel2 = 0.5*squaredNorm<nDim>(velocity); // NOTE: half of the squared norm, despite the name.
+  const Double oneOnRho = 1 / density;
+
+  if (nDim == 2) { // First nDim rows, 2D version.
+    Double tmp = (gamma-1)/c2;
+    pMatInv(0,0) = 1.0 - tmp*vel2;
+    pMatInv(0,1) = tmp*velocity[0];
+    pMatInv(0,2) = tmp*velocity[1];
+    pMatInv(0,3) = -tmp;
+
+    pMatInv(1,0) = (normal(0)*velocity[1]-normal(1)*velocity[0])*oneOnRho;
+    pMatInv(1,1) = normal(1)*oneOnRho;
+    pMatInv(1,2) = -normal(0)*oneOnRho;
+    pMatInv(1,3) = 0.0;
+  }
+  else { // First nDim rows, any other nDim is handled as 3D; "tmp" is rescaled by normal(row) for each row.
+    Double tmp = (gamma-1)/c2 * normal(0);
+    pMatInv(0,0) = normal(0) - tmp*vel2 - (normal(2)*velocity[1]-normal(1)*velocity[2])*oneOnRho;
+    pMatInv(0,1) = tmp*velocity[0];
+    pMatInv(0,2) = tmp*velocity[1] + normal(2)*oneOnRho;
+    pMatInv(0,3) = tmp*velocity[2] - normal(1)*oneOnRho;
+    pMatInv(0,4) = -tmp;
+
+    tmp = (gamma-1)/c2 * normal(1);
+    pMatInv(1,0) = normal(1) - tmp*vel2 + (normal(2)*velocity[0]-normal(0)*velocity[2])*oneOnRho;
+    pMatInv(1,1) = tmp*velocity[0] - normal(2)*oneOnRho;
+    pMatInv(1,2) = tmp*velocity[1];
+    pMatInv(1,3) = tmp*velocity[2] + normal(0)*oneOnRho;
+    pMatInv(1,4) = -tmp;
+
+    tmp = (gamma-1)/c2 * normal(2);
+    pMatInv(2,0) = normal(2) - tmp*vel2 - (normal(1)*velocity[0]-normal(0)*velocity[1])*oneOnRho;
+    pMatInv(2,1) = tmp*velocity[0] + normal(1)*oneOnRho;
+    pMatInv(2,2) = tmp*velocity[1] - normal(0)*oneOnRho;
+    pMatInv(2,3) = tmp*velocity[2];
+    pMatInv(2,4) = -tmp;
+  }
+
+  /*--- Last two rows (nDim and nDim+1), they only differ in the sign of the projVel and normal terms. ---*/
+
+  const Double gamma_minus_1_on_rho_times_c = (gamma-1) / (density*speedSound);
+
+  for (size_t iVar = nDim; iVar < nDim+2; ++iVar) {
+    Double sign = (iVar==nDim)? 1 : -1; // +1 for row nDim, -1 for row nDim+1.
+    pMatInv(iVar,0) = -sign*projVel*oneOnRho + gamma_minus_1_on_rho_times_c * vel2;
+    for (size_t iDim = 0; iDim < nDim; ++iDim) {
+      pMatInv(iVar,iDim+1) = sign*normal(iDim)*oneOnRho - gamma_minus_1_on_rho_times_c * velocity[iDim];
+    }
+    pMatInv(iVar,nDim+1) = gamma_minus_1_on_rho_times_c;
+  }
+
+  return pMatInv;
+}
+
+/*!
+ * \brief Convective flux projected onto "normal" (compressible flow): mass, momentum with pressure, and enthalpy.
+ */
+template<class PrimVarType, class ConsVarType, size_t nDim>
+FORCEINLINE VectorDbl<nDim+2> inviscidProjFlux(const PrimVarType& V,
+                                               const ConsVarType& U,
+                                               const VectorDbl<nDim>& normal) {
+  static_assert(ConsVarType::nVar == nDim+2,"");
+  VectorDbl<nDim+2> projFlux;
+  /*--- Mass flux through the face, it multiplies the convected quantities. ---*/
+  const Double massFlux = dot(U.momentum(), normal);
+  projFlux(0) = massFlux;
+  for (size_t d = 0; d < nDim; ++d)
+    projFlux(d+1) = massFlux*V.velocity(d) + normal(d)*V.pressure();
+  projFlux(nDim+1) = massFlux*V.enthalpy();
+  return projFlux;
+}
+
+/*!
+ * \brief Jacobian of the projected convective flux (compressible flow, ideal gas), every entry multiplied by "scale".
+ */
+template<size_t nDim, class RandomAccessIterator>
+FORCEINLINE MatrixDbl<nDim+2> inviscidProjJac(Double gamma, RandomAccessIterator velocity,
+                                              Double energy, const VectorDbl<nDim>& normal,
+                                              Double scale) {
+  MatrixDbl<nDim+2> J;
+
+  Double velNormal = dot(velocity, normal);
+  Double gm1 = gamma-1;
+  Double halfGm1Vel2 = 0.5*gm1*squaredNorm<nDim>(velocity);
+  Double energyTerm = gamma*energy - halfGm1Vel2;
+
+  /*--- First row (mass flux). ---*/
+  J(0,0) = 0.0;
+  for (size_t j = 0; j < nDim; ++j) J(0,j+1) = scale * normal(j);
+  J(0,nDim+1) = 0.0;
+
+  /*--- Middle rows (momentum flux), the diagonal gets an extra projected velocity term. ---*/
+  for (size_t i = 0; i < nDim; ++i) {
+    J(i+1,0) = scale * (normal(i)*halfGm1Vel2 - velocity[i]*velNormal);
+    for (size_t j = 0; j < nDim; ++j) {
+      J(i+1,j+1) = scale * (normal(j)*velocity[i] - gm1*normal(i)*velocity[j]);
+    }
+    J(i+1,i+1) += scale * velNormal;
+    J(i+1,nDim+1) = scale * gm1 * normal(i);
+  }
+
+  /*--- Last row (energy flux). ---*/
+  J(nDim+1,0) = scale * velNormal * (halfGm1Vel2-energyTerm);
+  for (size_t j = 0; j < nDim; ++j) {
+    J(nDim+1,j+1) = scale * (normal(j)*energyTerm - gm1*velocity[j]*velNormal);
+  }
+  J(nDim+1,nDim+1) = scale * gamma * velNormal;
+  return J;
+}
+
+/*!
+ * \brief Dissipation coefficient for (low dissipation) Roe schemes, 1.0 when no such method is selected.
+ */
+template<class VariableType>
+FORCEINLINE Double roeDissipation(Int iPoint,
+                                  Int jPoint,
+                                  ENUM_ROELOWDISS type,
+                                  const VariableType& solution) {
+  if (type == NO_ROELOWDISS) return 1.0;
+
+  /*--- The sensor and dissipation fields are read through the NS variables. ---*/
+  const auto& nsVars = static_cast<const CNSVariable&>(solution);
+  const auto& sensor = nsVars.GetSensor();
+  const auto& dissip = nsVars.GetRoe_Dissipation();
+
+  /*--- Edge averages of both fields (always gathered, whatever the type). ---*/
+  const Double sensor_i = gatherVariables(iPoint, sensor);
+  const Double sensor_j = gatherVariables(jPoint, sensor);
+  const Double avgSensor = 0.5 * (sensor_i + sensor_j);
+
+  const Double dissip_i = gatherVariables(iPoint, dissip);
+  const Double dissip_j = gatherVariables(jPoint, dissip);
+  const Double avgDissip = 0.5 * (dissip_i + dissip_j);
+
+  /*--- A minimum level of upwinding is used to enhance stability. ---*/
+  constexpr passivedouble minDissip = 0.05;
+
+  if (type == FD || type == FD_DUCROS) {
+    Double coeff = max(minDissip, 1.0 - avgDissip);
+
+    if (type == FD_DUCROS) {
+      /*--- See Johnsen et al. JCP 229 (2010) pag. 1234 ---*/
+      coeff = max(coeff, 0.05 + 0.95*(avgSensor > 0.65));
+    }
+    return coeff;
+  }
+
+  if (type == NTS) {
+    return max(minDissip, avgDissip);
+  }
+
+  if (type == NTS_DUCROS) {
+    /*--- See Xiao et al. INT J HEAT FLUID FL 51 (2015) pag. 141
+     * https://doi.org/10.1016/j.ijheatfluidflow.2014.10.007 ---*/
+    return max(minDissip, avgSensor+avgDissip - avgSensor*avgDissip);
+  }
+
+  /*--- Any other type is not expected here. ---*/
+  assert(false);
+  return 1.0;
+}
+
+/*!
+ * \brief Correct spectral radius (avgLambda) for stretching, using the nodal values from "solution.GetLambda()".
+ */
+template<class VariableType, class T>
+FORCEINLINE Double correctedSpectralRadius(Int iPoint,
+                                           Int jPoint,
+                                           Double avgLambda,
+                                           T stretchParam,
+                                           const VariableType& solution) {
+  /*--- Nodal values. ---*/
+  const auto nodalLambda_i = gatherVariables(iPoint, solution.GetLambda());
+  const auto nodalLambda_j = gatherVariables(jPoint, solution.GetLambda());
+
+  /*--- Stretching functions of both nodes, then their combination scales avgLambda. ---*/
+  const Double stretch_i = pow(0.25*nodalLambda_i/avgLambda, stretchParam);
+  const Double stretch_j = pow(0.25*nodalLambda_j/avgLambda, stretchParam);
+  return 4*stretch_i*stretch_j / (stretch_i + stretch_j) * avgLambda;
+}
+
+/*!
+ * \brief Update of a flux Jacobian due to a scalar dissipation term, adds "dissipConst" times d(U + [0,..,0,p])/dU.
+ */
+template<class VariableType, size_t nVar>
+FORCEINLINE void scalarDissipationJacobian(const VariableType& V,
+                                           Double gamma,
+                                           Double dissipConst,
+                                           MatrixDbl<nVar>& jac) {
+  /*--- Diagonal entries, the last one includes dp/d(rho*E) = gamma-1. ---*/
+  for (size_t iVar = 0; iVar < nVar-1; ++iVar) {
+    jac(iVar,iVar) += dissipConst;
+  }
+  jac(nVar-1,nVar-1) += dissipConst * gamma;
+
+  /*--- N-1 columns of last row, ideal gas dp/dU: -(gamma-1)*v_i and (gamma-1)*0.5*|v|^2 (note the 1/2). ---*/
+  dissipConst *= gamma-1.0;
+  for (size_t iDim = 0; iDim < VariableType::nDim; ++iDim) {
+    jac(nVar-1,iDim+1) -= dissipConst * V.velocity(iDim);
+    jac(nVar-1,0) += dissipConst * 0.5 * pow(V.velocity(iDim), 2);
+  }
+}
diff --git a/SU2_CFD/include/numerics_simd/flow/convection/roe.hpp b/SU2_CFD/include/numerics_simd/flow/convection/roe.hpp
new file mode 100644
index 00000000000..cc42b0caa04
--- /dev/null
+++ b/SU2_CFD/include/numerics_simd/flow/convection/roe.hpp
@@ -0,0 +1,298 @@
+﻿/*!
+ * \file roe.hpp
+ * \brief Roe-family of convective schemes.
+ * \author P. Gomes, A. Bueno, F. Palacios
+ * \version 7.0.6 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../../CNumericsSIMD.hpp"
+#include "../../util.hpp"
+#include "../variables.hpp"
+#include "common.hpp"
+#include "../../../variables/CEulerVariable.hpp"
+#include "../../../../../Common/include/geometry/CGeometry.hpp"
+
+/*!
+ * \class CRoeBase
+ * \brief Base class for Roe schemes, derived classes implement
+ * the dissipation term in a const "finalizeFlux" method (called via static_cast to Derived).
+ * A base class implementing "viscousTerms" is accepted as template parameter.
+ * Similarly to derived, that method should update the flux and Jacobians, but
+ * whereas "finalizeFlux" is passed data prepared by CRoeBase, "viscousTerms"
+ * takes the same input arguments as "ComputeFlux", i.e. it can fetch more
+ * data from CVariable. Derived is meant to implement small details,
+ * Base is meant to do heavy lifting.
+ */
+template<class Derived, class Base>
+class CRoeBase : public Base {
+protected:
+  using Base::nDim;
+  static constexpr size_t nVar = CCompressibleConservatives<nDim>::nVar;
+  static constexpr size_t nPrimVarGrad = nDim+4; /*!< \brief Number of primitives that are reconstructed. */
+  static constexpr size_t nPrimVar = Max(Base::nPrimVar, nPrimVarGrad); /*!< \brief Number of primitives gathered. */
+
+  const su2double kappa;          /*!< \brief Weight of the sum of the i/j fluxes, from GetRoe_Kappa. */
+  const su2double gamma;          /*!< \brief Gamma of the (ideal) gas, from GetGamma. */
+  const su2double entropyFix;     /*!< \brief Lower bound of the eigenvalues as a fraction of the largest one. */
+  const bool finestGrid;          /*!< \brief True if iMesh is MESH_0. */
+  const bool dynamicGrid;         /*!< \brief True if grid velocities are taken into account. */
+  const bool muscl;               /*!< \brief True if primitives are reconstructed (only on the finest grid). */
+  const ENUM_LIMITER typeLimiter; /*!< \brief Kind of slope limiter used in the reconstruction. */
+
+  /*!
+   * \brief Constructor, store some constants and forward args (config, iMesh, and the pack) to base.
+   */
+  template<class... Ts>
+  CRoeBase(const CConfig& config, unsigned iMesh, Ts&... args) : Base(config, iMesh, args...),
+    kappa(config.GetRoe_Kappa()),
+    gamma(config.GetGamma()),
+    entropyFix(config.GetEntropyFix_Coeff()),
+    finestGrid(iMesh == MESH_0),
+    dynamicGrid(config.GetDynamic_Grid()),
+    muscl(finestGrid && config.GetMUSCL_Flow()),
+    typeLimiter(static_cast<ENUM_LIMITER>(config.GetKind_SlopeLimit_Flow())) {
+  }
+
+public:
+  /*!
+   * \brief Implementation of the base Roe flux for edge "iEdge", results are written to "vector" and "matrix".
+   */
+  void ComputeFlux(Int iEdge,
+                   const CConfig& config,
+                   const CGeometry& geometry,
+                   const CVariable& solution_,
+                   UpdateType updateType,
+                   Double updateMask,
+                   CSysVector<su2double>& vector,
+                   SparseMatrixType& matrix) const final {
+
+    /*--- Start preaccumulation, inputs are registered
+     *    automatically in "gatherVariables". ---*/
+    AD::StartPreacc();
+
+    const bool implicit = (config.GetKind_TimeIntScheme() == EULER_IMPLICIT);
+    const auto& solution = static_cast<const CEulerVariable&>(solution_);
+
+    const auto iPoint = geometry.edges->GetNode(iEdge,0);
+    const auto jPoint = geometry.edges->GetNode(iEdge,1);
+
+    /*--- Geometric properties: distance vector, (area-weighted) normal, area, and unit normal. ---*/
+
+    const auto vector_ij = distanceVector<nDim>(iPoint, jPoint, geometry.nodes->GetCoord());
+
+    const auto normal = gatherVariables<nDim>(iEdge, geometry.edges->GetNormal());
+    const auto area = norm(normal);
+    VectorDbl<nDim> unitNormal;
+    for (size_t iDim = 0; iDim < nDim; ++iDim) {
+      unitNormal(iDim) = normal(iDim) / area;
+    }
+
+    /*--- Reconstructed primitives (V), the 1st order values (V1st) are kept for the viscous terms. ---*/
+
+    CPair<CCompressiblePrimitives<nDim,nPrimVar> > V1st;
+    V1st.i.all = gatherVariables<nPrimVar>(iPoint, solution.GetPrimitive());
+    V1st.j.all = gatherVariables<nPrimVar>(jPoint, solution.GetPrimitive());
+
+    auto V = reconstructPrimitives<CCompressiblePrimitives<nDim,nPrimVarGrad> >(
+                  iPoint, jPoint, muscl, typeLimiter, V1st, vector_ij, solution);
+
+    /*--- Compute conservative variables from the (reconstructed) primitives. ---*/
+
+    CPair<CCompressibleConservatives<nDim> > U;
+    U.i = compressibleConservatives(V.i);
+    U.j = compressibleConservatives(V.j);
+
+    /*--- Roe averaged variables. ---*/
+
+    auto roeAvg = roeAveragedVariables(gamma, V, unitNormal);
+
+    /*--- P tensor, evaluated at the Roe-averaged state with the unit normal. ---*/
+
+    auto pMat = pMatrix(gamma, roeAvg.density, roeAvg.velocity,
+                        roeAvg.projVel, roeAvg.speedSound, unitNormal);
+
+    /*--- Grid motion, the projected velocity becomes relative to the (average) grid velocity. ---*/
+
+    Double projGridVel = 0.0, projVel = roeAvg.projVel;
+    if (dynamicGrid) {
+      const auto& gridVel = geometry.nodes->GetGridVel();
+      projGridVel = 0.5*(dot(gatherVariables<nDim>(iPoint,gridVel), unitNormal)+
+                         dot(gatherVariables<nDim>(jPoint,gridVel), unitNormal));
+      projVel -= projGridVel;
+    }
+
+    /*--- Convective eigenvalues: nDim times projVel, then projVel +/- speedSound. ---*/
+
+    VectorDbl<nVar> lambda;
+    for (size_t iDim = 0; iDim < nDim; ++iDim) {
+      lambda(iDim) = projVel;
+    }
+    lambda(nDim) = projVel + roeAvg.speedSound;
+    lambda(nDim+1) = projVel - roeAvg.speedSound;
+
+    /*--- Apply Mavriplis' entropy correction to eigenvalues, this also takes their absolute value. ---*/
+
+    Double maxLambda = abs(projVel) + roeAvg.speedSound;
+
+    for (size_t iVar = 0; iVar < nVar; ++iVar) {
+      lambda(iVar) = max(abs(lambda(iVar)), entropyFix*maxLambda);
+    }
+
+    /*--- Inviscid fluxes and Jacobians, computed with the area-weighted normal and scaled by kappa. ---*/
+
+    auto flux_i = inviscidProjFlux(V.i, U.i, normal);
+    auto flux_j = inviscidProjFlux(V.j, U.j, normal);
+
+    VectorDbl<nVar> flux;
+    for (size_t iVar = 0; iVar < nVar; ++iVar) {
+      flux(iVar) = kappa * (flux_i(iVar) + flux_j(iVar));
+    }
+
+    MatrixDbl<nVar> jac_i, jac_j; // Only assigned (and later used) if implicit.
+    if (implicit) {
+      jac_i = inviscidProjJac(gamma, V.i.velocity(), U.i.energy(), normal, kappa);
+      jac_j = inviscidProjJac(gamma, V.j.velocity(), U.j.energy(), normal, kappa);
+    }
+
+    /*--- Correct for grid motion, flux of the average conservatives through the moving face. ---*/
+
+    if (dynamicGrid) {
+      for (size_t iVar = 0; iVar < nVar; ++iVar) {
+        Double dFdU = projGridVel * area * 0.5; // Same value for every iVar.
+        flux(iVar) -= dFdU * (U.i.all(iVar) + U.j.all(iVar));
+
+        if (implicit) {
+          jac_i(iVar,iVar) -= dFdU;
+          jac_j(iVar,iVar) -= dFdU;
+        }
+      }
+    }
+
+    /*--- Finalize in derived class (static polymorphism), it adds the dissipation term. ---*/
+
+    const auto derived = static_cast<const Derived*>(this);
+
+    derived->finalizeFlux(flux, jac_i, jac_j, implicit, area, unitNormal, V,
+                          U, roeAvg, lambda, pMat, iPoint, jPoint, solution);
+
+    /*--- Add the contributions from the base class (static decorator), note it receives V1st. ---*/
+
+    Base::viscousTerms(iEdge, iPoint, jPoint, V1st, solution_, vector_ij, geometry,
+                       config, area, unitNormal, implicit, flux, jac_i, jac_j);
+
+    /*--- Stop preaccumulation, the flux is the only registered output. ---*/
+
+    AD::SetPreaccOut(flux, nVar, Double::Size);
+    AD::EndPreacc();
+
+    /*--- Update the vector and system matrix. ---*/
+
+    updateLinearSystem(iEdge, iPoint, jPoint, implicit, updateType,
+                       updateMask, flux, jac_i, jac_j, vector, matrix);
+  }
+};
+
+/*!
+ * \class CRoeScheme
+ * \brief Classical Roe scheme, adds the "P x |Lambda| x P^-1" dissipation to the flux prepared by CRoeBase.
+ */
+template<class Decorator>
+class CRoeScheme : public CRoeBase<CRoeScheme<Decorator>,Decorator> {
+private:
+  using Base = CRoeBase<CRoeScheme<Decorator>,Decorator>;
+  using Base::nDim;
+  using Base::nVar;
+  using Base::gamma;
+  using Base::kappa;
+  const ENUM_ROELOWDISS typeDissip;
+
+public:
+  /*!
+   * \brief Constructor, reads the kind of low dissipation method and forwards all arguments to the base.
+   */
+  template<class... Ts>
+  CRoeScheme(const CConfig& config, Ts&... args) : Base(config, args...),
+    typeDissip(static_cast<ENUM_ROELOWDISS>(config.GetKind_RoeLowDiss())) {
+  }
+
+  /*!
+   * \brief Subtracts the standard Roe dissipation from the flux and updates the Jacobians accordingly.
+   * \note The unnamed "Ts" pack only absorbs extra arguments that other schemes in the family may need.
+   */
+  template<class PrimVarType, class ConsVarType, class... Ts>
+  FORCEINLINE void finalizeFlux(VectorDbl<nVar>& flux,
+                                MatrixDbl<nVar>& jac_i,
+                                MatrixDbl<nVar>& jac_j,
+                                bool implicit,
+                                Double area,
+                                const VectorDbl<nDim>& unitNormal,
+                                const CPair<PrimVarType>& V,
+                                const CPair<ConsVarType>& U,
+                                const CRoeVariables<nDim>& roeAvg,
+                                const VectorDbl<nVar>& lambda,
+                                const MatrixDbl<nVar>& pMat,
+                                Int iPoint,
+                                Int jPoint,
+                                const CEulerVariable& solution,
+                                Ts&...) const {
+    /*--- Inverse of the P tensor, for the same Roe-averaged state. ---*/
+
+    auto invP = pMatrixInv(gamma, roeAvg.density, roeAvg.velocity,
+                           roeAvg.projVel, roeAvg.speedSound, unitNormal);
+
+    /*--- Difference between conservative variables at jPoint and iPoint. ---*/
+
+    VectorDbl<nVar> jumpU;
+    for (size_t k = 0; k < nVar; ++k) {
+      jumpU(k) = U.j.all(k) - U.i.all(k);
+    }
+
+    /*--- Scaling of the dissipation (low dissipation methods). ---*/
+
+    Double lowDiss = roeDissipation(iPoint, jPoint, typeDissip, solution);
+
+    for (size_t row = 0; row < nVar; ++row) {
+      for (size_t col = 0; col < nVar; ++col) {
+        /*--- Entry (row,col) of P x |Lambda| x P^-1. ---*/
+
+        Double absJac = 0.0;
+        for (size_t k = 0; k < nVar; ++k) {
+          absJac += pMat(row,k) * lambda(k) * invP(k,col);
+        }
+
+        Double dDdU = absJac * (1-kappa) * area * lowDiss;
+
+        /*--- Update the flux and, for implicit schemes, the Jacobians. ---*/
+
+        flux(row) -= dDdU * jumpU(col);
+
+        if (implicit) {
+          jac_i(row,col) += dDdU;
+          jac_j(row,col) -= dDdU;
+        }
+      }
+    }
+  }
+};
diff --git a/SU2_CFD/include/numerics_simd/flow/diffusion/common.hpp b/SU2_CFD/include/numerics_simd/flow/diffusion/common.hpp
new file mode 100644
index 00000000000..f2da6f63fdf
--- /dev/null
+++ b/SU2_CFD/include/numerics_simd/flow/diffusion/common.hpp
@@ -0,0 +1,170 @@
+﻿/*!
+ * \file common.hpp
+ * \brief Helper functions for viscous methods.
+ * \author P. Gomes, C. Pederson, A. Bueno, F. Palacios, T. Economon
+ * \version 7.0.6 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../../CNumericsSIMD.hpp"
+#include "../../util.hpp"
+#include "../variables.hpp"
+
+/*!
+ * \brief Arithmetic average of the gradients gathered at points i and j.
+ */
+template<size_t nVar, size_t nDim, class GradientType>
+FORCEINLINE MatrixDbl<nVar,nDim> averageGradient(Int iPoint, Int jPoint,
+                                                 const GradientType& gradient) {
+  auto result = gatherVariables<nVar,nDim>(iPoint, gradient);
+  auto other = gatherVariables<nVar,nDim>(jPoint, gradient);
+  /*--- Halve the i-value in place, then add half of the j-value. ---*/
+  for (size_t r = 0; r < nVar; ++r)
+    for (size_t c = 0; c < nDim; ++c) {
+      result(r,c) *= 0.5;
+      result(r,c) += 0.5 * other(r,c);
+    }
+  return result;
+}
+
+/*!
+ * \brief Correct the average gradient so that its projection on vector_ij matches the jump V.j - V.i (avoids decoupling).
+ */
+template<size_t nVar, size_t nDim, class PrimitiveType>
+FORCEINLINE void correctGradient(const PrimitiveType& V,
+                                 const VectorDbl<nDim>& vector_ij,
+                                 Double dist2_ij,
+                                 MatrixDbl<nVar,nDim>& avgGrad) {
+  for (size_t k = 0; k < nVar; ++k) {
+    /*--- Mismatch between the projected gradient and the jump, divided by dist2_ij (presumably |vector_ij|^2). ---*/
+    Double mismatch = (dot(avgGrad[k],vector_ij) - V.j.all(k) + V.i.all(k)) / dist2_ij;
+    for (size_t d = 0; d < nDim; ++d)
+      avgGrad(k,d) -= mismatch * vector_ij(d);
+  }
+}
+
+/*!
+ * \brief Compute the stress tensor (using the total viscosity, laminar + eddy) from the gradients in "grad".
+ * \note Second viscosity term ignored. "grad" is taken by reference (no copy), its rows 1..nDim are used.
+ */
+template<size_t nVar, size_t nDim, class PrimitiveType>
+FORCEINLINE MatrixDbl<nDim> stressTensor(const PrimitiveType& V,
+                                         const MatrixDbl<nVar,nDim>& grad) {
+  Double viscosity = V.laminarVisc() + V.eddyVisc();
+
+  /*--- Hydrostatic term, 2/3 of the viscosity times the trace of the gradient rows 1..nDim. ---*/
+  Double velDiv = 0.0;
+  for (size_t iDim = 0; iDim < nDim; ++iDim) {
+    velDiv += grad(iDim+1,iDim);
+  }
+  Double pTerm = 2.0/3.0 * viscosity * velDiv;
+
+  MatrixDbl<nDim> tau;
+  for (size_t iDim = 0; iDim < nDim; ++iDim) {
+    /*--- Deviatoric term (symmetric), the hydrostatic term is then removed from the diagonal. ---*/
+    for (size_t jDim = 0; jDim < nDim; ++jDim) {
+      tau(iDim,jDim) = viscosity * (grad(jDim+1,iDim) + grad(iDim+1,jDim));
+    }
+    tau(iDim,iDim) -= pTerm;
+  }
+  return tau;
+}
+
+/*!
+ * \brief SA-QCR2000 modification of the stress tensor, "tau" is updated in place using the gradients in "grad".
+ */
+template<class MatrixType, size_t nDim>
+FORCEINLINE void addQCR(const MatrixType& grad, MatrixDbl<nDim>& tau) {
+  constexpr passivedouble c_cr1 = 0.3;
+
+  /*--- Normalization factor from the sum of squares of rows 1..nDim of grad, bounded away from zero. ---*/
+  Double sumSq = 0.0;
+  for (size_t i = 0; i < nDim; ++i)
+    for (size_t j = 0; j < nDim; ++j)
+      sumSq += grad(i+1,j) * grad(i+1,j);
+
+  const Double invNorm = 1 / sqrt(max(sumSq,1e-10));
+
+  /*--- QCR term, fully evaluated with the incoming tau before any of its entries is modified. ---*/
+  MatrixDbl<nDim> correction;
+  for (size_t i = 0; i < nDim; ++i) {
+    for (size_t j = 0; j < nDim; ++j) {
+      correction(i,j) = 0.0;
+      for (size_t k = 0; k < nDim; ++k) {
+        Double rot_ik = (grad(i+1,k) - grad(k+1,i)) * invNorm;
+        Double rot_jk = (grad(j+1,k) - grad(k+1,j)) * invNorm;
+        correction(i,j) += rot_ik*tau(j,k) + rot_jk*tau(i,k);
+      }
+    }
+  }
+  for (size_t i = 0; i < nDim; ++i)
+    for (size_t j = 0; j < nDim; ++j)
+      tau(i,j) -= c_cr1 * correction(i,j);
+}
+
+/*!
+ * \brief Jacobian of the stress tensor (compressible flow), "normal" is taken by reference to avoid a copy.
+ */
+template<size_t nVar, size_t nDim, class PrimitiveType>
+FORCEINLINE MatrixDbl<nDim,nVar> stressTensorJacobian(const PrimitiveType& V,
+                                                      const VectorDbl<nDim>& normal,
+                                                      Double dist_ij) {
+  Double viscosity = V.laminarVisc() + V.eddyVisc();
+  Double xi = viscosity / (V.density() * dist_ij);
+  MatrixDbl<nDim,nVar> jac;
+  for (size_t iDim = 0; iDim < nDim; ++iDim) {
+    /*--- Momentum, -xi * (delta_ij + normal_i*normal_j/3). ---*/
+    for (size_t jDim = 0; jDim < nDim; ++jDim) {
+      jac(iDim,jDim+1) = (-1/3.0) * xi * normal(iDim) * normal(jDim);
+    }
+    jac(iDim,iDim+1) -= xi;
+    /*--- Density, minus the dot product of the momentum entries (read in place from the row) and the velocity. ---*/
+    jac(iDim,0) = -dot<nDim>(&jac(iDim,1), V.velocity());
+    /*--- Energy. ---*/
+    jac(iDim,nDim+1) = 0.0;
+  }
+  return jac;
+}
+
+/*!
+ * \brief Viscous flux for compressible flows projected onto "normal": zero mass flux, stresses, work plus heat flux.
+ */
+template<size_t nVar, size_t nDim, class PrimitiveType>
+FORCEINLINE VectorDbl<nVar> viscousFlux(const PrimitiveType& V,
+                                        const MatrixDbl<nDim>& tau,
+                                        const VectorDbl<nDim>& heatFlux,
+                                        const VectorDbl<nDim>& normal) {
+  VectorDbl<nVar> projFlux;
+  /*--- No viscous contribution to the first entry, the energy entry is accumulated below. ---*/
+  projFlux(0) = 0.0;
+  projFlux(nDim+1) = 0.0;
+  for (size_t d = 0; d < nDim; ++d) {
+    /*--- Momentum, using the symmetry of the tensor. ---*/
+    projFlux(d+1) = dot(tau[d], normal);
+    /*--- Energy, work of the stresses plus heat flux. ---*/
+    auto work = dot<nDim>(tau[d], V.velocity());
+    projFlux(nDim+1) += normal(d) * (heatFlux(d) + work);
+  }
+  return projFlux;
+}
diff --git a/SU2_CFD/include/numerics_simd/flow/diffusion/viscous_fluxes.hpp b/SU2_CFD/include/numerics_simd/flow/diffusion/viscous_fluxes.hpp
new file mode 100644
index 00000000000..d2091c5969c
--- /dev/null
+++ b/SU2_CFD/include/numerics_simd/flow/diffusion/viscous_fluxes.hpp
@@ -0,0 +1,236 @@
+﻿/*!
+ * \file viscous_fluxes.hpp
+ * \brief Decorator classes for computation of viscous fluxes.
+ * \author P. Gomes, C. Pederson, A. Bueno, F. Palacios, T. Economon
+ * \version 7.0.6 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../../CNumericsSIMD.hpp"
+#include "../../util.hpp"
+#include "../variables.hpp"
+#include "common.hpp"
+
+/*!
+ * \class CNoViscousFlux
+ * \brief Numerics classes that accept a compile-time decorator should use this
+ * class template as a "do-nothing" decorator and as a link to the interface when
+ * they are not being decorated.
+ * Compile-time decoration works by specifying the base class as a template parameter.
+ * Then the class being decorated should call a method of its base class to add some
+ * contribution to the flux/source and Jacobians just before writing the results to
+ * CSysVector and CSysMatrix. The mechanism can be used to chain any number of classes
+ * at compile-time, but its main purpose is to combine convective and viscous fluxes
+ * in a nearly optimal way.
+ */
+template<size_t NDIM>
+class CNoViscousFlux : public CNumericsSIMD {
+protected:
+  static constexpr size_t nDim = NDIM;
+  static constexpr size_t nPrimVar = 0;
+
+  template<class... Ts>
+  CNoViscousFlux(Ts&...) {}
+
+  /*!
+   * \brief Empty method, real decorators should take as arguments whatever
+   * the decorated class can pass them to avoid expensive data accesses.
+   */
+  template<class... Ts>
+  void viscousTerms(Ts&...) const {}
+};
+
+/*!
+ * \class CCompressibleViscousFlux
+ * \brief Decorator class to add viscous fluxes (compressible flow, ideal gas).
+ */
+template<size_t NDIM>
+class CCompressibleViscousFlux : public CNumericsSIMD {
+protected:
+  static constexpr size_t nDim = NDIM;
+  static constexpr size_t nPrimVar = nDim+7;
+  static constexpr size_t nPrimVarGrad = nDim+1;
+
+  const su2double gamma;
+  const su2double gasConst;
+  const su2double prandtlLam;
+  const su2double prandtlTurb;
+  const su2double cp;
+  const bool correct;
+  const bool useSA_QCR;
+
+  /*!
+   * \brief Constructor, initialize constants and booleans.
+   */
+  template<class... Ts>
+  CCompressibleViscousFlux(const CConfig& config, int iMesh, Ts&...) :
+    gamma(config.GetGamma()),
+    gasConst(config.GetGas_ConstantND()),
+    prandtlLam(config.GetPrandtl_Lam()),
+    prandtlTurb(config.GetPrandtl_Turb()),
+    cp(gamma * gasConst / (gamma - 1)),
+    correct(iMesh == MESH_0),
+    useSA_QCR(config.GetQCR()) {
+  }
+
+  /*!
+   * \brief Add viscous contributions to flux and jacobians.
+   */
+  template<class PrimVarType, size_t nVar>
+  FORCEINLINE void viscousTerms(Int iEdge,
+                                Int iPoint,
+                                Int jPoint,
+                                const PrimVarType& avgV,
+                                const CPair<PrimVarType>& V,
+                                const CVariable& solution_,
+                                const VectorDbl<nDim>& vector_ij,
+                                const CGeometry& geometry,
+                                const CConfig& config,
+                                Double area,
+                                const VectorDbl<nDim>& unitNormal,
+                                bool implicit,
+                                VectorDbl<nVar>& flux,
+                                MatrixDbl<nVar>& jac_i,
+                                MatrixDbl<nVar>& jac_j) const {
+
+    static_assert(PrimVarType::nVar <= nPrimVar,"");
+
+    const auto& solution = static_cast<const CNSVariable&>(solution_);
+    const auto& gradient = solution.GetGradient_Primitive();
+
+    /*--- Compute distance and handle zero without "ifs" by making it large. ---*/
+
+    auto dist2_ij = squaredNorm(vector_ij);
+    Double mask = dist2_ij < EPS*EPS;
+    dist2_ij += mask / (EPS*EPS);
+
+    /*--- Compute the corrected mean gradient. ---*/
+
+    auto avgGrad = averageGradient<nPrimVarGrad,nDim>(iPoint, jPoint, gradient);
+    if(correct) correctGradient(V, vector_ij, dist2_ij, avgGrad);
+
+    /// TODO: Uncertainty quantification (needs a way to access tke, maybe in ctor).
+
+    /*--- Stress and heat flux tensors. ---*/
+
+    auto tau = stressTensor(avgV, avgGrad);
+    if(useSA_QCR) addQCR(avgGrad, tau);
+
+    Double cond = cp * (avgV.laminarVisc()/prandtlLam + avgV.eddyVisc()/prandtlTurb);
+    VectorDbl<nDim> heatFlux;
+    for (size_t iDim = 0; iDim < nDim; ++iDim) {
+      heatFlux(iDim) = cond * avgGrad(0,iDim);
+    }
+
+    /*--- Projected flux. ---*/
+
+    auto viscFlux = viscousFlux<nVar>(avgV, tau, heatFlux, unitNormal);
+    for (size_t iVar = 0; iVar < nVar; ++iVar) {
+      viscFlux(iVar) *= area;
+      flux(iVar) -= viscFlux(iVar);
+    }
+
+    if (!implicit) return;
+
+    /*--- Flux Jacobians. ---*/
+
+    Double dist_ij = sqrt(dist2_ij);
+    auto dtau = stressTensorJacobian<nVar>(avgV, unitNormal, dist_ij);
+    Double contraction = 0.0;
+    for (size_t iDim = 0; iDim < nDim; ++iDim) {
+      contraction += dtau(iDim,0) * avgV.velocity(iDim);
+    }
+
+    /*--- Energy flux Jacobian. ---*/
+    VectorDbl<nVar> dEdU;
+    Double vel2 = 0.5 * squaredNorm<nDim>(avgV.velocity());
+    Double phi = (gamma-1) / avgV.density();
+    Double RdTdrho = phi*vel2 - avgV.pressure() / pow(avgV.density(),2);
+    Double condOnRd = cond / (gasConst * dist_ij);
+
+    dEdU(0) = area * (condOnRd * RdTdrho - contraction);
+    for (size_t iDim = 0; iDim < nDim; ++iDim) {
+      dEdU(iDim+1) = area * (condOnRd*phi*avgV.velocity(iDim) + dtau(iDim,0));
+    }
+    dEdU(nDim+1) = area * condOnRd * phi;
+
+    /*--- Update momentum and energy terms ("symmetric" part). ---*/
+    for (size_t iDim = 0; iDim < nDim; ++iDim) {
+      for (size_t iVar = 0; iVar < nVar; ++iVar) {
+        jac_i(iDim+1,iVar) -= area * dtau(iDim,iVar);
+        jac_j(iDim+1,iVar) += area * dtau(iDim,iVar);
+      }
+    }
+    for (size_t iVar = 0; iVar < nVar; ++iVar) {
+      jac_i(nDim+1,iVar) += dEdU(iVar);
+      jac_j(nDim+1,iVar) -= dEdU(iVar);
+    }
+    /*--- "Non-symmetric" energy terms. ---*/
+    Double proj = dot<nDim>(&viscFlux(1), avgV.velocity());
+    Double halfOnRho = 0.5/avgV.density();
+    jac_i(nDim+1,0) += halfOnRho * proj;
+    jac_j(nDim+1,0) += halfOnRho * proj;
+    for (size_t iDim = 0; iDim < nDim; ++iDim) {
+      jac_i(nDim+1,iDim+1) -= halfOnRho * viscFlux(iDim+1);
+      jac_j(nDim+1,iDim+1) -= halfOnRho * viscFlux(iDim+1);
+    }
+  }
+
+  /*!
+   * \overload Average primitives if not provided yet.
+   */
+  template<class PrimVarType, class... Ts>
+  FORCEINLINE void viscousTerms(Int iEdge,
+                                Int iPoint,
+                                Int jPoint,
+                                const CPair<PrimVarType>& V,
+                                Ts&... args) const {
+    PrimVarType avgV;
+    for (size_t iVar = 0; iVar < PrimVarType::nVar; ++iVar) {
+      avgV.all(iVar) = 0.5 * (V.i.all(iVar) + V.j.all(iVar));
+    }
+
+    /*--- Continue calculation. ---*/
+    viscousTerms(iEdge, iPoint, jPoint, avgV, V, args...);
+  }
+
+  /*!
+   * \overload Compute the i-j vector if not provided yet.
+   */
+  template<class PrimVarType, class... Ts>
+  FORCEINLINE void viscousTerms(Int iEdge,
+                                Int iPoint,
+                                Int jPoint,
+                                const PrimVarType& avgV,
+                                const CPair<PrimVarType>& V,
+                                const CVariable& solution_,
+                                const CGeometry& geometry,
+                                Ts&... args) const {
+
+    const auto vector_ij = distanceVector<nDim>(iPoint, jPoint, geometry.nodes->GetCoord());
+
+    /*--- Continue calculation. ---*/
+    viscousTerms(iEdge, iPoint, jPoint, avgV, V, solution_, vector_ij, geometry, args...);
+  }
+};
diff --git a/SU2_CFD/include/numerics_simd/flow/variables.hpp b/SU2_CFD/include/numerics_simd/flow/variables.hpp
new file mode 100644
index 00000000000..4b98adf40df
--- /dev/null
+++ b/SU2_CFD/include/numerics_simd/flow/variables.hpp
@@ -0,0 +1,126 @@
+﻿/*!
+ * \file variables.hpp
+ * \brief Collection of types to store physical variables.
+ * \author P. Gomes
+ * \version 7.0.6 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "../CNumericsSIMD.hpp"
+#include "../util.hpp"
+
+/*!
+ * \brief Type to store compressible primitive variables and access them by name.
+ */
+template<size_t nDim_, size_t nVar_>
+struct CCompressiblePrimitives {
+  static constexpr size_t nDim = nDim_;
+  static constexpr size_t nVar = nVar_;
+  VectorDbl<nVar> all;
+  FORCEINLINE Double& temperature() { return all(0); }
+  FORCEINLINE Double& pressure() { return all(nDim+1); }
+  FORCEINLINE Double& density() { return all(nDim+2); }
+  FORCEINLINE Double& enthalpy() { return all(nDim+3); }
+  FORCEINLINE Double& velocity(size_t iDim) { return all(iDim+1); }
+  FORCEINLINE const Double& temperature() const { return all(0); }
+  FORCEINLINE const Double& pressure() const { return all(nDim+1); }
+  FORCEINLINE const Double& density() const { return all(nDim+2); }
+  FORCEINLINE const Double& enthalpy() const { return all(nDim+3); }
+  FORCEINLINE const Double& velocity(size_t iDim) const { return all(iDim+1); }
+  FORCEINLINE const Double* velocity() const { return &velocity(0); }
+
+  /*--- Un-reconstructed variables (not allocated by default). ---*/
+  FORCEINLINE Double& speedSound() { return all(nDim+4); }
+  FORCEINLINE Double& laminarVisc() { return all(nDim+5); }
+  FORCEINLINE Double& eddyVisc() { return all(nDim+6); }
+  FORCEINLINE const Double& speedSound() const { return all(nDim+4); }
+  FORCEINLINE const Double& laminarVisc() const { return all(nDim+5); }
+  FORCEINLINE const Double& eddyVisc() const { return all(nDim+6); }
+};
+
+/*!
+ * \brief Type to store compressible conservative (i.e. solution) variables.
+ */
+template<size_t nDim_>
+struct CCompressibleConservatives {
+  static constexpr size_t nDim = nDim_;
+  static constexpr size_t nVar = nDim+2;
+  VectorDbl<nVar> all;
+
+  FORCEINLINE Double& density() { return all(0); }
+  FORCEINLINE Double& rhoEnergy() { return all(nDim+1); }
+  FORCEINLINE Double& momentum(size_t iDim) { return all(iDim+1); }
+  FORCEINLINE const Double& density() const { return all(0); }
+  FORCEINLINE const Double& rhoEnergy() const { return all(nDim+1); }
+  FORCEINLINE const Double& momentum(size_t iDim) const { return all(iDim+1); }
+
+  FORCEINLINE Double energy() const { return rhoEnergy() / density(); }
+  FORCEINLINE const Double* momentum() const { return &momentum(0); }
+};
+
+/*!
+ * \brief Primitive to conservative conversion.
+ */
+template<size_t nDim, size_t N>
+FORCEINLINE CCompressibleConservatives<nDim> compressibleConservatives(const CCompressiblePrimitives<nDim,N>& V) {
+  CCompressibleConservatives<nDim> U;
+  U.density() = V.density();
+  for (size_t iDim = 0; iDim < nDim; ++iDim) {
+    U.momentum(iDim) = V.density() * V.velocity(iDim);
+  }
+  U.rhoEnergy() = V.density() * V.enthalpy() - V.pressure();
+  return U;
+}
+
+/*!
+ * \brief Roe-averaged variables.
+ */
+template<size_t nDim>
+struct CRoeVariables {
+  Double density;
+  VectorDbl<nDim> velocity;
+  Double enthalpy;
+  Double speedSound;
+  Double projVel;
+};
+
+/*!
+ * \brief Compute Roe-averaged variables from pair of primitive variables.
+ */
+template<size_t nDim, class PrimVarType>
+FORCEINLINE CRoeVariables<nDim> roeAveragedVariables(Double gamma,
+                                                     const CPair<PrimVarType>& V,
+                                                     const VectorDbl<nDim>& normal) {
+  CRoeVariables<nDim> roeAvg;
+  Double R = sqrt(V.j.density() / V.i.density());
+  Double D = 1 / (R+1);
+  roeAvg.density = R * V.i.density();
+  for (size_t iDim = 0; iDim < nDim; ++iDim) {
+    roeAvg.velocity(iDim) = (R*V.j.velocity(iDim) + V.i.velocity(iDim)) * D;
+  }
+  roeAvg.enthalpy = (R*V.j.enthalpy() + V.i.enthalpy()) * D;
+  roeAvg.speedSound = sqrt((gamma-1) * (roeAvg.enthalpy - 0.5*squaredNorm(roeAvg.velocity)));
+  roeAvg.projVel = dot(roeAvg.velocity, normal);
+  return roeAvg;
+}
diff --git a/SU2_CFD/include/numerics_simd/util.hpp b/SU2_CFD/include/numerics_simd/util.hpp
new file mode 100644
index 00000000000..bd05fa11903
--- /dev/null
+++ b/SU2_CFD/include/numerics_simd/util.hpp
@@ -0,0 +1,194 @@
+﻿/*!
+ * \file util.hpp
+ * \brief Generic auxiliary functions.
+ * \author P. Gomes
+ * \version 7.0.6 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "CNumericsSIMD.hpp"
+#include "../../../Common/include/containers/C2DContainer.hpp"
+#include "../../../Common/include/linear_algebra/CSysVector.hpp"
+#include "../../../Common/include/linear_algebra/CSysMatrix.hpp"
+
+/*!
+ * \brief Static vector and matrix types.
+ * \note These should be used instead of C-style arrays.
+ */
+template<class Type, size_t Size>
+using Vector = C2DContainer<unsigned long, Type, StorageType::ColumnMajor, Type::Align, Size, 1>;
+
+template<size_t Size> using VectorInt = Vector<Int, Size>;
+template<size_t Size> using VectorDbl = Vector<Double, Size>;
+
+template<class Type, size_t Rows, size_t Cols>
+using Matrix = C2DContainer<unsigned long, Type, StorageType::RowMajor, Type::Align, Rows, Cols>;
+
+template<size_t Rows, size_t Cols = Rows> using MatrixInt = Matrix<Int, Rows, Cols>;
+template<size_t Rows, size_t Cols = Rows> using MatrixDbl = Matrix<Double, Rows, Cols>;
+
+/*!
+ * \brief Constexpr version of max.
+ */
+inline constexpr size_t Max(size_t a, size_t b) { return a>b? a : b; }
+
+/*!
+ * \brief Simple pair type for i/j variables.
+ */
+template<class T>
+struct CPair {
+  T i, j;
+};
+
+/*!
+ * \brief Dot product.
+ */
+template<size_t nDim, class ForwardIterator, class T>
+FORCEINLINE Double dot(ForwardIterator iterator, const T* ptr) {
+  Double sum = 0.0;
+  for (size_t iDim = 0; iDim < nDim; ++iDim) {
+    sum += *(iterator++) * ptr[iDim];
+  }
+  return sum;
+}
+
+/*!
+ * \overload Dot product.
+ */
+template<size_t nDim, class ForwardIterator>
+FORCEINLINE Double dot(ForwardIterator iterator, const VectorDbl<nDim>& vector) {
+  return dot<nDim>(iterator, vector.data());
+}
+
+/*!
+ * \overload Dot product.
+ */
+template<size_t nDim>
+FORCEINLINE Double dot(const VectorDbl<nDim>& a, const VectorDbl<nDim>& b) {
+  return dot<nDim>(a.data(), b.data());
+}
+
+/*!
+ * \brief Squared norm.
+ */
+template<size_t nDim, class ForwardIterator>
+FORCEINLINE Double squaredNorm(ForwardIterator iterator) {
+  Double sum = 0.0;
+  for (size_t iDim = 0; iDim < nDim; ++iDim) {
+    sum += pow(*(iterator++),2);
+  }
+  return sum;
+}
+
+/*!
+ * \overload Squared norm.
+ */
+template<size_t nDim>
+FORCEINLINE Double squaredNorm(const VectorDbl<nDim>& vector) {
+  return squaredNorm<nDim>(vector.data());
+}
+
+/*!
+ * \brief Vector norm.
+ */
+template<size_t nDim>
+FORCEINLINE Double norm(const VectorDbl<nDim>& vector) { return sqrt(squaredNorm(vector)); }
+
+/*!
+ * \brief Gather a single variable from index iPoint of a 1D container.
+ */
+template<class Container>
+FORCEINLINE Double gatherVariables(Int iPoint, const Container& vars) {
+  auto x = *vars.innerIter(iPoint);
+  AD::SetPreaccIn(x, Double::Size);
+  return x;
+}
+
+/*!
+ * \brief Gather a vector of variables (size nVar) from row iPoint of a 2D container.
+ */
+template<size_t nVar, class Container>
+FORCEINLINE VectorDbl<nVar> gatherVariables(Int iPoint, const Container& vars) {
+  auto x = vars.template get<VectorDbl<nVar> >(iPoint);
+  AD::SetPreaccIn(x, nVar, Double::Size);
+  return x;
+}
+
+/*!
+ * \brief Gather a matrix of variables from outer index iPoint of a 3D container.
+ */
+template<size_t nRows, size_t nCols, class Container>
+FORCEINLINE MatrixDbl<nRows,nCols> gatherVariables(Int iPoint, const Container& vars) {
+  auto x = vars.template get<MatrixDbl<nRows,nCols> >(iPoint);
+  AD::SetPreaccIn(x, nRows, nCols, Double::Size);
+  return x;
+}
+
+/*!
+ * \brief Distance vector, from point i to point j.
+ */
+template<size_t nDim, class Container>
+FORCEINLINE VectorDbl<nDim> distanceVector(Int iPoint, Int jPoint,
+                                           const Container& coords) {
+  auto coord_i = gatherVariables<nDim>(iPoint, coords);
+  auto coord_j = gatherVariables<nDim>(jPoint, coords);
+  VectorDbl<nDim> vector_ij;
+  for (size_t iDim = 0; iDim < nDim; ++iDim) {
+    vector_ij(iDim) = coord_j(iDim) - coord_i(iDim);
+  }
+  return vector_ij;
+}
+
+/*!
+ * \brief Update the matrix and right-hand-side of a linear system.
+ */
+template<size_t nVar>
+FORCEINLINE void updateLinearSystem(Int iEdge,
+                                    Int iPoint,
+                                    Int jPoint,
+                                    bool implicit,
+                                    UpdateType updateType,
+                                    Double updateMask,
+                                    const VectorDbl<nVar>& flux,
+                                    const MatrixDbl<nVar>& jac_i,
+                                    const MatrixDbl<nVar>& jac_j,
+                                    CSysVector<su2double>& vector,
+                                    SparseMatrixType& matrix) {
+  if (updateType == UpdateType::COLORING) {
+    vector.UpdateBlocks(iPoint, jPoint, flux, updateMask);
+    if(implicit) {
+      auto wasActive = AD::BeginPassive();
+      matrix.UpdateBlocks(iEdge, iPoint, jPoint, jac_i, jac_j, updateMask);
+      AD::EndPassive(wasActive);
+    }
+  }
+  else {
+    vector.SetBlock(iEdge, flux, updateMask);
+    if(implicit) {
+      auto wasActive = AD::BeginPassive();
+      matrix.SetBlocks(iEdge, jac_i, jac_j, updateMask);
+      AD::EndPassive(wasActive);
+    }
+  }
+}
diff --git a/SU2_CFD/include/solvers/CEulerSolver.hpp b/SU2_CFD/include/solvers/CEulerSolver.hpp
index 04060acde2c..dd031085bab 100644
--- a/SU2_CFD/include/solvers/CEulerSolver.hpp
+++ b/SU2_CFD/include/solvers/CEulerSolver.hpp
@@ -31,9 +31,8 @@
 #include "../variables/CEulerVariable.hpp"
 
 /*!
- * \class CSolver
- * \brief Main class for defining the PDE solution, it requires
- * a child class for each particular solver (Euler, Navier-Stokes, etc.)
+ * \class CEulerSolver
+ * \brief Class for compressible inviscid flow problems, serves as base for Navier-Stokes/RANS.
  * \author F. Palacios
  */
 class CEulerSolver : public CFVMFlowSolverBase<CEulerVariable, COMPRESSIBLE> {
@@ -167,12 +166,6 @@ class CEulerSolver : public CFVMFlowSolverBase<CEulerVariable, COMPRESSIBLE> {
   template<ENUM_TIME_INT IntegrationType>
   void Explicit_Iteration(CGeometry *geometry, CSolver **solver_container, CConfig *config, unsigned short iRKStep);
 
-  /*!
-   * \brief Sum the edge fluxes for each cell to populate the residual vector, only used on coarse grids.
-   * \param[in] geometry - Geometrical definition of the problem.
-   */
-  void SumEdgeFluxes(CGeometry* geometry);
-
   /*!
    * \brief Preprocessing actions common to the Euler and NS solvers.
    * \param[in] geometry - Geometrical definition of the problem.
diff --git a/SU2_CFD/include/solvers/CFVMFlowSolverBase.hpp b/SU2_CFD/include/solvers/CFVMFlowSolverBase.hpp
index 032c674f2d2..f0960f00d91 100644
--- a/SU2_CFD/include/solvers/CFVMFlowSolverBase.hpp
+++ b/SU2_CFD/include/solvers/CFVMFlowSolverBase.hpp
@@ -30,6 +30,8 @@
 #include "../../../Common/include/toolboxes/geometry_toolbox.hpp"
 #include "CSolver.hpp"
 
+class CNumericsSIMD;
+
 template <class VariableType, ENUM_REGIME FlowRegime>
 class CFVMFlowSolverBase : public CSolver {
  protected:
@@ -178,7 +180,12 @@ class CFVMFlowSolverBase : public CSolver {
 
   CSysVector<su2double> EdgeFluxes; /*!< \brief Flux across each edge. */
 
-  VariableType* nodes = nullptr; /*!< \brief The highest level in the variable hierarchy this solver can safely use. */
+  CNumericsSIMD* edgeNumerics = nullptr; /*!< \brief Object for edge flux computation. */
+
+  /*!
+   * \brief The highest level in the variable hierarchy the DERIVED solver can safely use.
+   */
+  VariableType* nodes = nullptr;
 
   /*!
    * \brief Return nodes to allow CSolver::base_nodes to be set.
@@ -221,6 +228,16 @@ class CFVMFlowSolverBase : public CSolver {
    */
   su2double EvaluateCommonObjFunc(const CConfig& config) const;
 
+  /*!
+   * \brief Method to compute convective and viscous residual contribution using vectorized numerics.
+   */
+  void EdgeFluxResidual(const CGeometry *geometry, const CConfig *config);
+
+  /*!
+   * \brief Sum the edge fluxes for each cell to populate the residual vector, only used on coarse grids.
+   */
+  void SumEdgeFluxes(const CGeometry* geometry);
+
   /*!
    * \brief Destructor.
    */
diff --git a/SU2_CFD/include/solvers/CFVMFlowSolverBase.inl b/SU2_CFD/include/solvers/CFVMFlowSolverBase.inl
index b215788bc66..0c6b6cc9160 100644
--- a/SU2_CFD/include/solvers/CFVMFlowSolverBase.inl
+++ b/SU2_CFD/include/solvers/CFVMFlowSolverBase.inl
@@ -29,6 +29,7 @@
 #include "../gradients/computeGradientsGreenGauss.hpp"
 #include "../gradients/computeGradientsLeastSquares.hpp"
 #include "../limiters/computeLimiters.hpp"
+#include "../numerics_simd/CNumericsSIMD.hpp"
 #include "CFVMFlowSolverBase.hpp"
 
 template <class V, ENUM_REGIME R>
@@ -101,9 +102,9 @@ void CFVMFlowSolverBase<V, R>::Allocate(const CConfig& config) {
 
   /*--- Define some auxiliar vector related with the undivided lapalacian computation ---*/
 
-  if (config.GetKind_ConvNumScheme_Flow() == SPACE_CENTERED) {
-    iPoint_UndLapl = new su2double[nPoint];
-    jPoint_UndLapl = new su2double[nPoint];
+  if ((config.GetKind_ConvNumScheme_Flow() == SPACE_CENTERED) && (MGLevel == MESH_0)) {
+    iPoint_UndLapl = new su2double[nPointDomain];
+    jPoint_UndLapl = new su2double[nPointDomain];
   }
 
   /*--- Initialize the solution and right hand side vectors for storing
@@ -223,9 +224,8 @@ void CFVMFlowSolverBase<V, R>::Allocate(const CConfig& config) {
   /*--- Only initialize when there is a Marker_Fluid_Load defined
    *--- (this avoids overhead in all other cases while a more permanent structure is being developed) ---*/
   if ((config.GetnMarker_Fluid_Load() > 0) && (MGLevel == MESH_0)) {
-    InitVertexTractionContainer();
-
-    if (config.GetDiscrete_Adjoint()) InitVertexTractionAdjointContainer();
+    Alloc3D(nMarker, nVertex, nDim, VertexTraction);
+    if (config.GetDiscrete_Adjoint()) Alloc3D(nMarker, nVertex, nDim, VertexTractionAdjoint);
   }
 
   /*--- Initialize the BGS residuals in FSI problems. ---*/
@@ -302,7 +302,7 @@ void CFVMFlowSolverBase<V, R>::CommunicateInitialState(CGeometry* geometry, cons
   InitiateComms(geometry, config, SOLUTION);
   CompleteComms(geometry, config, SOLUTION);
 
-  /* Store the initial CFL number for all grid points. */
+  /*--- Store the initial CFL number for all grid points. ---*/
 
   const auto CFL = config->GetCFL(MGLevel);
   for (auto iPoint = 0ul; iPoint < nPoint; iPoint++) {
@@ -354,6 +354,12 @@ void CFVMFlowSolverBase<V, R>::HybridParallelInitialization(const CConfig& confi
            << "         Those ranks will now use a fallback strategy, better performance may be possible\n"
            << "         with a different value of config option EDGE_COLORING_GROUP_SIZE (default 512)." << endl;
     }
+
+    if (config.GetUseVectorization() && (omp_get_max_threads() > 1) &&
+        (config.GetEdgeColoringGroupSize() % Double::Size != 0)) {
+      SU2_MPI::Error("When using vectorization, the EDGE_COLORING_GROUP_SIZE must be divisible "
+                     "by the SIMD length (2, 4, or 8).", CURRENT_FUNCTION);
+    }
   }
 
   if (ReducerStrategy) EdgeFluxes.Initialize(geometry.GetnEdge(), geometry.GetnEdge(), nVar, nullptr);
@@ -1134,7 +1140,7 @@ void CFVMFlowSolverBase<V, FlowRegime>::BC_Fluid_Interface(CGeometry* geometry,
             /*--- Accumulate the residuals to compute the average ---*/
 
             for (iVar = 0; iVar < nVar; iVar++) {
-              Residual[iVar] += weight * residual.residual[iVar];
+              Residual[iVar] += weight * residual[iVar];
               for (jVar = 0; jVar < nVar; jVar++) Jacobian_i[iVar][jVar] += weight * residual.jacobian_i[iVar][jVar];
             }
           }
@@ -1191,7 +1197,7 @@ void CFVMFlowSolverBase<V, FlowRegime>::BC_Fluid_Interface(CGeometry* geometry,
               /*--- Accumulate the residuals to compute the average ---*/
 
               for (iVar = 0; iVar < nVar; iVar++) {
-                Residual[iVar] += weight * residual.residual[iVar];
+                Residual[iVar] += weight * residual[iVar];
                 for (jVar = 0; jVar < nVar; jVar++) Jacobian_i[iVar][jVar] += weight * residual.jacobian_i[iVar][jVar];
               }
             }
@@ -1273,6 +1279,55 @@ void CFVMFlowSolverBase<V, R>::BC_Custom(CGeometry* geometry, CSolver** solver_c
   }
 }
 
+template <class V, ENUM_REGIME R>
+void CFVMFlowSolverBase<V, R>::EdgeFluxResidual(const CGeometry *geometry, const CConfig *config) {
+
+  /*--- Loop over edge colors. ---*/
+  for (auto color : EdgeColoring) {
+    /*--- Chunk size is at least OMP_MIN_SIZE and a multiple of the color group size. ---*/
+    SU2_OMP_FOR_DYN(nextMultiple(OMP_MIN_SIZE, color.groupSize))
+    for(auto k = 0ul; k < color.size; k += Double::Size) {
+      Int iEdge;
+      Double mask;
+      for (auto j = 0ul; j < Double::Size; ++j) {
+        bool in = (k+j < color.size);
+        mask[j] = in;
+        iEdge[j] = color.indices[k+j*in];
+      }
+
+      if (ReducerStrategy) {
+        edgeNumerics->ComputeFlux(iEdge, *config, *geometry, *nodes, UpdateType::REDUCTION, mask, EdgeFluxes, Jacobian);
+      } else {
+        edgeNumerics->ComputeFlux(iEdge, *config, *geometry, *nodes, UpdateType::COLORING, mask, LinSysRes, Jacobian);
+      }
+    }
+  }
+
+  if (ReducerStrategy) {
+    SumEdgeFluxes(geometry);
+    if (config->GetKind_TimeIntScheme() == EULER_IMPLICIT) {
+      Jacobian.SetDiagonalAsColumnSum();
+    }
+  }
+}
+
+template <class V, ENUM_REGIME R>
+void CFVMFlowSolverBase<V, R>::SumEdgeFluxes(const CGeometry* geometry) {
+
+  SU2_OMP_FOR_STAT(omp_chunk_size)
+  for (unsigned long iPoint = 0; iPoint < nPoint; ++iPoint) {
+
+    LinSysRes.SetBlock_Zero(iPoint);
+
+    for (auto iEdge : geometry->nodes->GetEdges(iPoint)) {
+      if (iPoint == geometry->edges->GetNode(iEdge,0))
+        LinSysRes.AddBlock(iPoint, EdgeFluxes.GetBlock(iEdge));
+      else
+        LinSysRes.SubtractBlock(iPoint, EdgeFluxes.GetBlock(iEdge));
+    }
+  }
+}
+
 template <class V, ENUM_REGIME FlowRegime>
 void CFVMFlowSolverBase<V, FlowRegime>::Pressure_Forces(const CGeometry* geometry, const CConfig* config) {
   unsigned long iVertex, iPoint;
@@ -1759,7 +1814,7 @@ void CFVMFlowSolverBase<V, FlowRegime>::Momentum_Forces(const CGeometry* geometr
 
           /*--- Moment with respect to the reference axis ---*/
 
-          if (iDim == 3) {
+          if (nDim == 3) {
             MomentMomentum[0] += (Force[2] * MomentDist[1] - Force[1] * MomentDist[2]) / RefLength;
             MomentX_Force[1] += (-Force[1] * Coord[2]);
             MomentX_Force[2] += (Force[2] * Coord[1]);
@@ -2255,7 +2310,7 @@ void CFVMFlowSolverBase<V, FlowRegime>::Friction_Forces(const CGeometry* geometr
 
         /*--- Moment with respect to the reference axis ---*/
 
-        if (iDim == 3) {
+        if (nDim == 3) {
           MomentViscous[0] += (Force[2] * MomentDist[1] - Force[1] * MomentDist[2]) / RefLength;
           MomentX_Force[1] += (-Force[1] * Coord[2]);
           MomentX_Force[2] += (Force[2] * Coord[1]);
diff --git a/SU2_CFD/include/solvers/CMeshSolver.hpp b/SU2_CFD/include/solvers/CMeshSolver.hpp
index 54f4ec8e7b4..0f57900efbc 100644
--- a/SU2_CFD/include/solvers/CMeshSolver.hpp
+++ b/SU2_CFD/include/solvers/CMeshSolver.hpp
@@ -29,6 +29,7 @@
 #pragma once
 
 #include "CFEASolver.hpp"
+#include "../variables/CMeshBoundVariable.hpp"
 #include "../variables/CMeshElement.hpp"
 
 class CMeshSolver final : public CFEASolver {
@@ -133,7 +134,7 @@ class CMeshSolver final : public CFEASolver {
   inline su2double Get_ValCoord(const CGeometry*,
                                 unsigned long indexNode,
                                 unsigned short iDim) const override {
-    return nodes->GetMesh_Coord(indexNode,iDim);
+    return static_cast<const CMeshBoundVariable*>(nodes)->GetMesh_Coord(indexNode,iDim);
   }
 
   /*!
diff --git a/SU2_CFD/include/solvers/CSolver.hpp b/SU2_CFD/include/solvers/CSolver.hpp
index de43ad20df4..c1c2eaaea02 100644
--- a/SU2_CFD/include/solvers/CSolver.hpp
+++ b/SU2_CFD/include/solvers/CSolver.hpp
@@ -4456,40 +4456,6 @@ class CSolver {
    */
   inline virtual void ComputeVerificationError(CGeometry *geometry, CConfig *config) { }
 
-  /*!
-   * \brief Initialize the vertex traction containers at the vertices.
-   */
-  inline void InitVertexTractionContainer() {
-
-    unsigned long iVertex;
-    unsigned short iMarker;
-
-    VertexTraction = new su2double** [nMarker];
-    for (iMarker = 0; iMarker < nMarker; iMarker++) {
-      VertexTraction[iMarker] = new su2double* [nVertex[iMarker]];
-      for (iVertex = 0; iVertex < nVertex[iMarker]; iVertex++) {
-        VertexTraction[iMarker][iVertex] = new su2double [nDim]();
-      }
-    }
-  }
-
-  /*!
-   * \brief Initialize the adjoint vertex traction containers at the vertices.
-   */
-  inline void InitVertexTractionAdjointContainer() {
-
-    unsigned long iVertex;
-    unsigned short iMarker;
-
-    VertexTractionAdjoint = new su2double** [nMarker];
-    for (iMarker = 0; iMarker < nMarker; iMarker++) {
-      VertexTractionAdjoint[iMarker] = new su2double* [nVertex[iMarker]];
-      for (iVertex = 0; iVertex < nVertex[iMarker]; iVertex++) {
-        VertexTractionAdjoint[iMarker][iVertex] = new su2double [nDim]();
-      }
-    }
-  }
-
   /*!
    * \brief Compute the tractions at the vertices.
    * \param[in] geometry - Geometrical definition.
diff --git a/SU2_CFD/include/variables/CDiscAdjFEABoundVariable.hpp b/SU2_CFD/include/variables/CDiscAdjFEABoundVariable.hpp
index 8f7d59bab04..443fdda06c9 100644
--- a/SU2_CFD/include/variables/CDiscAdjFEABoundVariable.hpp
+++ b/SU2_CFD/include/variables/CDiscAdjFEABoundVariable.hpp
@@ -6,7 +6,7 @@
  *
  * SU2 Project Website: https://su2code.github.io
  *
- * The SU2 Project is maintained by the SU2 Foundation 
+ * The SU2 Project is maintained by the SU2 Foundation
  * (http://su2foundation.org)
  *
  * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
@@ -28,7 +28,7 @@
 #pragma once
 
 #include "CDiscAdjFEAVariable.hpp"
-#include "../../../Common/include/toolboxes/CVertexMap.hpp"
+#include "../../../Common/include/containers/CVertexMap.hpp"
 
 /*!
  * \class CDiscAdjFEABoundVariable
diff --git a/SU2_CFD/include/variables/CDiscAdjMeshBoundVariable.hpp b/SU2_CFD/include/variables/CDiscAdjMeshBoundVariable.hpp
index 59e9611f6bb..a82e726ee36 100644
--- a/SU2_CFD/include/variables/CDiscAdjMeshBoundVariable.hpp
+++ b/SU2_CFD/include/variables/CDiscAdjMeshBoundVariable.hpp
@@ -29,7 +29,7 @@
 #pragma once
 
 #include "CVariable.hpp"
-#include "../../../Common/include/toolboxes/CVertexMap.hpp"
+#include "../../../Common/include/containers/CVertexMap.hpp"
 
 class CDiscAdjMeshBoundVariable final : public CVariable {
 private:
diff --git a/SU2_CFD/include/variables/CEulerVariable.hpp b/SU2_CFD/include/variables/CEulerVariable.hpp
index 583f287dfbc..49b9b9b9e56 100644
--- a/SU2_CFD/include/variables/CEulerVariable.hpp
+++ b/SU2_CFD/include/variables/CEulerVariable.hpp
@@ -122,6 +122,7 @@ class CEulerVariable : public CVariable {
    * \return Primitive variables limiter for the entire domain.
    */
   inline MatrixType& GetLimiter_Primitive(void) {return Limiter_Primitive; }
+  inline const MatrixType& GetLimiter_Primitive(void) const {return Limiter_Primitive; }
 
   /*!
    * \brief Get the value of the primitive variables gradient.
@@ -154,12 +155,14 @@ class CEulerVariable : public CVariable {
    * \return Reference to primitive variable gradient.
    */
   inline CVectorOfMatrix& GetGradient_Primitive(void) { return Gradient_Primitive; }
+  inline const CVectorOfMatrix& GetGradient_Primitive(void) const { return Gradient_Primitive; }
 
   /*!
    * \brief Get the reconstruction gradient for primitive variable at all points.
    * \return Reference to variable reconstruction gradient.
    */
   inline CVectorOfMatrix& GetGradient_Reconstruction(void) final { return Gradient_Reconstruction; }
+  inline const CVectorOfMatrix& GetGradient_Reconstruction(void) const { return Gradient_Reconstruction; }
 
   /*!
    * \brief Get the value of the primitive variables gradient.
diff --git a/SU2_CFD/include/variables/CFEABoundVariable.hpp b/SU2_CFD/include/variables/CFEABoundVariable.hpp
index 8f35a186030..15b9ab4f069 100644
--- a/SU2_CFD/include/variables/CFEABoundVariable.hpp
+++ b/SU2_CFD/include/variables/CFEABoundVariable.hpp
@@ -6,7 +6,7 @@
  *
  * SU2 Project Website: https://su2code.github.io
  *
- * The SU2 Project is maintained by the SU2 Foundation 
+ * The SU2 Project is maintained by the SU2 Foundation
  * (http://su2foundation.org)
  *
  * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
@@ -28,7 +28,7 @@
 #pragma once
 
 #include "CFEAVariable.hpp"
-#include "../../../Common/include/toolboxes/CVertexMap.hpp"
+#include "../../../Common/include/containers/CVertexMap.hpp"
 
 /*!
  * \class CFEABoundVariable
diff --git a/SU2_CFD/include/variables/CMeshBoundVariable.hpp b/SU2_CFD/include/variables/CMeshBoundVariable.hpp
index 21e2dd5481d..ab6c59a5e02 100644
--- a/SU2_CFD/include/variables/CMeshBoundVariable.hpp
+++ b/SU2_CFD/include/variables/CMeshBoundVariable.hpp
@@ -7,7 +7,7 @@
  *
  * SU2 Project Website: https://su2code.github.io
  *
- * The SU2 Project is maintained by the SU2 Foundation 
+ * The SU2 Project is maintained by the SU2 Foundation
  * (http://su2foundation.org)
  *
  * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
@@ -29,7 +29,7 @@
 #pragma once
 
 #include "CMeshVariable.hpp"
-#include "../../../Common/include/toolboxes/CVertexMap.hpp"
+#include "../../../Common/include/containers/CVertexMap.hpp"
 
 class CMeshBoundVariable final : public CMeshVariable {
 private:
diff --git a/SU2_CFD/include/variables/CNSVariable.hpp b/SU2_CFD/include/variables/CNSVariable.hpp
index b752b97994e..69ebafdb577 100644
--- a/SU2_CFD/include/variables/CNSVariable.hpp
+++ b/SU2_CFD/include/variables/CNSVariable.hpp
@@ -6,7 +6,7 @@
  *
  * SU2 Project Website: https://su2code.github.io
  *
- * The SU2 Project is maintained by the SU2 Foundation 
+ * The SU2 Project is maintained by the SU2 Foundation
  * (http://su2foundation.org)
  *
  * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
@@ -220,6 +220,7 @@ class CNSVariable final : public CEulerVariable {
    * \return Value of the Roe Dissipation.
    */
   inline su2double GetRoe_Dissipation(unsigned long iPoint) const override { return Roe_Dissipation(iPoint); }
+  inline const VectorType& GetRoe_Dissipation() const { return Roe_Dissipation; }
 
   /*!
    * \brief Set the Roe Dissipation Coefficient.
diff --git a/SU2_CFD/include/variables/CVariable.hpp b/SU2_CFD/include/variables/CVariable.hpp
index 3eefb3dfd36..7f4845289b4 100644
--- a/SU2_CFD/include/variables/CVariable.hpp
+++ b/SU2_CFD/include/variables/CVariable.hpp
@@ -36,7 +36,7 @@
 #include <cstdlib>
 
 #include "../../../Common/include/CConfig.hpp"
-#include "../../../Common/include/toolboxes/C2DContainer.hpp"
+#include "../../../Common/include/containers/container_decorators.hpp"
 
 class CFluidModel;
 class CNEMOGas;
@@ -71,7 +71,7 @@ class CVariable {
   VectorType Delta_Time;         /*!< \brief Time step. */
 
   CVectorOfMatrix Gradient;  /*!< \brief Gradient of the solution of the problem. */
-  CVectorOfMatrix Rmatrix;   /*!< \brief Geometry-based matrix for weighted least squares gradient calculations. */
+  C3DDoubleMatrix Rmatrix;   /*!< \brief Geometry-based matrix for weighted least squares gradient calculations. */
 
   MatrixType Limiter;        /*!< \brief Limiter of the solution of the problem. */
   MatrixType Solution_Max;   /*!< \brief Max solution for limiter computation. */
@@ -758,18 +758,11 @@ class CVariable {
    */
   inline su2double GetRmatrix(unsigned long iPoint, unsigned long iDim, unsigned long jDim) const { return Rmatrix(iPoint,iDim,jDim); }
 
-  /*!
-   * \brief Get the value of the Rmatrix entry for least squares gradient calculations.
-   * \param[in] iPoint - Point index.
-   * \return Value of the Rmatrix entry.
-   */
-  inline su2double **GetRmatrix(unsigned long iPoint) { return Rmatrix[iPoint]; }
-
   /*!
    * \brief Get the value Rmatrix for the entire domain.
    * \return Reference to the Rmatrix.
    */
-  inline CVectorOfMatrix& GetRmatrix(void) { return Rmatrix; }
+  inline C3DDoubleMatrix& GetRmatrix(void) { return Rmatrix; }
 
   /*!
    * \brief Set the value of the limiter.
@@ -1014,6 +1007,7 @@ class CVariable {
    * \return Value of the spectral radius.
    */
   inline su2double GetLambda(unsigned long iPoint) const { return Lambda(iPoint); }
+  inline const VectorType& GetLambda() const { return Lambda; }
 
   /*!
    * \brief Get the value of the spectral radius.
@@ -1044,6 +1038,7 @@ class CVariable {
    * \return Value of the pressure sensor.
    */
   inline su2double GetSensor(unsigned long iPoint) const { return Sensor(iPoint); }
+  inline const VectorType& GetSensor() const { return Sensor; }
 
   /*!
    * \brief Get the pressure sensor.
@@ -1112,6 +1107,7 @@ class CVariable {
    * \return Value of the undivided laplacian vector.
    */
   inline su2double GetUndivided_Laplacian(unsigned long iPoint, unsigned long iVar) const { return Undivided_Laplacian(iPoint, iVar); }
+  inline const MatrixType& GetUndivided_Laplacian() const { return Undivided_Laplacian; }
 
   /*!
    * \brief A virtual member.
diff --git a/SU2_CFD/obj/Makefile.am b/SU2_CFD/obj/Makefile.am
index 49960de52a1..30acd7fa720 100644
--- a/SU2_CFD/obj/Makefile.am
+++ b/SU2_CFD/obj/Makefile.am
@@ -112,6 +112,7 @@ libSU2Core_sources = ../src/definition_structure.cpp \
   ../src/numerics/elasticity/CFEALinearElasticity.cpp \
   ../src/numerics/elasticity/CFEANonlinearElasticity.cpp \
   ../src/numerics/elasticity/nonlinear_models.cpp \
+  ../include/numerics_simd/CNumericsSIMD.cpp \
   ../src/numerics/NEMO/NEMO_diffusion.cpp \
   ../src/numerics/NEMO/NEMO_sources.cpp \
   ../src/numerics/NEMO/convection/ausm.cpp \
diff --git a/SU2_CFD/src/drivers/CDiscAdjSinglezoneDriver.cpp b/SU2_CFD/src/drivers/CDiscAdjSinglezoneDriver.cpp
index a755cd4e89c..abc08f0f067 100644
--- a/SU2_CFD/src/drivers/CDiscAdjSinglezoneDriver.cpp
+++ b/SU2_CFD/src/drivers/CDiscAdjSinglezoneDriver.cpp
@@ -286,6 +286,11 @@ void CDiscAdjSinglezoneDriver::SetRecording(unsigned short kind_recording){
 
   SetObjFunction();
 
+  if (rank == MASTER_NODE && kind_recording != NONE && config_container[ZONE_0]->GetWrt_AD_Statistics()) {
+    AD::PrintStatistics();
+    cout << "-------------------------------------------------------------------------\n" << endl;
+  }
+
   AD::StopRecording();
 
 }
diff --git a/SU2_CFD/src/drivers/CDriver.cpp b/SU2_CFD/src/drivers/CDriver.cpp
index 2ce7575a02e..7725bb7a18e 100644
--- a/SU2_CFD/src/drivers/CDriver.cpp
+++ b/SU2_CFD/src/drivers/CDriver.cpp
@@ -1635,6 +1635,11 @@ void CDriver::Numerics_Preprocessing(CConfig *config, CGeometry **geometry, CSol
             case LAX : numerics[MESH_0][FLOW_SOL][conv_term] = new CCentLax_Flow(nDim, nVar_Flow, config); break;
             case JST : numerics[MESH_0][FLOW_SOL][conv_term] = new CCentJST_Flow(nDim, nVar_Flow, config); break;
             case JST_KE : numerics[MESH_0][FLOW_SOL][conv_term] = new CCentJST_KE_Flow(nDim, nVar_Flow, config); break;
+            case JST_MAT :
+              if (!config->GetUseVectorization()) {
+                SU2_MPI::Error("JST with matrix dissipation requires USE_VECTORIZATION=YES.", CURRENT_FUNCTION);
+              }
+              break;
             default:
               SU2_OMP_MASTER
               SU2_MPI::Error("Invalid centered scheme or not implemented.", CURRENT_FUNCTION);
diff --git a/SU2_CFD/src/meson.build b/SU2_CFD/src/meson.build
index 6a166be4a5c..d6eeb45630e 100644
--- a/SU2_CFD/src/meson.build
+++ b/SU2_CFD/src/meson.build
@@ -139,6 +139,8 @@ su2_cfd_src += files(['numerics/CNumerics.cpp',
                       'numerics/elasticity/CFEANonlinearElasticity.cpp',
                       'numerics/elasticity/nonlinear_models.cpp'])
 
+su2_cfd_src += files(['../include/numerics_simd/CNumericsSIMD.cpp'])
+
 su2_cfd_src += files(['interfaces/CInterface.cpp',
                       'interfaces/cfd/CConservativeVarsInterface.cpp',
                       'interfaces/cfd/CMixingPlaneInterface.cpp',
diff --git a/SU2_CFD/src/output/CElasticityOutput.cpp b/SU2_CFD/src/output/CElasticityOutput.cpp
index c6a1f094673..0a059eedf47 100644
--- a/SU2_CFD/src/output/CElasticityOutput.cpp
+++ b/SU2_CFD/src/output/CElasticityOutput.cpp
@@ -72,6 +72,7 @@ CElasticityOutput::CElasticityOutput(CConfig *config, unsigned short nDim) : COu
     requestedVolumeFields.emplace_back("COORDINATES");
     requestedVolumeFields.emplace_back("SOLUTION");
     requestedVolumeFields.emplace_back("STRESS");
+    if (config->GetTopology_Optimization()) requestedVolumeFields.emplace_back("TOPOLOGY");
     nRequestedVolumeFields = requestedVolumeFields.size();
   }
 
@@ -198,6 +199,9 @@ void CElasticityOutput::LoadVolumeData(CConfig *config, CGeometry *geometry, CSo
   }
   SetVolumeOutputValue("VON_MISES_STRESS", iPoint, Node_Struc->GetVonMises_Stress(iPoint));
 
+  if (config->GetTopology_Optimization()) {
+    SetVolumeOutputValue("TOPOL_DENSITY", iPoint, Node_Struc->GetAuxVar(iPoint));
+  }
 }
 
 void CElasticityOutput::SetVolumeOutputFields(CConfig *config){
@@ -234,7 +238,11 @@ void CElasticityOutput::SetVolumeOutputFields(CConfig *config){
 
   AddVolumeOutput("VON_MISES_STRESS", "Von_Mises_Stress", "STRESS", "von-Mises stress");
 
+  if (config->GetTopology_Optimization()) {
+    AddVolumeOutput("TOPOL_DENSITY", "Topology_Density", "TOPOLOGY", "filtered topology density");
+  }
 }
+
 bool CElasticityOutput::SetInit_Residuals(CConfig *config){
 
   return (config->GetTime_Domain() == NO && (curInnerIter  == 0));
diff --git a/SU2_CFD/src/solvers/CAdjNSSolver.cpp b/SU2_CFD/src/solvers/CAdjNSSolver.cpp
index afd8306ec3c..67dc15be987 100644
--- a/SU2_CFD/src/solvers/CAdjNSSolver.cpp
+++ b/SU2_CFD/src/solvers/CAdjNSSolver.cpp
@@ -1278,7 +1278,7 @@ void CAdjNSSolver::BC_HeatFlux_Wall(CGeometry *geometry, CSolver **solver_contai
         nodes->SetSolution_Old(iPoint,iDim+1, phi[iDim]);
 
       for (iDim = 0; iDim < nDim; iDim++)
-        LinSysRes.SetBlock_Zero(iPoint, iDim+1);
+        LinSysRes(iPoint, iDim+1) = 0.0;
       nodes->SetVel_ResTruncError_Zero(iPoint);
 
       /*--- Compute additional contributions to the adjoint density and energy
@@ -1656,7 +1656,7 @@ void CAdjNSSolver::BC_Isothermal_Wall(CGeometry *geometry, CSolver **solver_cont
 
       /*--- Strong BC imposition for the adjoint velocity equations ---*/
       for (iDim = 0; iDim < nDim; iDim++)
-        LinSysRes.SetBlock_Zero(iPoint, iDim+1);
+        LinSysRes(iPoint, iDim+1) = 0.0;
       nodes->SetVel_ResTruncError_Zero(iPoint);
       for (iDim = 0; iDim < nDim; iDim++)
         nodes->SetSolution_Old(iPoint,iDim+1, phi[iDim]);
@@ -1700,7 +1700,7 @@ void CAdjNSSolver::BC_Isothermal_Wall(CGeometry *geometry, CSolver **solver_cont
       }
 
       /*--- Strong BC enforcement of the energy equation ---*/
-      LinSysRes.SetBlock_Zero(iPoint, nVar-1);
+      LinSysRes(iPoint, nVar-1) = 0.0;
       nodes->SetEnergy_ResTruncError_Zero(iPoint);
       nodes->SetSolution_Old(iPoint,nDim+1, q);
       if (implicit) {
diff --git a/SU2_CFD/src/solvers/CEulerSolver.cpp b/SU2_CFD/src/solvers/CEulerSolver.cpp
index 9b2a6255bca..d3ddb94943b 100644
--- a/SU2_CFD/src/solvers/CEulerSolver.cpp
+++ b/SU2_CFD/src/solvers/CEulerSolver.cpp
@@ -32,6 +32,7 @@
 #include "../../include/fluid/CIdealGas.hpp"
 #include "../../include/fluid/CVanDerWaalsGas.hpp"
 #include "../../include/fluid/CPengRobinson.hpp"
+#include "../../include/numerics_simd/CNumericsSIMD.hpp"
 
 
 CEulerSolver::CEulerSolver(CGeometry *geometry, CConfig *config,
@@ -51,24 +52,23 @@ CEulerSolver::CEulerSolver(CGeometry *geometry, CConfig *config,
     nSecVar = 2;
   }
 
+  const auto nZone = geometry->GetnZone();
+  const bool restart = (config->GetRestart() || config->GetRestart_Flow());
+  const bool rans = (config->GetKind_Turb_Model() != NONE);
+  const auto direct_diff = config->GetDirectDiff();
+  const bool dual_time = (config->GetTime_Marching() == DT_STEPPING_1ST) ||
+                         (config->GetTime_Marching() == DT_STEPPING_2ND);
+  const bool time_stepping = (config->GetTime_Marching() == TIME_STEPPING);
+  const bool adjoint = config->GetContinuous_Adjoint() || config->GetDiscrete_Adjoint();
+
+  int Unst_RestartIter = 0;
   unsigned long iPoint, counter_local = 0, counter_global = 0;
   unsigned short iDim, iMarker, nLineLets;
   su2double StaticEnergy, Density, Velocity2, Pressure, Temperature;
-  unsigned short nZone = geometry->GetnZone();
-  bool restart = (config->GetRestart() || config->GetRestart_Flow());
-  bool rans = (config->GetKind_Turb_Model() != NONE);
-  unsigned short direct_diff = config->GetDirectDiff();
-  int Unst_RestartIter = 0;
-  bool dual_time = (config->GetTime_Marching() == DT_STEPPING_1ST) ||
-                   (config->GetTime_Marching() == DT_STEPPING_2ND);
-  bool time_stepping = (config->GetTime_Marching() == TIME_STEPPING);
 
   /*--- A grid is defined as dynamic if there's rigid grid movement or grid deformation AND the problem is time domain ---*/
   dynamic_grid = config->GetDynamic_Grid();
 
-  bool adjoint = (config->GetContinuous_Adjoint()) || (config->GetDiscrete_Adjoint());
-  string filename_ = "flow";
-
   /*--- Store the multigrid level. ---*/
   MGLevel = iMesh;
 
@@ -94,6 +94,7 @@ CEulerSolver::CEulerSolver(CGeometry *geometry, CConfig *config,
       else Unst_RestartIter = SU2_TYPE::Int(config->GetRestart_Iter())-1;
     }
 
+    string filename_ = "flow";
     filename_ = config->GetFilename(filename_, ".meta", Unst_RestartIter);
 
     /*--- Read and store the restart metadata. ---*/
@@ -345,9 +346,29 @@ CEulerSolver::CEulerSolver(CGeometry *geometry, CConfig *config,
 
   CommunicateInitialState(geometry, config);
 
-  /*--- Add the solver name (max 8 characters) ---*/
+  /*--- Add the solver name (max 8 characters). ---*/
   SolverName = "C.FLOW";
 
+  /*--- Vectorized numerics. ---*/
+  if (config->GetUseVectorization()) {
+    const bool uncertain = config->GetUsing_UQ();
+    const bool ideal_gas = (config->GetKind_FluidModel() == STANDARD_AIR) ||
+                           (config->GetKind_FluidModel() == IDEAL_GAS);
+    const bool low_mach_corr = config->Low_Mach_Correction();
+
+    if (uncertain || !ideal_gas || low_mach_corr) {
+      SU2_MPI::Error("Some of the requested features are not yet "
+                     "supported with vectorization.", CURRENT_FUNCTION);
+    }
+
+    edgeNumerics = CNumericsSIMD::CreateNumerics(*config, nDim, iMesh);
+
+    if (!edgeNumerics) {
+      SU2_MPI::Error("The numerical scheme in use does not "
+                     "support vectorization.", CURRENT_FUNCTION);
+    }
+  }
+
   /*--- Finally, check that the static arrays will be large enough (keep this
    *    check at the bottom to make sure we consider the "final" values). ---*/
   if((nDim > MAXNDIM) || (nPrimVar > MAXNVAR) || (nSecondaryVar > MAXNVAR))
@@ -2205,6 +2226,7 @@ void CEulerSolver::CommonPreprocessing(CGeometry *geometry, CSolver **solver_con
                           (cont_adjoint && config->GetKind_ConvNumScheme_AdjFlow() == SPACE_CENTERED);
   bool center_jst       = (config->GetKind_Centered_Flow() == JST) && (iMesh == MESH_0);
   bool center_jst_ke    = (config->GetKind_Centered_Flow() == JST_KE) && (iMesh == MESH_0);
+  bool center_jst_mat   = (config->GetKind_Centered_Flow() == JST_MAT) && (iMesh == MESH_0);
   bool engine           = ((config->GetnMarker_EngineInflow() != 0) || (config->GetnMarker_EngineExhaust() != 0));
   bool actuator_disk    = ((config->GetnMarker_ActDiskInlet() != 0) || (config->GetnMarker_ActDiskOutlet() != 0));
   bool nearfield        = (config->GetnMarker_NearFieldBound() != 0);
@@ -2274,9 +2296,11 @@ void CEulerSolver::CommonPreprocessing(CGeometry *geometry, CSolver **solver_con
   /*--- Artificial dissipation ---*/
 
   if (center && !Output) {
-    SetMax_Eigenvalue(geometry, config);
-    if (center_jst) SetUndivided_Laplacian(geometry, config);
-    if (center_jst || center_jst_ke) SetCentered_Dissipation_Sensor(geometry, config);
+    if (!center_jst_mat) SetMax_Eigenvalue(geometry, config);
+    if (center_jst || center_jst_ke || center_jst_mat) {
+      SetCentered_Dissipation_Sensor(geometry, config);
+      if (!center_jst_ke) SetUndivided_Laplacian(geometry, config);
+    }
   }
 
   /*--- Roe Low Dissipation Sensor ---*/
@@ -2621,6 +2645,9 @@ void CEulerSolver::SetTime_Step(CGeometry *geometry, CSolver **solver_container,
 void CEulerSolver::Centered_Residual(CGeometry *geometry, CSolver **solver_container, CNumerics **numerics_container,
                                      CConfig *config, unsigned short iMesh, unsigned short iRKStep) {
 
+  /*--- If possible use the vectorized numerics instead. ---*/
+  if (edgeNumerics) { EdgeFluxResidual(geometry, config); return; }
+
   const bool implicit = (config->GetKind_TimeIntScheme() == EULER_IMPLICIT);
   const bool jst_scheme = (config->GetKind_Centered_Flow() == JST) && (iMesh == MESH_0);
   const bool jst_ke_scheme = (config->GetKind_Centered_Flow() == JST_KE) && (iMesh == MESH_0);
@@ -2705,6 +2732,9 @@ void CEulerSolver::Centered_Residual(CGeometry *geometry, CSolver **solver_conta
 void CEulerSolver::Upwind_Residual(CGeometry *geometry, CSolver **solver_container,
                                    CNumerics **numerics_container, CConfig *config, unsigned short iMesh) {
 
+  /*--- If possible use the vectorized numerics instead. ---*/
+  if (edgeNumerics) { EdgeFluxResidual(geometry, config); return; }
+
   const auto InnerIter        = config->GetInnerIter();
   const bool implicit         = (config->GetKind_TimeIntScheme() == EULER_IMPLICIT);
   const bool ideal_gas        = (config->GetKind_FluidModel() == STANDARD_AIR) ||
@@ -2731,7 +2761,7 @@ void CEulerSolver::Upwind_Residual(CGeometry *geometry, CSolver **solver_contain
   su2double Primitive_i[MAXNVAR] = {0.0}, Primitive_j[MAXNVAR] = {0.0};
   su2double Secondary_i[MAXNVAR] = {0.0}, Secondary_j[MAXNVAR] = {0.0};
 
-    /*--- Loop over edge colors. ---*/
+  /*--- Loop over edge colors. ---*/
   for (auto color : EdgeColoring)
   {
   /*--- Chunk size is at least OMP_MIN_SIZE and a multiple of the color group size. ---*/
@@ -2948,26 +2978,6 @@ void CEulerSolver::Upwind_Residual(CGeometry *geometry, CSolver **solver_contain
 
 }
 
-void CEulerSolver::SumEdgeFluxes(CGeometry* geometry) {
-
-  SU2_OMP_FOR_STAT(omp_chunk_size)
-  for (unsigned long iPoint = 0; iPoint < nPoint; ++iPoint) {
-
-    LinSysRes.SetBlock_Zero(iPoint);
-
-    for (unsigned short iNeigh = 0; iNeigh < geometry->nodes->GetnPoint(iPoint); ++iNeigh) {
-
-      auto iEdge = geometry->nodes->GetEdge(iPoint, iNeigh);
-
-      if (iPoint == geometry->edges->GetNode(iEdge,0))
-        LinSysRes.AddBlock(iPoint, EdgeFluxes.GetBlock(iEdge));
-      else
-        LinSysRes.SubtractBlock(iPoint, EdgeFluxes.GetBlock(iEdge));
-    }
-  }
-
-}
-
 void CEulerSolver::ComputeConsistentExtrapolation(CFluidModel *fluidModel, unsigned short nDim,
                                                   su2double *primitive, su2double *secondary) {
 
@@ -3351,9 +3361,8 @@ void CEulerSolver::SetUndivided_Laplacian(CGeometry *geometry, const CConfig *co
       nodes->SetUnd_Lapl(iPoint, iVar, 0.0);
 
     /*--- Loop over the neighbors of point i. ---*/
-    for (unsigned short iNeigh = 0; iNeigh < geometry->nodes->GetnPoint(iPoint); ++iNeigh)
-    {
-      auto jPoint = geometry->nodes->GetPoint(iPoint, iNeigh);
+    for (auto jPoint : geometry->nodes->GetPoints(iPoint)) {
+
       bool boundary_j = geometry->nodes->GetPhysicalBoundary(jPoint);
 
       /*--- If iPoint is boundary it only takes contributions from other boundary points. ---*/
@@ -3402,9 +3411,8 @@ void CEulerSolver::SetCentered_Dissipation_Sensor(CGeometry *geometry, const CCo
     jPoint_UndLapl[iPoint] = 0.0;
 
     /*--- Loop over the neighbors of point i. ---*/
-    for (unsigned short iNeigh = 0; iNeigh < geometry->nodes->GetnPoint(iPoint); ++iNeigh)
+    for (auto jPoint : geometry->nodes->GetPoints(iPoint))
     {
-      auto jPoint = geometry->nodes->GetPoint(iPoint, iNeigh);
       bool boundary_j = geometry->nodes->GetPhysicalBoundary(jPoint);
 
       /*--- If iPoint is boundary it only takes contributions from other boundary points. ---*/
@@ -8812,7 +8820,6 @@ void CEulerSolver::BC_Engine_Inflow(CGeometry *geometry, CSolver **solver_contai
 
 }
 
-
 void CEulerSolver::BC_Engine_Exhaust(CGeometry *geometry, CSolver **solver_container, CNumerics *conv_numerics, CNumerics *visc_numerics, CConfig *config, unsigned short val_marker) {
 
   unsigned short iDim;
diff --git a/SU2_CFD/src/solvers/CFEASolver.cpp b/SU2_CFD/src/solvers/CFEASolver.cpp
index df5f97d9c1f..cef588405d4 100644
--- a/SU2_CFD/src/solvers/CFEASolver.cpp
+++ b/SU2_CFD/src/solvers/CFEASolver.cpp
@@ -1884,6 +1884,34 @@ void CFEASolver::BC_DispDir(CGeometry *geometry, CNumerics *numerics, const CCon
 
 }
 
+/*--- Residual of the linear system (A, x, b), overload for matching scalar types of A and x/b. ---*/
+template<class T, class U, su2enable_if<is_same<T,U>::value> = 0>
+CSysVector<T> computeLinearResidual(const CSysMatrix<T>& A, const CSysVector<U>& x,
+                                    const CSysVector<U>& b) {
+  CSysVector<T> residual(x.GetNBlk(), x.GetNBlkDomain(), x.GetNVar(), nullptr);
+  SU2_OMP_PARALLEL { A.ComputeResidual(x, b, residual); }
+  return residual;
+}
+
+/*--- Overload for different scalar types of A and x/b, temporaries of the
+ *    matrix type are filled via PassiveCopy to interface with A. ---*/
+template<class T, class U, su2enable_if<!is_same<T,U>::value> = 0>
+CSysVector<T> computeLinearResidual(const CSysMatrix<T>& A, const CSysVector<U>& x,
+                                    const CSysVector<U>& b) {
+  const auto nVar = x.GetNVar();
+  const auto nBlk = x.GetNBlk();
+  const auto nBlkDom = x.GetNBlkDomain();
+  CSysVector<T> residual(nBlk, nBlkDom, nVar, nullptr);
+  CSysVector<T> xCopy(nBlk, nBlkDom, nVar, nullptr);
+  CSysVector<T> bCopy(nBlk, nBlkDom, nVar, nullptr);
+  SU2_OMP_PARALLEL {
+    xCopy.PassiveCopy(x);
+    bCopy.PassiveCopy(b);
+    A.ComputeResidual(xCopy, bCopy, residual);
+  }
+  return residual;
+}
+
 void CFEASolver::Postprocessing(CGeometry *geometry, CSolver **solver_container,
                                 CConfig *config, CNumerics **numerics, unsigned short iMesh) {
 
@@ -1922,7 +1950,7 @@ void CFEASolver::Postprocessing(CGeometry *geometry, CSolver **solver_container,
     {
     su2double utol = LinSysSol.norm();
     su2double rtol = LinSysRes.norm();
-    su2double etol = LinSysSol.dot(LinSysRes);
+    su2double etol = fabs(LinSysSol.dot(LinSysRes));
 
     SU2_OMP_MASTER
     {
@@ -1937,35 +1965,17 @@ void CFEASolver::Postprocessing(CGeometry *geometry, CSolver **solver_container,
     /*--- If the problem is linear, the only check we do is the RMS of the residuals. ---*/
     /*---  Compute the residual Ax-f ---*/
 
-#ifndef CODI_FORWARD_TYPE
-    CSysVector<su2mixedfloat> LinSysAux(nPoint, nPointDomain, nVar, nullptr);
-#else
-    CSysVector<su2double> LinSysAux(nPoint, nPointDomain, nVar, nullptr);
-#endif
-
-#if defined(CODI_REVERSE_TYPE) || defined(USE_MIXED_PRECISION)
-    /*---  We need temporaries to interface with the passive matrix. ---*/
-    CSysVector<su2mixedfloat> sol, res;
-#endif
-
-    SU2_OMP_PARALLEL
-    {
-#if !(defined(CODI_REVERSE_TYPE) || defined(USE_MIXED_PRECISION)) || defined(CODI_FORWARD_TYPE)
-    Jacobian.ComputeResidual(LinSysSol, LinSysRes, LinSysAux);
-#else
-    sol.PassiveCopy(LinSysSol);
-    res.PassiveCopy(LinSysRes);
-    Jacobian.ComputeResidual(sol, res, LinSysAux);
-#endif
+    const auto ResidualAux = computeLinearResidual(Jacobian, LinSysSol, LinSysRes);
 
     /*--- Set maximum residual to zero. ---*/
 
-    SU2_OMP_MASTER
     for (auto iVar = 0ul; iVar < nVar; iVar++) {
       SetRes_RMS(iVar, 0.0);
       SetRes_Max(iVar, 0.0, 0);
     }
 
+    SU2_OMP_PARALLEL {
+
     /*--- Compute the residual. ---*/
 
     su2double resMax[MAXNVAR] = {0.0}, resRMS[MAXNVAR] = {0.0};
@@ -1975,7 +1985,7 @@ void CFEASolver::Postprocessing(CGeometry *geometry, CSolver **solver_container,
     SU2_OMP_FOR_STAT(omp_chunk_size)
     for (auto iPoint = 0ul; iPoint < nPointDomain; iPoint++) {
       for (auto iVar = 0ul; iVar < nVar; iVar++) {
-        su2double Res = fabs(LinSysAux(iPoint, iVar));
+        su2double Res = fabs(ResidualAux(iPoint, iVar));
         resRMS[iVar] += Res*Res;
         if (Res > resMax[iVar]) {
           resMax[iVar] = Res;
@@ -3458,6 +3468,18 @@ void CFEASolver::FilterElementDensities(CGeometry *geometry, const CConfig *conf
       else if (rho < 0.0) element_properties[iElem]->SetPhysicalDensity(0.0);
       else element_properties[iElem]->SetPhysicalDensity(physical_rho[iElem]);
     }
+
+    /*--- Compute nodal averages of the element physical densities for output. ---*/
+    SU2_OMP_FOR_STAT(omp_chunk_size)
+    for (auto iPoint=0ul; iPoint<nPoint; ++iPoint) {
+      const su2double w = geometry->nodes->GetVolume(iPoint); /*--- Same weight for every element of iPoint. ---*/
+      su2double sum = 0, vol = 0;
+      for (auto iElem : geometry->nodes->GetElems(iPoint)) {
+        sum += w * element_properties[iElem]->GetPhysicalDensity();
+        vol += w;
+      }
+      nodes->SetAuxVar(iPoint, (vol > 0.0) ? su2double(sum/vol) : su2double(0.0)); /*--- Avoid 0/0 (no elements or zero volume). ---*/
+    }
   }
 
   delete [] physical_rho;
diff --git a/SU2_CFD/src/solvers/CHeatSolver.cpp b/SU2_CFD/src/solvers/CHeatSolver.cpp
index fc8954e0f0f..06da5d1266d 100644
--- a/SU2_CFD/src/solvers/CHeatSolver.cpp
+++ b/SU2_CFD/src/solvers/CHeatSolver.cpp
@@ -1108,7 +1108,7 @@ void CHeatSolver::BC_ConjugateHeat_Interface(CGeometry *geometry, CSolver **solv
         T_Conjugate = GetConjugateHeatVariable(val_marker, iVertex, 0)/Temperature_Ref;
 
         nodes->SetSolution_Old(iPoint,&T_Conjugate);
-        LinSysRes.SetBlock_Zero(iPoint, 0);
+        LinSysRes(iPoint, 0) = 0.0;
         nodes->SetRes_TruncErrorZero(iPoint);
 
         if (implicit) {
diff --git a/SU2_CFD/src/solvers/CIncNSSolver.cpp b/SU2_CFD/src/solvers/CIncNSSolver.cpp
index 4ff45ef0f5b..62d98692b84 100644
--- a/SU2_CFD/src/solvers/CIncNSSolver.cpp
+++ b/SU2_CFD/src/solvers/CIncNSSolver.cpp
@@ -560,7 +560,7 @@ void CIncNSSolver::BC_HeatFlux_Wall(CGeometry *geometry, CSolver **solver_contai
       nodes->SetVelocity_Old(iPoint,Vector);
 
       for (iDim = 0; iDim < nDim; iDim++)
-        LinSysRes.SetBlock_Zero(iPoint, iDim+1);
+        LinSysRes(iPoint, iDim+1) = 0.0;
       nodes->SetVel_ResTruncError_Zero(iPoint);
 
       if (energy) {
@@ -657,7 +657,7 @@ void CIncNSSolver::BC_Isothermal_Wall(CGeometry *geometry, CSolver **solver_cont
       nodes->SetVelocity_Old(iPoint,Vector);
 
       for (iDim = 0; iDim < nDim; iDim++)
-        LinSysRes.SetBlock_Zero(iPoint, iDim+1);
+        LinSysRes(iPoint, iDim+1) = 0.0;
       nodes->SetVel_ResTruncError_Zero(iPoint);
 
       if (energy) {
@@ -797,7 +797,7 @@ void CIncNSSolver::BC_ConjugateHeat_Interface(CGeometry *geometry, CSolver **sol
       nodes->SetVelocity_Old(iPoint,Vector);
 
       for (iDim = 0; iDim < nDim; iDim++)
-        LinSysRes.SetBlock_Zero(iPoint, iDim+1);
+        LinSysRes(iPoint, iDim+1) = 0.0;
       nodes->SetVel_ResTruncError_Zero(iPoint);
 
       if (energy) {
@@ -843,7 +843,7 @@ void CIncNSSolver::BC_ConjugateHeat_Interface(CGeometry *geometry, CSolver **sol
 
         /*--- Strong imposition of the temperature on the fluid zone. ---*/
 
-        LinSysRes.SetBlock_Zero(iPoint, nDim+1);
+        LinSysRes(iPoint, nDim+1) = 0.0;
         nodes->SetSolution_Old(iPoint, nDim+1, Twall);
         nodes->SetEnergy_ResTruncError_Zero(iPoint);
       }
diff --git a/SU2_CFD/src/solvers/CMeshSolver.cpp b/SU2_CFD/src/solvers/CMeshSolver.cpp
index 55a4261f44a..75199a7e6ac 100644
--- a/SU2_CFD/src/solvers/CMeshSolver.cpp
+++ b/SU2_CFD/src/solvers/CMeshSolver.cpp
@@ -28,7 +28,6 @@
 #include "../../../Common/include/adt/CADTPointsOnlyClass.hpp"
 #include "../../../Common/include/omp_structure.hpp"
 #include "../../include/solvers/CMeshSolver.hpp"
-#include "../../include/variables/CMeshBoundVariable.hpp"
 #include "../../../Common/include/toolboxes/geometry_toolbox.hpp"
 
 using namespace GeometryToolbox;
diff --git a/SU2_CFD/src/solvers/CNEMONSSolver.cpp b/SU2_CFD/src/solvers/CNEMONSSolver.cpp
index ed6dedc5087..f920889325d 100644
--- a/SU2_CFD/src/solvers/CNEMONSSolver.cpp
+++ b/SU2_CFD/src/solvers/CNEMONSSolver.cpp
@@ -67,7 +67,7 @@ void CNEMONSSolver::SetPrimitive_Gradient_GG(CGeometry *geometry, const CConfig
 
   unsigned long iPoint, iVar;
   unsigned short iSpecies, RHO_INDEX, RHOS_INDEX;
-  
+
   auto& gradient = nodes->GetGradient_Primitive();
 
   /*--- Get indices of species & mixture density ---*/
@@ -108,7 +108,7 @@ void CNEMONSSolver::SetPrimitive_Gradient_LS(CGeometry *geometry, const CConfig
   PERIODIC_QUANTITIES kindPeriodicComm = weighted? PERIODIC_PRIM_LS : PERIODIC_PRIM_ULS;
 
   const auto& primitives = nodes->GetPrimitive();
-  
+
   computeGradientsLeastSquares(this, PRIMITIVE_GRADIENT, kindPeriodicComm, *geometry, *config,
                                weighted, primitives, 0, nPrimVarGrad, gradient, rmatrix);
 }
@@ -270,7 +270,7 @@ void CNEMONSSolver::BC_HeatFluxNonCatalytic_Wall(CGeometry *geometry,
       for (iDim = 0; iDim < nDim; iDim++) Vector[iDim] = 0.0;
       nodes->SetVelocity_Old(iPoint,Vector);
       for (iDim = 0; iDim < nDim; iDim++) {
-        LinSysRes.SetBlock_Zero(iPoint, nSpecies+iDim);
+        LinSysRes(iPoint, nSpecies+iDim) = 0.0;
         nodes->SetVal_ResTruncError_Zero(iPoint,nSpecies+iDim);
       }
       if (implicit) {
@@ -314,14 +314,14 @@ void CNEMONSSolver::BC_HeatFlux_Wall(CGeometry *geometry,
     } else {
 
       iMarker_Catalytic++;
-     
+
     }
   }
 
   if(!catalytic) BC_HeatFluxNonCatalytic_Wall(geometry, solution_container, conv_numerics,
                                               sour_numerics, config, val_marker);
 
-  
+
 }
 
 void CNEMONSSolver::BC_HeatFluxCatalytic_Wall(CGeometry *geometry,
@@ -395,7 +395,7 @@ void CNEMONSSolver::BC_HeatFluxCatalytic_Wall(CGeometry *geometry,
       /*--- Set the residual, truncation error, and velocity value ---*/
       nodes->SetVelocity_Old(iPoint,Vector);
       for (iDim = 0; iDim < nDim; iDim++) {
-        LinSysRes.SetBlock_Zero(iPoint, nSpecies+iDim);
+        LinSysRes(iPoint, nSpecies+iDim) = 0.0;
         nodes->SetVal_ResTruncError_Zero(iPoint,nSpecies+iDim);
       }
 
@@ -524,7 +524,7 @@ void CNEMONSSolver::BC_Isothermal_Wall(CGeometry *geometry,
                                   sour_numerics, config, val_marker);
       break;
     } else {
-      iMarker_Catalytic++;     
+      iMarker_Catalytic++;
     }
   }
 
@@ -603,7 +603,7 @@ void CNEMONSSolver::BC_IsothermalNonCatalytic_Wall(CGeometry *geometry,
       nodes->SetVelocity_Old(iPoint,Vector);
 
       for (iDim = 0; iDim < nDim; iDim++) {
-        LinSysRes.SetBlock_Zero(iPoint, nSpecies+iDim);
+        LinSysRes(iPoint, nSpecies+iDim) = 0.0;
         nodes->SetVal_ResTruncError_Zero(iPoint,nSpecies+iDim);
       }
 
@@ -639,7 +639,7 @@ void CNEMONSSolver::BC_IsothermalNonCatalytic_Wall(CGeometry *geometry,
       //  Jacobian.SubtractBlock(iPoint, iPoint, Jacobian_i);
       //} // implicit
     }
-  } 
+  }
 }
 
 void CNEMONSSolver::BC_IsothermalCatalytic_Wall(CGeometry *geometry,
@@ -678,7 +678,7 @@ void CNEMONSSolver::BC_IsothermalCatalytic_Wall(CGeometry *geometry,
   RuSI = UNIVERSAL_GAS_CONSTANT;
   Ru   = 1000.0*RuSI;
   Ms   = FluidModel->GetMolarMass();
-  
+
   /*--- Get the locations of the primitive variables ---*/
   RHOS_INDEX    = nodes->GetRhosIndex();
   RHO_INDEX     = nodes->GetRhoIndex();
@@ -731,7 +731,7 @@ void CNEMONSSolver::BC_IsothermalCatalytic_Wall(CGeometry *geometry,
       Di   = nodes->GetDiffusionCoeff(iPoint);
       eves = nodes->GetEve(iPoint);
       hs   = FluidModel->GetSpeciesEnthalpy(Vi[T_INDEX], eves);
-      for (iSpecies = 0; iSpecies < nSpecies; iSpecies++)      
+      for (iSpecies = 0; iSpecies < nSpecies; iSpecies++)
         Yj[iSpecies] = Vj[RHOS_INDEX+iSpecies]/Vj[RHO_INDEX];
       rho    = Vi[RHO_INDEX];
       dTdU   = nodes->GetdTdU(iPoint);
@@ -862,7 +862,7 @@ void CNEMONSSolver::BC_Smoluchowski_Maxwell(CGeometry *geometry,
   su2double div_vel=0, Delta;
 
   vector<su2double> Ms;
-  
+
   bool ionization = config->GetIonization();
 
   if (ionization) {
@@ -942,7 +942,7 @@ void CNEMONSSolver::BC_Smoluchowski_Maxwell(CGeometry *geometry,
       GasConstant=0;
       for(iSpecies=0;iSpecies<nSpecies;iSpecies++)
         GasConstant+=UNIVERSAL_GAS_CONSTANT*1000.0/Ms[iSpecies]*nodes->GetMassFraction(iPoint,iSpecies);
-      
+
       /*--- Calculate temperature gradients normal to surface---*/ //Doubt about minus sign
       dTn   = - (Ti-Tj)/dij;
       dTven = - (Tvei-Tvej)/dij;
@@ -963,9 +963,9 @@ void CNEMONSSolver::BC_Smoluchowski_Maxwell(CGeometry *geometry,
       }
 
       /*--- Calculate Heatflux tangent to surface ---*/
-      for (iDim = 0; iDim < nDim; iDim++) 
+      for (iDim = 0; iDim < nDim; iDim++)
         Vector_Tangent_HF[iDim] = ktr*Vector_Tangent_dT[iDim]+kve*Vector_Tangent_dTve[iDim];
-      
+
       /*--- Initialize viscous residual to zero ---*/
       for (iVar = 0; iVar < nVar; iVar ++)
         Res_Visc[iVar] = 0.0;
@@ -1001,7 +1001,7 @@ void CNEMONSSolver::BC_Smoluchowski_Maxwell(CGeometry *geometry,
       nodes->SetVelocity_Old(iPoint,Vector);
 
       for (iDim = 0; iDim < nDim; iDim++) {
-        LinSysRes.SetBlock_Zero(iPoint, nSpecies+iDim);
+        LinSysRes(iPoint, nSpecies+iDim) = 0.0;
         nodes->SetVal_ResTruncError_Zero(iPoint,nSpecies+iDim);
       }
 
@@ -1012,5 +1012,5 @@ void CNEMONSSolver::BC_Smoluchowski_Maxwell(CGeometry *geometry,
 
       LinSysRes.SubtractBlock(iPoint, Res_Visc);
     }
-  } 
+  }
 }
diff --git a/SU2_CFD/src/solvers/CNSSolver.cpp b/SU2_CFD/src/solvers/CNSSolver.cpp
index 345ce340469..aa53371ffc3 100644
--- a/SU2_CFD/src/solvers/CNSSolver.cpp
+++ b/SU2_CFD/src/solvers/CNSSolver.cpp
@@ -631,7 +631,7 @@ void CNSSolver::BC_HeatFlux_Wall(CGeometry *geometry, CSolver **solver_container
     }
 
     for (auto iDim = 0u; iDim < nDim; iDim++)
-      LinSysRes.SetBlock_Zero(iPoint, iDim+1);
+      LinSysRes(iPoint, iDim+1) = 0.0;
     nodes->SetVel_ResTruncError_Zero(iPoint);
 
     /*--- If the wall is moving, there are additional residual contributions
@@ -786,7 +786,7 @@ void CNSSolver::BC_Isothermal_Wall_Generic(CGeometry *geometry, CSolver **solver
     }
 
     for (auto iDim = 0u; iDim < nDim; iDim++)
-      LinSysRes.SetBlock_Zero(iPoint, iDim+1);
+      LinSysRes(iPoint, iDim+1) = 0.0;
     nodes->SetVel_ResTruncError_Zero(iPoint);
 
     /*--- Get transport coefficients ---*/
diff --git a/SU2_CFD/src/solvers/CSolver.cpp b/SU2_CFD/src/solvers/CSolver.cpp
index dc7d101368a..3daf4651cf3 100644
--- a/SU2_CFD/src/solvers/CSolver.cpp
+++ b/SU2_CFD/src/solvers/CSolver.cpp
@@ -344,7 +344,7 @@ void CSolver::InitiatePeriodicComms(CGeometry *geometry,
   bool weighted = true;
 
   unsigned short iVar, jVar, iDim;
-  unsigned short iNeighbor, nNeighbor = 0;
+  unsigned short nNeighbor       = 0;
   unsigned short COUNT_PER_POINT = 0;
   unsigned short MPI_TYPE        = 0;
   unsigned short ICOUNT          = nVar;
@@ -352,7 +352,7 @@ void CSolver::InitiatePeriodicComms(CGeometry *geometry,
 
   int iMessage, iSend, nSend;
 
-  unsigned long iPoint, jPoint, msg_offset, buf_offset, iPeriodic, Neighbor_Point;
+  unsigned long iPoint, msg_offset, buf_offset, iPeriodic;
 
   su2double *Diff      = new su2double[nVar];
   su2double *Und_Lapl  = new su2double[nVar];
@@ -482,16 +482,14 @@ void CSolver::InitiatePeriodicComms(CGeometry *geometry,
           case PERIODIC_NEIGHBORS:
 
             nNeighbor = 0;
-            for (iNeighbor = 0; iNeighbor < geometry->nodes->GetnPoint(iPoint); iNeighbor++) {
-              Neighbor_Point = geometry->nodes->GetPoint(iPoint, iNeighbor);
+            for (auto jPoint : geometry->nodes->GetPoints(iPoint)) {
 
               /*--- Check if this neighbor lies on the periodic face so
                that we avoid double counting neighbors on both sides. If
                not, increment the count of neighbors for the donor. ---*/
 
-              if (!geometry->nodes->GetPeriodicBoundary(Neighbor_Point))
-              nNeighbor++;
-
+              if (!geometry->nodes->GetPeriodicBoundary(jPoint))
+                nNeighbor++;
             }
 
             /*--- Store the number of neighbors in bufffer. ---*/
@@ -598,8 +596,7 @@ void CSolver::InitiatePeriodicComms(CGeometry *geometry,
             for (iVar = 0; iVar < nVar; iVar++)
               Und_Lapl[iVar] = 0.0;
 
-            for (iNeighbor = 0; iNeighbor < geometry->nodes->GetnPoint(iPoint); iNeighbor++) {
-              jPoint = geometry->nodes->GetPoint(iPoint, iNeighbor);
+            for (auto jPoint : geometry->nodes->GetPoints(iPoint)) {
 
               /*--- Avoid periodic boundary points so that we do not
                duplicate edges on both sides of the periodic BC. ---*/
@@ -673,8 +670,7 @@ void CSolver::InitiatePeriodicComms(CGeometry *geometry,
              on both sides of the periodic face. ---*/
 
             Sensor_i = 0.0; Sensor_j = 0.0;
-            for (iNeighbor = 0; iNeighbor < geometry->nodes->GetnPoint(iPoint); iNeighbor++) {
-              jPoint = geometry->nodes->GetPoint(iPoint, iNeighbor);
+            for (auto jPoint : geometry->nodes->GetPoints(iPoint)) {
 
               /*--- Avoid halos and boundary points so that we don't
                duplicate edges on both sides of the periodic BC. ---*/
@@ -825,8 +821,7 @@ void CSolver::InitiatePeriodicComms(CGeometry *geometry,
             r11 = 0.0;   r12 = 0.0;   r22 = 0.0;
             r13 = 0.0; r23_a = 0.0; r23_b = 0.0;  r33 = 0.0;
 
-            for (iNeighbor = 0; iNeighbor < geometry->nodes->GetnPoint(iPoint); iNeighbor++) {
-              jPoint = geometry->nodes->GetPoint(iPoint, iNeighbor);
+            for (auto jPoint : geometry->nodes->GetPoints(iPoint)) {
 
               /*--- Avoid periodic boundary points so that we do not
                duplicate edges on both sides of the periodic BC. ---*/
@@ -974,8 +969,7 @@ void CSolver::InitiatePeriodicComms(CGeometry *geometry,
             r11 = 0.0;   r12 = 0.0;   r22 = 0.0;
             r13 = 0.0; r23_a = 0.0; r23_b = 0.0;  r33 = 0.0;
 
-            for (iNeighbor = 0; iNeighbor < geometry->nodes->GetnPoint(iPoint); iNeighbor++) {
-              jPoint = geometry->nodes->GetPoint(iPoint, iNeighbor);
+            for (auto jPoint : geometry->nodes->GetPoints(iPoint)) {
 
               /*--- Avoid periodic boundary points so that we do not
                duplicate edges on both sides of the periodic BC. ---*/
@@ -1093,8 +1087,7 @@ void CSolver::InitiatePeriodicComms(CGeometry *geometry,
               Sol_Max[iVar] = base_nodes->GetSolution_Max(iPoint, iVar);
             }
 
-            for (iNeighbor = 0; iNeighbor < geometry->nodes->GetnPoint(iPoint); iNeighbor++) {
-              jPoint = geometry->nodes->GetPoint(iPoint, iNeighbor);
+            for (auto jPoint : geometry->nodes->GetPoints(iPoint)) {
               for (iVar = 0; iVar < nPrimVarGrad; iVar++) {
                 Sol_Min[iVar] = min(Sol_Min[iVar], base_nodes->GetPrimitive(jPoint, iVar));
                 Sol_Max[iVar] = max(Sol_Max[iVar], base_nodes->GetPrimitive(jPoint, iVar));
@@ -1144,8 +1137,7 @@ void CSolver::InitiatePeriodicComms(CGeometry *geometry,
               Sol_Max[iVar] = base_nodes->GetSolution_Max(iPoint, iVar);
             }
 
-            for (iNeighbor = 0; iNeighbor < geometry->nodes->GetnPoint(iPoint); iNeighbor++) {
-              jPoint = geometry->nodes->GetPoint(iPoint, iNeighbor);
+            for (auto jPoint : geometry->nodes->GetPoints(iPoint)) {
               for (iVar = 0; iVar < nVar; iVar++) {
                 Sol_Min[iVar] = min(Sol_Min[iVar], base_nodes->GetSolution(jPoint, iVar));
                 Sol_Max[iVar] = max(Sol_Max[iVar], base_nodes->GetSolution(jPoint, iVar));
@@ -1367,7 +1359,7 @@ void CSolver::CompletePeriodicComms(CGeometry *geometry,
 
                 if (iPeriodic == val_periodic_index + nPeriodic/2) {
                   for (iVar = 0; iVar < nVar; iVar++) {
-                    LinSysRes.SetBlock_Zero(iPoint, iVar);
+                    LinSysRes(iPoint, iVar) = 0.0;
                     total_index = iPoint*nVar+iVar;
                     Jacobian.DeleteValsRowi(total_index);
                   }
diff --git a/SU2_CFD/src/solvers/CTurbSolver.cpp b/SU2_CFD/src/solvers/CTurbSolver.cpp
index 556a00e5fc7..29271cf29ef 100644
--- a/SU2_CFD/src/solvers/CTurbSolver.cpp
+++ b/SU2_CFD/src/solvers/CTurbSolver.cpp
@@ -302,10 +302,7 @@ void CTurbSolver::SumEdgeFluxes(CGeometry* geometry) {
 
     LinSysRes.SetBlock_Zero(iPoint);
 
-    for (unsigned short iNeigh = 0; iNeigh < geometry->nodes->GetnPoint(iPoint); ++iNeigh) {
-
-      auto iEdge = geometry->nodes->GetEdge(iPoint, iNeigh);
-
+    for (auto iEdge : geometry->nodes->GetEdges(iPoint)) {
       if (iPoint == geometry->edges->GetNode(iEdge,0))
         LinSysRes.AddBlock(iPoint, EdgeFluxes.GetBlock(iEdge));
       else
@@ -472,7 +469,7 @@ void CTurbSolver::BC_Fluid_Interface(CGeometry *geometry, CSolver **solver_conta
         /*--- Accumulate the residuals to compute the average ---*/
 
         for (auto iVar = 0u; iVar < nVar; iVar++) {
-          LinSysRes(iPoint,iVar) += weight*residual.residual[iVar];
+          LinSysRes(iPoint,iVar) += weight*residual[iVar];
           for (auto jVar = 0u; jVar < nVar; jVar++)
             Jacobian_i[iVar*nVar+jVar] += SU2_TYPE::GetValue(weight*residual.jacobian_i[iVar][jVar]);
         }
diff --git a/SU2_CFD/src/variables/CFEAVariable.cpp b/SU2_CFD/src/variables/CFEAVariable.cpp
index c1b253cc4ff..046581a2bcf 100644
--- a/SU2_CFD/src/variables/CFEAVariable.cpp
+++ b/SU2_CFD/src/variables/CFEAVariable.cpp
@@ -87,6 +87,8 @@ CFEAVariable::CFEAVariable(const su2double *val_fea, unsigned long npoint, unsig
   if (prestretch_fem) Prestretch.resize(nPoint,nVar);
 
   if (multizone) Set_BGSSolution_k();
+
+  if (config->GetTopology_Optimization()) AuxVar.resize(nPoint);
 }
 
 void CFEAVariable::SetSolution_Vel_time_n() { Solution_Vel_time_n = Solution_Vel; }
diff --git a/TestCases/disc_adj_rans/naca0012/turb_NACA0012_sst.cfg b/TestCases/disc_adj_rans/naca0012/turb_NACA0012_sst.cfg
index 88c1746170f..8637b4fa861 100644
--- a/TestCases/disc_adj_rans/naca0012/turb_NACA0012_sst.cfg
+++ b/TestCases/disc_adj_rans/naca0012/turb_NACA0012_sst.cfg
@@ -143,6 +143,7 @@ MG_DAMP_PROLONGATION= 0.75
 % Convective numerical method (JST, LAX-FRIEDRICH, CUSP, ROE, AUSM, HLLC,
 %                              TURKEL_PREC, MSW)
 CONV_NUM_METHOD_FLOW= ROE
+USE_VECTORIZATION= YES
 %
 % Spatial numerical order integration (1ST_ORDER, 2ND_ORDER, 2ND_ORDER_LIMITER)
 MUSCL_FLOW= YES
diff --git a/TestCases/hybrid_regression.py b/TestCases/hybrid_regression.py
index a23052ba350..32ea88f47d2 100644
--- a/TestCases/hybrid_regression.py
+++ b/TestCases/hybrid_regression.py
@@ -183,7 +183,7 @@ def main():
     turb_naca0012_sa.cfg_dir   = "rans/naca0012"
     turb_naca0012_sa.cfg_file  = "turb_NACA0012_sa.cfg"
     turb_naca0012_sa.test_iter = 10
-    turb_naca0012_sa.test_vals = [-12.076819, -16.049252, 1.064326, 0.019770]
+    turb_naca0012_sa.test_vals = [-11.537781, -14.899750, 1.064330, 0.019756]
     test_list.append(turb_naca0012_sa)
 
     # NACA0012 (SST, FUN3D finest grid results: CL=1.0840, CD=0.01253)
@@ -191,7 +191,7 @@ def main():
     turb_naca0012_sst.cfg_dir   = "rans/naca0012"
     turb_naca0012_sst.cfg_file  = "turb_NACA0012_sst.cfg"
     turb_naca0012_sst.test_iter = 10
-    turb_naca0012_sst.test_vals = [-15.273728, -6.243783, 1.049988, 0.019165]
+    turb_naca0012_sst.test_vals = [-12.797090, -5.872763, 1.049989, 0.019163]
     test_list.append(turb_naca0012_sst)
 
     # NACA0012 (SST_SUST, FUN3D finest grid results: CL=1.0840, CD=0.01253)
@@ -199,7 +199,7 @@ def main():
     turb_naca0012_sst_sust.cfg_dir   = "rans/naca0012"
     turb_naca0012_sst_sust.cfg_file  = "turb_NACA0012_sst_sust.cfg"
     turb_naca0012_sst_sust.test_iter = 10
-    turb_naca0012_sst_sust.test_vals = [-14.851214, -6.062566, 1.005233, 0.019014]
+    turb_naca0012_sst_sust.test_vals = [-12.640091, -5.751854, 1.005233, 0.019017]
     test_list.append(turb_naca0012_sst_sust)
 
     # PROPELLER
diff --git a/TestCases/parallel_regression.py b/TestCases/parallel_regression.py
index 048bcdf64e8..296e407b2fd 100644
--- a/TestCases/parallel_regression.py
+++ b/TestCases/parallel_regression.py
@@ -295,7 +295,7 @@ def main():
     turb_naca0012_sa.cfg_dir   = "rans/naca0012"
     turb_naca0012_sa.cfg_file  = "turb_NACA0012_sa.cfg"
     turb_naca0012_sa.test_iter = 10
-    turb_naca0012_sa.test_vals = [-12.078780, -16.138902, 1.064326, 0.019770]
+    turb_naca0012_sa.test_vals = [-11.155953, -14.468619, 1.064330, 0.019756] #last 4 columns
     turb_naca0012_sa.su2_exec  = "parallel_computation.py -f"
     turb_naca0012_sa.timeout   = 3200
     turb_naca0012_sa.tol       = 0.00001
@@ -306,7 +306,7 @@ def main():
     turb_naca0012_sst.cfg_dir   = "rans/naca0012"
     turb_naca0012_sst.cfg_file  = "turb_NACA0012_sst.cfg"
     turb_naca0012_sst.test_iter = 10
-    turb_naca0012_sst.test_vals = [-15.273776, -6.243795, 1.049988, 0.019165]
+    turb_naca0012_sst.test_vals = [-12.799245, -5.875128, 1.049989, 0.019163] #last 4 columns
     turb_naca0012_sst.su2_exec  = "parallel_computation.py -f"
     turb_naca0012_sst.timeout   = 3200
     turb_naca0012_sst.tol       = 0.00001
@@ -317,7 +317,7 @@ def main():
     turb_naca0012_sst_sust.cfg_dir   = "rans/naca0012"
     turb_naca0012_sst_sust.cfg_file  = "turb_NACA0012_sst_sust.cfg"
     turb_naca0012_sst_sust.test_iter = 10
-    turb_naca0012_sst_sust.test_vals = [-14.851220, -6.062220, 1.005233, 0.019014]
+    turb_naca0012_sst_sust.test_vals = [-12.641087, -5.753486, 1.005233, 0.019017] #last 4 columns
     turb_naca0012_sst_sust.su2_exec  = "parallel_computation.py -f"
     turb_naca0012_sst_sust.timeout   = 3200
     turb_naca0012_sst_sust.tol       = 0.00001
@@ -1235,7 +1235,7 @@ def main():
     pywrapper_turb_naca0012_sst.cfg_dir   = "rans/naca0012"
     pywrapper_turb_naca0012_sst.cfg_file  = "turb_NACA0012_sst.cfg"
     pywrapper_turb_naca0012_sst.test_iter = 10
-    pywrapper_turb_naca0012_sst.test_vals = [-15.273776, -6.243795, 1.049988, 0.019165] #last 4 columns
+    pywrapper_turb_naca0012_sst.test_vals = [-12.799245, -5.875128, 1.049989, 0.019163] #last 4 columns
     pywrapper_turb_naca0012_sst.su2_exec  = "mpirun -np 2 SU2_CFD.py --parallel -f"
     pywrapper_turb_naca0012_sst.timeout   = 3200
     pywrapper_turb_naca0012_sst.tol       = 0.00001
diff --git a/TestCases/parallel_regression_AD.py b/TestCases/parallel_regression_AD.py
index 65fdfad3934..b1bf0fef477 100644
--- a/TestCases/parallel_regression_AD.py
+++ b/TestCases/parallel_regression_AD.py
@@ -95,7 +95,7 @@ def main():
     discadj_rans_naca0012_sst.cfg_dir   = "disc_adj_rans/naca0012"
     discadj_rans_naca0012_sst.cfg_file  = "turb_NACA0012_sst.cfg"
     discadj_rans_naca0012_sst.test_iter = 10
-    discadj_rans_naca0012_sst.test_vals = [-2.221040, -0.491810, 0.557480, 0.000027]
+    discadj_rans_naca0012_sst.test_vals = [-2.221231, -0.491824, 0.557480, 0.000027] #last 4 columns
     discadj_rans_naca0012_sst.su2_exec  = "parallel_computation.py -f"
     discadj_rans_naca0012_sst.timeout   = 1600
     discadj_rans_naca0012_sst.tol       = 0.00001
diff --git a/TestCases/rans/naca0012/turb_NACA0012_sa.cfg b/TestCases/rans/naca0012/turb_NACA0012_sa.cfg
index 644194282c3..7f1d366cccb 100644
--- a/TestCases/rans/naca0012/turb_NACA0012_sa.cfg
+++ b/TestCases/rans/naca0012/turb_NACA0012_sa.cfg
@@ -144,6 +144,7 @@ MG_DAMP_PROLONGATION= 0.75
 % Convective numerical method (JST, LAX-FRIEDRICH, CUSP, ROE, AUSM, HLLC,
 %                              TURKEL_PREC, MSW)
 CONV_NUM_METHOD_FLOW= ROE
+USE_VECTORIZATION= YES
 %
 % Spatial numerical order integration (1ST_ORDER, 2ND_ORDER, 2ND_ORDER_LIMITER)
 MUSCL_FLOW= YES
diff --git a/TestCases/rans/naca0012/turb_NACA0012_sst.cfg b/TestCases/rans/naca0012/turb_NACA0012_sst.cfg
index 86acc70062d..8c787c4a499 100644
--- a/TestCases/rans/naca0012/turb_NACA0012_sst.cfg
+++ b/TestCases/rans/naca0012/turb_NACA0012_sst.cfg
@@ -150,6 +150,7 @@ MG_DAMP_PROLONGATION= 0.75
 % Convective numerical method (JST, LAX-FRIEDRICH, CUSP, ROE, AUSM, HLLC,
 %                              TURKEL_PREC, MSW)
 CONV_NUM_METHOD_FLOW= ROE
+USE_VECTORIZATION= YES
 %
 % Spatial numerical order integration (1ST_ORDER, 2ND_ORDER, 2ND_ORDER_LIMITER)
 MUSCL_FLOW= YES
diff --git a/TestCases/rans/naca0012/turb_NACA0012_sst_sust.cfg b/TestCases/rans/naca0012/turb_NACA0012_sst_sust.cfg
index 768eb80fd69..e95536072d8 100644
--- a/TestCases/rans/naca0012/turb_NACA0012_sst_sust.cfg
+++ b/TestCases/rans/naca0012/turb_NACA0012_sst_sust.cfg
@@ -156,6 +156,7 @@ MG_DAMP_PROLONGATION= 0.75
 % Convective numerical method (JST, LAX-FRIEDRICH, CUSP, ROE, AUSM, HLLC,
 %                              TURKEL_PREC, MSW)
 CONV_NUM_METHOD_FLOW= ROE
+USE_VECTORIZATION= YES
 %
 % Spatial numerical order integration (1ST_ORDER, 2ND_ORDER, 2ND_ORDER_LIMITER)
 MUSCL_FLOW= YES
diff --git a/TestCases/serial_regression.py b/TestCases/serial_regression.py
index 1095557184e..075803d70bd 100644
--- a/TestCases/serial_regression.py
+++ b/TestCases/serial_regression.py
@@ -333,7 +333,7 @@ def main():
     turb_naca0012_sa.cfg_dir   = "rans/naca0012"
     turb_naca0012_sa.cfg_file  = "turb_NACA0012_sa.cfg"
     turb_naca0012_sa.test_iter = 10
-    turb_naca0012_sa.test_vals = [-12.075861, -16.146770, 1.064326, 0.019770] #last 4 columns
+    turb_naca0012_sa.test_vals = [-11.141831, -14.498856, 1.064330, 0.019756] #last 4 columns
     turb_naca0012_sa.su2_exec  = "SU2_CFD"
     turb_naca0012_sa.new_output = True
     turb_naca0012_sa.timeout   = 3200
@@ -345,7 +345,7 @@ def main():
     turb_naca0012_sst.cfg_dir   = "rans/naca0012"
     turb_naca0012_sst.cfg_file  = "turb_NACA0012_sst.cfg"
     turb_naca0012_sst.test_iter = 10
-    turb_naca0012_sst.test_vals = [-15.273739, -6.243814, 1.049988, 0.019165] #last 4 columns
+    turb_naca0012_sst.test_vals = [-12.797476, -5.873045, 1.049989, 0.019163] #last 4 columns
     turb_naca0012_sst.su2_exec  = "SU2_CFD"
     turb_naca0012_sst.new_output  = True
     turb_naca0012_sst.timeout   = 3200
@@ -357,7 +357,7 @@ def main():
     turb_naca0012_sst_sust.cfg_dir   = "rans/naca0012"
     turb_naca0012_sst_sust.cfg_file  = "turb_NACA0012_sst_sust.cfg"
     turb_naca0012_sst_sust.test_iter = 10
-    turb_naca0012_sst_sust.test_vals = [-14.851215, -6.062229, 1.005233, 0.019014] #last 4 columns
+    turb_naca0012_sst_sust.test_vals = [-12.640277, -5.752224, 1.005233, 0.019017] #last 4 columns
     turb_naca0012_sst_sust.su2_exec  = "SU2_CFD"
     turb_naca0012_sst_sust.timeout   = 3200
     turb_naca0012_sst_sust.tol       = 0.00001
@@ -1807,7 +1807,7 @@ def main():
     pywrapper_turb_naca0012_sst.cfg_dir   = "rans/naca0012"
     pywrapper_turb_naca0012_sst.cfg_file  = "turb_NACA0012_sst.cfg"
     pywrapper_turb_naca0012_sst.test_iter = 10
-    pywrapper_turb_naca0012_sst.test_vals = [-15.273739, -6.243814, 1.049988, 0.019165] #last 4 columns
+    pywrapper_turb_naca0012_sst.test_vals = [-12.797476, -5.873045, 1.049989, 0.019163] #last 4 columns
     pywrapper_turb_naca0012_sst.su2_exec  = "SU2_CFD.py -f"
     pywrapper_turb_naca0012_sst.new_output = True
     pywrapper_turb_naca0012_sst.timeout   = 3200
diff --git a/TestCases/serial_regression_AD.py b/TestCases/serial_regression_AD.py
index 4b46438081d..54474293e8c 100644
--- a/TestCases/serial_regression_AD.py
+++ b/TestCases/serial_regression_AD.py
@@ -95,7 +95,7 @@ def main():
     discadj_rans_naca0012_sst.cfg_dir   = "disc_adj_rans/naca0012"
     discadj_rans_naca0012_sst.cfg_file  = "turb_NACA0012_sst.cfg"
     discadj_rans_naca0012_sst.test_iter = 10
-    discadj_rans_naca0012_sst.test_vals = [-2.221040, -0.492202, 0.557470, 0.000027] #last 4 columns
+    discadj_rans_naca0012_sst.test_vals = [-2.221232, -0.492215, 0.557480, 0.000027] #last 4 columns
     discadj_rans_naca0012_sst.su2_exec  = "SU2_CFD_AD"
     discadj_rans_naca0012_sst.timeout   = 1600
     discadj_rans_naca0012_sst.tol       = 0.00001
diff --git a/UnitTests/Common/vectorization.cpp b/UnitTests/Common/vectorization.cpp
new file mode 100644
index 00000000000..cc2118d493b
--- /dev/null
+++ b/UnitTests/Common/vectorization.cpp
@@ -0,0 +1,84 @@
+/*!
+ * \file vectorization.cpp
+ * \brief Unit tests for the SIMD type and associated expression templates.
+ * \author P. Gomes
+ * \version 7.0.6 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "catch.hpp"
+#include "../../Common/include/parallelization/vectorization.hpp"
+
+using namespace std;
+
+template<class T, class U>
+struct arithmeticFun {  // Arithmetic test expression (+, -, *, /, pow) over T operands A-D and U operands x, y.
+  static T f(T A, T B, T C, T D, U x, U y) {
+    return pow( (A+B-x*C)/y + pow(A*x-C/D, y), D);
+  }
+};
+
+template<class T, class U>
+struct logicFun {
+  static T f(T A, T B, T C, T D, U x, U y) {
+    // (B < A || B >= min(C,-D)) && |A| == |x| && |C| != |y|, with "or" written as max and "and" as *.
+    return max(B < A, B >= min(C,-D)) * (abs(A) == abs(x)) * (abs(C) != abs(y));
+  }
+};
+
+/*--- Evaluates Fun on the type T operands and checks each of the T::Size entries against the type U evaluation. ---*/
+template<template<class,class> class Fun, class T, class U>
+void computeAndCheck(T A, T B, T C, T D, U x, U y) {
+  const T vectorResult = Fun<T,U>::f(A, B, C, D, x, y);
+  for (size_t lane = 0; lane < T::Size; ++lane) {
+    CHECK(vectorResult[lane] == Fun<U,U>::f(A[lane], B[lane], C[lane], D[lane], x, y));
+  }
+}
+
+TEST_CASE("SIMD INT", "[Vectorization]") {
+  /*--- Integer types will use the expression templates. ---*/
+  using Int = simd::Array<int>;
+
+  Int A = 1, B = -3, C = 5, D = 2;
+  int x = -1, y = 2;
+
+  computeAndCheck<arithmeticFun>(A, B, C, D, x ,y);  // each entry must match the scalar (int) evaluation
+  computeAndCheck<logicFun>(A, B, C, D, x ,y);
+
+  Int t = sign(A)+sign(B+C);
+  CHECK(t[0] == 2);  // expected: sign(1) + sign(-3+5) = 1 + 1
+}
+
+TEST_CASE("SIMD DOUBLE", "[Vectorization]") {
+  /*--- Doubles use the explicitly vectorized template specializations. ---*/
+  using Double = simd::Array<double>;
+
+  Double A = 1, B = -3, C = 5, D = 2;
+  double x = -1, y = 2;
+
+  computeAndCheck<arithmeticFun>(A, B, C, D, x ,y);  // each entry must match the scalar (double) evaluation
+  computeAndCheck<logicFun>(A, B, C, D, x ,y);
+
+  Double t = sqrt(pow(B,2)*C + D*y + A+x);
+  CHECK(t[1] == 7);  // expected: sqrt((-3)^2*5 + 2*2 + 1 + (-1)) = sqrt(49)
+}
+
diff --git a/UnitTests/SU2_CFD/gradients.cpp b/UnitTests/SU2_CFD/gradients.cpp
new file mode 100644
index 00000000000..825dde705e5
--- /dev/null
+++ b/UnitTests/SU2_CFD/gradients.cpp
@@ -0,0 +1,166 @@
+/*!
+ * \file gradients.cpp
+ * \brief Unit tests for gradient calculation.
+ * \author P. Gomes, T. Albring
+ * \version 7.0.6 "Blackbird"
+ *
+ * SU2 Project Website: https://su2code.github.io
+ *
+ * The SU2 Project is maintained by the SU2 Foundation
+ * (http://su2foundation.org)
+ *
+ * Copyright 2012-2020, SU2 Contributors (cf. AUTHORS.md)
+ *
+ * SU2 is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * SU2 is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with SU2. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "catch.hpp"
+#include "../../Common/include/geometry/CPhysicalGeometry.hpp"
+#include "../../Common/include/containers/container_decorators.hpp"
+#include "../../SU2_CFD/include/solvers/CSolver.hpp"
+#include "../../SU2_CFD/include/gradients/computeGradientsGreenGauss.hpp"
+#include "../../SU2_CFD/include/gradients/computeGradientsLeastSquares.hpp"
+
+/*!
+ * \brief Base class for gradient tests using a unit cube geometry (box mesh
+ * defined by configOptions). Derived classes should implement operator (i,j),
+ * returning the value of the test function, and method grad(i,j,k), returning
+ * the known gradient.
+ */
+struct GradientTestBase {
+  const string configOptions =  // config file contents, parsed from a stringstream in initConfig()
+      "SOLVER= NAVIER_STOKES\n"
+      "MESH_FORMAT= BOX\n"
+      "INIT_OPTION= TD_CONDITIONS\n"
+      "MARKER_HEATFLUX= (y_minus, 0.0, y_plus, 0.0)\n"
+      "MARKER_FAR= (x_minus, x_plus, z_plus, z_minus)\n"
+      "MESH_BOX_SIZE= 10,10,10\n"
+      "MESH_BOX_LENGTH= 1,1,1\n"
+      "MESH_BOX_OFFSET= 0,0,0\n";
+
+  std::unique_ptr<CConfig> config;
+  std::unique_ptr<CGeometry> geometry;
+
+  GradientTestBase() {
+    initConfig();  // must run first, initGeometry() reads config
+    initGeometry();
+  }
+
+  /*!
+   * \brief Initialize the config structure from configOptions, with cout silenced meanwhile.
+   */
+  void initConfig() {
+    auto origBuf = cout.rdbuf();
+    cout.rdbuf(nullptr);  // detach the stream buffer of cout so nothing is printed
+    stringstream ss(configOptions);
+    config = std::unique_ptr<CConfig>(new CConfig(ss, SU2_CFD, false));
+    cout.rdbuf(origBuf);  // restore the original buffer
+  }
+
+  /*!
+   * \brief Initialize the geometry and run its preprocessing steps, with cout silenced meanwhile.
+   */
+  void initGeometry() {
+    auto origBuf = cout.rdbuf();
+    cout.rdbuf(nullptr);  // detach the stream buffer of cout so nothing is printed
+    {  // aux_geometry is only needed to construct geometry and is freed at the end of this scope
+      auto aux_geometry = std::unique_ptr<CGeometry>(new CPhysicalGeometry(config.get(), 0, 1));
+      geometry = std::unique_ptr<CGeometry>(new CPhysicalGeometry(aux_geometry.get(), config.get()));
+    }
+    geometry->SetSendReceive(config.get());
+    geometry->SetBoundaries(config.get());
+    geometry->SetPoint_Connectivity();
+    geometry->SetElement_Connectivity();
+    geometry->SetBoundVolume();
+    geometry->Check_IntElem_Orientation(config.get());
+    geometry->Check_BoundElem_Orientation(config.get());
+    geometry->SetEdges();
+    geometry->SetVertex(config.get());
+    geometry->SetCoord_CG();
+    geometry->SetControlVolume(config.get(), ALLOCATE);
+    geometry->SetBoundControlVolume(config.get(), ALLOCATE);
+    geometry->FindNormal_Neighbor(config.get());
+    geometry->SetGlobal_to_Local_Point();
+    geometry->PreprocessP2PComms(geometry.get(), config.get());
+
+    cout.rdbuf(origBuf);  // restore the original buffer
+  }
+};
+
+struct LinearFunction : public GradientTestBase {  // Linear scalar test field, its gradient is the same at every point.
+
+  const unsigned long nVar = 1;
+  const su2double constant = -1.0;             // value of the function where all coordinates are zero
+  const su2double slope[3] = {1.0, 2.0, 3.0};  // coefficients multiplying the coordinates
+
+  /*!
+   * \brief Return manufactured value, constant + DotProduct(slope, coord), at point iPoint (second index unused).
+   */
+  su2double operator() (unsigned long iPoint, unsigned long) const {
+    const auto coord = geometry->nodes->GetCoord(iPoint);
+    return constant + GeometryToolbox::DotProduct(geometry->GetnDim(), slope, coord);
+  }
+
+  /*!
+   * \brief Return reference value of the gradient, slope[iDim] (the first two indices are unused).
+   */
+  su2double grad(unsigned long, unsigned long, unsigned long iDim) const {
+    return slope[iDim];
+  }
+};
+
+/*--- Checks that the largest absolute difference between calc and ref.grad is below tol. ---*/
+template<class T, class U>
+void check(const T& ref, const U& calc, su2double tol = 1e-9) {
+  su2double maxError = 0.0;
+  for (auto i = 0ul; i < calc.length(); ++i)
+    for (auto j = 0ul; j < calc.rows(); ++j)
+      for (auto k = 0ul; k < calc.cols(); ++k)
+        maxError = max(maxError, abs(calc(i,j,k) - ref.grad(i,j,k)));
+  CHECK(maxError < tol);
+}
+
+/*--- Computes Green-Gauss gradients of a TestField and compares them with its grad() values. ---*/
+template<class TestField>
+void testGreenGauss() {
+  TestField field;
+  auto& geometry = *field.geometry;
+  C3DDoubleMatrix gradient(geometry.GetnPoint(), field.nVar, geometry.GetnDim());
+  computeGradientsGreenGauss(nullptr, SOLUTION, PERIODIC_NONE, geometry, *field.config, field, 0, field.nVar, gradient);
+  check(field, gradient);
+}
+
+/*--- Computes (optionally weighted) least-squares gradients of a TestField and compares them with its grad(). ---*/
+template<class TestField>
+void testLeastSquares(bool weighted) {
+  TestField field;
+  auto& geometry = *field.geometry;
+  C3DDoubleMatrix R(geometry.GetnPoint(), geometry.GetnDim(), geometry.GetnDim());
+  C3DDoubleMatrix gradient(geometry.GetnPoint(), field.nVar, geometry.GetnDim());
+  computeGradientsLeastSquares(nullptr, SOLUTION, PERIODIC_NONE, geometry, *field.config,
+                               weighted, field, 0, field.nVar, gradient, R);
+  check(field, gradient);
+}
+
+TEST_CASE("GG", "[Gradients]") {
+  testGreenGauss<LinearFunction>();  // Green-Gauss gradients of the linear test field
+}
+
+TEST_CASE("LS", "[Gradients]") {
+  testLeastSquares<LinearFunction>(false);  // least squares with weighted = false
+}
+
+TEST_CASE("WLS", "[Gradients]") {
+  testLeastSquares<LinearFunction>(true);  // least squares with weighted = true
+}
diff --git a/UnitTests/UnitQuadTestCase.hpp b/UnitTests/UnitQuadTestCase.hpp
index 468422699e2..39b9c54f56c 100644
--- a/UnitTests/UnitQuadTestCase.hpp
+++ b/UnitTests/UnitQuadTestCase.hpp
@@ -95,7 +95,8 @@ struct UnitQuadTestCase {
     geometry->SetPoint_Connectivity();
     geometry->SetElement_Connectivity();
     geometry->SetBoundVolume();
-
+    geometry->Check_IntElem_Orientation(config.get());
+    geometry->Check_BoundElem_Orientation(config.get());
     geometry->SetEdges();
     geometry->SetVertex(config.get());
     geometry->SetCoord_CG();
diff --git a/UnitTests/meson.build b/UnitTests/meson.build
index 3b1b9a489af..5457030f8d2 100644
--- a/UnitTests/meson.build
+++ b/UnitTests/meson.build
@@ -8,7 +8,9 @@ su2_cfd_tests = files(['Common/geometry/primal_grid/CPrimalGrid_tests.cpp',
                        'Common/geometry/dual_grid/CDualGrid_tests.cpp',
                        'Common/geometry/CGeometry_test.cpp',
                        'Common/toolboxes/CQuasiNewtonInvLeastSquares_tests.cpp',
-                       'SU2_CFD/numerics/CNumerics_tests.cpp'])
+                       'Common/vectorization.cpp',
+                       'SU2_CFD/numerics/CNumerics_tests.cpp',
+                       'SU2_CFD/gradients.cpp'])
 
 # Reverse-mode (algorithmic differentiation) tests:
 su2_cfd_tests_ad = files(['Common/simple_ad_test.cpp'])
diff --git a/config_template.cfg b/config_template.cfg
index 2d676eaae0f..c0e46381094 100644
--- a/config_template.cfg
+++ b/config_template.cfg
@@ -1025,8 +1025,9 @@ MG_DAMP_PROLONGATION= 0.75
 
 % -------------------- FLOW NUMERICAL METHOD DEFINITION -----------------------%
 %
-% Convective numerical method (JST, LAX-FRIEDRICH, CUSP, ROE, AUSM, AUSMPLUSUP,
-%                              AUSMPLUSUP2, HLLC, TURKEL_PREC, MSW, FDS, SLAU, SLAU2)
+% Convective numerical method (JST, JST_KE, JST_MAT, LAX-FRIEDRICH, CUSP, ROE, AUSM,
+%                              AUSMPLUSUP, AUSMPLUSUP2, AUSMPWPLUS, HLLC, TURKEL_PREC,
+%                              SW, MSW, FDS, SLAU, SLAU2, L2ROE, LMROE)
 CONV_NUM_METHOD_FLOW= ROE
 %
 % Roe Low Dissipation function for Hybrid RANS/LES simulations (FD, NTS, NTS_DUCROS)
@@ -1042,6 +1043,10 @@ LOW_MACH_PREC= NO
 % Slower per iteration but potentialy more stable and capable of higher CFL
 USE_ACCURATE_FLUX_JACOBIANS= NO
 %
+% Use the vectorized version of the selected numerical method (available for JST family and Roe).
+% SU2 should be compiled for an AVX or AVX512 architecture for best performance.
+USE_VECTORIZATION= NO
+%
 % Entropy fix coefficient (0.0 implies no entropy fixing, 1.0 implies scalar
 %                          artificial dissipation)
 ENTROPY_FIX_COEFF= 0.0