diff --git a/src/basic/sort.F90 b/src/basic/sort.F90
index 22add9d0426f97826297efa62f5ca5d39708176b..3d833f76839621e77d276b8d6f26aea3edfc93b9 100644
--- a/src/basic/sort.F90
+++ b/src/basic/sort.F90
@@ -29,7 +29,8 @@ module sort_oct_m
 
   private
   public ::                     &
-    sort
+    sort,                       &
+    robbust_sort_by_abs
 
   ! ------------------------------------------------------------------------------
   !> This is the common interface to a sorting routine.
@@ -171,6 +172,120 @@ contains
   end subroutine lsort
 
 
+  !>@brief Sorting criterium for the robbust sorting below
+  pure logical function less_idx(i, j, off, kabs, ksgn) result(less)
+    integer, intent(in) :: i, j
+    integer, intent(in) :: off(:, :)          ! (ndim, n)
+    integer(int64), intent(in) :: kabs(:), ksgn(:)
+    integer :: d, ndim
+
+    if (kabs(i) /= kabs(j)) then
+      less = kabs(i) < kabs(j)
+      return
+    end if
+
+    if (ksgn(i) /= ksgn(j)) then
+      less = ksgn(i) < ksgn(j)    ! 0 before 1
+      return
+    end if
+
+    ndim = size(off, 1)
+    do d = 1, ndim
+      if (off(d,i) /= off(d,j)) then
+        less = off(d,i) < off(d,j)
+        return
+      end if
+    end do
+
+    less = i < j                  ! final stability (should not matter if offsets are unique)
+  end function less_idx
+
+  !>@brief Perform the permutations for the sorting
+  recursive subroutine mergesort_perm(perm, tmp, l, r, off, kabs, ksgn)
+    integer, intent(inout) :: perm(:), tmp(:)
+    integer, intent(in)    :: l, r
+    integer, intent(in) :: off(:, :)
+    integer(int64), intent(in) :: kabs(:), ksgn(:)
+
+    integer :: m, i, j, k
+
+    if (r <= l) return
+    m = (l + r) / 2
+
+    call mergesort_perm(perm, tmp, l,   m, off, kabs, ksgn)
+    call mergesort_perm(perm, tmp, m+1, r, off, kabs, ksgn)
+
+    i = l; j = m+1; k = l
+    do while (i <= m .and. j <= r)
+      if (less_idx(perm(i), perm(j), off, kabs, ksgn)) then
+        tmp(k) = perm(i); i = i+1
+      else
+        tmp(k) = perm(j); j = j+1
+      end if
+      k = k+1
+    end do
+    do while (i <= m); tmp(k) = perm(i); i=i+1; k=k+1; end do
+    do while (j <= r); tmp(k) = perm(j); j=j+1; k=k+1; end do
+
+    perm(l:r) = tmp(l:r)
+  end subroutine mergesort_perm
+
+  !>@brief Robbust sorting of floating point numbers by absolute values
+  !!
+  !! This works as follow:
+  !!  - Convert the floating point number to integers
+  !!  - Put negative number first
+  !!  - Use the offset integer values, assumed to be unique key, to order the degenerated points
+  subroutine robbust_sort_by_abs(v, off, perm, negative_first)
+    real(real64), intent(in) :: v(:)
+    integer, intent(in)      :: off(:, :)              ! (ndim, n)
+    integer, intent(out)     :: perm(size(v))
+    logical, intent(in), optional :: negative_first
+
+    integer :: n, i
+    logical :: neg_first
+    integer, allocatable :: tmp(:)
+    integer(int64), allocatable :: kabs(:), ksgn(:)
+    real(real64) :: eps, ratio, max_ratio
+    integer(int64) :: kmax
+
+    PUSH_SUB(robbust_sort_by_abs)
+
+    n = size(v)
+    ASSERT(size(off, dim=2) == n)
+
+    neg_first = optional_default(negative_first, .true.)
+
+    ! Tolerance for the quantization, to eliminate noise
+    eps = 1.0e-14_real64 * maxval(abs(v))
+
+    kmax = huge(0_int64)
+    max_ratio = real(kmax, real64) - M_ONE
+
+    allocate(tmp(n), kabs(n), ksgn(n))
+
+    do i = 1, n
+      perm(i) = i
+      ratio = abs(v(i)) / eps
+      if (ratio < max_ratio) then
+        kabs(i) = nint( abs(v(i)) / eps, int64 )
+      else
+        kabs(i) = kmax
+      end if
+
+      if (neg_first) then
+        ksgn(i) = merge(0, 1, v(i) < 0.0_real64)   ! neg (0) then pos (1)
+      else
+        ksgn(i) = merge(0, 1, v(i) >= 0.0_real64)  ! pos (0) then neg (1)
+      end if
+    end do
+
+    call mergesort_perm(perm, tmp, 1, n, off, kabs, ksgn)
+
+    deallocate(tmp, kabs, ksgn)
+    POP_SUB(robbust_sort_by_abs)
+  end subroutine robbust_sort_by_abs
+
 #include "undef.F90"
 #include "complex.F90"
 #include "sort_inc.F90"
diff --git a/src/grid/CMakeLists.txt b/src/grid/CMakeLists.txt
index 2b277d8025a5ac9ec0c8792dfc85b52b10d23e77..e2557c23953b4b8b564ac18ab816e811f3f9606a 100644
--- a/src/grid/CMakeLists.txt
+++ b/src/grid/CMakeLists.txt
@@ -16,6 +16,7 @@ target_sources(Octopus_lib PRIVATE
 		curv_gygi.F90
 		curv_modine.F90
 		derivatives.F90
+                derivatives_test.F90
 		fourier_shell.F90
 		fourier_space.F90
 		grid.F90
diff --git a/src/grid/derivatives.F90 b/src/grid/derivatives.F90
index 6a8c3dcf9fa605f0145d52aa9253667581985a75..1352860a8ef6be74fb6fceb1136dcd6738fe4095 100644
--- a/src/grid/derivatives.F90
+++ b/src/grid/derivatives.F90
@@ -42,10 +42,8 @@ module derivatives_oct_m
   use, intrinsic :: iso_fortran_env
   use lalg_basic_oct_m
   use lalg_adv_oct_m
-  use loct_oct_m
   use math_oct_m
   use mesh_oct_m
-  use mesh_function_oct_m
   use messages_oct_m
   use namespace_oct_m
   use nl_operator_oct_m
@@ -64,13 +62,6 @@ module derivatives_oct_m
   use utils_oct_m
   use varinfo_oct_m
 
-!   debug purposes
-!   use io_binary_oct_m
-!   use io_function_oct_m
-!   use io_oct_m
-!   use unit_oct_m
-!   use unit_system_oct_m
-
   implicit none
 
   private
@@ -80,8 +71,6 @@ module derivatives_oct_m
     derivatives_end,                    &
     derivatives_build,                  &
     derivatives_handle_batch_t,         &
-    dderivatives_test,                  &
-    zderivatives_test,                  &
     dderivatives_batch_start,           &
     zderivatives_batch_start,           &
     dderivatives_batch_finish,          &
@@ -430,7 +419,7 @@ contains
     order_ = optional_default(order, der%order)
 
     ! initialize nl operator
-    call nl_operator_init(lapl, name_)
+    call nl_operator_init(lapl, name_, symm=OP_SYMMETRIC)
 
     ! create stencil
     select case (der%stencil_type)
@@ -484,7 +473,7 @@ contains
       dir_label = ' '
       if (ii < 5) dir_label = index2axis(ii)
 
-      call nl_operator_init(der%grad(ii), "Gradient "//dir_label)
+      call nl_operator_init(der%grad(ii), "Gradient "//dir_label, symm=OP_ANTISYMMETRIC)
 
       ! create stencil
       select case (der%stencil_type)
@@ -905,6 +894,9 @@ contains
         call nl_operator_remove_zero_weight_points(op(i), space, der%mesh)
       end if
       call nl_operator_output_weights(op(i))
+      if (op(i)%const_w) then
+        call nl_operator_build_symmetric_weights(op(i))
+      end if
     end do
 
     ! In case of constant weights, we store the weights of the Laplacian on the GPU, as this
diff --git a/src/grid/derivatives_inc.F90 b/src/grid/derivatives_inc.F90
index 99b247edfd880b20b1f191cb646a31546680566c..ecb628746326f771a75ee013a08d5f71f9e7215f 100644
--- a/src/grid/derivatives_inc.F90
+++ b/src/grid/derivatives_inc.F90
@@ -322,301 +322,6 @@ subroutine X(derivatives_curl)(der, ff, op_ff, ghost_update, set_bc)
   POP_SUB(X(derivatives_curl))
 end subroutine X(derivatives_curl)
 
-
-! ----------------------------------------------------------
-!> @brief unit test for derivatives
-!!
-!! This will be called from the test run mode.
-subroutine X(derivatives_test)(this, namespace, repetitions, min_blocksize, max_blocksize)
-  type(derivatives_t), intent(in) :: this
-  type(namespace_t),   intent(in) :: namespace
-  integer,             intent(in) :: repetitions
-  integer,             intent(in) :: min_blocksize
-  integer,             intent(in) :: max_blocksize
-
-  R_TYPE, allocatable :: ff(:), opff(:, :), gradff(:, :), curlff(:, :), res(:), resgrad(:, :), rescurl(:, :)
-  R_TYPE :: aa, bb, cc
-  integer :: ip, idir, ist, ib
-  type(batch_t) :: ffb, opffb
-  type(batch_t) :: gradffb(this%mesh%box%dim)
-  integer :: blocksize, itime
-  logical :: packstates
-  real(real64)   :: stime, etime
-  character(len=20) :: type
-  real(real64) :: norm
-
-  call parse_variable(namespace, 'StatesPack', .true., packstates)
-
-  SAFE_ALLOCATE(ff(1:this%mesh%np_part))
-  SAFE_ALLOCATE(opff(1:this%mesh%np, 1:this%mesh%box%dim))
-  SAFE_ALLOCATE(gradff(1:this%mesh%np_part, 1:this%mesh%box%dim))
-  SAFE_ALLOCATE(res(1:this%mesh%np_part))
-  SAFE_ALLOCATE(resgrad(1:this%mesh%np, 1:this%mesh%box%dim))
-  SAFE_ALLOCATE(curlff(1:this%mesh%np, 1:this%mesh%box%dim))
-
-#ifdef R_TREAL
-  type = 'real'
-#else
-  type = 'complex'
-#endif
-
-  ! Note: here we need to use a constant function or anything that
-  ! is constant at the borders, since we assume that all boundary
-  ! points have equal values to optimize the application of the nl-operator.
-
-  aa = R_TOTYPE(1.0)/this%mesh%box%bounding_box_l(1)
-  bb = R_TOTYPE(10.0)
-  cc = R_TOTYPE(100.0)
-
-#ifdef R_TCOMPLEX
-  ! we make things more "complex"
-  aa = aa + M_ZI*R_TOTYPE(0.01)
-  bb = bb*exp(M_ZI*R_TOTYPE(0.345))
-  cc = cc - M_ZI*R_TOTYPE(50.0)
-#endif
-
-
-  do ip = 1, this%mesh%np_part
-    ff(ip) = bb*exp(-aa*sum(this%mesh%x(:, ip)**2)) + cc
-  end do
-  do ip = 1, this%mesh%np
-    do idir = 1, this%mesh%box%dim
-      gradff(ip, idir) = -M_TWO*aa*bb*this%mesh%x(idir, ip)*exp(-aa*sum(this%mesh%x(:, ip)**2))
-    end do
-  end do
-
-  ! curl only needed in 3D
-  if (this%mesh%box%dim == 3) then
-    do ip = 1, this%mesh%np
-      curlff(ip, 1) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(2, ip) - this%mesh%x(3, ip))
-      curlff(ip, 2) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(3, ip) - this%mesh%x(1, ip))
-      curlff(ip, 3) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(1, ip) - this%mesh%x(2, ip))
-    end do
-  end if
-
-  message(1) = "Testing Laplacian"
-  call messages_info(1, namespace=namespace)
-
-  ! test Laplacian
-  blocksize = min_blocksize
-  do
-
-    call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part)
-    call X(batch_init)(opffb, 1, 1, blocksize, this%mesh%np)
-
-    do ist = 1, blocksize
-      call batch_set_state(ffb, ist, this%mesh%np_part, ff)
-    end do
-
-    if (packstates) then
-      call ffb%do_pack()
-      call opffb%do_pack(copy = .false.)
-    end if
-
-    if (repetitions > 1) then
-      call X(derivatives_batch_perform)(this%lapl, this, ffb, opffb, set_bc = .false., factor = M_HALF)
-    end if
-
-    stime = loct_clock()
-    do itime = 1, repetitions
-      call X(derivatives_batch_perform)(this%lapl, this, ffb, opffb, set_bc = .false., factor = M_HALF)
-    end do
-    etime = (loct_clock() - stime)/real(repetitions, real64)
-
-    call batch_get_state(opffb, blocksize, this%mesh%np, res)
-
-    call ffb%end()
-    call opffb%end()
-
-    do ip = 1, this%mesh%np
-      res(ip) = M_TWO*res(ip) - &
-        (M_FOUR*aa**2*bb*sum(this%mesh%x(:, ip)**2)*exp(-aa*sum(this%mesh%x(:, ip)**2)) &
-        - this%mesh%box%dim*M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2)))
-    end do
-
-    write(message(1), '(3a,i3,a,es17.10,a,f8.3)') &
-      'Laplacian ', trim(type),  &
-      ' bsize = ', blocksize,    &
-      ' , error = ', X(mf_nrm2)(this%mesh, res), &
-      ' , Gflops = ',  &
-#ifdef R_TREAL
-      blocksize*this%mesh%np*M_TWO*this%lapl%stencil%size/(etime*1.0e9_real64)
-#else
-    blocksize*this%mesh%np*M_FOUR*this%lapl%stencil%size/(etime*1.0e9_real64)
-#endif
-
-    call messages_info(1, namespace=namespace)
-
-    blocksize = 2*blocksize
-    if (blocksize > max_blocksize) exit
-
-  end do
-
-  message(1) = "Testing Gradient"
-  call messages_info(1, namespace=namespace)
-
-  ! test Gradient
-  blocksize = min_blocksize
-  do
-    call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part)
-
-    do ist = 1, blocksize
-      call batch_set_state(ffb, ist, this%mesh%np_part, ff)
-    end do
-
-    if (packstates) then
-      call ffb%do_pack()
-    end if
-
-    if (repetitions > 1) then
-      call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.)
-    end if
-
-    stime = loct_clock()
-    do itime = 1, repetitions
-      call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.)
-    end do
-    etime = (loct_clock() - stime)/real(repetitions, real64)
-
-    do idir = 1, this%mesh%box%dim
-      call batch_get_state(gradffb(idir), blocksize, this%mesh%np, resgrad(:, idir))
-    end do
-
-    call ffb%end()
-    do idir = 1, this%mesh%box%dim
-      call gradffb(idir)%end()
-    end do
-
-    do idir = 1, this%mesh%box%dim
-      do ip = 1, this%mesh%np
-        resgrad(ip, idir) = resgrad(ip, idir) - gradff(ip, idir)
-      end do
-    end do
-
-    write(message(1), '(3a,i3,a,es17.10,a,f8.3)') &
-      'Batch gradient ', trim(type),  &
-      ' bsize = ', blocksize,    &
-      ' , error (x direction) = ', X(mf_nrm2)(this%mesh, resgrad(:, 1)), &
-      ' , Gflops = ',  &
-#ifdef R_TREAL
-      blocksize*this%mesh%np*M_TWO*this%grad(1)%stencil%size*this%mesh%box%dim/(etime*1.0e9_real64)
-#else
-    blocksize*this%mesh%np*M_FOUR*this%grad(1)%stencil%size*this%mesh%box%dim/(etime*1.0e9_real64)
-#endif
-    call messages_info(1, namespace=namespace)
-
-    blocksize = 2*blocksize
-    if (blocksize > max_blocksize) exit
-  end do
-
-  call X(derivatives_grad)(this, ff, opff, set_bc = .false.)
-
-  do idir = 1, this%mesh%box%dim
-    do ip = 1, this%mesh%np
-      opff(ip, idir) = opff(ip, idir) - (-M_TWO*aa*bb*this%mesh%x_t(ip, idir)*exp(-aa*sum(this%mesh%x(:, ip)**2)))
-    end do
-  end do
-
-  message(1) = ''
-  call messages_info(1, namespace=namespace)
-
-  write(message(1), '(3a, es17.10)') 'Gradient ', trim(type),  &
-    ' err = ', X(mf_nrm2)(this%mesh, this%mesh%box%dim, opff)
-  call messages_info(1, namespace=namespace)
-
-  message(1) = ''
-  call messages_info(1, namespace=namespace)
-
-  ! test curl for 3D case
-  if (this%mesh%box%dim == 3) then
-    message(1) = "Testing curl"
-    call messages_info(1, namespace=namespace)
-
-    do blocksize = 3, 6, 3
-      call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part)
-
-      do ist = 1, blocksize
-        call batch_set_state(ffb, ist, this%mesh%np_part, ff)
-      end do
-
-      if (packstates) then
-        call ffb%do_pack()
-      end if
-
-      ! test for one blocksize without precomputed gradient
-      if (blocksize == 6) then
-        call X(derivatives_batch_curl)(this, ffb, set_bc=.false.)
-      else
-        call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.)
-        call X(derivatives_batch_curl_from_gradient)(this, ffb, gradffb)
-      end if
-
-      SAFE_ALLOCATE(rescurl(1:this%mesh%np, 1:blocksize))
-      do ist = 1, blocksize
-        call batch_get_state(ffb, ist, this%mesh%np, rescurl(:, ist))
-      end do
-
-      call ffb%end()
-      do idir = 1, this%mesh%box%dim
-        call gradffb(idir)%end()
-      end do
-
-      do ip = 1, this%mesh%np
-        do ib = 0, blocksize-1, this%mesh%box%dim
-          do idir = 1, this%mesh%box%dim
-            rescurl(ip, ib+idir) = rescurl(ip, ib+idir) - curlff(ip, idir)
-          end do
-        end do
-      end do
-
-      norm = M_ZERO
-      do ist = 1, blocksize
-        norm = norm + X(mf_nrm2)(this%mesh, rescurl(:, ist))
-      end do
-
-      write(message(1), '(3a,i3,a,es17.10,a,f8.3)') &
-        'Batch curl ', trim(type), ' bsize = ', blocksize, ' , error = ', norm
-      call messages_info(1, namespace=namespace)
-
-      SAFE_DEALLOCATE_A(rescurl)
-    end do
-
-    ! Testing the regular curl with the same logic as above
-    SAFE_ALLOCATE(rescurl(1:this%mesh%np, 1:this%mesh%box%dim))
-    do idir = 1, this%mesh%box%dim
-      call lalg_copy(this%mesh%np_part, ff, gradff(:, idir))
-    end do
-    call X(derivatives_curl)(this, gradff, rescurl, set_bc = .false.)
-
-    do idir = 1, this%mesh%box%dim
-      do ip = 1, this%mesh%np
-        opff(ip, idir) = rescurl(ip, idir) - curlff(ip, idir)
-      end do
-    end do
-
-    norm = M_ZERO
-    do ist = 1, this%mesh%box%dim
-      norm = norm + X(mf_nrm2)(this%mesh, opff(:, ist))
-    end do
-
-    write(message(1), '(3a, es17.10)') 'Curl ', trim(type), ' err = ', norm
-    call messages_info(1, namespace=namespace)
-
-    SAFE_DEALLOCATE_A(rescurl)
-  end if
-
-  message(1) = ''
-  call messages_info(1, namespace=namespace)
-
-  SAFE_DEALLOCATE_A(ff)
-  SAFE_DEALLOCATE_A(opff)
-  SAFE_DEALLOCATE_A(gradff)
-  SAFE_DEALLOCATE_A(res)
-  SAFE_DEALLOCATE_A(resgrad)
-  SAFE_DEALLOCATE_A(curlff)
-
-end subroutine X(derivatives_test)
-
 !=====================================================================================
 ! Batched versions of the routines:
 
diff --git a/src/grid/derivatives_test.F90 b/src/grid/derivatives_test.F90
new file mode 100644
index 0000000000000000000000000000000000000000..e1762695867b3359dee98eff70b4b2a18a10ea53
--- /dev/null
+++ b/src/grid/derivatives_test.F90
@@ -0,0 +1,219 @@
+!! Copyright (C) 2002-2006 M. Marques, A. Castro, A. Rubio, G. Bertsch
+!!
+!! This program is free software; you can redistribute it and/or modify
+!! it under the terms of the GNU General Public License as published by
+!! the Free Software Foundation; either version 2, or (at your option)
+!! any later version.
+!!
+!! This program is distributed in the hope that it will be useful,
+!! but WITHOUT ANY WARRANTY; without even the implied warranty of
+!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+!! GNU General Public License for more details.
+!!
+!! You should have received a copy of the GNU General Public License
+!! along with this program; if not, write to the Free Software
+!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+!! 02110-1301, USA.
+!!
+
+#include "global.h"
+
+!> @brief This module provides unit tests for the derivatives
+
+module derivatives_test_oct_m
+  use batch_oct_m
+  use batch_ops_oct_m
+  use boundaries_oct_m
+  use debug_oct_m
+  use derivatives_oct_m
+  use global_oct_m
+  use, intrinsic :: iso_fortran_env
+  use lalg_basic_oct_m
+  use lalg_adv_oct_m
+  use loct_oct_m
+  use mesh_oct_m
+  use mesh_function_oct_m
+  use messages_oct_m
+  use namespace_oct_m
+  use parser_oct_m
+  use profiling_oct_m
+  use space_oct_m
+  use utils_oct_m
+
+  implicit none
+
+  private
+  public ::                             &
+    dderivatives_test,                  &
+    zderivatives_test,                  &
+    derivatives_advanced_benchmark
+
+contains
+
+!>@brief Further unit tests design to challenge numerical stability of the finite differences
+  subroutine derivatives_advanced_benchmark(this, namespace)
+    type(derivatives_t), intent(in) :: this
+    type(namespace_t),   intent(in) :: namespace
+
+    real(real64), allocatable :: ff(:), grad(:,:), resgrad(:, :), lapl(:), reslapl(:)
+    real(real64) :: kvec(3)
+
+    SAFE_ALLOCATE(ff(1:this%mesh%np_part))
+    SAFE_ALLOCATE(grad(1:this%mesh%np, 1:this%mesh%box%dim))
+    SAFE_ALLOCATE(lapl(1:this%mesh%np))
+    SAFE_ALLOCATE(reslapl(1:this%mesh%np))
+    SAFE_ALLOCATE(resgrad(1:this%mesh%np, 1:this%mesh%box%dim))
+
+    ! Testing on some planewaves
+    kvec = [0.5_real64, 0.0_real64, 0.0_real64]         ! low frequency
+    call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec)
+    call test_function("Testing Planewave k=(0.5,0.0,0.0)")
+
+    kvec = [1.0_real64, 1.0_real64, 0.0_real64]         ! diagonal
+    call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec)
+    call test_function("Testing Planewave k=(1.0,1.0,0.0)")
+
+    kvec = [M_PI/this%mesh%spacing(1), 0.0_real64, 0.0_real64]            ! Nyquist
+    call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec)
+    call test_function("Testing Planewave k=(\pi/dr,0.0,0.0)")
+
+    kvec = [M_PI/this%mesh%spacing(1), M_PI/this%mesh%spacing(2), 0.0_real64]         ! diagonal Nyquist
+    call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec)
+    call test_function("Testing Planewave k=(\pi/dr,\pi/dr,0.0)")
+
+    ! Testing on some polynomials
+    call testfunc_polynomial(this%mesh, ff, grad, lapl, p=2)
+    call test_function("Testing degree-2 polynomial")
+
+    call testfunc_polynomial(this%mesh, ff, grad, lapl, p=3)
+    call test_function("Testing degree-3 polynomial")
+
+    call testfunc_polynomial(this%mesh, ff, grad, lapl, p=4)
+    call test_function("Testing degree-4 polynomial")
+
+    call testfunc_polynomial(this%mesh, ff, grad, lapl, p=6)
+    call test_function("Testing degree-6 polynomial")
+
+    call testfunc_polynomial(this%mesh, ff, grad, lapl, p=8)
+    call test_function("Testing degree-8 polynomial")
+
+
+    SAFE_DEALLOCATE_A(ff)
+    SAFE_DEALLOCATE_A(grad)
+    SAFE_DEALLOCATE_A(lapl)
+    SAFE_DEALLOCATE_A(resgrad)
+    SAFE_DEALLOCATE_A(reslapl)
+
+  contains
+    subroutine test_function(label)
+      character(len=*), intent(in) :: label
+
+      message(1) = trim(label)
+
+      call dderivatives_grad(this, ff, resgrad, set_bc=.false.)
+      resgrad = resgrad - grad
+      write(message(2), '(a, es17.10)') 'Gradient err = ', dmf_nrm2(this%mesh, this%mesh%box%dim, resgrad)
+
+      call dderivatives_lapl(this, ff, reslapl, set_bc=.false.)
+      reslapl = reslapl - lapl
+      write(message(3), '(a, es17.10)') 'Laplacian err = ', dmf_nrm2(this%mesh, reslapl)
+
+      call messages_info(3, namespace=namespace)
+    end subroutine test_function
+  end subroutine derivatives_advanced_benchmark
+
+!>@brief Generate a plane-wave given its momentum, as well as the gradient and the Laplacian
+  subroutine testfunc_plane_wave(mesh, ff, grad, lapl, kvec)
+    type(mesh_t), intent(in) :: mesh
+    real(real64), intent(out) :: ff(:), grad(:, :), lapl(:)
+    real(real64), intent(in) :: kvec(3)
+
+    integer :: ip, dim, idir
+    real(real64) :: phase, kdotx, kk
+
+    dim = mesh%box%dim
+
+    kk = dot_product(kvec(1:dim), kvec(1:dim))
+    do ip = 1, mesh%np
+      kdotx = dot_product(kvec(1:dim), mesh%x(1:dim, ip))
+      phase = cos(kdotx)
+
+      ff(ip) = phase
+
+      ! exact gradient
+      do idir = 1, mesh%box%dim
+        grad(ip, idir) = -kvec(idir)*sin(kdotx)
+      end do
+
+      ! exact Laplacian
+      lapl(ip) = -kk * phase
+    end do
+
+    do ip = mesh%np+1, mesh%np_part
+      kdotx = dot_product(kvec(1:dim), mesh%x(1:dim, ip))
+      phase = cos(kdotx)
+
+      ff(ip) = phase
+    end do
+  end subroutine testfunc_plane_wave
+
+!>@brief Generate a polyomial of degree p, as well as the gradient and the Laplacian
+  subroutine testfunc_polynomial(mesh, ff, grad, lapl, p)
+    type(mesh_t), intent(in) :: mesh
+    real(real64), intent(out) :: ff(:), grad(:, :), lapl(:)
+    integer, intent(in) :: p
+    integer :: ip, d, dim
+    real(real64) :: x(3), r2
+
+    dim = mesh%box%dim
+
+    do ip = 1, mesh%np
+      x = 0.0_real64
+      x(1:dim) = mesh%x(1:dim, ip)
+
+      ! f = (x^2 + y^2 + z^2)^(p/2)
+      r2 = dot_product(x(1:dim), x(1:dim))
+      ff(ip) = r2**(0.5_real64*p)
+
+      ! gradient: grad(f) = p * r^(p-2) * x
+      if (p >= 2) then
+        do d = 1, dim
+          grad(ip,d) = p * r2**(0.5_real64*p - 1.0_real64) * x(d)
+        end do
+      else
+        grad(ip,:) = 0.0_real64
+      end if
+
+      ! Laplacian: ∇² r^n = n (n + d - 2) r^(n-2)
+      if (p >= 2) then
+        lapl(ip) = p * (p + dim - 2) * r2**(0.5_real64*p - 1.0_real64)
+      else
+        lapl(ip) = 0.0_real64
+      end if
+    end do
+
+    do ip = mesh%np+1, mesh%np_part
+      x = 0.0_real64
+      x(1:dim) = mesh%x(1:dim, ip)
+
+      ! f = (x^2 + y^2 + z^2)^(p/2)
+      r2 = dot_product(x(1:dim), x(1:dim))
+      ff(ip) = r2**(0.5_real64*p)
+    end do
+  end subroutine testfunc_polynomial
+
+
+#include "undef.F90"
+#include "real.F90"
+#include "derivatives_test_inc.F90"
+
+#include "undef.F90"
+#include "complex.F90"
+#include "derivatives_test_inc.F90"
+
+end module derivatives_test_oct_m
+
+!! Local Variables:
+!! mode: f90
+!! coding: utf-8
+!! End:
diff --git a/src/grid/derivatives_test_inc.F90 b/src/grid/derivatives_test_inc.F90
new file mode 100644
index 0000000000000000000000000000000000000000..84028e3362ab4844c0acb3a56729d4f4be8027e9
--- /dev/null
+++ b/src/grid/derivatives_test_inc.F90
@@ -0,0 +1,312 @@
+!! Copyright (C) 2002-2006 M. Marques, A. Castro, A. Rubio, G. Bertsch
+!!
+!! This program is free software; you can redistribute it and/or modify
+!! it under the terms of the GNU General Public License as published by
+!! the Free Software Foundation; either version 2, or (at your option)
+!! any later version.
+!!
+!! This program is distributed in the hope that it will be useful,
+!! but WITHOUT ANY WARRANTY; without even the implied warranty of
+!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+!! GNU General Public License for more details.
+!!
+!! You should have received a copy of the GNU General Public License
+!! along with this program; if not, write to the Free Software
+!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+!! 02110-1301, USA.
+!!
+
+! ----------------------------------------------------------
+!> @brief unit test for derivatives
+!!
+!! This will be called from the test run mode.
+subroutine X(derivatives_test)(this, namespace, repetitions, min_blocksize, max_blocksize)
+  type(derivatives_t), intent(in) :: this
+  type(namespace_t),   intent(in) :: namespace
+  integer,             intent(in) :: repetitions
+  integer,             intent(in) :: min_blocksize
+  integer,             intent(in) :: max_blocksize
+
+  R_TYPE, allocatable :: ff(:), opff(:, :), gradff(:, :), curlff(:, :), res(:), resgrad(:, :), rescurl(:, :)
+  R_TYPE :: aa, bb, cc
+  integer :: ip, idir, ist, ib
+  type(batch_t) :: ffb, opffb
+  type(batch_t) :: gradffb(this%mesh%box%dim)
+  integer :: blocksize, itime
+  logical :: packstates
+  real(real64)   :: stime, etime
+  character(len=20) :: type
+  real(real64) :: norm
+
+  call parse_variable(namespace, 'StatesPack', .true., packstates)
+
+  SAFE_ALLOCATE(ff(1:this%mesh%np_part))
+  SAFE_ALLOCATE(opff(1:this%mesh%np, 1:this%mesh%box%dim))
+  SAFE_ALLOCATE(gradff(1:this%mesh%np_part, 1:this%mesh%box%dim))
+  SAFE_ALLOCATE(res(1:this%mesh%np_part))
+  SAFE_ALLOCATE(resgrad(1:this%mesh%np, 1:this%mesh%box%dim))
+  SAFE_ALLOCATE(curlff(1:this%mesh%np, 1:this%mesh%box%dim))
+
+#ifdef R_TREAL
+  type = 'real'
+#else
+  type = 'complex'
+#endif
+
+  ! Note: here we need to use a constant function or anything that
+  ! is constant at the borders, since we assume that all boundary
+  ! points have equal values to optimize the application of the nl-operator.
+
+  aa = R_TOTYPE(1.0)/this%mesh%box%bounding_box_l(1)
+  bb = R_TOTYPE(10.0)
+  cc = R_TOTYPE(100.0)
+
+#ifdef R_TCOMPLEX
+  ! we make things more "complex"
+  aa = aa + M_ZI*R_TOTYPE(0.01)
+  bb = bb*exp(M_ZI*R_TOTYPE(0.345))
+  cc = cc - M_ZI*R_TOTYPE(50.0)
+#endif
+
+
+  do ip = 1, this%mesh%np_part
+    ff(ip) = bb*exp(-aa*sum(this%mesh%x(:, ip)**2)) + cc
+  end do
+  do ip = 1, this%mesh%np
+    do idir = 1, this%mesh%box%dim
+      gradff(ip, idir) = -M_TWO*aa*bb*this%mesh%x(idir, ip)*exp(-aa*sum(this%mesh%x(:, ip)**2))
+    end do
+  end do
+
+  ! curl only needed in 3D
+  if (this%mesh%box%dim == 3) then
+    do ip = 1, this%mesh%np
+      curlff(ip, 1) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(2, ip) - this%mesh%x(3, ip))
+      curlff(ip, 2) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(3, ip) - this%mesh%x(1, ip))
+      curlff(ip, 3) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(1, ip) - this%mesh%x(2, ip))
+    end do
+  end if
+
+  message(1) = "Testing Laplacian"
+  call messages_info(1, namespace=namespace)
+
+  ! test Laplacian
+  blocksize = min_blocksize
+  do
+
+    call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part)
+    call X(batch_init)(opffb, 1, 1, blocksize, this%mesh%np)
+
+    do ist = 1, blocksize
+      call batch_set_state(ffb, ist, this%mesh%np_part, ff)
+    end do
+
+    if (packstates) then
+      call ffb%do_pack()
+      call opffb%do_pack(copy = .false.)
+    end if
+
+    if (repetitions > 1) then
+      call X(derivatives_batch_perform)(this%lapl, this, ffb, opffb, set_bc = .false., factor = M_HALF)
+    end if
+
+    stime = loct_clock()
+    do itime = 1, repetitions
+      call X(derivatives_batch_perform)(this%lapl, this, ffb, opffb, set_bc = .false., factor = M_HALF)
+    end do
+    etime = (loct_clock() - stime)/real(repetitions, real64)
+
+    call batch_get_state(opffb, blocksize, this%mesh%np, res)
+
+    call ffb%end()
+    call opffb%end()
+
+    do ip = 1, this%mesh%np
+      res(ip) = M_TWO*res(ip) - &
+        (M_FOUR*aa**2*bb*sum(this%mesh%x(:, ip)**2)*exp(-aa*sum(this%mesh%x(:, ip)**2)) &
+        - this%mesh%box%dim*M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2)))
+    end do
+
+    write(message(1), '(3a,i3,a,es17.10,a,f8.3)') &
+      'Laplacian ', trim(type),  &
+      ' bsize = ', blocksize,    &
+      ' , error = ', X(mf_nrm2)(this%mesh, res), &
+      ' , Gflops = ',  &
+#ifdef R_TREAL
+      blocksize*this%mesh%np*M_TWO*this%lapl%stencil%size/(etime*1.0e9_real64)
+#else
+    blocksize*this%mesh%np*M_FOUR*this%lapl%stencil%size/(etime*1.0e9_real64)
+#endif
+
+    call messages_info(1, namespace=namespace)
+
+    blocksize = 2*blocksize
+    if (blocksize > max_blocksize) exit
+
+  end do
+
+  message(1) = "Testing Gradient"
+  call messages_info(1, namespace=namespace)
+
+  ! test Gradient
+  blocksize = min_blocksize
+  do
+    call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part)
+
+    do ist = 1, blocksize
+      call batch_set_state(ffb, ist, this%mesh%np_part, ff)
+    end do
+
+    if (packstates) then
+      call ffb%do_pack()
+    end if
+
+    if (repetitions > 1) then
+      call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.)
+    end if
+
+    stime = loct_clock()
+    do itime = 1, repetitions
+      call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.)
+    end do
+    etime = (loct_clock() - stime)/real(repetitions, real64)
+
+    do idir = 1, this%mesh%box%dim
+      call batch_get_state(gradffb(idir), blocksize, this%mesh%np, resgrad(:, idir))
+    end do
+
+    call ffb%end()
+    do idir = 1, this%mesh%box%dim
+      call gradffb(idir)%end()
+    end do
+
+    do idir = 1, this%mesh%box%dim
+      do ip = 1, this%mesh%np
+        resgrad(ip, idir) = resgrad(ip, idir) - gradff(ip, idir)
+      end do
+    end do
+
+    write(message(1), '(3a,i3,a,es17.10,a,f8.3)') &
+      'Batch gradient ', trim(type),  &
+      ' bsize = ', blocksize,    &
+      ' , error (x direction) = ', X(mf_nrm2)(this%mesh, resgrad(:, 1)), &
+      ' , Gflops = ',  &
+#ifdef R_TREAL
+      blocksize*this%mesh%np*M_TWO*this%grad(1)%stencil%size*this%mesh%box%dim/(etime*1.0e9_real64)
+#else
+    blocksize*this%mesh%np*M_FOUR*this%grad(1)%stencil%size*this%mesh%box%dim/(etime*1.0e9_real64)
+#endif
+    call messages_info(1, namespace=namespace)
+
+    blocksize = 2*blocksize
+    if (blocksize > max_blocksize) exit
+  end do
+
+  call X(derivatives_grad)(this, ff, opff, set_bc = .false.)
+
+  do idir = 1, this%mesh%box%dim
+    do ip = 1, this%mesh%np
+      opff(ip, idir) = opff(ip, idir) - (-M_TWO*aa*bb*this%mesh%x(idir, ip)*exp(-aa*sum(this%mesh%x(:, ip)**2)))
+    end do
+  end do
+
+  message(1) = ''
+  call messages_info(1, namespace=namespace)
+
+  write(message(1), '(3a, es17.10)') 'Gradient ', trim(type),  &
+    ' err = ', X(mf_nrm2)(this%mesh, this%mesh%box%dim, opff)
+  call messages_info(1, namespace=namespace)
+
+  message(1) = ''
+  call messages_info(1, namespace=namespace)
+
+  ! test curl for 3D case
+  if (this%mesh%box%dim == 3) then
+    message(1) = "Testing curl"
+    call messages_info(1, namespace=namespace)
+
+    do blocksize = 3, 6, 3
+      call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part)
+
+      do ist = 1, blocksize
+        call batch_set_state(ffb, ist, this%mesh%np_part, ff)
+      end do
+
+      if (packstates) then
+        call ffb%do_pack()
+      end if
+
+      ! test for one blocksize without precomputed gradient
+      if (blocksize == 6) then
+        call X(derivatives_batch_curl)(this, ffb, set_bc=.false.)
+      else
+        call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.)
+        call X(derivatives_batch_curl_from_gradient)(this, ffb, gradffb)
+      end if
+
+      SAFE_ALLOCATE(rescurl(1:this%mesh%np, 1:blocksize))
+      do ist = 1, blocksize
+        call batch_get_state(ffb, ist, this%mesh%np, rescurl(:, ist))
+      end do
+
+      call ffb%end()
+      do idir = 1, this%mesh%box%dim
+        call gradffb(idir)%end()
+      end do
+
+      do ip = 1, this%mesh%np
+        do ib = 0, blocksize-1, this%mesh%box%dim
+          do idir = 1, this%mesh%box%dim
+            rescurl(ip, ib+idir) = rescurl(ip, ib+idir) - curlff(ip, idir)
+          end do
+        end do
+      end do
+
+      norm = M_ZERO
+      do ist = 1, blocksize
+        norm = norm + X(mf_nrm2)(this%mesh, rescurl(:, ist))
+      end do
+
+      write(message(1), '(3a,i3,a,es17.10,a,f8.3)') &
+        'Batch curl ', trim(type), ' bsize = ', blocksize, ' , error = ', norm
+      call messages_info(1, namespace=namespace)
+
+      SAFE_DEALLOCATE_A(rescurl)
+    end do
+
+    ! Testing the regular curl with the same logic as above
+    SAFE_ALLOCATE(rescurl(1:this%mesh%np, 1:this%mesh%box%dim))
+    do idir = 1, this%mesh%box%dim
+      call lalg_copy(this%mesh%np_part, ff, gradff(:, idir))
+    end do
+    call X(derivatives_curl)(this, gradff, rescurl, set_bc = .false.)
+
+    do idir = 1, this%mesh%box%dim
+      do ip = 1, this%mesh%np
+        opff(ip, idir) = rescurl(ip, idir) - curlff(ip, idir)
+      end do
+    end do
+
+    norm = M_ZERO
+    do ist = 1, this%mesh%box%dim
+      norm = norm + X(mf_nrm2)(this%mesh, opff(:, ist))
+    end do
+
+    write(message(1), '(3a, es17.10)') 'Curl ', trim(type), ' err = ', norm
+    call messages_info(1, namespace=namespace)
+
+    SAFE_DEALLOCATE_A(rescurl)
+  end if
+
+  message(1) = ''
+  call messages_info(1, namespace=namespace)
+
+  SAFE_DEALLOCATE_A(ff)
+  SAFE_DEALLOCATE_A(opff)
+  SAFE_DEALLOCATE_A(gradff)
+  SAFE_DEALLOCATE_A(res)
+  SAFE_DEALLOCATE_A(resgrad)
+  SAFE_DEALLOCATE_A(curlff)
+
+end subroutine X(derivatives_test)
+
diff --git a/src/grid/grid.F90 b/src/grid/grid.F90
index 5c9004161739ac8f68431861ac6563cd85adac7b..423e9200d71e8d4d29085f46423ad27fc1532b91 100644
--- a/src/grid/grid.F90
+++ b/src/grid/grid.F90
@@ -193,6 +193,7 @@ contains
     call initialize_coordinate_system(gr, namespace, space, grid_spacing, latt, n_sites, site_position)
 
     ! initialize derivatives
+    call nl_operator_global_init(namespace)
     call derivatives_init(gr%der, namespace, space, gr%coord_system)
     ! the stencil used to generate the grid is a union of a cube (for
     ! multigrid) and the Laplacian
@@ -254,6 +255,7 @@ contains
     call initialize_coordinate_system(gr, namespace, space, gr%spacing, latt, n_sites, site_position)
 
     ! initialize derivatives
+    call nl_operator_global_init(namespace)
     call derivatives_init(gr%der, namespace, space, gr%coord_system)
     ! the stencil used to generate the grid is a union of a cube (for
     ! multigrid) and the Laplacian
@@ -383,7 +385,6 @@ contains
 
     call mesh_init_stage_3(gr, namespace, space, gr%stencil, mc)
 
-    call nl_operator_global_init(namespace)
     call derivatives_build(gr%der, namespace, space, gr, qvector)
 
     ! print info concerning the grid
diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90
index f5f198c62e0cba8287c86d3120479e467f32c280..ed6d8be53e3d2c496cfb8cc3b233a684b2ad53a6 100644
--- a/src/grid/nl_operator.F90
+++ b/src/grid/nl_operator.F90
@@ -39,6 +39,7 @@ module nl_operator_oct_m
   use parser_oct_m
   use profiling_oct_m
   use space_oct_m
+  use sort_oct_m
   use stencil_oct_m
   use types_oct_m
   use varinfo_oct_m
@@ -66,7 +67,8 @@ module nl_operator_oct_m
     nl_operator_np_zero_bc,     &
     nl_operator_allocate_gpu_buffers, &
     nl_operator_update_gpu_buffers, &
-    nl_operator_remove_zero_weight_points
+    nl_operator_remove_zero_weight_points, &
+    nl_operator_build_symmetric_weights
 
   !> @brief index type for non-local operators
   !!
@@ -78,8 +80,15 @@ module nl_operator_oct_m
     integer, allocatable :: imin(:)
     integer, allocatable :: imax(:)
     integer, allocatable :: ri(:, :)
+    integer, allocatable :: ri_pos(:,:,:)
+    integer, allocatable :: ri_neg(:,:,:)
   end type nl_operator_index_t
 
+  integer, public, parameter ::  &
+    OP_GENERAL       = 1,   &
+    OP_SYMMETRIC     = 2,   &
+    OP_ANTISYMMETRIC = 3
+
   !> data type for non local operators
   type nl_operator_t
     private
@@ -95,6 +104,8 @@ module nl_operator_oct_m
     type(accel_mem_t), public :: buff_weights      !< buffer with constant weights
     type(accel_mem_t), public :: buff_half_weights !< buffer with weights multiplied by -1/2
 
+    integer                   :: symmetry = OP_GENERAL !< Symmetry of the weights to be used
+
     character(len=40) :: label
 
     !> the compressed index of grid points
@@ -103,6 +114,14 @@ module nl_operator_oct_m
     integer, allocatable, public :: rimap(:)
     integer, allocatable, public :: rimap_inv(:)
 
+    !> For symmetry-based stencils
+    integer :: npairs = 0
+    real(real64),allocatable :: wpair(:)
+    real(real64)             :: wcenter
+    integer, allocatable :: ri_pos(:,:,:)
+    integer, allocatable :: ri_neg(:,:,:)
+    integer :: max_allocated_ri_pair = 0
+
     integer                   :: ninner = 0
     integer                   :: nouter = 0
 
@@ -149,6 +168,7 @@ module nl_operator_oct_m
   integer :: dfunction_global = -1
   integer :: zfunction_global = -1
   integer :: function_accel
+  logical :: use_symmetries = .false.
 
 contains
 
@@ -196,6 +216,17 @@ contains
     call parse_variable(namespace, 'OperateComplex', default, zfunction_global)
     if (.not. varinfo_valid_option('OperateComplex', zfunction_global)) call messages_input_error(namespace, 'OperateComplex')
 
+    !%Variable OperateUseSymmetries
+    !%Type logical
+    !%Section Execution::Optimization
+    !%Default no
+    !%Description
+    !% This variable selects if the operators are built using symmetries or not.
+    !%End
+    call parse_variable(namespace, 'OperateUseSymmetries', .false., use_symmetries)
+    if (accel_is_enabled() .and. use_symmetries) then
+      call messages_not_implemented("OperateUseSymmetries=yes with GPUs")
+    end if
 
     if (accel_is_enabled()) then
 
@@ -232,13 +263,18 @@ contains
   !> @brief initialize an instance of a non-local operator
   !! by setting the label
   !!
-  subroutine nl_operator_init(op, label)
+  subroutine nl_operator_init(op, label, symm)
     type(nl_operator_t), intent(inout) :: op
     character(len=*),    intent(in)    :: label
+    integer, optional,   intent(in)    :: symm
 
     PUSH_SUB(nl_operator_init)
 
     op%label = label
+    op%symmetry = OP_GENERAL
+    if (use_symmetries) then
+      op%symmetry = optional_default(symm, OP_GENERAL)
+    end if
 
     POP_SUB(nl_operator_init)
   end subroutine nl_operator_init
@@ -261,6 +297,7 @@ contains
 
     opo%np           =  opi%np
     opo%mesh         => opi%mesh
+    opo%symmetry     =  opi%symmetry
 
     SAFE_ALLOCATE_SOURCE_A(opo%nn, opi%nn)
     SAFE_ALLOCATE_SOURCE_A(opo%w, opi%w)
@@ -274,6 +311,14 @@ contains
     SAFE_ALLOCATE_SOURCE_A(opo%rimap, opi%rimap)
     SAFE_ALLOCATE_SOURCE_A(opo%rimap_inv, opi%rimap_inv)
 
+    opo%npairs = opi%npairs
+    if (opi%symmetry /= OP_GENERAL) then
+      opo%wcenter = opi%wcenter
+      SAFE_ALLOCATE_SOURCE_A(opo%wpair, opi%wpair)
+      SAFE_ALLOCATE_SOURCE_A(opo%ri_pos, opi%ri_pos)
+      SAFE_ALLOCATE_SOURCE_A(opo%ri_neg, opi%ri_neg)
+    end if
+
     if (opi%mesh%parallel_in_domains) then
       opo%inner%nri = opi%inner%nri
       SAFE_ALLOCATE_SOURCE_A(opo%inner%imin, opi%inner%imin)
@@ -284,6 +329,13 @@ contains
       SAFE_ALLOCATE_SOURCE_A(opo%outer%imin, opi%outer%imin)
       SAFE_ALLOCATE_SOURCE_A(opo%outer%imax, opi%outer%imax)
       SAFE_ALLOCATE_SOURCE_A(opo%outer%ri,   opi%outer%ri)
+
+      if (opi%symmetry /= OP_GENERAL) then
+        SAFE_ALLOCATE_SOURCE_A(opo%inner%ri_pos, opi%inner%ri_pos)
+        SAFE_ALLOCATE_SOURCE_A(opo%inner%ri_neg, opi%inner%ri_neg)
+        SAFE_ALLOCATE_SOURCE_A(opo%outer%ri_pos, opi%outer%ri_pos)
+        SAFE_ALLOCATE_SOURCE_A(opo%outer%ri_neg, opi%outer%ri_neg)
+      end if
     end if
 
     ! We create the corresponding GPU buffers
@@ -627,6 +679,14 @@ contains
     SAFE_DEALLOCATE_A(op%rimap_inv)
     SAFE_DEALLOCATE_A(op%nn)
 
+    SAFE_DEALLOCATE_A(op%wpair)
+    SAFE_DEALLOCATE_A(op%ri_pos)
+    SAFE_DEALLOCATE_A(op%ri_neg)
+
+    SAFE_DEALLOCATE_A(op%inner%ri_pos)
+    SAFE_DEALLOCATE_A(op%inner%ri_neg)
+    SAFE_DEALLOCATE_A(op%outer%ri_pos)
+    SAFE_DEALLOCATE_A(op%outer%ri_neg)
     call stencil_end(op%stencil)
 
     POP_SUB(nl_operator_end)
@@ -762,6 +822,238 @@ contains
     POP_SUB(nl_operator_remove_zero_weight_points)
   end subroutine nl_operator_remove_zero_weight_points
 
+!> Take a list of weights and offsets and build pairs of symmetric points with common weights
+  subroutine group_by_pairs_sym(size, ldf, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg, wcenter)
+    integer,                    intent(in)  :: size
+    integer,                    intent(in)  :: ldf
+    integer,                    intent(in)  :: offsets(:, :)
+    real(real64),               intent(in)  :: wre(:)
+    integer,                    intent(in)  :: ri(:, :)
+    integer,                    intent(in)  :: nri
+    integer,                    intent(out) :: npairs
+    real(real64),             intent(inout) :: wpair(:)
+    integer,                  intent(inout) :: pair_pos(:,:), pair_neg(:,:)
+    real(real64),               intent(out) :: wcenter
+
+    logical, allocatable :: used(:)
+    integer :: i, j, ndim, s
+    logical :: same
+    integer, allocatable :: idx(:)
+
+    real(real64), parameter :: tol = 1.0e-12_real64
+
+    PUSH_SUB(group_by_pairs_sym)
+
+    ASSERT(mod(size,2) == 1)
+
+    SAFE_ALLOCATE(used(1:size))
+    used = .false.
+    npairs = 0
+
+    ndim = ubound(offsets, dim=1)
+
+    SAFE_ALLOCATE(idx(1:size))
+    call robbust_sort_by_abs(wre, offsets, idx)
+
+    do i = 1, size
+      if (used(i)) cycle
+
+      if (all(offsets(:, idx(i))==0)) then
+        wcenter = wre(idx(i))
+        used(i) = .true.
+        cycle
+      end if
+
+      ! Try to find symmetric partner j
+      do j = i+1, size
+        if (used(j)) cycle
+
+        ! Weight equality
+        same = abs(wre(idx(i)) - wre(idx(j))) <= tol
+        if (.not. same) cycle
+
+        ! Offsets equal and opposite
+        if (any(offsets(:,idx(j))+offsets(:, idx(i)) /= 0)) cycle
+
+        npairs = npairs + 1
+        do s = 1, nri
+          pair_pos(npairs, s) = ishft(ri(idx(i), s), ldf)
+          pair_neg(npairs, s) = ishft(ri(idx(j), s), ldf)
+        end do
+        wpair(npairs)  = M_HALF*(wre(idx(i)) + wre(idx(j)))
+
+        used(i) = .true.
+        used(j) = .true.
+        exit
+      end do
+    end do
+
+    ASSERT(npairs == size/2)
+
+    SAFE_DEALLOCATE_A(idx)
+
+    POP_SUB(group_by_pairs_sym)
+  end subroutine group_by_pairs_sym
+
+!> Take a list of weights and offsets and build pairs of symmetric points with common weights
+  subroutine group_by_pairs_antisym(size, ldf, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg)
+    integer,                    intent(in)  :: size
+    integer,                    intent(in)  :: ldf
+    integer,                    intent(in)  :: offsets(:, :)
+    real(real64),               intent(in)  :: wre(:)
+    integer,                    intent(in)  :: ri(:, :)
+    integer,                    intent(in)  :: nri
+    integer,                   intent(out)  :: npairs
+    real(real64),             intent(inout) :: wpair(:)
+    integer,                  intent(inout) :: pair_pos(:,:), pair_neg(:,:)
+
+    logical, allocatable :: used(:)
+    integer :: i, j, ndim, s
+    logical :: same
+    integer, allocatable :: idx(:)
+
+    real(real64), parameter :: tol = 1.0e-12_real64
+
+    PUSH_SUB(group_by_pairs_antisym)
+
+    ASSERT(mod(size,2) == 0)
+
+    SAFE_ALLOCATE(used(1:size))
+    used = .false.
+    npairs = 0
+
+    ndim = ubound(offsets, dim=1)
+
+    SAFE_ALLOCATE(idx(1:size))
+    call robbust_sort_by_abs(wre, offsets, idx)
+
+    ! Max pairs = n/2
+    do i = 1, size
+      if (used(i)) cycle
+
+      ! Try to find symmetric partner j
+      do j = i+1, size
+        if (used(j)) cycle
+
+        ! Weight equality
+        same = abs(wre(idx(i)) + wre(idx(j))) <= tol
+        if (.not. same) cycle
+
+        ! Offsets equal and opposite
+        if (any(offsets(:,idx(j))+offsets(:, idx(i)) /= 0)) cycle
+
+        npairs = npairs + 1
+        do s = 1, nri
+          pair_pos(npairs, s) = ishft(ri(idx(i), s), ldf)
+          pair_neg(npairs, s) = ishft(ri(idx(j), s), ldf)
+        end do
+        wpair(npairs)  = M_HALF*(wre(idx(i)) - wre(idx(j)))
+
+        used(i) = .true.
+        used(j) = .true.
+        exit
+      end do
+    end do
+
+    ASSERT(npairs == size/2)
+
+    SAFE_DEALLOCATE_A(idx)
+    POP_SUB(group_by_pairs_antisym)
+  end subroutine group_by_pairs_antisym
+
+  !>@brief Builds (or rebuild) the necessary arrays for symmetric and antisymmetric stencils
+  subroutine nl_operator_build_symmetric_weights(op, max_size)
+    type(nl_operator_t), intent(inout)   :: op
+    integer, optional,   intent(in)      :: max_size !< For reallocation
+
+    integer :: ldf, start, end, ipair
+
+    if (op%symmetry == OP_GENERAL) return
+
+    PUSH_SUB(nl_operator_build_symmetric_weights)
+
+    ASSERT(op%const_w)
+    ASSERT(conf%target_states_block_size > 0)
+
+    if(present(max_size)) then
+      start = op%max_allocated_ri_pair + 1
+      end = max_size
+      call reallocate_array(op%ri_pos, op%stencil%size/2, op%nri, op%max_allocated_ri_pair, end)
+      call reallocate_array(op%ri_neg, op%stencil%size/2, op%nri, op%max_allocated_ri_pair, end)
+      if (op%mesh%parallel_in_domains) then
+        call reallocate_array(op%inner%ri_pos, op%stencil%size/2, op%inner%nri, op%max_allocated_ri_pair, end)
+        call reallocate_array(op%inner%ri_neg, op%stencil%size/2, op%inner%nri, op%max_allocated_ri_pair, end)
+        call reallocate_array(op%outer%ri_pos, op%stencil%size/2, op%outer%nri, op%max_allocated_ri_pair, end)
+        call reallocate_array(op%outer%ri_neg, op%stencil%size/2, op%outer%nri, op%max_allocated_ri_pair, end)
+      end if
+    else
+      ! Max pairs = n/2
+      SAFE_ALLOCATE(op%wpair(1:op%stencil%size/2))
+      start = 1
+      end = log2(conf%target_states_block_size)+1
+
+      SAFE_ALLOCATE(op%ri_pos(1:op%stencil%size/2, 1:op%nri, 1:end))
+      SAFE_ALLOCATE(op%ri_neg(1:op%stencil%size/2, 1:op%nri, 1:end))
+      if (op%mesh%parallel_in_domains) then
+        SAFE_ALLOCATE(op%inner%ri_pos(1:op%stencil%size/2, 1:op%inner%nri, 1:end))
+        SAFE_ALLOCATE(op%inner%ri_neg(1:op%stencil%size/2, 1:op%inner%nri, 1:end))
+        SAFE_ALLOCATE(op%outer%ri_pos(1:op%stencil%size/2, 1:op%outer%nri, 1:end))
+        SAFE_ALLOCATE(op%outer%ri_neg(1:op%stencil%size/2, 1:op%outer%nri, 1:end))
+      end if
+    end if
+    op%max_allocated_ri_pair = end
+
+    do ldf = start-1, end-1
+      select case(op%symmetry)
+      case(OP_SYMMETRIC)
+        call group_by_pairs_sym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%ri, op%nri, &
+          op%npairs, op%wpair, op%ri_pos(:,:,ldf+1), op%ri_neg(:,:,ldf+1), op%wcenter)
+        if (op%mesh%parallel_in_domains) then
+          call group_by_pairs_sym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, &
+            op%npairs, op%wpair, op%inner%ri_pos(:,:,ldf+1), op%inner%ri_neg(:,:,ldf+1), op%wcenter)
+          call group_by_pairs_sym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, &
+            op%npairs, op%wpair, op%outer%ri_pos(:,:,ldf+1), op%outer%ri_neg(:,:,ldf+1), op%wcenter)
+        end if
+      case(OP_ANTISYMMETRIC)
+        call group_by_pairs_antisym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%ri, op%nri, &
+          op%npairs, op%wpair, op%ri_pos(:,:,ldf+1), op%ri_neg(:,:,ldf+1))
+        if (op%mesh%parallel_in_domains) then
+          call group_by_pairs_antisym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, &
+            op%npairs, op%wpair, op%inner%ri_pos(:,:,ldf+1), op%inner%ri_neg(:,:,ldf+1))
+          call group_by_pairs_antisym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, &
+            op%npairs, op%wpair, op%outer%ri_pos(:,:,ldf+1), op%outer%ri_neg(:,:,ldf+1))
+        end if
+      end select
+    end do
+
+    if (.not. present(max_size)) then
+      write(message(1), '(3a)') 'Debug info: Sorted weights for ', trim(op%label), '.'
+      call messages_info(1, debug_only=.true.)
+
+      do ipair = 1, op%npairs
+        write(message(1), '(a,i3,f25.10,2(1x,i4))') '      ', ipair, op%wpair(ipair), op%ri_pos(ipair,1,1), op%ri_neg(ipair,1,1)
+        call messages_info(1, debug_only=.true.)
+      end do
+    end if
+
+    POP_SUB(nl_operator_build_symmetric_weights)
+  end subroutine nl_operator_build_symmetric_weights
+
+  !>@brief Reallocate an ri array
+  subroutine reallocate_array(ri, stencil_size, nri, old_size, new_size)
+    integer,      allocatable, intent(inout) :: ri(:,:,:)
+    integer,                   intent(in)    :: stencil_size, nri
+    integer,                   intent(in)    :: old_size, new_size
+
+    integer, allocatable :: tmp(:,:,:)
+
+    SAFE_ALLOCATE_SOURCE_A(tmp, ri)
+    SAFE_DEALLOCATE_A(ri)
+    SAFE_ALLOCATE(ri(1:stencil_size, 1:nri, 1:new_size))
+    ri(:,:,1:old_size) = tmp
+    SAFE_DEALLOCATE_A(tmp)
+  end subroutine reallocate_array
+
 #include "undef.F90"
 #include "real.F90"
 #include "nl_operator_inc.F90"
diff --git a/src/grid/nl_operator_inc.F90 b/src/grid/nl_operator_inc.F90
index d2eb96aec17fe28b7f5c7f4ba3ba4673ddc2fe5f..6ba64178b3a64a48e88a3527757f1df19c9d22bd 100644
--- a/src/grid/nl_operator_inc.F90
+++ b/src/grid/nl_operator_inc.F90
@@ -19,7 +19,7 @@
 ! ---------------------------------------------------------
 
 subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, points, factor, async)
-  type(nl_operator_t), target, intent(in)    :: op
+  type(nl_operator_t), target, intent(inout) :: op
   type(batch_t),       target, intent(inout) :: fi
   type(batch_t),               intent(inout) :: fo !< this should be target, but old ifort 9.1 segfaults with it
   logical,         optional,   intent(in)    :: ghost_update
@@ -33,14 +33,17 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point
   logical :: ghost_update_, profile_, use_accel
   integer :: nri
   integer, pointer :: imin(:), imax(:), ri(:, :)
-!  real(real64), allocatable :: wre(:), wim(:)
-  R_BASE, allocatable :: wre(:), wim(:)
+  R_BASE, allocatable :: wre(:)
 #ifdef R_TREAL
   integer, parameter :: logldf = 0
 #else
   integer, parameter :: logldf = 1
 #endif
   integer :: nri_loc, ini
+  !> Symmetry of the stencil can be used to group points by pairs
+  real(real64),   allocatable :: wpair(:)
+  integer, pointer :: ri_pos(:,:), ri_neg(:,:)
+  real(real64) :: wcenter
 
   PUSH_SUB(X(nl_operator_operate_batch))
 
@@ -75,6 +78,15 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point
     if (present(factor)) then
       wre(1:op%stencil%size) = wre(1:op%stencil%size)*factor
     end if
+
+    if (op%symmetry /= OP_GENERAL) then
+      SAFE_ALLOCATE_SOURCE_A(wpair, op%wpair)
+      wcenter = op%wcenter
+      if (present(factor)) then
+        wpair = wpair * factor
+        wcenter = wcenter * factor
+      end if
+    end if
   end if
 
   use_accel = .false.
@@ -86,10 +98,19 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point
       use_accel = .true.
       call operate_accel()
     else if (X(function_global) == OP_FORTRAN) then
-      call operate_const_weights()
+      select case(op%symmetry)
+      case(OP_ANTISYMMETRIC)
+        call operate_const_weights_antisymmetric()
+      case(OP_SYMMETRIC)
+        call operate_const_weights_symmetric()
+      case(OP_GENERAL)
+        call operate_const_weights()
+      case default
+        ASSERT(.false.)
+      end select
     else
 
-! for the moment this is not implemented
+      ! for the moment this is not implemented
       !$omp parallel private(ini, nri_loc, ist)
       call multicomm_divide_range_omp(nri, ini, nri_loc)
 
@@ -98,16 +119,38 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point
         ASSERT(ubound(fi%X(ff_pack), dim = 2) >= op%mesh%np_part)
         ASSERT(ubound(fo%X(ff_pack), dim = 2) >= op%mesh%np)
 
-        call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), &
-          fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1))
+        select case(op%symmetry)
+        case(OP_GENERAL)
+          call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), &
+            fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1))
+        case(OP_SYMMETRIC)
+          call X(operate_ri_sym_vec)(op%npairs, wpair(1), wcenter, nri_loc, ri_pos(1, ini), ri_neg(1, ini), &
+            imin(ini), imax(ini), fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1))
+        case(OP_ANTISYMMETRIC)
+          call X(operate_ri_antisym_vec)(op%npairs, wpair(1), nri_loc, ri_pos(1, ini), ri_neg(1, ini), &
+            imin(ini), imax(ini), fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1))
+        case default
+          ASSERT(.false.)
+        end select
       else
         do ist = 1, fi%nst_linear
 
           ASSERT(ubound(fi%X(ff_linear), dim=1) == op%mesh%np_part)
           ASSERT(ubound(fo%X(ff_linear), dim=1) >= op%mesh%np)
 
-          call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), &
-            fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist))
+          select case(op%symmetry)
+          case(OP_GENERAL)
+            call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), &
+              fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist))
+          case(OP_SYMMETRIC)
+            call X(operate_ri_sym_vec)(op%npairs, wpair(1), wcenter, nri_loc, ri_pos(1, ini), ri_neg(1, ini), &
+              imin(ini), imax(ini), fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist))
+          case(OP_ANTISYMMETRIC)
+            call X(operate_ri_antisym_vec)(op%npairs, wpair(1), nri_loc, ri_pos(1, ini), ri_neg(1, ini), &
+              imin(ini), imax(ini), fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist))
+          case default
+            ASSERT(.false.)
+          end select
         end do
       end if
       !$omp end parallel
@@ -121,7 +164,7 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point
   end if
 
   SAFE_DEALLOCATE_A(wre)
-  SAFE_DEALLOCATE_A(wim)
+  SAFE_DEALLOCATE_A(wpair)
 
   if (profile_) call profiling_out(TOSTRING(X(NL_OPERATOR_BATCH)))
   POP_SUB(X(nl_operator_operate_batch))
@@ -131,24 +174,50 @@ contains
   ! ---------------------------------------------------------
   subroutine select_op()
 
+    integer :: ldf
+
     PUSH_SUB(X(nl_operator_operate_batch).select_op)
 
+    ldf = logldf+1
+    if (fi%status() == BATCH_PACKED) then
+      ldf = log2(int(fi%pack_size_real(1), int32))+1
+    end if
+
+    if (X(function_global) == OP_FORTRAN .or. op%symmetry == OP_GENERAL) ldf = 1
+
+    ! If needed, we realloc the offsets
+    if (op%symmetry /= OP_GENERAL .and. ldf > op%max_allocated_ri_pair) then
+      call nl_operator_build_symmetric_weights(op, ldf)
+    end if
+
     select case (points_)
     case (OP_ALL)
       nri  =  op%nri
       imin => op%rimap_inv(1:)
       imax => op%rimap_inv(2:)
       ri   => op%ri
+      if (op%symmetry /= OP_GENERAL) then
+        ri_pos   => op%ri_pos(:,:,ldf)
+        ri_neg   => op%ri_neg(:,:,ldf)
+      end if
     case (OP_INNER)
       nri  =  op%inner%nri
       imin => op%inner%imin
       imax => op%inner%imax
       ri   => op%inner%ri
+      if (op%symmetry /= OP_GENERAL) then
+        ri_pos   => op%inner%ri_pos(:,:,ldf)
+        ri_neg   => op%inner%ri_neg(:,:,ldf)
+      end if
     case (OP_OUTER)
       nri  =  op%outer%nri
       imin => op%outer%imin
       imax => op%outer%imax
       ri   => op%outer%ri
+      if (op%symmetry /= OP_GENERAL) then
+        ri_pos   => op%outer%ri_pos(:,:,ldf)
+        ri_neg   => op%outer%ri_neg(:,:,ldf)
+      end if
     case default
       ASSERT(.false.)
     end select
@@ -203,6 +272,109 @@ contains
     POP_SUB(X(nl_operator_operate_batch).operate_const_weights)
   end subroutine operate_const_weights
 
+  ! ---------------------------------------------------------
+  subroutine operate_const_weights_antisymmetric()
+    integer :: nn, ll, ii, ist, ipair
+
+    PUSH_SUB(X(nl_operator_operate_batch).operate_const_weights_antisymmetric)
+
+    nn = op%stencil%size
+
+    select case (fi%status())
+
+    case (BATCH_DEVICE_PACKED)
+
+      ASSERT(.false.)
+
+    case (BATCH_NOT_PACKED)
+
+      !$omp parallel do private(ll, ist, ii, ipair)
+      do ll = 1, nri
+        do ii = imin(ll) + 1, imax(ll)
+          do ist = 1, fi%nst_linear
+            fo%X(ff_linear)(ii, ist) = M_ZERO
+            do ipair = 1, op%npairs
+              fo%X(ff_linear)(ii, ist) = fo%X(ff_linear)(ii, ist) + &
+                wpair(ipair)*(fi%X(ff_linear)(ii + ri_pos(ipair, ll), ist) - fi%X(ff_linear)(ii + ri_neg(ipair, ll), ist))
+            end do
+          end do
+        end do
+      end do
+      !$omp end parallel do
+
+    case (BATCH_PACKED)
+
+      !$omp parallel do private(ll, ist, ii, ipair)
+      do ll = 1, nri
+        do ii = imin(ll) + 1, imax(ll)
+          do ist = 1, fi%nst_linear
+            fo%X(ff_pack)(ist, ii) = M_ZERO
+            do ipair = 1, op%npairs
+              fo%X(ff_pack)(ist, ii) = fo%X(ff_pack)(ist, ii) &
+                + wpair(ipair) * (fi%X(ff_pack)(ist, ii + ri_pos(ipair, ll)) - fi%X(ff_pack)(ist, ii + ri_neg(ipair, ll)))
+            end do
+          end do
+        end do
+      end do
+      !$omp end parallel do
+
+    end select
+
+    POP_SUB(X(nl_operator_operate_batch).operate_const_weights_antisymmetric)
+  end subroutine operate_const_weights_antisymmetric
+
+  ! ---------------------------------------------------------
+  subroutine operate_const_weights_symmetric()
+    integer :: nn, ll, ii, ist, ipair
+
+    PUSH_SUB(X(nl_operator_operate_batch).operate_const_weights_symmetric)
+
+    nn = op%stencil%size
+
+    select case (fi%status())
+
+    case (BATCH_DEVICE_PACKED)
+
+      ASSERT(.false.)
+
+    case (BATCH_NOT_PACKED)
+
+      !$omp parallel do private(ll, ist, ii, ipair)
+      do ll = 1, nri
+        do ii = imin(ll) + 1, imax(ll)
+          do ist = 1, fi%nst_linear
+            fo%X(ff_linear)(ii, ist) = M_ZERO
+            do ipair = 1, op%npairs
+              fo%X(ff_linear)(ii, ist) = fo%X(ff_linear)(ii, ist) + &
+                wpair(ipair)*(fi%X(ff_linear)(ii + ri_pos(ipair, ll), ist) + fi%X(ff_linear)(ii + ri_neg(ipair, ll), ist))
+            end do
+            fo%X(ff_linear)(ii, ist) = fo%X(ff_linear)(ii, ist) + wcenter * fi%X(ff_linear)(ii, ist)
+          end do
+        end do
+      end do
+      !$omp end parallel do
+
+    case (BATCH_PACKED)
+
+      !$omp parallel do private(ll, ist, ii, ipair)
+      do ll = 1, nri
+        do ii = imin(ll) + 1, imax(ll)
+          do ist = 1, fi%nst_linear
+            fo%X(ff_pack)(ist, ii) = M_ZERO
+            do ipair = 1, op%npairs
+              fo%X(ff_pack)(ist, ii) = fo%X(ff_pack)(ist, ii) &
+                + wpair(ipair) * (fi%X(ff_pack)(ist, ii + ri_pos(ipair, ll)) + fi%X(ff_pack)(ist, ii + ri_neg(ipair, ll)))
+            end do
+            fo%X(ff_pack)(ist, ii) = fo%X(ff_pack)(ist, ii) + wcenter * fi%X(ff_pack)(ist, ii)
+          end do
+        end do
+      end do
+      !$omp end parallel do
+
+    end select
+
+    POP_SUB(X(nl_operator_operate_batch).operate_const_weights_symmetric)
+  end subroutine operate_const_weights_symmetric
 
   ! ---------------------------------------------------------
   subroutine operate_non_const_weights()
@@ -471,7 +643,7 @@ end subroutine X(nl_operator_operate_batch)
 
 subroutine X(nl_operator_operate)(op, fi, fo, ghost_update, profile, points)
   R_TYPE, contiguous,         intent(inout) :: fi(:)  !< fi(op%np_part)
-  type(nl_operator_t),        intent(in)    :: op
+  type(nl_operator_t),        intent(inout) :: op
   R_TYPE, contiguous, target, intent(out)   :: fo(:)
   logical, optional,          intent(in)    :: ghost_update
   logical, optional,          intent(in)    :: profile
diff --git a/src/grid/operate.c b/src/grid/operate.c
index 482b8ed3cee6a82435c32b48d15168c9470fc8ea..ce5071e9a6f8b8792b6b62eff12d4b93de59bb9f 100644
--- a/src/grid/operate.c
+++ b/src/grid/operate.c
@@ -64,6 +64,57 @@ void FC_FUNC_(doperate_ri_vec,
   }
 }
 
+void FC_FUNC_(doperate_ri_sym_vec,
+              DOPERATE_RI_SYM_VEC)(const int *opn, const double *restrict wpair, const double *restrict wcenter,
+                               const int *opnri, const int *opri_pos, const int *opri_neg,
+                               const int *rimap_inv, const int *rimap_inv_max,
+                               const double *restrict fi, const int *ldfp,
+                               double *restrict fo) {
+  const size_t ldf = ldfp[0];
+
+  /* check whether we got aligned vectors or not */
+  int aligned = 1;
+  aligned = aligned && (((long long)fi) % (8 * VEC_SIZE) == 0);
+  aligned = aligned && (((long long)fo) % (8 * VEC_SIZE) == 0);
+  aligned = aligned && ((1 << ldf) % VEC_SIZE == 0);
+
+  if (aligned) {
+#define ALIGNED
+#include "operate_sym_inc.c"
+#undef ALIGNED
+
+  } else {
+    /* not aligned */
+  #include "operate_sym_inc.c"
+  }
+}
+
+void FC_FUNC_(doperate_ri_antisym_vec,
+              DOPERATE_RI_ANTISYM_VEC)(const int *opn, const double *restrict wpair,
+                               const int *opnri, const int *opri_pos, const int *opri_neg, 
+                               const int *rimap_inv, const int *rimap_inv_max,
+                               const double *restrict fi, const int *ldfp,
+                               double *restrict fo) {
+  const size_t ldf = ldfp[0];
+
+  /* check whether we got aligned vectors or not */
+  int aligned = 1;
+  aligned = aligned && (((long long)fi) % (8 * VEC_SIZE) == 0);
+  aligned = aligned && (((long long)fo) % (8 * VEC_SIZE) == 0);
+  aligned = aligned && ((1 << ldf) % VEC_SIZE == 0);
+
+  if (aligned) {
+#define ALIGNED
+#include "operate_antisym_inc.c"
+#undef ALIGNED
+
+  } else {
+    /* not aligned */
+#include "operate_antisym_inc.c"
+  }
+}
+
+
 /* the same as doperate_ri_vec, but allows giving each an appropriate Fortan
    interface in which fi and fo are actually complex in Fortran Could be inline,
    but in that case pgcc will not put it in the symbol table. */
@@ -77,6 +128,28 @@ void FC_FUNC_(zoperate_ri_vec,
   (opn, w, opnri, opri, rimap_inv, rimap_inv_max, fi, ldfp, fo);
 }
 
+void FC_FUNC_(zoperate_ri_sym_vec,
+              ZOPERATE_RI_SYM_VEC)(const int *opn, const double *restrict w, 
+                               const double *restrict wcenter,
+                               const int *opnri, const int *opri_pos, const int *opri_neg,
+                               const int *rimap_inv, const int *rimap_inv_max,
+                               const double *restrict fi, const int *ldfp,
+                               double *restrict fo) {
+  FC_FUNC_(doperate_ri_sym_vec, DOPERATE_RI_SYM_VEC)
+  (opn, w, wcenter, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo);
+}
+
+void FC_FUNC_(zoperate_ri_antisym_vec,
+              ZOPERATE_RI_ANTISYM_VEC)(const int *opn, const double *restrict w,
+                               const int *opnri, const int *opri_pos, const int *opri_neg,
+                               const int *rimap_inv, const int *rimap_inv_max,
+                               const double *restrict fi, const int *ldfp,
+                               double *restrict fo) {
+  FC_FUNC_(doperate_ri_antisym_vec, DOPERATE_RI_SYM_VEC)
+  (opn, w, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo);
+}
+
+
 void FC_FUNC_(dgauss_seidel,
               DGAUSS_SEIDEL)(const int *opn, const double *restrict w,
                              const int *opnri, const int *opri,
diff --git a/src/grid/operate_antisym_inc.c b/src/grid/operate_antisym_inc.c
new file mode 100644
index 0000000000000000000000000000000000000000..efa735b6a8eb84edae8f4e6b30da76bbf553950c
--- /dev/null
+++ b/src/grid/operate_antisym_inc.c
@@ -0,0 +1,312 @@
+/*
+ Copyright (C) 2006 X. Andrade
+ Copyright (C) 2025 N. Tancogne-Dejean
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+*/
+
+#ifdef ALIGNED
+#define LOAD VEC_LD
+#define STORE VEC_ST
+#else
+#define LOAD VEC_LDU
+#define STORE VEC_STU
+#endif
+
+{
+  const ptrdiff_t npairs = opn[0];
+  const ptrdiff_t nri = opnri[0];
+#if DEPTH >= 16
+  const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf);
+#endif
+#if DEPTH >= 8
+  const ptrdiff_t unroll8 = max1(8 * VEC_SIZE >> ldf);
+#endif
+#if DEPTH >= 4
+  const ptrdiff_t unroll4 = max1(4 * VEC_SIZE >> ldf);
+#endif
+#if DEPTH >= 2
+  const ptrdiff_t unroll2 = max1(2 * VEC_SIZE >> ldf);
+#endif
+#if DEPTH >= 1
+  const ptrdiff_t unroll1 = max1(1 * VEC_SIZE >> ldf);
+#endif
+
+  ptrdiff_t l, i, j;
+  const int *restrict index_pos;
+  const int *restrict index_neg;
+
+  for (l = 0; l < nri; l++) {
+    index_pos = opri_pos + npairs * l;
+    index_neg = opri_neg + npairs * l;
+    i = rimap_inv[l];
+
+#if DEPTH >= 16
+    for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) {
+      ptrdiff_t k;
+      for (k = 0; k < (1 << ldf); k += 16 * VEC_SIZE) {
+        register VEC_TYPE a0, a1, a2, a3;
+        register VEC_TYPE a4, a5, a6, a7;
+        register VEC_TYPE a8, a9, aa, ab;
+        register VEC_TYPE ac, ad, ae, af;
+
+        a0 = a1 = a2 = a3 = VEC_ZERO;
+        a4 = a5 = a6 = a7 = VEC_ZERO;
+        a8 = a9 = aa = ab = VEC_ZERO;
+        ac = ad = ae = af = VEC_ZERO;
+
+        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k;
+
+        for (j = 0; j < npairs; j++) {
+          register VEC_TYPE wj = VEC_SCAL(wpair[j]);
+          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
+          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
+          register VEC_TYPE pos = LOAD(fi + indexj_pos);
+          register VEC_TYPE neg = LOAD(fi + indexj_neg);
+          a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0);
+          pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);
+          a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1);
+          pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE);
+          a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2);
+          pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE);
+          a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3);
+          pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE);
+          a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4);
+          pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE);
+          a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5);
+          pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE);
+          a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6);
+          pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE);
+          a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7);
+          pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE);
+          a8 = VEC_FMA(wj, VEC_SUB(pos, neg), a8);
+          pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE);
+          a9 = VEC_FMA(wj, VEC_SUB(pos, neg), a9);
+          pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE);
+          aa = VEC_FMA(wj, VEC_SUB(pos, neg), aa);
+          pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE);
+          ab = VEC_FMA(wj, VEC_SUB(pos, neg), ab);
+          pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE);
+          ac = VEC_FMA(wj, VEC_SUB(pos, neg), ac);
+          pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE);
+          ad = VEC_FMA(wj, VEC_SUB(pos, neg), ad);
+          pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE);
+          ae = VEC_FMA(wj, VEC_SUB(pos, neg), ae);
+          pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE);
+          af = VEC_FMA(wj, VEC_SUB(pos, neg), af);
+
+        }
+        STORE(fo + base_ik, a0);
+        STORE(fo + base_ik + 1 * VEC_SIZE, a1);
+        STORE(fo + base_ik + 2 * VEC_SIZE, a2);
+        STORE(fo + base_ik + 3 * VEC_SIZE, a3);
+        STORE(fo + base_ik + 4 * VEC_SIZE, a4);
+        STORE(fo + base_ik + 5 * VEC_SIZE, a5);
+        STORE(fo + base_ik + 6 * VEC_SIZE, a6);
+        STORE(fo + base_ik + 7 * VEC_SIZE, a7);
+        STORE(fo + base_ik + 8 * VEC_SIZE, a8);
+        STORE(fo + base_ik + 9 * VEC_SIZE, a9);
+        STORE(fo + base_ik + 10 * VEC_SIZE, aa);
+        STORE(fo + base_ik + 11 * VEC_SIZE, ab);
+        STORE(fo + base_ik + 12 * VEC_SIZE, ac);
+        STORE(fo + base_ik + 13 * VEC_SIZE, ad);
+        STORE(fo + base_ik + 14 * VEC_SIZE, ae);
+        STORE(fo + base_ik + 15 * VEC_SIZE, af);
+      }
+    }
+#endif
+
+#if DEPTH >= 8
+    for (; i < (rimap_inv_max[l] - unroll8 + 1); i += unroll8) {
+      ptrdiff_t k;
+      for (k = 0; k < (1 << ldf); k += 8 * VEC_SIZE) {
+        register VEC_TYPE a0, a1, a2, a3;
+        register VEC_TYPE a4, a5, a6, a7;
+
+        a0 = a1 = a2 = a3 = VEC_ZERO;
+        a4 = a5 = a6 = a7 = VEC_ZERO;
+
+        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k;
+
+        for (j = 0; j < npairs; j++) {
+          register VEC_TYPE wj = VEC_SCAL(wpair[j]);
+          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
+          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
+          register VEC_TYPE pos = LOAD(fi + indexj_pos);
+          register VEC_TYPE neg = LOAD(fi + indexj_neg);
+          a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0);
+          pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);
+          a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1);
+          pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE);
+          a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2);
+          pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE);
+          a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3);
+          pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE);
+          a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4);
+          pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE);
+          a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5);
+          pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE);
+          a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6);
+          pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE);
+          a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7);
+        }
+        STORE(fo + base_ik, a0);
+        STORE(fo + base_ik + 1 * VEC_SIZE, a1);
+        STORE(fo + base_ik + 2 * VEC_SIZE, a2);
+        STORE(fo + base_ik + 3 * VEC_SIZE, a3);
+        STORE(fo + base_ik + 4 * VEC_SIZE, a4);
+        STORE(fo + base_ik + 5 * VEC_SIZE, a5);
+        STORE(fo + base_ik + 6 * VEC_SIZE, a6);
+        STORE(fo + base_ik + 7 * VEC_SIZE, a7);
+      }
+    }
+#endif
+
+#if DEPTH >= 4
+    for (; i < (rimap_inv_max[l] - unroll4 + 1); i += unroll4) {
+      ptrdiff_t k;
+      for (k = 0; k < (1 << ldf); k += 4 * VEC_SIZE) {
+        register VEC_TYPE a0, a1, a2, a3;
+
+        a0 = a1 = a2 = a3 = VEC_ZERO;
+
+        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k;
+
+        for (j = 0; j < npairs; j++) {
+          register VEC_TYPE wj = VEC_SCAL(wpair[j]);
+          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
+          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
+          register VEC_TYPE pos = LOAD(fi + indexj_pos);
+          register VEC_TYPE neg = LOAD(fi + indexj_neg);
+          a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0);
+          pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);
+          a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1);
+          pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE);
+          a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2);
+          pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE);
+          a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3);
+        }
+        STORE(fo + base_ik, a0);
+        STORE(fo + base_ik + 1 * VEC_SIZE, a1);
+        STORE(fo + base_ik + 2 * VEC_SIZE, a2);
+        STORE(fo + base_ik + 3 * VEC_SIZE, a3);
+      }
+    }
+#endif
+
+#if DEPTH >= 2
+    for (; i < (rimap_inv_max[l] - unroll2 + 1); i += unroll2) {
+      ptrdiff_t k;
+      for (k = 0; k < (1 << ldf); k += 2 * VEC_SIZE) {
+        register VEC_TYPE a0, a1;
+
+        a0 = a1 = VEC_ZERO;
+
+        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k;
+
+        for (j = 0; j < npairs; j++) {
+          register VEC_TYPE wj = VEC_SCAL(wpair[j]);
+          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
+          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
+          register VEC_TYPE pos = LOAD(fi + indexj_pos);
+          register VEC_TYPE neg = LOAD(fi + indexj_neg);
+          a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0);
+          pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);
+          a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1);
+        }
+        STORE(fo + base_ik, a0);
+        STORE(fo + base_ik + 1 * VEC_SIZE, a1);
+      }
+    }
+#endif
+
+#if DEPTH >= 1
+    for (; i < (rimap_inv_max[l] - unroll1 + 1); i += unroll1) {
+      ptrdiff_t k;
+      for (k = 0; k < (1 << ldf); k += VEC_SIZE) {
+        register VEC_TYPE a0 = VEC_ZERO;
+        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k; 
+        for (j = 0; j < npairs; j++) {
+          register VEC_TYPE wj = VEC_SCAL(wpair[j]);
+          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
+          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
+          register VEC_TYPE pos = LOAD(fi + indexj_pos);
+          register VEC_TYPE neg = LOAD(fi + indexj_neg);
+          a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0);
+        }
+        STORE(fo + base_ik, a0);
+      }
+    }
+#endif
+
+#if VEC_SIZE > 1
+
+    const ptrdiff_t size =  (ptrdiff_t) 1 << ldf;
+    double a;
+
+    for (; i < rimap_inv_max[l]; i++) {
+      for (ptrdiff_t k = 0; k < size; k++) {
+        a = 0.0;
+        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k; 
+        for (j = 0; j < npairs; j++) {
+          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
+          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
+          const double diff = fi[indexj_pos] - fi[indexj_neg];  
+          a += wpair[j] * diff;
+        }
+        fo[base_ik] = a;
+      }
+    }
+#endif
+
+  } /* l */
+
+  // this fence instruction is needed to ensure correctness when using
+  // non-temporal stores
+#if defined(ALIGNED) && defined(FENCE)
+  FENCE;
+#endif
+}
+
+#undef LOAD
+#undef STORE
diff --git a/src/grid/operate_f.F90 b/src/grid/operate_f.F90
index f17b4b1a7d0d080f0148c9c2c0a4ba1fa777581b..51c467b6b21a3a8640dc1ba07674e57eb4e78430 100644
--- a/src/grid/operate_f.F90
+++ b/src/grid/operate_f.F90
@@ -61,6 +61,77 @@ module operate_f_oct_m
     end subroutine zoperate_ri_vec
   end interface
 
+  interface
+    subroutine doperate_ri_sym_vec(opn, w, wcenter, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo)
+      import :: real64
+      implicit none
+      integer,        intent(in)    :: opn
+      real(real64),   intent(in)    :: w
+      real(real64),   intent(in)    :: wcenter
+      integer,        intent(in)    :: opnri
+      integer,        intent(in)    :: opri_pos
+      integer,        intent(in)    :: opri_neg
+      integer,        intent(in)    :: rimap_inv
+      integer,        intent(in)    :: rimap_inv_max
+      real(real64),   intent(in)    :: fi
+      integer,        intent(in)    :: ldfp
+      real(real64),   intent(inout) :: fo
+    end subroutine doperate_ri_sym_vec
+  end interface
+
+  interface
+    subroutine zoperate_ri_sym_vec(opn, w, wcenter, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo)
+      import :: real64
+      implicit none
+      integer,         intent(in)    :: opn
+      real(real64),    intent(in)    :: w
+      real(real64),    intent(in)    :: wcenter
+      integer,         intent(in)    :: opnri
+      integer,         intent(in)    :: opri_pos
+      integer,         intent(in)    :: opri_neg
+      integer,         intent(in)    :: rimap_inv
+      integer,         intent(in)    :: rimap_inv_max
+      complex(real64), intent(in)    :: fi
+      integer,         intent(in)    :: ldfp
+      complex(real64), intent(inout) :: fo
+    end subroutine zoperate_ri_sym_vec
+  end interface
+
+  interface
+    subroutine doperate_ri_antisym_vec(opn, w, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo)
+      import :: real64
+      implicit none
+      integer,        intent(in)    :: opn
+      real(real64),   intent(in)    :: w
+      integer,        intent(in)    :: opnri
+      integer,        intent(in)    :: opri_pos
+      integer,        intent(in)    :: opri_neg
+      integer,        intent(in)    :: rimap_inv
+      integer,        intent(in)    :: rimap_inv_max
+      real(real64),   intent(in)    :: fi
+      integer,        intent(in)    :: ldfp
+      real(real64),   intent(inout) :: fo
+    end subroutine doperate_ri_antisym_vec
+  end interface
+
+  interface
+    subroutine zoperate_ri_antisym_vec(opn, w, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo)
+      import :: real64
+      implicit none
+      integer,         intent(in)    :: opn
+      real(real64),    intent(in)    :: w
+      integer,         intent(in)    :: opnri
+      integer,         intent(in)    :: opri_pos
+      integer,         intent(in)    :: opri_neg
+      integer,         intent(in)    :: rimap_inv
+      integer,         intent(in)    :: rimap_inv_max
+      complex(real64), intent(in)    :: fi
+      integer,         intent(in)    :: ldfp
+      complex(real64), intent(inout) :: fo
+    end subroutine zoperate_ri_antisym_vec
+  end interface
+
+
   interface
     subroutine dgauss_seidel(opn, w, opnri, opri, rimap_inv, rimap_inv_max, factor, pot, rho)
       import :: real64
diff --git a/src/grid/operate_inc.c b/src/grid/operate_inc.c
index d9014ac3670e2ea69aad7c3938ed6bcf9d372404..2eeb3d0ebf8939f06c2ae1219faef470364012e5 100644
--- a/src/grid/operate_inc.c
+++ b/src/grid/operate_inc.c
@@ -29,9 +29,6 @@
 {
   const ptrdiff_t n = opn[0];
   const ptrdiff_t nri = opnri[0];
-#if DEPTH >= 32
-  const ptrdiff_t unroll32 = max1(32 * VEC_SIZE >> ldf);
-#endif
 #if DEPTH >= 16
   const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf);
 #endif
@@ -55,104 +52,6 @@
     index = opri + n * l;
     i = rimap_inv[l];
 
-#if DEPTH >= 32
-    for (; i < (rimap_inv_max[l] - unroll32 + 1); i += unroll32) {
-      ptrdiff_t k;
-      for (k = 0; k < (1 << ldf); k += 32 * VEC_SIZE) {
-        register VEC_TYPE a0, a1, a2, a3;
-        register VEC_TYPE a4, a5, a6, a7;
-        register VEC_TYPE a8, a9, aa, ab;
-        register VEC_TYPE ac, ad, ae, af;
-        register VEC_TYPE b0, b1, b2, b3;
-        register VEC_TYPE b4, b5, b6, b7;
-        register VEC_TYPE b8, b9, ba, bb;
-        register VEC_TYPE bc, bd, be, bf;
-
-        a0 = a1 = a2 = a3 = VEC_ZERO;
-        a4 = a5 = a6 = a7 = VEC_ZERO;
-        a8 = a9 = aa = ab = VEC_ZERO;
-        ac = ad = ae = af = VEC_ZERO;
-        b0 = b1 = b2 = b3 = VEC_ZERO;
-        b4 = b5 = b6 = b7 = VEC_ZERO;
-        b8 = b9 = ba = bb = VEC_ZERO;
-        bc = bd = be = bf = VEC_ZERO;
-
-        for (j = 0; j < n; j++) {
-#ifdef VEC_SCAL_LD
-          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
-#else
-          register VEC_TYPE wj = VEC_SCAL(w[j]);
-#endif
-          ptrdiff_t indexj = (index[j] + i) << ldf;
-          a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
-          a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
-          a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2);
-          a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3);
-          a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4);
-          a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5);
-          a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6);
-          a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7);
-          a8 = VEC_FMA(wj, LOAD(fi + indexj + 8 * VEC_SIZE + k), a8);
-          a9 = VEC_FMA(wj, LOAD(fi + indexj + 9 * VEC_SIZE + k), a9);
-          aa = VEC_FMA(wj, LOAD(fi + indexj + 10 * VEC_SIZE + k), aa);
-          ab = VEC_FMA(wj, LOAD(fi + indexj + 11 * VEC_SIZE + k), ab);
-          ac = VEC_FMA(wj, LOAD(fi + indexj + 12 * VEC_SIZE + k), ac);
-          ad = VEC_FMA(wj, LOAD(fi + indexj + 13 * VEC_SIZE + k), ad);
-          ae = VEC_FMA(wj, LOAD(fi + indexj + 14 * VEC_SIZE + k), ae);
-          af = VEC_FMA(wj, LOAD(fi + indexj + 15 * VEC_SIZE + k), af);
-          b0 = VEC_FMA(wj, LOAD(fi + indexj + 16 * VEC_SIZE + k), b0);
-          b1 = VEC_FMA(wj, LOAD(fi + indexj + 17 * VEC_SIZE + k), b1);
-          b2 = VEC_FMA(wj, LOAD(fi + indexj + 18 * VEC_SIZE + k), b2);
-          b3 = VEC_FMA(wj, LOAD(fi + indexj + 19 * VEC_SIZE + k), b3);
-          b4 = VEC_FMA(wj, LOAD(fi + indexj + 20 * VEC_SIZE + k), b4);
-          b5 = VEC_FMA(wj, LOAD(fi + indexj + 21 * VEC_SIZE + k), b5);
-          b6 = VEC_FMA(wj, LOAD(fi + indexj + 22 * VEC_SIZE + k), b6);
-          b7 = VEC_FMA(wj, LOAD(fi + indexj + 23 * VEC_SIZE + k), b7);
-          b8 = VEC_FMA(wj, LOAD(fi + indexj + 24 * VEC_SIZE + k), b8);
-          b9 = VEC_FMA(wj, LOAD(fi + indexj + 25 * VEC_SIZE + k), b9);
-          ba = VEC_FMA(wj, LOAD(fi + indexj + 26 * VEC_SIZE + k), ba);
-          bb = VEC_FMA(wj, LOAD(fi + indexj + 27 * VEC_SIZE + k), bb);
-          bc = VEC_FMA(wj, LOAD(fi + indexj + 28 * VEC_SIZE + k), bc);
-          bd = VEC_FMA(wj, LOAD(fi + indexj + 29 * VEC_SIZE + k), bd);
-          be = VEC_FMA(wj, LOAD(fi + indexj + 30 * VEC_SIZE + k), be);
-          bf = VEC_FMA(wj, LOAD(fi + indexj + 31 * VEC_SIZE + k), bf);
-        }
-        STORE(fo + (i << ldf) + k, a0);
-        STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
-        STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
-        STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
-        STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4);
-        STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5);
-        STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6);
-        STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7);
-        STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8);
-        STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9);
-        STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa);
-        STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab);
-        STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac);
-        STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad);
-        STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae);
-        STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af);
-        STORE(fo + (i << ldf) + 16 * VEC_SIZE + k, b0);
-        STORE(fo + (i << ldf) + 17 * VEC_SIZE + k, b1);
-        STORE(fo + (i << ldf) + 18 * VEC_SIZE + k, b2);
-        STORE(fo + (i << ldf) + 19 * VEC_SIZE + k, b3);
-        STORE(fo + (i << ldf) + 20 * VEC_SIZE + k, b4);
-        STORE(fo + (i << ldf) + 21 * VEC_SIZE + k, b5);
-        STORE(fo + (i << ldf) + 22 * VEC_SIZE + k, b6);
-        STORE(fo + (i << ldf) + 23 * VEC_SIZE + k, b7);
-        STORE(fo + (i << ldf) + 24 * VEC_SIZE + k, b8);
-        STORE(fo + (i << ldf) + 25 * VEC_SIZE + k, b9);
-        STORE(fo + (i << ldf) + 26 * VEC_SIZE + k, ba);
-        STORE(fo + (i << ldf) + 27 * VEC_SIZE + k, bb);
-        STORE(fo + (i << ldf) + 28 * VEC_SIZE + k, bc);
-        STORE(fo + (i << ldf) + 29 * VEC_SIZE + k, bd);
-        STORE(fo + (i << ldf) + 30 * VEC_SIZE + k, be);
-        STORE(fo + (i << ldf) + 31 * VEC_SIZE + k, bf);
-      }
-    }
-#endif
-
 #if DEPTH >= 16
     for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) {
       ptrdiff_t k;
@@ -168,11 +67,7 @@
         ac = ad = ae = af = VEC_ZERO;
 
         for (j = 0; j < n; j++) {
-#ifdef VEC_SCAL_LD
-          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
-#else
           register VEC_TYPE wj = VEC_SCAL(w[j]);
-#endif
           ptrdiff_t indexj = (index[j] + i) << ldf;
           a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
           a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
@@ -222,11 +117,7 @@
         a4 = a5 = a6 = a7 = VEC_ZERO;
 
         for (j = 0; j < n; j++) {
-#ifdef VEC_SCAL_LD
-          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
-#else
           register VEC_TYPE wj = VEC_SCAL(w[j]);
-#endif
           ptrdiff_t indexj = (index[j] + i) << ldf;
           a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
           a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
@@ -258,11 +149,7 @@
         a0 = a1 = a2 = a3 = VEC_ZERO;
 
         for (j = 0; j < n; j++) {
-#ifdef VEC_SCAL_LD
-          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
-#else
           register VEC_TYPE wj = VEC_SCAL(w[j]);
-#endif
           ptrdiff_t indexj = (index[j] + i) << ldf;
           a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
           a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
@@ -286,11 +173,7 @@
         a0 = a1 = VEC_ZERO;
 
         for (j = 0; j < n; j++) {
-#ifdef VEC_SCAL_LD
-          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
-#else
           register VEC_TYPE wj = VEC_SCAL(w[j]);
-#endif
           ptrdiff_t indexj = (index[j] + i) << ldf;
           a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
           a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1);
@@ -307,11 +190,7 @@
       for (k = 0; k < (1 << ldf); k += VEC_SIZE) {
         register VEC_TYPE a0 = VEC_ZERO;
         for (j = 0; j < n; j++) {
-#ifdef VEC_SCAL_LD
-          register VEC_TYPE wj = VEC_SCAL_LD(w + j);
-#else
           register VEC_TYPE wj = VEC_SCAL(w[j]);
-#endif
           ptrdiff_t indexj = (index[j] + i) << ldf;
           a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0);
         }
diff --git a/src/grid/operate_sym_inc.c b/src/grid/operate_sym_inc.c
new file mode 100644
index 0000000000000000000000000000000000000000..fc5ba4ac88661571229f392cee808014857f9103
--- /dev/null
+++ b/src/grid/operate_sym_inc.c
@@ -0,0 +1,352 @@
+/*
+ Copyright (C) 2006 X. Andrade
+ Copyright (C) 2025 N. Tancogne-Dejean
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+*/
+
+#ifdef ALIGNED
+#define LOAD VEC_LD
+#define STORE VEC_ST
+#else
+#define LOAD VEC_LDU
+#define STORE VEC_STU
+#endif
+
+{
+  const ptrdiff_t npairs = opn[0];
+  const ptrdiff_t nri = opnri[0];
+#if DEPTH >= 16
+  const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf);
+#endif
+#if DEPTH >= 8
+  const ptrdiff_t unroll8 = max1(8 * VEC_SIZE >> ldf);
+#endif
+#if DEPTH >= 4
+  const ptrdiff_t unroll4 = max1(4 * VEC_SIZE >> ldf);
+#endif
+#if DEPTH >= 2
+  const ptrdiff_t unroll2 = max1(2 * VEC_SIZE >> ldf);
+#endif
+#if DEPTH >= 1
+  const ptrdiff_t unroll1 = max1(1 * VEC_SIZE >> ldf);
+#endif
+
+  ptrdiff_t l, i, j;
+  const int *restrict index_pos;
+  const int *restrict index_neg;
+
+  // Hoisting wcenter
+  const double w0 = wcenter[0];
+
+  for (l = 0; l < nri; l++) {
+    index_pos = opri_pos + npairs * l;
+    index_neg = opri_neg + npairs * l;
+    i = rimap_inv[l];
+
+#if DEPTH >= 16
+    for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) {
+      ptrdiff_t k;
+      for (k = 0; k < (1 << ldf); k += 16 * VEC_SIZE) {
+        register VEC_TYPE a0, a1, a2, a3;
+        register VEC_TYPE a4, a5, a6, a7;
+        register VEC_TYPE a8, a9, aa, ab;
+        register VEC_TYPE ac, ad, ae, af;
+
+        a0 = a1 = a2 = a3 = VEC_ZERO;
+        a4 = a5 = a6 = a7 = VEC_ZERO;
+        a8 = a9 = aa = ab = VEC_ZERO;
+        ac = ad = ae = af = VEC_ZERO;
+
+        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k;
+
+        for (j = 0; j < npairs; j++) {
+          register VEC_TYPE wj = VEC_SCAL(wpair[j]);
+          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
+          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
+          register VEC_TYPE pos = LOAD(fi + indexj_pos);
+          register VEC_TYPE neg = LOAD(fi + indexj_neg);
+          a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0);
+          pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);
+          a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1);
+          pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE);
+          a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2);
+          pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE);
+          a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3);
+          pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE);
+          a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4);
+          pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE);
+          a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5);
+          pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE);
+          a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6);
+          pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE);
+          a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7);
+          pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE);
+          a8 = VEC_FMA(wj, VEC_ADD(pos, neg), a8);
+          pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE);
+          a9 = VEC_FMA(wj, VEC_ADD(pos, neg), a9);
+          pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE);
+          aa = VEC_FMA(wj, VEC_ADD(pos, neg), aa);
+          pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE);
+          ab = VEC_FMA(wj, VEC_ADD(pos, neg), ab);
+          pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE);
+          ac = VEC_FMA(wj, VEC_ADD(pos, neg), ac);
+          pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE);
+          ad = VEC_FMA(wj, VEC_ADD(pos, neg), ad);
+          pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE);
+          ae = VEC_FMA(wj, VEC_ADD(pos, neg), ae);
+          pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE);
+          af = VEC_FMA(wj, VEC_ADD(pos, neg), af);
+
+        }
+        const VEC_TYPE wj = VEC_SCAL(w0);
+        a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0);
+        a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1);
+        a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2);
+        a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3);
+        a4 = VEC_FMA(wj, LOAD(fi + base_ik + 4 * VEC_SIZE), a4);
+        a5 = VEC_FMA(wj, LOAD(fi + base_ik + 5 * VEC_SIZE), a5);
+        a6 = VEC_FMA(wj, LOAD(fi + base_ik + 6 * VEC_SIZE), a6);
+        a7 = VEC_FMA(wj, LOAD(fi + base_ik + 7 * VEC_SIZE), a7);
+        a8 = VEC_FMA(wj, LOAD(fi + base_ik + 8 * VEC_SIZE), a8);
+        a9 = VEC_FMA(wj, LOAD(fi + base_ik + 9 * VEC_SIZE), a9);
+        aa = VEC_FMA(wj, LOAD(fi + base_ik + 10 * VEC_SIZE), aa);
+        ab = VEC_FMA(wj, LOAD(fi + base_ik + 11 * VEC_SIZE), ab);
+        ac = VEC_FMA(wj, LOAD(fi + base_ik + 12 * VEC_SIZE), ac);
+        ad = VEC_FMA(wj, LOAD(fi + base_ik + 13 * VEC_SIZE), ad);
+        ae = VEC_FMA(wj, LOAD(fi + base_ik + 14 * VEC_SIZE), ae);
+        af = VEC_FMA(wj, LOAD(fi + base_ik + 15 * VEC_SIZE), af);
+
+        STORE(fo + base_ik, a0);
+        STORE(fo + base_ik + 1 * VEC_SIZE, a1);
+        STORE(fo + base_ik + 2 * VEC_SIZE, a2);
+        STORE(fo + base_ik + 3 * VEC_SIZE, a3);
+        STORE(fo + base_ik + 4 * VEC_SIZE, a4);
+        STORE(fo + base_ik + 5 * VEC_SIZE, a5);
+        STORE(fo + base_ik + 6 * VEC_SIZE, a6);
+        STORE(fo + base_ik + 7 * VEC_SIZE, a7);
+        STORE(fo + base_ik + 8 * VEC_SIZE, a8);
+        STORE(fo + base_ik + 9 * VEC_SIZE, a9);
+        STORE(fo + base_ik + 10 * VEC_SIZE, aa);
+        STORE(fo + base_ik + 11 * VEC_SIZE, ab);
+        STORE(fo + base_ik + 12 * VEC_SIZE, ac);
+        STORE(fo + base_ik + 13 * VEC_SIZE, ad);
+        STORE(fo + base_ik + 14 * VEC_SIZE, ae);
+        STORE(fo + base_ik + 15 * VEC_SIZE, af);
+      }
+    }
+#endif
+
+#if DEPTH >= 8
+    for (; i < (rimap_inv_max[l] - unroll8 + 1); i += unroll8) {
+      ptrdiff_t k;
+      for (k = 0; k < (1 << ldf); k += 8 * VEC_SIZE) {
+        register VEC_TYPE a0, a1, a2, a3;
+        register VEC_TYPE a4, a5, a6, a7;
+
+        a0 = a1 = a2 = a3 = VEC_ZERO;
+        a4 = a5 = a6 = a7 = VEC_ZERO;
+
+        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k;
+
+        for (j = 0; j < npairs; j++) {
+          register VEC_TYPE wj = VEC_SCAL(wpair[j]);
+          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
+          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
+          register VEC_TYPE pos = LOAD(fi + indexj_pos);
+          register VEC_TYPE neg = LOAD(fi + indexj_neg);
+          a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0);
+          pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);
+          a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1);
+          pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE);
+          a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2);
+          pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE);
+          a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3);
+          pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE);
+          a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4);
+          pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE);
+          a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5);
+          pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE);
+          a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6);
+          pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE);
+          a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7);
+        }
+        const VEC_TYPE wj = VEC_SCAL(w0);
+        a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0);
+        a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1);
+        a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2);
+        a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3);
+        a4 = VEC_FMA(wj, LOAD(fi + base_ik + 4 * VEC_SIZE), a4);
+        a5 = VEC_FMA(wj, LOAD(fi + base_ik + 5 * VEC_SIZE), a5);
+        a6 = VEC_FMA(wj, LOAD(fi + base_ik + 6 * VEC_SIZE), a6);
+        a7 = VEC_FMA(wj, LOAD(fi + base_ik + 7 * VEC_SIZE), a7);
+        STORE(fo + base_ik, a0);
+        STORE(fo + base_ik + 1 * VEC_SIZE, a1);
+        STORE(fo + base_ik + 2 * VEC_SIZE, a2);
+        STORE(fo + base_ik + 3 * VEC_SIZE, a3);
+        STORE(fo + base_ik + 4 * VEC_SIZE, a4);
+        STORE(fo + base_ik + 5 * VEC_SIZE, a5);
+        STORE(fo + base_ik + 6 * VEC_SIZE, a6);
+        STORE(fo + base_ik + 7 * VEC_SIZE, a7);
+      }
+    }
+#endif
+
+#if DEPTH >= 4
+    for (; i < (rimap_inv_max[l] - unroll4 + 1); i += unroll4) {
+      ptrdiff_t k;
+      for (k = 0; k < (1 << ldf); k += 4 * VEC_SIZE) {
+        register VEC_TYPE a0, a1, a2, a3;
+
+        a0 = a1 = a2 = a3 = VEC_ZERO;
+
+        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k;
+
+        for (j = 0; j < npairs; j++) {
+          register VEC_TYPE wj = VEC_SCAL(wpair[j]);
+          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
+          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
+          register VEC_TYPE pos = LOAD(fi + indexj_pos);
+          register VEC_TYPE neg = LOAD(fi + indexj_neg);
+          a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0);
+          pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);
+          a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1);
+          pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE);
+          a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2);
+          pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE);
+          a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3);
+        }
+        const VEC_TYPE wj = VEC_SCAL(w0);
+        a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0);
+        a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1);
+        a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2);
+        a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3);
+        STORE(fo + base_ik, a0);
+        STORE(fo + base_ik + 1 * VEC_SIZE, a1);
+        STORE(fo + base_ik + 2 * VEC_SIZE, a2);
+        STORE(fo + base_ik + 3 * VEC_SIZE, a3);
+      }
+    }
+#endif
+
+#if DEPTH >= 2
+    for (; i < (rimap_inv_max[l] - unroll2 + 1); i += unroll2) {
+      ptrdiff_t k;
+      for (k = 0; k < (1 << ldf); k += 2 * VEC_SIZE) {
+        register VEC_TYPE a0, a1;
+
+        a0 = a1 = VEC_ZERO;
+
+        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k;
+
+        for (j = 0; j < npairs; j++) {
+          register VEC_TYPE wj = VEC_SCAL(wpair[j]);
+          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
+          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
+          register VEC_TYPE pos = LOAD(fi + indexj_pos);
+          register VEC_TYPE neg = LOAD(fi + indexj_neg);
+          a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0);
+          pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE);
+          neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE);
+          a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1);
+        }
+        const VEC_TYPE wj = VEC_SCAL(w0);
+        a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0);
+        a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1);
+        STORE(fo + base_ik, a0);
+        STORE(fo + base_ik + 1 * VEC_SIZE, a1);
+      }
+    }
+#endif
+
+#if DEPTH >= 1
+    for (; i < (rimap_inv_max[l] - unroll1 + 1); i += unroll1) {
+      ptrdiff_t k;
+      for (k = 0; k < (1 << ldf); k += VEC_SIZE) {
+        register VEC_TYPE a0 = VEC_ZERO;
+        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k; 
+        for (j = 0; j < npairs; j++) {
+          register VEC_TYPE wj = VEC_SCAL(wpair[j]);
+          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
+          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
+          register VEC_TYPE pos = LOAD(fi + indexj_pos);
+          register VEC_TYPE neg = LOAD(fi + indexj_neg);
+          a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0);
+        }
+        const VEC_TYPE wj = VEC_SCAL(w0);
+        a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0);
+        STORE(fo + base_ik, a0);
+      }
+    }
+#endif
+
+#if VEC_SIZE > 1
+
+    const ptrdiff_t size =  (ptrdiff_t) 1 << ldf;
+    double a;
+
+    for (; i < rimap_inv_max[l]; i++) {
+      for (ptrdiff_t k = 0; k < size; k++) {
+        a = 0.0;
+        const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) +  k; 
+        for (j = 0; j < npairs; j++) {
+          const ptrdiff_t indexj_pos = index_pos[j] + base_ik;
+          const ptrdiff_t indexj_neg = index_neg[j] + base_ik;
+          const double sum = fi[indexj_pos] + fi[indexj_neg];  
+          a += wpair[j] * sum;
+        }
+        fo[base_ik] = a + w0 * fi[base_ik];
+      }
+    }
+#endif
+
+  } /* l */
+
+  // this fence instruction is needed to ensure correctness when using
+  // non-temporal stores
+#if defined(ALIGNED) && defined(FENCE)
+  FENCE;
+#endif
+}
+
+#undef LOAD
+#undef STORE
diff --git a/src/include/vectors.h b/src/include/vectors.h
index 4b85e9a7c68a2e3ce01df1ecdc4a19921cc6e7fe..8f735020a625f16e032e3ef5920e4de284ebfda2 100644
--- a/src/include/vectors.h
+++ b/src/include/vectors.h
@@ -41,6 +41,8 @@
 #define VEC_STU(addr, vec) _mm512_storeu_pd(addr, vec)
 #define VEC_FMA(aa, bb, cc) _mm512_fmadd_pd(aa, bb, cc)
 #define VEC_SCAL(aa) _mm512_set1_pd(aa)
+#define VEC_ADD(aa, bb) _mm512_add_pd(aa, bb)
+#define VEC_SUB(aa, bb) _mm512_sub_pd(aa, bb)
 #define VEC_ZERO _mm512_setzero_pd()
 #include <emmintrin.h>
 #define FENCE _mm_mfence()
@@ -73,6 +75,8 @@
 #define VEC_FMA(aa, bb, cc) _mm256_add_pd(cc, _mm256_mul_pd(aa, bb))
 #endif
 #define VEC_SCAL(aa) _mm256_set1_pd(aa)
+#define VEC_ADD(aa, bb) _mm256_add_pd(aa, bb)
+#define VEC_SUB(aa, bb) _mm256_sub_pd(aa, bb)
 #define VEC_ZERO _mm256_setzero_pd()
 #include <emmintrin.h>
 #define FENCE _mm_mfence()
@@ -101,6 +105,8 @@
 #define VEC_FMA(aa, bb, cc) _mm_add_pd(cc, _mm_mul_pd(aa, bb))
 #endif
 #define VEC_SCAL(aa) _mm_set1_pd(aa)
+#define VEC_ADD(aa, bb) _mm_add_pd(aa, bb)
+#define VEC_SUB(aa, bb) _mm_sub_pd(aa, bb)
 #define VEC_ZERO _mm_setzero_pd()
 #define FENCE _mm_mfence()
 
@@ -118,8 +124,10 @@
 #define VEC_LDU(addr) VEC_LD(addr)
 #define VEC_ST(addr, vec) (addr)[0] = vec
 #define VEC_STU(addr, vec) VEC_ST(addr, vec)
-#define VEC_FMA(aa, bb, cc) aa *bb + cc
+#define VEC_FMA(aa, bb, cc) ((aa) * (bb) + (cc))
 #define VEC_SCAL(aa) aa
+#define VEC_ADD(aa, bb) ((aa)+(bb))
+#define VEC_SUB(aa, bb) ((aa)-(bb))
 #define VEC_ZERO 0.0
 
 #define DEPTH 8
diff --git a/src/main/test.F90 b/src/main/test.F90
index d33d56c0c01c562f8de182e51eeee320e136cc66..dac77f904b58cab911d826d27072b83084f898a2 100644
--- a/src/main/test.F90
+++ b/src/main/test.F90
@@ -34,6 +34,7 @@ module test_oct_m
   use debug_oct_m
   use density_oct_m
   use derivatives_oct_m
+  use derivatives_test_oct_m
   use eigen_chebyshev_oct_m
   use electron_space_oct_m
   use electrons_oct_m
@@ -1303,6 +1304,10 @@ contains
       call zderivatives_test(sys%gr%der, sys%namespace, param%repetitions, param%min_blocksize, param%max_blocksize)
     end if
 
+    if (sys%space%dim > 1) then
+      call derivatives_advanced_benchmark(sys%gr%der, sys%namespace)
+    end if
+
     SAFE_DEALLOCATE_P(sys)
 
     POP_SUB(test_derivatives)