From 54e67cb920535bcde36d25fbafb1334dcb21735e Mon Sep 17 00:00:00 2001
From: NicolasTD
Date: Fri, 12 Dec 2025 15:51:33 +0100
Subject: [PATCH 01/11] Improved numerical stability strategy for the finite
 differences

The logic of this change is to alter the order in which points are summed
over. Instead of summing over points in a random order, the summation is done
by pairs of points, with a sorting of the weights by their absolute values.
Some challenging numerical unit tests are added to reveal the impact of the
changes. In all the tests, the new method seems to reduce the numerical error.
The new method is only implemented in the Fortran operator at the moment.
---
 src/grid/CMakeLists.txt           |   1 +
 src/grid/derivatives.F90          |  15 +-
 src/grid/derivatives_inc.F90      | 295 ----------------------------
 src/grid/derivatives_test.F90     | 219 +++++++++++++++++++++
 src/grid/derivatives_test_inc.F90 | 312 ++++++++++++++++++++++++++++++
 src/grid/nl_operator.F90          | 168 +++++++++++++++-
 src/grid/nl_operator_inc.F90      | 127 +++++++++++-
 src/main/test.F90                 |   3 +
 src/utils/dielectric_function.F90 |  22 ++-
 9 files changed, 849 insertions(+), 313 deletions(-)
 create mode 100644 src/grid/derivatives_test.F90
 create mode 100644 src/grid/derivatives_test_inc.F90

diff --git a/src/grid/CMakeLists.txt b/src/grid/CMakeLists.txt
index 2b277d8025..e2557c2395 100644
--- a/src/grid/CMakeLists.txt
+++ b/src/grid/CMakeLists.txt
@@ -16,6 +16,7 @@ target_sources(Octopus_lib PRIVATE
   curv_gygi.F90
   curv_modine.F90
   derivatives.F90
+  derivatives_test.F90
   fourier_shell.F90
   fourier_space.F90
   grid.F90
diff --git a/src/grid/derivatives.F90 b/src/grid/derivatives.F90
index 6a8c3dcf9f..3392f2425b 100644
--- a/src/grid/derivatives.F90
+++ b/src/grid/derivatives.F90
@@ -42,10 +42,8 @@ module derivatives_oct_m
   use, intrinsic :: iso_fortran_env
   use lalg_basic_oct_m
   use lalg_adv_oct_m
-  use loct_oct_m
   use math_oct_m
   use mesh_oct_m
-  use mesh_function_oct_m
   use messages_oct_m
   use namespace_oct_m
   use nl_operator_oct_m
@@ -64,13 +62,6 @@ module derivatives_oct_m
   use utils_oct_m
   use varinfo_oct_m
 
-! debug purposes
-! use io_binary_oct_m
-! use io_function_oct_m
-! use io_oct_m
-! use unit_oct_m
-! use unit_system_oct_m
-
   implicit none
 
   private
@@ -80,8 +71,6 @@ module derivatives_oct_m
     derivatives_end,            &
     derivatives_build,          &
     derivatives_handle_batch_t, &
-    dderivatives_test,          &
-    zderivatives_test,          &
    dderivatives_batch_start,    &
    zderivatives_batch_start,    &
    dderivatives_batch_finish,   &
@@ -430,7 +419,7 @@ contains
     order_ = optional_default(order, der%order)
 
     ! initialize nl operator
-    call nl_operator_init(lapl, name_)
+    call nl_operator_init(lapl, name_, symm=OP_SYMMETRIC)
 
     ! create stencil
     select case (der%stencil_type)
@@ -484,7 +473,7 @@ contains
       dir_label = ' '
       if (ii < 5) dir_label = index2axis(ii)
 
-      call nl_operator_init(der%grad(ii), "Gradient "//dir_label)
+      call nl_operator_init(der%grad(ii), "Gradient "//dir_label, symm=OP_ANTISYMMETRIC)
 
       ! create stencil
       select case (der%stencil_type)
diff --git a/src/grid/derivatives_inc.F90 b/src/grid/derivatives_inc.F90
index 99b247edfd..ecb6287463 100644
--- a/src/grid/derivatives_inc.F90
+++ b/src/grid/derivatives_inc.F90
@@ -322,301 +322,6 @@ subroutine X(derivatives_curl)(der, ff, op_ff, ghost_update, set_bc)
   POP_SUB(X(derivatives_curl))
 end subroutine X(derivatives_curl)
 
-
-! ----------------------------------------------------------
-!> @brief unit test for derivatives
-!!
-!! This will be called from the test run mode. 
-subroutine X(derivatives_test)(this, namespace, repetitions, min_blocksize, max_blocksize) - type(derivatives_t), intent(in) :: this - type(namespace_t), intent(in) :: namespace - integer, intent(in) :: repetitions - integer, intent(in) :: min_blocksize - integer, intent(in) :: max_blocksize - - R_TYPE, allocatable :: ff(:), opff(:, :), gradff(:, :), curlff(:, :), res(:), resgrad(:, :), rescurl(:, :) - R_TYPE :: aa, bb, cc - integer :: ip, idir, ist, ib - type(batch_t) :: ffb, opffb - type(batch_t) :: gradffb(this%mesh%box%dim) - integer :: blocksize, itime - logical :: packstates - real(real64) :: stime, etime - character(len=20) :: type - real(real64) :: norm - - call parse_variable(namespace, 'StatesPack', .true., packstates) - - SAFE_ALLOCATE(ff(1:this%mesh%np_part)) - SAFE_ALLOCATE(opff(1:this%mesh%np, 1:this%mesh%box%dim)) - SAFE_ALLOCATE(gradff(1:this%mesh%np_part, 1:this%mesh%box%dim)) - SAFE_ALLOCATE(res(1:this%mesh%np_part)) - SAFE_ALLOCATE(resgrad(1:this%mesh%np, 1:this%mesh%box%dim)) - SAFE_ALLOCATE(curlff(1:this%mesh%np, 1:this%mesh%box%dim)) - -#ifdef R_TREAL - type = 'real' -#else - type = 'complex' -#endif - - ! Note: here we need to use a constant function or anything that - ! is constant at the borders, since we assume that all boundary - ! points have equal values to optimize the application of the nl-operator. - - aa = R_TOTYPE(1.0)/this%mesh%box%bounding_box_l(1) - bb = R_TOTYPE(10.0) - cc = R_TOTYPE(100.0) - -#ifdef R_TCOMPLEX - ! we make things more "complex" - aa = aa + M_ZI*R_TOTYPE(0.01) - bb = bb*exp(M_ZI*R_TOTYPE(0.345)) - cc = cc - M_ZI*R_TOTYPE(50.0) -#endif - - - do ip = 1, this%mesh%np_part - ff(ip) = bb*exp(-aa*sum(this%mesh%x(:, ip)**2)) + cc - end do - do ip = 1, this%mesh%np - do idir = 1, this%mesh%box%dim - gradff(ip, idir) = -M_TWO*aa*bb*this%mesh%x(idir, ip)*exp(-aa*sum(this%mesh%x(:, ip)**2)) - end do - end do - - ! curl only needed in 3D - if (this%mesh%box%dim == 3) then - do ip = 1, this%mesh%np - curlff(ip, 1) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(2, ip) - this%mesh%x(3, ip)) - curlff(ip, 2) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(3, ip) - this%mesh%x(1, ip)) - curlff(ip, 3) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(1, ip) - this%mesh%x(2, ip)) - end do - end if - - message(1) = "Testing Laplacian" - call messages_info(1, namespace=namespace) - - ! test Laplacian - blocksize = min_blocksize - do - - call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) - call X(batch_init)(opffb, 1, 1, blocksize, this%mesh%np) - - do ist = 1, blocksize - call batch_set_state(ffb, ist, this%mesh%np_part, ff) - end do - - if (packstates) then - call ffb%do_pack() - call opffb%do_pack(copy = .false.) 
- end if - - if (repetitions > 1) then - call X(derivatives_batch_perform)(this%lapl, this, ffb, opffb, set_bc = .false., factor = M_HALF) - end if - - stime = loct_clock() - do itime = 1, repetitions - call X(derivatives_batch_perform)(this%lapl, this, ffb, opffb, set_bc = .false., factor = M_HALF) - end do - etime = (loct_clock() - stime)/real(repetitions, real64) - - call batch_get_state(opffb, blocksize, this%mesh%np, res) - - call ffb%end() - call opffb%end() - - do ip = 1, this%mesh%np - res(ip) = M_TWO*res(ip) - & - (M_FOUR*aa**2*bb*sum(this%mesh%x(:, ip)**2)*exp(-aa*sum(this%mesh%x(:, ip)**2)) & - - this%mesh%box%dim*M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))) - end do - - write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & - 'Laplacian ', trim(type), & - ' bsize = ', blocksize, & - ' , error = ', X(mf_nrm2)(this%mesh, res), & - ' , Gflops = ', & -#ifdef R_TREAL - blocksize*this%mesh%np*M_TWO*this%lapl%stencil%size/(etime*1.0e9_real64) -#else - blocksize*this%mesh%np*M_FOUR*this%lapl%stencil%size/(etime*1.0e9_real64) -#endif - - call messages_info(1, namespace=namespace) - - blocksize = 2*blocksize - if (blocksize > max_blocksize) exit - - end do - - message(1) = "Testing Gradient" - call messages_info(1, namespace=namespace) - - ! test Gradient - blocksize = min_blocksize - do - call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) - - do ist = 1, blocksize - call batch_set_state(ffb, ist, this%mesh%np_part, ff) - end do - - if (packstates) then - call ffb%do_pack() - end if - - if (repetitions > 1) then - call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) - end if - - stime = loct_clock() - do itime = 1, repetitions - call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) - end do - etime = (loct_clock() - stime)/real(repetitions, real64) - - do idir = 1, this%mesh%box%dim - call batch_get_state(gradffb(idir), blocksize, this%mesh%np, resgrad(:, idir)) - end do - - call ffb%end() - do idir = 1, this%mesh%box%dim - call gradffb(idir)%end() - end do - - do idir = 1, this%mesh%box%dim - do ip = 1, this%mesh%np - resgrad(ip, idir) = resgrad(ip, idir) - gradff(ip, idir) - end do - end do - - write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & - 'Batch gradient ', trim(type), & - ' bsize = ', blocksize, & - ' , error (x direction) = ', X(mf_nrm2)(this%mesh, resgrad(:, 1)), & - ' , Gflops = ', & -#ifdef R_TREAL - blocksize*this%mesh%np*M_TWO*this%grad(1)%stencil%size*this%mesh%box%dim/(etime*1.0e9_real64) -#else - blocksize*this%mesh%np*M_FOUR*this%grad(1)%stencil%size*this%mesh%box%dim/(etime*1.0e9_real64) -#endif - call messages_info(1, namespace=namespace) - - blocksize = 2*blocksize - if (blocksize > max_blocksize) exit - end do - - call X(derivatives_grad)(this, ff, opff, set_bc = .false.) - - do idir = 1, this%mesh%box%dim - do ip = 1, this%mesh%np - opff(ip, idir) = opff(ip, idir) - (-M_TWO*aa*bb*this%mesh%x_t(ip, idir)*exp(-aa*sum(this%mesh%x(:, ip)**2))) - end do - end do - - message(1) = '' - call messages_info(1, namespace=namespace) - - write(message(1), '(3a, es17.10)') 'Gradient ', trim(type), & - ' err = ', X(mf_nrm2)(this%mesh, this%mesh%box%dim, opff) - call messages_info(1, namespace=namespace) - - message(1) = '' - call messages_info(1, namespace=namespace) - - ! 
test curl for 3D case - if (this%mesh%box%dim == 3) then - message(1) = "Testing curl" - call messages_info(1, namespace=namespace) - - do blocksize = 3, 6, 3 - call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) - - do ist = 1, blocksize - call batch_set_state(ffb, ist, this%mesh%np_part, ff) - end do - - if (packstates) then - call ffb%do_pack() - end if - - ! test for one blocksize without precomputed gradient - if (blocksize == 6) then - call X(derivatives_batch_curl)(this, ffb, set_bc=.false.) - else - call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) - call X(derivatives_batch_curl_from_gradient)(this, ffb, gradffb) - end if - - SAFE_ALLOCATE(rescurl(1:this%mesh%np, 1:blocksize)) - do ist = 1, blocksize - call batch_get_state(ffb, ist, this%mesh%np, rescurl(:, ist)) - end do - - call ffb%end() - do idir = 1, this%mesh%box%dim - call gradffb(idir)%end() - end do - - do ip = 1, this%mesh%np - do ib = 0, blocksize-1, this%mesh%box%dim - do idir = 1, this%mesh%box%dim - rescurl(ip, ib+idir) = rescurl(ip, ib+idir) - curlff(ip, idir) - end do - end do - end do - - norm = M_ZERO - do ist = 1, blocksize - norm = norm + X(mf_nrm2)(this%mesh, rescurl(:, ist)) - end do - - write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & - 'Batch curl ', trim(type), ' bsize = ', blocksize, ' , error = ', norm - call messages_info(1, namespace=namespace) - - SAFE_DEALLOCATE_A(rescurl) - end do - - ! Testing the regular curl with the same logic as above - SAFE_ALLOCATE(rescurl(1:this%mesh%np, 1:this%mesh%box%dim)) - do idir = 1, this%mesh%box%dim - call lalg_copy(this%mesh%np_part, ff, gradff(:, idir)) - end do - call X(derivatives_curl)(this, gradff, rescurl, set_bc = .false.) - - do idir = 1, this%mesh%box%dim - do ip = 1, this%mesh%np - opff(ip, idir) = rescurl(ip, idir) - curlff(ip, idir) - end do - end do - - norm = M_ZERO - do ist = 1, this%mesh%box%dim - norm = norm + X(mf_nrm2)(this%mesh, opff(:, ist)) - end do - - write(message(1), '(3a, es17.10)') 'Curl ', trim(type), ' err = ', norm - call messages_info(1, namespace=namespace) - - SAFE_DEALLOCATE_A(rescurl) - end if - - message(1) = '' - call messages_info(1, namespace=namespace) - - SAFE_DEALLOCATE_A(ff) - SAFE_DEALLOCATE_A(opff) - SAFE_DEALLOCATE_A(gradff) - SAFE_DEALLOCATE_A(res) - SAFE_DEALLOCATE_A(resgrad) - SAFE_DEALLOCATE_A(curlff) - -end subroutine X(derivatives_test) - !===================================================================================== ! Batched versions of the routines: diff --git a/src/grid/derivatives_test.F90 b/src/grid/derivatives_test.F90 new file mode 100644 index 0000000000..c9231922dc --- /dev/null +++ b/src/grid/derivatives_test.F90 @@ -0,0 +1,219 @@ +!! Copyright (C) 2002-2006 M. Marques, A. Castro, A. Rubio, G. Bertsch +!! +!! This program is free software; you can redistribute it and/or modify +!! it under the terms of the GNU General Public License as published by +!! the Free Software Foundation; either version 2, or (at your option) +!! any later version. +!! +!! This program is distributed in the hope that it will be useful, +!! but WITHOUT ANY WARRANTY; without even the implied warranty of +!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +!! GNU General Public License for more details. +!! +!! You should have received a copy of the GNU General Public License +!! along with this program; if not, write to the Free Software +!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +!! 02110-1301, USA. +!! 
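+
+!! The analytic test functions implemented below are plane waves,
+!!   f(x) = cos(k.x),   grad f = -k sin(k.x),   lapl f = -|k|^2 f,
+!! and radial polynomials f = r^p, for which
+!!   grad f = p r^(p-2) x,   lapl f = p (p + d - 2) r^(p-2)
+!! in d dimensions, so that the finite-difference results can be checked
+!! against exact reference values.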
+
+#include "global.h"
+
+!> @brief This module provides unit tests for the derivatives
+
+module derivatives_test_oct_m
+  use batch_oct_m
+  use batch_ops_oct_m
+  use boundaries_oct_m
+  use debug_oct_m
+  use derivatives_oct_m
+  use global_oct_m
+  use, intrinsic :: iso_fortran_env
+  use lalg_basic_oct_m
+  use lalg_adv_oct_m
+  use loct_oct_m
+  use mesh_oct_m
+  use mesh_function_oct_m
+  use messages_oct_m
+  use namespace_oct_m
+  use parser_oct_m
+  use profiling_oct_m
+  use space_oct_m
+  use utils_oct_m
+
+  implicit none
+
+  private
+  public ::            &
+    dderivatives_test, &
+    zderivatives_test, &
+    derivatives_advanced_benchmark
+
+contains
+
+!> @brief Further unit tests designed to challenge the numerical stability of the finite differences
+subroutine derivatives_advanced_benchmark(this, namespace)
+  type(derivatives_t), intent(in) :: this
+  type(namespace_t),   intent(in) :: namespace
+
+  real(real64), allocatable :: ff(:), grad(:,:), resgrad(:, :), lapl(:), reslapl(:)
+  real(real64) :: kvec(3)
+
+  SAFE_ALLOCATE(ff(1:this%mesh%np_part))
+  SAFE_ALLOCATE(grad(1:this%mesh%np, 1:this%mesh%box%dim))
+  SAFE_ALLOCATE(lapl(1:this%mesh%np))
+  SAFE_ALLOCATE(reslapl(1:this%mesh%np))
+  SAFE_ALLOCATE(resgrad(1:this%mesh%np, 1:this%mesh%box%dim))
+
+  ! Testing on some plane waves
+  kvec = [0.5_real64, 0.0_real64, 0.0_real64] ! low frequency
+  call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec)
+  call test_function("Testing Planewave k=(0.5,0.0,0.0)")
+
+  kvec = [1.0_real64, 1.0_real64, 0.0_real64] ! diagonal
+  call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec)
+  call test_function("Testing Planewave k=(1.0,1.0,0.0)")
+
+  kvec = [M_PI/this%mesh%spacing(1), 0.0_real64, 0.0_real64] ! Nyquist
+  call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec)
+  call test_function("Testing Planewave k=(\pi/dr,0.0,0.0)")
+
+  kvec = [M_PI/this%mesh%spacing(1), M_PI/this%mesh%spacing(2), 0.0_real64] ! diagonal Nyquist
+  call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec)
+  call test_function("Testing Planewave k=(\pi/dr,\pi/dr,0.0)")
+
+  ! Testing on some polynomials
+  call testfunc_polynomial(this%mesh, ff, grad, lapl, p=2)
+  call test_function("Testing degree-2 polynomial")
+
+  call testfunc_polynomial(this%mesh, ff, grad, lapl, p=3)
+  call test_function("Testing degree-3 polynomial")
+
+  call testfunc_polynomial(this%mesh, ff, grad, lapl, p=4)
+  call test_function("Testing degree-4 polynomial")
+
+  call testfunc_polynomial(this%mesh, ff, grad, lapl, p=6)
+  call test_function("Testing degree-6 polynomial")
+
+  call testfunc_polynomial(this%mesh, ff, grad, lapl, p=8)
+  call test_function("Testing degree-8 polynomial")
+
+
+  SAFE_DEALLOCATE_A(ff)
+  SAFE_DEALLOCATE_A(grad)
+  SAFE_DEALLOCATE_A(lapl)
+  SAFE_DEALLOCATE_A(resgrad)
+  SAFE_DEALLOCATE_A(reslapl)
+
+contains
+  subroutine test_function(label)
+    character(len=*), intent(in) :: label
+
+    message(1) = trim(label)
+
+    call dderivatives_grad(this, ff, resgrad, set_bc=.false.)
+    resgrad = resgrad - grad
+    write(message(2), '(a, es17.10)') 'Gradient err = ', dmf_nrm2(this%mesh, this%mesh%box%dim, resgrad)
+
+    call dderivatives_lapl(this, ff, reslapl, set_bc=.false.) 
+    reslapl = reslapl - lapl
+    write(message(3), '(a, es17.10)') 'Laplacian err = ', dmf_nrm2(this%mesh, reslapl)
+
+    call messages_info(3, namespace=namespace)
+  end subroutine test_function
+end subroutine derivatives_advanced_benchmark
+
+!> @brief Generate a plane wave of given momentum, together with its exact gradient and Laplacian
+subroutine testfunc_plane_wave(mesh, ff, grad, lapl, kvec)
+  type(mesh_t), intent(in)  :: mesh
+  real(real64), intent(out) :: ff(:), grad(:, :), lapl(:)
+  real(real64), intent(in)  :: kvec(3)
+
+  integer :: ip, dim, idir
+  real(real64) :: phase, kdotx, kk
+
+  dim = mesh%box%dim
+
+  kk = dot_product(kvec(1:dim), kvec(1:dim))
+  do ip = 1, mesh%np
+    kdotx = dot_product(kvec(1:dim), mesh%x(1:dim, ip))
+    phase = cos(kdotx)
+
+    ff(ip) = phase
+
+    ! exact gradient
+    do idir = 1, mesh%box%dim
+      grad(ip, idir) = -kvec(idir)*sin(kdotx)
+    end do
+
+    ! exact Laplacian
+    lapl(ip) = -kk * phase
+  end do
+
+  do ip = mesh%np+1, mesh%np_part
+    kdotx = dot_product(kvec(1:dim), mesh%x(1:dim, ip))
+    phase = cos(kdotx)
+
+    ff(ip) = phase
+  end do
+end subroutine testfunc_plane_wave
+
+!> @brief Generate a polynomial of degree p, together with its exact gradient and Laplacian
+subroutine testfunc_polynomial(mesh, ff, grad, lapl, p)
+  type(mesh_t), intent(in)  :: mesh
+  real(real64), intent(out) :: ff(:), grad(:, :), lapl(:)
+  integer,      intent(in)  :: p
+
+  integer :: ip, d, dim
+  real(real64) :: x(3), r2
+
+  dim = mesh%box%dim
+
+  do ip = 1, mesh%np
+    x = M_ZERO
+    x(1:dim) = mesh%x(1:dim, ip)
+
+    ! f = (x^2 + y^2 + z^2)^(p/2)
+    r2 = dot_product(x(1:dim), x(1:dim))
+    ff(ip) = r2**(M_HALF*p)
+
+    ! gradient: grad(f) = p * r^(p-2) * x
+    if (p >= 2) then
+      do d = 1, dim
+        grad(ip, d) = p * r2**(M_HALF*p - M_ONE) * x(d)
+      end do
+    else
+      grad(ip, :) = M_ZERO
+    end if
+
+    ! Laplacian: ∇² r^p = p (p + d - 2) r^(p-2)
+    if (p >= 2) then
+      lapl(ip) = p * (p + dim - 2) * r2**(M_HALF*p - M_ONE)
+    else
+      lapl(ip) = M_ZERO
+    end if
+  end do
+
+  do ip = mesh%np+1, mesh%np_part
+    x = M_ZERO
+    x(1:dim) = mesh%x(1:dim, ip)
+
+    ! f = (x^2 + y^2 + z^2)^(p/2)
+    r2 = dot_product(x(1:dim), x(1:dim))
+    ff(ip) = r2**(M_HALF*p)
+  end do
+end subroutine testfunc_polynomial
+
+
+#include "undef.F90"
+#include "real.F90"
+#include "derivatives_test_inc.F90"
+
+#include "undef.F90"
+#include "complex.F90"
+#include "derivatives_test_inc.F90"
+
+end module derivatives_test_oct_m
+
+!! Local Variables:
+!! mode: f90
+!! coding: utf-8
+!! End:
diff --git a/src/grid/derivatives_test_inc.F90 b/src/grid/derivatives_test_inc.F90
new file mode 100644
index 0000000000..84028e3362
--- /dev/null
+++ b/src/grid/derivatives_test_inc.F90
@@ -0,0 +1,312 @@
+!! Copyright (C) 2002-2006 M. Marques, A. Castro, A. Rubio, G. Bertsch
+!!
+!! This program is free software; you can redistribute it and/or modify
+!! it under the terms of the GNU General Public License as published by
+!! the Free Software Foundation; either version 2, or (at your option)
+!! any later version.
+!!
+!! This program is distributed in the hope that it will be useful,
+!! but WITHOUT ANY WARRANTY; without even the implied warranty of
+!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+!! GNU General Public License for more details.
+!!
+!! You should have received a copy of the GNU General Public License
+!! along with this program; if not, write to the Free Software
+!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+!! 02110-1301, USA.
+!!
+
+! ----------------------------------------------------------
+!> @brief unit test for derivatives
+!!
+!! 
This will be called from the test run mode. +subroutine X(derivatives_test)(this, namespace, repetitions, min_blocksize, max_blocksize) + type(derivatives_t), intent(in) :: this + type(namespace_t), intent(in) :: namespace + integer, intent(in) :: repetitions + integer, intent(in) :: min_blocksize + integer, intent(in) :: max_blocksize + + R_TYPE, allocatable :: ff(:), opff(:, :), gradff(:, :), curlff(:, :), res(:), resgrad(:, :), rescurl(:, :) + R_TYPE :: aa, bb, cc + integer :: ip, idir, ist, ib + type(batch_t) :: ffb, opffb + type(batch_t) :: gradffb(this%mesh%box%dim) + integer :: blocksize, itime + logical :: packstates + real(real64) :: stime, etime + character(len=20) :: type + real(real64) :: norm + + call parse_variable(namespace, 'StatesPack', .true., packstates) + + SAFE_ALLOCATE(ff(1:this%mesh%np_part)) + SAFE_ALLOCATE(opff(1:this%mesh%np, 1:this%mesh%box%dim)) + SAFE_ALLOCATE(gradff(1:this%mesh%np_part, 1:this%mesh%box%dim)) + SAFE_ALLOCATE(res(1:this%mesh%np_part)) + SAFE_ALLOCATE(resgrad(1:this%mesh%np, 1:this%mesh%box%dim)) + SAFE_ALLOCATE(curlff(1:this%mesh%np, 1:this%mesh%box%dim)) + +#ifdef R_TREAL + type = 'real' +#else + type = 'complex' +#endif + + ! Note: here we need to use a constant function or anything that + ! is constant at the borders, since we assume that all boundary + ! points have equal values to optimize the application of the nl-operator. + + aa = R_TOTYPE(1.0)/this%mesh%box%bounding_box_l(1) + bb = R_TOTYPE(10.0) + cc = R_TOTYPE(100.0) + +#ifdef R_TCOMPLEX + ! we make things more "complex" + aa = aa + M_ZI*R_TOTYPE(0.01) + bb = bb*exp(M_ZI*R_TOTYPE(0.345)) + cc = cc - M_ZI*R_TOTYPE(50.0) +#endif + + + do ip = 1, this%mesh%np_part + ff(ip) = bb*exp(-aa*sum(this%mesh%x(:, ip)**2)) + cc + end do + do ip = 1, this%mesh%np + do idir = 1, this%mesh%box%dim + gradff(ip, idir) = -M_TWO*aa*bb*this%mesh%x(idir, ip)*exp(-aa*sum(this%mesh%x(:, ip)**2)) + end do + end do + + ! curl only needed in 3D + if (this%mesh%box%dim == 3) then + do ip = 1, this%mesh%np + curlff(ip, 1) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(2, ip) - this%mesh%x(3, ip)) + curlff(ip, 2) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(3, ip) - this%mesh%x(1, ip)) + curlff(ip, 3) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(1, ip) - this%mesh%x(2, ip)) + end do + end if + + message(1) = "Testing Laplacian" + call messages_info(1, namespace=namespace) + + ! test Laplacian + blocksize = min_blocksize + do + + call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) + call X(batch_init)(opffb, 1, 1, blocksize, this%mesh%np) + + do ist = 1, blocksize + call batch_set_state(ffb, ist, this%mesh%np_part, ff) + end do + + if (packstates) then + call ffb%do_pack() + call opffb%do_pack(copy = .false.) 
+ end if + + if (repetitions > 1) then + call X(derivatives_batch_perform)(this%lapl, this, ffb, opffb, set_bc = .false., factor = M_HALF) + end if + + stime = loct_clock() + do itime = 1, repetitions + call X(derivatives_batch_perform)(this%lapl, this, ffb, opffb, set_bc = .false., factor = M_HALF) + end do + etime = (loct_clock() - stime)/real(repetitions, real64) + + call batch_get_state(opffb, blocksize, this%mesh%np, res) + + call ffb%end() + call opffb%end() + + do ip = 1, this%mesh%np + res(ip) = M_TWO*res(ip) - & + (M_FOUR*aa**2*bb*sum(this%mesh%x(:, ip)**2)*exp(-aa*sum(this%mesh%x(:, ip)**2)) & + - this%mesh%box%dim*M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))) + end do + + write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & + 'Laplacian ', trim(type), & + ' bsize = ', blocksize, & + ' , error = ', X(mf_nrm2)(this%mesh, res), & + ' , Gflops = ', & +#ifdef R_TREAL + blocksize*this%mesh%np*M_TWO*this%lapl%stencil%size/(etime*1.0e9_real64) +#else + blocksize*this%mesh%np*M_FOUR*this%lapl%stencil%size/(etime*1.0e9_real64) +#endif + + call messages_info(1, namespace=namespace) + + blocksize = 2*blocksize + if (blocksize > max_blocksize) exit + + end do + + message(1) = "Testing Gradient" + call messages_info(1, namespace=namespace) + + ! test Gradient + blocksize = min_blocksize + do + call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) + + do ist = 1, blocksize + call batch_set_state(ffb, ist, this%mesh%np_part, ff) + end do + + if (packstates) then + call ffb%do_pack() + end if + + if (repetitions > 1) then + call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) + end if + + stime = loct_clock() + do itime = 1, repetitions + call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) + end do + etime = (loct_clock() - stime)/real(repetitions, real64) + + do idir = 1, this%mesh%box%dim + call batch_get_state(gradffb(idir), blocksize, this%mesh%np, resgrad(:, idir)) + end do + + call ffb%end() + do idir = 1, this%mesh%box%dim + call gradffb(idir)%end() + end do + + do idir = 1, this%mesh%box%dim + do ip = 1, this%mesh%np + resgrad(ip, idir) = resgrad(ip, idir) - gradff(ip, idir) + end do + end do + + write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & + 'Batch gradient ', trim(type), & + ' bsize = ', blocksize, & + ' , error (x direction) = ', X(mf_nrm2)(this%mesh, resgrad(:, 1)), & + ' , Gflops = ', & +#ifdef R_TREAL + blocksize*this%mesh%np*M_TWO*this%grad(1)%stencil%size*this%mesh%box%dim/(etime*1.0e9_real64) +#else + blocksize*this%mesh%np*M_FOUR*this%grad(1)%stencil%size*this%mesh%box%dim/(etime*1.0e9_real64) +#endif + call messages_info(1, namespace=namespace) + + blocksize = 2*blocksize + if (blocksize > max_blocksize) exit + end do + + call X(derivatives_grad)(this, ff, opff, set_bc = .false.) + + do idir = 1, this%mesh%box%dim + do ip = 1, this%mesh%np + opff(ip, idir) = opff(ip, idir) - (-M_TWO*aa*bb*this%mesh%x(idir, ip)*exp(-aa*sum(this%mesh%x(:, ip)**2))) + end do + end do + + message(1) = '' + call messages_info(1, namespace=namespace) + + write(message(1), '(3a, es17.10)') 'Gradient ', trim(type), & + ' err = ', X(mf_nrm2)(this%mesh, this%mesh%box%dim, opff) + call messages_info(1, namespace=namespace) + + message(1) = '' + call messages_info(1, namespace=namespace) + + ! 
test curl for 3D case + if (this%mesh%box%dim == 3) then + message(1) = "Testing curl" + call messages_info(1, namespace=namespace) + + do blocksize = 3, 6, 3 + call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) + + do ist = 1, blocksize + call batch_set_state(ffb, ist, this%mesh%np_part, ff) + end do + + if (packstates) then + call ffb%do_pack() + end if + + ! test for one blocksize without precomputed gradient + if (blocksize == 6) then + call X(derivatives_batch_curl)(this, ffb, set_bc=.false.) + else + call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) + call X(derivatives_batch_curl_from_gradient)(this, ffb, gradffb) + end if + + SAFE_ALLOCATE(rescurl(1:this%mesh%np, 1:blocksize)) + do ist = 1, blocksize + call batch_get_state(ffb, ist, this%mesh%np, rescurl(:, ist)) + end do + + call ffb%end() + do idir = 1, this%mesh%box%dim + call gradffb(idir)%end() + end do + + do ip = 1, this%mesh%np + do ib = 0, blocksize-1, this%mesh%box%dim + do idir = 1, this%mesh%box%dim + rescurl(ip, ib+idir) = rescurl(ip, ib+idir) - curlff(ip, idir) + end do + end do + end do + + norm = M_ZERO + do ist = 1, blocksize + norm = norm + X(mf_nrm2)(this%mesh, rescurl(:, ist)) + end do + + write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & + 'Batch curl ', trim(type), ' bsize = ', blocksize, ' , error = ', norm + call messages_info(1, namespace=namespace) + + SAFE_DEALLOCATE_A(rescurl) + end do + + ! Testing the regular curl with the same logic as above + SAFE_ALLOCATE(rescurl(1:this%mesh%np, 1:this%mesh%box%dim)) + do idir = 1, this%mesh%box%dim + call lalg_copy(this%mesh%np_part, ff, gradff(:, idir)) + end do + call X(derivatives_curl)(this, gradff, rescurl, set_bc = .false.) + + do idir = 1, this%mesh%box%dim + do ip = 1, this%mesh%np + opff(ip, idir) = rescurl(ip, idir) - curlff(ip, idir) + end do + end do + + norm = M_ZERO + do ist = 1, this%mesh%box%dim + norm = norm + X(mf_nrm2)(this%mesh, opff(:, ist)) + end do + + write(message(1), '(3a, es17.10)') 'Curl ', trim(type), ' err = ', norm + call messages_info(1, namespace=namespace) + + SAFE_DEALLOCATE_A(rescurl) + end if + + message(1) = '' + call messages_info(1, namespace=namespace) + + SAFE_DEALLOCATE_A(ff) + SAFE_DEALLOCATE_A(opff) + SAFE_DEALLOCATE_A(gradff) + SAFE_DEALLOCATE_A(res) + SAFE_DEALLOCATE_A(resgrad) + SAFE_DEALLOCATE_A(curlff) + +end subroutine X(derivatives_test) + diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index f5f198c62e..453d83b098 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -39,6 +39,7 @@ module nl_operator_oct_m use parser_oct_m use profiling_oct_m use space_oct_m + use sort_oct_m use stencil_oct_m use types_oct_m use varinfo_oct_m @@ -80,6 +81,11 @@ module nl_operator_oct_m integer, allocatable :: ri(:, :) end type nl_operator_index_t + integer, public, parameter :: & + OP_GENERAL = 1, & + OP_SYMMETRIC = 2, & + OP_ANTISYMMETRIC = 3 + !> data type for non local operators type nl_operator_t private @@ -95,6 +101,8 @@ module nl_operator_oct_m type(accel_mem_t), public :: buff_weights !< buffer with constant weights type(accel_mem_t), public :: buff_half_weights !< buffer with weights multiplied by -1/2 + integer :: symmetry = OP_GENERAL !< Symmetry of the weights to be used + character(len=40) :: label !> the compressed index of grid points @@ -232,13 +240,15 @@ contains !> @brief initialize an instance of a non-local operator !! by setting the label !! 
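+  !! and, optionally, the symmetry class of its weights
+  !! (OP_GENERAL, OP_SYMMETRIC or OP_ANTISYMMETRIC), which is later
+  !! used to group the stencil points into symmetric pairs.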
-  subroutine nl_operator_init(op, label)
+  subroutine nl_operator_init(op, label, symm)
     type(nl_operator_t), intent(inout) :: op
     character(len=*),    intent(in)    :: label
+    integer, optional,   intent(in)    :: symm
 
     PUSH_SUB(nl_operator_init)
 
     op%label = label
+    op%symmetry = optional_default(symm, OP_GENERAL)
 
     POP_SUB(nl_operator_init)
   end subroutine nl_operator_init
@@ -261,6 +271,7 @@ contains
 
     opo%np   = opi%np
     opo%mesh => opi%mesh
+    opo%symmetry = opi%symmetry
 
     SAFE_ALLOCATE_SOURCE_A(opo%nn, opi%nn)
     SAFE_ALLOCATE_SOURCE_A(opo%w, opi%w)
@@ -762,6 +773,163 @@ contains
     POP_SUB(nl_operator_remove_zero_weight_points)
   end subroutine nl_operator_remove_zero_weight_points
 
+!> Take a list of weights and offsets and build pairs of symmetric points that share a common weight
+subroutine group_by_pairs_sym(size, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg, wcenter)
+  integer,                   intent(in)  :: size
+  integer,                   intent(in)  :: offsets(:, :)
+  real(real64),              intent(in)  :: wre(:)
+  integer,                   intent(in)  :: ri(:, :)
+  integer,                   intent(in)  :: nri
+  integer,                   intent(out) :: npairs
+  real(real64), allocatable, intent(out) :: wpair(:)
+  integer,      allocatable, intent(out) :: pair_pos(:,:), pair_neg(:,:)
+  real(real64),              intent(out) :: wcenter
+
+  logical, allocatable :: used(:)
+  integer :: i, j, d, ndim, s
+  logical :: same
+  real(real64), allocatable :: abs_wre(:)
+  integer,      allocatable :: idx(:)
+
+  real(real64), parameter :: tol = 1.0e-12_real64
+
+  PUSH_SUB(group_by_pairs_sym)
+
+  ASSERT(mod(size, 2) == 1)
+
+  SAFE_ALLOCATE(used(1:size))
+  used = .false.
+  npairs = 0
+
+  ndim = ubound(offsets, dim=1)
+
+  SAFE_ALLOCATE(abs_wre(1:size))
+  SAFE_ALLOCATE(idx(1:size))
+  abs_wre = abs(wre)
+  call sort(abs_wre, idx)
+
+  ! Max pairs = n/2
+  SAFE_ALLOCATE(pair_pos(1:size/2, 1:nri))
+  SAFE_ALLOCATE(pair_neg(1:size/2, 1:nri))
+  SAFE_ALLOCATE(wpair(1:size/2))
+
+  do i = 1, size
+    if (used(i)) cycle
+
+    if (all(offsets(:, idx(i)) == 0)) then
+      wcenter = wre(idx(i))
+      used(i) = .true.
+      cycle
+    end if
+
+    ! Try to find symmetric partner j
+    do j = i+1, size
+      if (used(j)) cycle
+
+      ! Weight equality
+      same = abs(wre(idx(i)) - wre(idx(j))) <= tol
+      if (.not. same) cycle
+
+      ! Offsets equal and opposite
+      if (any(offsets(:, idx(j)) + offsets(:, idx(i)) /= 0)) cycle
+
+      npairs = npairs + 1
+      do s = 1, nri
+        pair_pos(npairs, s) = ri(idx(i), s)
+        pair_neg(npairs, s) = ri(idx(j), s)
+      end do
+      wpair(npairs) = M_HALF*(wre(idx(i)) + wre(idx(j)))
+
+      used(i) = .true.
+      used(j) = .true.
+      exit
+    end do
+  end do
+
+  ASSERT(npairs == size/2)
+
+  SAFE_DEALLOCATE_A(used)
+  SAFE_DEALLOCATE_A(abs_wre)
+  SAFE_DEALLOCATE_A(idx)
+
+  POP_SUB(group_by_pairs_sym)
+end subroutine group_by_pairs_sym
+
+!> Take a list of weights and offsets and build pairs of symmetric points whose weights are equal in magnitude and opposite in sign
+subroutine group_by_pairs_antisym(size, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg)
+  integer,                   intent(in)  :: size
+  integer,                   intent(in)  :: offsets(:, :)
+  real(real64),              intent(in)  :: wre(:)
+  integer,                   intent(in)  :: ri(:, :)
+  integer,                   intent(in)  :: nri
+  integer,                   intent(out) :: npairs
+  real(real64), allocatable, intent(out) :: wpair(:)
+  integer,      allocatable, intent(out) :: pair_pos(:,:), pair_neg(:,:)
+
+  logical, allocatable :: used(:)
+  integer :: i, j, d, ndim, s
+  logical :: same
+  real(real64), allocatable :: abs_wre(:)
+  integer,      allocatable :: idx(:)
+
+  real(real64), parameter :: tol = 1.0e-12_real64
+
+  PUSH_SUB(group_by_pairs_antisym)
+
+  ASSERT(mod(size, 2) == 0)
+
+  SAFE_ALLOCATE(used(1:size))
+  used = .false. 
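+
+  ! The stencil points are sorted below by increasing |weight|, so that the
+  ! pairs are built, and later summed over, from the smallest contributions
+  ! to the largest, which reduces the accumulated roundoff error.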
+  npairs = 0
+
+  ndim = ubound(offsets, dim=1)
+
+  SAFE_ALLOCATE(abs_wre(1:size))
+  SAFE_ALLOCATE(idx(1:size))
+  abs_wre = abs(wre)
+  call sort(abs_wre, idx)
+
+  ! Max pairs = n/2
+  SAFE_ALLOCATE(pair_pos(1:size/2, 1:nri))
+  SAFE_ALLOCATE(pair_neg(1:size/2, 1:nri))
+  SAFE_ALLOCATE(wpair(1:size/2))
+
+  do i = 1, size
+    if (used(i)) cycle
+
+    ! Try to find symmetric partner j
+    do j = i+1, size
+      if (used(j)) cycle
+
+      ! Weight equality
+      same = abs(wre(idx(i)) + wre(idx(j))) <= tol
+      if (.not. same) cycle
+
+      ! Offsets equal and opposite
+      if (any(offsets(:, idx(j)) + offsets(:, idx(i)) /= 0)) cycle
+
+      npairs = npairs + 1
+      do s = 1, nri
+        pair_pos(npairs, s) = ri(idx(i), s)
+        pair_neg(npairs, s) = ri(idx(j), s)
+      end do
+      wpair(npairs) = M_HALF*(wre(idx(i)) - wre(idx(j)))
+
+      used(i) = .true.
+      used(j) = .true.
+      exit
+    end do
+  end do
+
+  ASSERT(npairs == size/2)
+
+  SAFE_DEALLOCATE_A(used)
+  SAFE_DEALLOCATE_A(abs_wre)
+  SAFE_DEALLOCATE_A(idx)
+  POP_SUB(group_by_pairs_antisym)
+end subroutine group_by_pairs_antisym
+
+
 #include "undef.F90"
 #include "real.F90"
 #include "nl_operator_inc.F90"
diff --git a/src/grid/nl_operator_inc.F90 b/src/grid/nl_operator_inc.F90
index d2eb96aec1..5e3e8a2e13 100644
--- a/src/grid/nl_operator_inc.F90
+++ b/src/grid/nl_operator_inc.F90
@@ -41,6 +41,11 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point
   integer, parameter :: logldf = 1
 #endif
   integer :: nri_loc, ini
+  !> Symmetry of the stencil can be used to group points by pairs
+  integer :: npairs
+  real(real64), allocatable :: wpair(:)
+  integer, allocatable :: ri_pos(:,:), ri_neg(:,:)
+  real(real64) :: wcenter
 
   PUSH_SUB(X(nl_operator_operate_batch))
 
@@ -86,7 +91,24 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point
     use_accel = .true.
     call operate_accel()
   else if (X(function_global) == OP_FORTRAN) then
-    call operate_const_weights()
+    select case(op%symmetry)
+    case(OP_GENERAL)
+      call group_by_pairs_antisym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg)
+      call operate_const_weights_antisymmetric()
+      SAFE_DEALLOCATE_A(wpair)
+      SAFE_DEALLOCATE_A(ri_pos)
+      SAFE_DEALLOCATE_A(ri_neg)
+    case(OP_SYMMETRIC)
+      call group_by_pairs_sym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg, wcenter)
+      call operate_const_weights_symmetric()
+      SAFE_DEALLOCATE_A(wpair)
+      SAFE_DEALLOCATE_A(ri_pos)
+      SAFE_DEALLOCATE_A(ri_neg)
+    case(OP_ANTISYMMETRIC)
+      call operate_const_weights()
+    case default
+      ASSERT(.false.)
+    end select
   else
 
 ! for the moment this is not implemented
 
     !$omp parallel private(ini, nri_loc, ist)
     call multicomm_divide_range_omp(nri, ini, nri_loc)
@@ -203,6 +225,109 @@ contains
     POP_SUB(X(nl_operator_operate_batch).operate_const_weights)
   end subroutine operate_const_weights
 
+  ! ---------------------------------------------------------
+  subroutine operate_const_weights_antisymmetric()
+    integer :: nn, ll, ii, ist, ipair
+
+    PUSH_SUB(X(nl_operator_operate_batch).operate_const_weights_antisymmetric)
+
+    nn = op%stencil%size
+
+    select case (fi%status())
+
+    case (BATCH_DEVICE_PACKED)
+
+      ASSERT(.false.) 
+ + case (BATCH_NOT_PACKED) + + !$omp parallel do private(ll, ist, ii, ipair) + do ll = 1, nri + do ii = imin(ll) + 1, imax(ll) + do ist = 1, fi%nst_linear + fo%X(ff_linear)(ii, ist) = M_ZERO + do ipair = 1, npairs + fo%X(ff_linear)(ii, ist) = fo%X(ff_linear)(ii, ist) + & + wpair(ipair)*(fi%X(ff_linear)(ii + ri_pos(ipair, ll), ist) - fi%X(ff_linear)(ii + ri_neg(ipair, ll), ist)) + end do + end do + end do + end do + !$omp end parallel do + + case (BATCH_PACKED) + + !$omp parallel do private(ll, ist, ii, ipair) + do ll = 1, nri + do ii = imin(ll) + 1, imax(ll) + do ist = 1, fi%nst_linear + fo%X(ff_pack)(ist, ii) = M_ZERO + do ipair = 1, npairs + fo%X(ff_pack)(ist, ii) = fo%X(ff_pack)(ist, ii) & + + wpair(ipair) * (fi%X(ff_pack)(ist, ii + ri_pos(ipair, ll)) - fi%X(ff_pack)(ist, ii + ri_neg(ipair, ll))) + end do + end do + end do + end do + !$omp end parallel do + + end select + + POP_SUB(X(nl_operator_operate_batch).operate_const_weights_antisymmetric) + end subroutine operate_const_weights_antisymmetric + + ! --------------------------------------------------------- + subroutine operate_const_weights_symmetric() + integer :: nn, ll, ii, ist, ipair + + PUSH_SUB(X(nl_operator_operate_batch).operate_const_weights_symmetric) + + nn = op%stencil%size + + select case (fi%status()) + + case (BATCH_DEVICE_PACKED) + + ASSERT(.false.) + + case (BATCH_NOT_PACKED) + + !$omp parallel do private(ll, ist, ii, ipair) + do ll = 1, nri + do ii = imin(ll) + 1, imax(ll) + do ist = 1, fi%nst_linear + fo%X(ff_linear)(ii, ist) = M_ZERO + do ipair = 1, npairs + fo%X(ff_linear)(ii, ist) = fo%X(ff_linear)(ii, ist) + & + wpair(ipair)*(fi%X(ff_linear)(ii + ri_pos(ipair, ll), ist) + fi%X(ff_linear)(ii + ri_neg(ipair, ll), ist)) + end do + fo%X(ff_linear)(ii, ist) = fo%X(ff_linear)(ii, ist) + wcenter * fi%X(ff_linear)(ii, ist) + end do + end do + end do + !$omp end parallel do + + case (BATCH_PACKED) + + !$omp parallel do private(ll, ist, ii, ipair) + do ll = 1, nri + do ii = imin(ll) + 1, imax(ll) + do ist = 1, fi%nst_linear + fo%X(ff_pack)(ist, ii) = M_ZERO + do ipair = 1, npairs + fo%X(ff_pack)(ist, ii) = fo%X(ff_pack)(ist, ii) & + + wpair(ipair) * (fi%X(ff_pack)(ist, ii + ri_pos(ipair, ll)) + fi%X(ff_pack)(ist, ii + ri_neg(ipair, ll))) + end do + fo%X(ff_pack)(ist, ii) = fo%X(ff_pack)(ist, ii) + wcenter * fi%X(ff_pack)(ist, ii) + end do + end do + end do + !$omp end parallel do + + end select + + POP_SUB(X(nl_operator_operate_batch).operate_const_weights_symmetric) + end subroutine operate_const_weights_symmetric ! 
---------------------------------------------------------
   subroutine operate_non_const_weights()
diff --git a/src/main/test.F90 b/src/main/test.F90
index d33d56c0c0..0a8d3be31c 100644
--- a/src/main/test.F90
+++ b/src/main/test.F90
@@ -34,6 +34,7 @@ module test_oct_m
   use debug_oct_m
   use density_oct_m
   use derivatives_oct_m
+  use derivatives_test_oct_m
   use eigen_chebyshev_oct_m
   use electron_space_oct_m
   use electrons_oct_m
@@ -1303,6 +1304,8 @@ contains
       call zderivatives_test(sys%gr%der, sys%namespace, param%repetitions, param%min_blocksize, param%max_blocksize)
     end if
 
+    call derivatives_advanced_benchmark(sys%gr%der, sys%namespace)
+
     SAFE_DEALLOCATE_P(sys)
 
     POP_SUB(test_derivatives)
diff --git a/src/utils/dielectric_function.F90 b/src/utils/dielectric_function.F90
index 750875b12b..754aa2301b 100644
--- a/src/utils/dielectric_function.F90
+++ b/src/utils/dielectric_function.F90
@@ -84,7 +84,7 @@ contains
     real(real64) :: dt, dt_ref, tt, ww, norm_Aext
     real(real64), allocatable :: vecpot(:, :), Aext(:), Eind_real(:, :), Eind_imag(:, :)
     complex(real64), allocatable :: dielectric(:), chi(:), invdielectric(:)
-    real(real64), allocatable :: vecpot_ref(:, :)
+    real(real64), allocatable :: vecpot_ref(:, :), average(:)
     type(spectrum_t) :: spectrum
     type(block_t) :: blk
     type(space_t) :: space
@@ -102,6 +102,7 @@ contains
 
     SAFE_ALLOCATE(Aext(1:space%dim))
     SAFE_ALLOCATE(Eind(1:space%dim))
+    SAFE_ALLOCATE(average(1:space%dim*3))
 
     if (parse_block(global_namespace, 'GaugeVectorField', blk) == 0) then
 
@@ -192,8 +193,8 @@ contains
         read(ref_file, *) jj, tt, (vecpot_ref(ii, kk), kk = 1, space%dim*3)
       end do
       call io_close(ref_file)
-      do ii = 1, time_steps
-        do kk = 1, space%dim*3
+      do kk = 1, space%dim*3
+        do ii = 1, time_steps
           vecpot(ii, kk) = vecpot(ii, kk) - vecpot_ref(ii, kk)
         end do
       end do
@@ -203,6 +204,20 @@ contains
       trim(io_workpath('td.general/gauge_field', global_namespace))//"'"
     call messages_info(1)
 
+    ! Treat the Drude peak
+    ! We remove the offset before applying the mask, to get a proper treatment of metals
+    do kk = 1, space%dim*3
+      average(kk) = M_ZERO
+      do ii = time_steps/2, time_steps
+        average(kk) = average(kk) + vecpot(ii, kk)
+      end do
+      average(kk) = average(kk)/(time_steps - time_steps/2 + 1)
+      do ii = 1, time_steps
+        vecpot(ii, kk) = vecpot(ii, kk) - average(kk)
+      end do
+    end do
+    Aext(1:space%dim) = Aext(1:space%dim) + average(1:space%dim)
+
     ! Find out the iteration numbers corresponding to the time limits.
     ! Max time correspond to time_steps-1, as we start from 0.
 
@@ -297,6 +312,7 @@ contains
     SAFE_DEALLOCATE_A(vecpot_ref)
     SAFE_DEALLOCATE_A(Aext)
     SAFE_DEALLOCATE_A(Eind)
+    SAFE_DEALLOCATE_A(average)
    SAFE_DEALLOCATE_A(Eind_real)
    SAFE_DEALLOCATE_A(Eind_imag)
-- 
GitLab


From d0be0c96390244077a812daf567e9f781a8b7a34 Mon Sep 17 00:00:00 2001
From: NicolasTD
Date: Fri, 12 Dec 2025 19:58:43 +0100
Subject: [PATCH 02/11] Implement the symmetric and antisymmetric operators in
 vectorized forms

Add an input variable to control whether the symmetric form is used or not.
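
For reference, this is a minimal scalar sketch of what the vectorized kernels
added below compute, written with the names introduced in patch 01 (wpair,
wcenter, ri_pos, ri_neg); the SIMD code in operate_sym_inc.c and
operate_antisym_inc.c unrolls these loops over vector lanes and block depths:

    ! symmetric stencil (e.g. the Laplacian):
    ! fo(ii) = wcenter*fi(ii) + sum over pairs of wpair*(fi(ii+r) + fi(ii-r))
    fo(ii) = wcenter*fi(ii)
    do ipair = 1, npairs
      fo(ii) = fo(ii) + wpair(ipair)*(fi(ii + ri_pos(ipair, ll)) + fi(ii + ri_neg(ipair, ll)))
    end do

    ! antisymmetric stencil (e.g. a gradient component): there is no central
    ! weight, and the two points of a pair enter with opposite signs
    fo(ii) = M_ZERO
    do ipair = 1, npairs
      fo(ii) = fo(ii) + wpair(ipair)*(fi(ii + ri_pos(ipair, ll)) - fi(ii + ri_neg(ipair, ll)))
    end do

As the pairs are built in order of increasing |weight|, both loops accumulate
the smallest contributions first.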
---
 src/grid/grid.F90              |   3 +-
 src/grid/nl_operator.F90       |  14 +-
 src/grid/nl_operator_inc.F90   |  56 +++-
 src/grid/operate.c             |  73 +++++
 src/grid/operate_antisym_inc.c | 480 ++++++++++++++++++++++++++++
 src/grid/operate_f.F90         |  71 +++++
 src/grid/operate_sym_inc.c     | 553 +++++++++++++++++++++++++++++++++
 src/include/vectors.h          |   8 +
 8 files changed, 1240 insertions(+), 18 deletions(-)
 create mode 100644 src/grid/operate_antisym_inc.c
 create mode 100644 src/grid/operate_sym_inc.c

diff --git a/src/grid/grid.F90 b/src/grid/grid.F90
index 5c90041617..423e9200d7 100644
--- a/src/grid/grid.F90
+++ b/src/grid/grid.F90
@@ -193,6 +193,7 @@ contains
     call initialize_coordinate_system(gr, namespace, space, grid_spacing, latt, n_sites, site_position)
 
     ! initialize derivatives
+    call nl_operator_global_init(namespace)
     call derivatives_init(gr%der, namespace, space, gr%coord_system)
     ! the stencil used to generate the grid is a union of a cube (for
     ! multigrid) and the Laplacian
@@ -254,6 +255,7 @@ contains
     call initialize_coordinate_system(gr, namespace, space, gr%spacing, latt, n_sites, site_position)
 
     ! initialize derivatives
+    call nl_operator_global_init(namespace)
     call derivatives_init(gr%der, namespace, space, gr%coord_system)
     ! the stencil used to generate the grid is a union of a cube (for
     ! multigrid) and the Laplacian
@@ -383,7 +385,6 @@ contains
 
     call mesh_init_stage_3(gr, namespace, space, gr%stencil, mc)
 
-    call nl_operator_global_init(namespace)
     call derivatives_build(gr%der, namespace, space, gr, qvector)
 
     ! print info concerning the grid
diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90
index 453d83b098..3acbed8422 100644
--- a/src/grid/nl_operator.F90
+++ b/src/grid/nl_operator.F90
@@ -157,6 +157,7 @@ module nl_operator_oct_m
   integer :: dfunction_global = -1
   integer :: zfunction_global = -1
   integer :: function_accel
+  logical :: use_symmetries = .false.
 
 contains
 
@@ -204,6 +205,14 @@ contains
       call parse_variable(namespace, 'OperateComplex', default, zfunction_global)
       if (.not. varinfo_valid_option('OperateComplex', zfunction_global)) call messages_input_error(namespace, 'OperateComplex')
 
+      !%Variable OperateUseSymmetries
+      !%Type logical
+      !%Section Execution::Optimization
+      !%Default no
+      !%Description
+      !% This variable selects whether the operators are built using the symmetries of their weights or not.
+      !%End
+      call parse_variable(namespace, 'OperateUseSymmetries', .false., use_symmetries)
 
       if (accel_is_enabled()) then
 
@@ -248,7 +257,10 @@ contains
     PUSH_SUB(nl_operator_init)
 
     op%label = label
-    op%symmetry = optional_default(symm, OP_GENERAL)
+    op%symmetry = OP_GENERAL
+    if (use_symmetries) then
+      op%symmetry = optional_default(symm, OP_GENERAL)
+    end if
 
     POP_SUB(nl_operator_init)
   end subroutine nl_operator_init
diff --git a/src/grid/nl_operator_inc.F90 b/src/grid/nl_operator_inc.F90
index 5e3e8a2e13..a5d3eb1fae 100644
--- a/src/grid/nl_operator_inc.F90
+++ b/src/grid/nl_operator_inc.F90
@@ -33,8 +33,7 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point
   logical :: ghost_update_, profile_, use_accel
   integer :: nri
   integer, pointer :: imin(:), imax(:), ri(:, :)
-! 
real(real64), allocatable :: wre(:), wim(:) - R_BASE, allocatable :: wre(:), wim(:) + R_BASE, allocatable :: wre(:) #ifdef R_TREAL integer, parameter :: logldf = 0 #else @@ -92,26 +91,27 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point call operate_accel() else if (X(function_global) == OP_FORTRAN) then select case(op%symmetry) - case(OP_GENERAL) + case(OP_ANTISYMMETRIC) call group_by_pairs_antisym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg) call operate_const_weights_antisymmetric() - SAFE_DEALLOCATE_A(wpair) - SAFE_DEALLOCATE_A(ri_pos) - SAFE_DEALLOCATE_A(ri_neg) case(OP_SYMMETRIC) call group_by_pairs_sym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg, wcenter) call operate_const_weights_symmetric() - SAFE_DEALLOCATE_A(wpair) - SAFE_DEALLOCATE_A(ri_pos) - SAFE_DEALLOCATE_A(ri_neg) - case(OP_ANTISYMMETRIC) + case(OP_GENERAL) call operate_const_weights() case default ASSERT(.false.) end select else -! for the moment this is not implemented + select case(op%symmetry) + case(OP_SYMMETRIC) + call group_by_pairs_sym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg, wcenter) + case(OP_ANTISYMMETRIC) + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg) + end select + + ! for the moment this is not implemented !$omp parallel private(ini, nri_loc, ist) call multicomm_divide_range_omp(nri, ini, nri_loc) @@ -120,16 +120,38 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point ASSERT(ubound(fi%X(ff_pack), dim = 2) >= op%mesh%np_part) ASSERT(ubound(fo%X(ff_pack), dim = 2) >= op%mesh%np) - call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), & - fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) + select case(op%symmetry) + case(OP_GENERAL) + call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), & + fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) + case(OP_SYMMETRIC) + call X(operate_ri_sym_vec)(npairs, wpair(1), wcenter, nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + imin(ini), imax(ini), fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) + case(OP_ANTISYMMETRIC) + call X(operate_ri_antisym_vec)(npairs, wpair(1), nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + imin(ini), imax(ini), fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) + case default + ASSERT(.false.) 
+ end select else do ist = 1, fi%nst_linear ASSERT(ubound(fi%X(ff_linear), dim=1) == op%mesh%np_part) ASSERT(ubound(fo%X(ff_linear), dim=1) >= op%mesh%np) - call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), & - fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) + select case(op%symmetry) + case(OP_GENERAL) + call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), & + fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) + case(OP_SYMMETRIC) + call X(operate_ri_sym_vec)(npairs, wpair(1), wcenter, nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + imin(ini), imax(ini), fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) + case(OP_ANTISYMMETRIC) + call X(operate_ri_antisym_vec)(npairs, wpair(1), nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + imin(ini), imax(ini), fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) + case default + ASSERT(.false.) + end select end do end if !$omp end parallel @@ -143,7 +165,9 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point end if SAFE_DEALLOCATE_A(wre) - SAFE_DEALLOCATE_A(wim) + SAFE_DEALLOCATE_A(wpair) + SAFE_DEALLOCATE_A(ri_pos) + SAFE_DEALLOCATE_A(ri_neg) if (profile_) call profiling_out(TOSTRING(X(NL_OPERATOR_BATCH))) POP_SUB(X(nl_operator_operate_batch)) diff --git a/src/grid/operate.c b/src/grid/operate.c index 482b8ed3ce..ce5071e9a6 100644 --- a/src/grid/operate.c +++ b/src/grid/operate.c @@ -64,6 +64,57 @@ void FC_FUNC_(doperate_ri_vec, } } +void FC_FUNC_(doperate_ri_sym_vec, + DOPERATE_RI_SYM_VEC)(const int *opn, const double *restrict wpair, const double *restrict wcenter, + const int *opnri, const int *opri_pos, const int *opri_neg, + const int *rimap_inv, const int *rimap_inv_max, + const double *restrict fi, const int *ldfp, + double *restrict fo) { + const size_t ldf = ldfp[0]; + + /* check whether we got aligned vectors or not */ + int aligned = 1; + aligned = aligned && (((long long)fi) % (8 * VEC_SIZE) == 0); + aligned = aligned && (((long long)fo) % (8 * VEC_SIZE) == 0); + aligned = aligned && ((1 << ldf) % VEC_SIZE == 0); + + if (aligned) { +#define ALIGNED +#include "operate_sym_inc.c" +#undef ALIGNED + + } else { + /* not aligned */ + #include "operate_sym_inc.c" + } +} + +void FC_FUNC_(doperate_ri_antisym_vec, + DOPERATE_RI_ANTISYM_VEC)(const int *opn, const double *restrict wpair, + const int *opnri, const int *opri_pos, const int *opri_neg, + const int *rimap_inv, const int *rimap_inv_max, + const double *restrict fi, const int *ldfp, + double *restrict fo) { + const size_t ldf = ldfp[0]; + + /* check whether we got aligned vectors or not */ + int aligned = 1; + aligned = aligned && (((long long)fi) % (8 * VEC_SIZE) == 0); + aligned = aligned && (((long long)fo) % (8 * VEC_SIZE) == 0); + aligned = aligned && ((1 << ldf) % VEC_SIZE == 0); + + if (aligned) { +#define ALIGNED +#include "operate_antisym_inc.c" +#undef ALIGNED + + } else { + /* not aligned */ +#include "operate_antisym_inc.c" + } +} + + /* the same as doperate_ri_vec, but allows giving each an appropriate Fortan interface in which fi and fo are actually complex in Fortran Could be inline, but in that case pgcc will not put it in the symbol table. 
*/
@@ -77,6 +128,28 @@ void FC_FUNC_(zoperate_ri_vec,
 (opn, w, opnri, opri, rimap_inv, rimap_inv_max, fi, ldfp, fo);
 }
 
+void FC_FUNC_(zoperate_ri_sym_vec,
+              ZOPERATE_RI_SYM_VEC)(const int *opn, const double *restrict w,
+                                   const double *restrict wcenter,
+                                   const int *opnri, const int *opri_pos, const int *opri_neg,
+                                   const int *rimap_inv, const int *rimap_inv_max,
+                                   const double *restrict fi, const int *ldfp,
+                                   double *restrict fo) {
+  FC_FUNC_(doperate_ri_sym_vec, DOPERATE_RI_SYM_VEC)
+  (opn, w, wcenter, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo);
+}
+
+void FC_FUNC_(zoperate_ri_antisym_vec,
+              ZOPERATE_RI_ANTISYM_VEC)(const int *opn, const double *restrict w,
+                                       const int *opnri, const int *opri_pos, const int *opri_neg,
+                                       const int *rimap_inv, const int *rimap_inv_max,
+                                       const double *restrict fi, const int *ldfp,
+                                       double *restrict fo) {
+  FC_FUNC_(doperate_ri_antisym_vec, DOPERATE_RI_ANTISYM_VEC)
+  (opn, w, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo);
+}
+
+
 void FC_FUNC_(dgauss_seidel,
               DGAUSS_SEIDEL)(const int *opn, const double *restrict w,
                              const int *opnri, const int *opri,
diff --git a/src/grid/operate_antisym_inc.c b/src/grid/operate_antisym_inc.c
new file mode 100644
index 0000000000..3e14fc74a4
--- /dev/null
+++ b/src/grid/operate_antisym_inc.c
@@ -0,0 +1,480 @@
+/*
+  Copyright (C) 2006 X. Andrade
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2, or (at your option)
+  any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA. 
+ +*/ + +#ifdef ALIGNED +#define LOAD VEC_LD +#define STORE VEC_ST +#else +#define LOAD VEC_LDU +#define STORE VEC_STU +#endif + +{ + const ptrdiff_t npairs = opn[0]; + const ptrdiff_t nri = opnri[0]; +#if DEPTH >= 32 + const ptrdiff_t unroll32 = max1(32 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 16 + const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 8 + const ptrdiff_t unroll8 = max1(8 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 4 + const ptrdiff_t unroll4 = max1(4 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 2 + const ptrdiff_t unroll2 = max1(2 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 1 + const ptrdiff_t unroll1 = max1(1 * VEC_SIZE >> ldf); +#endif + + ptrdiff_t l, i, j; + const int *restrict index_pos; + const int *restrict index_neg; + + for (l = 0; l < nri; l++) { + index_pos = opri_pos + npairs * l; + index_neg = opri_neg + npairs * l; + i = rimap_inv[l]; + +#if DEPTH >= 32 + for (; i < (rimap_inv_max[l] - unroll32 + 1); i += unroll32) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 32 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + register VEC_TYPE a4, a5, a6, a7; + register VEC_TYPE a8, a9, aa, ab; + register VEC_TYPE ac, ad, ae, af; + register VEC_TYPE b0, b1, b2, b3; + register VEC_TYPE b4, b5, b6, b7; + register VEC_TYPE b8, b9, ba, bb; + register VEC_TYPE bc, bd, be, bf; + + a0 = a1 = a2 = a3 = VEC_ZERO; + a4 = a5 = a6 = a7 = VEC_ZERO; + a8 = a9 = aa = ab = VEC_ZERO; + ac = ad = ae = af = VEC_ZERO; + b0 = b1 = b2 = b3 = VEC_ZERO; + b4 = b5 = b6 = b7 = VEC_ZERO; + b8 = b9 = ba = bb = VEC_ZERO; + bc = bd = be = bf = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7); + pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE + k); + a8 = VEC_FMA(wj, VEC_SUB(pos, neg), a8); + pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE + k); + a9 = VEC_FMA(wj, VEC_SUB(pos, neg), a9); + pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE + k); + aa = VEC_FMA(wj, VEC_SUB(pos, neg), aa); + pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE + k); + ab = 
VEC_FMA(wj, VEC_SUB(pos, neg), ab);
+        pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE + k);
+        ac = VEC_FMA(wj, VEC_SUB(pos, neg), ac);
+        pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE + k);
+        ad = VEC_FMA(wj, VEC_SUB(pos, neg), ad);
+        pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE + k);
+        ae = VEC_FMA(wj, VEC_SUB(pos, neg), ae);
+        pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE + k);
+        af = VEC_FMA(wj, VEC_SUB(pos, neg), af);
+        pos = LOAD(fi + indexj_pos + 16 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 16 * VEC_SIZE + k);
+        b0 = VEC_FMA(wj, VEC_SUB(pos, neg), b0);
+        pos = LOAD(fi + indexj_pos + 17 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 17 * VEC_SIZE + k);
+        b1 = VEC_FMA(wj, VEC_SUB(pos, neg), b1);
+        pos = LOAD(fi + indexj_pos + 18 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 18 * VEC_SIZE + k);
+        b2 = VEC_FMA(wj, VEC_SUB(pos, neg), b2);
+        pos = LOAD(fi + indexj_pos + 19 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 19 * VEC_SIZE + k);
+        b3 = VEC_FMA(wj, VEC_SUB(pos, neg), b3);
+        pos = LOAD(fi + indexj_pos + 20 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 20 * VEC_SIZE + k);
+        b4 = VEC_FMA(wj, VEC_SUB(pos, neg), b4);
+        pos = LOAD(fi + indexj_pos + 21 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 21 * VEC_SIZE + k);
+        b5 = VEC_FMA(wj, VEC_SUB(pos, neg), b5);
+        pos = LOAD(fi + indexj_pos + 22 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 22 * VEC_SIZE + k);
+        b6 = VEC_FMA(wj, VEC_SUB(pos, neg), b6);
+        pos = LOAD(fi + indexj_pos + 23 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 23 * VEC_SIZE + k);
+        b7 = VEC_FMA(wj, VEC_SUB(pos, neg), b7);
+        pos = LOAD(fi + indexj_pos + 24 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 24 * VEC_SIZE + k);
+        b8 = VEC_FMA(wj, VEC_SUB(pos, neg), b8);
+        pos = LOAD(fi + indexj_pos + 25 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 25 * VEC_SIZE + k);
+        b9 = VEC_FMA(wj, VEC_SUB(pos, neg), b9);
+        pos = LOAD(fi + indexj_pos + 26 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 26 * VEC_SIZE + k);
+        ba = VEC_FMA(wj, VEC_SUB(pos, neg), ba);
+        pos = LOAD(fi + indexj_pos + 27 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 27 * VEC_SIZE + k);
+        bb = VEC_FMA(wj, VEC_SUB(pos, neg), bb);
+        pos = LOAD(fi + indexj_pos + 28 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 28 * VEC_SIZE + k);
+        bc = VEC_FMA(wj, VEC_SUB(pos, neg), bc);
+        pos = LOAD(fi + indexj_pos + 29 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 29 * VEC_SIZE + k);
+        bd = VEC_FMA(wj, VEC_SUB(pos, neg), bd);
+        pos = LOAD(fi + indexj_pos + 30 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 30 * VEC_SIZE + k);
+        be = VEC_FMA(wj, VEC_SUB(pos, neg), be);
+        pos = LOAD(fi + indexj_pos + 31 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 31 * VEC_SIZE + k);
+        bf = VEC_FMA(wj, VEC_SUB(pos, neg), bf);
+      }
+      STORE(fo + (i << ldf) + k, a0);
+      STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
+      STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
+      STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
+      STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4);
+      STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5);
+      STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6);
+      STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7);
+      STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8);
+      STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9);
+      STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa);
+      STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab);
+      STORE(fo + (i << ldf) + 12 * VEC_SIZE 
+ k, ac); + STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); + STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); + STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); + STORE(fo + (i << ldf) + 16 * VEC_SIZE + k, b0); + STORE(fo + (i << ldf) + 17 * VEC_SIZE + k, b1); + STORE(fo + (i << ldf) + 18 * VEC_SIZE + k, b2); + STORE(fo + (i << ldf) + 19 * VEC_SIZE + k, b3); + STORE(fo + (i << ldf) + 20 * VEC_SIZE + k, b4); + STORE(fo + (i << ldf) + 21 * VEC_SIZE + k, b5); + STORE(fo + (i << ldf) + 22 * VEC_SIZE + k, b6); + STORE(fo + (i << ldf) + 23 * VEC_SIZE + k, b7); + STORE(fo + (i << ldf) + 24 * VEC_SIZE + k, b8); + STORE(fo + (i << ldf) + 25 * VEC_SIZE + k, b9); + STORE(fo + (i << ldf) + 26 * VEC_SIZE + k, ba); + STORE(fo + (i << ldf) + 27 * VEC_SIZE + k, bb); + STORE(fo + (i << ldf) + 28 * VEC_SIZE + k, bc); + STORE(fo + (i << ldf) + 29 * VEC_SIZE + k, bd); + STORE(fo + (i << ldf) + 30 * VEC_SIZE + k, be); + STORE(fo + (i << ldf) + 31 * VEC_SIZE + k, bf); + } + } +#endif + +#if DEPTH >= 16 + for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 16 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + register VEC_TYPE a4, a5, a6, a7; + register VEC_TYPE a8, a9, aa, ab; + register VEC_TYPE ac, ad, ae, af; + + a0 = a1 = a2 = a3 = VEC_ZERO; + a4 = a5 = a6 = a7 = VEC_ZERO; + a8 = a9 = aa = ab = VEC_ZERO; + ac = ad = ae = af = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7); + pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE + k); + a8 = VEC_FMA(wj, VEC_SUB(pos, neg), a8); + pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE + k); + a9 = VEC_FMA(wj, VEC_SUB(pos, neg), a9); + pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE + k); + aa = VEC_FMA(wj, VEC_SUB(pos, neg), aa); + pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE + k); + ab = VEC_FMA(wj, VEC_SUB(pos, neg), ab); + pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE + k); + ac = VEC_FMA(wj, VEC_SUB(pos, neg), ac); + pos = 
LOAD(fi + indexj_pos + 13 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE + k); + ad = VEC_FMA(wj, VEC_SUB(pos, neg), ad); + pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE + k); + ae = VEC_FMA(wj, VEC_SUB(pos, neg), ae); + pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE + k); + af = VEC_FMA(wj, VEC_SUB(pos, neg), af); + + } + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); + STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); + STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); + STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); + STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); + STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); + STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9); + STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); + STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); + STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); + STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); + STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); + STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); + } + } +#endif + +#if DEPTH >= 8 + for (; i < (rimap_inv_max[l] - unroll8 + 1); i += unroll8) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 8 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + register VEC_TYPE a4, a5, a6, a7; + + a0 = a1 = a2 = a3 = VEC_ZERO; + a4 = a5 = a6 = a7 = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7); + } + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); + STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); + STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); + STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); + STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); + } + } +#endif + +#if DEPTH >= 4 + for (; i < (rimap_inv_max[l] - unroll4 + 1); i += unroll4) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 4 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + + a0 = a1 = a2 = a3 = VEC_ZERO; + + for (j = 
0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); + } + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); + STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + } + } +#endif + +#if DEPTH >= 2 + for (; i < (rimap_inv_max[l] - unroll2 + 1); i += unroll2) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 2 * VEC_SIZE) { + register VEC_TYPE a0, a1; + + a0 = a1 = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); + } + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + } + } +#endif + +#if DEPTH >= 1 + for (; i < (rimap_inv_max[l] - unroll1 + 1); i += unroll1) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += VEC_SIZE) { + register VEC_TYPE a0 = VEC_ZERO; + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + } + STORE(fo + (i << ldf) + k, a0); + } + } +#endif + +#if VEC_SIZE > 1 + for (; i < rimap_inv_max[l]; i++) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k++) { + double a = 0.0; + for (j = 0; j < npairs; j++) + a += wpair[j] * ( fi[((index_pos[j] + i) << ldf) + k] - fi[((index_neg[j] + i) << ldf) + k]); + fo[(i << ldf) + k] = a; + } + } +#endif + + } /* l */ + + // this fence instruction is needed to ensure correctness when using + // non-temporal stores +#if defined(ALIGNED) && defined(FENCE) + FENCE; +#endif +} + +#undef LOAD +#undef STORE diff --git a/src/grid/operate_f.F90 b/src/grid/operate_f.F90 index f17b4b1a7d..51c467b6b2 100644 --- a/src/grid/operate_f.F90 +++ b/src/grid/operate_f.F90 @@ -61,6 +61,77 @@ module operate_f_oct_m end subroutine zoperate_ri_vec end interface + interface + subroutine doperate_ri_sym_vec(opn, w, wcenter, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo) + import :: real64 + implicit none + integer, intent(in) :: opn + real(real64), intent(in) :: w + real(real64), intent(in) :: wcenter + 
integer, intent(in) :: opnri + integer, intent(in) :: opri_pos + integer, intent(in) :: opri_neg + integer, intent(in) :: rimap_inv + integer, intent(in) :: rimap_inv_max + real(real64), intent(in) :: fi + integer, intent(in) :: ldfp + real(real64), intent(inout) :: fo + end subroutine doperate_ri_sym_vec + end interface + + interface + subroutine zoperate_ri_sym_vec(opn, w, wcenter, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo) + import :: real64 + implicit none + integer, intent(in) :: opn + real(real64), intent(in) :: w + real(real64), intent(in) :: wcenter + integer, intent(in) :: opnri + integer, intent(in) :: opri_pos + integer, intent(in) :: opri_neg + integer, intent(in) :: rimap_inv + integer, intent(in) :: rimap_inv_max + complex(real64), intent(in) :: fi + integer, intent(in) :: ldfp + complex(real64), intent(inout) :: fo + end subroutine zoperate_ri_sym_vec + end interface + + interface + subroutine doperate_ri_antisym_vec(opn, w, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo) + import :: real64 + implicit none + integer, intent(in) :: opn + real(real64), intent(in) :: w + integer, intent(in) :: opnri + integer, intent(in) :: opri_pos + integer, intent(in) :: opri_neg + integer, intent(in) :: rimap_inv + integer, intent(in) :: rimap_inv_max + real(real64), intent(in) :: fi + integer, intent(in) :: ldfp + real(real64), intent(inout) :: fo + end subroutine doperate_ri_antisym_vec + end interface + + interface + subroutine zoperate_ri_antisym_vec(opn, w, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo) + import :: real64 + implicit none + integer, intent(in) :: opn + real(real64), intent(in) :: w + integer, intent(in) :: opnri + integer, intent(in) :: opri_pos + integer, intent(in) :: opri_neg + integer, intent(in) :: rimap_inv + integer, intent(in) :: rimap_inv_max + complex(real64), intent(in) :: fi + integer, intent(in) :: ldfp + complex(real64), intent(inout) :: fo + end subroutine zoperate_ri_antisym_vec + end interface + + interface subroutine dgauss_seidel(opn, w, opnri, opri, rimap_inv, rimap_inv_max, factor, pot, rho) import :: real64 diff --git a/src/grid/operate_sym_inc.c b/src/grid/operate_sym_inc.c new file mode 100644 index 0000000000..bf89e9ba67 --- /dev/null +++ b/src/grid/operate_sym_inc.c @@ -0,0 +1,553 @@ +/* + Copyright (C) 2006 X. Andrade + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. 
+ +*/ + +#ifdef ALIGNED +#define LOAD VEC_LD +#define STORE VEC_ST +#else +#define LOAD VEC_LDU +#define STORE VEC_STU +#endif + +{ + const ptrdiff_t npairs = opn[0]; + const ptrdiff_t nri = opnri[0]; +#if DEPTH >= 32 + const ptrdiff_t unroll32 = max1(32 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 16 + const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 8 + const ptrdiff_t unroll8 = max1(8 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 4 + const ptrdiff_t unroll4 = max1(4 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 2 + const ptrdiff_t unroll2 = max1(2 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 1 + const ptrdiff_t unroll1 = max1(1 * VEC_SIZE >> ldf); +#endif + + ptrdiff_t l, i, j; + const int *restrict index_pos; + const int *restrict index_neg; + + for (l = 0; l < nri; l++) { + index_pos = opri_pos + npairs * l; + index_neg = opri_neg + npairs * l; + i = rimap_inv[l]; + +#if DEPTH >= 32 + for (; i < (rimap_inv_max[l] - unroll32 + 1); i += unroll32) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 32 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + register VEC_TYPE a4, a5, a6, a7; + register VEC_TYPE a8, a9, aa, ab; + register VEC_TYPE ac, ad, ae, af; + register VEC_TYPE b0, b1, b2, b3; + register VEC_TYPE b4, b5, b6, b7; + register VEC_TYPE b8, b9, ba, bb; + register VEC_TYPE bc, bd, be, bf; + + a0 = a1 = a2 = a3 = VEC_ZERO; + a4 = a5 = a6 = a7 = VEC_ZERO; + a8 = a9 = aa = ab = VEC_ZERO; + ac = ad = ae = af = VEC_ZERO; + b0 = b1 = b2 = b3 = VEC_ZERO; + b4 = b5 = b6 = b7 = VEC_ZERO; + b8 = b9 = ba = bb = VEC_ZERO; + bc = bd = be = bf = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7); + pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE + k); + a8 = VEC_FMA(wj, VEC_ADD(pos, neg), a8); + pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE + k); + a9 = VEC_FMA(wj, VEC_ADD(pos, neg), a9); + pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE + k); + aa = VEC_FMA(wj, VEC_ADD(pos, neg), aa); + pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE + k); + ab = 
VEC_FMA(wj, VEC_ADD(pos, neg), ab); + pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE + k); + ac = VEC_FMA(wj, VEC_ADD(pos, neg), ac); + pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE + k); + ad = VEC_FMA(wj, VEC_ADD(pos, neg), ad); + pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE + k); + ae = VEC_FMA(wj, VEC_ADD(pos, neg), ae); + pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE + k); + af = VEC_FMA(wj, VEC_ADD(pos, neg), af); + pos = LOAD(fi + indexj_pos + 16 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 16 * VEC_SIZE + k); + b0 = VEC_FMA(wj, VEC_ADD(pos, neg), b0); + pos = LOAD(fi + indexj_pos + 17 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 17 * VEC_SIZE + k); + b1 = VEC_FMA(wj, VEC_ADD(pos, neg), b1); + pos = LOAD(fi + indexj_pos + 18 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 18 * VEC_SIZE + k); + b2 = VEC_FMA(wj, VEC_ADD(pos, neg), b2); + pos = LOAD(fi + indexj_pos + 19 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 19 * VEC_SIZE + k); + b3 = VEC_FMA(wj, VEC_ADD(pos, neg), b3); + pos = LOAD(fi + indexj_pos + 20 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 20 * VEC_SIZE + k); + b4 = VEC_FMA(wj, VEC_ADD(pos, neg), b4); + pos = LOAD(fi + indexj_pos + 21 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 21 * VEC_SIZE + k); + b5 = VEC_FMA(wj, VEC_ADD(pos, neg), b5); + pos = LOAD(fi + indexj_pos + 22 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 22 * VEC_SIZE + k); + b6 = VEC_FMA(wj, VEC_ADD(pos, neg), b6); + pos = LOAD(fi + indexj_pos + 23 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 23 * VEC_SIZE + k); + b7 = VEC_FMA(wj, VEC_ADD(pos, neg), b7); + pos = LOAD(fi + indexj_pos + 24 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 24 * VEC_SIZE + k); + b8 = VEC_FMA(wj, VEC_ADD(pos, neg), b8); + pos = LOAD(fi + indexj_pos + 25 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 25 * VEC_SIZE + k); + b9 = VEC_FMA(wj, VEC_ADD(pos, neg), b9); + pos = LOAD(fi + indexj_pos + 26 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 26 * VEC_SIZE + k); + ba = VEC_FMA(wj, VEC_ADD(pos, neg), ba); + pos = LOAD(fi + indexj_pos + 27 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 27 * VEC_SIZE + k); + bb = VEC_FMA(wj, VEC_ADD(pos, neg), bb); + pos = LOAD(fi + indexj_pos + 28 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 28 * VEC_SIZE + k); + bc = VEC_FMA(wj, VEC_ADD(pos, neg), bc); + pos = LOAD(fi + indexj_pos + 29 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 29 * VEC_SIZE + k); + bd = VEC_FMA(wj, VEC_ADD(pos, neg), bd); + pos = LOAD(fi + indexj_pos + 30 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 30 * VEC_SIZE + k); + be = VEC_FMA(wj, VEC_ADD(pos, neg), be); + pos = LOAD(fi + indexj_pos + 31 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 31 * VEC_SIZE + k); + bf = VEC_FMA(wj, VEC_ADD(pos, neg), bf); + } + register VEC_TYPE wj = VEC_SCAL(wcenter[0]); + a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); + a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); + a2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 2 * VEC_SIZE + k), a2); + a3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 3 * VEC_SIZE + k), a3); + a4 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 4 * VEC_SIZE + k), a4); + a5 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 5 * VEC_SIZE + k), a5); + a6 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 6 * VEC_SIZE + k), a6); + a7 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 7 * VEC_SIZE + k), a7); + a8 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 8 * VEC_SIZE
+ k), a8); + a9 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 9 * VEC_SIZE + k), a9); + aa = VEC_FMA(wj, LOAD(fi + (i << ldf) + 10 * VEC_SIZE + k), aa); + ab = VEC_FMA(wj, LOAD(fi + (i << ldf) + 11 * VEC_SIZE + k), ab); + ac = VEC_FMA(wj, LOAD(fi + (i << ldf) + 12 * VEC_SIZE + k), ac); + ad = VEC_FMA(wj, LOAD(fi + (i << ldf) + 13 * VEC_SIZE + k), ad); + ae = VEC_FMA(wj, LOAD(fi + (i << ldf) + 14 * VEC_SIZE + k), ae); + af = VEC_FMA(wj, LOAD(fi + (i << ldf) + 15 * VEC_SIZE + k), af); + b0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 16 * VEC_SIZE + k), b0); + b1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 17 * VEC_SIZE + k), b1); + b2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 18 * VEC_SIZE + k), b2); + b3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 19 * VEC_SIZE + k), b3); + b4 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 20 * VEC_SIZE + k), b4); + b5 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 21 * VEC_SIZE + k), b5); + b6 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 22 * VEC_SIZE + k), b6); + b7 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 23 * VEC_SIZE + k), b7); + b8 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 24 * VEC_SIZE + k), b8); + b9 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 25 * VEC_SIZE + k), b9); + ba = VEC_FMA(wj, LOAD(fi + (i << ldf) + 26 * VEC_SIZE + k), ba); + bb = VEC_FMA(wj, LOAD(fi + (i << ldf) + 27 * VEC_SIZE + k), bb); + bc = VEC_FMA(wj, LOAD(fi + (i << ldf) + 28 * VEC_SIZE + k), bc); + bd = VEC_FMA(wj, LOAD(fi + (i << ldf) + 29 * VEC_SIZE + k), bd); + be = VEC_FMA(wj, LOAD(fi + (i << ldf) + 30 * VEC_SIZE + k), be); + bf = VEC_FMA(wj, LOAD(fi + (i << ldf) + 31 * VEC_SIZE + k), bf); + + + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); + STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); + STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); + STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); + STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); + STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); + STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9); + STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); + STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); + STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); + STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); + STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); + STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); + STORE(fo + (i << ldf) + 16 * VEC_SIZE + k, b0); + STORE(fo + (i << ldf) + 17 * VEC_SIZE + k, b1); + STORE(fo + (i << ldf) + 18 * VEC_SIZE + k, b2); + STORE(fo + (i << ldf) + 19 * VEC_SIZE + k, b3); + STORE(fo + (i << ldf) + 20 * VEC_SIZE + k, b4); + STORE(fo + (i << ldf) + 21 * VEC_SIZE + k, b5); + STORE(fo + (i << ldf) + 22 * VEC_SIZE + k, b6); + STORE(fo + (i << ldf) + 23 * VEC_SIZE + k, b7); + STORE(fo + (i << ldf) + 24 * VEC_SIZE + k, b8); + STORE(fo + (i << ldf) + 25 * VEC_SIZE + k, b9); + STORE(fo + (i << ldf) + 26 * VEC_SIZE + k, ba); + STORE(fo + (i << ldf) + 27 * VEC_SIZE + k, bb); + STORE(fo + (i << ldf) + 28 * VEC_SIZE + k, bc); + STORE(fo + (i << ldf) + 29 * VEC_SIZE + k, bd); + STORE(fo + (i << ldf) + 30 * VEC_SIZE + k, be); + STORE(fo + (i << ldf) + 31 * VEC_SIZE + k, bf); + } + } +#endif + +#if DEPTH >= 16 + for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 16 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + register VEC_TYPE a4, a5, a6, a7; + register VEC_TYPE a8, a9, aa, ab; + register VEC_TYPE ac, ad, ae, af; + + a0 = a1 = a2 = a3 = VEC_ZERO; + a4 = a5 = a6 = a7 = VEC_ZERO; + a8 = a9 = aa = ab = VEC_ZERO; + ac = ad = ae 
= af = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7); + pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE + k); + a8 = VEC_FMA(wj, VEC_ADD(pos, neg), a8); + pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE + k); + a9 = VEC_FMA(wj, VEC_ADD(pos, neg), a9); + pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE + k); + aa = VEC_FMA(wj, VEC_ADD(pos, neg), aa); + pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE + k); + ab = VEC_FMA(wj, VEC_ADD(pos, neg), ab); + pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE + k); + ac = VEC_FMA(wj, VEC_ADD(pos, neg), ac); + pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE + k); + ad = VEC_FMA(wj, VEC_ADD(pos, neg), ad); + pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE + k); + ae = VEC_FMA(wj, VEC_ADD(pos, neg), ae); + pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE + k); + af = VEC_FMA(wj, VEC_ADD(pos, neg), af); + + } + register VEC_TYPE wj = VEC_SCAL(wcenter[0]); + a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); + a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); + a2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 2 * VEC_SIZE + k), a2); + a3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 3 * VEC_SIZE + k), a3); + a4 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 4 * VEC_SIZE + k), a4); + a5 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 5 * VEC_SIZE + k), a5); + a6 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 6 * VEC_SIZE + k), a6); + a7 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 7 * VEC_SIZE + k), a7); + a8 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 8 * VEC_SIZE + k), a8); + a9 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 9 * VEC_SIZE + k), a9); + aa = VEC_FMA(wj, LOAD(fi + (i << ldf) + 10 * VEC_SIZE + k), aa); + ab = VEC_FMA(wj, LOAD(fi + (i << ldf) + 11 * VEC_SIZE + k), ab); + ac = VEC_FMA(wj, LOAD(fi + (i << ldf) + 12 * VEC_SIZE + k), ac); + ad = VEC_FMA(wj, LOAD(fi + (i << ldf) + 13 * VEC_SIZE + k), 
ad); + ae = VEC_FMA(wj, LOAD(fi + (i << ldf) + 14 * VEC_SIZE + k), ae); + af = VEC_FMA(wj, LOAD(fi + (i << ldf) + 15 * VEC_SIZE + k), af); + + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); + STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); + STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); + STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); + STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); + STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); + STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9); + STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); + STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); + STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); + STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); + STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); + STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); + } + } +#endif + +#if DEPTH >= 8 + for (; i < (rimap_inv_max[l] - unroll8 + 1); i += unroll8) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 8 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + register VEC_TYPE a4, a5, a6, a7; + + a0 = a1 = a2 = a3 = VEC_ZERO; + a4 = a5 = a6 = a7 = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7); + } + register VEC_TYPE wj = VEC_SCAL(wcenter[0]); + a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); + a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); + a2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 2 * VEC_SIZE + k), a2); + a3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 3 * VEC_SIZE + k), a3); + a4 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 4 * VEC_SIZE + k), a4); + a5 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 5 * VEC_SIZE + k), a5); + a6 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 6 * VEC_SIZE + k), a6); + a7 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 7 * VEC_SIZE + k), a7); + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); + STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); + STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); + STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); + STORE(fo + (i << ldf) + 7 
* VEC_SIZE + k, a7); + } + } +#endif + +#if DEPTH >= 4 + for (; i < (rimap_inv_max[l] - unroll4 + 1); i += unroll4) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 4 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + + a0 = a1 = a2 = a3 = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); + } + register VEC_TYPE wj = VEC_SCAL(wcenter[0]); + a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); + a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); + a2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 2 * VEC_SIZE + k), a2); + a3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 3 * VEC_SIZE + k), a3); + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); + STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + } + } +#endif + +#if DEPTH >= 2 + for (; i < (rimap_inv_max[l] - unroll2 + 1); i += unroll2) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 2 * VEC_SIZE) { + register VEC_TYPE a0, a1; + + a0 = a1 = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); + } + register VEC_TYPE wj = VEC_SCAL(wcenter[0]); + a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); + a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + } + } +#endif + +#if DEPTH >= 1 + for (; i < (rimap_inv_max[l] - unroll1 + 1); i += unroll1) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += VEC_SIZE) { + register VEC_TYPE a0 = VEC_ZERO; + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + } + register VEC_TYPE wj = VEC_SCAL(wcenter[0]); + a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); + STORE(fo + (i << ldf) + k, a0); + } + } +#endif + +#if VEC_SIZE > 1 + for (; i < rimap_inv_max[l]; i++) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k++) { + double a = 0.0; + for (j = 0; j < npairs; j++) + a += wpair[j] * ( 
fi[((index_pos[j] + i) << ldf) + k] + fi[((index_neg[j] + i) << ldf) + k]); + a += wcenter[0] * fi[(i << ldf) + k]; + fo[(i << ldf) + k] = a; + } + } +#endif + + } /* l */ + + // this fence instruction is needed to ensure correctness when using + // non-temporal stores +#if defined(ALIGNED) && defined(FENCE) + FENCE; +#endif +} + +#undef LOAD +#undef STORE diff --git a/src/include/vectors.h b/src/include/vectors.h index 4b85e9a7c6..3c15558890 100644 --- a/src/include/vectors.h +++ b/src/include/vectors.h @@ -41,6 +41,8 @@ #define VEC_STU(addr, vec) _mm512_storeu_pd(addr, vec) #define VEC_FMA(aa, bb, cc) _mm512_fmadd_pd(aa, bb, cc) #define VEC_SCAL(aa) _mm512_set1_pd(aa) +#define VEC_ADD(aa, bb) _mm512_add_pd(aa, bb) +#define VEC_SUB(aa, bb) _mm512_sub_pd(aa, bb) #define VEC_ZERO _mm512_setzero_pd() #include #define FENCE _mm_mfence() @@ -73,6 +75,8 @@ #define VEC_FMA(aa, bb, cc) _mm256_add_pd(cc, _mm256_mul_pd(aa, bb)) #endif #define VEC_SCAL(aa) _mm256_set1_pd(aa) +#define VEC_ADD(aa, bb) _mm256_add_pd(aa, bb) +#define VEC_SUB(aa, bb) _mm256_sub_pd(aa, bb) #define VEC_ZERO _mm256_setzero_pd() #include #define FENCE _mm_mfence() @@ -101,6 +105,8 @@ #define VEC_FMA(aa, bb, cc) _mm_add_pd(cc, _mm_mul_pd(aa, bb)) #endif #define VEC_SCAL(aa) _mm_set1_pd(aa) +#define VEC_ADD(aa, bb) _mm_add_pd(aa, bb) +#define VEC_SUB(aa, bb) _mm_sub_pd(aa, bb) #define VEC_ZERO _mm_setzero_pd() #define FENCE _mm_mfence() @@ -120,6 +126,8 @@ #define VEC_STU(addr, vec) VEC_ST(addr, vec) #define VEC_FMA(aa, bb, cc) aa *bb + cc #define VEC_SCAL(aa) aa +#define VEC_ADD(aa, bb) ((aa) + (bb)) +#define VEC_SUB(aa, bb) ((aa) - (bb)) #define VEC_ZERO 0.0 #define DEPTH 8 -- GitLab From d79027e96c97eeb698b966e3afcbb1c5c786b417 Mon Sep 17 00:00:00 2001 From: NicolasTD Date: Fri, 12 Dec 2025 21:24:43 +0100 Subject: [PATCH 03/11] Store the pairs of points to avoid multiple recomputations. --- src/grid/derivatives.F90 | 3 +++ src/grid/nl_operator.F90 | 50 +++++++++++++++++++++++++++++++++++- src/grid/nl_operator_inc.F90 | 45 +++++++++++++++++--------------- 3 files changed, 76 insertions(+), 22 deletions(-) diff --git a/src/grid/derivatives.F90 b/src/grid/derivatives.F90 index 3392f2425b..5d3ff64ee4 100644 --- a/src/grid/derivatives.F90 +++ b/src/grid/derivatives.F90 @@ -893,6 +893,9 @@ contains if (der%remove_zero_weight_points) then call nl_operator_remove_zero_weight_points(op(i), space, der%mesh) end if + if (op(i)%const_w) then + call nl_operator_build_symmetric_weights(op(i)) + end if call nl_operator_output_weights(op(i)) end do diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index 3acbed8422..3a7c6a4fee 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -67,7 +67,8 @@ module nl_operator_oct_m nl_operator_np_zero_bc, & nl_operator_allocate_gpu_buffers, & nl_operator_update_gpu_buffers, & - nl_operator_remove_zero_weight_points + nl_operator_remove_zero_weight_points, & + nl_operator_build_symmetric_weights !> @brief index type for non-local operators !!
@@ -79,6 +80,8 @@ module nl_operator_oct_m integer, allocatable :: imin(:) integer, allocatable :: imax(:) integer, allocatable :: ri(:, :) + integer, allocatable :: ri_pos(:,:) + integer, allocatable :: ri_neg(:,:) end type nl_operator_index_t integer, public, parameter :: & @@ -111,6 +114,13 @@ module nl_operator_oct_m integer, allocatable, public :: rimap(:) integer, allocatable, public :: rimap_inv(:) + !> For symmetry-based stencils + integer :: npairs = 0 + real(real64),allocatable :: wpair(:) + real(real64) :: wcenter + integer, allocatable :: ri_pos(:,:) + integer, allocatable :: ri_neg(:,:) + integer :: ninner = 0 integer :: nouter = 0 @@ -297,6 +307,14 @@ contains SAFE_ALLOCATE_SOURCE_A(opo%rimap, opi%rimap) SAFE_ALLOCATE_SOURCE_A(opo%rimap_inv, opi%rimap_inv) + opo%npairs = opi%npairs + if (opi%symmetry /= OP_GENERAL) then + opo%wcenter = opi%wcenter + SAFE_ALLOCATE_SOURCE_A(opo%wpair, opi%wpair) + SAFE_ALLOCATE_SOURCE_A(opo%ri_pos, opi%ri_pos) + SAFE_ALLOCATE_SOURCE_A(opo%ri_neg, opi%ri_neg) + end if + if (opi%mesh%parallel_in_domains) then opo%inner%nri = opi%inner%nri SAFE_ALLOCATE_SOURCE_A(opo%inner%imin, opi%inner%imin) @@ -650,6 +668,10 @@ contains SAFE_DEALLOCATE_A(op%rimap_inv) SAFE_DEALLOCATE_A(op%nn) + SAFE_DEALLOCATE_A(op%wpair) + SAFE_DEALLOCATE_A(op%ri_pos) + SAFE_DEALLOCATE_A(op%ri_neg) + call stencil_end(op%stencil) POP_SUB(nl_operator_end) @@ -939,6 +961,32 @@ subroutine group_by_pairs_antisym(size, offsets, wre, ri, nri, npairs, wpair, pa POP_SUB(group_by_pairs_antisym) end subroutine group_by_pairs_antisym +subroutine nl_operator_build_symmetric_weights(op) + type(nl_operator_t), intent(inout) :: op + + PUSH_SUB(nl_operator_build_symmetric_weights) + + ASSERT(op%const_w) + + select case(op%symmetry) + case(OP_SYMMETRIC) + call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%ri, op%nri, & + op%npairs, op%wpair, op%ri_pos, op%ri_neg, op%wcenter) + call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & + op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg, op%wcenter) + call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & + op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg, op%wcenter) + case(OP_ANTISYMMETRIC) + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%ri, op%nri, & + op%npairs, op%wpair, op%ri_pos, op%ri_neg) + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & + op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg) + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & + op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg) + end select + + POP_SUB(nl_operator_build_symmetric_weights) +end subroutine nl_operator_build_symmetric_weights #include "undef.F90" #include "real.F90" diff --git a/src/grid/nl_operator_inc.F90 b/src/grid/nl_operator_inc.F90 index a5d3eb1fae..e7c885e33b 100644 --- a/src/grid/nl_operator_inc.F90 +++ b/src/grid/nl_operator_inc.F90 @@ -41,9 +41,8 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point #endif integer :: nri_loc, ini !> Symmetry of the stencil can be used to group points by pairs - integer :: npairs real(real64), allocatable :: wpair(:) - integer, allocatable :: ri_pos(:,:), ri_neg(:,:) + integer, pointer :: ri_pos(:,:), ri_neg(:,:) real(real64) :: wcenter PUSH_SUB(X(nl_operator_operate_batch)) @@ -79,6 +78,15 @@ subroutine 
X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point if (present(factor)) then wre(1:op%stencil%size) = wre(1:op%stencil%size)*factor end if + + if (op%symmetry /= OP_GENERAL) then + wpair = op%wpair + wcenter = op%wcenter + if (present(factor)) then + wpair = wpair * factor + wcenter = wcenter * factor + end if + end if end if use_accel = .false. @@ -92,10 +100,8 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point else if (X(function_global) == OP_FORTRAN) then select case(op%symmetry) case(OP_ANTISYMMETRIC) - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg) call operate_const_weights_antisymmetric() case(OP_SYMMETRIC) - call group_by_pairs_sym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg, wcenter) call operate_const_weights_symmetric() case(OP_GENERAL) call operate_const_weights() @@ -104,13 +110,6 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point end select else - select case(op%symmetry) - case(OP_SYMMETRIC) - call group_by_pairs_sym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg, wcenter) - case(OP_ANTISYMMETRIC) - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg) - end select - ! for the moment this is not implemented !$omp parallel private(ini, nri_loc, ist) call multicomm_divide_range_omp(nri, ini, nri_loc) @@ -125,10 +124,10 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), & fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) case(OP_SYMMETRIC) - call X(operate_ri_sym_vec)(npairs, wpair(1), wcenter, nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + call X(operate_ri_sym_vec)(op%npairs, wpair(1), wcenter, nri_loc, ri_pos(1, ini), ri_neg(1, ini), & imin(ini), imax(ini), fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) case(OP_ANTISYMMETRIC) - call X(operate_ri_antisym_vec)(npairs, wpair(1), nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + call X(operate_ri_antisym_vec)(op%npairs, wpair(1), nri_loc, ri_pos(1, ini), ri_neg(1, ini), & imin(ini), imax(ini), fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) case default ASSERT(.false.) @@ -144,10 +143,10 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), & fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) case(OP_SYMMETRIC) - call X(operate_ri_sym_vec)(npairs, wpair(1), wcenter, nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + call X(operate_ri_sym_vec)(op%npairs, wpair(1), wcenter, nri_loc, ri_pos(1, ini), ri_neg(1, ini), & imin(ini), imax(ini), fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) case(OP_ANTISYMMETRIC) - call X(operate_ri_antisym_vec)(npairs, wpair(1), nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + call X(operate_ri_antisym_vec)(op%npairs, wpair(1), nri_loc, ri_pos(1, ini), ri_neg(1, ini), & imin(ini), imax(ini), fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) case default ASSERT(.false.) 
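[For readers following the series: what group_by_pairs_sym precomputes and X(operate_ri_sym_vec) consumes is the regrouped stencil sum fo(i) = wcenter*fi(i) + sum_p wpair(p)*(fi(i+pos_p) + fi(i+neg_p)); in the antisymmetric (gradient-like) case the center term vanishes and the inner '+' becomes '-'. Below is a minimal scalar C sketch of this paired evaluation; all names are illustrative, boundary/ghost-point handling is omitted, and this is not the actual Octopus interface.]

#include <stddef.h>

/* Scalar sketch of the paired stencil application (illustrative only).
   For a symmetric operator the +s and -s stencil offsets share one
   weight, so a single multiplication serves both points of the pair;
   the center point is added once with its own weight. */
static void stencil_sym_scalar(ptrdiff_t np, ptrdiff_t npairs, double wcenter,
                               const double *wpair, const ptrdiff_t *pos,
                               const ptrdiff_t *neg, const double *fi,
                               double *fo) {
  for (ptrdiff_t i = 0; i < np; i++) {
    double acc = 0.0;
    for (ptrdiff_t p = 0; p < npairs; p++) {
      acc += wpair[p] * (fi[i + pos[p]] + fi[i + neg[p]]); /* paired term */
    }
    fo[i] = acc + wcenter * fi[i]; /* center contribution added once */
  }
}

[Besides halving the number of weight multiplications, this fixes the order in which the pair contributions enter the accumulator, which is the summation order the unrolled sym/antisym kernels above reproduce in SIMD registers.]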
@@ -166,8 +165,6 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point SAFE_DEALLOCATE_A(wre) SAFE_DEALLOCATE_A(wpair) - SAFE_DEALLOCATE_A(ri_pos) - SAFE_DEALLOCATE_A(ri_neg) if (profile_) call profiling_out(TOSTRING(X(NL_OPERATOR_BATCH))) POP_SUB(X(nl_operator_operate_batch)) @@ -185,16 +182,22 @@ contains imin => op%rimap_inv(1:) imax => op%rimap_inv(2:) ri => op%ri + ri_pos => op%ri_pos + ri_neg => op%ri_neg case (OP_INNER) nri = op%inner%nri imin => op%inner%imin imax => op%inner%imax ri => op%inner%ri + ri_pos => op%inner%ri_pos + ri_neg => op%inner%ri_neg case (OP_OUTER) nri = op%outer%nri imin => op%outer%imin imax => op%outer%imax ri => op%outer%ri + ri_pos => op%outer%ri_pos + ri_neg => op%outer%ri_neg case default ASSERT(.false.) end select @@ -270,7 +273,7 @@ contains do ii = imin(ll) + 1, imax(ll) do ist = 1, fi%nst_linear fo%X(ff_linear)(ii, ist) = M_ZERO - do ipair = 1, npairs + do ipair = 1, op%npairs fo%X(ff_linear)(ii, ist) = fo%X(ff_linear)(ii, ist) + & wpair(ipair)*(fi%X(ff_linear)(ii + ri_pos(ipair, ll), ist) - fi%X(ff_linear)(ii + ri_neg(ipair, ll), ist)) end do @@ -286,7 +289,7 @@ contains do ii = imin(ll) + 1, imax(ll) do ist = 1, fi%nst_linear fo%X(ff_pack)(ist, ii) = M_ZERO - do ipair = 1, npairs + do ipair = 1, op%npairs fo%X(ff_pack)(ist, ii) = fo%X(ff_pack)(ist, ii) & + wpair(ipair) * (fi%X(ff_pack)(ist, ii + ri_pos(ipair, ll)) - fi%X(ff_pack)(ist, ii + ri_neg(ipair, ll))) end do @@ -321,7 +324,7 @@ contains do ii = imin(ll) + 1, imax(ll) do ist = 1, fi%nst_linear fo%X(ff_linear)(ii, ist) = M_ZERO - do ipair = 1, npairs + do ipair = 1, op%npairs fo%X(ff_linear)(ii, ist) = fo%X(ff_linear)(ii, ist) + & wpair(ipair)*(fi%X(ff_linear)(ii + ri_pos(ipair, ll), ist) + fi%X(ff_linear)(ii + ri_neg(ipair, ll), ist)) end do @@ -338,7 +341,7 @@ contains do ii = imin(ll) + 1, imax(ll) do ist = 1, fi%nst_linear fo%X(ff_pack)(ist, ii) = M_ZERO - do ipair = 1, npairs + do ipair = 1, op%npairs fo%X(ff_pack)(ist, ii) = fo%X(ff_pack)(ist, ii) & + wpair(ipair) * (fi%X(ff_pack)(ist, ii + ri_pos(ipair, ll)) + fi%X(ff_pack)(ist, ii + ri_neg(ipair, ll))) end do -- GitLab From 19a1438d116cea98fe7ae000be4d87dcd0a9a2e5 Mon Sep 17 00:00:00 2001 From: NicolasTD Date: Sat, 13 Dec 2025 08:38:47 +0100 Subject: [PATCH 04/11] Optimization of the symmetric operator by reducing the integer operations. --- src/grid/nl_operator.F90 | 31 +- src/grid/nl_operator_inc.F90 | 2 +- src/grid/operate_antisym_inc.c | 1 + src/grid/operate_sym_inc.c | 568 +++++++++++++++++---------------- src/include/vectors.h | 6 +- 5 files changed, 320 insertions(+), 288 deletions(-) diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index 3a7c6a4fee..faeadd9a3b 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -325,6 +325,13 @@ contains SAFE_ALLOCATE_SOURCE_A(opo%outer%imin, opi%outer%imin) SAFE_ALLOCATE_SOURCE_A(opo%outer%imax, opi%outer%imax) SAFE_ALLOCATE_SOURCE_A(opo%outer%ri, opi%outer%ri) + + if (opi%symmetry /= OP_GENERAL) then + SAFE_ALLOCATE_SOURCE_A(opo%inner%ri_pos, opi%inner%ri_pos) + SAFE_ALLOCATE_SOURCE_A(opo%inner%ri_neg, opi%inner%ri_neg) + SAFE_ALLOCATE_SOURCE_A(opo%outer%ri_pos, opi%outer%ri_pos) + SAFE_ALLOCATE_SOURCE_A(opo%outer%ri_neg, opi%outer%ri_neg) + end if end if ! 
We create the corresponding GPU buffers @@ -672,6 +679,10 @@ contains SAFE_DEALLOCATE_A(op%ri_pos) SAFE_DEALLOCATE_A(op%ri_neg) + SAFE_DEALLOCATE_A(op%inner%ri_pos) + SAFE_DEALLOCATE_A(op%inner%ri_neg) + SAFE_DEALLOCATE_A(op%outer%ri_pos) + SAFE_DEALLOCATE_A(op%outer%ri_neg) call stencil_end(op%stencil) POP_SUB(nl_operator_end) @@ -972,17 +983,21 @@ subroutine nl_operator_build_symmetric_weights(op) case(OP_SYMMETRIC) call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%ri, op%nri, & op%npairs, op%wpair, op%ri_pos, op%ri_neg, op%wcenter) - call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & - op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg, op%wcenter) - call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & - op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg, op%wcenter) + if (op%mesh%parallel_in_domains) then + call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & + op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg, op%wcenter) + call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & + op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg, op%wcenter) + end if case(OP_ANTISYMMETRIC) call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%ri, op%nri, & op%npairs, op%wpair, op%ri_pos, op%ri_neg) - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & - op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg) - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & - op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg) + if (op%mesh%parallel_in_domains) then + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & + op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg) + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & + op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg) + end if end select POP_SUB(nl_operator_build_symmetric_weights) diff --git a/src/grid/nl_operator_inc.F90 b/src/grid/nl_operator_inc.F90 index e7c885e33b..ce87ec9be0 100644 --- a/src/grid/nl_operator_inc.F90 +++ b/src/grid/nl_operator_inc.F90 @@ -80,7 +80,7 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point end if if (op%symmetry /= OP_GENERAL) then - wpair = op%wpair + SAFE_ALLOCATE_SOURCE_A(wpair, op%wpair) wcenter = op%wcenter if (present(factor)) then wpair = wpair * factor diff --git a/src/grid/operate_antisym_inc.c b/src/grid/operate_antisym_inc.c index 3e14fc74a4..da1da0907d 100644 --- a/src/grid/operate_antisym_inc.c +++ b/src/grid/operate_antisym_inc.c @@ -1,5 +1,6 @@ /* Copyright (C) 2006 X. Andrade + Copyright (C) 2025 N. Tancogne-Dejean This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/grid/operate_sym_inc.c b/src/grid/operate_sym_inc.c index bf89e9ba67..57a22c51d4 100644 --- a/src/grid/operate_sym_inc.c +++ b/src/grid/operate_sym_inc.c @@ -1,5 +1,6 @@ /* Copyright (C) 2006 X. Andrade + Copyright (C) 2025 N. 
Tancogne-Dejean This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -52,6 +53,9 @@ const int *restrict index_pos; const int *restrict index_neg; + // Hoisting wcenter + const double w0 = wcenter[0]; + for (l = 0; l < nri; l++) { index_pos = opri_pos + npairs * l; index_neg = opri_neg + npairs * l; @@ -79,178 +83,180 @@ b8 = b9 = ba = bb = VEC_ZERO; bc = bd = be = bf = VEC_ZERO; + const ptrdiff_t base_ik = ((ptrdiff_t) i << ldf) + k; + for (j = 0; j < npairs; j++) { #ifdef VEC_SCAL_LD register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; + ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); - pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); - pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); - pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE); a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4); - pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE); a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5); - pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE); a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6); - pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE); a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7); - pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE); a8 = VEC_FMA(wj, VEC_ADD(pos, neg), a8); - pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE); a9 = VEC_FMA(wj, VEC_ADD(pos, neg), a9); - pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE); aa = VEC_FMA(wj, VEC_ADD(pos, neg), aa); - pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE + k); - neg = LOAD(fi +
indexj_neg + 11 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE); ab = VEC_FMA(wj, VEC_ADD(pos, neg), ab); - pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE); ac = VEC_FMA(wj, VEC_ADD(pos, neg), ac); - pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE); ad = VEC_FMA(wj, VEC_ADD(pos, neg), ad); - pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE); ae = VEC_FMA(wj, VEC_ADD(pos, neg), ae); - pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE); af = VEC_FMA(wj, VEC_ADD(pos, neg), af); - pos = LOAD(fi + indexj_pos + 16 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 16 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 16 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 16 * VEC_SIZE); b0 = VEC_FMA(wj, VEC_ADD(pos, neg), b0); - pos = LOAD(fi + indexj_pos + 17 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 17 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 17 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 17 * VEC_SIZE); b1 = VEC_FMA(wj, VEC_ADD(pos, neg), b1); - pos = LOAD(fi + indexj_pos + 18 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 18 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 18 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 18 * VEC_SIZE); b2 = VEC_FMA(wj, VEC_ADD(pos, neg), b2); - pos = LOAD(fi + indexj_pos + 19 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 19 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 19 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 19 * VEC_SIZE); b3 = VEC_FMA(wj, VEC_ADD(pos, neg), b3); - pos = LOAD(fi + indexj_pos + 20 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 20 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 20 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 20 * VEC_SIZE); b4 = VEC_FMA(wj, VEC_ADD(pos, neg), b4); - pos = LOAD(fi + indexj_pos + 21 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 21 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 21 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 21 * VEC_SIZE); b5 = VEC_FMA(wj, VEC_ADD(pos, neg), b5); - pos = LOAD(fi + indexj_pos + 22 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 22 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 22 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 22 * VEC_SIZE); b6 = VEC_FMA(wj, VEC_ADD(pos, neg), b6); - pos = LOAD(fi + indexj_pos + 23 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 23 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 23 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 23 * VEC_SIZE); b7 = VEC_FMA(wj, VEC_ADD(pos, neg), b7); - pos = LOAD(fi + indexj_pos + 24 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 24 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 24 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 24 * VEC_SIZE); b8 = VEC_FMA(wj, VEC_ADD(pos, neg), b8); - pos = LOAD(fi + indexj_pos + 25 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 25 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 25 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 25 * VEC_SIZE); b9 = VEC_FMA(wj, VEC_ADD(pos, neg), b9); - pos = LOAD(fi + indexj_pos + 26 * VEC_SIZE + k); - neg = LOAD(fi + 
indexj_neg + 26 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 26 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 26 * VEC_SIZE); ba = VEC_FMA(wj, VEC_ADD(pos, neg), ba); - pos = LOAD(fi + indexj_pos + 27 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 27 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 27 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 27 * VEC_SIZE); bb = VEC_FMA(wj, VEC_ADD(pos, neg), bb); - pos = LOAD(fi + indexj_pos + 28 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 28 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 28 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 28 * VEC_SIZE); bc = VEC_FMA(wj, VEC_ADD(pos, neg), bc); - pos = LOAD(fi + indexj_pos + 29 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 29 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 29 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 29 * VEC_SIZE); bd = VEC_FMA(wj, VEC_ADD(pos, neg), bd); - pos = LOAD(fi + indexj_pos + 30 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 30 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 30 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 30 * VEC_SIZE); be = VEC_FMA(wj, VEC_ADD(pos, neg), be); - pos = LOAD(fi + indexj_pos + 31 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 31 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 31 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 31 * VEC_SIZE); be = VEC_FMA(wj, VEC_ADD(pos, neg), be); } - register VEC_TYPE wj = VEC_SCAL(wcenter[0]); - a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); - a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); - a2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 2 * VEC_SIZE + k), a2); - a3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 3 * VEC_SIZE + k), a3); - a4 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 4 * VEC_SIZE + k), a4); - a5 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 5 * VEC_SIZE + k), a5); - a6 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 6 * VEC_SIZE + k), a6); - a7 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 7 * VEC_SIZE + k), a7); - a8 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 8 * VEC_SIZE + k), a8); - a9 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 9 * VEC_SIZE + k), a9); - aa = VEC_FMA(wj, LOAD(fi + (i << ldf) + 10 * VEC_SIZE + k), aa); - ab = VEC_FMA(wj, LOAD(fi + (i << ldf) + 11 * VEC_SIZE + k), ab); - ac = VEC_FMA(wj, LOAD(fi + (i << ldf) + 12 * VEC_SIZE + k), ac); - ad = VEC_FMA(wj, LOAD(fi + (i << ldf) + 13 * VEC_SIZE + k), ad); - ae = VEC_FMA(wj, LOAD(fi + (i << ldf) + 14 * VEC_SIZE + k), ae); - af = VEC_FMA(wj, LOAD(fi + (i << ldf) + 15 * VEC_SIZE + k), af); - b0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 16 * VEC_SIZE + k), b0); - b1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 17 * VEC_SIZE + k), b1); - b2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 18 * VEC_SIZE + k), b2); - b3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 19 * VEC_SIZE + k), b3); - b4 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 20 * VEC_SIZE + k), b4); - b5 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 21 * VEC_SIZE + k), b5); - b6 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 22 * VEC_SIZE + k), b6); - b7 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 23 * VEC_SIZE + k), b7); - b8 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 24 * VEC_SIZE + k), b8); - b9 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 25 * VEC_SIZE + k), b9); - ba = VEC_FMA(wj, LOAD(fi + (i << ldf) + 26 * VEC_SIZE + k), ba); - bb = VEC_FMA(wj, LOAD(fi + (i << ldf) + 27 * VEC_SIZE + k), bb); - bc = VEC_FMA(wj, LOAD(fi + (i << ldf) + 28 * VEC_SIZE + k), bc); - bd = VEC_FMA(wj, LOAD(fi + (i << ldf) + 29 * VEC_SIZE + k), bd); - be = VEC_FMA(wj, LOAD(fi + (i << ldf) + 30 * VEC_SIZE + k), be); - bf = VEC_FMA(wj, LOAD(fi + (i << ldf) + 31 * VEC_SIZE + k), bf); - - - STORE(fo + (i << ldf) + k, 
a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); - STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); - STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); - STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); - STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); - STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); - STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9); - STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); - STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); - STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); - STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); - STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); - STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); - STORE(fo + (i << ldf) + 16 * VEC_SIZE + k, b0); - STORE(fo + (i << ldf) + 17 * VEC_SIZE + k, b1); - STORE(fo + (i << ldf) + 18 * VEC_SIZE + k, b2); - STORE(fo + (i << ldf) + 19 * VEC_SIZE + k, b3); - STORE(fo + (i << ldf) + 20 * VEC_SIZE + k, b4); - STORE(fo + (i << ldf) + 21 * VEC_SIZE + k, b5); - STORE(fo + (i << ldf) + 22 * VEC_SIZE + k, b6); - STORE(fo + (i << ldf) + 23 * VEC_SIZE + k, b7); - STORE(fo + (i << ldf) + 24 * VEC_SIZE + k, b8); - STORE(fo + (i << ldf) + 25 * VEC_SIZE + k, b9); - STORE(fo + (i << ldf) + 26 * VEC_SIZE + k, ba); - STORE(fo + (i << ldf) + 27 * VEC_SIZE + k, bb); - STORE(fo + (i << ldf) + 28 * VEC_SIZE + k, bc); - STORE(fo + (i << ldf) + 29 * VEC_SIZE + k, bd); - STORE(fo + (i << ldf) + 30 * VEC_SIZE + k, be); - STORE(fo + (i << ldf) + 31 * VEC_SIZE + k, bf); + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1); + a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2); + a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3); + a4 = VEC_FMA(wj, LOAD(fi + base_ik + 4 * VEC_SIZE), a4); + a5 = VEC_FMA(wj, LOAD(fi + base_ik + 5 * VEC_SIZE), a5); + a6 = VEC_FMA(wj, LOAD(fi + base_ik + 6 * VEC_SIZE), a6); + a7 = VEC_FMA(wj, LOAD(fi + base_ik + 7 * VEC_SIZE), a7); + a8 = VEC_FMA(wj, LOAD(fi + base_ik + 8 * VEC_SIZE), a8); + a9 = VEC_FMA(wj, LOAD(fi + base_ik + 9 * VEC_SIZE), a9); + aa = VEC_FMA(wj, LOAD(fi + base_ik + 10 * VEC_SIZE), aa); + ab = VEC_FMA(wj, LOAD(fi + base_ik + 11 * VEC_SIZE), ab); + ac = VEC_FMA(wj, LOAD(fi + base_ik + 12 * VEC_SIZE), ac); + ad = VEC_FMA(wj, LOAD(fi + base_ik + 13 * VEC_SIZE), ad); + ae = VEC_FMA(wj, LOAD(fi + base_ik + 14 * VEC_SIZE), ae); + af = VEC_FMA(wj, LOAD(fi + base_ik + 15 * VEC_SIZE), af); + b0 = VEC_FMA(wj, LOAD(fi + base_ik + 16 * VEC_SIZE), b0); + b1 = VEC_FMA(wj, LOAD(fi + base_ik + 17 * VEC_SIZE), b1); + b2 = VEC_FMA(wj, LOAD(fi + base_ik + 18 * VEC_SIZE), b2); + b3 = VEC_FMA(wj, LOAD(fi + base_ik + 19 * VEC_SIZE), b3); + b4 = VEC_FMA(wj, LOAD(fi + base_ik + 20 * VEC_SIZE), b4); + b5 = VEC_FMA(wj, LOAD(fi + base_ik + 21 * VEC_SIZE), b5); + b6 = VEC_FMA(wj, LOAD(fi + base_ik + 22 * VEC_SIZE), b6); + b7 = VEC_FMA(wj, LOAD(fi + base_ik + 23 * VEC_SIZE), b7); + b8 = VEC_FMA(wj, LOAD(fi + base_ik + 24 * VEC_SIZE), b8); + b9 = VEC_FMA(wj, LOAD(fi + base_ik + 25 * VEC_SIZE), b9); + ba = VEC_FMA(wj, LOAD(fi + base_ik + 26 * VEC_SIZE), ba); + bb = VEC_FMA(wj, LOAD(fi + base_ik + 27 * VEC_SIZE), bb); + bc = VEC_FMA(wj, LOAD(fi + base_ik + 28 * VEC_SIZE), bc); + bd = VEC_FMA(wj, LOAD(fi + base_ik + 29 * VEC_SIZE), bd); + be = VEC_FMA(wj, LOAD(fi + base_ik + 30 * VEC_SIZE), be); + bf = VEC_FMA(wj, LOAD(fi + base_ik + 31 * VEC_SIZE), bf); + + + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + 
STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); + STORE(fo + base_ik + 4 * VEC_SIZE, a4); + STORE(fo + base_ik + 5 * VEC_SIZE, a5); + STORE(fo + base_ik + 6 * VEC_SIZE, a6); + STORE(fo + base_ik + 7 * VEC_SIZE, a7); + STORE(fo + base_ik + 8 * VEC_SIZE, a8); + STORE(fo + base_ik + 9 * VEC_SIZE, a9); + STORE(fo + base_ik + 10 * VEC_SIZE, aa); + STORE(fo + base_ik + 11 * VEC_SIZE, ab); + STORE(fo + base_ik + 12 * VEC_SIZE, ac); + STORE(fo + base_ik + 13 * VEC_SIZE, ad); + STORE(fo + base_ik + 14 * VEC_SIZE, ae); + STORE(fo + base_ik + 15 * VEC_SIZE, af); + STORE(fo + base_ik + 16 * VEC_SIZE, b0); + STORE(fo + base_ik + 17 * VEC_SIZE, b1); + STORE(fo + base_ik + 18 * VEC_SIZE, b2); + STORE(fo + base_ik + 19 * VEC_SIZE, b3); + STORE(fo + base_ik + 20 * VEC_SIZE, b4); + STORE(fo + base_ik + 21 * VEC_SIZE, b5); + STORE(fo + base_ik + 22 * VEC_SIZE, b6); + STORE(fo + base_ik + 23 * VEC_SIZE, b7); + STORE(fo + base_ik + 24 * VEC_SIZE, b8); + STORE(fo + base_ik + 25 * VEC_SIZE, b9); + STORE(fo + base_ik + 26 * VEC_SIZE, ba); + STORE(fo + base_ik + 27 * VEC_SIZE, bb); + STORE(fo + base_ik + 28 * VEC_SIZE, bc); + STORE(fo + base_ik + 29 * VEC_SIZE, bd); + STORE(fo + base_ik + 30 * VEC_SIZE, be); + STORE(fo + base_ik + 31 * VEC_SIZE, bf); } } #endif @@ -269,98 +275,100 @@ a8 = a9 = aa = ab = VEC_ZERO; ac = ad = ae = af = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { #ifdef VEC_SCAL_LD register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; + ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); - pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); - pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); - pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE); a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4); - pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE); a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5); - pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE); a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6); - pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + pos 
= LOAD(fi + indexj_pos + 7 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE); a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7); - pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE); a8 = VEC_FMA(wj, VEC_ADD(pos, neg), a8); - pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE); a9 = VEC_FMA(wj, VEC_ADD(pos, neg), a9); - pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE); aa = VEC_FMA(wj, VEC_ADD(pos, neg), aa); - pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE); ab = VEC_FMA(wj, VEC_ADD(pos, neg), ab); - pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE); ac = VEC_FMA(wj, VEC_ADD(pos, neg), ac); - pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE); ad = VEC_FMA(wj, VEC_ADD(pos, neg), ad); - pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE); ae = VEC_FMA(wj, VEC_ADD(pos, neg), ae); - pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE); af = VEC_FMA(wj, VEC_ADD(pos, neg), af); } - register VEC_TYPE wj = VEC_SCAL(wcenter[0]); - a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); - a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); - a2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 2 * VEC_SIZE + k), a2); - a3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 3 * VEC_SIZE + k), a3); - a4 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 4 * VEC_SIZE + k), a4); - a5 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 5 * VEC_SIZE + k), a5); - a6 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 6 * VEC_SIZE + k), a6); - a7 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 7 * VEC_SIZE + k), a7); - a8 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 8 * VEC_SIZE + k), a8); - a9 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 9 * VEC_SIZE + k), a9); - aa = VEC_FMA(wj, LOAD(fi + (i << ldf) + 10 * VEC_SIZE + k), aa); - ab = VEC_FMA(wj, LOAD(fi + (i << ldf) + 11 * VEC_SIZE + k), ab); - ac = VEC_FMA(wj, LOAD(fi + (i << ldf) + 12 * VEC_SIZE + k), ac); - ad = VEC_FMA(wj, LOAD(fi + (i << ldf) + 13 * VEC_SIZE + k), ad); - ae = VEC_FMA(wj, LOAD(fi + (i << ldf) + 14 * VEC_SIZE + k), ae); - af = VEC_FMA(wj, LOAD(fi + (i << ldf) + 15 * VEC_SIZE + k), af); - - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); - STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); - STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); - STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); - STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); - STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); - STORE(fo + (i 
<< ldf) + 9 * VEC_SIZE + k, a9); - STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); - STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); - STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); - STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); - STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); - STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1); + a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2); + a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3); + a4 = VEC_FMA(wj, LOAD(fi + base_ik + 4 * VEC_SIZE), a4); + a5 = VEC_FMA(wj, LOAD(fi + base_ik + 5 * VEC_SIZE), a5); + a6 = VEC_FMA(wj, LOAD(fi + base_ik + 6 * VEC_SIZE), a6); + a7 = VEC_FMA(wj, LOAD(fi + base_ik + 7 * VEC_SIZE), a7); + a8 = VEC_FMA(wj, LOAD(fi + base_ik + 8 * VEC_SIZE), a8); + a9 = VEC_FMA(wj, LOAD(fi + base_ik + 9 * VEC_SIZE), a9); + aa = VEC_FMA(wj, LOAD(fi + base_ik + 10 * VEC_SIZE), aa); + ab = VEC_FMA(wj, LOAD(fi + base_ik + 11 * VEC_SIZE), ab); + ac = VEC_FMA(wj, LOAD(fi + base_ik + 12 * VEC_SIZE), ac); + ad = VEC_FMA(wj, LOAD(fi + base_ik + 13 * VEC_SIZE), ad); + ae = VEC_FMA(wj, LOAD(fi + base_ik + 14 * VEC_SIZE), ae); + af = VEC_FMA(wj, LOAD(fi + base_ik + 15 * VEC_SIZE), af); + + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); + STORE(fo + base_ik + 4 * VEC_SIZE, a4); + STORE(fo + base_ik + 5 * VEC_SIZE, a5); + STORE(fo + base_ik + 6 * VEC_SIZE, a6); + STORE(fo + base_ik + 7 * VEC_SIZE, a7); + STORE(fo + base_ik + 8 * VEC_SIZE, a8); + STORE(fo + base_ik + 9 * VEC_SIZE, a9); + STORE(fo + base_ik + 10 * VEC_SIZE, aa); + STORE(fo + base_ik + 11 * VEC_SIZE, ab); + STORE(fo + base_ik + 12 * VEC_SIZE, ac); + STORE(fo + base_ik + 13 * VEC_SIZE, ad); + STORE(fo + base_ik + 14 * VEC_SIZE, ae); + STORE(fo + base_ik + 15 * VEC_SIZE, af); } } #endif @@ -375,56 +383,58 @@ a0 = a1 = a2 = a3 = VEC_ZERO; a4 = a5 = a6 = a7 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { #ifdef VEC_SCAL_LD register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; + ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); - pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); - pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); - pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + pos = LOAD(fi + 
indexj_pos + 4 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE); a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4); - pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE); a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5); - pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE); a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6); - pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE); a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7); } - register VEC_TYPE wj = VEC_SCAL(wcenter[0]); - a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); - a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); - a2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 2 * VEC_SIZE + k), a2); - a3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 3 * VEC_SIZE + k), a3); - a4 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 4 * VEC_SIZE + k), a4); - a5 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 5 * VEC_SIZE + k), a5); - a6 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 6 * VEC_SIZE + k), a6); - a7 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 7 * VEC_SIZE + k), a7); - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); - STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); - STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); - STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); - STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1); + a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2); + a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3); + a4 = VEC_FMA(wj, LOAD(fi + base_ik + 4 * VEC_SIZE), a4); + a5 = VEC_FMA(wj, LOAD(fi + base_ik + 5 * VEC_SIZE), a5); + a6 = VEC_FMA(wj, LOAD(fi + base_ik + 6 * VEC_SIZE), a6); + a7 = VEC_FMA(wj, LOAD(fi + base_ik + 7 * VEC_SIZE), a7); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); + STORE(fo + base_ik + 4 * VEC_SIZE, a4); + STORE(fo + base_ik + 5 * VEC_SIZE, a5); + STORE(fo + base_ik + 6 * VEC_SIZE, a6); + STORE(fo + base_ik + 7 * VEC_SIZE, a7); } } #endif @@ -437,36 +447,38 @@ a0 = a1 = a2 = a3 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { #ifdef VEC_SCAL_LD register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; + ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); - pos = LOAD(fi 
+ indexj_pos + 2 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); - pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); } - register VEC_TYPE wj = VEC_SCAL(wcenter[0]); - a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); - a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); - a2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 2 * VEC_SIZE + k), a2); - a3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 3 * VEC_SIZE + k), a3); - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1); + a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2); + a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); } } #endif @@ -479,26 +491,28 @@ a0 = a1 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { #ifdef VEC_SCAL_LD register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; + ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); } - register VEC_TYPE wj = VEC_SCAL(wcenter[0]); - a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); - a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); } } #endif @@ -508,21 +522,22 @@ ptrdiff_t k; for (k = 0; k < (1 << ldf); k += VEC_SIZE) { register VEC_TYPE a0 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; for (j = 0; j < npairs; j++) { #ifdef VEC_SCAL_LD register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; + ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), 
a0); } - register VEC_TYPE wj = VEC_SCAL(wcenter[0]); - a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); - STORE(fo + (i << ldf) + k, a0); + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + STORE(fo + base_ik, a0); } } #endif @@ -532,10 +547,11 @@ ptrdiff_t k; for (k = 0; k < (1 << ldf); k++) { double a = 0.0; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; for (j = 0; j < npairs; j++) - a += wpair[j] * ( fi[((index_pos[j] + i) << ldf) + k] + fi[((index_neg[j] + i) << ldf) + k]); - a += wcenter[0] * fi[(i << ldf) + k]; - fo[(i << ldf) + k] = a; + a += wpair[j] * ( fi[(index_pos[j] << ldf) + base_ik] + fi[(index_neg[j] << ldf) + base_ik]); + a += w0 * fi[base_ik]; + fo[base_ik] = a; } } #endif -- GitLab From: NicolasTD Date: Mon, 15 Dec 2025 20:33:22 +0100 Subject: [PATCH 05/11] Precompute the offset multiplications for the stencil shifts. The arrays are reallocated if needed. --- src/grid/derivatives_test.F90 | 218 ++++++++++----------- src/grid/nl_operator.F90 | 309 +++++++++++++++++------------- src/grid/nl_operator_inc.F90 | 35 +++- src/grid/operate_sym_inc.c | 44 +++-- src/utils/dielectric_function.F90 | 16 +- 5 files changed, 345 insertions(+), 277 deletions(-) diff --git a/src/grid/derivatives_test.F90 b/src/grid/derivatives_test.F90 index c9231922dc..c17406ce34 100644 --- a/src/grid/derivatives_test.F90 +++ b/src/grid/derivatives_test.F90 @@ -51,156 +51,156 @@ module derivatives_test_oct_m contains !>@brief Further unit tests designed to challenge the numerical stability of the finite differences -subroutine derivatives_advanced_benchmark(this, namespace) - type(derivatives_t), intent(in) :: this - type(namespace_t), intent(in) :: namespace + subroutine derivatives_advanced_benchmark(this, namespace) + type(derivatives_t), intent(in) :: this + type(namespace_t), intent(in) :: namespace - real(real64), allocatable :: ff(:), grad(:,:), resgrad(:, :), lapl(:), reslapl(:) - real(real64) :: kvec(3) + real(real64), allocatable :: ff(:), grad(:,:), resgrad(:, :), lapl(:), reslapl(:) + real(real64) :: kvec(3) - SAFE_ALLOCATE(ff(1:this%mesh%np_part)) - SAFE_ALLOCATE(grad(1:this%mesh%np, 1:this%mesh%box%dim)) - SAFE_ALLOCATE(lapl(1:this%mesh%np)) - SAFE_ALLOCATE(reslapl(1:this%mesh%np)) - SAFE_ALLOCATE(resgrad(1:this%mesh%np, 1:this%mesh%box%dim)) + SAFE_ALLOCATE(ff(1:this%mesh%np_part)) + SAFE_ALLOCATE(grad(1:this%mesh%np, 1:this%mesh%box%dim)) + SAFE_ALLOCATE(lapl(1:this%mesh%np)) + SAFE_ALLOCATE(reslapl(1:this%mesh%np)) + SAFE_ALLOCATE(resgrad(1:this%mesh%np, 1:this%mesh%box%dim)) - ! Testing on some planewaves - kvec = [0.5_real64, 0.0_real64, 0.0_real64] ! low frequency - call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) - call test_function("Testing Planewave k=(0.5,0.0,0.0)") + ! Testing on some planewaves + kvec = [0.5_real64, 0.0_real64, 0.0_real64] ! low frequency + call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) + call test_function("Testing Planewave k=(0.5,0.0,0.0)") - kvec = [1.0_real64, 1.0_real64, 0.0_real64] ! diagonal - call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) - call test_function("Testing Planewave k=(1.0,1.0,0.0)") + kvec = [1.0_real64, 1.0_real64, 0.0_real64] ! diagonal + call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) + call test_function("Testing Planewave k=(1.0,1.0,0.0)") - kvec = [M_PI/this%mesh%spacing(1), 0.0_real64, 0.0_real64] ! Nyquist - call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) - call test_function("Testing Planewave k=(\pi/dr,0.0,0.0)") + kvec = [M_PI/this%mesh%spacing(1), 0.0_real64, 0.0_real64] ! 
Nyquist + call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) + call test_function("Testing Planewave k=(\pi/dr,0.0,0.0)") - kvec = [M_PI/this%mesh%spacing(1), M_PI/this%mesh%spacing(2), 0.0_real64] ! diagonal Nyquist - call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) - call test_function("Testing Planewave k=(\pi/dr,\pi/dr,0.0)") + kvec = [M_PI/this%mesh%spacing(1), M_PI/this%mesh%spacing(2), 0.0_real64] ! diagonal Nyquist + call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) + call test_function("Testing Planewave k=(\pi/dr,\pi/dr,0.0)") - ! Testing on some polynomials - call testfunc_polynomial(this%mesh, ff, grad, lapl, p=2) - call test_function("Testing degree-2 polynomial") + ! Testing on some polynomials + call testfunc_polynomial(this%mesh, ff, grad, lapl, p=2) + call test_function("Testing degree-2 polynomial") - call testfunc_polynomial(this%mesh, ff, grad, lapl, p=3) - call test_function("Testing degree-3 polynomial") + call testfunc_polynomial(this%mesh, ff, grad, lapl, p=3) + call test_function("Testing degree-3 polynomial") - call testfunc_polynomial(this%mesh, ff, grad, lapl, p=4) - call test_function("Testing degree-4 polynomial") + call testfunc_polynomial(this%mesh, ff, grad, lapl, p=4) + call test_function("Testing degree-4 polynomial") - call testfunc_polynomial(this%mesh, ff, grad, lapl, p=6) - call test_function("Testing degree-6 polynomial") + call testfunc_polynomial(this%mesh, ff, grad, lapl, p=6) + call test_function("Testing degree-6 polynomial") - call testfunc_polynomial(this%mesh, ff, grad, lapl, p=8) - call test_function("Testing degree-8 polynomial") + call testfunc_polynomial(this%mesh, ff, grad, lapl, p=8) + call test_function("Testing degree-8 polynomial") - SAFE_DEALLOCATE_A(ff) - SAFE_DEALLOCATE_A(grad) - SAFE_DEALLOCATE_A(lapl) - SAFE_DEALLOCATE_A(resgrad) - SAFE_DEALLOCATE_A(reslapl) + SAFE_DEALLOCATE_A(ff) + SAFE_DEALLOCATE_A(grad) + SAFE_DEALLOCATE_A(lapl) + SAFE_DEALLOCATE_A(resgrad) + SAFE_DEALLOCATE_A(reslapl) -contains - subroutine test_function(label) - character(len=*), intent(in) :: label + contains + subroutine test_function(label) + character(len=*), intent(in) :: label - message(1) = trim(label) + message(1) = trim(label) - call dderivatives_grad(this, ff, resgrad, set_bc=.false.) - resgrad = resgrad - grad - write(message(2), '(a, es17.10)') 'Gradient err = ', dmf_nrm2(this%mesh, this%mesh%box%dim, resgrad) + call dderivatives_grad(this, ff, resgrad, set_bc=.false.) + resgrad = resgrad - grad + write(message(2), '(a, es17.10)') 'Gradient err = ', dmf_nrm2(this%mesh, this%mesh%box%dim, resgrad) - call dderivatives_lapl(this, ff, reslapl, set_bc=.false.) - reslapl = reslapl - lapl - write(message(3), '(a, es17.10)') 'Laplacian err = ', dmf_nrm2(this%mesh, reslapl) + call dderivatives_lapl(this, ff, reslapl, set_bc=.false.) 
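The check that test_function performs can be reproduced in isolation: apply a finite-difference operator to a function whose derivatives are known in closed form and report the size of the residual. The following self-contained C sketch does this for a fourth-order Laplacian acting on a plane wave on a 1-D periodic grid; the grid size, stencil order and wavevector are illustrative assumptions, not values taken from the suite above, which runs on the full Octopus mesh.

/* Toy version of the Laplacian accuracy check: compare the
 * finite-difference Laplacian of cos(k*x) with the analytic
 * result -k^2*cos(k*x) on a periodic 1-D grid and report the
 * worst-case error over all grid points. */
#include <math.h>
#include <stdio.h>

#define N 256

int main(void) {
  const double PI = 3.14159265358979323846;
  const double L = 10.0, h = L / N;
  /* 4th-order central Laplacian weights for offsets -2..+2 */
  const double w[5] = {-1.0/12, 4.0/3, -5.0/2, 4.0/3, -1.0/12};
  const double k = 2.0 * PI * 3.0 / L;  /* low-frequency mode */
  double f[N], err = 0.0;

  for (int i = 0; i < N; i++) f[i] = cos(k * i * h);

  for (int i = 0; i < N; i++) {
    double acc = 0.0;
    for (int s = -2; s <= 2; s++)
      acc += w[s + 2] * f[(i + s + N) % N];        /* periodic wrap */
    double e = fabs(acc / (h * h) + k * k * f[i]); /* exact: -k^2 f */
    if (e > err) err = e;
  }
  printf("max |FD Laplacian - exact| = %e\n", err);
  return 0;
}

Pushing k towards pi/h, as the Nyquist cases above do, makes the residual large and is what exposes differences between summation orders at the level of the reported norms.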
+ reslapl = reslapl - lapl + write(message(3), '(a, es17.10)') 'Laplacian err = ', dmf_nrm2(this%mesh, reslapl) - call messages_info(3, namespace=namespace) - end subroutine test_function -end subroutine derivatives_advanced_benchmark + call messages_info(3, namespace=namespace) + end subroutine test_function + end subroutine derivatives_advanced_benchmark !>@brief Generate a plane-wave given its momentum, as well as the gradient and the Laplacian -subroutine testfunc_plane_wave(mesh, ff, grad, lapl, kvec) - type(mesh_t), intent(in) :: mesh - real(real64), intent(out) :: ff(:), grad(:, :), lapl(:) - real(real64), intent(in) :: kvec(3) + subroutine testfunc_plane_wave(mesh, ff, grad, lapl, kvec) + type(mesh_t), intent(in) :: mesh + real(real64), intent(out) :: ff(:), grad(:, :), lapl(:) + real(real64), intent(in) :: kvec(3) - integer :: ip, dim, idir - real(real64) :: phase, kdotx, kk + integer :: ip, dim, idir + real(real64) :: phase, kdotx, kk - dim = mesh%box%dim + dim = mesh%box%dim - kk = dot_product(kvec(1:dim), kvec(1:dim)) - do ip = 1, mesh%np - kdotx = dot_product(kvec(1:dim), mesh%x(ip, 1:dim)) - phase = cos(kdotx) + kk = dot_product(kvec(1:dim), kvec(1:dim)) + do ip = 1, mesh%np + kdotx = dot_product(kvec(1:dim), mesh%x(ip, 1:dim)) + phase = cos(kdotx) - ff(ip) = phase + ff(ip) = phase - ! exact gradient - do idir = 1, mesh%box%dim + ! exact gradient + do idir = 1, mesh%box%dim grad(ip, idir) = -kvec(idir)*sin(kdotx) - end do + end do - ! exact Laplacian - lapl(ip) = -kk * phase - end do + ! exact Laplacian + lapl(ip) = -kk * phase + end do - do ip = mesh%np+1, mesh%np_part - kdotx = dot_product(kvec(1:dim), mesh%x(ip, 1:dim)) - phase = cos(kdotx) + do ip = mesh%np+1, mesh%np_part + kdotx = dot_product(kvec(1:dim), mesh%x(ip, 1:dim)) + phase = cos(kdotx) - ff(ip) = phase - end do -end subroutine testfunc_plane_wave + ff(ip) = phase + end do + end subroutine testfunc_plane_wave !>@brief Generate a polynomial of degree p, as well as the gradient and the Laplacian -subroutine testfunc_polynomial(mesh, ff, grad, lapl, p) - type(mesh_t), intent(in) :: mesh - real(real64), intent(out) :: ff(:), grad(:, :), lapl(:) - integer, intent(in) :: p - integer :: ip, d, dim - real(real64) :: x(3), r2 + subroutine testfunc_polynomial(mesh, ff, grad, lapl, p) + type(mesh_t), intent(in) :: mesh + real(real64), intent(out) :: ff(:), grad(:, :), lapl(:) + integer, intent(in) :: p + integer :: ip, d, dim + real(real64) :: x(3), r2 - dim = mesh%box%dim + dim = mesh%box%dim - do ip = 1, mesh%np - x = 0.0_real64 - x(1:dim) = mesh%x(ip, 1:dim) + do ip = 1, mesh%np + x = 0.0_real64 + x(1:dim) = mesh%x(ip, 1:dim) - ! f = (x^2 + y^2 + z^2)^(p/2) - r2 = dot_product(x(1:dim), x(1:dim)) - ff(ip) = r2**(0.5d0*p) + ! f = (x^2 + y^2 + z^2)^(p/2) + r2 = dot_product(x(1:dim), x(1:dim)) + ff(ip) = r2**(0.5d0*p) - ! gradient: grad(f) = p * r^(p-2) * x - if (p >= 2) then + ! gradient: grad(f) = p * r^(p-2) * x + if (p >= 2) then do d = 1, dim - grad(ip,d) = p * r2**(0.5d0*p - 1.0d0) * x(d) + grad(ip,d) = p * r2**(0.5d0*p - 1.0d0) * x(d) end do - else + else grad(ip,:) = 0.0 - end if + end if - ! Laplacian: ∇² r^n = n (n + d - 2) r^(n-2) - if (p >= 2) then + ! Laplacian: ∇² r^n = n (n + d - 2) r^(n-2) + if (p >= 2) then lapl(ip) = p * (p + dim - 2) * r2**(0.5d0*p - 1.0d0) - else + else lapl(ip) = 0.0 - end if - end do - - do ip = mesh%np+1, mesh%np_part - x = 0.0_real64 - x(1:dim) = mesh%x(ip,1:dim) - - ! 
f = (x^2 + y^2 + z^2)^(p/2) - r2 = dot_product(x(1:dim), x(1:dim)) - ff(ip) = r2**(0.5d0*p) - end do -end subroutine testfunc_polynomial + end if + end do + + do ip = mesh%np+1, mesh%np_part + x = 0.0_real64 + x(1:dim) = mesh%x(ip,1:dim) + + ! f = (x^2 + y^2 + z^2)^(p/2) + r2 = dot_product(x(1:dim), x(1:dim)) + ff(ip) = r2**(0.5d0*p) + end do + end subroutine testfunc_polynomial #include "undef.F90" diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index faeadd9a3b..5ccdb727e5 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -80,8 +80,8 @@ module nl_operator_oct_m integer, allocatable :: imin(:) integer, allocatable :: imax(:) integer, allocatable :: ri(:, :) - integer, allocatable :: ri_pos(:,:) - integer, allocatable :: ri_neg(:,:) + integer, allocatable :: ri_pos(:,:,:) + integer, allocatable :: ri_neg(:,:,:) end type nl_operator_index_t integer, public, parameter :: & @@ -116,10 +116,11 @@ module nl_operator_oct_m !> For symmetry-based stencils integer :: npairs = 0 - real(real64),allocatable :: wpair(:) + real(real64),allocatable :: wpair(:) real(real64) :: wcenter - integer, allocatable :: ri_pos(:,:) - integer, allocatable :: ri_neg(:,:) + integer, allocatable :: ri_pos(:,:,:) + integer, allocatable :: ri_neg(:,:,:) + integer :: max_allocated_ri_pair = 0 integer :: ninner = 0 integer :: nouter = 0 @@ -818,57 +819,53 @@ contains POP_SUB(nl_operator_remove_zero_weight_points) end subroutine nl_operator_remove_zero_weight_points -!> Take a list of weights and offsets and build pairs of symmetric points with common weights -subroutine group_by_pairs_sym(size, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg, wcenter) - integer, intent(in) :: size - integer, intent(in) :: offsets(:, :) - real(real64), intent(in) :: wre(:) - integer, intent(in) :: ri(:, :) - integer, intent(in) :: nri - integer, intent(out) :: npairs - real(real64), allocatable, intent(out) :: wpair(:) - integer, allocatable, intent(out) :: pair_pos(:,:), pair_neg(:,:) - real(real64), intent(out) :: wcenter - - logical, allocatable :: used(:) - integer :: i, j, d, ndim, s - logical :: same - real(real64), allocatable :: abs_wre(:) - integer, allocatable :: idx(:) - - real(real64), parameter :: tol = 1.0e-12_real64 - - PUSH_SUB(group_by_pairs_sym) - - ASSERT(mod(size,2) == 1) - - SAFE_ALLOCATE(used(1:size)) - used = .false. - npairs = 0 - - ndim = ubound(offsets, dim=1) - - SAFE_ALLOCATE(abs_wre(1:size)) - SAFE_ALLOCATE(idx(1:size)) - abs_wre = abs(wre) - call sort(abs_wre, idx) - - ! 
Max pairs = n/2 - SAFE_ALLOCATE(pair_pos(1:size/2, 1:nri)) - SAFE_ALLOCATE(pair_neg(1:size/2, 1:nri)) - SAFE_ALLOCATE(wpair(1:size/2)) - - do i = 1, size - if (used(i)) cycle - - if (all(offsets(:, idx(i))==0)) then +!> Take a list of weights and offsets and build pairs of symmetric points with common weights + subroutine group_by_pairs_sym(size, ldf, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg, wcenter) + integer, intent(in) :: size + integer, intent(in) :: ldf + integer, intent(in) :: offsets(:, :) + real(real64), intent(in) :: wre(:) + integer, intent(in) :: ri(:, :) + integer, intent(in) :: nri + integer, intent(out) :: npairs + real(real64), intent(inout) :: wpair(:) + integer, intent(inout) :: pair_pos(:,:), pair_neg(:,:) + real(real64), intent(out) :: wcenter + + logical, allocatable :: used(:) + integer :: i, j, ndim, s + logical :: same + real(real64), allocatable :: abs_wre(:) + integer, allocatable :: idx(:) + + real(real64), parameter :: tol = 1.0e-12_real64 + + PUSH_SUB(group_by_pairs_sym) + + ASSERT(mod(size,2) == 1) + + SAFE_ALLOCATE(used(1:size)) + used = .false. + npairs = 0 + + ndim = ubound(offsets, dim=1) + + SAFE_ALLOCATE(abs_wre(1:size)) + SAFE_ALLOCATE(idx(1:size)) + abs_wre = abs(wre) + call sort(abs_wre, idx) + + do i = 1, size + if (used(i)) cycle + + if (all(offsets(:, idx(i))==0)) then wcenter = wre(idx(i)) used(i) = .true. cycle - end if + end if - ! Try to find symmetric partner j - do j = i+1, size + ! Try to find symmetric partner j + do j = i+1, size if (used(j)) cycle ! Weight equality @@ -880,69 +877,69 @@ subroutine group_by_pairs_sym(size, offsets, wre, ri, nri, npairs, wpair, pair_p npairs = npairs + 1 do s = 1, nri - pair_pos(npairs, s) = ri(idx(i), s) - pair_neg(npairs, s) = ri(idx(j), s) + pair_pos(npairs, s) = ishft(ri(idx(i), s), ldf) + pair_neg(npairs, s) = ishft(ri(idx(j), s), ldf) end do wpair(npairs) = M_HALF*(wre(idx(i)) + wre(idx(j))) used(i) = .true. used(j) = .true. 
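The pairing built here can be illustrated outside of Octopus. A minimal C sketch, assuming a 1-D five-point stencil: sort the points by increasing |weight|, pair each offset with its mirror image whenever the weights agree within a tolerance, and keep the unpaired origin as the center weight. Applying the operator as w0*f(i) + sum_p wpair_p*(f(i+o_p) + f(i-o_p)) then accumulates the small high-order coefficients before the dominant ones, which is the point of the sort.

/* Sketch of group_by_pairs_sym for a 1-D symmetric stencil:
 * sort by |weight|, pair mirror offsets with equal weights,
 * and emit the pair weights plus the center weight. */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct { int off; double w; } point_t;

static int by_abs_w(const void *a, const void *b) {
  double wa = fabs(((const point_t *)a)->w);
  double wb = fabs(((const point_t *)b)->w);
  return (wa > wb) - (wa < wb);
}

int main(void) {
  /* unscaled 4th-order 1-D Laplacian stencil */
  point_t pts[5] = {{-2, -1.0/12}, {-1, 4.0/3}, {0, -5.0/2},
                    {1, 4.0/3}, {2, -1.0/12}};
  int used[5] = {0}, pair_off[2], npairs = 0;
  double wpair[2], w0 = 0.0;

  qsort(pts, 5, sizeof(point_t), by_abs_w);  /* smallest |w| first */

  for (int i = 0; i < 5; i++) {
    if (used[i]) continue;
    if (pts[i].off == 0) { w0 = pts[i].w; used[i] = 1; continue; }
    for (int j = i + 1; j < 5; j++) {        /* find mirror partner */
      if (!used[j] && pts[j].off == -pts[i].off &&
          fabs(pts[j].w - pts[i].w) < 1e-12) {
        pair_off[npairs] = abs(pts[i].off);
        wpair[npairs] = 0.5 * (pts[i].w + pts[j].w);
        npairs++;
        used[i] = used[j] = 1;
        break;
      }
    }
  }

  for (int p = 0; p < npairs; p++)
    printf("pair offset %d, weight %g\n", pair_off[p], wpair[p]);
  printf("center weight %g\n", w0);
  return 0;
}

Besides fixing the summation order, pairing halves the number of index computations per grid point, which is what the vectorized kernels in operate_sym_inc.c exploit through the pos/neg loads.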
exit - end do - end do + end do + end do + + ASSERT(npairs == size/2) - ASSERT(npairs == size/2) + SAFE_DEALLOCATE_A(abs_wre) + SAFE_DEALLOCATE_A(idx) - SAFE_DEALLOCATE_A(abs_wre) - SAFE_DEALLOCATE_A(idx) + POP_SUB(group_by_pairs_sym) + end subroutine group_by_pairs_sym - POP_SUB(group_by_pairs_sym) -end subroutine group_by_pairs_sym +!> Take a list of weights and offsets and build pairs of symmetric points with common weights + subroutine group_by_pairs_antisym(size, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg) + integer, intent(in) :: size + integer, intent(in) :: offsets(:, :) + real(real64), intent(in) :: wre(:) + integer, intent(in) :: ri(:, :) + integer, intent(in) :: nri + integer, intent(out) :: npairs + real(real64), allocatable, intent(out) :: wpair(:) + integer, allocatable, intent(out) :: pair_pos(:,:,:), pair_neg(:,:,:) -!> Take a list of weights and offsets and build pairs of symmetric points with common weights -subroutine group_by_pairs_antisym(size, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg) - integer, intent(in) :: size - integer, intent(in) :: offsets(:, :) - real(real64), intent(in) :: wre(:) - integer, intent(in) :: ri(:, :) - integer, intent(in) :: nri - integer, intent(out) :: npairs - real(real64), allocatable, intent(out) :: wpair(:) - integer, allocatable, intent(out) :: pair_pos(:,:), pair_neg(:,:) + logical, allocatable :: used(:) + integer :: i, j, ndim, s + logical :: same + real(real64), allocatable :: abs_wre(:) + integer, allocatable :: idx(:) - logical, allocatable :: used(:) - integer :: i, j, d, ndim, s - logical :: same - real(real64), allocatable :: abs_wre(:) - integer, allocatable :: idx(:) + real(real64), parameter :: tol = 1.0e-12_real64 - real(real64), parameter :: tol = 1.0e-12_real64 + PUSH_SUB(group_by_pairs_antisym) - PUSH_SUB(group_by_pairs_antisym) + ASSERT(mod(size,2) == 0) - ASSERT(mod(size,2) == 0) + SAFE_ALLOCATE(used(1:size)) + used = .false. + npairs = 0 - SAFE_ALLOCATE(used(1:size)) - used = .false. - npairs = 0 + ndim = ubound(offsets, dim=1) - ndim = ubound(offsets, dim=1) + SAFE_ALLOCATE(abs_wre(1:size)) + SAFE_ALLOCATE(idx(1:size)) + abs_wre = abs(wre) + call sort(abs_wre, idx) - SAFE_ALLOCATE(abs_wre(1:size)) - SAFE_ALLOCATE(idx(1:size)) - abs_wre = abs(wre) - call sort(abs_wre, idx) - - ! Max pairs = n/2 - SAFE_ALLOCATE(pair_pos(1:size/2, 1:nri)) - SAFE_ALLOCATE(pair_neg(1:size/2, 1:nri)) - SAFE_ALLOCATE(wpair(1:size/2)) + ! Max pairs = n/2 + SAFE_ALLOCATE(pair_pos(1:size/2, 1:nri, 1:1)) + SAFE_ALLOCATE(pair_neg(1:size/2, 1:nri, 1:1)) + SAFE_ALLOCATE(wpair(1:size/2)) - do i = 1, size - if (used(i)) cycle + do i = 1, size + if (used(i)) cycle - ! Try to find symmetric partner j - do j = i+1, size + ! Try to find symmetric partner j + do j = i+1, size if (used(j)) cycle ! Weight equality @@ -954,54 +951,104 @@ subroutine group_by_pairs_antisym(size, offsets, wre, ri, nri, npairs, wpair, pa npairs = npairs + 1 do s = 1, nri - pair_pos(npairs, s) = ri(idx(i), s) - pair_neg(npairs, s) = ri(idx(j), s) + pair_pos(npairs, s, 1) = ri(idx(i), s) + pair_neg(npairs, s, 1) = ri(idx(j), s) end do wpair(npairs) = M_HALF*(wre(idx(i)) - wre(idx(j))) used(i) = .true. used(j) = .true. 
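In the antisymmetric case handled here, mirrored points carry weights of equal magnitude and opposite sign, so each pair contributes wpair_p*(f(i+o_p) - f(i-o_p)) with wpair_p = (w(+o) - w(-o))/2 and there is no center term. The following C sketch applies a fourth-order first derivative in this paired form on a 1-D periodic grid and checks it against the analytic derivative; the grid, order and wavevector are assumptions made for illustration.

/* Paired application of an antisymmetric (gradient) stencil:
 * d/dx sin(k*x) computed as sum_p wpair[p]*(f[i+o] - f[i-o])
 * and compared with k*cos(k*x). */
#include <math.h>
#include <stdio.h>

#define N 200

int main(void) {
  const double PI = 3.14159265358979323846;
  const double L = 5.0, h = L / N, k = 2.0 * PI * 4.0 / L;
  /* 4th-order d/dx: offsets +1, +2; wpair = (w(+o) - w(-o))/2 = w(+o) */
  const int off[2] = {1, 2};
  const double wpair[2] = {2.0/3.0, -1.0/12.0};
  double f[N], err = 0.0;

  for (int i = 0; i < N; i++) f[i] = sin(k * i * h);

  for (int i = 0; i < N; i++) {
    double acc = 0.0;
    for (int p = 0; p < 2; p++) {
      double pos = f[(i + off[p]) % N];        /* f(i+o), periodic */
      double neg = f[(i - off[p] + N) % N];    /* f(i-o), periodic */
      acc += wpair[p] * (pos - neg);           /* antisymmetric pair */
    }
    double e = fabs(acc / h - k * cos(k * i * h));
    if (e > err) err = e;
  }
  printf("max |FD gradient - exact| = %e\n", err);
  return 0;
}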
exit - end do - end do + end do + end do - ASSERT(npairs == size/2) + ASSERT(npairs == size/2) - SAFE_DEALLOCATE_A(abs_wre) - SAFE_DEALLOCATE_A(idx) - POP_SUB(group_by_pairs_antisym) -end subroutine group_by_pairs_antisym + SAFE_DEALLOCATE_A(abs_wre) + SAFE_DEALLOCATE_A(idx) + POP_SUB(group_by_pairs_antisym) + end subroutine group_by_pairs_antisym -subroutine nl_operator_build_symmetric_weights(op) - type(nl_operator_t), intent(inout) :: op + subroutine nl_operator_build_symmetric_weights(op, max_size) + type(nl_operator_t), intent(inout) :: op + integer, optional, intent(in) :: max_size - PUSH_SUB(nl_operator_build_symmetric_weights) + integer :: ldf, start, end - ASSERT(op%const_w) + PUSH_SUB(nl_operator_build_symmetric_weights) - select case(op%symmetry) - case(OP_SYMMETRIC) - call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%ri, op%nri, & - op%npairs, op%wpair, op%ri_pos, op%ri_neg, op%wcenter) - if (op%mesh%parallel_in_domains) then - call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & - op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg, op%wcenter) - call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & - op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg, op%wcenter) - end if - case(OP_ANTISYMMETRIC) - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%ri, op%nri, & - op%npairs, op%wpair, op%ri_pos, op%ri_neg) - if (op%mesh%parallel_in_domains) then - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & - op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg) - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & - op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg) - end if - end select + ASSERT(op%const_w) + ASSERT(conf%target_states_block_size > 0) + + + select case(op%symmetry) + case(OP_SYMMETRIC) + if(present(max_size)) then + start = op%max_allocated_ri_pair + 1 + end = max_size + call reallocate_array(op%ri_pos, op%stencil%size/2, op%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%ri_neg, op%stencil%size/2, op%nri, op%max_allocated_ri_pair, end) + if (op%mesh%parallel_in_domains) then + call reallocate_array(op%inner%ri_pos, op%stencil%size/2, op%inner%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%inner%ri_neg, op%stencil%size/2, op%inner%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%outer%ri_pos, op%stencil%size/2, op%outer%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%outer%ri_neg, op%stencil%size/2, op%outer%nri, op%max_allocated_ri_pair, end) + end if + else + SAFE_ALLOCATE(op%wpair(1:op%stencil%size/2)) + start = 1 + end = log2(conf%target_states_block_size)+1 + + SAFE_ALLOCATE(op%ri_pos(1:op%stencil%size/2, 1:op%nri, 1:end)) + SAFE_ALLOCATE(op%ri_neg(1:op%stencil%size/2, 1:op%nri, 1:end)) + if (op%mesh%parallel_in_domains) then + SAFE_ALLOCATE(op%inner%ri_pos(1:op%stencil%size/2, 1:op%inner%nri, 1:end)) + SAFE_ALLOCATE(op%inner%ri_neg(1:op%stencil%size/2, 1:op%inner%nri, 1:end)) + SAFE_ALLOCATE(op%outer%ri_pos(1:op%stencil%size/2, 1:op%outer%nri, 1:end)) + SAFE_ALLOCATE(op%outer%ri_neg(1:op%stencil%size/2, 1:op%outer%nri, 1:end)) + end if + end if + op%max_allocated_ri_pair = end + ! 
Build the pair tables for each precomputed leading dimension + do ldf = start-1, end-1 + call group_by_pairs_sym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%ri, op%nri, & + op%npairs, op%wpair, op%ri_pos(:,:,ldf+1), op%ri_neg(:,:,ldf+1), op%wcenter) + if (op%mesh%parallel_in_domains) then + call group_by_pairs_sym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & + op%npairs, op%wpair, op%inner%ri_pos(:,:,ldf+1), op%inner%ri_neg(:,:,ldf+1), op%wcenter) + call group_by_pairs_sym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & + op%npairs, op%wpair, op%outer%ri_pos(:,:,ldf+1), op%outer%ri_neg(:,:,ldf+1), op%wcenter) + end if + end do + case(OP_ANTISYMMETRIC) + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%ri, op%nri, & + op%npairs, op%wpair, op%ri_pos, op%ri_neg) + if (op%mesh%parallel_in_domains) then + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & + op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg) + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & + op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg) + end if + end select + + POP_SUB(nl_operator_build_symmetric_weights) + end subroutine nl_operator_build_symmetric_weights + + subroutine reallocate_array(ri, stencil_size, nri, old_size, new_size) + integer, allocatable, intent(inout) :: ri(:,:,:) + integer, intent(in) :: stencil_size, nri + integer, intent(in) :: old_size, new_size + + integer, allocatable :: tmp(:,:,:) - POP_SUB(nl_operator_build_symmetric_weights) -end subroutine nl_operator_build_symmetric_weights + SAFE_ALLOCATE_SOURCE_A(tmp, ri) + SAFE_DEALLOCATE_A(ri) + SAFE_ALLOCATE(ri(1:stencil_size, 1:nri, 1:new_size)) + ri(:,:,1:old_size) = tmp + SAFE_DEALLOCATE_A(tmp) + end subroutine reallocate_array #include "undef.F90" #include "real.F90" diff --git a/src/grid/nl_operator_inc.F90 b/src/grid/nl_operator_inc.F90 index ce87ec9be0..39eada0cdf 100644 --- a/src/grid/nl_operator_inc.F90 +++ b/src/grid/nl_operator_inc.F90 @@ -19,7 +19,7 @@ ! --------------------------------------------------------- subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, points, factor, async) - type(nl_operator_t), target, intent(in) :: op + type(nl_operator_t), target, intent(inout) :: op type(batch_t), target, intent(inout) :: fi type(batch_t), intent(inout) :: fo !< this should be target, but old ifort 9.1 segfaults with it logical, optional, intent(in) :: ghost_update @@ -41,7 +41,7 @@ #endif integer :: nri_loc, ini !> Symmetry of the stencil can be used to group points by pairs - real(real64), allocatable :: wpair(:) + real(real64), allocatable :: wpair(:) integer, pointer :: ri_pos(:,:), ri_neg(:,:) real(real64) :: wcenter @@ -174,30 +174,45 @@ contains ! 
--------------------------------------------------------- subroutine select_op() + integer :: ldf + PUSH_SUB(X(nl_operator_operate_batch).select_op) + ldf = logldf+1 + if (fi%status() == BATCH_PACKED) then + ldf = log2(int(fi%pack_size_real(1), int32))+1 + end if + + if (op%symmetry == OP_SYMMETRIC) then + if (ldf > op%max_allocated_ri_pair) then + call nl_operator_build_symmetric_weights(op, ldf) + end if + else + ldf = 1 + end if + select case (points_) case (OP_ALL) nri = op%nri imin => op%rimap_inv(1:) imax => op%rimap_inv(2:) ri => op%ri - ri_pos => op%ri_pos - ri_neg => op%ri_neg + ri_pos => op%ri_pos(:,:,ldf) + ri_neg => op%ri_neg(:,:,ldf) case (OP_INNER) nri = op%inner%nri imin => op%inner%imin imax => op%inner%imax ri => op%inner%ri - ri_pos => op%inner%ri_pos - ri_neg => op%inner%ri_neg + ri_pos => op%inner%ri_pos(:,:,ldf) + ri_neg => op%inner%ri_neg(:,:,ldf) case (OP_OUTER) nri = op%outer%nri imin => op%outer%imin imax => op%outer%imax ri => op%outer%ri - ri_pos => op%outer%ri_pos - ri_neg => op%outer%ri_neg + ri_pos => op%outer%ri_pos(:,:,ldf) + ri_neg => op%outer%ri_neg(:,:,ldf) case default ASSERT(.false.) end select @@ -256,7 +271,7 @@ contains subroutine operate_const_weights_antisymmetric() integer :: nn, ll, ii, ist, ipair - PUSH_SUB(X(nl_operator_operate_batch).operate_const_weights_symmetric) + PUSH_SUB(X(nl_operator_operate_batch).operate_const_weights_antisymmetric) nn = op%stencil%size @@ -623,7 +638,7 @@ end subroutine X(nl_operator_operate_batch) subroutine X(nl_operator_operate)(op, fi, fo, ghost_update, profile, points) R_TYPE, contiguous, intent(inout) :: fi(:) !< fi(op%np_part) - type(nl_operator_t), intent(in) :: op + type(nl_operator_t), intent(inout) :: op R_TYPE, contiguous, target, intent(out) :: fo(:) logical, optional, intent(in) :: ghost_update logical, optional, intent(in) :: profile diff --git a/src/grid/operate_sym_inc.c b/src/grid/operate_sym_inc.c index 57a22c51d4..93740d5de5 100644 --- a/src/grid/operate_sym_inc.c +++ b/src/grid/operate_sym_inc.c @@ -91,8 +91,8 @@ #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; - ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + ptrdiff_t indexj_pos = index_pos[j] + base_ik; + ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); @@ -283,8 +283,8 @@ #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; - ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); @@ -391,8 +391,8 @@ #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; - ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); @@ -455,8 +455,8 @@ #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; - ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + const ptrdiff_t indexj_pos = 
index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); @@ -499,8 +499,8 @@ #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; - ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); @@ -529,8 +529,8 @@ #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; - ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); @@ -543,15 +543,21 @@ #endif #if VEC_SIZE > 1 + + const ptrdiff_t size = (ptrdiff_t) 1 << ldf; + double a; + for (; i < rimap_inv_max[l]; i++) { - ptrdiff_t k; - for (k = 0; k < (1 << ldf); k++) { - double a = 0.0; + for (ptrdiff_t k = 0; k < size; k++) { + a = 0.0; const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; - for (j = 0; j < npairs; j++) - a += wpair[j] * ( fi[(index_pos[j] << ldf) + base_ik] + fi[(index_neg[j] << ldf) + base_ik]); + for (j = 0; j < npairs; j++) + a += wpair[j] * ( fi[index_pos[j] + base_ik] + fi[index_neg[j] + base_ik]); a += w0 * fi[base_ik]; fo[base_ik] = a; } } #endif -- GitLab From: NicolasTD Date: Wed, 17 Dec 2025 10:48:56 +0100 Subject: [PATCH 06/11] Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Sebastian Ohlmann --- src/grid/nl_operator.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index 5ccdb727e5..626721dc43 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -221,7 +221,7 @@ contains !%Section Execution::Optimization !%Default no !%Description - !% This variable selects if the operators are build using symmetries or not. + !% This variable selects if the operators are built using symmetries or not. !%End call parse_variable(namespace, 'OperateUseSymmetries', .false., use_symmetries) -- GitLab From e21246e5f6238e76c71ffdc533f86d600281940b Mon Sep 17 00:00:00 2001 From: NicolasTD Date: Wed, 17 Dec 2025 12:35:23 +0100 Subject: [PATCH 07/11] Improvements and fixes. --- src/grid/derivatives_test.F90 | 20 ++++++++++---------- src/grid/nl_operator.F90 | 7 ++++++- src/utils/dielectric_function.F90 | 22 +++------------------- 3 files changed, 19 insertions(+), 30 deletions(-) diff --git a/src/grid/derivatives_test.F90 b/src/grid/derivatives_test.F90 index c17406ce34..e176269586 100644 --- a/src/grid/derivatives_test.F90 +++ b/src/grid/derivatives_test.F90 @@ -135,7 +135,7 @@ contains kk = dot_product(kvec(1:dim), kvec(1:dim)) do ip = 1, mesh%np - kdotx = dot_product(kvec(1:dim), mesh%x(ip, 1:dim)) + kdotx = dot_product(kvec(1:dim), mesh%x(1:dim, ip)) phase = cos(kdotx) ff(ip) = phase @@ -150,7 +150,7 @@ end do do ip = mesh%np+1, mesh%np_part - kdotx = dot_product(kvec(1:dim), mesh%x(ip, 1:dim)) + kdotx = dot_product(kvec(1:dim), mesh%x(1:dim, ip)) phase = cos(kdotx) ff(ip) = phase @@ -169,36 +169,36 @@ do ip = 1, mesh%np x = 0.0_real64 - x(1:dim) = mesh%x(ip, 1:dim) + x(1:dim) = mesh%x(1:dim, ip) ! f = (x^2 + y^2 + z^2)^(p/2) r2 = dot_product(x(1:dim), x(1:dim)) - ff(ip) = r2**(0.5d0*p) + ff(ip) = r2**(0.5_real64*p) ! 
gradient: grad(f) = p * r^(p-2) * x if (p >= 2) then do d = 1, dim - grad(ip,d) = p * r2**(0.5d0*p - 1.0d0) * x(d) + grad(ip,d) = p * r2**(0.5_real64*p - 1.0_real64) * x(d) end do else - grad(ip,:) = 0.0 + grad(ip,:) = 0.0_real64 end if ! Laplacian: ∇² r^n = n (n + d - 2) r^(n-2) if (p >= 2) then - lapl(ip) = p * (p + dim - 2) * r2**(0.5d0*p - 1.0d0) + lapl(ip) = p * (p + dim - 2) * r2**(0.5_real64*p - 1.0_real64) else - lapl(ip) = 0.0 + lapl(ip) = 0.0_real64 end if end do do ip = mesh%np+1, mesh%np_part x = 0.0_real64 - x(1:dim) = mesh%x(ip,1:dim) + x(1:dim) = mesh%x(1:dim, ip) ! f = (x^2 + y^2 + z^2)^(p/2) r2 = dot_product(x(1:dim), x(1:dim)) - ff(ip) = r2**(0.5d0*p) + ff(ip) = r2**(0.5_real64*p) end do end subroutine testfunc_polynomial diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index 626721dc43..4b5c52635c 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -224,6 +224,9 @@ contains !% This variable selects if the operators are built using symmetries or not. !%End call parse_variable(namespace, 'OperateUseSymmetries', .false., use_symmetries) + if (accel_is_enabled() .and. use_symmetries) then + call messages_not_implemented("OperateUseSymmetries=yes with GPUs") + end if if (accel_is_enabled()) then @@ -969,9 +972,10 @@ contains POP_SUB(group_by_pairs_antisym) end subroutine group_by_pairs_antisym + !>@brief Builds (or rebuild) the necessary arrays for symmetric and antisymmetric stencils subroutine nl_operator_build_symmetric_weights(op, max_size) type(nl_operator_t), intent(inout) :: op - integer, optional, intent(in) :: max_size + integer, optional, intent(in) :: max_size !< For reallocation integer :: ldf, start, end @@ -1036,6 +1040,7 @@ contains POP_SUB(nl_operator_build_symmetric_weights) end subroutine nl_operator_build_symmetric_weights + !>@brief Reallocate an ri array subroutine reallocate_array(ri, stencil_size, nri, old_size, new_size) integer, allocatable, intent(inout) :: ri(:,:,:) integer, intent(in) :: stencil_size, nri diff --git a/src/utils/dielectric_function.F90 b/src/utils/dielectric_function.F90 index 77b63c574d..750875b12b 100644 --- a/src/utils/dielectric_function.F90 +++ b/src/utils/dielectric_function.F90 @@ -84,7 +84,7 @@ contains real(real64) :: dt, dt_ref, tt, ww, norm_Aext real(real64), allocatable :: vecpot(:, :), Aext(:), Eind_real(:, :), Eind_imag(:, :) complex(real64), allocatable :: dielectric(:), chi(:), invdielectric(:) - real(real64), allocatable :: vecpot_ref(:, :), average(:) + real(real64), allocatable :: vecpot_ref(:, :) type(spectrum_t) :: spectrum type(block_t) :: blk type(space_t) :: space @@ -102,7 +102,6 @@ contains SAFE_ALLOCATE(Aext(1:space%dim)) SAFE_ALLOCATE(Eind(1:space%dim)) - SAFE_ALLOCATE(average(1:space%dim*3)) if (parse_block(global_namespace, 'GaugeVectorField', blk) == 0) then @@ -193,8 +192,8 @@ contains read(ref_file, *) jj, tt, (vecpot_ref(ii, kk), kk = 1, space%dim*3) end do call io_close(ref_file) - do kk = 1, space%dim*3 - do ii = 1, time_steps + do ii = 1, time_steps + do kk = 1, space%dim*3 vecpot(ii, kk) = vecpot(ii, kk) - vecpot_ref(ii, kk) end do end do @@ -204,20 +203,6 @@ contains trim(io_workpath('td.general/gauge_field', global_namespace))//"'" call messages_info(1) - ! Treat the Drude peak - ! 
We remove the offset before applying the mask, to get proper treatment of metals - do kk = 1, space%dim*3 - average(kk) = M_ZERO - do ii = time_steps/2, time_steps - average(kk) = average(kk) + vecpot(ii, kk) - end do - average = average/(time_steps -time_steps/2 + 1) - do ii = 1, time_steps - vecpot(ii, kk) = vecpot(ii, kk) - average(kk) - end do - end do - Aext(1:space%dim) = Aext(1:space%dim) + average(1:space%dim) - ! Find out the iteration numbers corresponding to the time limits. ! Max time correspond to time_steps-1, as we start from 0. @@ -312,7 +297,6 @@ contains SAFE_DEALLOCATE_A(vecpot_ref) SAFE_DEALLOCATE_A(Aext) SAFE_DEALLOCATE_A(Eind) - SAFE_DEALLOCATE_A(average) SAFE_DEALLOCATE_A(Eind_real) SAFE_DEALLOCATE_A(Eind_imag) -- GitLab From cfef122e26248ff08ebe1cf4212033d75e98ee26 Mon Sep 17 00:00:00 2001 From: NicolasTD Date: Wed, 17 Dec 2025 14:51:40 +0100 Subject: [PATCH 08/11] Removing the depth of 32, as well as the VEC_SCAL_LD dead path. --- src/grid/operate_antisym_inc.c | 186 --------------------------- src/grid/operate_inc.c | 121 ------------------ src/grid/operate_sym_inc.c | 223 --------------------------------- 3 files changed, 530 deletions(-) diff --git a/src/grid/operate_antisym_inc.c b/src/grid/operate_antisym_inc.c index da1da0907d..8ab6f9fdfe 100644 --- a/src/grid/operate_antisym_inc.c +++ b/src/grid/operate_antisym_inc.c @@ -30,9 +30,6 @@ { const ptrdiff_t npairs = opn[0]; const ptrdiff_t nri = opnri[0]; -#if DEPTH >= 32 - const ptrdiff_t unroll32 = max1(32 * VEC_SIZE >> ldf); -#endif #if DEPTH >= 16 const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf); #endif @@ -58,169 +55,6 @@ index_neg = opri_neg + npairs * l; i = rimap_inv[l]; -#if DEPTH >= 32 - for (; i < (rimap_inv_max[l] - unroll32 + 1); i += unroll32) { - ptrdiff_t k; - for (k = 0; k < (1 << ldf); k += 32 * VEC_SIZE) { - register VEC_TYPE a0, a1, a2, a3; - register VEC_TYPE a4, a5, a6, a7; - register VEC_TYPE a8, a9, aa, ab; - register VEC_TYPE ac, ad, ae, af; - register VEC_TYPE b0, b1, b2, b3; - register VEC_TYPE b4, b5, b6, b7; - register VEC_TYPE b8, b9, ba, bb; - register VEC_TYPE bc, bd, be, bf; - - a0 = a1 = a2 = a3 = VEC_ZERO; - a4 = a5 = a6 = a7 = VEC_ZERO; - a8 = a9 = aa = ab = VEC_ZERO; - ac = ad = ae = af = VEC_ZERO; - b0 = b1 = b2 = b3 = VEC_ZERO; - b4 = b5 = b6 = b7 = VEC_ZERO; - b8 = b9 = ba = bb = VEC_ZERO; - bc = bd = be = bf = VEC_ZERO; - - for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else - register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); - a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); - a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); - pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); - a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); - pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); - a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); - pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); - a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4); - pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); - a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5); - pos = LOAD(fi + indexj_pos + 6 * 
VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); - a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6); - pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); - a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7); - pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE + k); - a8 = VEC_FMA(wj, VEC_SUB(pos, neg), a8); - pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE + k); - a9 = VEC_FMA(wj, VEC_SUB(pos, neg), a9); - pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE + k); - aa = VEC_FMA(wj, VEC_SUB(pos, neg), aa); - pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE + k); - ab = VEC_FMA(wj, VEC_SUB(pos, neg), ab); - pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE + k); - ac = VEC_FMA(wj, VEC_SUB(pos, neg), ac); - pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE + k); - ad = VEC_FMA(wj, VEC_SUB(pos, neg), ad); - pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE + k); - ae = VEC_FMA(wj, VEC_SUB(pos, neg), ae); - pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE + k); - af = VEC_FMA(wj, VEC_SUB(pos, neg), af); - pos = LOAD(fi + indexj_pos + 16 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 16 * VEC_SIZE + k); - b0 = VEC_FMA(wj, VEC_SUB(pos, neg), b0); - pos = LOAD(fi + indexj_pos + 17 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 17 * VEC_SIZE + k); - b1 = VEC_FMA(wj, VEC_SUB(pos, neg), b1); - pos = LOAD(fi + indexj_pos + 18 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 18 * VEC_SIZE + k); - b2 = VEC_FMA(wj, VEC_SUB(pos, neg), b2); - pos = LOAD(fi + indexj_pos + 19 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 19 * VEC_SIZE + k); - b3 = VEC_FMA(wj, VEC_SUB(pos, neg), b3); - pos = LOAD(fi + indexj_pos + 20 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 20 * VEC_SIZE + k); - b4 = VEC_FMA(wj, VEC_SUB(pos, neg), b4); - pos = LOAD(fi + indexj_pos + 21 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 21 * VEC_SIZE + k); - b5 = VEC_FMA(wj, VEC_SUB(pos, neg), b5); - pos = LOAD(fi + indexj_pos + 22 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 22 * VEC_SIZE + k); - b6 = VEC_FMA(wj, VEC_SUB(pos, neg), b6); - pos = LOAD(fi + indexj_pos + 23 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 23 * VEC_SIZE + k); - b7 = VEC_FMA(wj, VEC_SUB(pos, neg), b7); - pos = LOAD(fi + indexj_pos + 24 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 24 * VEC_SIZE + k); - b8 = VEC_FMA(wj, VEC_SUB(pos, neg), b8); - pos = LOAD(fi + indexj_pos + 25 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 25 * VEC_SIZE + k); - b9 = VEC_FMA(wj, VEC_SUB(pos, neg), b9); - pos = LOAD(fi + indexj_pos + 26 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 26 * VEC_SIZE + k); - ba = VEC_FMA(wj, VEC_SUB(pos, neg), ba); - pos = LOAD(fi + indexj_pos + 27 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 27 * VEC_SIZE + k); - bb = VEC_FMA(wj, VEC_SUB(pos, neg), bb); - pos = LOAD(fi + indexj_pos + 28 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 28 * VEC_SIZE + k); - bc = VEC_FMA(wj, VEC_SUB(pos, neg), bc); - pos = LOAD(fi + indexj_pos + 29 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 29 * VEC_SIZE + k); - bd = VEC_FMA(wj, VEC_SUB(pos, neg), bd); - pos = LOAD(fi + indexj_pos + 30 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 30 * VEC_SIZE + k); - be = 
VEC_FMA(wj, VEC_SUB(pos, neg), be); - pos = LOAD(fi + indexj_pos + 31 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 31 * VEC_SIZE + k); - be = VEC_FMA(wj, VEC_SUB(pos, neg), be); - } - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); - STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); - STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); - STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); - STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); - STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); - STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9); - STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); - STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); - STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); - STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); - STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); - STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); - STORE(fo + (i << ldf) + 16 * VEC_SIZE + k, b0); - STORE(fo + (i << ldf) + 17 * VEC_SIZE + k, b1); - STORE(fo + (i << ldf) + 18 * VEC_SIZE + k, b2); - STORE(fo + (i << ldf) + 19 * VEC_SIZE + k, b3); - STORE(fo + (i << ldf) + 20 * VEC_SIZE + k, b4); - STORE(fo + (i << ldf) + 21 * VEC_SIZE + k, b5); - STORE(fo + (i << ldf) + 22 * VEC_SIZE + k, b6); - STORE(fo + (i << ldf) + 23 * VEC_SIZE + k, b7); - STORE(fo + (i << ldf) + 24 * VEC_SIZE + k, b8); - STORE(fo + (i << ldf) + 25 * VEC_SIZE + k, b9); - STORE(fo + (i << ldf) + 26 * VEC_SIZE + k, ba); - STORE(fo + (i << ldf) + 27 * VEC_SIZE + k, bb); - STORE(fo + (i << ldf) + 28 * VEC_SIZE + k, bc); - STORE(fo + (i << ldf) + 29 * VEC_SIZE + k, bd); - STORE(fo + (i << ldf) + 30 * VEC_SIZE + k, be); - STORE(fo + (i << ldf) + 31 * VEC_SIZE + k, bf); - } - } -#endif - #if DEPTH >= 16 for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) { ptrdiff_t k; @@ -236,11 +70,7 @@ ac = ad = ae = af = VEC_ZERO; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; register VEC_TYPE pos = LOAD(fi + indexj_pos + k); @@ -324,11 +154,7 @@ a4 = a5 = a6 = a7 = VEC_ZERO; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; register VEC_TYPE pos = LOAD(fi + indexj_pos + k); @@ -377,11 +203,7 @@ a0 = a1 = a2 = a3 = VEC_ZERO; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; register VEC_TYPE pos = LOAD(fi + indexj_pos + k); @@ -414,11 +236,7 @@ a0 = a1 = VEC_ZERO; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; register VEC_TYPE pos = LOAD(fi + indexj_pos + k); @@ -440,11 +258,7 @@ for (k = 0; k < (1 << ldf); k += VEC_SIZE) { register VEC_TYPE a0 = VEC_ZERO; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); 
-#endif ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; register VEC_TYPE pos = LOAD(fi + indexj_pos + k); diff --git a/src/grid/operate_inc.c b/src/grid/operate_inc.c index d9014ac367..2eeb3d0ebf 100644 --- a/src/grid/operate_inc.c +++ b/src/grid/operate_inc.c @@ -29,9 +29,6 @@ { const ptrdiff_t n = opn[0]; const ptrdiff_t nri = opnri[0]; -#if DEPTH >= 32 - const ptrdiff_t unroll32 = max1(32 * VEC_SIZE >> ldf); -#endif #if DEPTH >= 16 const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf); #endif @@ -55,104 +52,6 @@ index = opri + n * l; i = rimap_inv[l]; -#if DEPTH >= 32 - for (; i < (rimap_inv_max[l] - unroll32 + 1); i += unroll32) { - ptrdiff_t k; - for (k = 0; k < (1 << ldf); k += 32 * VEC_SIZE) { - register VEC_TYPE a0, a1, a2, a3; - register VEC_TYPE a4, a5, a6, a7; - register VEC_TYPE a8, a9, aa, ab; - register VEC_TYPE ac, ad, ae, af; - register VEC_TYPE b0, b1, b2, b3; - register VEC_TYPE b4, b5, b6, b7; - register VEC_TYPE b8, b9, ba, bb; - register VEC_TYPE bc, bd, be, bf; - - a0 = a1 = a2 = a3 = VEC_ZERO; - a4 = a5 = a6 = a7 = VEC_ZERO; - a8 = a9 = aa = ab = VEC_ZERO; - ac = ad = ae = af = VEC_ZERO; - b0 = b1 = b2 = b3 = VEC_ZERO; - b4 = b5 = b6 = b7 = VEC_ZERO; - b8 = b9 = ba = bb = VEC_ZERO; - bc = bd = be = bf = VEC_ZERO; - - for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else - register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif - ptrdiff_t indexj = (index[j] + i) << ldf; - a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); - a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1); - a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2); - a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3); - a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4); - a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5); - a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6); - a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7); - a8 = VEC_FMA(wj, LOAD(fi + indexj + 8 * VEC_SIZE + k), a8); - a9 = VEC_FMA(wj, LOAD(fi + indexj + 9 * VEC_SIZE + k), a9); - aa = VEC_FMA(wj, LOAD(fi + indexj + 10 * VEC_SIZE + k), aa); - ab = VEC_FMA(wj, LOAD(fi + indexj + 11 * VEC_SIZE + k), ab); - ac = VEC_FMA(wj, LOAD(fi + indexj + 12 * VEC_SIZE + k), ac); - ad = VEC_FMA(wj, LOAD(fi + indexj + 13 * VEC_SIZE + k), ad); - ae = VEC_FMA(wj, LOAD(fi + indexj + 14 * VEC_SIZE + k), ae); - af = VEC_FMA(wj, LOAD(fi + indexj + 15 * VEC_SIZE + k), af); - b0 = VEC_FMA(wj, LOAD(fi + indexj + 16 * VEC_SIZE + k), b0); - b1 = VEC_FMA(wj, LOAD(fi + indexj + 17 * VEC_SIZE + k), b1); - b2 = VEC_FMA(wj, LOAD(fi + indexj + 18 * VEC_SIZE + k), b2); - b3 = VEC_FMA(wj, LOAD(fi + indexj + 19 * VEC_SIZE + k), b3); - b4 = VEC_FMA(wj, LOAD(fi + indexj + 20 * VEC_SIZE + k), b4); - b5 = VEC_FMA(wj, LOAD(fi + indexj + 21 * VEC_SIZE + k), b5); - b6 = VEC_FMA(wj, LOAD(fi + indexj + 22 * VEC_SIZE + k), b6); - b7 = VEC_FMA(wj, LOAD(fi + indexj + 23 * VEC_SIZE + k), b7); - b8 = VEC_FMA(wj, LOAD(fi + indexj + 24 * VEC_SIZE + k), b8); - b9 = VEC_FMA(wj, LOAD(fi + indexj + 25 * VEC_SIZE + k), b9); - ba = VEC_FMA(wj, LOAD(fi + indexj + 26 * VEC_SIZE + k), ba); - bb = VEC_FMA(wj, LOAD(fi + indexj + 27 * VEC_SIZE + k), bb); - bc = VEC_FMA(wj, LOAD(fi + indexj + 28 * VEC_SIZE + k), bc); - bd = VEC_FMA(wj, LOAD(fi + indexj + 29 * VEC_SIZE + k), bd); - be = VEC_FMA(wj, LOAD(fi + indexj + 30 * VEC_SIZE + k), be); - bf = VEC_FMA(wj, LOAD(fi + indexj + 31 * VEC_SIZE + k), bf); - } - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << 
ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); - STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); - STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); - STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); - STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); - STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); - STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9); - STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); - STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); - STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); - STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); - STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); - STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); - STORE(fo + (i << ldf) + 16 * VEC_SIZE + k, b0); - STORE(fo + (i << ldf) + 17 * VEC_SIZE + k, b1); - STORE(fo + (i << ldf) + 18 * VEC_SIZE + k, b2); - STORE(fo + (i << ldf) + 19 * VEC_SIZE + k, b3); - STORE(fo + (i << ldf) + 20 * VEC_SIZE + k, b4); - STORE(fo + (i << ldf) + 21 * VEC_SIZE + k, b5); - STORE(fo + (i << ldf) + 22 * VEC_SIZE + k, b6); - STORE(fo + (i << ldf) + 23 * VEC_SIZE + k, b7); - STORE(fo + (i << ldf) + 24 * VEC_SIZE + k, b8); - STORE(fo + (i << ldf) + 25 * VEC_SIZE + k, b9); - STORE(fo + (i << ldf) + 26 * VEC_SIZE + k, ba); - STORE(fo + (i << ldf) + 27 * VEC_SIZE + k, bb); - STORE(fo + (i << ldf) + 28 * VEC_SIZE + k, bc); - STORE(fo + (i << ldf) + 29 * VEC_SIZE + k, bd); - STORE(fo + (i << ldf) + 30 * VEC_SIZE + k, be); - STORE(fo + (i << ldf) + 31 * VEC_SIZE + k, bf); - } - } -#endif - #if DEPTH >= 16 for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) { ptrdiff_t k; @@ -168,11 +67,7 @@ ac = ad = ae = af = VEC_ZERO; for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif ptrdiff_t indexj = (index[j] + i) << ldf; a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1); @@ -222,11 +117,7 @@ a4 = a5 = a6 = a7 = VEC_ZERO; for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif ptrdiff_t indexj = (index[j] + i) << ldf; a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1); @@ -258,11 +149,7 @@ a0 = a1 = a2 = a3 = VEC_ZERO; for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif ptrdiff_t indexj = (index[j] + i) << ldf; a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1); @@ -286,11 +173,7 @@ a0 = a1 = VEC_ZERO; for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif ptrdiff_t indexj = (index[j] + i) << ldf; a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1); @@ -307,11 +190,7 @@ for (k = 0; k < (1 << ldf); k += VEC_SIZE) { register VEC_TYPE a0 = VEC_ZERO; for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif ptrdiff_t indexj = (index[j] + i) << ldf; a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); } diff --git a/src/grid/operate_sym_inc.c b/src/grid/operate_sym_inc.c index 93740d5de5..fc5ba4ac88 100644 --- a/src/grid/operate_sym_inc.c +++ b/src/grid/operate_sym_inc.c @@ -30,9 +30,6 @@ { const ptrdiff_t npairs = opn[0]; 
const ptrdiff_t nri = opnri[0]; -#if DEPTH >= 32 - const ptrdiff_t unroll32 = max1(32 * VEC_SIZE >> ldf); -#endif #if DEPTH >= 16 const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf); #endif @@ -61,206 +58,6 @@ index_neg = opri_neg + npairs * l; i = rimap_inv[l]; -#if DEPTH >= 32 - for (; i < (rimap_inv_max[l] - unroll32 + 1); i += unroll32) { - ptrdiff_t k; - for (k = 0; k < (1 << ldf); k += 32 * VEC_SIZE) { - register VEC_TYPE a0, a1, a2, a3; - register VEC_TYPE a4, a5, a6, a7; - register VEC_TYPE a8, a9, aa, ab; - register VEC_TYPE ac, ad, ae, af; - register VEC_TYPE b0, b1, b2, b3; - register VEC_TYPE b4, b5, b6, b7; - register VEC_TYPE b8, b9, ba, bb; - register VEC_TYPE bc, bd, be, bf; - - a0 = a1 = a2 = a3 = VEC_ZERO; - a4 = a5 = a6 = a7 = VEC_ZERO; - a8 = a9 = aa = ab = VEC_ZERO; - ac = ad = ae = af = VEC_ZERO; - b0 = b1 = b2 = b3 = VEC_ZERO; - b4 = b5 = b6 = b7 = VEC_ZERO; - b8 = b9 = ba = bb = VEC_ZERO; - bc = bd = be = bf = VEC_ZERO; - - const ptrdiff_t base_ik = (ptrdiff_t) i << ldf; + k; - - for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else - register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif - ptrdiff_t indexj_pos = index_pos[j] + base_ik; - ptrdiff_t indexj_neg = index_neg[j] + base_ik; - register VEC_TYPE pos = LOAD(fi + indexj_pos); - register VEC_TYPE neg = LOAD(fi + indexj_neg); - a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); - a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); - pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); - a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); - pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); - a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); - pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE); - a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4); - pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE); - a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5); - pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE); - a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6); - pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE); - a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7); - pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE); - a8 = VEC_FMA(wj, VEC_ADD(pos, neg), a8); - pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE); - a9 = VEC_FMA(wj, VEC_ADD(pos, neg), a9); - pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE); - aa = VEC_FMA(wj, VEC_ADD(pos, neg), aa); - pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE); - ab = VEC_FMA(wj, VEC_ADD(pos, neg), ab); - pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE); - ac = VEC_FMA(wj, VEC_ADD(pos, neg), ac); - pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE); - ad = VEC_FMA(wj, VEC_ADD(pos, neg), ad); - pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE); - ae = VEC_FMA(wj, VEC_ADD(pos, neg), ae); - pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE); - af = VEC_FMA(wj, VEC_ADD(pos, neg), af); - pos = LOAD(fi + indexj_pos + 16 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 16 * VEC_SIZE); - 
b0 = VEC_FMA(wj, VEC_ADD(pos, neg), b0); - pos = LOAD(fi + indexj_pos + 17 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 17 * VEC_SIZE); - b1 = VEC_FMA(wj, VEC_ADD(pos, neg), b1); - pos = LOAD(fi + indexj_pos + 18 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 18 * VEC_SIZE); - b2 = VEC_FMA(wj, VEC_ADD(pos, neg), b2); - pos = LOAD(fi + indexj_pos + 19 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 19 * VEC_SIZE); - b3 = VEC_FMA(wj, VEC_ADD(pos, neg), b3); - pos = LOAD(fi + indexj_pos + 20 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 20 * VEC_SIZE); - b4 = VEC_FMA(wj, VEC_ADD(pos, neg), b4); - pos = LOAD(fi + indexj_pos + 21 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 21 * VEC_SIZE); - b5 = VEC_FMA(wj, VEC_ADD(pos, neg), b5); - pos = LOAD(fi + indexj_pos + 22 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 22 * VEC_SIZE); - b6 = VEC_FMA(wj, VEC_ADD(pos, neg), b6); - pos = LOAD(fi + indexj_pos + 23 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 23 * VEC_SIZE); - b7 = VEC_FMA(wj, VEC_ADD(pos, neg), b7); - pos = LOAD(fi + indexj_pos + 24 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 24 * VEC_SIZE); - b8 = VEC_FMA(wj, VEC_ADD(pos, neg), b8); - pos = LOAD(fi + indexj_pos + 25 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 25 * VEC_SIZE); - b9 = VEC_FMA(wj, VEC_ADD(pos, neg), b9); - pos = LOAD(fi + indexj_pos + 26 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 26 * VEC_SIZE); - ba = VEC_FMA(wj, VEC_ADD(pos, neg), ba); - pos = LOAD(fi + indexj_pos + 27 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 27 * VEC_SIZE); - bb = VEC_FMA(wj, VEC_ADD(pos, neg), bb); - pos = LOAD(fi + indexj_pos + 28 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 28 * VEC_SIZE); - bc = VEC_FMA(wj, VEC_ADD(pos, neg), bc); - pos = LOAD(fi + indexj_pos + 29 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 29 * VEC_SIZE); - bd = VEC_FMA(wj, VEC_ADD(pos, neg), bd); - pos = LOAD(fi + indexj_pos + 30 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 30 * VEC_SIZE); - be = VEC_FMA(wj, VEC_ADD(pos, neg), be); - pos = LOAD(fi + indexj_pos + 31 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 31 * VEC_SIZE); - be = VEC_FMA(wj, VEC_ADD(pos, neg), be); - } - const VEC_TYPE wj = VEC_SCAL(w0); - a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); - a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1); - a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2); - a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3); - a4 = VEC_FMA(wj, LOAD(fi + base_ik + 4 * VEC_SIZE), a4); - a5 = VEC_FMA(wj, LOAD(fi + base_ik + 5 * VEC_SIZE), a5); - a6 = VEC_FMA(wj, LOAD(fi + base_ik + 6 * VEC_SIZE), a6); - a7 = VEC_FMA(wj, LOAD(fi + base_ik + 7 * VEC_SIZE), a7); - a8 = VEC_FMA(wj, LOAD(fi + base_ik + 8 * VEC_SIZE), a8); - a9 = VEC_FMA(wj, LOAD(fi + base_ik + 9 * VEC_SIZE), a9); - aa = VEC_FMA(wj, LOAD(fi + base_ik + 10 * VEC_SIZE), aa); - ab = VEC_FMA(wj, LOAD(fi + base_ik + 11 * VEC_SIZE), ab); - ac = VEC_FMA(wj, LOAD(fi + base_ik + 12 * VEC_SIZE), ac); - ad = VEC_FMA(wj, LOAD(fi + base_ik + 13 * VEC_SIZE), ad); - ae = VEC_FMA(wj, LOAD(fi + base_ik + 14 * VEC_SIZE), ae); - af = VEC_FMA(wj, LOAD(fi + base_ik + 15 * VEC_SIZE), af); - b0 = VEC_FMA(wj, LOAD(fi + base_ik + 16 * VEC_SIZE), b0); - b1 = VEC_FMA(wj, LOAD(fi + base_ik + 17 * VEC_SIZE), b1); - b2 = VEC_FMA(wj, LOAD(fi + base_ik + 18 * VEC_SIZE), b2); - b3 = VEC_FMA(wj, LOAD(fi + base_ik + 19 * VEC_SIZE), b3); - b4 = VEC_FMA(wj, LOAD(fi + base_ik + 20 * VEC_SIZE), b4); - b5 = VEC_FMA(wj, LOAD(fi + base_ik + 21 * VEC_SIZE), b5); - b6 = VEC_FMA(wj, LOAD(fi + base_ik + 22 * VEC_SIZE), b6); - b7 = VEC_FMA(wj, LOAD(fi + base_ik + 23 * VEC_SIZE), b7); - 
b8 = VEC_FMA(wj, LOAD(fi + base_ik + 24 * VEC_SIZE), b8); - b9 = VEC_FMA(wj, LOAD(fi + base_ik + 25 * VEC_SIZE), b9); - ba = VEC_FMA(wj, LOAD(fi + base_ik + 26 * VEC_SIZE), ba); - bb = VEC_FMA(wj, LOAD(fi + base_ik + 27 * VEC_SIZE), bb); - bc = VEC_FMA(wj, LOAD(fi + base_ik + 28 * VEC_SIZE), bc); - bd = VEC_FMA(wj, LOAD(fi + base_ik + 29 * VEC_SIZE), bd); - be = VEC_FMA(wj, LOAD(fi + base_ik + 30 * VEC_SIZE), be); - bf = VEC_FMA(wj, LOAD(fi + base_ik + 31 * VEC_SIZE), bf); - - - STORE(fo + base_ik, a0); - STORE(fo + base_ik + 1 * VEC_SIZE, a1); - STORE(fo + base_ik + 2 * VEC_SIZE, a2); - STORE(fo + base_ik + 3 * VEC_SIZE, a3); - STORE(fo + base_ik + 4 * VEC_SIZE, a4); - STORE(fo + base_ik + 5 * VEC_SIZE, a5); - STORE(fo + base_ik + 6 * VEC_SIZE, a6); - STORE(fo + base_ik + 7 * VEC_SIZE, a7); - STORE(fo + base_ik + 8 * VEC_SIZE, a8); - STORE(fo + base_ik + 9 * VEC_SIZE, a9); - STORE(fo + base_ik + 10 * VEC_SIZE, aa); - STORE(fo + base_ik + 11 * VEC_SIZE, ab); - STORE(fo + base_ik + 12 * VEC_SIZE, ac); - STORE(fo + base_ik + 13 * VEC_SIZE, ad); - STORE(fo + base_ik + 14 * VEC_SIZE, ae); - STORE(fo + base_ik + 15 * VEC_SIZE, af); - STORE(fo + base_ik + 16 * VEC_SIZE, b0); - STORE(fo + base_ik + 17 * VEC_SIZE, b1); - STORE(fo + base_ik + 18 * VEC_SIZE, b2); - STORE(fo + base_ik + 19 * VEC_SIZE, b3); - STORE(fo + base_ik + 20 * VEC_SIZE, b4); - STORE(fo + base_ik + 21 * VEC_SIZE, b5); - STORE(fo + base_ik + 22 * VEC_SIZE, b6); - STORE(fo + base_ik + 23 * VEC_SIZE, b7); - STORE(fo + base_ik + 24 * VEC_SIZE, b8); - STORE(fo + base_ik + 25 * VEC_SIZE, b9); - STORE(fo + base_ik + 26 * VEC_SIZE, ba); - STORE(fo + base_ik + 27 * VEC_SIZE, bb); - STORE(fo + base_ik + 28 * VEC_SIZE, bc); - STORE(fo + base_ik + 29 * VEC_SIZE, bd); - STORE(fo + base_ik + 30 * VEC_SIZE, be); - STORE(fo + base_ik + 31 * VEC_SIZE, bf); - } - } -#endif - #if DEPTH >= 16 for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) { ptrdiff_t k; @@ -278,11 +75,7 @@ const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif const ptrdiff_t indexj_pos = index_pos[j] + base_ik; const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); @@ -386,11 +179,7 @@ const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif const ptrdiff_t indexj_pos = index_pos[j] + base_ik; const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); @@ -450,11 +239,7 @@ const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif const ptrdiff_t indexj_pos = index_pos[j] + base_ik; const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); @@ -494,11 +279,7 @@ const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif const ptrdiff_t indexj_pos = index_pos[j] + base_ik; const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); @@ -524,11 +305,7 @@ register VEC_TYPE a0 = VEC_ZERO; const 
ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif const ptrdiff_t indexj_pos = index_pos[j] + base_ik; const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); -- GitLab From ade83dc39b0017add50130ace02c946f9699b901 Mon Sep 17 00:00:00 2001 From: NicolasTD Date: Wed, 17 Dec 2025 14:53:22 +0100 Subject: [PATCH 09/11] Adding parentheses for safety of the macros. --- src/include/vectors.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/include/vectors.h b/src/include/vectors.h index 8927198068..8f735020a6 100644 --- a/src/include/vectors.h +++ b/src/include/vectors.h @@ -124,10 +124,10 @@ #define VEC_LDU(addr) VEC_LD(addr) #define VEC_ST(addr, vec) (addr)[0] = vec #define VEC_STU(addr, vec) VEC_ST(addr, vec) -#define VEC_FMA(aa, bb, cc) aa * (bb) + cc +#define VEC_FMA(aa, bb, cc) ((aa) * (bb) + (cc)) #define VEC_SCAL(aa) aa -#define VEC_ADD(aa, bb) (aa+bb) -#define VEC_SUB(aa, bb) (aa-bb) +#define VEC_ADD(aa, bb) ((aa)+(bb)) +#define VEC_SUB(aa, bb) ((aa)-(bb)) #define VEC_ZERO 0.0 #define DEPTH 8 -- GitLab From 08834e935af76c79a0bd448194e986eed9ae930d Mon Sep 17 00:00:00 2001 From: NicolasTD Date: Wed, 17 Dec 2025 16:09:57 +0100 Subject: [PATCH 10/11] Use the precomputed offsets for the antisymmetric operator too. --- src/grid/nl_operator.F90 | 101 +++++++------- src/grid/nl_operator_inc.F90 | 11 +- src/grid/operate_antisym_inc.c | 235 ++++++++++++++++++--------------- 3 files changed, 180 insertions(+), 167 deletions(-) diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index 4b5c52635c..71054ef80f 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -225,7 +225,7 @@ contains !%End call parse_variable(namespace, 'OperateUseSymmetries', .false., use_symmetries) if (accel_is_enabled() .and. use_symmetries) then - call messages_not_implemented("OperateUseSymmetries=yes with GPUs") + call messages_not_implemented("OperateUseSymmetries=yes with GPUs") end if if (accel_is_enabled()) then @@ -900,15 +900,16 @@ contains end subroutine group_by_pairs_sym !> Take a list of weights and offsets and build pairs of symmetric points with common weights - subroutine group_by_pairs_antisym(size, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg) + subroutine group_by_pairs_antisym(size, ldf, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg) integer, intent(in) :: size + integer, intent(in) :: ldf integer, intent(in) :: offsets(:, :) real(real64), intent(in) :: wre(:) integer, intent(in) :: ri(:, :) integer, intent(in) :: nri - integer, intent(out) :: npairs - real(real64), allocatable, intent(out) :: wpair(:) - integer, allocatable, intent(out) :: pair_pos(:,:,:), pair_neg(:,:,:) + integer, intent(out) :: npairs + real(real64), intent(inout) :: wpair(:) + integer, intent(inout) :: pair_pos(:,:), pair_neg(:,:) logical, allocatable :: used(:) integer :: i, j, ndim, s @@ -934,10 +935,6 @@ contains call sort(abs_wre, idx) !
Max pairs = n/2 - SAFE_ALLOCATE(pair_pos(1:size/2, 1:nri, 1:1)) - SAFE_ALLOCATE(pair_neg(1:size/2, 1:nri, 1:1)) - SAFE_ALLOCATE(wpair(1:size/2)) - do i = 1, size if (used(i)) cycle @@ -954,8 +951,8 @@ contains npairs = npairs + 1 do s = 1, nri - pair_pos(npairs, s, 1) = ri(idx(i), s) - pair_neg(npairs, s, 1) = ri(idx(j), s) + pair_pos(npairs, s) = ishft(ri(idx(i), s), ldf) + pair_neg(npairs, s) = ishft(ri(idx(j), s), ldf) end do wpair(npairs) = M_HALF*(wre(idx(i)) - wre(idx(j))) @@ -979,44 +976,44 @@ contains integer :: ldf, start, end + if (op%symmetry == OP_GENERAL) return + PUSH_SUB(nl_operator_build_symmetric_weights) ASSERT(op%const_w) ASSERT(conf%target_states_block_size > 0) - - select case(op%symmetry) - case(OP_SYMMETRIC) - if(present(max_size)) then - start = op%max_allocated_ri_pair + 1 - end = max_size - call reallocate_array(op%ri_pos, op%stencil%size/2, op%nri, op%max_allocated_ri_pair, end) - call reallocate_array(op%ri_neg, op%stencil%size/2, op%nri, op%max_allocated_ri_pair, end) - if (op%mesh%parallel_in_domains) then - call reallocate_array(op%inner%ri_pos, op%stencil%size/2, op%inner%nri, op%max_allocated_ri_pair, end) - call reallocate_array(op%inner%ri_neg, op%stencil%size/2, op%inner%nri, op%max_allocated_ri_pair, end) - call reallocate_array(op%outer%ri_pos, op%stencil%size/2, op%outer%nri, op%max_allocated_ri_pair, end) - call reallocate_array(op%outer%ri_neg, op%stencil%size/2, op%outer%nri, op%max_allocated_ri_pair, end) - end if - else - SAFE_ALLOCATE(op%wpair(1:op%stencil%size/2)) - start = 1 - end = log2(conf%target_states_block_size)+1 - - SAFE_ALLOCATE(op%ri_pos(1:op%stencil%size/2, 1:op%nri, 1:end)) - SAFE_ALLOCATE(op%ri_neg(1:op%stencil%size/2, 1:op%nri, 1:end)) - if (op%mesh%parallel_in_domains) then - SAFE_ALLOCATE(op%inner%ri_pos(1:op%stencil%size/2, 1:op%inner%nri, 1:end)) - SAFE_ALLOCATE(op%inner%ri_neg(1:op%stencil%size/2, 1:op%inner%nri, 1:end)) - SAFE_ALLOCATE(op%outer%ri_pos(1:op%stencil%size/2, 1:op%outer%nri, 1:end)) - SAFE_ALLOCATE(op%outer%ri_neg(1:op%stencil%size/2, 1:op%outer%nri, 1:end)) - end if + if(present(max_size)) then + start = op%max_allocated_ri_pair + 1 + end = max_size + call reallocate_array(op%ri_pos, op%stencil%size/2, op%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%ri_neg, op%stencil%size/2, op%nri, op%max_allocated_ri_pair, end) + if (op%mesh%parallel_in_domains) then + call reallocate_array(op%inner%ri_pos, op%stencil%size/2, op%inner%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%inner%ri_neg, op%stencil%size/2, op%inner%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%outer%ri_pos, op%stencil%size/2, op%outer%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%outer%ri_neg, op%stencil%size/2, op%outer%nri, op%max_allocated_ri_pair, end) end if - op%max_allocated_ri_pair = end + else ! 
Max pairs = n/2 + SAFE_ALLOCATE(op%wpair(1:op%stencil%size/2)) + start = 1 + end = log2(conf%target_states_block_size)+1 - op%max_allocated_ri_pair = end - do ldf = start-1, end-1 + SAFE_ALLOCATE(op%ri_pos(1:op%stencil%size/2, 1:op%nri, 1:end)) + SAFE_ALLOCATE(op%ri_neg(1:op%stencil%size/2, 1:op%nri, 1:end)) + if (op%mesh%parallel_in_domains) then + SAFE_ALLOCATE(op%inner%ri_pos(1:op%stencil%size/2, 1:op%inner%nri, 1:end)) + SAFE_ALLOCATE(op%inner%ri_neg(1:op%stencil%size/2, 1:op%inner%nri, 1:end)) + SAFE_ALLOCATE(op%outer%ri_pos(1:op%stencil%size/2, 1:op%outer%nri, 1:end)) + SAFE_ALLOCATE(op%outer%ri_neg(1:op%stencil%size/2, 1:op%outer%nri, 1:end)) + end if + end if + op%max_allocated_ri_pair = end + + do ldf = start-1, end-1 + select case(op%symmetry) + case(OP_SYMMETRIC) call group_by_pairs_sym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%ri, op%nri, & op%npairs, op%wpair, op%ri_pos(:,:,ldf+1), op%ri_neg(:,:,ldf+1), op%wcenter) if (op%mesh%parallel_in_domains) then @@ -1025,17 +1022,17 @@ contains call group_by_pairs_sym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & op%npairs, op%wpair, op%outer%ri_pos(:,:,ldf+1), op%outer%ri_neg(:,:,ldf+1), op%wcenter) end if - end do - case(OP_ANTISYMMETRIC) - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%ri, op%nri, & - op%npairs, op%wpair, op%ri_pos, op%ri_neg) - if (op%mesh%parallel_in_domains) then - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & - op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg) - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & - op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg) - end if - end select + case(OP_ANTISYMMETRIC) + call group_by_pairs_antisym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%ri, op%nri, & + op%npairs, op%wpair, op%ri_pos(:,:,ldf+1), op%ri_neg(:,:,ldf+1)) + if (op%mesh%parallel_in_domains) then + call group_by_pairs_antisym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & + op%npairs, op%wpair, op%inner%ri_pos(:,:,ldf+1), op%inner%ri_neg(:,:,ldf+1)) + call group_by_pairs_antisym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & + op%npairs, op%wpair, op%outer%ri_pos(:,:,ldf+1), op%outer%ri_neg(:,:,ldf+1)) + end if + end select + end do POP_SUB(nl_operator_build_symmetric_weights) end subroutine nl_operator_build_symmetric_weights diff --git a/src/grid/nl_operator_inc.F90 b/src/grid/nl_operator_inc.F90 index 39eada0cdf..3229904da8 100644 --- a/src/grid/nl_operator_inc.F90 +++ b/src/grid/nl_operator_inc.F90 @@ -183,12 +183,11 @@ contains ldf = log2(int(fi%pack_size_real(1), int32))+1 end if - if (op%symmetry == OP_SYMMETRIC) then - if (ldf > op%max_allocated_ri_pair) then - call nl_operator_build_symmetric_weights(op, ldf) - end if - else - ldf = 1 + if (X(function_global) == OP_FORTRAN .or. op%symmetry == OP_GENERAL) ldf = 1 + + ! If needed, we realloc the offsets + if (op%symmetry /= OP_GENERAL .and. 
ldf > op%max_allocated_ri_pair) then + call nl_operator_build_symmetric_weights(op, ldf) end if select case (points_) diff --git a/src/grid/operate_antisym_inc.c b/src/grid/operate_antisym_inc.c index 8ab6f9fdfe..efa735b6a8 100644 --- a/src/grid/operate_antisym_inc.c +++ b/src/grid/operate_antisym_inc.c @@ -69,76 +69,78 @@ a8 = a9 = aa = ab = VEC_ZERO; ac = ad = ae = af = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { register VEC_TYPE wj = VEC_SCAL(wpair[j]); - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); - pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); - pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); - pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE); a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4); - pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE); a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5); - pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE); a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6); - pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE); a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7); - pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE); a8 = VEC_FMA(wj, VEC_SUB(pos, neg), a8); - pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE); a9 = VEC_FMA(wj, VEC_SUB(pos, neg), a9); - pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE); aa = VEC_FMA(wj, VEC_SUB(pos, neg), aa); - pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE); ab = VEC_FMA(wj, VEC_SUB(pos, neg), ab); - pos = 
LOAD(fi + indexj_pos + 12 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE); ac = VEC_FMA(wj, VEC_SUB(pos, neg), ac); - pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE); ad = VEC_FMA(wj, VEC_SUB(pos, neg), ad); - pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE); ae = VEC_FMA(wj, VEC_SUB(pos, neg), ae); - pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE); af = VEC_FMA(wj, VEC_SUB(pos, neg), af); } - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); - STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); - STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); - STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); - STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); - STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); - STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9); - STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); - STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); - STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); - STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); - STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); - STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); + STORE(fo + base_ik + 4 * VEC_SIZE, a4); + STORE(fo + base_ik + 5 * VEC_SIZE, a5); + STORE(fo + base_ik + 6 * VEC_SIZE, a6); + STORE(fo + base_ik + 7 * VEC_SIZE, a7); + STORE(fo + base_ik + 8 * VEC_SIZE, a8); + STORE(fo + base_ik + 9 * VEC_SIZE, a9); + STORE(fo + base_ik + 10 * VEC_SIZE, aa); + STORE(fo + base_ik + 11 * VEC_SIZE, ab); + STORE(fo + base_ik + 12 * VEC_SIZE, ac); + STORE(fo + base_ik + 13 * VEC_SIZE, ad); + STORE(fo + base_ik + 14 * VEC_SIZE, ae); + STORE(fo + base_ik + 15 * VEC_SIZE, af); } } #endif @@ -153,43 +155,45 @@ a0 = a1 = a2 = a3 = VEC_ZERO; a4 = a5 = a6 = a7 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { register VEC_TYPE wj = VEC_SCAL(wpair[j]); - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); - pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); - pos = LOAD(fi + 
indexj_pos + 3 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); - pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE); a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4); - pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE); a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5); - pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE); a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6); - pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE); a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7); } - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); - STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); - STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); - STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); - STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); + STORE(fo + base_ik + 4 * VEC_SIZE, a4); + STORE(fo + base_ik + 5 * VEC_SIZE, a5); + STORE(fo + base_ik + 6 * VEC_SIZE, a6); + STORE(fo + base_ik + 7 * VEC_SIZE, a7); } } #endif @@ -202,27 +206,29 @@ a0 = a1 = a2 = a3 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { register VEC_TYPE wj = VEC_SCAL(wpair[j]); - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); - pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); - pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); } - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); } } #endif @@ 
-235,19 +241,21 @@ a0 = a1 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { register VEC_TYPE wj = VEC_SCAL(wpair[j]); - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); } - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); } } #endif @@ -257,27 +265,36 @@ ptrdiff_t k; for (k = 0; k < (1 << ldf); k += VEC_SIZE) { register VEC_TYPE a0 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; for (j = 0; j < npairs; j++) { register VEC_TYPE wj = VEC_SCAL(wpair[j]); - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); } - STORE(fo + (i << ldf) + k, a0); + STORE(fo + base_ik, a0); } } #endif #if VEC_SIZE > 1 + + const ptrdiff_t size = (ptrdiff_t) 1 << ldf; + double a; + for (; i < rimap_inv_max[l]; i++) { - ptrdiff_t k; - for (k = 0; k < (1 << ldf); k++) { - double a = 0.0; - for (j = 0; j < npairs; j++) - a += wpair[j] * ( fi[((index_pos[j] + i) << ldf) + k] - fi[((index_neg[j] + i) << ldf) + k]); - fo[(i << ldf) + k] = a; + for (ptrdiff_t k = 0; k < size; k++) { + a = 0.0; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + const double diff = fi[indexj_pos] - fi[indexj_neg]; + a += wpair[j] * diff; + } + fo[base_ik] = a; } } #endif -- GitLab From 6c13b21267b99c97f64e94b79feb663a5c505591 Mon Sep 17 00:00:00 2001 From: NicolasTD Date: Wed, 17 Dec 2025 17:03:22 +0100 Subject: [PATCH 11/11] Implement a robust algorithm for the sorting of the points of the stencils --- src/basic/sort.F90 | 117 ++++++++++++++++++++++++++++++++++- src/grid/derivatives.F90 | 2 +- src/grid/nl_operator.F90 | 24 +++---- src/grid/nl_operator_inc.F90 | 18 ++++-- src/main/test.F90 | 4 +- 5 files changed, 145 insertions(+), 20 deletions(-) diff --git a/src/basic/sort.F90 b/src/basic/sort.F90 index 22add9d042..3d833f7683 100644 --- a/src/basic/sort.F90 +++ b/src/basic/sort.F90 @@ -29,7 +29,8 @@ module sort_oct_m private public :: & - sort + sort, & + robbust_sort_by_abs ! ------------------------------------------------------------------------------ !> This is the common interface to a sorting routine.
@@ -171,6 +172,120 @@ contains end subroutine lsort + !>@brief Sorting criterion for the robust sorting below + pure logical function less_idx(i, j, off, kabs, ksgn) result(less) + integer, intent(in) :: i, j + integer, intent(in) :: off(:, :) ! (ndim, n) + integer(int64), intent(in) :: kabs(:), ksgn(:) + integer :: d, ndim + + if (kabs(i) /= kabs(j)) then + less = kabs(i) < kabs(j) + return + end if + + if (ksgn(i) /= ksgn(j)) then + less = ksgn(i) < ksgn(j) ! 0 before 1 + return + end if + + ndim = size(off, 1) + do d = 1, ndim + if (off(d,i) /= off(d,j)) then + less = off(d,i) < off(d,j) + return + end if + end do + + less = i < j ! final stability (should not matter if offsets are unique) + end function less_idx + + !>@brief Performs the merge sort of the permutation array + recursive subroutine mergesort_perm(perm, tmp, l, r, off, kabs, ksgn) + integer, intent(inout) :: perm(:), tmp(:) + integer, intent(in) :: l, r + integer, intent(in) :: off(:, :) + integer(int64), intent(in) :: kabs(:), ksgn(:) + + integer :: m, i, j, k + + if (r <= l) return + m = (l + r) / 2 + + call mergesort_perm(perm, tmp, l, m, off, kabs, ksgn) + call mergesort_perm(perm, tmp, m+1, r, off, kabs, ksgn) + + i = l; j = m+1; k = l + do while (i <= m .and. j <= r) + if (less_idx(perm(i), perm(j), off, kabs, ksgn)) then + tmp(k) = perm(i); i = i+1 + else + tmp(k) = perm(j); j = j+1 + end if + k = k+1 + end do + do while (i <= m); tmp(k) = perm(i); i=i+1; k=k+1; end do + do while (j <= r); tmp(k) = perm(j); j=j+1; k=k+1; end do + + perm(l:r) = tmp(l:r) + end subroutine mergesort_perm + + !>@brief Robust sorting of floating-point numbers by absolute value + !! + !! This works as follows: + !! - Convert the floating-point numbers to integers + !! - Put negative numbers first + !! - Use the integer offset values, assumed to be a unique key, to order the degenerate points + subroutine robbust_sort_by_abs(v, off, perm, negative_first) + real(real64), intent(in) :: v(:) + integer, intent(in) :: off(:, :) ! (ndim, n) + integer, intent(out) :: perm(size(v)) + logical, intent(in), optional :: negative_first + + integer :: n, i + logical :: neg_first + integer, allocatable :: tmp(:) + integer(int64), allocatable :: kabs(:), ksgn(:) + real(real64) :: eps, ratio, max_ratio + integer(int64) :: kmax + + PUSH_SUB(robbust_sort_by_abs) + + n = size(v) + ASSERT(size(off, dim=2) == n) + + neg_first = optional_default(negative_first, .true.) + + ! Tolerance for the quantization, to eliminate noise + eps = 1.0e-14_real64 * maxval(abs(v)) + + kmax = huge(0_int64) + max_ratio = real(kmax, real64) - M_ONE + + allocate(tmp(n), kabs(n), ksgn(n)) + + do i = 1, n + perm(i) = i + ratio = abs(v(i)) / eps + if (ratio < max_ratio) then + kabs(i) = nint( abs(v(i)) / eps, int64 ) + else + kabs(i) = kmax + end if + + if (neg_first) then + ksgn(i) = merge(0, 1, v(i) < 0.0_real64) ! neg (0) then pos (1) + else + ksgn(i) = merge(0, 1, v(i) >= 0.0_real64) !
pos (0) then neg (1) + end if + end do + + call mergesort_perm(perm, tmp, 1, n, off, kabs, ksgn) + + deallocate(tmp, kabs, ksgn) + POP_SUB(robbust_sort_by_abs) + end subroutine robbust_sort_by_abs + #include "undef.F90" #include "complex.F90" #include "sort_inc.F90" diff --git a/src/grid/derivatives.F90 b/src/grid/derivatives.F90 index 5d3ff64ee4..1352860a8e 100644 --- a/src/grid/derivatives.F90 +++ b/src/grid/derivatives.F90 @@ -893,10 +893,10 @@ contains if (der%remove_zero_weight_points) then call nl_operator_remove_zero_weight_points(op(i), space, der%mesh) end if + call nl_operator_output_weights(op(i)) if (op(i)%const_w) then call nl_operator_build_symmetric_weights(op(i)) end if - call nl_operator_output_weights(op(i)) end do ! In case of constant weights, we store the weights of the Laplacian on the GPU, as this diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index 71054ef80f..ed6d8be53e 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -838,7 +838,6 @@ contains logical, allocatable :: used(:) integer :: i, j, ndim, s logical :: same - real(real64), allocatable :: abs_wre(:) integer, allocatable :: idx(:) real(real64), parameter :: tol = 1.0e-12_real64 @@ -853,10 +852,8 @@ contains ndim = ubound(offsets, dim=1) - SAFE_ALLOCATE(abs_wre(1:size)) SAFE_ALLOCATE(idx(1:size)) - abs_wre = abs(wre) - call sort(abs_wre, idx) + call robbust_sort_by_abs(wre, offsets, idx) do i = 1, size if (used(i)) cycle @@ -893,7 +890,6 @@ contains ASSERT(npairs == size/2) - SAFE_DEALLOCATE_A(abs_wre) SAFE_DEALLOCATE_A(idx) POP_SUB(group_by_pairs_sym) @@ -914,7 +910,6 @@ contains logical, allocatable :: used(:) integer :: i, j, ndim, s logical :: same - real(real64), allocatable :: abs_wre(:) integer, allocatable :: idx(:) real(real64), parameter :: tol = 1.0e-12_real64 @@ -929,10 +924,8 @@ contains ndim = ubound(offsets, dim=1) - SAFE_ALLOCATE(abs_wre(1:size)) SAFE_ALLOCATE(idx(1:size)) - abs_wre = abs(wre) - call sort(abs_wre, idx) + call robbust_sort_by_abs(wre, offsets, idx) ! Max pairs = n/2 do i = 1, size @@ -964,7 +957,6 @@ contains ASSERT(npairs == size/2) - SAFE_DEALLOCATE_A(abs_wre) SAFE_DEALLOCATE_A(idx) POP_SUB(group_by_pairs_antisym) end subroutine group_by_pairs_antisym @@ -974,7 +966,7 @@ contains type(nl_operator_t), intent(inout) :: op integer, optional, intent(in) :: max_size !< For reallocation - integer :: ldf, start, end + integer :: ldf, start, end, ipair if (op%symmetry == OP_GENERAL) return @@ -1034,6 +1026,16 @@ contains end select end do + if (.not. present(max_size)) then + write(message(1), '(3a)') 'Debug info: Sorted weights for ', trim(op%label), '.' + call messages_info(1, debug_only=.true.) + + do ipair = 1, op%npairs + write(message(1), '(a,i3,f25.10,2(1x,i4))') ' ', ipair, op%wpair(ipair), op%ri_pos(ipair,1,1), op%ri_neg(ipair,1,1) + call messages_info(1, debug_only=.true.) 
+ end do + end if + POP_SUB(nl_operator_build_symmetric_weights) end subroutine nl_operator_build_symmetric_weights diff --git a/src/grid/nl_operator_inc.F90 b/src/grid/nl_operator_inc.F90 index 3229904da8..6ba64178b3 100644 --- a/src/grid/nl_operator_inc.F90 +++ b/src/grid/nl_operator_inc.F90 @@ -196,22 +196,28 @@ contains imin => op%rimap_inv(1:) imax => op%rimap_inv(2:) ri => op%ri - ri_pos => op%ri_pos(:,:,ldf) - ri_neg => op%ri_neg(:,:,ldf) + if (op%symmetry /= OP_GENERAL) then + ri_pos => op%ri_pos(:,:,ldf) + ri_neg => op%ri_neg(:,:,ldf) + end if case (OP_INNER) nri = op%inner%nri imin => op%inner%imin imax => op%inner%imax ri => op%inner%ri - ri_pos => op%inner%ri_pos(:,:,ldf) - ri_neg => op%inner%ri_neg(:,:,ldf) + if (op%symmetry /= OP_GENERAL) then + ri_pos => op%inner%ri_pos(:,:,ldf) + ri_neg => op%inner%ri_neg(:,:,ldf) + end if case (OP_OUTER) nri = op%outer%nri imin => op%outer%imin imax => op%outer%imax ri => op%outer%ri - ri_pos => op%outer%ri_pos(:,:,ldf) - ri_neg => op%outer%ri_neg(:,:,ldf) + if (op%symmetry /= OP_GENERAL) then + ri_pos => op%outer%ri_pos(:,:,ldf) + ri_neg => op%outer%ri_neg(:,:,ldf) + end if case default ASSERT(.false.) end select diff --git a/src/main/test.F90 b/src/main/test.F90 index 0a8d3be31c..dac77f904b 100644 --- a/src/main/test.F90 +++ b/src/main/test.F90 @@ -1304,7 +1304,9 @@ contains call zderivatives_test(sys%gr%der, sys%namespace, param%repetitions, param%min_blocksize, param%max_blocksize) end if - call derivatives_advanced_benchmark(sys%gr%der, sys%namespace) + if (sys%space%dim > 1) then + call derivatives_advanced_benchmark(sys%gr%der, sys%namespace) + end if SAFE_DEALLOCATE_P(sys) -- GitLab
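Taken together, the kernel changes in this series reduce the symmetric stencil application to a per-point recurrence of the form fo[i] = w0*fi[i] + sum over pairs j of wpair[j]*(fi[i + pos_j] + fi[i + neg_j]), with the pairs visited in order of increasing |wpair[j]|. The following is a minimal scalar sketch of that computation in C; the function name, the plain element offsets, and the omission of the ldf state blocking, the unrolling, and the boundary handling are assumptions made for illustration, not the code of the patches.

#include <stddef.h>

/* Scalar sketch of the pairwise symmetric stencil application.
   Visiting the pairs in increasing order of |wpair| keeps the running
   partial sum small, which is the numerical-stability idea of the series. */
void apply_sym_pairs(ptrdiff_t np, ptrdiff_t npairs, double w0,
                     const double *wpair,    /* pair weights, sorted by |wpair| ascending */
                     const ptrdiff_t *pos, const ptrdiff_t *neg, /* mirror-point offsets */
                     const double *fi, double *fo) {
  for (ptrdiff_t i = 0; i < np; i++) {
    double a = 0.0;
    for (ptrdiff_t j = 0; j < npairs; j++)
      a += wpair[j] * (fi[i + pos[j]] + fi[i + neg[j]]); /* one weight per mirror pair */
    fo[i] = a + w0 * fi[i]; /* the center point carries its own weight */
  }
}

The antisymmetric (gradient) kernels are the same loop with fi[i + pos[j]] - fi[i + neg[j]] and no center term, since the central weight of an antisymmetric stencil vanishes.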
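The parenthesization added in PATCH 09 is the standard macro-hygiene fix: without parentheses around each argument, a macro whose argument is itself an expression can expand with the wrong operator precedence. A self-contained C illustration of the failure mode the added parentheses prevent (the _OLD/_NEW suffixes are only for the example):

#include <stdio.h>

/* Old scalar fallback: expands its arguments without protecting them. */
#define VEC_FMA_OLD(aa, bb, cc) aa * (bb) + cc
/* New definition from the patch: every argument parenthesized. */
#define VEC_FMA_NEW(aa, bb, cc) ((aa) * (bb) + (cc))

int main(void) {
  double x = 1.0, y = 2.0, z = 3.0, w = 4.0;
  /* OLD expands to  x + y * (z) + w : the sum x + y is torn apart. */
  printf("old: %g\n", VEC_FMA_OLD(x + y, z, w)); /* prints 11 */
  printf("new: %g\n", VEC_FMA_NEW(x + y, z, w)); /* prints 13, i.e. (x + y)*z + w */
  return 0;
}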
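The ordering that robbust_sort_by_abs establishes in PATCH 11 is deterministic because ties in the quantized magnitude are broken first by the sign and then by the stencil offsets, which are unique per point. Below is a standalone C sketch of an equivalent key-based sort; the struct layout, the fixed three-dimensional offsets, the zero-input guard, and the simple overflow clamp are assumptions made for the example, not the Fortran implementation's API.

#include <stdint.h>
#include <stdlib.h>
#include <math.h>

typedef struct {
  int64_t kabs; /* |w| quantized in units of eps */
  int sgn;      /* 0 = negative weight, 1 = non-negative (negatives first) */
  int off[3];   /* stencil offset, assumed unique per point */
  int idx;      /* original 0-based position */
} sort_key_t;

static int cmp_key(const void *pa, const void *pb) {
  const sort_key_t *a = pa, *b = pb;
  if (a->kabs != b->kabs) return a->kabs < b->kabs ? -1 : 1;
  if (a->sgn != b->sgn) return a->sgn < b->sgn ? -1 : 1;
  for (int d = 0; d < 3; d++)
    if (a->off[d] != b->off[d]) return a->off[d] < b->off[d] ? -1 : 1;
  return 0; /* unreachable when the offsets are unique */
}

/* Fill perm (0-based) with the robust ordering of the n weights v. */
void robust_perm(int n, const double v[], const int off[][3], int perm[]) {
  sort_key_t *key = malloc((size_t)n * sizeof *key);
  if (key == NULL) return;
  double eps = 0.0;
  for (int i = 0; i < n; i++) if (fabs(v[i]) > eps) eps = fabs(v[i]);
  eps = eps > 0.0 ? 1.0e-14 * eps : 1.0; /* quantization tolerance; guard all-zero input */
  for (int i = 0; i < n; i++) {
    double ratio = fabs(v[i]) / eps;
    key[i].kabs = ratio < 9.0e18 ? (int64_t)llround(ratio) : INT64_MAX; /* clamp, as the Fortran does */
    key[i].sgn = v[i] < 0.0 ? 0 : 1;
    for (int d = 0; d < 3; d++) key[i].off[d] = off[i][d];
    key[i].idx = i;
  }
  qsort(key, (size_t)n, sizeof *key, cmp_key);
  for (int i = 0; i < n; i++) perm[i] = key[i].idx;
  free(key);
}

Note that qsort need not be stable here: because the offsets make every key unique, any comparison sort produces the same permutation on every platform, which is the point of the exercise.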