From 54e67cb920535bcde36d25fbafb1334dcb21735e Mon Sep 17 00:00:00 2001
From: NicolasTD
Date: Fri, 12 Dec 2025 15:51:33 +0100
Subject: [PATCH 01/11] Improved numerical stability strategy for the finite
 differences

The logic of this change is to alter the order in which points are summed
over. Instead of summing over points in a random order, the summation is done
by pairs of points, with a sorting of the weights by their absolute values.
Some challenging numerical unit tests are added to reveal the impact of the
changes. In all the tests, the new method seems to reduce the numerical error.
The new method is only implemented in the Fortran operator at the moment.
---
 src/grid/CMakeLists.txt           |   1 +
 src/grid/derivatives.F90          |  15 +-
 src/grid/derivatives_inc.F90      | 295 ----------------------------
 src/grid/derivatives_test.F90     | 219 +++++++++++++++++++++
 src/grid/derivatives_test_inc.F90 | 312 ++++++++++++++++++++++++++++++
 src/grid/nl_operator.F90          | 168 +++++++++++++++-
 src/grid/nl_operator_inc.F90      | 127 +++++++++++-
 src/main/test.F90                 |   3 +
 src/utils/dielectric_function.F90 |  22 ++-
 9 files changed, 849 insertions(+), 313 deletions(-)
 create mode 100644 src/grid/derivatives_test.F90
 create mode 100644 src/grid/derivatives_test_inc.F90

diff --git a/src/grid/CMakeLists.txt b/src/grid/CMakeLists.txt
index 2b277d8025..e2557c2395 100644
--- a/src/grid/CMakeLists.txt
+++ b/src/grid/CMakeLists.txt
@@ -16,6 +16,7 @@ target_sources(Octopus_lib PRIVATE
   curv_gygi.F90
   curv_modine.F90
   derivatives.F90
+  derivatives_test.F90
   fourier_shell.F90
   fourier_space.F90
   grid.F90
diff --git a/src/grid/derivatives.F90 b/src/grid/derivatives.F90
index 6a8c3dcf9f..3392f2425b 100644
--- a/src/grid/derivatives.F90
+++ b/src/grid/derivatives.F90
@@ -42,10 +42,8 @@ module derivatives_oct_m
   use, intrinsic :: iso_fortran_env
   use lalg_basic_oct_m
   use lalg_adv_oct_m
-  use loct_oct_m
   use math_oct_m
   use mesh_oct_m
-  use mesh_function_oct_m
   use messages_oct_m
   use namespace_oct_m
   use nl_operator_oct_m
@@ -64,13 +62,6 @@ module derivatives_oct_m
   use utils_oct_m
   use varinfo_oct_m
 
-! debug purposes
-! use io_binary_oct_m
-! use io_function_oct_m
-! use io_oct_m
-! use unit_oct_m
-! use unit_system_oct_m
-
   implicit none
 
   private
@@ -80,8 +71,6 @@ module derivatives_oct_m
     derivatives_end,            &
     derivatives_build,          &
     derivatives_handle_batch_t, &
-    dderivatives_test,          &
-    zderivatives_test,          &
    dderivatives_batch_start,    &
    zderivatives_batch_start,    &
    dderivatives_batch_finish,   &
@@ -430,7 +419,7 @@ contains
     order_ = optional_default(order, der%order)
 
     ! initialize nl operator
-    call nl_operator_init(lapl, name_)
+    call nl_operator_init(lapl, name_, symm=OP_SYMMETRIC)
 
     ! create stencil
     select case (der%stencil_type)
@@ -484,7 +473,7 @@ contains
       dir_label = ' '
       if (ii < 5) dir_label = index2axis(ii)
 
-      call nl_operator_init(der%grad(ii), "Gradient "//dir_label)
+      call nl_operator_init(der%grad(ii), "Gradient "//dir_label, symm=OP_ANTISYMMETRIC)
 
       ! create stencil
       select case (der%stencil_type)
diff --git a/src/grid/derivatives_inc.F90 b/src/grid/derivatives_inc.F90
index 99b247edfd..ecb6287463 100644
--- a/src/grid/derivatives_inc.F90
+++ b/src/grid/derivatives_inc.F90
@@ -322,301 +322,6 @@ subroutine X(derivatives_curl)(der, ff, op_ff, ghost_update, set_bc)
   POP_SUB(X(derivatives_curl))
 end subroutine X(derivatives_curl)
 
-
-! ----------------------------------------------------------
-!> @brief unit test for derivatives
-!!
-!! This will be called from the test run mode. 
-subroutine X(derivatives_test)(this, namespace, repetitions, min_blocksize, max_blocksize) - type(derivatives_t), intent(in) :: this - type(namespace_t), intent(in) :: namespace - integer, intent(in) :: repetitions - integer, intent(in) :: min_blocksize - integer, intent(in) :: max_blocksize - - R_TYPE, allocatable :: ff(:), opff(:, :), gradff(:, :), curlff(:, :), res(:), resgrad(:, :), rescurl(:, :) - R_TYPE :: aa, bb, cc - integer :: ip, idir, ist, ib - type(batch_t) :: ffb, opffb - type(batch_t) :: gradffb(this%mesh%box%dim) - integer :: blocksize, itime - logical :: packstates - real(real64) :: stime, etime - character(len=20) :: type - real(real64) :: norm - - call parse_variable(namespace, 'StatesPack', .true., packstates) - - SAFE_ALLOCATE(ff(1:this%mesh%np_part)) - SAFE_ALLOCATE(opff(1:this%mesh%np, 1:this%mesh%box%dim)) - SAFE_ALLOCATE(gradff(1:this%mesh%np_part, 1:this%mesh%box%dim)) - SAFE_ALLOCATE(res(1:this%mesh%np_part)) - SAFE_ALLOCATE(resgrad(1:this%mesh%np, 1:this%mesh%box%dim)) - SAFE_ALLOCATE(curlff(1:this%mesh%np, 1:this%mesh%box%dim)) - -#ifdef R_TREAL - type = 'real' -#else - type = 'complex' -#endif - - ! Note: here we need to use a constant function or anything that - ! is constant at the borders, since we assume that all boundary - ! points have equal values to optimize the application of the nl-operator. - - aa = R_TOTYPE(1.0)/this%mesh%box%bounding_box_l(1) - bb = R_TOTYPE(10.0) - cc = R_TOTYPE(100.0) - -#ifdef R_TCOMPLEX - ! we make things more "complex" - aa = aa + M_ZI*R_TOTYPE(0.01) - bb = bb*exp(M_ZI*R_TOTYPE(0.345)) - cc = cc - M_ZI*R_TOTYPE(50.0) -#endif - - - do ip = 1, this%mesh%np_part - ff(ip) = bb*exp(-aa*sum(this%mesh%x(:, ip)**2)) + cc - end do - do ip = 1, this%mesh%np - do idir = 1, this%mesh%box%dim - gradff(ip, idir) = -M_TWO*aa*bb*this%mesh%x(idir, ip)*exp(-aa*sum(this%mesh%x(:, ip)**2)) - end do - end do - - ! curl only needed in 3D - if (this%mesh%box%dim == 3) then - do ip = 1, this%mesh%np - curlff(ip, 1) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(2, ip) - this%mesh%x(3, ip)) - curlff(ip, 2) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(3, ip) - this%mesh%x(1, ip)) - curlff(ip, 3) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(1, ip) - this%mesh%x(2, ip)) - end do - end if - - message(1) = "Testing Laplacian" - call messages_info(1, namespace=namespace) - - ! test Laplacian - blocksize = min_blocksize - do - - call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) - call X(batch_init)(opffb, 1, 1, blocksize, this%mesh%np) - - do ist = 1, blocksize - call batch_set_state(ffb, ist, this%mesh%np_part, ff) - end do - - if (packstates) then - call ffb%do_pack() - call opffb%do_pack(copy = .false.) 
- end if - - if (repetitions > 1) then - call X(derivatives_batch_perform)(this%lapl, this, ffb, opffb, set_bc = .false., factor = M_HALF) - end if - - stime = loct_clock() - do itime = 1, repetitions - call X(derivatives_batch_perform)(this%lapl, this, ffb, opffb, set_bc = .false., factor = M_HALF) - end do - etime = (loct_clock() - stime)/real(repetitions, real64) - - call batch_get_state(opffb, blocksize, this%mesh%np, res) - - call ffb%end() - call opffb%end() - - do ip = 1, this%mesh%np - res(ip) = M_TWO*res(ip) - & - (M_FOUR*aa**2*bb*sum(this%mesh%x(:, ip)**2)*exp(-aa*sum(this%mesh%x(:, ip)**2)) & - - this%mesh%box%dim*M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))) - end do - - write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & - 'Laplacian ', trim(type), & - ' bsize = ', blocksize, & - ' , error = ', X(mf_nrm2)(this%mesh, res), & - ' , Gflops = ', & -#ifdef R_TREAL - blocksize*this%mesh%np*M_TWO*this%lapl%stencil%size/(etime*1.0e9_real64) -#else - blocksize*this%mesh%np*M_FOUR*this%lapl%stencil%size/(etime*1.0e9_real64) -#endif - - call messages_info(1, namespace=namespace) - - blocksize = 2*blocksize - if (blocksize > max_blocksize) exit - - end do - - message(1) = "Testing Gradient" - call messages_info(1, namespace=namespace) - - ! test Gradient - blocksize = min_blocksize - do - call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) - - do ist = 1, blocksize - call batch_set_state(ffb, ist, this%mesh%np_part, ff) - end do - - if (packstates) then - call ffb%do_pack() - end if - - if (repetitions > 1) then - call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) - end if - - stime = loct_clock() - do itime = 1, repetitions - call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) - end do - etime = (loct_clock() - stime)/real(repetitions, real64) - - do idir = 1, this%mesh%box%dim - call batch_get_state(gradffb(idir), blocksize, this%mesh%np, resgrad(:, idir)) - end do - - call ffb%end() - do idir = 1, this%mesh%box%dim - call gradffb(idir)%end() - end do - - do idir = 1, this%mesh%box%dim - do ip = 1, this%mesh%np - resgrad(ip, idir) = resgrad(ip, idir) - gradff(ip, idir) - end do - end do - - write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & - 'Batch gradient ', trim(type), & - ' bsize = ', blocksize, & - ' , error (x direction) = ', X(mf_nrm2)(this%mesh, resgrad(:, 1)), & - ' , Gflops = ', & -#ifdef R_TREAL - blocksize*this%mesh%np*M_TWO*this%grad(1)%stencil%size*this%mesh%box%dim/(etime*1.0e9_real64) -#else - blocksize*this%mesh%np*M_FOUR*this%grad(1)%stencil%size*this%mesh%box%dim/(etime*1.0e9_real64) -#endif - call messages_info(1, namespace=namespace) - - blocksize = 2*blocksize - if (blocksize > max_blocksize) exit - end do - - call X(derivatives_grad)(this, ff, opff, set_bc = .false.) - - do idir = 1, this%mesh%box%dim - do ip = 1, this%mesh%np - opff(ip, idir) = opff(ip, idir) - (-M_TWO*aa*bb*this%mesh%x_t(ip, idir)*exp(-aa*sum(this%mesh%x(:, ip)**2))) - end do - end do - - message(1) = '' - call messages_info(1, namespace=namespace) - - write(message(1), '(3a, es17.10)') 'Gradient ', trim(type), & - ' err = ', X(mf_nrm2)(this%mesh, this%mesh%box%dim, opff) - call messages_info(1, namespace=namespace) - - message(1) = '' - call messages_info(1, namespace=namespace) - - ! 
test curl for 3D case - if (this%mesh%box%dim == 3) then - message(1) = "Testing curl" - call messages_info(1, namespace=namespace) - - do blocksize = 3, 6, 3 - call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) - - do ist = 1, blocksize - call batch_set_state(ffb, ist, this%mesh%np_part, ff) - end do - - if (packstates) then - call ffb%do_pack() - end if - - ! test for one blocksize without precomputed gradient - if (blocksize == 6) then - call X(derivatives_batch_curl)(this, ffb, set_bc=.false.) - else - call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) - call X(derivatives_batch_curl_from_gradient)(this, ffb, gradffb) - end if - - SAFE_ALLOCATE(rescurl(1:this%mesh%np, 1:blocksize)) - do ist = 1, blocksize - call batch_get_state(ffb, ist, this%mesh%np, rescurl(:, ist)) - end do - - call ffb%end() - do idir = 1, this%mesh%box%dim - call gradffb(idir)%end() - end do - - do ip = 1, this%mesh%np - do ib = 0, blocksize-1, this%mesh%box%dim - do idir = 1, this%mesh%box%dim - rescurl(ip, ib+idir) = rescurl(ip, ib+idir) - curlff(ip, idir) - end do - end do - end do - - norm = M_ZERO - do ist = 1, blocksize - norm = norm + X(mf_nrm2)(this%mesh, rescurl(:, ist)) - end do - - write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & - 'Batch curl ', trim(type), ' bsize = ', blocksize, ' , error = ', norm - call messages_info(1, namespace=namespace) - - SAFE_DEALLOCATE_A(rescurl) - end do - - ! Testing the regular curl with the same logic as above - SAFE_ALLOCATE(rescurl(1:this%mesh%np, 1:this%mesh%box%dim)) - do idir = 1, this%mesh%box%dim - call lalg_copy(this%mesh%np_part, ff, gradff(:, idir)) - end do - call X(derivatives_curl)(this, gradff, rescurl, set_bc = .false.) - - do idir = 1, this%mesh%box%dim - do ip = 1, this%mesh%np - opff(ip, idir) = rescurl(ip, idir) - curlff(ip, idir) - end do - end do - - norm = M_ZERO - do ist = 1, this%mesh%box%dim - norm = norm + X(mf_nrm2)(this%mesh, opff(:, ist)) - end do - - write(message(1), '(3a, es17.10)') 'Curl ', trim(type), ' err = ', norm - call messages_info(1, namespace=namespace) - - SAFE_DEALLOCATE_A(rescurl) - end if - - message(1) = '' - call messages_info(1, namespace=namespace) - - SAFE_DEALLOCATE_A(ff) - SAFE_DEALLOCATE_A(opff) - SAFE_DEALLOCATE_A(gradff) - SAFE_DEALLOCATE_A(res) - SAFE_DEALLOCATE_A(resgrad) - SAFE_DEALLOCATE_A(curlff) - -end subroutine X(derivatives_test) - !===================================================================================== ! Batched versions of the routines: diff --git a/src/grid/derivatives_test.F90 b/src/grid/derivatives_test.F90 new file mode 100644 index 0000000000..c9231922dc --- /dev/null +++ b/src/grid/derivatives_test.F90 @@ -0,0 +1,219 @@ +!! Copyright (C) 2002-2006 M. Marques, A. Castro, A. Rubio, G. Bertsch +!! +!! This program is free software; you can redistribute it and/or modify +!! it under the terms of the GNU General Public License as published by +!! the Free Software Foundation; either version 2, or (at your option) +!! any later version. +!! +!! This program is distributed in the hope that it will be useful, +!! but WITHOUT ANY WARRANTY; without even the implied warranty of +!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +!! GNU General Public License for more details. +!! +!! You should have received a copy of the GNU General Public License +!! along with this program; if not, write to the Free Software +!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +!! 02110-1301, USA. +!! 
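+
+!! The analytic test functions implemented below are plane waves,
+!!   f(x) = cos(k.x),   grad f = -k sin(k.x),   lapl f = -|k|^2 f,
+!! and radial polynomials f = r^p, for which
+!!   grad f = p r^(p-2) x,   lapl f = p (p + d - 2) r^(p-2)
+!! in d dimensions, so that the finite-difference results can be checked
+!! against exact reference values.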
+
+#include "global.h"
+
+!> @brief This module provides unit tests for the derivatives
+
+module derivatives_test_oct_m
+  use batch_oct_m
+  use batch_ops_oct_m
+  use boundaries_oct_m
+  use debug_oct_m
+  use derivatives_oct_m
+  use global_oct_m
+  use, intrinsic :: iso_fortran_env
+  use lalg_basic_oct_m
+  use lalg_adv_oct_m
+  use loct_oct_m
+  use mesh_oct_m
+  use mesh_function_oct_m
+  use messages_oct_m
+  use namespace_oct_m
+  use parser_oct_m
+  use profiling_oct_m
+  use space_oct_m
+  use utils_oct_m
+
+  implicit none
+
+  private
+  public ::            &
+    dderivatives_test, &
+    zderivatives_test, &
+    derivatives_advanced_benchmark
+
+contains
+
+!> @brief Further unit tests designed to challenge the numerical stability of the finite differences
+subroutine derivatives_advanced_benchmark(this, namespace)
+  type(derivatives_t), intent(in) :: this
+  type(namespace_t),   intent(in) :: namespace
+
+  real(real64), allocatable :: ff(:), grad(:,:), resgrad(:, :), lapl(:), reslapl(:)
+  real(real64) :: kvec(3)
+
+  SAFE_ALLOCATE(ff(1:this%mesh%np_part))
+  SAFE_ALLOCATE(grad(1:this%mesh%np, 1:this%mesh%box%dim))
+  SAFE_ALLOCATE(lapl(1:this%mesh%np))
+  SAFE_ALLOCATE(reslapl(1:this%mesh%np))
+  SAFE_ALLOCATE(resgrad(1:this%mesh%np, 1:this%mesh%box%dim))
+
+  ! Testing on some plane waves
+  kvec = [0.5_real64, 0.0_real64, 0.0_real64] ! low frequency
+  call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec)
+  call test_function("Testing Planewave k=(0.5,0.0,0.0)")
+
+  kvec = [1.0_real64, 1.0_real64, 0.0_real64] ! diagonal
+  call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec)
+  call test_function("Testing Planewave k=(1.0,1.0,0.0)")
+
+  kvec = [M_PI/this%mesh%spacing(1), 0.0_real64, 0.0_real64] ! Nyquist
+  call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec)
+  call test_function("Testing Planewave k=(\pi/dr,0.0,0.0)")
+
+  kvec = [M_PI/this%mesh%spacing(1), M_PI/this%mesh%spacing(2), 0.0_real64] ! diagonal Nyquist
+  call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec)
+  call test_function("Testing Planewave k=(\pi/dr,\pi/dr,0.0)")
+
+  ! Testing on some polynomials
+  call testfunc_polynomial(this%mesh, ff, grad, lapl, p=2)
+  call test_function("Testing degree-2 polynomial")
+
+  call testfunc_polynomial(this%mesh, ff, grad, lapl, p=3)
+  call test_function("Testing degree-3 polynomial")
+
+  call testfunc_polynomial(this%mesh, ff, grad, lapl, p=4)
+  call test_function("Testing degree-4 polynomial")
+
+  call testfunc_polynomial(this%mesh, ff, grad, lapl, p=6)
+  call test_function("Testing degree-6 polynomial")
+
+  call testfunc_polynomial(this%mesh, ff, grad, lapl, p=8)
+  call test_function("Testing degree-8 polynomial")
+
+
+  SAFE_DEALLOCATE_A(ff)
+  SAFE_DEALLOCATE_A(grad)
+  SAFE_DEALLOCATE_A(lapl)
+  SAFE_DEALLOCATE_A(resgrad)
+  SAFE_DEALLOCATE_A(reslapl)
+
+contains
+  subroutine test_function(label)
+    character(len=*), intent(in) :: label
+
+    message(1) = trim(label)
+
+    call dderivatives_grad(this, ff, resgrad, set_bc=.false.)
+    resgrad = resgrad - grad
+    write(message(2), '(a, es17.10)') 'Gradient err = ', dmf_nrm2(this%mesh, this%mesh%box%dim, resgrad)
+
+    call dderivatives_lapl(this, ff, reslapl, set_bc=.false.) 
+    reslapl = reslapl - lapl
+    write(message(3), '(a, es17.10)') 'Laplacian err = ', dmf_nrm2(this%mesh, reslapl)
+
+    call messages_info(3, namespace=namespace)
+  end subroutine test_function
+end subroutine derivatives_advanced_benchmark
+
+!> @brief Generate a plane wave of given momentum, together with its exact gradient and Laplacian
+subroutine testfunc_plane_wave(mesh, ff, grad, lapl, kvec)
+  type(mesh_t), intent(in)  :: mesh
+  real(real64), intent(out) :: ff(:), grad(:, :), lapl(:)
+  real(real64), intent(in)  :: kvec(3)
+
+  integer :: ip, dim, idir
+  real(real64) :: phase, kdotx, kk
+
+  dim = mesh%box%dim
+
+  kk = dot_product(kvec(1:dim), kvec(1:dim))
+  do ip = 1, mesh%np
+    kdotx = dot_product(kvec(1:dim), mesh%x(1:dim, ip))
+    phase = cos(kdotx)
+
+    ff(ip) = phase
+
+    ! exact gradient
+    do idir = 1, mesh%box%dim
+      grad(ip, idir) = -kvec(idir)*sin(kdotx)
+    end do
+
+    ! exact Laplacian
+    lapl(ip) = -kk * phase
+  end do
+
+  do ip = mesh%np+1, mesh%np_part
+    kdotx = dot_product(kvec(1:dim), mesh%x(1:dim, ip))
+    phase = cos(kdotx)
+
+    ff(ip) = phase
+  end do
+end subroutine testfunc_plane_wave
+
+!> @brief Generate a polynomial of degree p, together with its exact gradient and Laplacian
+subroutine testfunc_polynomial(mesh, ff, grad, lapl, p)
+  type(mesh_t), intent(in)  :: mesh
+  real(real64), intent(out) :: ff(:), grad(:, :), lapl(:)
+  integer,      intent(in)  :: p
+
+  integer :: ip, d, dim
+  real(real64) :: x(3), r2
+
+  dim = mesh%box%dim
+
+  do ip = 1, mesh%np
+    x = M_ZERO
+    x(1:dim) = mesh%x(1:dim, ip)
+
+    ! f = (x^2 + y^2 + z^2)^(p/2)
+    r2 = dot_product(x(1:dim), x(1:dim))
+    ff(ip) = r2**(M_HALF*p)
+
+    ! gradient: grad(f) = p * r^(p-2) * x
+    if (p >= 2) then
+      do d = 1, dim
+        grad(ip, d) = p * r2**(M_HALF*p - M_ONE) * x(d)
+      end do
+    else
+      grad(ip, :) = M_ZERO
+    end if
+
+    ! Laplacian: ∇² r^p = p (p + d - 2) r^(p-2)
+    if (p >= 2) then
+      lapl(ip) = p * (p + dim - 2) * r2**(M_HALF*p - M_ONE)
+    else
+      lapl(ip) = M_ZERO
+    end if
+  end do
+
+  do ip = mesh%np+1, mesh%np_part
+    x = M_ZERO
+    x(1:dim) = mesh%x(1:dim, ip)
+
+    ! f = (x^2 + y^2 + z^2)^(p/2)
+    r2 = dot_product(x(1:dim), x(1:dim))
+    ff(ip) = r2**(M_HALF*p)
+  end do
+end subroutine testfunc_polynomial
+
+
+#include "undef.F90"
+#include "real.F90"
+#include "derivatives_test_inc.F90"
+
+#include "undef.F90"
+#include "complex.F90"
+#include "derivatives_test_inc.F90"
+
+end module derivatives_test_oct_m
+
+!! Local Variables:
+!! mode: f90
+!! coding: utf-8
+!! End:
diff --git a/src/grid/derivatives_test_inc.F90 b/src/grid/derivatives_test_inc.F90
new file mode 100644
index 0000000000..84028e3362
--- /dev/null
+++ b/src/grid/derivatives_test_inc.F90
@@ -0,0 +1,312 @@
+!! Copyright (C) 2002-2006 M. Marques, A. Castro, A. Rubio, G. Bertsch
+!!
+!! This program is free software; you can redistribute it and/or modify
+!! it under the terms of the GNU General Public License as published by
+!! the Free Software Foundation; either version 2, or (at your option)
+!! any later version.
+!!
+!! This program is distributed in the hope that it will be useful,
+!! but WITHOUT ANY WARRANTY; without even the implied warranty of
+!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+!! GNU General Public License for more details.
+!!
+!! You should have received a copy of the GNU General Public License
+!! along with this program; if not, write to the Free Software
+!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+!! 02110-1301, USA.
+!!
+
+! ----------------------------------------------------------
+!> @brief unit test for derivatives
+!!
+!! 
This will be called from the test run mode. +subroutine X(derivatives_test)(this, namespace, repetitions, min_blocksize, max_blocksize) + type(derivatives_t), intent(in) :: this + type(namespace_t), intent(in) :: namespace + integer, intent(in) :: repetitions + integer, intent(in) :: min_blocksize + integer, intent(in) :: max_blocksize + + R_TYPE, allocatable :: ff(:), opff(:, :), gradff(:, :), curlff(:, :), res(:), resgrad(:, :), rescurl(:, :) + R_TYPE :: aa, bb, cc + integer :: ip, idir, ist, ib + type(batch_t) :: ffb, opffb + type(batch_t) :: gradffb(this%mesh%box%dim) + integer :: blocksize, itime + logical :: packstates + real(real64) :: stime, etime + character(len=20) :: type + real(real64) :: norm + + call parse_variable(namespace, 'StatesPack', .true., packstates) + + SAFE_ALLOCATE(ff(1:this%mesh%np_part)) + SAFE_ALLOCATE(opff(1:this%mesh%np, 1:this%mesh%box%dim)) + SAFE_ALLOCATE(gradff(1:this%mesh%np_part, 1:this%mesh%box%dim)) + SAFE_ALLOCATE(res(1:this%mesh%np_part)) + SAFE_ALLOCATE(resgrad(1:this%mesh%np, 1:this%mesh%box%dim)) + SAFE_ALLOCATE(curlff(1:this%mesh%np, 1:this%mesh%box%dim)) + +#ifdef R_TREAL + type = 'real' +#else + type = 'complex' +#endif + + ! Note: here we need to use a constant function or anything that + ! is constant at the borders, since we assume that all boundary + ! points have equal values to optimize the application of the nl-operator. + + aa = R_TOTYPE(1.0)/this%mesh%box%bounding_box_l(1) + bb = R_TOTYPE(10.0) + cc = R_TOTYPE(100.0) + +#ifdef R_TCOMPLEX + ! we make things more "complex" + aa = aa + M_ZI*R_TOTYPE(0.01) + bb = bb*exp(M_ZI*R_TOTYPE(0.345)) + cc = cc - M_ZI*R_TOTYPE(50.0) +#endif + + + do ip = 1, this%mesh%np_part + ff(ip) = bb*exp(-aa*sum(this%mesh%x(:, ip)**2)) + cc + end do + do ip = 1, this%mesh%np + do idir = 1, this%mesh%box%dim + gradff(ip, idir) = -M_TWO*aa*bb*this%mesh%x(idir, ip)*exp(-aa*sum(this%mesh%x(:, ip)**2)) + end do + end do + + ! curl only needed in 3D + if (this%mesh%box%dim == 3) then + do ip = 1, this%mesh%np + curlff(ip, 1) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(2, ip) - this%mesh%x(3, ip)) + curlff(ip, 2) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(3, ip) - this%mesh%x(1, ip)) + curlff(ip, 3) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(1, ip) - this%mesh%x(2, ip)) + end do + end if + + message(1) = "Testing Laplacian" + call messages_info(1, namespace=namespace) + + ! test Laplacian + blocksize = min_blocksize + do + + call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) + call X(batch_init)(opffb, 1, 1, blocksize, this%mesh%np) + + do ist = 1, blocksize + call batch_set_state(ffb, ist, this%mesh%np_part, ff) + end do + + if (packstates) then + call ffb%do_pack() + call opffb%do_pack(copy = .false.) 
+ end if + + if (repetitions > 1) then + call X(derivatives_batch_perform)(this%lapl, this, ffb, opffb, set_bc = .false., factor = M_HALF) + end if + + stime = loct_clock() + do itime = 1, repetitions + call X(derivatives_batch_perform)(this%lapl, this, ffb, opffb, set_bc = .false., factor = M_HALF) + end do + etime = (loct_clock() - stime)/real(repetitions, real64) + + call batch_get_state(opffb, blocksize, this%mesh%np, res) + + call ffb%end() + call opffb%end() + + do ip = 1, this%mesh%np + res(ip) = M_TWO*res(ip) - & + (M_FOUR*aa**2*bb*sum(this%mesh%x(:, ip)**2)*exp(-aa*sum(this%mesh%x(:, ip)**2)) & + - this%mesh%box%dim*M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))) + end do + + write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & + 'Laplacian ', trim(type), & + ' bsize = ', blocksize, & + ' , error = ', X(mf_nrm2)(this%mesh, res), & + ' , Gflops = ', & +#ifdef R_TREAL + blocksize*this%mesh%np*M_TWO*this%lapl%stencil%size/(etime*1.0e9_real64) +#else + blocksize*this%mesh%np*M_FOUR*this%lapl%stencil%size/(etime*1.0e9_real64) +#endif + + call messages_info(1, namespace=namespace) + + blocksize = 2*blocksize + if (blocksize > max_blocksize) exit + + end do + + message(1) = "Testing Gradient" + call messages_info(1, namespace=namespace) + + ! test Gradient + blocksize = min_blocksize + do + call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) + + do ist = 1, blocksize + call batch_set_state(ffb, ist, this%mesh%np_part, ff) + end do + + if (packstates) then + call ffb%do_pack() + end if + + if (repetitions > 1) then + call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) + end if + + stime = loct_clock() + do itime = 1, repetitions + call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) + end do + etime = (loct_clock() - stime)/real(repetitions, real64) + + do idir = 1, this%mesh%box%dim + call batch_get_state(gradffb(idir), blocksize, this%mesh%np, resgrad(:, idir)) + end do + + call ffb%end() + do idir = 1, this%mesh%box%dim + call gradffb(idir)%end() + end do + + do idir = 1, this%mesh%box%dim + do ip = 1, this%mesh%np + resgrad(ip, idir) = resgrad(ip, idir) - gradff(ip, idir) + end do + end do + + write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & + 'Batch gradient ', trim(type), & + ' bsize = ', blocksize, & + ' , error (x direction) = ', X(mf_nrm2)(this%mesh, resgrad(:, 1)), & + ' , Gflops = ', & +#ifdef R_TREAL + blocksize*this%mesh%np*M_TWO*this%grad(1)%stencil%size*this%mesh%box%dim/(etime*1.0e9_real64) +#else + blocksize*this%mesh%np*M_FOUR*this%grad(1)%stencil%size*this%mesh%box%dim/(etime*1.0e9_real64) +#endif + call messages_info(1, namespace=namespace) + + blocksize = 2*blocksize + if (blocksize > max_blocksize) exit + end do + + call X(derivatives_grad)(this, ff, opff, set_bc = .false.) + + do idir = 1, this%mesh%box%dim + do ip = 1, this%mesh%np + opff(ip, idir) = opff(ip, idir) - (-M_TWO*aa*bb*this%mesh%x(idir, ip)*exp(-aa*sum(this%mesh%x(:, ip)**2))) + end do + end do + + message(1) = '' + call messages_info(1, namespace=namespace) + + write(message(1), '(3a, es17.10)') 'Gradient ', trim(type), & + ' err = ', X(mf_nrm2)(this%mesh, this%mesh%box%dim, opff) + call messages_info(1, namespace=namespace) + + message(1) = '' + call messages_info(1, namespace=namespace) + + ! 
test curl for 3D case + if (this%mesh%box%dim == 3) then + message(1) = "Testing curl" + call messages_info(1, namespace=namespace) + + do blocksize = 3, 6, 3 + call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) + + do ist = 1, blocksize + call batch_set_state(ffb, ist, this%mesh%np_part, ff) + end do + + if (packstates) then + call ffb%do_pack() + end if + + ! test for one blocksize without precomputed gradient + if (blocksize == 6) then + call X(derivatives_batch_curl)(this, ffb, set_bc=.false.) + else + call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) + call X(derivatives_batch_curl_from_gradient)(this, ffb, gradffb) + end if + + SAFE_ALLOCATE(rescurl(1:this%mesh%np, 1:blocksize)) + do ist = 1, blocksize + call batch_get_state(ffb, ist, this%mesh%np, rescurl(:, ist)) + end do + + call ffb%end() + do idir = 1, this%mesh%box%dim + call gradffb(idir)%end() + end do + + do ip = 1, this%mesh%np + do ib = 0, blocksize-1, this%mesh%box%dim + do idir = 1, this%mesh%box%dim + rescurl(ip, ib+idir) = rescurl(ip, ib+idir) - curlff(ip, idir) + end do + end do + end do + + norm = M_ZERO + do ist = 1, blocksize + norm = norm + X(mf_nrm2)(this%mesh, rescurl(:, ist)) + end do + + write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & + 'Batch curl ', trim(type), ' bsize = ', blocksize, ' , error = ', norm + call messages_info(1, namespace=namespace) + + SAFE_DEALLOCATE_A(rescurl) + end do + + ! Testing the regular curl with the same logic as above + SAFE_ALLOCATE(rescurl(1:this%mesh%np, 1:this%mesh%box%dim)) + do idir = 1, this%mesh%box%dim + call lalg_copy(this%mesh%np_part, ff, gradff(:, idir)) + end do + call X(derivatives_curl)(this, gradff, rescurl, set_bc = .false.) + + do idir = 1, this%mesh%box%dim + do ip = 1, this%mesh%np + opff(ip, idir) = rescurl(ip, idir) - curlff(ip, idir) + end do + end do + + norm = M_ZERO + do ist = 1, this%mesh%box%dim + norm = norm + X(mf_nrm2)(this%mesh, opff(:, ist)) + end do + + write(message(1), '(3a, es17.10)') 'Curl ', trim(type), ' err = ', norm + call messages_info(1, namespace=namespace) + + SAFE_DEALLOCATE_A(rescurl) + end if + + message(1) = '' + call messages_info(1, namespace=namespace) + + SAFE_DEALLOCATE_A(ff) + SAFE_DEALLOCATE_A(opff) + SAFE_DEALLOCATE_A(gradff) + SAFE_DEALLOCATE_A(res) + SAFE_DEALLOCATE_A(resgrad) + SAFE_DEALLOCATE_A(curlff) + +end subroutine X(derivatives_test) + diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index f5f198c62e..453d83b098 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -39,6 +39,7 @@ module nl_operator_oct_m use parser_oct_m use profiling_oct_m use space_oct_m + use sort_oct_m use stencil_oct_m use types_oct_m use varinfo_oct_m @@ -80,6 +81,11 @@ module nl_operator_oct_m integer, allocatable :: ri(:, :) end type nl_operator_index_t + integer, public, parameter :: & + OP_GENERAL = 1, & + OP_SYMMETRIC = 2, & + OP_ANTISYMMETRIC = 3 + !> data type for non local operators type nl_operator_t private @@ -95,6 +101,8 @@ module nl_operator_oct_m type(accel_mem_t), public :: buff_weights !< buffer with constant weights type(accel_mem_t), public :: buff_half_weights !< buffer with weights multiplied by -1/2 + integer :: symmetry = OP_GENERAL !< Symmetry of the weights to be used + character(len=40) :: label !> the compressed index of grid points @@ -232,13 +240,15 @@ contains !> @brief initialize an instance of a non-local operator !! by setting the label !! 
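+  !! and, optionally, the symmetry class of its weights
+  !! (OP_GENERAL, OP_SYMMETRIC or OP_ANTISYMMETRIC), which is later
+  !! used to group the stencil points into symmetric pairs.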
-  subroutine nl_operator_init(op, label)
+  subroutine nl_operator_init(op, label, symm)
     type(nl_operator_t), intent(inout) :: op
     character(len=*),    intent(in)    :: label
+    integer, optional,   intent(in)    :: symm
 
     PUSH_SUB(nl_operator_init)
 
     op%label = label
+    op%symmetry = optional_default(symm, OP_GENERAL)
 
     POP_SUB(nl_operator_init)
   end subroutine nl_operator_init
@@ -261,6 +271,7 @@ contains
 
     opo%np   = opi%np
     opo%mesh => opi%mesh
+    opo%symmetry = opi%symmetry
 
     SAFE_ALLOCATE_SOURCE_A(opo%nn, opi%nn)
     SAFE_ALLOCATE_SOURCE_A(opo%w, opi%w)
@@ -762,6 +773,163 @@ contains
     POP_SUB(nl_operator_remove_zero_weight_points)
   end subroutine nl_operator_remove_zero_weight_points
 
+!> Take a list of weights and offsets and build pairs of symmetric points that share a common weight
+subroutine group_by_pairs_sym(size, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg, wcenter)
+  integer,                   intent(in)  :: size
+  integer,                   intent(in)  :: offsets(:, :)
+  real(real64),              intent(in)  :: wre(:)
+  integer,                   intent(in)  :: ri(:, :)
+  integer,                   intent(in)  :: nri
+  integer,                   intent(out) :: npairs
+  real(real64), allocatable, intent(out) :: wpair(:)
+  integer,      allocatable, intent(out) :: pair_pos(:,:), pair_neg(:,:)
+  real(real64),              intent(out) :: wcenter
+
+  logical, allocatable :: used(:)
+  integer :: i, j, d, ndim, s
+  logical :: same
+  real(real64), allocatable :: abs_wre(:)
+  integer,      allocatable :: idx(:)
+
+  real(real64), parameter :: tol = 1.0e-12_real64
+
+  PUSH_SUB(group_by_pairs_sym)
+
+  ASSERT(mod(size, 2) == 1)
+
+  SAFE_ALLOCATE(used(1:size))
+  used = .false.
+  npairs = 0
+
+  ndim = ubound(offsets, dim=1)
+
+  SAFE_ALLOCATE(abs_wre(1:size))
+  SAFE_ALLOCATE(idx(1:size))
+  abs_wre = abs(wre)
+  call sort(abs_wre, idx)
+
+  ! Max pairs = n/2
+  SAFE_ALLOCATE(pair_pos(1:size/2, 1:nri))
+  SAFE_ALLOCATE(pair_neg(1:size/2, 1:nri))
+  SAFE_ALLOCATE(wpair(1:size/2))
+
+  do i = 1, size
+    if (used(i)) cycle
+
+    if (all(offsets(:, idx(i)) == 0)) then
+      wcenter = wre(idx(i))
+      used(i) = .true.
+      cycle
+    end if
+
+    ! Try to find symmetric partner j
+    do j = i+1, size
+      if (used(j)) cycle
+
+      ! Weight equality
+      same = abs(wre(idx(i)) - wre(idx(j))) <= tol
+      if (.not. same) cycle
+
+      ! Offsets equal and opposite
+      if (any(offsets(:, idx(j)) + offsets(:, idx(i)) /= 0)) cycle
+
+      npairs = npairs + 1
+      do s = 1, nri
+        pair_pos(npairs, s) = ri(idx(i), s)
+        pair_neg(npairs, s) = ri(idx(j), s)
+      end do
+      wpair(npairs) = M_HALF*(wre(idx(i)) + wre(idx(j)))
+
+      used(i) = .true.
+      used(j) = .true.
+      exit
+    end do
+  end do
+
+  ASSERT(npairs == size/2)
+
+  SAFE_DEALLOCATE_A(used)
+  SAFE_DEALLOCATE_A(abs_wre)
+  SAFE_DEALLOCATE_A(idx)
+
+  POP_SUB(group_by_pairs_sym)
+end subroutine group_by_pairs_sym
+
+!> Take a list of weights and offsets and build pairs of symmetric points whose weights are equal in magnitude and opposite in sign
+subroutine group_by_pairs_antisym(size, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg)
+  integer,                   intent(in)  :: size
+  integer,                   intent(in)  :: offsets(:, :)
+  real(real64),              intent(in)  :: wre(:)
+  integer,                   intent(in)  :: ri(:, :)
+  integer,                   intent(in)  :: nri
+  integer,                   intent(out) :: npairs
+  real(real64), allocatable, intent(out) :: wpair(:)
+  integer,      allocatable, intent(out) :: pair_pos(:,:), pair_neg(:,:)
+
+  logical, allocatable :: used(:)
+  integer :: i, j, d, ndim, s
+  logical :: same
+  real(real64), allocatable :: abs_wre(:)
+  integer,      allocatable :: idx(:)
+
+  real(real64), parameter :: tol = 1.0e-12_real64
+
+  PUSH_SUB(group_by_pairs_antisym)
+
+  ASSERT(mod(size, 2) == 0)
+
+  SAFE_ALLOCATE(used(1:size))
+  used = .false. 
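+
+  ! The stencil points are sorted below by increasing |weight|, so that the
+  ! pairs are built, and later summed over, from the smallest contributions
+  ! to the largest, which reduces the accumulated roundoff error.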
+  npairs = 0
+
+  ndim = ubound(offsets, dim=1)
+
+  SAFE_ALLOCATE(abs_wre(1:size))
+  SAFE_ALLOCATE(idx(1:size))
+  abs_wre = abs(wre)
+  call sort(abs_wre, idx)
+
+  ! Max pairs = n/2
+  SAFE_ALLOCATE(pair_pos(1:size/2, 1:nri))
+  SAFE_ALLOCATE(pair_neg(1:size/2, 1:nri))
+  SAFE_ALLOCATE(wpair(1:size/2))
+
+  do i = 1, size
+    if (used(i)) cycle
+
+    ! Try to find symmetric partner j
+    do j = i+1, size
+      if (used(j)) cycle
+
+      ! Weight equality
+      same = abs(wre(idx(i)) + wre(idx(j))) <= tol
+      if (.not. same) cycle
+
+      ! Offsets equal and opposite
+      if (any(offsets(:, idx(j)) + offsets(:, idx(i)) /= 0)) cycle
+
+      npairs = npairs + 1
+      do s = 1, nri
+        pair_pos(npairs, s) = ri(idx(i), s)
+        pair_neg(npairs, s) = ri(idx(j), s)
+      end do
+      wpair(npairs) = M_HALF*(wre(idx(i)) - wre(idx(j)))
+
+      used(i) = .true.
+      used(j) = .true.
+      exit
+    end do
+  end do
+
+  ASSERT(npairs == size/2)
+
+  SAFE_DEALLOCATE_A(used)
+  SAFE_DEALLOCATE_A(abs_wre)
+  SAFE_DEALLOCATE_A(idx)
+  POP_SUB(group_by_pairs_antisym)
+end subroutine group_by_pairs_antisym
+
+
 #include "undef.F90"
 #include "real.F90"
 #include "nl_operator_inc.F90"
diff --git a/src/grid/nl_operator_inc.F90 b/src/grid/nl_operator_inc.F90
index d2eb96aec1..5e3e8a2e13 100644
--- a/src/grid/nl_operator_inc.F90
+++ b/src/grid/nl_operator_inc.F90
@@ -41,6 +41,11 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point
   integer, parameter :: logldf = 1
 #endif
   integer :: nri_loc, ini
+  !> Symmetry of the stencil can be used to group points by pairs
+  integer :: npairs
+  real(real64), allocatable :: wpair(:)
+  integer, allocatable :: ri_pos(:,:), ri_neg(:,:)
+  real(real64) :: wcenter
 
   PUSH_SUB(X(nl_operator_operate_batch))
 
@@ -86,7 +91,24 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point
     use_accel = .true.
     call operate_accel()
   else if (X(function_global) == OP_FORTRAN) then
-    call operate_const_weights()
+    select case(op%symmetry)
+    case(OP_GENERAL)
+      call group_by_pairs_antisym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg)
+      call operate_const_weights_antisymmetric()
+      SAFE_DEALLOCATE_A(wpair)
+      SAFE_DEALLOCATE_A(ri_pos)
+      SAFE_DEALLOCATE_A(ri_neg)
+    case(OP_SYMMETRIC)
+      call group_by_pairs_sym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg, wcenter)
+      call operate_const_weights_symmetric()
+      SAFE_DEALLOCATE_A(wpair)
+      SAFE_DEALLOCATE_A(ri_pos)
+      SAFE_DEALLOCATE_A(ri_neg)
+    case(OP_ANTISYMMETRIC)
+      call operate_const_weights()
+    case default
+      ASSERT(.false.)
+    end select
   else
 
 ! for the moment this is not implemented
 
     !$omp parallel private(ini, nri_loc, ist)
     call multicomm_divide_range_omp(nri, ini, nri_loc)
@@ -203,6 +225,109 @@ contains
     POP_SUB(X(nl_operator_operate_batch).operate_const_weights)
   end subroutine operate_const_weights
 
+  ! ---------------------------------------------------------
+  subroutine operate_const_weights_antisymmetric()
+    integer :: nn, ll, ii, ist, ipair
+
+    PUSH_SUB(X(nl_operator_operate_batch).operate_const_weights_antisymmetric)
+
+    nn = op%stencil%size
+
+    select case (fi%status())
+
+    case (BATCH_DEVICE_PACKED)
+
+      ASSERT(.false.) 
+ + case (BATCH_NOT_PACKED) + + !$omp parallel do private(ll, ist, ii, ipair) + do ll = 1, nri + do ii = imin(ll) + 1, imax(ll) + do ist = 1, fi%nst_linear + fo%X(ff_linear)(ii, ist) = M_ZERO + do ipair = 1, npairs + fo%X(ff_linear)(ii, ist) = fo%X(ff_linear)(ii, ist) + & + wpair(ipair)*(fi%X(ff_linear)(ii + ri_pos(ipair, ll), ist) - fi%X(ff_linear)(ii + ri_neg(ipair, ll), ist)) + end do + end do + end do + end do + !$omp end parallel do + + case (BATCH_PACKED) + + !$omp parallel do private(ll, ist, ii, ipair) + do ll = 1, nri + do ii = imin(ll) + 1, imax(ll) + do ist = 1, fi%nst_linear + fo%X(ff_pack)(ist, ii) = M_ZERO + do ipair = 1, npairs + fo%X(ff_pack)(ist, ii) = fo%X(ff_pack)(ist, ii) & + + wpair(ipair) * (fi%X(ff_pack)(ist, ii + ri_pos(ipair, ll)) - fi%X(ff_pack)(ist, ii + ri_neg(ipair, ll))) + end do + end do + end do + end do + !$omp end parallel do + + end select + + POP_SUB(X(nl_operator_operate_batch).operate_const_weights_antisymmetric) + end subroutine operate_const_weights_antisymmetric + + ! --------------------------------------------------------- + subroutine operate_const_weights_symmetric() + integer :: nn, ll, ii, ist, ipair + + PUSH_SUB(X(nl_operator_operate_batch).operate_const_weights_symmetric) + + nn = op%stencil%size + + select case (fi%status()) + + case (BATCH_DEVICE_PACKED) + + ASSERT(.false.) + + case (BATCH_NOT_PACKED) + + !$omp parallel do private(ll, ist, ii, ipair) + do ll = 1, nri + do ii = imin(ll) + 1, imax(ll) + do ist = 1, fi%nst_linear + fo%X(ff_linear)(ii, ist) = M_ZERO + do ipair = 1, npairs + fo%X(ff_linear)(ii, ist) = fo%X(ff_linear)(ii, ist) + & + wpair(ipair)*(fi%X(ff_linear)(ii + ri_pos(ipair, ll), ist) + fi%X(ff_linear)(ii + ri_neg(ipair, ll), ist)) + end do + fo%X(ff_linear)(ii, ist) = fo%X(ff_linear)(ii, ist) + wcenter * fi%X(ff_linear)(ii, ist) + end do + end do + end do + !$omp end parallel do + + case (BATCH_PACKED) + + !$omp parallel do private(ll, ist, ii, ipair) + do ll = 1, nri + do ii = imin(ll) + 1, imax(ll) + do ist = 1, fi%nst_linear + fo%X(ff_pack)(ist, ii) = M_ZERO + do ipair = 1, npairs + fo%X(ff_pack)(ist, ii) = fo%X(ff_pack)(ist, ii) & + + wpair(ipair) * (fi%X(ff_pack)(ist, ii + ri_pos(ipair, ll)) + fi%X(ff_pack)(ist, ii + ri_neg(ipair, ll))) + end do + fo%X(ff_pack)(ist, ii) = fo%X(ff_pack)(ist, ii) + wcenter * fi%X(ff_pack)(ist, ii) + end do + end do + end do + !$omp end parallel do + + end select + + POP_SUB(X(nl_operator_operate_batch).operate_const_weights_symmetric) + end subroutine operate_const_weights_symmetric ! 
---------------------------------------------------------
   subroutine operate_non_const_weights()
diff --git a/src/main/test.F90 b/src/main/test.F90
index d33d56c0c0..0a8d3be31c 100644
--- a/src/main/test.F90
+++ b/src/main/test.F90
@@ -34,6 +34,7 @@ module test_oct_m
   use debug_oct_m
   use density_oct_m
   use derivatives_oct_m
+  use derivatives_test_oct_m
   use eigen_chebyshev_oct_m
   use electron_space_oct_m
   use electrons_oct_m
@@ -1303,6 +1304,8 @@ contains
       call zderivatives_test(sys%gr%der, sys%namespace, param%repetitions, param%min_blocksize, param%max_blocksize)
     end if
 
+    call derivatives_advanced_benchmark(sys%gr%der, sys%namespace)
+
     SAFE_DEALLOCATE_P(sys)
 
     POP_SUB(test_derivatives)
diff --git a/src/utils/dielectric_function.F90 b/src/utils/dielectric_function.F90
index 750875b12b..754aa2301b 100644
--- a/src/utils/dielectric_function.F90
+++ b/src/utils/dielectric_function.F90
@@ -84,7 +84,7 @@ contains
     real(real64) :: dt, dt_ref, tt, ww, norm_Aext
     real(real64), allocatable :: vecpot(:, :), Aext(:), Eind_real(:, :), Eind_imag(:, :)
     complex(real64), allocatable :: dielectric(:), chi(:), invdielectric(:)
-    real(real64), allocatable :: vecpot_ref(:, :)
+    real(real64), allocatable :: vecpot_ref(:, :), average(:)
     type(spectrum_t) :: spectrum
     type(block_t) :: blk
     type(space_t) :: space
@@ -102,6 +102,7 @@ contains
 
     SAFE_ALLOCATE(Aext(1:space%dim))
     SAFE_ALLOCATE(Eind(1:space%dim))
+    SAFE_ALLOCATE(average(1:space%dim*3))
 
     if (parse_block(global_namespace, 'GaugeVectorField', blk) == 0) then
 
@@ -192,8 +193,8 @@ contains
         read(ref_file, *) jj, tt, (vecpot_ref(ii, kk), kk = 1, space%dim*3)
       end do
       call io_close(ref_file)
-      do ii = 1, time_steps
-        do kk = 1, space%dim*3
+      do kk = 1, space%dim*3
+        do ii = 1, time_steps
           vecpot(ii, kk) = vecpot(ii, kk) - vecpot_ref(ii, kk)
         end do
       end do
@@ -203,6 +204,20 @@ contains
       trim(io_workpath('td.general/gauge_field', global_namespace))//"'"
     call messages_info(1)
 
+    ! Treat the Drude peak
+    ! We remove the offset before applying the mask, to get a proper treatment of metals
+    do kk = 1, space%dim*3
+      average(kk) = M_ZERO
+      do ii = time_steps/2, time_steps
+        average(kk) = average(kk) + vecpot(ii, kk)
+      end do
+      average(kk) = average(kk)/(time_steps - time_steps/2 + 1)
+      do ii = 1, time_steps
+        vecpot(ii, kk) = vecpot(ii, kk) - average(kk)
+      end do
+    end do
+    Aext(1:space%dim) = Aext(1:space%dim) + average(1:space%dim)
+
     ! Find out the iteration numbers corresponding to the time limits.
     ! Max time correspond to time_steps-1, as we start from 0.
 
@@ -297,6 +312,7 @@ contains
     SAFE_DEALLOCATE_A(vecpot_ref)
     SAFE_DEALLOCATE_A(Aext)
     SAFE_DEALLOCATE_A(Eind)
+    SAFE_DEALLOCATE_A(average)
    SAFE_DEALLOCATE_A(Eind_real)
    SAFE_DEALLOCATE_A(Eind_imag)
-- 
GitLab


From d0be0c96390244077a812daf567e9f781a8b7a34 Mon Sep 17 00:00:00 2001
From: NicolasTD
Date: Fri, 12 Dec 2025 19:58:43 +0100
Subject: [PATCH 02/11] Implement the symmetric and antisymmetric operators in
 vectorized forms

Add an input variable to control whether the symmetric form is used or not.
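
For reference, this is a minimal scalar sketch of what the vectorized kernels
added below compute, written with the names introduced in patch 01 (wpair,
wcenter, ri_pos, ri_neg); the SIMD code in operate_sym_inc.c and
operate_antisym_inc.c unrolls these loops over vector lanes and block depths:

    ! symmetric stencil (e.g. the Laplacian):
    ! fo(ii) = wcenter*fi(ii) + sum over pairs of wpair*(fi(ii+r) + fi(ii-r))
    fo(ii) = wcenter*fi(ii)
    do ipair = 1, npairs
      fo(ii) = fo(ii) + wpair(ipair)*(fi(ii + ri_pos(ipair, ll)) + fi(ii + ri_neg(ipair, ll)))
    end do

    ! antisymmetric stencil (e.g. a gradient component): there is no central
    ! weight, and the two points of a pair enter with opposite signs
    fo(ii) = M_ZERO
    do ipair = 1, npairs
      fo(ii) = fo(ii) + wpair(ipair)*(fi(ii + ri_pos(ipair, ll)) - fi(ii + ri_neg(ipair, ll)))
    end do

As the pairs are built in order of increasing |weight|, both loops accumulate
the smallest contributions first.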
---
 src/grid/grid.F90              |   3 +-
 src/grid/nl_operator.F90       |  14 +-
 src/grid/nl_operator_inc.F90   |  56 +++-
 src/grid/operate.c             |  73 +++++
 src/grid/operate_antisym_inc.c | 480 ++++++++++++++++++++++++++++
 src/grid/operate_f.F90         |  71 +++++
 src/grid/operate_sym_inc.c     | 553 +++++++++++++++++++++++++++++++++
 src/include/vectors.h          |   8 +
 8 files changed, 1240 insertions(+), 18 deletions(-)
 create mode 100644 src/grid/operate_antisym_inc.c
 create mode 100644 src/grid/operate_sym_inc.c

diff --git a/src/grid/grid.F90 b/src/grid/grid.F90
index 5c90041617..423e9200d7 100644
--- a/src/grid/grid.F90
+++ b/src/grid/grid.F90
@@ -193,6 +193,7 @@ contains
     call initialize_coordinate_system(gr, namespace, space, grid_spacing, latt, n_sites, site_position)
 
     ! initialize derivatives
+    call nl_operator_global_init(namespace)
     call derivatives_init(gr%der, namespace, space, gr%coord_system)
     ! the stencil used to generate the grid is a union of a cube (for
     ! multigrid) and the Laplacian
@@ -254,6 +255,7 @@ contains
     call initialize_coordinate_system(gr, namespace, space, gr%spacing, latt, n_sites, site_position)
 
     ! initialize derivatives
+    call nl_operator_global_init(namespace)
     call derivatives_init(gr%der, namespace, space, gr%coord_system)
     ! the stencil used to generate the grid is a union of a cube (for
     ! multigrid) and the Laplacian
@@ -383,7 +385,6 @@ contains
 
     call mesh_init_stage_3(gr, namespace, space, gr%stencil, mc)
 
-    call nl_operator_global_init(namespace)
     call derivatives_build(gr%der, namespace, space, gr, qvector)
 
     ! print info concerning the grid
diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90
index 453d83b098..3acbed8422 100644
--- a/src/grid/nl_operator.F90
+++ b/src/grid/nl_operator.F90
@@ -157,6 +157,7 @@ module nl_operator_oct_m
   integer :: dfunction_global = -1
   integer :: zfunction_global = -1
   integer :: function_accel
+  logical :: use_symmetries = .false.
 
 contains
 
@@ -204,6 +205,14 @@ contains
       call parse_variable(namespace, 'OperateComplex', default, zfunction_global)
       if (.not. varinfo_valid_option('OperateComplex', zfunction_global)) call messages_input_error(namespace, 'OperateComplex')
 
+      !%Variable OperateUseSymmetries
+      !%Type logical
+      !%Section Execution::Optimization
+      !%Default no
+      !%Description
+      !% This variable selects whether the operators are built using the symmetries of their weights or not.
+      !%End
+      call parse_variable(namespace, 'OperateUseSymmetries', .false., use_symmetries)
 
       if (accel_is_enabled()) then
 
@@ -248,7 +257,10 @@ contains
     PUSH_SUB(nl_operator_init)
 
     op%label = label
-    op%symmetry = optional_default(symm, OP_GENERAL)
+    op%symmetry = OP_GENERAL
+    if (use_symmetries) then
+      op%symmetry = optional_default(symm, OP_GENERAL)
+    end if
 
     POP_SUB(nl_operator_init)
   end subroutine nl_operator_init
diff --git a/src/grid/nl_operator_inc.F90 b/src/grid/nl_operator_inc.F90
index 5e3e8a2e13..a5d3eb1fae 100644
--- a/src/grid/nl_operator_inc.F90
+++ b/src/grid/nl_operator_inc.F90
@@ -33,8 +33,7 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point
   logical :: ghost_update_, profile_, use_accel
   integer :: nri
   integer, pointer :: imin(:), imax(:), ri(:, :)
-! 
real(real64), allocatable :: wre(:), wim(:) - R_BASE, allocatable :: wre(:), wim(:) + R_BASE, allocatable :: wre(:) #ifdef R_TREAL integer, parameter :: logldf = 0 #else @@ -92,26 +91,27 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point call operate_accel() else if (X(function_global) == OP_FORTRAN) then select case(op%symmetry) - case(OP_GENERAL) + case(OP_ANTISYMMETRIC) call group_by_pairs_antisym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg) call operate_const_weights_antisymmetric() - SAFE_DEALLOCATE_A(wpair) - SAFE_DEALLOCATE_A(ri_pos) - SAFE_DEALLOCATE_A(ri_neg) case(OP_SYMMETRIC) call group_by_pairs_sym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg, wcenter) call operate_const_weights_symmetric() - SAFE_DEALLOCATE_A(wpair) - SAFE_DEALLOCATE_A(ri_pos) - SAFE_DEALLOCATE_A(ri_neg) - case(OP_ANTISYMMETRIC) + case(OP_GENERAL) call operate_const_weights() case default ASSERT(.false.) end select else -! for the moment this is not implemented + select case(op%symmetry) + case(OP_SYMMETRIC) + call group_by_pairs_sym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg, wcenter) + case(OP_ANTISYMMETRIC) + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg) + end select + + ! for the moment this is not implemented !$omp parallel private(ini, nri_loc, ist) call multicomm_divide_range_omp(nri, ini, nri_loc) @@ -120,16 +120,38 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point ASSERT(ubound(fi%X(ff_pack), dim = 2) >= op%mesh%np_part) ASSERT(ubound(fo%X(ff_pack), dim = 2) >= op%mesh%np) - call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), & - fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) + select case(op%symmetry) + case(OP_GENERAL) + call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), & + fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) + case(OP_SYMMETRIC) + call X(operate_ri_sym_vec)(npairs, wpair(1), wcenter, nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + imin(ini), imax(ini), fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) + case(OP_ANTISYMMETRIC) + call X(operate_ri_antisym_vec)(npairs, wpair(1), nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + imin(ini), imax(ini), fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) + case default + ASSERT(.false.) 
+ end select else do ist = 1, fi%nst_linear ASSERT(ubound(fi%X(ff_linear), dim=1) == op%mesh%np_part) ASSERT(ubound(fo%X(ff_linear), dim=1) >= op%mesh%np) - call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), & - fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) + select case(op%symmetry) + case(OP_GENERAL) + call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), & + fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) + case(OP_SYMMETRIC) + call X(operate_ri_sym_vec)(npairs, wpair(1), wcenter, nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + imin(ini), imax(ini), fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) + case(OP_ANTISYMMETRIC) + call X(operate_ri_antisym_vec)(npairs, wpair(1), nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + imin(ini), imax(ini), fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) + case default + ASSERT(.false.) + end select end do end if !$omp end parallel @@ -143,7 +165,9 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point end if SAFE_DEALLOCATE_A(wre) - SAFE_DEALLOCATE_A(wim) + SAFE_DEALLOCATE_A(wpair) + SAFE_DEALLOCATE_A(ri_pos) + SAFE_DEALLOCATE_A(ri_neg) if (profile_) call profiling_out(TOSTRING(X(NL_OPERATOR_BATCH))) POP_SUB(X(nl_operator_operate_batch)) diff --git a/src/grid/operate.c b/src/grid/operate.c index 482b8ed3ce..ce5071e9a6 100644 --- a/src/grid/operate.c +++ b/src/grid/operate.c @@ -64,6 +64,57 @@ void FC_FUNC_(doperate_ri_vec, } } +void FC_FUNC_(doperate_ri_sym_vec, + DOPERATE_RI_SYM_VEC)(const int *opn, const double *restrict wpair, const double *restrict wcenter, + const int *opnri, const int *opri_pos, const int *opri_neg, + const int *rimap_inv, const int *rimap_inv_max, + const double *restrict fi, const int *ldfp, + double *restrict fo) { + const size_t ldf = ldfp[0]; + + /* check whether we got aligned vectors or not */ + int aligned = 1; + aligned = aligned && (((long long)fi) % (8 * VEC_SIZE) == 0); + aligned = aligned && (((long long)fo) % (8 * VEC_SIZE) == 0); + aligned = aligned && ((1 << ldf) % VEC_SIZE == 0); + + if (aligned) { +#define ALIGNED +#include "operate_sym_inc.c" +#undef ALIGNED + + } else { + /* not aligned */ + #include "operate_sym_inc.c" + } +} + +void FC_FUNC_(doperate_ri_antisym_vec, + DOPERATE_RI_ANTISYM_VEC)(const int *opn, const double *restrict wpair, + const int *opnri, const int *opri_pos, const int *opri_neg, + const int *rimap_inv, const int *rimap_inv_max, + const double *restrict fi, const int *ldfp, + double *restrict fo) { + const size_t ldf = ldfp[0]; + + /* check whether we got aligned vectors or not */ + int aligned = 1; + aligned = aligned && (((long long)fi) % (8 * VEC_SIZE) == 0); + aligned = aligned && (((long long)fo) % (8 * VEC_SIZE) == 0); + aligned = aligned && ((1 << ldf) % VEC_SIZE == 0); + + if (aligned) { +#define ALIGNED +#include "operate_antisym_inc.c" +#undef ALIGNED + + } else { + /* not aligned */ +#include "operate_antisym_inc.c" + } +} + + /* the same as doperate_ri_vec, but allows giving each an appropriate Fortan interface in which fi and fo are actually complex in Fortran Could be inline, but in that case pgcc will not put it in the symbol table. 
*/
@@ -77,6 +128,28 @@ void FC_FUNC_(zoperate_ri_vec,
 (opn, w, opnri, opri, rimap_inv, rimap_inv_max, fi, ldfp, fo);
 }
 
+void FC_FUNC_(zoperate_ri_sym_vec,
+              ZOPERATE_RI_SYM_VEC)(const int *opn, const double *restrict w,
+                                   const double *restrict wcenter,
+                                   const int *opnri, const int *opri_pos, const int *opri_neg,
+                                   const int *rimap_inv, const int *rimap_inv_max,
+                                   const double *restrict fi, const int *ldfp,
+                                   double *restrict fo) {
+  FC_FUNC_(doperate_ri_sym_vec, DOPERATE_RI_SYM_VEC)
+  (opn, w, wcenter, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo);
+}
+
+void FC_FUNC_(zoperate_ri_antisym_vec,
+              ZOPERATE_RI_ANTISYM_VEC)(const int *opn, const double *restrict w,
+                                       const int *opnri, const int *opri_pos, const int *opri_neg,
+                                       const int *rimap_inv, const int *rimap_inv_max,
+                                       const double *restrict fi, const int *ldfp,
+                                       double *restrict fo) {
+  FC_FUNC_(doperate_ri_antisym_vec, DOPERATE_RI_ANTISYM_VEC)
+  (opn, w, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo);
+}
+
+
 void FC_FUNC_(dgauss_seidel,
               DGAUSS_SEIDEL)(const int *opn, const double *restrict w,
                              const int *opnri, const int *opri,
diff --git a/src/grid/operate_antisym_inc.c b/src/grid/operate_antisym_inc.c
new file mode 100644
index 0000000000..3e14fc74a4
--- /dev/null
+++ b/src/grid/operate_antisym_inc.c
@@ -0,0 +1,480 @@
+/*
+  Copyright (C) 2006 X. Andrade
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2, or (at your option)
+  any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+  02110-1301, USA. 
+ +*/ + +#ifdef ALIGNED +#define LOAD VEC_LD +#define STORE VEC_ST +#else +#define LOAD VEC_LDU +#define STORE VEC_STU +#endif + +{ + const ptrdiff_t npairs = opn[0]; + const ptrdiff_t nri = opnri[0]; +#if DEPTH >= 32 + const ptrdiff_t unroll32 = max1(32 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 16 + const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 8 + const ptrdiff_t unroll8 = max1(8 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 4 + const ptrdiff_t unroll4 = max1(4 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 2 + const ptrdiff_t unroll2 = max1(2 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 1 + const ptrdiff_t unroll1 = max1(1 * VEC_SIZE >> ldf); +#endif + + ptrdiff_t l, i, j; + const int *restrict index_pos; + const int *restrict index_neg; + + for (l = 0; l < nri; l++) { + index_pos = opri_pos + npairs * l; + index_neg = opri_neg + npairs * l; + i = rimap_inv[l]; + +#if DEPTH >= 32 + for (; i < (rimap_inv_max[l] - unroll32 + 1); i += unroll32) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 32 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + register VEC_TYPE a4, a5, a6, a7; + register VEC_TYPE a8, a9, aa, ab; + register VEC_TYPE ac, ad, ae, af; + register VEC_TYPE b0, b1, b2, b3; + register VEC_TYPE b4, b5, b6, b7; + register VEC_TYPE b8, b9, ba, bb; + register VEC_TYPE bc, bd, be, bf; + + a0 = a1 = a2 = a3 = VEC_ZERO; + a4 = a5 = a6 = a7 = VEC_ZERO; + a8 = a9 = aa = ab = VEC_ZERO; + ac = ad = ae = af = VEC_ZERO; + b0 = b1 = b2 = b3 = VEC_ZERO; + b4 = b5 = b6 = b7 = VEC_ZERO; + b8 = b9 = ba = bb = VEC_ZERO; + bc = bd = be = bf = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7); + pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE + k); + a8 = VEC_FMA(wj, VEC_SUB(pos, neg), a8); + pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE + k); + a9 = VEC_FMA(wj, VEC_SUB(pos, neg), a9); + pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE + k); + aa = VEC_FMA(wj, VEC_SUB(pos, neg), aa); + pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE + k); + ab = 
VEC_FMA(wj, VEC_SUB(pos, neg), ab);
+        pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE + k);
+        ac = VEC_FMA(wj, VEC_SUB(pos, neg), ac);
+        pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE + k);
+        ad = VEC_FMA(wj, VEC_SUB(pos, neg), ad);
+        pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE + k);
+        ae = VEC_FMA(wj, VEC_SUB(pos, neg), ae);
+        pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE + k);
+        af = VEC_FMA(wj, VEC_SUB(pos, neg), af);
+        pos = LOAD(fi + indexj_pos + 16 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 16 * VEC_SIZE + k);
+        b0 = VEC_FMA(wj, VEC_SUB(pos, neg), b0);
+        pos = LOAD(fi + indexj_pos + 17 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 17 * VEC_SIZE + k);
+        b1 = VEC_FMA(wj, VEC_SUB(pos, neg), b1);
+        pos = LOAD(fi + indexj_pos + 18 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 18 * VEC_SIZE + k);
+        b2 = VEC_FMA(wj, VEC_SUB(pos, neg), b2);
+        pos = LOAD(fi + indexj_pos + 19 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 19 * VEC_SIZE + k);
+        b3 = VEC_FMA(wj, VEC_SUB(pos, neg), b3);
+        pos = LOAD(fi + indexj_pos + 20 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 20 * VEC_SIZE + k);
+        b4 = VEC_FMA(wj, VEC_SUB(pos, neg), b4);
+        pos = LOAD(fi + indexj_pos + 21 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 21 * VEC_SIZE + k);
+        b5 = VEC_FMA(wj, VEC_SUB(pos, neg), b5);
+        pos = LOAD(fi + indexj_pos + 22 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 22 * VEC_SIZE + k);
+        b6 = VEC_FMA(wj, VEC_SUB(pos, neg), b6);
+        pos = LOAD(fi + indexj_pos + 23 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 23 * VEC_SIZE + k);
+        b7 = VEC_FMA(wj, VEC_SUB(pos, neg), b7);
+        pos = LOAD(fi + indexj_pos + 24 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 24 * VEC_SIZE + k);
+        b8 = VEC_FMA(wj, VEC_SUB(pos, neg), b8);
+        pos = LOAD(fi + indexj_pos + 25 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 25 * VEC_SIZE + k);
+        b9 = VEC_FMA(wj, VEC_SUB(pos, neg), b9);
+        pos = LOAD(fi + indexj_pos + 26 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 26 * VEC_SIZE + k);
+        ba = VEC_FMA(wj, VEC_SUB(pos, neg), ba);
+        pos = LOAD(fi + indexj_pos + 27 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 27 * VEC_SIZE + k);
+        bb = VEC_FMA(wj, VEC_SUB(pos, neg), bb);
+        pos = LOAD(fi + indexj_pos + 28 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 28 * VEC_SIZE + k);
+        bc = VEC_FMA(wj, VEC_SUB(pos, neg), bc);
+        pos = LOAD(fi + indexj_pos + 29 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 29 * VEC_SIZE + k);
+        bd = VEC_FMA(wj, VEC_SUB(pos, neg), bd);
+        pos = LOAD(fi + indexj_pos + 30 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 30 * VEC_SIZE + k);
+        be = VEC_FMA(wj, VEC_SUB(pos, neg), be);
+        pos = LOAD(fi + indexj_pos + 31 * VEC_SIZE + k);
+        neg = LOAD(fi + indexj_neg + 31 * VEC_SIZE + k);
+        bf = VEC_FMA(wj, VEC_SUB(pos, neg), bf);
+      }
+      STORE(fo + (i << ldf) + k, a0);
+      STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1);
+      STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2);
+      STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3);
+      STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4);
+      STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5);
+      STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6);
+      STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7);
+      STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8);
+      STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9);
+      STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa);
+      STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab);
+      STORE(fo + (i << ldf) + 12 * VEC_SIZE 
+ k, ac); + STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); + STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); + STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); + STORE(fo + (i << ldf) + 16 * VEC_SIZE + k, b0); + STORE(fo + (i << ldf) + 17 * VEC_SIZE + k, b1); + STORE(fo + (i << ldf) + 18 * VEC_SIZE + k, b2); + STORE(fo + (i << ldf) + 19 * VEC_SIZE + k, b3); + STORE(fo + (i << ldf) + 20 * VEC_SIZE + k, b4); + STORE(fo + (i << ldf) + 21 * VEC_SIZE + k, b5); + STORE(fo + (i << ldf) + 22 * VEC_SIZE + k, b6); + STORE(fo + (i << ldf) + 23 * VEC_SIZE + k, b7); + STORE(fo + (i << ldf) + 24 * VEC_SIZE + k, b8); + STORE(fo + (i << ldf) + 25 * VEC_SIZE + k, b9); + STORE(fo + (i << ldf) + 26 * VEC_SIZE + k, ba); + STORE(fo + (i << ldf) + 27 * VEC_SIZE + k, bb); + STORE(fo + (i << ldf) + 28 * VEC_SIZE + k, bc); + STORE(fo + (i << ldf) + 29 * VEC_SIZE + k, bd); + STORE(fo + (i << ldf) + 30 * VEC_SIZE + k, be); + STORE(fo + (i << ldf) + 31 * VEC_SIZE + k, bf); + } + } +#endif + +#if DEPTH >= 16 + for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 16 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + register VEC_TYPE a4, a5, a6, a7; + register VEC_TYPE a8, a9, aa, ab; + register VEC_TYPE ac, ad, ae, af; + + a0 = a1 = a2 = a3 = VEC_ZERO; + a4 = a5 = a6 = a7 = VEC_ZERO; + a8 = a9 = aa = ab = VEC_ZERO; + ac = ad = ae = af = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7); + pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE + k); + a8 = VEC_FMA(wj, VEC_SUB(pos, neg), a8); + pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE + k); + a9 = VEC_FMA(wj, VEC_SUB(pos, neg), a9); + pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE + k); + aa = VEC_FMA(wj, VEC_SUB(pos, neg), aa); + pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE + k); + ab = VEC_FMA(wj, VEC_SUB(pos, neg), ab); + pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE + k); + ac = VEC_FMA(wj, VEC_SUB(pos, neg), ac); + pos = 
LOAD(fi + indexj_pos + 13 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE + k); + ad = VEC_FMA(wj, VEC_SUB(pos, neg), ad); + pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE + k); + ae = VEC_FMA(wj, VEC_SUB(pos, neg), ae); + pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE + k); + af = VEC_FMA(wj, VEC_SUB(pos, neg), af); + + } + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); + STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); + STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); + STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); + STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); + STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); + STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9); + STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); + STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); + STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); + STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); + STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); + STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); + } + } +#endif + +#if DEPTH >= 8 + for (; i < (rimap_inv_max[l] - unroll8 + 1); i += unroll8) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 8 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + register VEC_TYPE a4, a5, a6, a7; + + a0 = a1 = a2 = a3 = VEC_ZERO; + a4 = a5 = a6 = a7 = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7); + } + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); + STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); + STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); + STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); + STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); + } + } +#endif + +#if DEPTH >= 4 + for (; i < (rimap_inv_max[l] - unroll4 + 1); i += unroll4) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 4 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + + a0 = a1 = a2 = a3 = VEC_ZERO; + + for (j = 
0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); + } + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); + STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + } + } +#endif + +#if DEPTH >= 2 + for (; i < (rimap_inv_max[l] - unroll2 + 1); i += unroll2) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 2 * VEC_SIZE) { + register VEC_TYPE a0, a1; + + a0 = a1 = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); + } + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + } + } +#endif + +#if DEPTH >= 1 + for (; i < (rimap_inv_max[l] - unroll1 + 1); i += unroll1) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += VEC_SIZE) { + register VEC_TYPE a0 = VEC_ZERO; + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + } + STORE(fo + (i << ldf) + k, a0); + } + } +#endif + +#if VEC_SIZE > 1 + for (; i < rimap_inv_max[l]; i++) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k++) { + double a = 0.0; + for (j = 0; j < npairs; j++) + a += wpair[j] * ( fi[((index_pos[j] + i) << ldf) + k] - fi[((index_neg[j] + i) << ldf) + k]); + fo[(i << ldf) + k] = a; + } + } +#endif + + } /* l */ + + // this fence instruction is needed to ensure correctness when using + // non-temporal stores +#if defined(ALIGNED) && defined(FENCE) + FENCE; +#endif +} + +#undef LOAD +#undef STORE diff --git a/src/grid/operate_f.F90 b/src/grid/operate_f.F90 index f17b4b1a7d..51c467b6b2 100644 --- a/src/grid/operate_f.F90 +++ b/src/grid/operate_f.F90 @@ -61,6 +61,77 @@ module operate_f_oct_m end subroutine zoperate_ri_vec end interface + interface + subroutine doperate_ri_sym_vec(opn, w, wcenter, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo) + import :: real64 + implicit none + integer, intent(in) :: opn + real(real64), intent(in) :: w + real(real64), intent(in) :: wcenter + 
integer, intent(in) :: opnri + integer, intent(in) :: opri_pos + integer, intent(in) :: opri_neg + integer, intent(in) :: rimap_inv + integer, intent(in) :: rimap_inv_max + real(real64), intent(in) :: fi + integer, intent(in) :: ldfp + real(real64), intent(inout) :: fo + end subroutine doperate_ri_sym_vec + end interface + + interface + subroutine zoperate_ri_sym_vec(opn, w, wcenter, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo) + import :: real64 + implicit none + integer, intent(in) :: opn + real(real64), intent(in) :: w + real(real64), intent(in) :: wcenter + integer, intent(in) :: opnri + integer, intent(in) :: opri_pos + integer, intent(in) :: opri_neg + integer, intent(in) :: rimap_inv + integer, intent(in) :: rimap_inv_max + complex(real64), intent(in) :: fi + integer, intent(in) :: ldfp + complex(real64), intent(inout) :: fo + end subroutine zoperate_ri_sym_vec + end interface + + interface + subroutine doperate_ri_antisym_vec(opn, w, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo) + import :: real64 + implicit none + integer, intent(in) :: opn + real(real64), intent(in) :: w + integer, intent(in) :: opnri + integer, intent(in) :: opri_pos + integer, intent(in) :: opri_neg + integer, intent(in) :: rimap_inv + integer, intent(in) :: rimap_inv_max + real(real64), intent(in) :: fi + integer, intent(in) :: ldfp + real(real64), intent(inout) :: fo + end subroutine doperate_ri_antisym_vec + end interface + + interface + subroutine zoperate_ri_antisym_vec(opn, w, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo) + import :: real64 + implicit none + integer, intent(in) :: opn + real(real64), intent(in) :: w + integer, intent(in) :: opnri + integer, intent(in) :: opri_pos + integer, intent(in) :: opri_neg + integer, intent(in) :: rimap_inv + integer, intent(in) :: rimap_inv_max + complex(real64), intent(in) :: fi + integer, intent(in) :: ldfp + complex(real64), intent(inout) :: fo + end subroutine zoperate_ri_antisym_vec + end interface + + interface subroutine dgauss_seidel(opn, w, opnri, opri, rimap_inv, rimap_inv_max, factor, pot, rho) import :: real64 diff --git a/src/grid/operate_sym_inc.c b/src/grid/operate_sym_inc.c new file mode 100644 index 0000000000..bf89e9ba67 --- /dev/null +++ b/src/grid/operate_sym_inc.c @@ -0,0 +1,553 @@ +/* + Copyright (C) 2006 X. Andrade + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. 
+ +*/ + +#ifdef ALIGNED +#define LOAD VEC_LD +#define STORE VEC_ST +#else +#define LOAD VEC_LDU +#define STORE VEC_STU +#endif + +{ + const ptrdiff_t npairs = opn[0]; + const ptrdiff_t nri = opnri[0]; +#if DEPTH >= 32 + const ptrdiff_t unroll32 = max1(32 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 16 + const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 8 + const ptrdiff_t unroll8 = max1(8 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 4 + const ptrdiff_t unroll4 = max1(4 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 2 + const ptrdiff_t unroll2 = max1(2 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 1 + const ptrdiff_t unroll1 = max1(1 * VEC_SIZE >> ldf); +#endif + + ptrdiff_t l, i, j; + const int *restrict index_pos; + const int *restrict index_neg; + + for (l = 0; l < nri; l++) { + index_pos = opri_pos + npairs * l; + index_neg = opri_neg + npairs * l; + i = rimap_inv[l]; + +#if DEPTH >= 32 + for (; i < (rimap_inv_max[l] - unroll32 + 1); i += unroll32) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 32 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + register VEC_TYPE a4, a5, a6, a7; + register VEC_TYPE a8, a9, aa, ab; + register VEC_TYPE ac, ad, ae, af; + register VEC_TYPE b0, b1, b2, b3; + register VEC_TYPE b4, b5, b6, b7; + register VEC_TYPE b8, b9, ba, bb; + register VEC_TYPE bc, bd, be, bf; + + a0 = a1 = a2 = a3 = VEC_ZERO; + a4 = a5 = a6 = a7 = VEC_ZERO; + a8 = a9 = aa = ab = VEC_ZERO; + ac = ad = ae = af = VEC_ZERO; + b0 = b1 = b2 = b3 = VEC_ZERO; + b4 = b5 = b6 = b7 = VEC_ZERO; + b8 = b9 = ba = bb = VEC_ZERO; + bc = bd = be = bf = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7); + pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE + k); + a8 = VEC_FMA(wj, VEC_ADD(pos, neg), a8); + pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE + k); + a9 = VEC_FMA(wj, VEC_ADD(pos, neg), a9); + pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE + k); + aa = VEC_FMA(wj, VEC_ADD(pos, neg), aa); + pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE + k); + ab = 
VEC_FMA(wj, VEC_ADD(pos, neg), ab); + pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE + k); + ac = VEC_FMA(wj, VEC_ADD(pos, neg), ac); + pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE + k); + ad = VEC_FMA(wj, VEC_ADD(pos, neg), ad); + pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE + k); + ae = VEC_FMA(wj, VEC_ADD(pos, neg), ae); + pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE + k); + af = VEC_FMA(wj, VEC_ADD(pos, neg), af); + pos = LOAD(fi + indexj_pos + 16 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 16 * VEC_SIZE + k); + b0 = VEC_FMA(wj, VEC_ADD(pos, neg), b0); + pos = LOAD(fi + indexj_pos + 17 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 17 * VEC_SIZE + k); + b1 = VEC_FMA(wj, VEC_ADD(pos, neg), b1); + pos = LOAD(fi + indexj_pos + 18 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 18 * VEC_SIZE + k); + b2 = VEC_FMA(wj, VEC_ADD(pos, neg), b2); + pos = LOAD(fi + indexj_pos + 19 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 19 * VEC_SIZE + k); + b3 = VEC_FMA(wj, VEC_ADD(pos, neg), b3); + pos = LOAD(fi + indexj_pos + 20 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 20 * VEC_SIZE + k); + b4 = VEC_FMA(wj, VEC_ADD(pos, neg), b4); + pos = LOAD(fi + indexj_pos + 21 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 21 * VEC_SIZE + k); + b5 = VEC_FMA(wj, VEC_ADD(pos, neg), b5); + pos = LOAD(fi + indexj_pos + 22 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 22 * VEC_SIZE + k); + b6 = VEC_FMA(wj, VEC_ADD(pos, neg), b6); + pos = LOAD(fi + indexj_pos + 23 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 23 * VEC_SIZE + k); + b7 = VEC_FMA(wj, VEC_ADD(pos, neg), b7); + pos = LOAD(fi + indexj_pos + 24 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 24 * VEC_SIZE + k); + b8 = VEC_FMA(wj, VEC_ADD(pos, neg), b8); + pos = LOAD(fi + indexj_pos + 25 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 25 * VEC_SIZE + k); + b9 = VEC_FMA(wj, VEC_ADD(pos, neg), b9); + pos = LOAD(fi + indexj_pos + 26 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 26 * VEC_SIZE + k); + ba = VEC_FMA(wj, VEC_ADD(pos, neg), ba); + pos = LOAD(fi + indexj_pos + 27 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 27 * VEC_SIZE + k); + bb = VEC_FMA(wj, VEC_ADD(pos, neg), bb); + pos = LOAD(fi + indexj_pos + 28 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 28 * VEC_SIZE + k); + bc = VEC_FMA(wj, VEC_ADD(pos, neg), bc); + pos = LOAD(fi + indexj_pos + 29 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 29 * VEC_SIZE + k); + bd = VEC_FMA(wj, VEC_ADD(pos, neg), bd); + pos = LOAD(fi + indexj_pos + 30 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 30 * VEC_SIZE + k); + be = VEC_FMA(wj, VEC_ADD(pos, neg), be); + pos = LOAD(fi + indexj_pos + 31 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 31 * VEC_SIZE + k); + bf = VEC_FMA(wj, VEC_ADD(pos, neg), bf); + } + register VEC_TYPE wj = VEC_SCAL(wcenter[0]); + a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); + a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); + a2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 2 * VEC_SIZE + k), a2); + a3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 3 * VEC_SIZE + k), a3); + a4 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 4 * VEC_SIZE + k), a4); + a5 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 5 * VEC_SIZE + k), a5); + a6 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 6 * VEC_SIZE + k), a6); + a7 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 7 * VEC_SIZE + k), a7); + a8 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 8 * VEC_SIZE
+ k), a8); + a9 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 9 * VEC_SIZE + k), a9); + aa = VEC_FMA(wj, LOAD(fi + (i << ldf) + 10 * VEC_SIZE + k), aa); + ab = VEC_FMA(wj, LOAD(fi + (i << ldf) + 11 * VEC_SIZE + k), ab); + ac = VEC_FMA(wj, LOAD(fi + (i << ldf) + 12 * VEC_SIZE + k), ac); + ad = VEC_FMA(wj, LOAD(fi + (i << ldf) + 13 * VEC_SIZE + k), ad); + ae = VEC_FMA(wj, LOAD(fi + (i << ldf) + 14 * VEC_SIZE + k), ae); + af = VEC_FMA(wj, LOAD(fi + (i << ldf) + 15 * VEC_SIZE + k), af); + b0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 16 * VEC_SIZE + k), b0); + b1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 17 * VEC_SIZE + k), b1); + b2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 18 * VEC_SIZE + k), b2); + b3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 19 * VEC_SIZE + k), b3); + b4 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 20 * VEC_SIZE + k), b4); + b5 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 21 * VEC_SIZE + k), b5); + b6 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 22 * VEC_SIZE + k), b6); + b7 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 23 * VEC_SIZE + k), b7); + b8 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 24 * VEC_SIZE + k), b8); + b9 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 25 * VEC_SIZE + k), b9); + ba = VEC_FMA(wj, LOAD(fi + (i << ldf) + 26 * VEC_SIZE + k), ba); + bb = VEC_FMA(wj, LOAD(fi + (i << ldf) + 27 * VEC_SIZE + k), bb); + bc = VEC_FMA(wj, LOAD(fi + (i << ldf) + 28 * VEC_SIZE + k), bc); + bd = VEC_FMA(wj, LOAD(fi + (i << ldf) + 29 * VEC_SIZE + k), bd); + be = VEC_FMA(wj, LOAD(fi + (i << ldf) + 30 * VEC_SIZE + k), be); + bf = VEC_FMA(wj, LOAD(fi + (i << ldf) + 31 * VEC_SIZE + k), bf); + + + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); + STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); + STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); + STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); + STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); + STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); + STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9); + STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); + STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); + STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); + STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); + STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); + STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); + STORE(fo + (i << ldf) + 16 * VEC_SIZE + k, b0); + STORE(fo + (i << ldf) + 17 * VEC_SIZE + k, b1); + STORE(fo + (i << ldf) + 18 * VEC_SIZE + k, b2); + STORE(fo + (i << ldf) + 19 * VEC_SIZE + k, b3); + STORE(fo + (i << ldf) + 20 * VEC_SIZE + k, b4); + STORE(fo + (i << ldf) + 21 * VEC_SIZE + k, b5); + STORE(fo + (i << ldf) + 22 * VEC_SIZE + k, b6); + STORE(fo + (i << ldf) + 23 * VEC_SIZE + k, b7); + STORE(fo + (i << ldf) + 24 * VEC_SIZE + k, b8); + STORE(fo + (i << ldf) + 25 * VEC_SIZE + k, b9); + STORE(fo + (i << ldf) + 26 * VEC_SIZE + k, ba); + STORE(fo + (i << ldf) + 27 * VEC_SIZE + k, bb); + STORE(fo + (i << ldf) + 28 * VEC_SIZE + k, bc); + STORE(fo + (i << ldf) + 29 * VEC_SIZE + k, bd); + STORE(fo + (i << ldf) + 30 * VEC_SIZE + k, be); + STORE(fo + (i << ldf) + 31 * VEC_SIZE + k, bf); + } + } +#endif + +#if DEPTH >= 16 + for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 16 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + register VEC_TYPE a4, a5, a6, a7; + register VEC_TYPE a8, a9, aa, ab; + register VEC_TYPE ac, ad, ae, af; + + a0 = a1 = a2 = a3 = VEC_ZERO; + a4 = a5 = a6 = a7 = VEC_ZERO; + a8 = a9 = aa = ab = VEC_ZERO; + ac = ad = ae 
= af = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7); + pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE + k); + a8 = VEC_FMA(wj, VEC_ADD(pos, neg), a8); + pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE + k); + a9 = VEC_FMA(wj, VEC_ADD(pos, neg), a9); + pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE + k); + aa = VEC_FMA(wj, VEC_ADD(pos, neg), aa); + pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE + k); + ab = VEC_FMA(wj, VEC_ADD(pos, neg), ab); + pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE + k); + ac = VEC_FMA(wj, VEC_ADD(pos, neg), ac); + pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE + k); + ad = VEC_FMA(wj, VEC_ADD(pos, neg), ad); + pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE + k); + ae = VEC_FMA(wj, VEC_ADD(pos, neg), ae); + pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE + k); + af = VEC_FMA(wj, VEC_ADD(pos, neg), af); + + } + register VEC_TYPE wj = VEC_SCAL(wcenter[0]); + a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); + a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); + a2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 2 * VEC_SIZE + k), a2); + a3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 3 * VEC_SIZE + k), a3); + a4 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 4 * VEC_SIZE + k), a4); + a5 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 5 * VEC_SIZE + k), a5); + a6 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 6 * VEC_SIZE + k), a6); + a7 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 7 * VEC_SIZE + k), a7); + a8 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 8 * VEC_SIZE + k), a8); + a9 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 9 * VEC_SIZE + k), a9); + aa = VEC_FMA(wj, LOAD(fi + (i << ldf) + 10 * VEC_SIZE + k), aa); + ab = VEC_FMA(wj, LOAD(fi + (i << ldf) + 11 * VEC_SIZE + k), ab); + ac = VEC_FMA(wj, LOAD(fi + (i << ldf) + 12 * VEC_SIZE + k), ac); + ad = VEC_FMA(wj, LOAD(fi + (i << ldf) + 13 * VEC_SIZE + k), 
ad); + ae = VEC_FMA(wj, LOAD(fi + (i << ldf) + 14 * VEC_SIZE + k), ae); + af = VEC_FMA(wj, LOAD(fi + (i << ldf) + 15 * VEC_SIZE + k), af); + + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); + STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); + STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); + STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); + STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); + STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); + STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9); + STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); + STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); + STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); + STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); + STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); + STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); + } + } +#endif + +#if DEPTH >= 8 + for (; i < (rimap_inv_max[l] - unroll8 + 1); i += unroll8) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 8 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + register VEC_TYPE a4, a5, a6, a7; + + a0 = a1 = a2 = a3 = VEC_ZERO; + a4 = a5 = a6 = a7 = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7); + } + register VEC_TYPE wj = VEC_SCAL(wcenter[0]); + a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); + a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); + a2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 2 * VEC_SIZE + k), a2); + a3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 3 * VEC_SIZE + k), a3); + a4 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 4 * VEC_SIZE + k), a4); + a5 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 5 * VEC_SIZE + k), a5); + a6 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 6 * VEC_SIZE + k), a6); + a7 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 7 * VEC_SIZE + k), a7); + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); + STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); + STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); + STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); + STORE(fo + (i << ldf) + 7 
* VEC_SIZE + k, a7); + } + } +#endif + +#if DEPTH >= 4 + for (; i < (rimap_inv_max[l] - unroll4 + 1); i += unroll4) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 4 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + + a0 = a1 = a2 = a3 = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); + } + register VEC_TYPE wj = VEC_SCAL(wcenter[0]); + a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); + a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); + a2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 2 * VEC_SIZE + k), a2); + a3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 3 * VEC_SIZE + k), a3); + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); + STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + } + } +#endif + +#if DEPTH >= 2 + for (; i < (rimap_inv_max[l] - unroll2 + 1); i += unroll2) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 2 * VEC_SIZE) { + register VEC_TYPE a0, a1; + + a0 = a1 = VEC_ZERO; + + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); + } + register VEC_TYPE wj = VEC_SCAL(wcenter[0]); + a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); + a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); + STORE(fo + (i << ldf) + k, a0); + STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + } + } +#endif + +#if DEPTH >= 1 + for (; i < (rimap_inv_max[l] - unroll1 + 1); i += unroll1) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += VEC_SIZE) { + register VEC_TYPE a0 = VEC_ZERO; + for (j = 0; j < npairs; j++) { +#ifdef VEC_SCAL_LD + register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); +#else + register VEC_TYPE wj = VEC_SCAL(wpair[j]); +#endif + ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; + ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; + register VEC_TYPE pos = LOAD(fi + indexj_pos + k); + register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + } + register VEC_TYPE wj = VEC_SCAL(wcenter[0]); + a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); + STORE(fo + (i << ldf) + k, a0); + } + } +#endif + +#if VEC_SIZE > 1 + for (; i < rimap_inv_max[l]; i++) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k++) { + double a = 0.0; + for (j = 0; j < npairs; j++) + a += wpair[j] * ( 
fi[((index_pos[j] + i) << ldf) + k] + fi[((index_neg[j] + i) << ldf) + k]); + a += wcenter[0] * fi[(i << ldf) + k]; + fo[(i << ldf) + k] = a; + } + } +#endif + + } /* l */ + + // this fence instruction is needed to ensure correctness when using + // non-temporal stores +#if defined(ALIGNED) && defined(FENCE) + FENCE; +#endif +} + +#undef LOAD +#undef STORE diff --git a/src/include/vectors.h b/src/include/vectors.h index 4b85e9a7c6..3c15558890 100644 --- a/src/include/vectors.h +++ b/src/include/vectors.h @@ -41,6 +41,8 @@ #define VEC_STU(addr, vec) _mm512_storeu_pd(addr, vec) #define VEC_FMA(aa, bb, cc) _mm512_fmadd_pd(aa, bb, cc) #define VEC_SCAL(aa) _mm512_set1_pd(aa) +#define VEC_ADD(aa, bb) _mm512_add_pd(aa, bb) +#define VEC_SUB(aa, bb) _mm512_sub_pd(aa, bb) #define VEC_ZERO _mm512_setzero_pd() #include #define FENCE _mm_mfence() @@ -73,6 +75,8 @@ #define VEC_FMA(aa, bb, cc) _mm256_add_pd(cc, _mm256_mul_pd(aa, bb)) #endif #define VEC_SCAL(aa) _mm256_set1_pd(aa) +#define VEC_ADD(aa, bb) _mm256_add_pd(aa, bb) +#define VEC_SUB(aa, bb) _mm256_sub_pd(aa, bb) #define VEC_ZERO _mm256_setzero_pd() #include #define FENCE _mm_mfence() @@ -101,6 +105,8 @@ #define VEC_FMA(aa, bb, cc) _mm_add_pd(cc, _mm_mul_pd(aa, bb)) #endif #define VEC_SCAL(aa) _mm_set1_pd(aa) +#define VEC_ADD(aa, bb) _mm_add_pd(aa, bb) +#define VEC_SUB(aa, bb) _mm_sub_pd(aa, bb) #define VEC_ZERO _mm_setzero_pd() #define FENCE _mm_mfence() @@ -120,6 +126,8 @@ #define VEC_STU(addr, vec) VEC_ST(addr, vec) #define VEC_FMA(aa, bb, cc) aa *bb + cc #define VEC_SCAL(aa) aa +#define VEC_ADD(aa, bb) ((aa) + (bb)) +#define VEC_SUB(aa, bb) ((aa) - (bb)) #define VEC_ZERO 0.0 #define DEPTH 8 -- GitLab From d79027e96c97eeb698b966e3afcbb1c5c786b417 Mon Sep 17 00:00:00 2001 From: NicolasTD Date: Fri, 12 Dec 2025 21:24:43 +0100 Subject: [PATCH 03/11] Store the pairs of points to avoid multiple recomputations. --- src/grid/derivatives.F90 | 3 +++ src/grid/nl_operator.F90 | 50 +++++++++++++++++++++++++++++++++++- src/grid/nl_operator_inc.F90 | 45 +++++++++++++++++--------------- 3 files changed, 76 insertions(+), 22 deletions(-) diff --git a/src/grid/derivatives.F90 b/src/grid/derivatives.F90 index 3392f2425b..5d3ff64ee4 100644 --- a/src/grid/derivatives.F90 +++ b/src/grid/derivatives.F90 @@ -893,6 +893,9 @@ contains if (der%remove_zero_weight_points) then call nl_operator_remove_zero_weight_points(op(i), space, der%mesh) end if + if (op(i)%const_w) then + call nl_operator_build_symmetric_weights(op(i)) + end if call nl_operator_output_weights(op(i)) end do diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index 3acbed8422..3a7c6a4fee 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -67,7 +67,8 @@ module nl_operator_oct_m nl_operator_np_zero_bc, & nl_operator_allocate_gpu_buffers, & nl_operator_update_gpu_buffers, & - nl_operator_remove_zero_weight_points + nl_operator_remove_zero_weight_points, & + nl_operator_build_symmetric_weights !> @brief index type for non-local operators !!
@@ -79,6 +80,8 @@ module nl_operator_oct_m integer, allocatable :: imin(:) integer, allocatable :: imax(:) integer, allocatable :: ri(:, :) + integer, allocatable :: ri_pos(:,:) + integer, allocatable :: ri_neg(:,:) end type nl_operator_index_t integer, public, parameter :: & @@ -111,6 +114,13 @@ module nl_operator_oct_m integer, allocatable, public :: rimap(:) integer, allocatable, public :: rimap_inv(:) + !> For symmetry-based stencils + integer :: npairs = 0 + real(real64),allocatable :: wpair(:) + real(real64) :: wcenter + integer, allocatable :: ri_pos(:,:) + integer, allocatable :: ri_neg(:,:) + integer :: ninner = 0 integer :: nouter = 0 @@ -297,6 +307,14 @@ contains SAFE_ALLOCATE_SOURCE_A(opo%rimap, opi%rimap) SAFE_ALLOCATE_SOURCE_A(opo%rimap_inv, opi%rimap_inv) + opo%npairs = opi%npairs + if (opi%symmetry /= OP_GENERAL) then + opo%wcenter = opi%wcenter + SAFE_ALLOCATE_SOURCE_A(opo%wpair, opi%wpair) + SAFE_ALLOCATE_SOURCE_A(opo%ri_pos, opi%ri_pos) + SAFE_ALLOCATE_SOURCE_A(opo%ri_neg, opi%ri_neg) + end if + if (opi%mesh%parallel_in_domains) then opo%inner%nri = opi%inner%nri SAFE_ALLOCATE_SOURCE_A(opo%inner%imin, opi%inner%imin) @@ -650,6 +668,10 @@ contains SAFE_DEALLOCATE_A(op%rimap_inv) SAFE_DEALLOCATE_A(op%nn) + SAFE_DEALLOCATE_A(op%wpair) + SAFE_DEALLOCATE_A(op%ri_pos) + SAFE_DEALLOCATE_A(op%ri_neg) + call stencil_end(op%stencil) POP_SUB(nl_operator_end) @@ -939,6 +961,32 @@ subroutine group_by_pairs_antisym(size, offsets, wre, ri, nri, npairs, wpair, pa POP_SUB(group_by_pairs_antisym) end subroutine group_by_pairs_antisym +subroutine nl_operator_build_symmetric_weights(op) + type(nl_operator_t), intent(inout) :: op + + PUSH_SUB(nl_operator_build_symmetric_weights) + + ASSERT(op%const_w) + + select case(op%symmetry) + case(OP_SYMMETRIC) + call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%ri, op%nri, & + op%npairs, op%wpair, op%ri_pos, op%ri_neg, op%wcenter) + call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & + op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg, op%wcenter) + call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & + op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg, op%wcenter) + case(OP_ANTISYMMETRIC) + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%ri, op%nri, & + op%npairs, op%wpair, op%ri_pos, op%ri_neg) + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & + op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg) + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & + op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg) + end select + + POP_SUB(nl_operator_build_symmetric_weights) +end subroutine nl_operator_build_symmetric_weights #include "undef.F90" #include "real.F90" diff --git a/src/grid/nl_operator_inc.F90 b/src/grid/nl_operator_inc.F90 index a5d3eb1fae..e7c885e33b 100644 --- a/src/grid/nl_operator_inc.F90 +++ b/src/grid/nl_operator_inc.F90 @@ -41,9 +41,8 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point #endif integer :: nri_loc, ini !> Symmetry of the stencil can be used to group points by pairs - integer :: npairs real(real64), allocatable :: wpair(:) - integer, allocatable :: ri_pos(:,:), ri_neg(:,:) + integer, pointer :: ri_pos(:,:), ri_neg(:,:) real(real64) :: wcenter PUSH_SUB(X(nl_operator_operate_batch)) @@ -79,6 +78,15 @@ subroutine 
X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point if (present(factor)) then wre(1:op%stencil%size) = wre(1:op%stencil%size)*factor end if + + if (op%symmetry /= OP_GENERAL) then + wpair = op%wpair + wcenter = op%wcenter + if (present(factor)) then + wpair = wpair * factor + wcenter = wcenter * factor + end if + end if end if use_accel = .false. @@ -92,10 +100,8 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point else if (X(function_global) == OP_FORTRAN) then select case(op%symmetry) case(OP_ANTISYMMETRIC) - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg) call operate_const_weights_antisymmetric() case(OP_SYMMETRIC) - call group_by_pairs_sym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg, wcenter) call operate_const_weights_symmetric() case(OP_GENERAL) call operate_const_weights() @@ -104,13 +110,6 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point end select else - select case(op%symmetry) - case(OP_SYMMETRIC) - call group_by_pairs_sym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg, wcenter) - case(OP_ANTISYMMETRIC) - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, wre, ri, nri, npairs, wpair, ri_pos, ri_neg) - end select - ! for the moment this is not implemented !$omp parallel private(ini, nri_loc, ist) call multicomm_divide_range_omp(nri, ini, nri_loc) @@ -125,10 +124,10 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), & fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) case(OP_SYMMETRIC) - call X(operate_ri_sym_vec)(npairs, wpair(1), wcenter, nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + call X(operate_ri_sym_vec)(op%npairs, wpair(1), wcenter, nri_loc, ri_pos(1, ini), ri_neg(1, ini), & imin(ini), imax(ini), fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) case(OP_ANTISYMMETRIC) - call X(operate_ri_antisym_vec)(npairs, wpair(1), nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + call X(operate_ri_antisym_vec)(op%npairs, wpair(1), nri_loc, ri_pos(1, ini), ri_neg(1, ini), & imin(ini), imax(ini), fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) case default ASSERT(.false.) @@ -144,10 +143,10 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), & fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) case(OP_SYMMETRIC) - call X(operate_ri_sym_vec)(npairs, wpair(1), wcenter, nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + call X(operate_ri_sym_vec)(op%npairs, wpair(1), wcenter, nri_loc, ri_pos(1, ini), ri_neg(1, ini), & imin(ini), imax(ini), fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) case(OP_ANTISYMMETRIC) - call X(operate_ri_antisym_vec)(npairs, wpair(1), nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + call X(operate_ri_antisym_vec)(op%npairs, wpair(1), nri_loc, ri_pos(1, ini), ri_neg(1, ini), & imin(ini), imax(ini), fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) case default ASSERT(.false.) 
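[For readers following the series: what group_by_pairs_sym precomputes and X(operate_ri_sym_vec) consumes is the regrouped stencil sum fo(i) = wcenter*fi(i) + sum_p wpair(p)*(fi(i+pos_p) + fi(i+neg_p)); in the antisymmetric (gradient-like) case the center term vanishes and the inner '+' becomes '-'. Below is a minimal scalar C sketch of this paired evaluation; all names are illustrative, boundary/ghost-point handling is omitted, and this is not the actual Octopus interface.]

#include <stddef.h>

/* Scalar sketch of the paired stencil application (illustrative only).
   For a symmetric operator the +s and -s stencil offsets share one
   weight, so a single multiplication serves both points of the pair;
   the center point is added once with its own weight. */
static void stencil_sym_scalar(ptrdiff_t np, ptrdiff_t npairs, double wcenter,
                               const double *wpair, const ptrdiff_t *pos,
                               const ptrdiff_t *neg, const double *fi,
                               double *fo) {
  for (ptrdiff_t i = 0; i < np; i++) {
    double acc = 0.0;
    for (ptrdiff_t p = 0; p < npairs; p++) {
      acc += wpair[p] * (fi[i + pos[p]] + fi[i + neg[p]]); /* paired term */
    }
    fo[i] = acc + wcenter * fi[i]; /* center contribution added once */
  }
}

[Besides halving the number of weight multiplications, this fixes the order in which the pair contributions enter the accumulator, which is the summation order the unrolled sym/antisym kernels above reproduce in SIMD registers.]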
@@ -166,8 +165,6 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point SAFE_DEALLOCATE_A(wre) SAFE_DEALLOCATE_A(wpair) - SAFE_DEALLOCATE_A(ri_pos) - SAFE_DEALLOCATE_A(ri_neg) if (profile_) call profiling_out(TOSTRING(X(NL_OPERATOR_BATCH))) POP_SUB(X(nl_operator_operate_batch)) @@ -185,16 +182,22 @@ contains imin => op%rimap_inv(1:) imax => op%rimap_inv(2:) ri => op%ri + ri_pos => op%ri_pos + ri_neg => op%ri_neg case (OP_INNER) nri = op%inner%nri imin => op%inner%imin imax => op%inner%imax ri => op%inner%ri + ri_pos => op%inner%ri_pos + ri_neg => op%inner%ri_neg case (OP_OUTER) nri = op%outer%nri imin => op%outer%imin imax => op%outer%imax ri => op%outer%ri + ri_pos => op%outer%ri_pos + ri_neg => op%outer%ri_neg case default ASSERT(.false.) end select @@ -270,7 +273,7 @@ contains do ii = imin(ll) + 1, imax(ll) do ist = 1, fi%nst_linear fo%X(ff_linear)(ii, ist) = M_ZERO - do ipair = 1, npairs + do ipair = 1, op%npairs fo%X(ff_linear)(ii, ist) = fo%X(ff_linear)(ii, ist) + & wpair(ipair)*(fi%X(ff_linear)(ii + ri_pos(ipair, ll), ist) - fi%X(ff_linear)(ii + ri_neg(ipair, ll), ist)) end do @@ -286,7 +289,7 @@ contains do ii = imin(ll) + 1, imax(ll) do ist = 1, fi%nst_linear fo%X(ff_pack)(ist, ii) = M_ZERO - do ipair = 1, npairs + do ipair = 1, op%npairs fo%X(ff_pack)(ist, ii) = fo%X(ff_pack)(ist, ii) & + wpair(ipair) * (fi%X(ff_pack)(ist, ii + ri_pos(ipair, ll)) - fi%X(ff_pack)(ist, ii + ri_neg(ipair, ll))) end do @@ -321,7 +324,7 @@ contains do ii = imin(ll) + 1, imax(ll) do ist = 1, fi%nst_linear fo%X(ff_linear)(ii, ist) = M_ZERO - do ipair = 1, npairs + do ipair = 1, op%npairs fo%X(ff_linear)(ii, ist) = fo%X(ff_linear)(ii, ist) + & wpair(ipair)*(fi%X(ff_linear)(ii + ri_pos(ipair, ll), ist) + fi%X(ff_linear)(ii + ri_neg(ipair, ll), ist)) end do @@ -338,7 +341,7 @@ contains do ii = imin(ll) + 1, imax(ll) do ist = 1, fi%nst_linear fo%X(ff_pack)(ist, ii) = M_ZERO - do ipair = 1, npairs + do ipair = 1, op%npairs fo%X(ff_pack)(ist, ii) = fo%X(ff_pack)(ist, ii) & + wpair(ipair) * (fi%X(ff_pack)(ist, ii + ri_pos(ipair, ll)) + fi%X(ff_pack)(ist, ii + ri_neg(ipair, ll))) end do -- GitLab From 19a1438d116cea98fe7ae000be4d87dcd0a9a2e5 Mon Sep 17 00:00:00 2001 From: NicolasTD Date: Sat, 13 Dec 2025 08:38:47 +0100 Subject: [PATCH 04/11] Optimization of the symmetric operator by reducing the integer operations. --- src/grid/nl_operator.F90 | 31 +- src/grid/nl_operator_inc.F90 | 2 +- src/grid/operate_antisym_inc.c | 1 + src/grid/operate_sym_inc.c | 568 +++++++++++++++++---------------- src/include/vectors.h | 6 +- 5 files changed, 320 insertions(+), 288 deletions(-) diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index 3a7c6a4fee..faeadd9a3b 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -325,6 +325,13 @@ contains SAFE_ALLOCATE_SOURCE_A(opo%outer%imin, opi%outer%imin) SAFE_ALLOCATE_SOURCE_A(opo%outer%imax, opi%outer%imax) SAFE_ALLOCATE_SOURCE_A(opo%outer%ri, opi%outer%ri) + + if (opi%symmetry /= OP_GENERAL) then + SAFE_ALLOCATE_SOURCE_A(opo%inner%ri_pos, opi%inner%ri_pos) + SAFE_ALLOCATE_SOURCE_A(opo%inner%ri_neg, opi%inner%ri_neg) + SAFE_ALLOCATE_SOURCE_A(opo%outer%ri_pos, opi%outer%ri_pos) + SAFE_ALLOCATE_SOURCE_A(opo%outer%ri_neg, opi%outer%ri_neg) + end if end if ! 
We create the corresponding GPU buffers @@ -672,6 +679,10 @@ contains SAFE_DEALLOCATE_A(op%ri_pos) SAFE_DEALLOCATE_A(op%ri_neg) + SAFE_DEALLOCATE_A(op%inner%ri_pos) + SAFE_DEALLOCATE_A(op%inner%ri_neg) + SAFE_DEALLOCATE_A(op%outer%ri_pos) + SAFE_DEALLOCATE_A(op%outer%ri_neg) call stencil_end(op%stencil) POP_SUB(nl_operator_end) @@ -972,17 +983,21 @@ subroutine nl_operator_build_symmetric_weights(op) case(OP_SYMMETRIC) call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%ri, op%nri, & op%npairs, op%wpair, op%ri_pos, op%ri_neg, op%wcenter) - call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & - op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg, op%wcenter) - call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & - op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg, op%wcenter) + if (op%mesh%parallel_in_domains) then + call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & + op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg, op%wcenter) + call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & + op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg, op%wcenter) + end if case(OP_ANTISYMMETRIC) call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%ri, op%nri, & op%npairs, op%wpair, op%ri_pos, op%ri_neg) - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & - op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg) - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & - op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg) + if (op%mesh%parallel_in_domains) then + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & + op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg) + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & + op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg) + end if end select POP_SUB(nl_operator_build_symmetric_weights) diff --git a/src/grid/nl_operator_inc.F90 b/src/grid/nl_operator_inc.F90 index e7c885e33b..ce87ec9be0 100644 --- a/src/grid/nl_operator_inc.F90 +++ b/src/grid/nl_operator_inc.F90 @@ -80,7 +80,7 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point end if if (op%symmetry /= OP_GENERAL) then - wpair = op%wpair + SAFE_ALLOCATE_SOURCE_A(wpair, op%wpair) wcenter = op%wcenter if (present(factor)) then wpair = wpair * factor diff --git a/src/grid/operate_antisym_inc.c b/src/grid/operate_antisym_inc.c index 3e14fc74a4..da1da0907d 100644 --- a/src/grid/operate_antisym_inc.c +++ b/src/grid/operate_antisym_inc.c @@ -1,5 +1,6 @@ /* Copyright (C) 2006 X. Andrade + Copyright (C) 2025 N. Tancogne-Dejean This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/src/grid/operate_sym_inc.c b/src/grid/operate_sym_inc.c index bf89e9ba67..57a22c51d4 100644 --- a/src/grid/operate_sym_inc.c +++ b/src/grid/operate_sym_inc.c @@ -1,5 +1,6 @@ /* Copyright (C) 2006 X. Andrade + Copyright (C) 2025 N. 
Tancogne-Dejean This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -52,6 +53,9 @@ const int *restrict index_pos; const int *restrict index_neg; + // Hoisting wcenter + const double w0 = wcenter[0]; + for (l = 0; l < nri; l++) { index_pos = opri_pos + npairs * l; index_neg = opri_neg + npairs * l; @@ -79,178 +83,180 @@ b8 = b9 = ba = bb = VEC_ZERO; bc = bd = be = bf = VEC_ZERO; + const ptrdiff_t base_ik = ((ptrdiff_t) i << ldf) + k; + for (j = 0; j < npairs; j++) { #ifdef VEC_SCAL_LD register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; + ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); - pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); - pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); - pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE); a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4); - pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE); a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5); - pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE); a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6); - pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE); a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7); - pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE); a8 = VEC_FMA(wj, VEC_ADD(pos, neg), a8); - pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE); a9 = VEC_FMA(wj, VEC_ADD(pos, neg), a9); - pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE); aa = VEC_FMA(wj, VEC_ADD(pos, neg), aa); - pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE + k); - neg = LOAD(fi +
indexj_neg + 11 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE); ab = VEC_FMA(wj, VEC_ADD(pos, neg), ab); - pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE); ac = VEC_FMA(wj, VEC_ADD(pos, neg), ac); - pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE); ad = VEC_FMA(wj, VEC_ADD(pos, neg), ad); - pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE); ae = VEC_FMA(wj, VEC_ADD(pos, neg), ae); - pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE); af = VEC_FMA(wj, VEC_ADD(pos, neg), af); - pos = LOAD(fi + indexj_pos + 16 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 16 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 16 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 16 * VEC_SIZE); b0 = VEC_FMA(wj, VEC_ADD(pos, neg), b0); - pos = LOAD(fi + indexj_pos + 17 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 17 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 17 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 17 * VEC_SIZE); b1 = VEC_FMA(wj, VEC_ADD(pos, neg), b1); - pos = LOAD(fi + indexj_pos + 18 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 18 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 18 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 18 * VEC_SIZE); b2 = VEC_FMA(wj, VEC_ADD(pos, neg), b2); - pos = LOAD(fi + indexj_pos + 19 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 19 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 19 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 19 * VEC_SIZE); b3 = VEC_FMA(wj, VEC_ADD(pos, neg), b3); - pos = LOAD(fi + indexj_pos + 20 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 20 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 20 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 20 * VEC_SIZE); b4 = VEC_FMA(wj, VEC_ADD(pos, neg), b4); - pos = LOAD(fi + indexj_pos + 21 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 21 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 21 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 21 * VEC_SIZE); b5 = VEC_FMA(wj, VEC_ADD(pos, neg), b5); - pos = LOAD(fi + indexj_pos + 22 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 22 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 22 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 22 * VEC_SIZE); b6 = VEC_FMA(wj, VEC_ADD(pos, neg), b6); - pos = LOAD(fi + indexj_pos + 23 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 23 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 23 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 23 * VEC_SIZE); b7 = VEC_FMA(wj, VEC_ADD(pos, neg), b7); - pos = LOAD(fi + indexj_pos + 24 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 24 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 24 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 24 * VEC_SIZE); b8 = VEC_FMA(wj, VEC_ADD(pos, neg), b8); - pos = LOAD(fi + indexj_pos + 25 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 25 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 25 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 25 * VEC_SIZE); b9 = VEC_FMA(wj, VEC_ADD(pos, neg), b9); - pos = LOAD(fi + indexj_pos + 26 * VEC_SIZE + k); - neg = LOAD(fi + 
indexj_neg + 26 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 26 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 26 * VEC_SIZE); ba = VEC_FMA(wj, VEC_ADD(pos, neg), ba); - pos = LOAD(fi + indexj_pos + 27 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 27 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 27 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 27 * VEC_SIZE); bb = VEC_FMA(wj, VEC_ADD(pos, neg), bb); - pos = LOAD(fi + indexj_pos + 28 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 28 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 28 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 28 * VEC_SIZE); bc = VEC_FMA(wj, VEC_ADD(pos, neg), bc); - pos = LOAD(fi + indexj_pos + 29 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 29 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 29 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 29 * VEC_SIZE); bd = VEC_FMA(wj, VEC_ADD(pos, neg), bd); - pos = LOAD(fi + indexj_pos + 30 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 30 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 30 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 30 * VEC_SIZE); be = VEC_FMA(wj, VEC_ADD(pos, neg), be); - pos = LOAD(fi + indexj_pos + 31 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 31 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 31 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 31 * VEC_SIZE); be = VEC_FMA(wj, VEC_ADD(pos, neg), be); } - register VEC_TYPE wj = VEC_SCAL(wcenter[0]); - a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); - a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); - a2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 2 * VEC_SIZE + k), a2); - a3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 3 * VEC_SIZE + k), a3); - a4 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 4 * VEC_SIZE + k), a4); - a5 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 5 * VEC_SIZE + k), a5); - a6 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 6 * VEC_SIZE + k), a6); - a7 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 7 * VEC_SIZE + k), a7); - a8 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 8 * VEC_SIZE + k), a8); - a9 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 9 * VEC_SIZE + k), a9); - aa = VEC_FMA(wj, LOAD(fi + (i << ldf) + 10 * VEC_SIZE + k), aa); - ab = VEC_FMA(wj, LOAD(fi + (i << ldf) + 11 * VEC_SIZE + k), ab); - ac = VEC_FMA(wj, LOAD(fi + (i << ldf) + 12 * VEC_SIZE + k), ac); - ad = VEC_FMA(wj, LOAD(fi + (i << ldf) + 13 * VEC_SIZE + k), ad); - ae = VEC_FMA(wj, LOAD(fi + (i << ldf) + 14 * VEC_SIZE + k), ae); - af = VEC_FMA(wj, LOAD(fi + (i << ldf) + 15 * VEC_SIZE + k), af); - b0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 16 * VEC_SIZE + k), b0); - b1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 17 * VEC_SIZE + k), b1); - b2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 18 * VEC_SIZE + k), b2); - b3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 19 * VEC_SIZE + k), b3); - b4 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 20 * VEC_SIZE + k), b4); - b5 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 21 * VEC_SIZE + k), b5); - b6 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 22 * VEC_SIZE + k), b6); - b7 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 23 * VEC_SIZE + k), b7); - b8 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 24 * VEC_SIZE + k), b8); - b9 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 25 * VEC_SIZE + k), b9); - ba = VEC_FMA(wj, LOAD(fi + (i << ldf) + 26 * VEC_SIZE + k), ba); - bb = VEC_FMA(wj, LOAD(fi + (i << ldf) + 27 * VEC_SIZE + k), bb); - bc = VEC_FMA(wj, LOAD(fi + (i << ldf) + 28 * VEC_SIZE + k), bc); - bd = VEC_FMA(wj, LOAD(fi + (i << ldf) + 29 * VEC_SIZE + k), bd); - be = VEC_FMA(wj, LOAD(fi + (i << ldf) + 30 * VEC_SIZE + k), be); - bf = VEC_FMA(wj, LOAD(fi + (i << ldf) + 31 * VEC_SIZE + k), bf); - - - STORE(fo + (i << ldf) + k, 
a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); - STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); - STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); - STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); - STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); - STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); - STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9); - STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); - STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); - STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); - STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); - STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); - STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); - STORE(fo + (i << ldf) + 16 * VEC_SIZE + k, b0); - STORE(fo + (i << ldf) + 17 * VEC_SIZE + k, b1); - STORE(fo + (i << ldf) + 18 * VEC_SIZE + k, b2); - STORE(fo + (i << ldf) + 19 * VEC_SIZE + k, b3); - STORE(fo + (i << ldf) + 20 * VEC_SIZE + k, b4); - STORE(fo + (i << ldf) + 21 * VEC_SIZE + k, b5); - STORE(fo + (i << ldf) + 22 * VEC_SIZE + k, b6); - STORE(fo + (i << ldf) + 23 * VEC_SIZE + k, b7); - STORE(fo + (i << ldf) + 24 * VEC_SIZE + k, b8); - STORE(fo + (i << ldf) + 25 * VEC_SIZE + k, b9); - STORE(fo + (i << ldf) + 26 * VEC_SIZE + k, ba); - STORE(fo + (i << ldf) + 27 * VEC_SIZE + k, bb); - STORE(fo + (i << ldf) + 28 * VEC_SIZE + k, bc); - STORE(fo + (i << ldf) + 29 * VEC_SIZE + k, bd); - STORE(fo + (i << ldf) + 30 * VEC_SIZE + k, be); - STORE(fo + (i << ldf) + 31 * VEC_SIZE + k, bf); + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1); + a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2); + a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3); + a4 = VEC_FMA(wj, LOAD(fi + base_ik + 4 * VEC_SIZE), a4); + a5 = VEC_FMA(wj, LOAD(fi + base_ik + 5 * VEC_SIZE), a5); + a6 = VEC_FMA(wj, LOAD(fi + base_ik + 6 * VEC_SIZE), a6); + a7 = VEC_FMA(wj, LOAD(fi + base_ik + 7 * VEC_SIZE), a7); + a8 = VEC_FMA(wj, LOAD(fi + base_ik + 8 * VEC_SIZE), a8); + a9 = VEC_FMA(wj, LOAD(fi + base_ik + 9 * VEC_SIZE), a9); + aa = VEC_FMA(wj, LOAD(fi + base_ik + 10 * VEC_SIZE), aa); + ab = VEC_FMA(wj, LOAD(fi + base_ik + 11 * VEC_SIZE), ab); + ac = VEC_FMA(wj, LOAD(fi + base_ik + 12 * VEC_SIZE), ac); + ad = VEC_FMA(wj, LOAD(fi + base_ik + 13 * VEC_SIZE), ad); + ae = VEC_FMA(wj, LOAD(fi + base_ik + 14 * VEC_SIZE), ae); + af = VEC_FMA(wj, LOAD(fi + base_ik + 15 * VEC_SIZE), af); + b0 = VEC_FMA(wj, LOAD(fi + base_ik + 16 * VEC_SIZE), b0); + b1 = VEC_FMA(wj, LOAD(fi + base_ik + 17 * VEC_SIZE), b1); + b2 = VEC_FMA(wj, LOAD(fi + base_ik + 18 * VEC_SIZE), b2); + b3 = VEC_FMA(wj, LOAD(fi + base_ik + 19 * VEC_SIZE), b3); + b4 = VEC_FMA(wj, LOAD(fi + base_ik + 20 * VEC_SIZE), b4); + b5 = VEC_FMA(wj, LOAD(fi + base_ik + 21 * VEC_SIZE), b5); + b6 = VEC_FMA(wj, LOAD(fi + base_ik + 22 * VEC_SIZE), b6); + b7 = VEC_FMA(wj, LOAD(fi + base_ik + 23 * VEC_SIZE), b7); + b8 = VEC_FMA(wj, LOAD(fi + base_ik + 24 * VEC_SIZE), b8); + b9 = VEC_FMA(wj, LOAD(fi + base_ik + 25 * VEC_SIZE), b9); + ba = VEC_FMA(wj, LOAD(fi + base_ik + 26 * VEC_SIZE), ba); + bb = VEC_FMA(wj, LOAD(fi + base_ik + 27 * VEC_SIZE), bb); + bc = VEC_FMA(wj, LOAD(fi + base_ik + 28 * VEC_SIZE), bc); + bd = VEC_FMA(wj, LOAD(fi + base_ik + 29 * VEC_SIZE), bd); + be = VEC_FMA(wj, LOAD(fi + base_ik + 30 * VEC_SIZE), be); + bf = VEC_FMA(wj, LOAD(fi + base_ik + 31 * VEC_SIZE), bf); + + + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + 
STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); + STORE(fo + base_ik + 4 * VEC_SIZE, a4); + STORE(fo + base_ik + 5 * VEC_SIZE, a5); + STORE(fo + base_ik + 6 * VEC_SIZE, a6); + STORE(fo + base_ik + 7 * VEC_SIZE, a7); + STORE(fo + base_ik + 8 * VEC_SIZE, a8); + STORE(fo + base_ik + 9 * VEC_SIZE, a9); + STORE(fo + base_ik + 10 * VEC_SIZE, aa); + STORE(fo + base_ik + 11 * VEC_SIZE, ab); + STORE(fo + base_ik + 12 * VEC_SIZE, ac); + STORE(fo + base_ik + 13 * VEC_SIZE, ad); + STORE(fo + base_ik + 14 * VEC_SIZE, ae); + STORE(fo + base_ik + 15 * VEC_SIZE, af); + STORE(fo + base_ik + 16 * VEC_SIZE, b0); + STORE(fo + base_ik + 17 * VEC_SIZE, b1); + STORE(fo + base_ik + 18 * VEC_SIZE, b2); + STORE(fo + base_ik + 19 * VEC_SIZE, b3); + STORE(fo + base_ik + 20 * VEC_SIZE, b4); + STORE(fo + base_ik + 21 * VEC_SIZE, b5); + STORE(fo + base_ik + 22 * VEC_SIZE, b6); + STORE(fo + base_ik + 23 * VEC_SIZE, b7); + STORE(fo + base_ik + 24 * VEC_SIZE, b8); + STORE(fo + base_ik + 25 * VEC_SIZE, b9); + STORE(fo + base_ik + 26 * VEC_SIZE, ba); + STORE(fo + base_ik + 27 * VEC_SIZE, bb); + STORE(fo + base_ik + 28 * VEC_SIZE, bc); + STORE(fo + base_ik + 29 * VEC_SIZE, bd); + STORE(fo + base_ik + 30 * VEC_SIZE, be); + STORE(fo + base_ik + 31 * VEC_SIZE, bf); } } #endif @@ -269,98 +275,100 @@ a8 = a9 = aa = ab = VEC_ZERO; ac = ad = ae = af = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { #ifdef VEC_SCAL_LD register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; + ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); - pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); - pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); - pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE); a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4); - pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE); a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5); - pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE); a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6); - pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + pos 
= LOAD(fi + indexj_pos + 7 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE); a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7); - pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE); a8 = VEC_FMA(wj, VEC_ADD(pos, neg), a8); - pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE); a9 = VEC_FMA(wj, VEC_ADD(pos, neg), a9); - pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE); aa = VEC_FMA(wj, VEC_ADD(pos, neg), aa); - pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE); ab = VEC_FMA(wj, VEC_ADD(pos, neg), ab); - pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE); ac = VEC_FMA(wj, VEC_ADD(pos, neg), ac); - pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE); ad = VEC_FMA(wj, VEC_ADD(pos, neg), ad); - pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE); ae = VEC_FMA(wj, VEC_ADD(pos, neg), ae); - pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE); af = VEC_FMA(wj, VEC_ADD(pos, neg), af); } - register VEC_TYPE wj = VEC_SCAL(wcenter[0]); - a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); - a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); - a2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 2 * VEC_SIZE + k), a2); - a3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 3 * VEC_SIZE + k), a3); - a4 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 4 * VEC_SIZE + k), a4); - a5 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 5 * VEC_SIZE + k), a5); - a6 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 6 * VEC_SIZE + k), a6); - a7 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 7 * VEC_SIZE + k), a7); - a8 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 8 * VEC_SIZE + k), a8); - a9 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 9 * VEC_SIZE + k), a9); - aa = VEC_FMA(wj, LOAD(fi + (i << ldf) + 10 * VEC_SIZE + k), aa); - ab = VEC_FMA(wj, LOAD(fi + (i << ldf) + 11 * VEC_SIZE + k), ab); - ac = VEC_FMA(wj, LOAD(fi + (i << ldf) + 12 * VEC_SIZE + k), ac); - ad = VEC_FMA(wj, LOAD(fi + (i << ldf) + 13 * VEC_SIZE + k), ad); - ae = VEC_FMA(wj, LOAD(fi + (i << ldf) + 14 * VEC_SIZE + k), ae); - af = VEC_FMA(wj, LOAD(fi + (i << ldf) + 15 * VEC_SIZE + k), af); - - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); - STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); - STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); - STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); - STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); - STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); - STORE(fo + (i 
<< ldf) + 9 * VEC_SIZE + k, a9); - STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); - STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); - STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); - STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); - STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); - STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1); + a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2); + a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3); + a4 = VEC_FMA(wj, LOAD(fi + base_ik + 4 * VEC_SIZE), a4); + a5 = VEC_FMA(wj, LOAD(fi + base_ik + 5 * VEC_SIZE), a5); + a6 = VEC_FMA(wj, LOAD(fi + base_ik + 6 * VEC_SIZE), a6); + a7 = VEC_FMA(wj, LOAD(fi + base_ik + 7 * VEC_SIZE), a7); + a8 = VEC_FMA(wj, LOAD(fi + base_ik + 8 * VEC_SIZE), a8); + a9 = VEC_FMA(wj, LOAD(fi + base_ik + 9 * VEC_SIZE), a9); + aa = VEC_FMA(wj, LOAD(fi + base_ik + 10 * VEC_SIZE), aa); + ab = VEC_FMA(wj, LOAD(fi + base_ik + 11 * VEC_SIZE), ab); + ac = VEC_FMA(wj, LOAD(fi + base_ik + 12 * VEC_SIZE), ac); + ad = VEC_FMA(wj, LOAD(fi + base_ik + 13 * VEC_SIZE), ad); + ae = VEC_FMA(wj, LOAD(fi + base_ik + 14 * VEC_SIZE), ae); + af = VEC_FMA(wj, LOAD(fi + base_ik + 15 * VEC_SIZE), af); + + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); + STORE(fo + base_ik + 4 * VEC_SIZE, a4); + STORE(fo + base_ik + 5 * VEC_SIZE, a5); + STORE(fo + base_ik + 6 * VEC_SIZE, a6); + STORE(fo + base_ik + 7 * VEC_SIZE, a7); + STORE(fo + base_ik + 8 * VEC_SIZE, a8); + STORE(fo + base_ik + 9 * VEC_SIZE, a9); + STORE(fo + base_ik + 10 * VEC_SIZE, aa); + STORE(fo + base_ik + 11 * VEC_SIZE, ab); + STORE(fo + base_ik + 12 * VEC_SIZE, ac); + STORE(fo + base_ik + 13 * VEC_SIZE, ad); + STORE(fo + base_ik + 14 * VEC_SIZE, ae); + STORE(fo + base_ik + 15 * VEC_SIZE, af); } } #endif @@ -375,56 +383,58 @@ a0 = a1 = a2 = a3 = VEC_ZERO; a4 = a5 = a6 = a7 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { #ifdef VEC_SCAL_LD register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; + ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); - pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); - pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); - pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + pos = LOAD(fi + 
indexj_pos + 4 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE); a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4); - pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE); a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5); - pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE); a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6); - pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE); a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7); } - register VEC_TYPE wj = VEC_SCAL(wcenter[0]); - a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); - a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); - a2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 2 * VEC_SIZE + k), a2); - a3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 3 * VEC_SIZE + k), a3); - a4 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 4 * VEC_SIZE + k), a4); - a5 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 5 * VEC_SIZE + k), a5); - a6 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 6 * VEC_SIZE + k), a6); - a7 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 7 * VEC_SIZE + k), a7); - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); - STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); - STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); - STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); - STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1); + a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2); + a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3); + a4 = VEC_FMA(wj, LOAD(fi + base_ik + 4 * VEC_SIZE), a4); + a5 = VEC_FMA(wj, LOAD(fi + base_ik + 5 * VEC_SIZE), a5); + a6 = VEC_FMA(wj, LOAD(fi + base_ik + 6 * VEC_SIZE), a6); + a7 = VEC_FMA(wj, LOAD(fi + base_ik + 7 * VEC_SIZE), a7); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); + STORE(fo + base_ik + 4 * VEC_SIZE, a4); + STORE(fo + base_ik + 5 * VEC_SIZE, a5); + STORE(fo + base_ik + 6 * VEC_SIZE, a6); + STORE(fo + base_ik + 7 * VEC_SIZE, a7); } } #endif @@ -437,36 +447,38 @@ a0 = a1 = a2 = a3 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { #ifdef VEC_SCAL_LD register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; + ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); - pos = LOAD(fi 
+ indexj_pos + 2 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); - pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); } - register VEC_TYPE wj = VEC_SCAL(wcenter[0]); - a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); - a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); - a2 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 2 * VEC_SIZE + k), a2); - a3 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 3 * VEC_SIZE + k), a3); - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1); + a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2); + a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); } } #endif @@ -479,26 +491,28 @@ a0 = a1 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { #ifdef VEC_SCAL_LD register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; + ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); } - register VEC_TYPE wj = VEC_SCAL(wcenter[0]); - a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); - a1 = VEC_FMA(wj, LOAD(fi + (i << ldf) + 1 * VEC_SIZE + k), a1); - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); } } #endif @@ -508,21 +522,22 @@ ptrdiff_t k; for (k = 0; k < (1 << ldf); k += VEC_SIZE) { register VEC_TYPE a0 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; for (j = 0; j < npairs; j++) { #ifdef VEC_SCAL_LD register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; + ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), 
a0); } - register VEC_TYPE wj = VEC_SCAL(wcenter[0]); - a0 = VEC_FMA(wj, LOAD(fi + (i << ldf) + k), a0); - STORE(fo + (i << ldf) + k, a0); + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + STORE(fo + base_ik, a0); } } #endif @@ -532,10 +547,11 @@ ptrdiff_t k; for (k = 0; k < (1 << ldf); k++) { double a = 0.0; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; for (j = 0; j < npairs; j++) - a += wpair[j] * ( fi[((index_pos[j] + i) << ldf) + k] + fi[((index_neg[j] + i) << ldf) + k]); - a += wcenter[0] * fi[(i << ldf) + k]; - fo[(i << ldf) + k] = a; + a += wpair[j] * ( fi[(index_pos[j] << ldf) + base_ik] + fi[(index_neg[j] << ldf) + base_ik]); + a += w0 * fi[base_ik]; + fo[base_ik] = a; } } #endif -- GitLab From: NicolasTD Date: Mon, 15 Dec 2025 20:33:22 +0100 Subject: [PATCH 05/11] Precompute the offset multiplications for the stencil shifts. The arrays are reallocated if needed. --- src/grid/derivatives_test.F90 | 218 ++++++++++----------- src/grid/nl_operator.F90 | 309 +++++++++++++++++------------- src/grid/nl_operator_inc.F90 | 35 +++- src/grid/operate_sym_inc.c | 44 +++-- src/utils/dielectric_function.F90 | 16 +- 5 files changed, 345 insertions(+), 277 deletions(-) diff --git a/src/grid/derivatives_test.F90 b/src/grid/derivatives_test.F90 index c9231922dc..c17406ce34 100644 --- a/src/grid/derivatives_test.F90 +++ b/src/grid/derivatives_test.F90 @@ -51,156 +51,156 @@ module derivatives_test_oct_m contains !>@brief Further unit tests designed to challenge the numerical stability of the finite differences -subroutine derivatives_advanced_benchmark(this, namespace) - type(derivatives_t), intent(in) :: this - type(namespace_t), intent(in) :: namespace + subroutine derivatives_advanced_benchmark(this, namespace) + type(derivatives_t), intent(in) :: this + type(namespace_t), intent(in) :: namespace - real(real64), allocatable :: ff(:), grad(:,:), resgrad(:, :), lapl(:), reslapl(:) - real(real64) :: kvec(3) + real(real64), allocatable :: ff(:), grad(:,:), resgrad(:, :), lapl(:), reslapl(:) + real(real64) :: kvec(3) - SAFE_ALLOCATE(ff(1:this%mesh%np_part)) - SAFE_ALLOCATE(grad(1:this%mesh%np, 1:this%mesh%box%dim)) - SAFE_ALLOCATE(lapl(1:this%mesh%np)) - SAFE_ALLOCATE(reslapl(1:this%mesh%np)) - SAFE_ALLOCATE(resgrad(1:this%mesh%np, 1:this%mesh%box%dim)) + SAFE_ALLOCATE(ff(1:this%mesh%np_part)) + SAFE_ALLOCATE(grad(1:this%mesh%np, 1:this%mesh%box%dim)) + SAFE_ALLOCATE(lapl(1:this%mesh%np)) + SAFE_ALLOCATE(reslapl(1:this%mesh%np)) + SAFE_ALLOCATE(resgrad(1:this%mesh%np, 1:this%mesh%box%dim)) - ! Testing on some planewaves - kvec = [0.5_real64, 0.0_real64, 0.0_real64] ! low frequency - call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) - call test_function("Testing Planewave k=(0.5,0.0,0.0)") + ! Testing on some planewaves + kvec = [0.5_real64, 0.0_real64, 0.0_real64] ! low frequency + call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) + call test_function("Testing Planewave k=(0.5,0.0,0.0)") - kvec = [1.0_real64, 1.0_real64, 0.0_real64] ! diagonal - call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) - call test_function("Testing Planewave k=(1.0,1.0,0.0)") + kvec = [1.0_real64, 1.0_real64, 0.0_real64] ! diagonal + call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) + call test_function("Testing Planewave k=(1.0,1.0,0.0)") - kvec = [M_PI/this%mesh%spacing(1), 0.0_real64, 0.0_real64] ! Nyquist - call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) - call test_function("Testing Planewave k=(\pi/dr,0.0,0.0)") + kvec = [M_PI/this%mesh%spacing(1), 0.0_real64, 0.0_real64] ! 
Nyquist + call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) + call test_function("Testing Planewave k=(\pi/dr,0.0,0.0)") - kvec = [M_PI/this%mesh%spacing(1), M_PI/this%mesh%spacing(2), 0.0_real64] ! diagonal Nyquist - call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) - call test_function("Testing Planewave k=(\pi/dr,\pi/dr,0.0)") + kvec = [M_PI/this%mesh%spacing(1), M_PI/this%mesh%spacing(2), 0.0_real64] ! diagonal Nyquist + call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) + call test_function("Testing Planewave k=(\pi/dr,\pi/dr,0.0)") - ! Testing on some polynomials - call testfunc_polynomial(this%mesh, ff, grad, lapl, p=2) - call test_function("Testing degree-2 polynomial") + ! Testing on some polynomials + call testfunc_polynomial(this%mesh, ff, grad, lapl, p=2) + call test_function("Testing degree-2 polynomial") - call testfunc_polynomial(this%mesh, ff, grad, lapl, p=3) - call test_function("Testing degree-3 polynomial") + call testfunc_polynomial(this%mesh, ff, grad, lapl, p=3) + call test_function("Testing degree-3 polynomial") - call testfunc_polynomial(this%mesh, ff, grad, lapl, p=4) - call test_function("Testing degree-4 polynomial") + call testfunc_polynomial(this%mesh, ff, grad, lapl, p=4) + call test_function("Testing degree-4 polynomial") - call testfunc_polynomial(this%mesh, ff, grad, lapl, p=6) - call test_function("Testing degree-6 polynomial") + call testfunc_polynomial(this%mesh, ff, grad, lapl, p=6) + call test_function("Testing degree-6 polynomial") - call testfunc_polynomial(this%mesh, ff, grad, lapl, p=8) - call test_function("Testing degree-8 polynomial") + call testfunc_polynomial(this%mesh, ff, grad, lapl, p=8) + call test_function("Testing degree-8 polynomial") - SAFE_DEALLOCATE_A(ff) - SAFE_DEALLOCATE_A(grad) - SAFE_DEALLOCATE_A(lapl) - SAFE_DEALLOCATE_A(resgrad) - SAFE_DEALLOCATE_A(reslapl) + SAFE_DEALLOCATE_A(ff) + SAFE_DEALLOCATE_A(grad) + SAFE_DEALLOCATE_A(lapl) + SAFE_DEALLOCATE_A(resgrad) + SAFE_DEALLOCATE_A(reslapl) -contains - subroutine test_function(label) - character(len=*), intent(in) :: label + contains + subroutine test_function(label) + character(len=*), intent(in) :: label - message(1) = trim(label) + message(1) = trim(label) - call dderivatives_grad(this, ff, resgrad, set_bc=.false.) - resgrad = resgrad - grad - write(message(2), '(a, es17.10)') 'Gradient err = ', dmf_nrm2(this%mesh, this%mesh%box%dim, resgrad) + call dderivatives_grad(this, ff, resgrad, set_bc=.false.) + resgrad = resgrad - grad + write(message(2), '(a, es17.10)') 'Gradient err = ', dmf_nrm2(this%mesh, this%mesh%box%dim, resgrad) - call dderivatives_lapl(this, ff, reslapl, set_bc=.false.) - reslapl = reslapl - lapl - write(message(3), '(a, es17.10)') 'Laplacian err = ', dmf_nrm2(this%mesh, reslapl) + call dderivatives_lapl(this, ff, reslapl, set_bc=.false.) 
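The check that test_function performs can be reproduced in isolation: apply a finite-difference operator to a function whose derivatives are known in closed form and report the size of the residual. The following self-contained C sketch does this for a fourth-order Laplacian acting on a plane wave on a 1-D periodic grid; the grid size, stencil order and wavevector are illustrative assumptions, not values taken from the suite above, which runs on the full Octopus mesh.

/* Toy version of the Laplacian accuracy check: compare the
 * finite-difference Laplacian of cos(k*x) with the analytic
 * result -k^2*cos(k*x) on a periodic 1-D grid and report the
 * worst-case error over all grid points. */
#include <math.h>
#include <stdio.h>

#define N 256

int main(void) {
  const double PI = 3.14159265358979323846;
  const double L = 10.0, h = L / N;
  /* 4th-order central Laplacian weights for offsets -2..+2 */
  const double w[5] = {-1.0/12, 4.0/3, -5.0/2, 4.0/3, -1.0/12};
  const double k = 2.0 * PI * 3.0 / L;  /* low-frequency mode */
  double f[N], err = 0.0;

  for (int i = 0; i < N; i++) f[i] = cos(k * i * h);

  for (int i = 0; i < N; i++) {
    double acc = 0.0;
    for (int s = -2; s <= 2; s++)
      acc += w[s + 2] * f[(i + s + N) % N];        /* periodic wrap */
    double e = fabs(acc / (h * h) + k * k * f[i]); /* exact: -k^2 f */
    if (e > err) err = e;
  }
  printf("max |FD Laplacian - exact| = %e\n", err);
  return 0;
}

Pushing k towards pi/h, as the Nyquist cases above do, makes the residual large and is what exposes differences between summation orders at the level of the reported norms.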
+ reslapl = reslapl - lapl + write(message(3), '(a, es17.10)') 'Laplacian err = ', dmf_nrm2(this%mesh, reslapl) - call messages_info(3, namespace=namespace) - end subroutine test_function -end subroutine derivatives_advanced_benchmark + call messages_info(3, namespace=namespace) + end subroutine test_function + end subroutine derivatives_advanced_benchmark !>@brief Generate a plane-wave given its momentum, as well as the gradient and the Laplacian -subroutine testfunc_plane_wave(mesh, ff, grad, lapl, kvec) - type(mesh_t), intent(in) :: mesh - real(real64), intent(out) :: ff(:), grad(:, :), lapl(:) - real(real64), intent(in) :: kvec(3) + subroutine testfunc_plane_wave(mesh, ff, grad, lapl, kvec) + type(mesh_t), intent(in) :: mesh + real(real64), intent(out) :: ff(:), grad(:, :), lapl(:) + real(real64), intent(in) :: kvec(3) - integer :: ip, dim, idir - real(real64) :: phase, kdotx, kk + integer :: ip, dim, idir + real(real64) :: phase, kdotx, kk - dim = mesh%box%dim + dim = mesh%box%dim - kk = dot_product(kvec(1:dim), kvec(1:dim)) - do ip = 1, mesh%np - kdotx = dot_product(kvec(1:dim), mesh%x(ip, 1:dim)) - phase = cos(kdotx) + kk = dot_product(kvec(1:dim), kvec(1:dim)) + do ip = 1, mesh%np + kdotx = dot_product(kvec(1:dim), mesh%x(ip, 1:dim)) + phase = cos(kdotx) - ff(ip) = phase + ff(ip) = phase - ! exact gradient - do idir = 1, mesh%box%dim + ! exact gradient + do idir = 1, mesh%box%dim grad(ip, idir) = -kvec(idir)*sin(kdotx) - end do + end do - ! exact Laplacian - lapl(ip) = -kk * phase - end do + ! exact Laplacian + lapl(ip) = -kk * phase + end do - do ip = mesh%np+1, mesh%np_part - kdotx = dot_product(kvec(1:dim), mesh%x(ip, 1:dim)) - phase = cos(kdotx) + do ip = mesh%np+1, mesh%np_part + kdotx = dot_product(kvec(1:dim), mesh%x(ip, 1:dim)) + phase = cos(kdotx) - ff(ip) = phase - end do -end subroutine testfunc_plane_wave + ff(ip) = phase + end do + end subroutine testfunc_plane_wave !>@brief Generate a polynomial of degree p, as well as the gradient and the Laplacian -subroutine testfunc_polynomial(mesh, ff, grad, lapl, p) - type(mesh_t), intent(in) :: mesh - real(real64), intent(out) :: ff(:), grad(:, :), lapl(:) - integer, intent(in) :: p - integer :: ip, d, dim - real(real64) :: x(3), r2 + subroutine testfunc_polynomial(mesh, ff, grad, lapl, p) + type(mesh_t), intent(in) :: mesh + real(real64), intent(out) :: ff(:), grad(:, :), lapl(:) + integer, intent(in) :: p + integer :: ip, d, dim + real(real64) :: x(3), r2 - dim = mesh%box%dim + dim = mesh%box%dim - do ip = 1, mesh%np - x = 0.0_real64 - x(1:dim) = mesh%x(ip, 1:dim) + do ip = 1, mesh%np + x = 0.0_real64 + x(1:dim) = mesh%x(ip, 1:dim) - ! f = (x^2 + y^2 + z^2)^(p/2) - r2 = dot_product(x(1:dim), x(1:dim)) - ff(ip) = r2**(0.5d0*p) + ! f = (x^2 + y^2 + z^2)^(p/2) + r2 = dot_product(x(1:dim), x(1:dim)) + ff(ip) = r2**(0.5d0*p) - ! gradient: grad(f) = p * r^(p-2) * x - if (p >= 2) then + ! gradient: grad(f) = p * r^(p-2) * x + if (p >= 2) then do d = 1, dim - grad(ip,d) = p * r2**(0.5d0*p - 1.0d0) * x(d) + grad(ip,d) = p * r2**(0.5d0*p - 1.0d0) * x(d) end do - else + else grad(ip,:) = 0.0 - end if + end if - ! Laplacian: ∇² r^n = n (n + d - 2) r^(n-2) - if (p >= 2) then + ! Laplacian: ∇² r^n = n (n + d - 2) r^(n-2) + if (p >= 2) then lapl(ip) = p * (p + dim - 2) * r2**(0.5d0*p - 1.0d0) - else + else lapl(ip) = 0.0 - end if - end do - - do ip = mesh%np+1, mesh%np_part - x = 0.0_real64 - x(1:dim) = mesh%x(ip,1:dim) - - ! 
f = (x^2 + y^2 + z^2)^(p/2) - r2 = dot_product(x(1:dim), x(1:dim)) - ff(ip) = r2**(0.5d0*p) - end do -end subroutine testfunc_polynomial + end if + end do + + do ip = mesh%np+1, mesh%np_part + x = 0.0_real64 + x(1:dim) = mesh%x(ip,1:dim) + + ! f = (x^2 + y^2 + z^2)^(p/2) + r2 = dot_product(x(1:dim), x(1:dim)) + ff(ip) = r2**(0.5d0*p) + end do + end subroutine testfunc_polynomial #include "undef.F90" diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index faeadd9a3b..5ccdb727e5 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -80,8 +80,8 @@ module nl_operator_oct_m integer, allocatable :: imin(:) integer, allocatable :: imax(:) integer, allocatable :: ri(:, :) - integer, allocatable :: ri_pos(:,:) - integer, allocatable :: ri_neg(:,:) + integer, allocatable :: ri_pos(:,:,:) + integer, allocatable :: ri_neg(:,:,:) end type nl_operator_index_t integer, public, parameter :: & @@ -116,10 +116,11 @@ module nl_operator_oct_m !> For symmetry-based stencils integer :: npairs = 0 - real(real64),allocatable :: wpair(:) + real(real64),allocatable :: wpair(:) real(real64) :: wcenter - integer, allocatable :: ri_pos(:,:) - integer, allocatable :: ri_neg(:,:) + integer, allocatable :: ri_pos(:,:,:) + integer, allocatable :: ri_neg(:,:,:) + integer :: max_allocated_ri_pair = 0 integer :: ninner = 0 integer :: nouter = 0 @@ -818,57 +819,53 @@ contains POP_SUB(nl_operator_remove_zero_weight_points) end subroutine nl_operator_remove_zero_weight_points -!> Take a list of weights and offsets and build pairs of symmetric points with common weights -subroutine group_by_pairs_sym(size, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg, wcenter) - integer, intent(in) :: size - integer, intent(in) :: offsets(:, :) - real(real64), intent(in) :: wre(:) - integer, intent(in) :: ri(:, :) - integer, intent(in) :: nri - integer, intent(out) :: npairs - real(real64), allocatable, intent(out) :: wpair(:) - integer, allocatable, intent(out) :: pair_pos(:,:), pair_neg(:,:) - real(real64), intent(out) :: wcenter - - logical, allocatable :: used(:) - integer :: i, j, d, ndim, s - logical :: same - real(real64), allocatable :: abs_wre(:) - integer, allocatable :: idx(:) - - real(real64), parameter :: tol = 1.0e-12_real64 - - PUSH_SUB(group_by_pairs_sym) - - ASSERT(mod(size,2) == 1) - - SAFE_ALLOCATE(used(1:size)) - used = .false. - npairs = 0 - - ndim = ubound(offsets, dim=1) - - SAFE_ALLOCATE(abs_wre(1:size)) - SAFE_ALLOCATE(idx(1:size)) - abs_wre = abs(wre) - call sort(abs_wre, idx) - - ! 
Max pairs = n/2 - SAFE_ALLOCATE(pair_pos(1:size/2, 1:nri)) - SAFE_ALLOCATE(pair_neg(1:size/2, 1:nri)) - SAFE_ALLOCATE(wpair(1:size/2)) - - do i = 1, size - if (used(i)) cycle - - if (all(offsets(:, idx(i))==0)) then +!> Take a list of weights and offsets and build pairs of symmetric points with common weights + subroutine group_by_pairs_sym(size, ldf, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg, wcenter) + integer, intent(in) :: size + integer, intent(in) :: ldf + integer, intent(in) :: offsets(:, :) + real(real64), intent(in) :: wre(:) + integer, intent(in) :: ri(:, :) + integer, intent(in) :: nri + integer, intent(out) :: npairs + real(real64), intent(inout) :: wpair(:) + integer, intent(inout) :: pair_pos(:,:), pair_neg(:,:) + real(real64), intent(out) :: wcenter + + logical, allocatable :: used(:) + integer :: i, j, ndim, s + logical :: same + real(real64), allocatable :: abs_wre(:) + integer, allocatable :: idx(:) + + real(real64), parameter :: tol = 1.0e-12_real64 + + PUSH_SUB(group_by_pairs_sym) + + ASSERT(mod(size,2) == 1) + + SAFE_ALLOCATE(used(1:size)) + used = .false. + npairs = 0 + + ndim = ubound(offsets, dim=1) + + SAFE_ALLOCATE(abs_wre(1:size)) + SAFE_ALLOCATE(idx(1:size)) + abs_wre = abs(wre) + call sort(abs_wre, idx) + + do i = 1, size + if (used(i)) cycle + + if (all(offsets(:, idx(i))==0)) then wcenter = wre(idx(i)) used(i) = .true. cycle - end if + end if - ! Try to find symmetric partner j - do j = i+1, size + ! Try to find symmetric partner j + do j = i+1, size if (used(j)) cycle ! Weight equality @@ -880,69 +877,69 @@ subroutine group_by_pairs_sym(size, offsets, wre, ri, nri, npairs, wpair, pair_p npairs = npairs + 1 do s = 1, nri - pair_pos(npairs, s) = ri(idx(i), s) - pair_neg(npairs, s) = ri(idx(j), s) + pair_pos(npairs, s) = ishft(ri(idx(i), s), ldf) + pair_neg(npairs, s) = ishft(ri(idx(j), s), ldf) end do wpair(npairs) = M_HALF*(wre(idx(i)) + wre(idx(j))) used(i) = .true. used(j) = .true. 
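The pairing built here can be illustrated outside of Octopus. A minimal C sketch, assuming a 1-D five-point stencil: sort the points by increasing |weight|, pair each offset with its mirror image whenever the weights agree within a tolerance, and keep the unpaired origin as the center weight. Applying the operator as w0*f(i) + sum_p wpair_p*(f(i+o_p) + f(i-o_p)) then accumulates the small high-order coefficients before the dominant ones, which is the point of the sort.

/* Sketch of group_by_pairs_sym for a 1-D symmetric stencil:
 * sort by |weight|, pair mirror offsets with equal weights,
 * and emit the pair weights plus the center weight. */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct { int off; double w; } point_t;

static int by_abs_w(const void *a, const void *b) {
  double wa = fabs(((const point_t *)a)->w);
  double wb = fabs(((const point_t *)b)->w);
  return (wa > wb) - (wa < wb);
}

int main(void) {
  /* unscaled 4th-order 1-D Laplacian stencil */
  point_t pts[5] = {{-2, -1.0/12}, {-1, 4.0/3}, {0, -5.0/2},
                    {1, 4.0/3}, {2, -1.0/12}};
  int used[5] = {0}, pair_off[2], npairs = 0;
  double wpair[2], w0 = 0.0;

  qsort(pts, 5, sizeof(point_t), by_abs_w);  /* smallest |w| first */

  for (int i = 0; i < 5; i++) {
    if (used[i]) continue;
    if (pts[i].off == 0) { w0 = pts[i].w; used[i] = 1; continue; }
    for (int j = i + 1; j < 5; j++) {        /* find mirror partner */
      if (!used[j] && pts[j].off == -pts[i].off &&
          fabs(pts[j].w - pts[i].w) < 1e-12) {
        pair_off[npairs] = abs(pts[i].off);
        wpair[npairs] = 0.5 * (pts[i].w + pts[j].w);
        npairs++;
        used[i] = used[j] = 1;
        break;
      }
    }
  }

  for (int p = 0; p < npairs; p++)
    printf("pair offset %d, weight %g\n", pair_off[p], wpair[p]);
  printf("center weight %g\n", w0);
  return 0;
}

Besides fixing the summation order, pairing halves the number of index computations per grid point, which is what the vectorized kernels in operate_sym_inc.c exploit through the pos/neg loads.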
exit - end do - end do + end do + end do + + ASSERT(npairs == size/2) - ASSERT(npairs == size/2) + SAFE_DEALLOCATE_A(abs_wre) + SAFE_DEALLOCATE_A(idx) - SAFE_DEALLOCATE_A(abs_wre) - SAFE_DEALLOCATE_A(idx) + POP_SUB(group_by_pairs_sym) + end subroutine group_by_pairs_sym - POP_SUB(group_by_pairs_sym) -end subroutine group_by_pairs_sym +!> Take a list of weights and offsets and build pairs of symmetric points with common weights + subroutine group_by_pairs_antisym(size, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg) + integer, intent(in) :: size + integer, intent(in) :: offsets(:, :) + real(real64), intent(in) :: wre(:) + integer, intent(in) :: ri(:, :) + integer, intent(in) :: nri + integer, intent(out) :: npairs + real(real64), allocatable, intent(out) :: wpair(:) + integer, allocatable, intent(out) :: pair_pos(:,:,:), pair_neg(:,:,:) -!> Take a list of weights and offsets and build pairs of symmetric points with common weights -subroutine group_by_pairs_antisym(size, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg) - integer, intent(in) :: size - integer, intent(in) :: offsets(:, :) - real(real64), intent(in) :: wre(:) - integer, intent(in) :: ri(:, :) - integer, intent(in) :: nri - integer, intent(out) :: npairs - real(real64), allocatable, intent(out) :: wpair(:) - integer, allocatable, intent(out) :: pair_pos(:,:), pair_neg(:,:) + logical, allocatable :: used(:) + integer :: i, j, ndim, s + logical :: same + real(real64), allocatable :: abs_wre(:) + integer, allocatable :: idx(:) - logical, allocatable :: used(:) - integer :: i, j, d, ndim, s - logical :: same - real(real64), allocatable :: abs_wre(:) - integer, allocatable :: idx(:) + real(real64), parameter :: tol = 1.0e-12_real64 - real(real64), parameter :: tol = 1.0e-12_real64 + PUSH_SUB(group_by_pairs_antisym) - PUSH_SUB(group_by_pairs_antisym) + ASSERT(mod(size,2) == 0) - ASSERT(mod(size,2) == 0) + SAFE_ALLOCATE(used(1:size)) + used = .false. + npairs = 0 - SAFE_ALLOCATE(used(1:size)) - used = .false. - npairs = 0 + ndim = ubound(offsets, dim=1) - ndim = ubound(offsets, dim=1) + SAFE_ALLOCATE(abs_wre(1:size)) + SAFE_ALLOCATE(idx(1:size)) + abs_wre = abs(wre) + call sort(abs_wre, idx) - SAFE_ALLOCATE(abs_wre(1:size)) - SAFE_ALLOCATE(idx(1:size)) - abs_wre = abs(wre) - call sort(abs_wre, idx) - - ! Max pairs = n/2 - SAFE_ALLOCATE(pair_pos(1:size/2, 1:nri)) - SAFE_ALLOCATE(pair_neg(1:size/2, 1:nri)) - SAFE_ALLOCATE(wpair(1:size/2)) + ! Max pairs = n/2 + SAFE_ALLOCATE(pair_pos(1:size/2, 1:nri, 1:1)) + SAFE_ALLOCATE(pair_neg(1:size/2, 1:nri, 1:1)) + SAFE_ALLOCATE(wpair(1:size/2)) - do i = 1, size - if (used(i)) cycle + do i = 1, size + if (used(i)) cycle - ! Try to find symmetric partner j - do j = i+1, size + ! Try to find symmetric partner j + do j = i+1, size if (used(j)) cycle ! Weight equality @@ -954,54 +951,104 @@ subroutine group_by_pairs_antisym(size, offsets, wre, ri, nri, npairs, wpair, pa npairs = npairs + 1 do s = 1, nri - pair_pos(npairs, s) = ri(idx(i), s) - pair_neg(npairs, s) = ri(idx(j), s) + pair_pos(npairs, s, 1) = ri(idx(i), s) + pair_neg(npairs, s, 1) = ri(idx(j), s) end do wpair(npairs) = M_HALF*(wre(idx(i)) - wre(idx(j))) used(i) = .true. used(j) = .true. 
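In the antisymmetric case handled here, mirrored points carry weights of equal magnitude and opposite sign, so each pair contributes wpair_p*(f(i+o_p) - f(i-o_p)) with wpair_p = (w(+o) - w(-o))/2 and there is no center term. The following C sketch applies a fourth-order first derivative in this paired form on a 1-D periodic grid and checks it against the analytic derivative; the grid, order and wavevector are assumptions made for illustration.

/* Paired application of an antisymmetric (gradient) stencil:
 * d/dx sin(k*x) computed as sum_p wpair[p]*(f[i+o] - f[i-o])
 * and compared with k*cos(k*x). */
#include <math.h>
#include <stdio.h>

#define N 200

int main(void) {
  const double PI = 3.14159265358979323846;
  const double L = 5.0, h = L / N, k = 2.0 * PI * 4.0 / L;
  /* 4th-order d/dx: offsets +1, +2; wpair = (w(+o) - w(-o))/2 = w(+o) */
  const int off[2] = {1, 2};
  const double wpair[2] = {2.0/3.0, -1.0/12.0};
  double f[N], err = 0.0;

  for (int i = 0; i < N; i++) f[i] = sin(k * i * h);

  for (int i = 0; i < N; i++) {
    double acc = 0.0;
    for (int p = 0; p < 2; p++) {
      double pos = f[(i + off[p]) % N];        /* f(i+o), periodic */
      double neg = f[(i - off[p] + N) % N];    /* f(i-o), periodic */
      acc += wpair[p] * (pos - neg);           /* antisymmetric pair */
    }
    double e = fabs(acc / h - k * cos(k * i * h));
    if (e > err) err = e;
  }
  printf("max |FD gradient - exact| = %e\n", err);
  return 0;
}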
exit - end do - end do + end do + end do - ASSERT(npairs == size/2) + ASSERT(npairs == size/2) - SAFE_DEALLOCATE_A(abs_wre) - SAFE_DEALLOCATE_A(idx) - POP_SUB(group_by_pairs_antisym) -end subroutine group_by_pairs_antisym + SAFE_DEALLOCATE_A(abs_wre) + SAFE_DEALLOCATE_A(idx) + POP_SUB(group_by_pairs_antisym) + end subroutine group_by_pairs_antisym -subroutine nl_operator_build_symmetric_weights(op) - type(nl_operator_t), intent(inout) :: op + subroutine nl_operator_build_symmetric_weights(op, max_size) + type(nl_operator_t), intent(inout) :: op + integer, optional, intent(in) :: max_size - PUSH_SUB(nl_operator_build_symmetric_weights) + integer :: ldf, start, end - ASSERT(op%const_w) + PUSH_SUB(nl_operator_build_symmetric_weights) - select case(op%symmetry) - case(OP_SYMMETRIC) - call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%ri, op%nri, & - op%npairs, op%wpair, op%ri_pos, op%ri_neg, op%wcenter) - if (op%mesh%parallel_in_domains) then - call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & - op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg, op%wcenter) - call group_by_pairs_sym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & - op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg, op%wcenter) - end if - case(OP_ANTISYMMETRIC) - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%ri, op%nri, & - op%npairs, op%wpair, op%ri_pos, op%ri_neg) - if (op%mesh%parallel_in_domains) then - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & - op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg) - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & - op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg) - end if - end select + ASSERT(op%const_w) + ASSERT(conf%target_states_block_size > 0) + + + select case(op%symmetry) + case(OP_SYMMETRIC) + if(present(max_size)) then + start = op%max_allocated_ri_pair + 1 + end = max_size + call reallocate_array(op%ri_pos, op%stencil%size/2, op%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%ri_neg, op%stencil%size/2, op%nri, op%max_allocated_ri_pair, end) + if (op%mesh%parallel_in_domains) then + call reallocate_array(op%inner%ri_pos, op%stencil%size/2, op%inner%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%inner%ri_neg, op%stencil%size/2, op%inner%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%outer%ri_pos, op%stencil%size/2, op%outer%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%outer%ri_neg, op%stencil%size/2, op%outer%nri, op%max_allocated_ri_pair, end) + end if + else + SAFE_ALLOCATE(op%wpair(1:op%stencil%size/2)) + start = 1 + end = log2(conf%target_states_block_size)+1 + + SAFE_ALLOCATE(op%ri_pos(1:op%stencil%size/2, 1:op%nri, 1:end)) + SAFE_ALLOCATE(op%ri_neg(1:op%stencil%size/2, 1:op%nri, 1:end)) + if (op%mesh%parallel_in_domains) then + SAFE_ALLOCATE(op%inner%ri_pos(1:op%stencil%size/2, 1:op%inner%nri, 1:end)) + SAFE_ALLOCATE(op%inner%ri_neg(1:op%stencil%size/2, 1:op%inner%nri, 1:end)) + SAFE_ALLOCATE(op%outer%ri_pos(1:op%stencil%size/2, 1:op%outer%nri, 1:end)) + SAFE_ALLOCATE(op%outer%ri_neg(1:op%stencil%size/2, 1:op%outer%nri, 1:end)) + end if + end if + op%max_allocated_ri_pair = end + ! 
Build the pair tables for each precomputed leading dimension + do ldf = start-1, end-1 + call group_by_pairs_sym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%ri, op%nri, & + op%npairs, op%wpair, op%ri_pos(:,:,ldf+1), op%ri_neg(:,:,ldf+1), op%wcenter) + if (op%mesh%parallel_in_domains) then + call group_by_pairs_sym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & + op%npairs, op%wpair, op%inner%ri_pos(:,:,ldf+1), op%inner%ri_neg(:,:,ldf+1), op%wcenter) + call group_by_pairs_sym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & + op%npairs, op%wpair, op%outer%ri_pos(:,:,ldf+1), op%outer%ri_neg(:,:,ldf+1), op%wcenter) + end if + end do + case(OP_ANTISYMMETRIC) + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%ri, op%nri, & + op%npairs, op%wpair, op%ri_pos, op%ri_neg) + if (op%mesh%parallel_in_domains) then + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & + op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg) + call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & + op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg) + end if + end select + + POP_SUB(nl_operator_build_symmetric_weights) + end subroutine nl_operator_build_symmetric_weights + + subroutine reallocate_array(ri, stencil_size, nri, old_size, new_size) + integer, allocatable, intent(inout) :: ri(:,:,:) + integer, intent(in) :: stencil_size, nri + integer, intent(in) :: old_size, new_size + + integer, allocatable :: tmp(:,:,:) - POP_SUB(nl_operator_build_symmetric_weights) -end subroutine nl_operator_build_symmetric_weights + SAFE_ALLOCATE_SOURCE_A(tmp, ri) + SAFE_DEALLOCATE_A(ri) + SAFE_ALLOCATE(ri(1:stencil_size, 1:nri, 1:new_size)) + ri(:,:,1:old_size) = tmp + SAFE_DEALLOCATE_A(tmp) + end subroutine reallocate_array #include "undef.F90" #include "real.F90" diff --git a/src/grid/nl_operator_inc.F90 b/src/grid/nl_operator_inc.F90 index ce87ec9be0..39eada0cdf 100644 --- a/src/grid/nl_operator_inc.F90 +++ b/src/grid/nl_operator_inc.F90 @@ -19,7 +19,7 @@ ! --------------------------------------------------------- subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, points, factor, async) - type(nl_operator_t), target, intent(in) :: op + type(nl_operator_t), target, intent(inout) :: op type(batch_t), target, intent(inout) :: fi type(batch_t), intent(inout) :: fo !< this should be target, but old ifort 9.1 segfaults with it logical, optional, intent(in) :: ghost_update @@ -41,7 +41,7 @@ #endif integer :: nri_loc, ini !> Symmetry of the stencil can be used to group points by pairs - real(real64), allocatable :: wpair(:) + real(real64), allocatable :: wpair(:) integer, pointer :: ri_pos(:,:), ri_neg(:,:) real(real64) :: wcenter @@ -174,30 +174,45 @@ contains ! 
--------------------------------------------------------- subroutine select_op() + integer :: ldf + PUSH_SUB(X(nl_operator_operate_batch).select_op) + ldf = logldf+1 + if (fi%status() == BATCH_PACKED) then + ldf = log2(int(fi%pack_size_real(1), int32))+1 + end if + + if (op%symmetry == OP_SYMMETRIC) then + if (ldf > op%max_allocated_ri_pair) then + call nl_operator_build_symmetric_weights(op, ldf) + end if + else + ldf = 1 + end if + select case (points_) case (OP_ALL) nri = op%nri imin => op%rimap_inv(1:) imax => op%rimap_inv(2:) ri => op%ri - ri_pos => op%ri_pos - ri_neg => op%ri_neg + ri_pos => op%ri_pos(:,:,ldf) + ri_neg => op%ri_neg(:,:,ldf) case (OP_INNER) nri = op%inner%nri imin => op%inner%imin imax => op%inner%imax ri => op%inner%ri - ri_pos => op%inner%ri_pos - ri_neg => op%inner%ri_neg + ri_pos => op%inner%ri_pos(:,:,ldf) + ri_neg => op%inner%ri_neg(:,:,ldf) case (OP_OUTER) nri = op%outer%nri imin => op%outer%imin imax => op%outer%imax ri => op%outer%ri - ri_pos => op%outer%ri_pos - ri_neg => op%outer%ri_neg + ri_pos => op%outer%ri_pos(:,:,ldf) + ri_neg => op%outer%ri_neg(:,:,ldf) case default ASSERT(.false.) end select @@ -256,7 +271,7 @@ contains subroutine operate_const_weights_antisymmetric() integer :: nn, ll, ii, ist, ipair - PUSH_SUB(X(nl_operator_operate_batch).operate_const_weights_symmetric) + PUSH_SUB(X(nl_operator_operate_batch).operate_const_weights_antisymmetric) nn = op%stencil%size @@ -623,7 +638,7 @@ end subroutine X(nl_operator_operate_batch) subroutine X(nl_operator_operate)(op, fi, fo, ghost_update, profile, points) R_TYPE, contiguous, intent(inout) :: fi(:) !< fi(op%np_part) - type(nl_operator_t), intent(in) :: op + type(nl_operator_t), intent(inout) :: op R_TYPE, contiguous, target, intent(out) :: fo(:) logical, optional, intent(in) :: ghost_update logical, optional, intent(in) :: profile diff --git a/src/grid/operate_sym_inc.c b/src/grid/operate_sym_inc.c index 57a22c51d4..93740d5de5 100644 --- a/src/grid/operate_sym_inc.c +++ b/src/grid/operate_sym_inc.c @@ -91,8 +91,8 @@ #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; - ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + ptrdiff_t indexj_pos = index_pos[j] + base_ik; + ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); @@ -283,8 +283,8 @@ #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; - ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); @@ -391,8 +391,8 @@ #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; - ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); @@ -455,8 +455,8 @@ #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; - ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + const ptrdiff_t indexj_pos = 
index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); @@ -499,8 +499,8 @@ #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; - ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); @@ -529,8 +529,8 @@ #else register VEC_TYPE wj = VEC_SCAL(wpair[j]); #endif - ptrdiff_t indexj_pos = (index_pos[j] << ldf) + base_ik; - ptrdiff_t indexj_neg = (index_neg[j] << ldf) + base_ik; + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); @@ -543,15 +543,21 @@ #endif #if VEC_SIZE > 1 + + const ptrdiff_t size = (ptrdiff_t) 1 << ldf; + double a; + for (; i < rimap_inv_max[l]; i++) { - ptrdiff_t k; - for (k = 0; k < (1 << ldf); k++) { - double a = 0.0; + for (ptrdiff_t k = 0; k < size; k++) { + a = 0.0; const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; - for (j = 0; j < npairs; j++) - a += wpair[j] * ( fi[(index_pos[j] << ldf) + base_ik] + fi[(index_neg[j] << ldf) + base_ik]); + for (j = 0; j < npairs; j++) + a += wpair[j] * ( fi[index_pos[j] + base_ik] + fi[index_neg[j] + base_ik]); a += w0 * fi[base_ik]; fo[base_ik] = a; } } #endif -- GitLab From: NicolasTD Date: Wed, 17 Dec 2025 10:48:56 +0100 Subject: [PATCH 06/11] Apply 1 suggestion(s) to 1 file(s) Co-authored-by: Sebastian Ohlmann --- src/grid/nl_operator.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index 5ccdb727e5..626721dc43 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -221,7 +221,7 @@ contains !%Section Execution::Optimization !%Default no !%Description - !% This variable selects if the operators are build using symmetries or not. + !% This variable selects if the operators are built using symmetries or not. !%End call parse_variable(namespace, 'OperateUseSymmetries', .false., use_symmetries) -- GitLab From e21246e5f6238e76c71ffdc533f86d600281940b Mon Sep 17 00:00:00 2001 From: NicolasTD Date: Wed, 17 Dec 2025 12:35:23 +0100 Subject: [PATCH 07/11] Improvements and fixes. --- src/grid/derivatives_test.F90 | 20 ++++++++++---------- src/grid/nl_operator.F90 | 7 ++++++- src/utils/dielectric_function.F90 | 22 +++------------------- 3 files changed, 19 insertions(+), 30 deletions(-) diff --git a/src/grid/derivatives_test.F90 b/src/grid/derivatives_test.F90 index c17406ce34..e176269586 100644 --- a/src/grid/derivatives_test.F90 +++ b/src/grid/derivatives_test.F90 @@ -135,7 +135,7 @@ contains kk = dot_product(kvec(1:dim), kvec(1:dim)) do ip = 1, mesh%np - kdotx = dot_product(kvec(1:dim), mesh%x(ip, 1:dim)) + kdotx = dot_product(kvec(1:dim), mesh%x(1:dim, ip)) phase = cos(kdotx) ff(ip) = phase @@ -150,7 +150,7 @@ end do do ip = mesh%np+1, mesh%np_part - kdotx = dot_product(kvec(1:dim), mesh%x(ip, 1:dim)) + kdotx = dot_product(kvec(1:dim), mesh%x(1:dim, ip)) phase = cos(kdotx) ff(ip) = phase @@ -169,36 +169,36 @@ do ip = 1, mesh%np x = 0.0_real64 - x(1:dim) = mesh%x(ip, 1:dim) + x(1:dim) = mesh%x(1:dim, ip) ! f = (x^2 + y^2 + z^2)^(p/2) r2 = dot_product(x(1:dim), x(1:dim)) - ff(ip) = r2**(0.5d0*p) + ff(ip) = r2**(0.5_real64*p) ! 
gradient: grad(f) = p * r^(p-2) * x if (p >= 2) then do d = 1, dim - grad(ip,d) = p * r2**(0.5d0*p - 1.0d0) * x(d) + grad(ip,d) = p * r2**(0.5_real64*p - 1.0_real64) * x(d) end do else - grad(ip,:) = 0.0 + grad(ip,:) = 0.0_real64 end if ! Laplacian: ∇² r^n = n (n + d - 2) r^(n-2) if (p >= 2) then - lapl(ip) = p * (p + dim - 2) * r2**(0.5d0*p - 1.0d0) + lapl(ip) = p * (p + dim - 2) * r2**(0.5_real64*p - 1.0_real64) else - lapl(ip) = 0.0 + lapl(ip) = 0.0_real64 end if end do do ip = mesh%np+1, mesh%np_part x = 0.0_real64 - x(1:dim) = mesh%x(ip,1:dim) + x(1:dim) = mesh%x(1:dim, ip) ! f = (x^2 + y^2 + z^2)^(p/2) r2 = dot_product(x(1:dim), x(1:dim)) - ff(ip) = r2**(0.5d0*p) + ff(ip) = r2**(0.5_real64*p) end do end subroutine testfunc_polynomial diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index 626721dc43..4b5c52635c 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -224,6 +224,9 @@ contains !% This variable selects if the operators are built using symmetries or not. !%End call parse_variable(namespace, 'OperateUseSymmetries', .false., use_symmetries) + if (accel_is_enabled() .and. use_symmetries) then + call messages_not_implemented("OperateUseSymmetries=yes with GPUs") + end if if (accel_is_enabled()) then @@ -969,9 +972,10 @@ contains POP_SUB(group_by_pairs_antisym) end subroutine group_by_pairs_antisym + !>@brief Builds (or rebuild) the necessary arrays for symmetric and antisymmetric stencils subroutine nl_operator_build_symmetric_weights(op, max_size) type(nl_operator_t), intent(inout) :: op - integer, optional, intent(in) :: max_size + integer, optional, intent(in) :: max_size !< For reallocation integer :: ldf, start, end @@ -1036,6 +1040,7 @@ contains POP_SUB(nl_operator_build_symmetric_weights) end subroutine nl_operator_build_symmetric_weights + !>@brief Reallocate an ri array subroutine reallocate_array(ri, stencil_size, nri, old_size, new_size) integer, allocatable, intent(inout) :: ri(:,:,:) integer, intent(in) :: stencil_size, nri diff --git a/src/utils/dielectric_function.F90 b/src/utils/dielectric_function.F90 index 77b63c574d..750875b12b 100644 --- a/src/utils/dielectric_function.F90 +++ b/src/utils/dielectric_function.F90 @@ -84,7 +84,7 @@ contains real(real64) :: dt, dt_ref, tt, ww, norm_Aext real(real64), allocatable :: vecpot(:, :), Aext(:), Eind_real(:, :), Eind_imag(:, :) complex(real64), allocatable :: dielectric(:), chi(:), invdielectric(:) - real(real64), allocatable :: vecpot_ref(:, :), average(:) + real(real64), allocatable :: vecpot_ref(:, :) type(spectrum_t) :: spectrum type(block_t) :: blk type(space_t) :: space @@ -102,7 +102,6 @@ contains SAFE_ALLOCATE(Aext(1:space%dim)) SAFE_ALLOCATE(Eind(1:space%dim)) - SAFE_ALLOCATE(average(1:space%dim*3)) if (parse_block(global_namespace, 'GaugeVectorField', blk) == 0) then @@ -193,8 +192,8 @@ contains read(ref_file, *) jj, tt, (vecpot_ref(ii, kk), kk = 1, space%dim*3) end do call io_close(ref_file) - do kk = 1, space%dim*3 - do ii = 1, time_steps + do ii = 1, time_steps + do kk = 1, space%dim*3 vecpot(ii, kk) = vecpot(ii, kk) - vecpot_ref(ii, kk) end do end do @@ -204,20 +203,6 @@ contains trim(io_workpath('td.general/gauge_field', global_namespace))//"'" call messages_info(1) - ! Treat the Drude peak - ! 
We remove the offset before applying the mask, to get proper treatment of metals - do kk = 1, space%dim*3 - average(kk) = M_ZERO - do ii = time_steps/2, time_steps - average(kk) = average(kk) + vecpot(ii, kk) - end do - average = average/(time_steps -time_steps/2 + 1) - do ii = 1, time_steps - vecpot(ii, kk) = vecpot(ii, kk) - average(kk) - end do - end do - Aext(1:space%dim) = Aext(1:space%dim) + average(1:space%dim) - ! Find out the iteration numbers corresponding to the time limits. ! Max time correspond to time_steps-1, as we start from 0. @@ -312,7 +297,6 @@ contains SAFE_DEALLOCATE_A(vecpot_ref) SAFE_DEALLOCATE_A(Aext) SAFE_DEALLOCATE_A(Eind) - SAFE_DEALLOCATE_A(average) SAFE_DEALLOCATE_A(Eind_real) SAFE_DEALLOCATE_A(Eind_imag) -- GitLab From cfef122e26248ff08ebe1cf4212033d75e98ee26 Mon Sep 17 00:00:00 2001 From: NicolasTD Date: Wed, 17 Dec 2025 14:51:40 +0100 Subject: [PATCH 08/11] Removing the depth of 32, as well as the VEC_SCAL_LD dead path. --- src/grid/operate_antisym_inc.c | 186 --------------------------- src/grid/operate_inc.c | 121 ------------------ src/grid/operate_sym_inc.c | 223 --------------------------------- 3 files changed, 530 deletions(-) diff --git a/src/grid/operate_antisym_inc.c b/src/grid/operate_antisym_inc.c index da1da0907d..8ab6f9fdfe 100644 --- a/src/grid/operate_antisym_inc.c +++ b/src/grid/operate_antisym_inc.c @@ -30,9 +30,6 @@ { const ptrdiff_t npairs = opn[0]; const ptrdiff_t nri = opnri[0]; -#if DEPTH >= 32 - const ptrdiff_t unroll32 = max1(32 * VEC_SIZE >> ldf); -#endif #if DEPTH >= 16 const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf); #endif @@ -58,169 +55,6 @@ index_neg = opri_neg + npairs * l; i = rimap_inv[l]; -#if DEPTH >= 32 - for (; i < (rimap_inv_max[l] - unroll32 + 1); i += unroll32) { - ptrdiff_t k; - for (k = 0; k < (1 << ldf); k += 32 * VEC_SIZE) { - register VEC_TYPE a0, a1, a2, a3; - register VEC_TYPE a4, a5, a6, a7; - register VEC_TYPE a8, a9, aa, ab; - register VEC_TYPE ac, ad, ae, af; - register VEC_TYPE b0, b1, b2, b3; - register VEC_TYPE b4, b5, b6, b7; - register VEC_TYPE b8, b9, ba, bb; - register VEC_TYPE bc, bd, be, bf; - - a0 = a1 = a2 = a3 = VEC_ZERO; - a4 = a5 = a6 = a7 = VEC_ZERO; - a8 = a9 = aa = ab = VEC_ZERO; - ac = ad = ae = af = VEC_ZERO; - b0 = b1 = b2 = b3 = VEC_ZERO; - b4 = b5 = b6 = b7 = VEC_ZERO; - b8 = b9 = ba = bb = VEC_ZERO; - bc = bd = be = bf = VEC_ZERO; - - for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else - register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); - a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); - a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); - pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); - a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); - pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); - a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); - pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); - a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4); - pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); - a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5); - pos = LOAD(fi + indexj_pos + 6 * 
VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); - a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6); - pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); - a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7); - pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE + k); - a8 = VEC_FMA(wj, VEC_SUB(pos, neg), a8); - pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE + k); - a9 = VEC_FMA(wj, VEC_SUB(pos, neg), a9); - pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE + k); - aa = VEC_FMA(wj, VEC_SUB(pos, neg), aa); - pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE + k); - ab = VEC_FMA(wj, VEC_SUB(pos, neg), ab); - pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE + k); - ac = VEC_FMA(wj, VEC_SUB(pos, neg), ac); - pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE + k); - ad = VEC_FMA(wj, VEC_SUB(pos, neg), ad); - pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE + k); - ae = VEC_FMA(wj, VEC_SUB(pos, neg), ae); - pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE + k); - af = VEC_FMA(wj, VEC_SUB(pos, neg), af); - pos = LOAD(fi + indexj_pos + 16 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 16 * VEC_SIZE + k); - b0 = VEC_FMA(wj, VEC_SUB(pos, neg), b0); - pos = LOAD(fi + indexj_pos + 17 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 17 * VEC_SIZE + k); - b1 = VEC_FMA(wj, VEC_SUB(pos, neg), b1); - pos = LOAD(fi + indexj_pos + 18 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 18 * VEC_SIZE + k); - b2 = VEC_FMA(wj, VEC_SUB(pos, neg), b2); - pos = LOAD(fi + indexj_pos + 19 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 19 * VEC_SIZE + k); - b3 = VEC_FMA(wj, VEC_SUB(pos, neg), b3); - pos = LOAD(fi + indexj_pos + 20 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 20 * VEC_SIZE + k); - b4 = VEC_FMA(wj, VEC_SUB(pos, neg), b4); - pos = LOAD(fi + indexj_pos + 21 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 21 * VEC_SIZE + k); - b5 = VEC_FMA(wj, VEC_SUB(pos, neg), b5); - pos = LOAD(fi + indexj_pos + 22 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 22 * VEC_SIZE + k); - b6 = VEC_FMA(wj, VEC_SUB(pos, neg), b6); - pos = LOAD(fi + indexj_pos + 23 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 23 * VEC_SIZE + k); - b7 = VEC_FMA(wj, VEC_SUB(pos, neg), b7); - pos = LOAD(fi + indexj_pos + 24 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 24 * VEC_SIZE + k); - b8 = VEC_FMA(wj, VEC_SUB(pos, neg), b8); - pos = LOAD(fi + indexj_pos + 25 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 25 * VEC_SIZE + k); - b9 = VEC_FMA(wj, VEC_SUB(pos, neg), b9); - pos = LOAD(fi + indexj_pos + 26 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 26 * VEC_SIZE + k); - ba = VEC_FMA(wj, VEC_SUB(pos, neg), ba); - pos = LOAD(fi + indexj_pos + 27 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 27 * VEC_SIZE + k); - bb = VEC_FMA(wj, VEC_SUB(pos, neg), bb); - pos = LOAD(fi + indexj_pos + 28 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 28 * VEC_SIZE + k); - bc = VEC_FMA(wj, VEC_SUB(pos, neg), bc); - pos = LOAD(fi + indexj_pos + 29 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 29 * VEC_SIZE + k); - bd = VEC_FMA(wj, VEC_SUB(pos, neg), bd); - pos = LOAD(fi + indexj_pos + 30 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 30 * VEC_SIZE + k); - be = 
VEC_FMA(wj, VEC_SUB(pos, neg), be); - pos = LOAD(fi + indexj_pos + 31 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 31 * VEC_SIZE + k); - be = VEC_FMA(wj, VEC_SUB(pos, neg), be); - } - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); - STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); - STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); - STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); - STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); - STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); - STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9); - STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); - STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); - STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); - STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); - STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); - STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); - STORE(fo + (i << ldf) + 16 * VEC_SIZE + k, b0); - STORE(fo + (i << ldf) + 17 * VEC_SIZE + k, b1); - STORE(fo + (i << ldf) + 18 * VEC_SIZE + k, b2); - STORE(fo + (i << ldf) + 19 * VEC_SIZE + k, b3); - STORE(fo + (i << ldf) + 20 * VEC_SIZE + k, b4); - STORE(fo + (i << ldf) + 21 * VEC_SIZE + k, b5); - STORE(fo + (i << ldf) + 22 * VEC_SIZE + k, b6); - STORE(fo + (i << ldf) + 23 * VEC_SIZE + k, b7); - STORE(fo + (i << ldf) + 24 * VEC_SIZE + k, b8); - STORE(fo + (i << ldf) + 25 * VEC_SIZE + k, b9); - STORE(fo + (i << ldf) + 26 * VEC_SIZE + k, ba); - STORE(fo + (i << ldf) + 27 * VEC_SIZE + k, bb); - STORE(fo + (i << ldf) + 28 * VEC_SIZE + k, bc); - STORE(fo + (i << ldf) + 29 * VEC_SIZE + k, bd); - STORE(fo + (i << ldf) + 30 * VEC_SIZE + k, be); - STORE(fo + (i << ldf) + 31 * VEC_SIZE + k, bf); - } - } -#endif - #if DEPTH >= 16 for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) { ptrdiff_t k; @@ -236,11 +70,7 @@ ac = ad = ae = af = VEC_ZERO; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; register VEC_TYPE pos = LOAD(fi + indexj_pos + k); @@ -324,11 +154,7 @@ a4 = a5 = a6 = a7 = VEC_ZERO; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; register VEC_TYPE pos = LOAD(fi + indexj_pos + k); @@ -377,11 +203,7 @@ a0 = a1 = a2 = a3 = VEC_ZERO; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; register VEC_TYPE pos = LOAD(fi + indexj_pos + k); @@ -414,11 +236,7 @@ a0 = a1 = VEC_ZERO; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; register VEC_TYPE pos = LOAD(fi + indexj_pos + k); @@ -440,11 +258,7 @@ for (k = 0; k < (1 << ldf); k += VEC_SIZE) { register VEC_TYPE a0 = VEC_ZERO; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); 
-#endif ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; register VEC_TYPE pos = LOAD(fi + indexj_pos + k); diff --git a/src/grid/operate_inc.c b/src/grid/operate_inc.c index d9014ac367..2eeb3d0ebf 100644 --- a/src/grid/operate_inc.c +++ b/src/grid/operate_inc.c @@ -29,9 +29,6 @@ { const ptrdiff_t n = opn[0]; const ptrdiff_t nri = opnri[0]; -#if DEPTH >= 32 - const ptrdiff_t unroll32 = max1(32 * VEC_SIZE >> ldf); -#endif #if DEPTH >= 16 const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf); #endif @@ -55,104 +52,6 @@ index = opri + n * l; i = rimap_inv[l]; -#if DEPTH >= 32 - for (; i < (rimap_inv_max[l] - unroll32 + 1); i += unroll32) { - ptrdiff_t k; - for (k = 0; k < (1 << ldf); k += 32 * VEC_SIZE) { - register VEC_TYPE a0, a1, a2, a3; - register VEC_TYPE a4, a5, a6, a7; - register VEC_TYPE a8, a9, aa, ab; - register VEC_TYPE ac, ad, ae, af; - register VEC_TYPE b0, b1, b2, b3; - register VEC_TYPE b4, b5, b6, b7; - register VEC_TYPE b8, b9, ba, bb; - register VEC_TYPE bc, bd, be, bf; - - a0 = a1 = a2 = a3 = VEC_ZERO; - a4 = a5 = a6 = a7 = VEC_ZERO; - a8 = a9 = aa = ab = VEC_ZERO; - ac = ad = ae = af = VEC_ZERO; - b0 = b1 = b2 = b3 = VEC_ZERO; - b4 = b5 = b6 = b7 = VEC_ZERO; - b8 = b9 = ba = bb = VEC_ZERO; - bc = bd = be = bf = VEC_ZERO; - - for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else - register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif - ptrdiff_t indexj = (index[j] + i) << ldf; - a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); - a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1); - a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2); - a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3); - a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4); - a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5); - a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6); - a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7); - a8 = VEC_FMA(wj, LOAD(fi + indexj + 8 * VEC_SIZE + k), a8); - a9 = VEC_FMA(wj, LOAD(fi + indexj + 9 * VEC_SIZE + k), a9); - aa = VEC_FMA(wj, LOAD(fi + indexj + 10 * VEC_SIZE + k), aa); - ab = VEC_FMA(wj, LOAD(fi + indexj + 11 * VEC_SIZE + k), ab); - ac = VEC_FMA(wj, LOAD(fi + indexj + 12 * VEC_SIZE + k), ac); - ad = VEC_FMA(wj, LOAD(fi + indexj + 13 * VEC_SIZE + k), ad); - ae = VEC_FMA(wj, LOAD(fi + indexj + 14 * VEC_SIZE + k), ae); - af = VEC_FMA(wj, LOAD(fi + indexj + 15 * VEC_SIZE + k), af); - b0 = VEC_FMA(wj, LOAD(fi + indexj + 16 * VEC_SIZE + k), b0); - b1 = VEC_FMA(wj, LOAD(fi + indexj + 17 * VEC_SIZE + k), b1); - b2 = VEC_FMA(wj, LOAD(fi + indexj + 18 * VEC_SIZE + k), b2); - b3 = VEC_FMA(wj, LOAD(fi + indexj + 19 * VEC_SIZE + k), b3); - b4 = VEC_FMA(wj, LOAD(fi + indexj + 20 * VEC_SIZE + k), b4); - b5 = VEC_FMA(wj, LOAD(fi + indexj + 21 * VEC_SIZE + k), b5); - b6 = VEC_FMA(wj, LOAD(fi + indexj + 22 * VEC_SIZE + k), b6); - b7 = VEC_FMA(wj, LOAD(fi + indexj + 23 * VEC_SIZE + k), b7); - b8 = VEC_FMA(wj, LOAD(fi + indexj + 24 * VEC_SIZE + k), b8); - b9 = VEC_FMA(wj, LOAD(fi + indexj + 25 * VEC_SIZE + k), b9); - ba = VEC_FMA(wj, LOAD(fi + indexj + 26 * VEC_SIZE + k), ba); - bb = VEC_FMA(wj, LOAD(fi + indexj + 27 * VEC_SIZE + k), bb); - bc = VEC_FMA(wj, LOAD(fi + indexj + 28 * VEC_SIZE + k), bc); - bd = VEC_FMA(wj, LOAD(fi + indexj + 29 * VEC_SIZE + k), bd); - be = VEC_FMA(wj, LOAD(fi + indexj + 30 * VEC_SIZE + k), be); - bf = VEC_FMA(wj, LOAD(fi + indexj + 31 * VEC_SIZE + k), bf); - } - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << 
ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); - STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); - STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); - STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); - STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); - STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); - STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9); - STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); - STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); - STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); - STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); - STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); - STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); - STORE(fo + (i << ldf) + 16 * VEC_SIZE + k, b0); - STORE(fo + (i << ldf) + 17 * VEC_SIZE + k, b1); - STORE(fo + (i << ldf) + 18 * VEC_SIZE + k, b2); - STORE(fo + (i << ldf) + 19 * VEC_SIZE + k, b3); - STORE(fo + (i << ldf) + 20 * VEC_SIZE + k, b4); - STORE(fo + (i << ldf) + 21 * VEC_SIZE + k, b5); - STORE(fo + (i << ldf) + 22 * VEC_SIZE + k, b6); - STORE(fo + (i << ldf) + 23 * VEC_SIZE + k, b7); - STORE(fo + (i << ldf) + 24 * VEC_SIZE + k, b8); - STORE(fo + (i << ldf) + 25 * VEC_SIZE + k, b9); - STORE(fo + (i << ldf) + 26 * VEC_SIZE + k, ba); - STORE(fo + (i << ldf) + 27 * VEC_SIZE + k, bb); - STORE(fo + (i << ldf) + 28 * VEC_SIZE + k, bc); - STORE(fo + (i << ldf) + 29 * VEC_SIZE + k, bd); - STORE(fo + (i << ldf) + 30 * VEC_SIZE + k, be); - STORE(fo + (i << ldf) + 31 * VEC_SIZE + k, bf); - } - } -#endif - #if DEPTH >= 16 for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) { ptrdiff_t k; @@ -168,11 +67,7 @@ ac = ad = ae = af = VEC_ZERO; for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif ptrdiff_t indexj = (index[j] + i) << ldf; a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1); @@ -222,11 +117,7 @@ a4 = a5 = a6 = a7 = VEC_ZERO; for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif ptrdiff_t indexj = (index[j] + i) << ldf; a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1); @@ -258,11 +149,7 @@ a0 = a1 = a2 = a3 = VEC_ZERO; for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif ptrdiff_t indexj = (index[j] + i) << ldf; a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1); @@ -286,11 +173,7 @@ a0 = a1 = VEC_ZERO; for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif ptrdiff_t indexj = (index[j] + i) << ldf; a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1); @@ -307,11 +190,7 @@ for (k = 0; k < (1 << ldf); k += VEC_SIZE) { register VEC_TYPE a0 = VEC_ZERO; for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif ptrdiff_t indexj = (index[j] + i) << ldf; a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); } diff --git a/src/grid/operate_sym_inc.c b/src/grid/operate_sym_inc.c index 93740d5de5..fc5ba4ac88 100644 --- a/src/grid/operate_sym_inc.c +++ b/src/grid/operate_sym_inc.c @@ -30,9 +30,6 @@ { const ptrdiff_t npairs = opn[0]; 
const ptrdiff_t nri = opnri[0]; -#if DEPTH >= 32 - const ptrdiff_t unroll32 = max1(32 * VEC_SIZE >> ldf); -#endif #if DEPTH >= 16 const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf); #endif @@ -61,206 +58,6 @@ index_neg = opri_neg + npairs * l; i = rimap_inv[l]; -#if DEPTH >= 32 - for (; i < (rimap_inv_max[l] - unroll32 + 1); i += unroll32) { - ptrdiff_t k; - for (k = 0; k < (1 << ldf); k += 32 * VEC_SIZE) { - register VEC_TYPE a0, a1, a2, a3; - register VEC_TYPE a4, a5, a6, a7; - register VEC_TYPE a8, a9, aa, ab; - register VEC_TYPE ac, ad, ae, af; - register VEC_TYPE b0, b1, b2, b3; - register VEC_TYPE b4, b5, b6, b7; - register VEC_TYPE b8, b9, ba, bb; - register VEC_TYPE bc, bd, be, bf; - - a0 = a1 = a2 = a3 = VEC_ZERO; - a4 = a5 = a6 = a7 = VEC_ZERO; - a8 = a9 = aa = ab = VEC_ZERO; - ac = ad = ae = af = VEC_ZERO; - b0 = b1 = b2 = b3 = VEC_ZERO; - b4 = b5 = b6 = b7 = VEC_ZERO; - b8 = b9 = ba = bb = VEC_ZERO; - bc = bd = be = bf = VEC_ZERO; - - const ptrdiff_t base_ik = (ptrdiff_t) i << ldf; + k; - - for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else - register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif - ptrdiff_t indexj_pos = index_pos[j] + base_ik; - ptrdiff_t indexj_neg = index_neg[j] + base_ik; - register VEC_TYPE pos = LOAD(fi + indexj_pos); - register VEC_TYPE neg = LOAD(fi + indexj_neg); - a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); - a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); - pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); - a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); - pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); - a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); - pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE); - a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4); - pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE); - a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5); - pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE); - a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6); - pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE); - a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7); - pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE); - a8 = VEC_FMA(wj, VEC_ADD(pos, neg), a8); - pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE); - a9 = VEC_FMA(wj, VEC_ADD(pos, neg), a9); - pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE); - aa = VEC_FMA(wj, VEC_ADD(pos, neg), aa); - pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE); - ab = VEC_FMA(wj, VEC_ADD(pos, neg), ab); - pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE); - ac = VEC_FMA(wj, VEC_ADD(pos, neg), ac); - pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE); - ad = VEC_FMA(wj, VEC_ADD(pos, neg), ad); - pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE); - ae = VEC_FMA(wj, VEC_ADD(pos, neg), ae); - pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE); - af = VEC_FMA(wj, VEC_ADD(pos, neg), af); - pos = LOAD(fi + indexj_pos + 16 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 16 * VEC_SIZE); - 
b0 = VEC_FMA(wj, VEC_ADD(pos, neg), b0); - pos = LOAD(fi + indexj_pos + 17 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 17 * VEC_SIZE); - b1 = VEC_FMA(wj, VEC_ADD(pos, neg), b1); - pos = LOAD(fi + indexj_pos + 18 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 18 * VEC_SIZE); - b2 = VEC_FMA(wj, VEC_ADD(pos, neg), b2); - pos = LOAD(fi + indexj_pos + 19 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 19 * VEC_SIZE); - b3 = VEC_FMA(wj, VEC_ADD(pos, neg), b3); - pos = LOAD(fi + indexj_pos + 20 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 20 * VEC_SIZE); - b4 = VEC_FMA(wj, VEC_ADD(pos, neg), b4); - pos = LOAD(fi + indexj_pos + 21 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 21 * VEC_SIZE); - b5 = VEC_FMA(wj, VEC_ADD(pos, neg), b5); - pos = LOAD(fi + indexj_pos + 22 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 22 * VEC_SIZE); - b6 = VEC_FMA(wj, VEC_ADD(pos, neg), b6); - pos = LOAD(fi + indexj_pos + 23 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 23 * VEC_SIZE); - b7 = VEC_FMA(wj, VEC_ADD(pos, neg), b7); - pos = LOAD(fi + indexj_pos + 24 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 24 * VEC_SIZE); - b8 = VEC_FMA(wj, VEC_ADD(pos, neg), b8); - pos = LOAD(fi + indexj_pos + 25 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 25 * VEC_SIZE); - b9 = VEC_FMA(wj, VEC_ADD(pos, neg), b9); - pos = LOAD(fi + indexj_pos + 26 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 26 * VEC_SIZE); - ba = VEC_FMA(wj, VEC_ADD(pos, neg), ba); - pos = LOAD(fi + indexj_pos + 27 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 27 * VEC_SIZE); - bb = VEC_FMA(wj, VEC_ADD(pos, neg), bb); - pos = LOAD(fi + indexj_pos + 28 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 28 * VEC_SIZE); - bc = VEC_FMA(wj, VEC_ADD(pos, neg), bc); - pos = LOAD(fi + indexj_pos + 29 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 29 * VEC_SIZE); - bd = VEC_FMA(wj, VEC_ADD(pos, neg), bd); - pos = LOAD(fi + indexj_pos + 30 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 30 * VEC_SIZE); - be = VEC_FMA(wj, VEC_ADD(pos, neg), be); - pos = LOAD(fi + indexj_pos + 31 * VEC_SIZE); - neg = LOAD(fi + indexj_neg + 31 * VEC_SIZE); - be = VEC_FMA(wj, VEC_ADD(pos, neg), be); - } - const VEC_TYPE wj = VEC_SCAL(w0); - a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); - a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1); - a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2); - a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3); - a4 = VEC_FMA(wj, LOAD(fi + base_ik + 4 * VEC_SIZE), a4); - a5 = VEC_FMA(wj, LOAD(fi + base_ik + 5 * VEC_SIZE), a5); - a6 = VEC_FMA(wj, LOAD(fi + base_ik + 6 * VEC_SIZE), a6); - a7 = VEC_FMA(wj, LOAD(fi + base_ik + 7 * VEC_SIZE), a7); - a8 = VEC_FMA(wj, LOAD(fi + base_ik + 8 * VEC_SIZE), a8); - a9 = VEC_FMA(wj, LOAD(fi + base_ik + 9 * VEC_SIZE), a9); - aa = VEC_FMA(wj, LOAD(fi + base_ik + 10 * VEC_SIZE), aa); - ab = VEC_FMA(wj, LOAD(fi + base_ik + 11 * VEC_SIZE), ab); - ac = VEC_FMA(wj, LOAD(fi + base_ik + 12 * VEC_SIZE), ac); - ad = VEC_FMA(wj, LOAD(fi + base_ik + 13 * VEC_SIZE), ad); - ae = VEC_FMA(wj, LOAD(fi + base_ik + 14 * VEC_SIZE), ae); - af = VEC_FMA(wj, LOAD(fi + base_ik + 15 * VEC_SIZE), af); - b0 = VEC_FMA(wj, LOAD(fi + base_ik + 16 * VEC_SIZE), b0); - b1 = VEC_FMA(wj, LOAD(fi + base_ik + 17 * VEC_SIZE), b1); - b2 = VEC_FMA(wj, LOAD(fi + base_ik + 18 * VEC_SIZE), b2); - b3 = VEC_FMA(wj, LOAD(fi + base_ik + 19 * VEC_SIZE), b3); - b4 = VEC_FMA(wj, LOAD(fi + base_ik + 20 * VEC_SIZE), b4); - b5 = VEC_FMA(wj, LOAD(fi + base_ik + 21 * VEC_SIZE), b5); - b6 = VEC_FMA(wj, LOAD(fi + base_ik + 22 * VEC_SIZE), b6); - b7 = VEC_FMA(wj, LOAD(fi + base_ik + 23 * VEC_SIZE), b7); - 
b8 = VEC_FMA(wj, LOAD(fi + base_ik + 24 * VEC_SIZE), b8); - b9 = VEC_FMA(wj, LOAD(fi + base_ik + 25 * VEC_SIZE), b9); - ba = VEC_FMA(wj, LOAD(fi + base_ik + 26 * VEC_SIZE), ba); - bb = VEC_FMA(wj, LOAD(fi + base_ik + 27 * VEC_SIZE), bb); - bc = VEC_FMA(wj, LOAD(fi + base_ik + 28 * VEC_SIZE), bc); - bd = VEC_FMA(wj, LOAD(fi + base_ik + 29 * VEC_SIZE), bd); - be = VEC_FMA(wj, LOAD(fi + base_ik + 30 * VEC_SIZE), be); - bf = VEC_FMA(wj, LOAD(fi + base_ik + 31 * VEC_SIZE), bf); - - - STORE(fo + base_ik, a0); - STORE(fo + base_ik + 1 * VEC_SIZE, a1); - STORE(fo + base_ik + 2 * VEC_SIZE, a2); - STORE(fo + base_ik + 3 * VEC_SIZE, a3); - STORE(fo + base_ik + 4 * VEC_SIZE, a4); - STORE(fo + base_ik + 5 * VEC_SIZE, a5); - STORE(fo + base_ik + 6 * VEC_SIZE, a6); - STORE(fo + base_ik + 7 * VEC_SIZE, a7); - STORE(fo + base_ik + 8 * VEC_SIZE, a8); - STORE(fo + base_ik + 9 * VEC_SIZE, a9); - STORE(fo + base_ik + 10 * VEC_SIZE, aa); - STORE(fo + base_ik + 11 * VEC_SIZE, ab); - STORE(fo + base_ik + 12 * VEC_SIZE, ac); - STORE(fo + base_ik + 13 * VEC_SIZE, ad); - STORE(fo + base_ik + 14 * VEC_SIZE, ae); - STORE(fo + base_ik + 15 * VEC_SIZE, af); - STORE(fo + base_ik + 16 * VEC_SIZE, b0); - STORE(fo + base_ik + 17 * VEC_SIZE, b1); - STORE(fo + base_ik + 18 * VEC_SIZE, b2); - STORE(fo + base_ik + 19 * VEC_SIZE, b3); - STORE(fo + base_ik + 20 * VEC_SIZE, b4); - STORE(fo + base_ik + 21 * VEC_SIZE, b5); - STORE(fo + base_ik + 22 * VEC_SIZE, b6); - STORE(fo + base_ik + 23 * VEC_SIZE, b7); - STORE(fo + base_ik + 24 * VEC_SIZE, b8); - STORE(fo + base_ik + 25 * VEC_SIZE, b9); - STORE(fo + base_ik + 26 * VEC_SIZE, ba); - STORE(fo + base_ik + 27 * VEC_SIZE, bb); - STORE(fo + base_ik + 28 * VEC_SIZE, bc); - STORE(fo + base_ik + 29 * VEC_SIZE, bd); - STORE(fo + base_ik + 30 * VEC_SIZE, be); - STORE(fo + base_ik + 31 * VEC_SIZE, bf); - } - } -#endif - #if DEPTH >= 16 for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) { ptrdiff_t k; @@ -278,11 +75,7 @@ const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif const ptrdiff_t indexj_pos = index_pos[j] + base_ik; const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); @@ -386,11 +179,7 @@ const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif const ptrdiff_t indexj_pos = index_pos[j] + base_ik; const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); @@ -450,11 +239,7 @@ const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif const ptrdiff_t indexj_pos = index_pos[j] + base_ik; const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); @@ -494,11 +279,7 @@ const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif const ptrdiff_t indexj_pos = index_pos[j] + base_ik; const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); @@ -524,11 +305,7 @@ register VEC_TYPE a0 = VEC_ZERO; const 
ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; for (j = 0; j < npairs; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(wpair + j); -#else register VEC_TYPE wj = VEC_SCAL(wpair[j]); -#endif const ptrdiff_t indexj_pos = index_pos[j] + base_ik; const ptrdiff_t indexj_neg = index_neg[j] + base_ik; register VEC_TYPE pos = LOAD(fi + indexj_pos); -- GitLab From ade83dc39b0017add50130ace02c946f9699b901 Mon Sep 17 00:00:00 2001 From: NicolasTD Date: Wed, 17 Dec 2025 14:53:22 +0100 Subject: [PATCH 09/11] Adding parentheses for safety of the macros. --- src/include/vectors.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/include/vectors.h b/src/include/vectors.h index 8927198068..8f735020a6 100644 --- a/src/include/vectors.h +++ b/src/include/vectors.h @@ -124,10 +124,10 @@ #define VEC_LDU(addr) VEC_LD(addr) #define VEC_ST(addr, vec) (addr)[0] = vec #define VEC_STU(addr, vec) VEC_ST(addr, vec) -#define VEC_FMA(aa, bb, cc) aa * (bb) + cc +#define VEC_FMA(aa, bb, cc) ((aa) * (bb) + (cc)) #define VEC_SCAL(aa) aa -#define VEC_ADD(aa, bb) (aa+bb) -#define VEC_SUB(aa, bb) (aa-bb) +#define VEC_ADD(aa, bb) ((aa)+(bb)) +#define VEC_SUB(aa, bb) ((aa)-(bb)) #define VEC_ZERO 0.0 #define DEPTH 8 -- GitLab From 08834e935af76c79a0bd448194e986eed9ae930d Mon Sep 17 00:00:00 2001 From: NicolasTD Date: Wed, 17 Dec 2025 16:09:57 +0100 Subject: [PATCH 10/11] Use the precomputed offsets for the antisymmetric operator too. --- src/grid/nl_operator.F90 | 101 +++++++------- src/grid/nl_operator_inc.F90 | 11 +- src/grid/operate_antisym_inc.c | 235 ++++++++++++++++++--------------- 3 files changed, 180 insertions(+), 167 deletions(-) diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index 4b5c52635c..71054ef80f 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -225,7 +225,7 @@ contains !%End call parse_variable(namespace, 'OperateUseSymmetries', .false., use_symmetries) if (accel_is_enabled() .and. use_symmetries) then - call messages_not_implemented("OperateUseSymmetries=yes with GPUs") + call messages_not_implemented("OperateUseSymmetries=yes with GPUs") end if if (accel_is_enabled()) then @@ -900,15 +900,16 @@ contains end subroutine group_by_pairs_sym !> Take a list of weights and offsets and build pairs of symmetric points with common weights - subroutine group_by_pairs_antisym(size, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg) + subroutine group_by_pairs_antisym(size, ldf, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg) integer, intent(in) :: size + integer, intent(in) :: ldf integer, intent(in) :: offsets(:, :) real(real64), intent(in) :: wre(:) integer, intent(in) :: ri(:, :) integer, intent(in) :: nri - integer, intent(out) :: npairs - real(real64), allocatable, intent(out) :: wpair(:) - integer, allocatable, intent(out) :: pair_pos(:,:,:), pair_neg(:,:,:) + integer, intent(out) :: npairs + real(real64), intent(inout) :: wpair(:) + integer, intent(inout) :: pair_pos(:,:), pair_neg(:,:) logical, allocatable :: used(:) integer :: i, j, ndim, s @@ -934,10 +935,6 @@ contains call sort(abs_wre, idx) !
Max pairs = n/2 - SAFE_ALLOCATE(pair_pos(1:size/2, 1:nri, 1:1)) - SAFE_ALLOCATE(pair_neg(1:size/2, 1:nri, 1:1)) - SAFE_ALLOCATE(wpair(1:size/2)) - do i = 1, size if (used(i)) cycle @@ -954,8 +951,8 @@ contains npairs = npairs + 1 do s = 1, nri - pair_pos(npairs, s, 1) = ri(idx(i), s) - pair_neg(npairs, s, 1) = ri(idx(j), s) + pair_pos(npairs, s) = ishft(ri(idx(i), s), ldf) + pair_neg(npairs, s) = ishft(ri(idx(j), s), ldf) end do wpair(npairs) = M_HALF*(wre(idx(i)) - wre(idx(j))) @@ -979,44 +976,44 @@ contains integer :: ldf, start, end + if (op%symmetry == OP_GENERAL) return + PUSH_SUB(nl_operator_build_symmetric_weights) ASSERT(op%const_w) ASSERT(conf%target_states_block_size > 0) - - select case(op%symmetry) - case(OP_SYMMETRIC) - if(present(max_size)) then - start = op%max_allocated_ri_pair + 1 - end = max_size - call reallocate_array(op%ri_pos, op%stencil%size/2, op%nri, op%max_allocated_ri_pair, end) - call reallocate_array(op%ri_neg, op%stencil%size/2, op%nri, op%max_allocated_ri_pair, end) - if (op%mesh%parallel_in_domains) then - call reallocate_array(op%inner%ri_pos, op%stencil%size/2, op%inner%nri, op%max_allocated_ri_pair, end) - call reallocate_array(op%inner%ri_neg, op%stencil%size/2, op%inner%nri, op%max_allocated_ri_pair, end) - call reallocate_array(op%outer%ri_pos, op%stencil%size/2, op%outer%nri, op%max_allocated_ri_pair, end) - call reallocate_array(op%outer%ri_neg, op%stencil%size/2, op%outer%nri, op%max_allocated_ri_pair, end) - end if - else - SAFE_ALLOCATE(op%wpair(1:op%stencil%size/2)) - start = 1 - end = log2(conf%target_states_block_size)+1 - - SAFE_ALLOCATE(op%ri_pos(1:op%stencil%size/2, 1:op%nri, 1:end)) - SAFE_ALLOCATE(op%ri_neg(1:op%stencil%size/2, 1:op%nri, 1:end)) - if (op%mesh%parallel_in_domains) then - SAFE_ALLOCATE(op%inner%ri_pos(1:op%stencil%size/2, 1:op%inner%nri, 1:end)) - SAFE_ALLOCATE(op%inner%ri_neg(1:op%stencil%size/2, 1:op%inner%nri, 1:end)) - SAFE_ALLOCATE(op%outer%ri_pos(1:op%stencil%size/2, 1:op%outer%nri, 1:end)) - SAFE_ALLOCATE(op%outer%ri_neg(1:op%stencil%size/2, 1:op%outer%nri, 1:end)) - end if + if(present(max_size)) then + start = op%max_allocated_ri_pair + 1 + end = max_size + call reallocate_array(op%ri_pos, op%stencil%size/2, op%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%ri_neg, op%stencil%size/2, op%nri, op%max_allocated_ri_pair, end) + if (op%mesh%parallel_in_domains) then + call reallocate_array(op%inner%ri_pos, op%stencil%size/2, op%inner%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%inner%ri_neg, op%stencil%size/2, op%inner%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%outer%ri_pos, op%stencil%size/2, op%outer%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%outer%ri_neg, op%stencil%size/2, op%outer%nri, op%max_allocated_ri_pair, end) end if - op%max_allocated_ri_pair = end + else ! 
Max pairs = n/2 + SAFE_ALLOCATE(op%wpair(1:op%stencil%size/2)) + start = 1 + end = log2(conf%target_states_block_size)+1 - op%max_allocated_ri_pair = end - do ldf = start-1, end-1 + SAFE_ALLOCATE(op%ri_pos(1:op%stencil%size/2, 1:op%nri, 1:end)) + SAFE_ALLOCATE(op%ri_neg(1:op%stencil%size/2, 1:op%nri, 1:end)) + if (op%mesh%parallel_in_domains) then + SAFE_ALLOCATE(op%inner%ri_pos(1:op%stencil%size/2, 1:op%inner%nri, 1:end)) + SAFE_ALLOCATE(op%inner%ri_neg(1:op%stencil%size/2, 1:op%inner%nri, 1:end)) + SAFE_ALLOCATE(op%outer%ri_pos(1:op%stencil%size/2, 1:op%outer%nri, 1:end)) + SAFE_ALLOCATE(op%outer%ri_neg(1:op%stencil%size/2, 1:op%outer%nri, 1:end)) + end if + end if + op%max_allocated_ri_pair = end + + do ldf = start-1, end-1 + select case(op%symmetry) + case(OP_SYMMETRIC) call group_by_pairs_sym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%ri, op%nri, & op%npairs, op%wpair, op%ri_pos(:,:,ldf+1), op%ri_neg(:,:,ldf+1), op%wcenter) if (op%mesh%parallel_in_domains) then @@ -1025,17 +1022,17 @@ contains call group_by_pairs_sym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & op%npairs, op%wpair, op%outer%ri_pos(:,:,ldf+1), op%outer%ri_neg(:,:,ldf+1), op%wcenter) end if - end do - case(OP_ANTISYMMETRIC) - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%ri, op%nri, & - op%npairs, op%wpair, op%ri_pos, op%ri_neg) - if (op%mesh%parallel_in_domains) then - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & - op%npairs, op%wpair, op%inner%ri_pos, op%inner%ri_neg) - call group_by_pairs_antisym(op%stencil%size, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & - op%npairs, op%wpair, op%outer%ri_pos, op%outer%ri_neg) - end if - end select + case(OP_ANTISYMMETRIC) + call group_by_pairs_antisym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%ri, op%nri, & + op%npairs, op%wpair, op%ri_pos(:,:,ldf+1), op%ri_neg(:,:,ldf+1)) + if (op%mesh%parallel_in_domains) then + call group_by_pairs_antisym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & + op%npairs, op%wpair, op%inner%ri_pos(:,:,ldf+1), op%inner%ri_neg(:,:,ldf+1)) + call group_by_pairs_antisym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & + op%npairs, op%wpair, op%outer%ri_pos(:,:,ldf+1), op%outer%ri_neg(:,:,ldf+1)) + end if + end select + end do POP_SUB(nl_operator_build_symmetric_weights) end subroutine nl_operator_build_symmetric_weights diff --git a/src/grid/nl_operator_inc.F90 b/src/grid/nl_operator_inc.F90 index 39eada0cdf..3229904da8 100644 --- a/src/grid/nl_operator_inc.F90 +++ b/src/grid/nl_operator_inc.F90 @@ -183,12 +183,11 @@ contains ldf = log2(int(fi%pack_size_real(1), int32))+1 end if - if (op%symmetry == OP_SYMMETRIC) then - if (ldf > op%max_allocated_ri_pair) then - call nl_operator_build_symmetric_weights(op, ldf) - end if - else - ldf = 1 + if (X(function_global) == OP_FORTRAN .or. op%symmetry == OP_GENERAL) ldf = 1 + + ! If needed, we realloc the offsets + if (op%symmetry /= OP_GENERAL .and. 
ldf > op%max_allocated_ri_pair) then + call nl_operator_build_symmetric_weights(op, ldf) end if select case (points_) diff --git a/src/grid/operate_antisym_inc.c b/src/grid/operate_antisym_inc.c index 8ab6f9fdfe..efa735b6a8 100644 --- a/src/grid/operate_antisym_inc.c +++ b/src/grid/operate_antisym_inc.c @@ -69,76 +69,78 @@ a8 = a9 = aa = ab = VEC_ZERO; ac = ad = ae = af = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { register VEC_TYPE wj = VEC_SCAL(wpair[j]); - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); - pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); - pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); - pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE); a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4); - pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE); a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5); - pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE); a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6); - pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE); a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7); - pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE); a8 = VEC_FMA(wj, VEC_SUB(pos, neg), a8); - pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE); a9 = VEC_FMA(wj, VEC_SUB(pos, neg), a9); - pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE); aa = VEC_FMA(wj, VEC_SUB(pos, neg), aa); - pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE); ab = VEC_FMA(wj, VEC_SUB(pos, neg), ab); - pos = 
LOAD(fi + indexj_pos + 12 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE); ac = VEC_FMA(wj, VEC_SUB(pos, neg), ac); - pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE); ad = VEC_FMA(wj, VEC_SUB(pos, neg), ad); - pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE); ae = VEC_FMA(wj, VEC_SUB(pos, neg), ae); - pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE); af = VEC_FMA(wj, VEC_SUB(pos, neg), af); } - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); - STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); - STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); - STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); - STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); - STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); - STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9); - STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); - STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); - STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); - STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); - STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); - STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); + STORE(fo + base_ik + 4 * VEC_SIZE, a4); + STORE(fo + base_ik + 5 * VEC_SIZE, a5); + STORE(fo + base_ik + 6 * VEC_SIZE, a6); + STORE(fo + base_ik + 7 * VEC_SIZE, a7); + STORE(fo + base_ik + 8 * VEC_SIZE, a8); + STORE(fo + base_ik + 9 * VEC_SIZE, a9); + STORE(fo + base_ik + 10 * VEC_SIZE, aa); + STORE(fo + base_ik + 11 * VEC_SIZE, ab); + STORE(fo + base_ik + 12 * VEC_SIZE, ac); + STORE(fo + base_ik + 13 * VEC_SIZE, ad); + STORE(fo + base_ik + 14 * VEC_SIZE, ae); + STORE(fo + base_ik + 15 * VEC_SIZE, af); } } #endif @@ -153,43 +155,45 @@ a0 = a1 = a2 = a3 = VEC_ZERO; a4 = a5 = a6 = a7 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { register VEC_TYPE wj = VEC_SCAL(wpair[j]); - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); - pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); - pos = LOAD(fi + 
indexj_pos + 3 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); - pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE); a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4); - pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE); a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5); - pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE); a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6); - pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE); a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7); } - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); - STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); - STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); - STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); - STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); + STORE(fo + base_ik + 4 * VEC_SIZE, a4); + STORE(fo + base_ik + 5 * VEC_SIZE, a5); + STORE(fo + base_ik + 6 * VEC_SIZE, a6); + STORE(fo + base_ik + 7 * VEC_SIZE, a7); } } #endif @@ -202,27 +206,29 @@ a0 = a1 = a2 = a3 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { register VEC_TYPE wj = VEC_SCAL(wpair[j]); - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); - pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); - pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); } - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); } } #endif @@ 
-235,19 +241,21 @@ a0 = a1 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { register VEC_TYPE wj = VEC_SCAL(wpair[j]); - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); - pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE + k); - neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE + k); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); } - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); } } #endif @@ -257,27 +265,36 @@ ptrdiff_t k; for (k = 0; k < (1 << ldf); k += VEC_SIZE) { register VEC_TYPE a0 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; for (j = 0; j < npairs; j++) { register VEC_TYPE wj = VEC_SCAL(wpair[j]); - ptrdiff_t indexj_pos = (index_pos[j] + i) << ldf; - ptrdiff_t indexj_neg = (index_neg[j] + i) << ldf; - register VEC_TYPE pos = LOAD(fi + indexj_pos + k); - register VEC_TYPE neg = LOAD(fi + indexj_neg + k); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); } - STORE(fo + (i << ldf) + k, a0); + STORE(fo + base_ik, a0); } } #endif #if VEC_SIZE > 1 + + const ptrdiff_t size = (ptrdiff_t) 1 << ldf; + double a; + for (; i < rimap_inv_max[l]; i++) { - ptrdiff_t k; - for (k = 0; k < (1 << ldf); k++) { - double a = 0.0; - for (j = 0; j < npairs; j++) - a += wpair[j] * ( fi[((index_pos[j] + i) << ldf) + k] - fi[((index_neg[j] + i) << ldf) + k]); - fo[(i << ldf) + k] = a; + for (ptrdiff_t k = 0; k < size; k++) { + a = 0.0; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + const double diff = fi[indexj_pos] - fi[indexj_neg]; + a += wpair[j] * diff; + } + fo[base_ik] = a; } } #endif -- GitLab From 6c13b21267b99c97f64e94b79feb663a5c505591 Mon Sep 17 00:00:00 2001 From: NicolasTD Date: Wed, 17 Dec 2025 17:03:22 +0100 Subject: [PATCH 11/11] Implement a robust algorithm for the sorting of the points of the stencils --- src/basic/sort.F90 | 117 ++++++++++++++++++++++++++++++++++- src/grid/derivatives.F90 | 2 +- src/grid/nl_operator.F90 | 24 +++---- src/grid/nl_operator_inc.F90 | 18 ++++-- src/main/test.F90 | 4 +- 5 files changed, 145 insertions(+), 20 deletions(-) diff --git a/src/basic/sort.F90 b/src/basic/sort.F90 index 22add9d042..3d833f7683 100644 --- a/src/basic/sort.F90 +++ b/src/basic/sort.F90 @@ -29,7 +29,8 @@ module sort_oct_m private public :: & - sort + sort, & + robbust_sort_by_abs ! ------------------------------------------------------------------------------ !> This is the common interface to a sorting routine.
@@ -171,6 +172,120 @@ contains end subroutine lsort + !>@brief Sorting criterion for the robust sorting below + pure logical function less_idx(i, j, off, kabs, ksgn) result(less) + integer, intent(in) :: i, j + integer, intent(in) :: off(:, :) ! (ndim, n) + integer(int64), intent(in) :: kabs(:), ksgn(:) + integer :: d, ndim + + if (kabs(i) /= kabs(j)) then + less = kabs(i) < kabs(j) + return + end if + + if (ksgn(i) /= ksgn(j)) then + less = ksgn(i) < ksgn(j) ! 0 before 1 + return + end if + + ndim = size(off, 1) + do d = 1, ndim + if (off(d,i) /= off(d,j)) then + less = off(d,i) < off(d,j) + return + end if + end do + + less = i < j ! final stability (should not matter if offsets are unique) + end function less_idx + + !>@brief Performs the merge sort of the permutation array + recursive subroutine mergesort_perm(perm, tmp, l, r, off, kabs, ksgn) + integer, intent(inout) :: perm(:), tmp(:) + integer, intent(in) :: l, r + integer, intent(in) :: off(:, :) + integer(int64), intent(in) :: kabs(:), ksgn(:) + + integer :: m, i, j, k + + if (r <= l) return + m = (l + r) / 2 + + call mergesort_perm(perm, tmp, l, m, off, kabs, ksgn) + call mergesort_perm(perm, tmp, m+1, r, off, kabs, ksgn) + + i = l; j = m+1; k = l + do while (i <= m .and. j <= r) + if (less_idx(perm(i), perm(j), off, kabs, ksgn)) then + tmp(k) = perm(i); i = i+1 + else + tmp(k) = perm(j); j = j+1 + end if + k = k+1 + end do + do while (i <= m); tmp(k) = perm(i); i=i+1; k=k+1; end do + do while (j <= r); tmp(k) = perm(j); j=j+1; k=k+1; end do + + perm(l:r) = tmp(l:r) + end subroutine mergesort_perm + + !>@brief Robust sorting of floating-point numbers by absolute value + !! + !! This works as follows: + !! - Convert the floating-point numbers to integers + !! - Put negative numbers first + !! - Use the integer offset values, assumed to be a unique key, to order the degenerate points + subroutine robbust_sort_by_abs(v, off, perm, negative_first) + real(real64), intent(in) :: v(:) + integer, intent(in) :: off(:, :) ! (ndim, n) + integer, intent(out) :: perm(size(v)) + logical, intent(in), optional :: negative_first + + integer :: n, i + logical :: neg_first + integer, allocatable :: tmp(:) + integer(int64), allocatable :: kabs(:), ksgn(:) + real(real64) :: eps, ratio, max_ratio + integer(int64) :: kmax + + PUSH_SUB(robbust_sort_by_abs) + + n = size(v) + ASSERT(size(off, dim=2) == n) + + neg_first = optional_default(negative_first, .true.) + + ! Tolerance for the quantization, to eliminate noise + eps = 1.0e-14_real64 * maxval(abs(v)) + + kmax = huge(0_int64) + max_ratio = real(kmax, real64) - M_ONE + + allocate(tmp(n), kabs(n), ksgn(n)) + + do i = 1, n + perm(i) = i + ratio = abs(v(i)) / eps + if (ratio < max_ratio) then + kabs(i) = nint( abs(v(i)) / eps, int64 ) + else + kabs(i) = kmax + end if + + if (neg_first) then + ksgn(i) = merge(0, 1, v(i) < 0.0_real64) ! neg (0) then pos (1) + else + ksgn(i) = merge(0, 1, v(i) >= 0.0_real64) !
pos (0) then neg (1) + end if + end do + + call mergesort_perm(perm, tmp, 1, n, off, kabs, ksgn) + + deallocate(tmp, kabs, ksgn) + POP_SUB(robbust_sort_by_abs) + end subroutine robbust_sort_by_abs + #include "undef.F90" #include "complex.F90" #include "sort_inc.F90" diff --git a/src/grid/derivatives.F90 b/src/grid/derivatives.F90 index 5d3ff64ee4..1352860a8e 100644 --- a/src/grid/derivatives.F90 +++ b/src/grid/derivatives.F90 @@ -893,10 +893,10 @@ contains if (der%remove_zero_weight_points) then call nl_operator_remove_zero_weight_points(op(i), space, der%mesh) end if + call nl_operator_output_weights(op(i)) if (op(i)%const_w) then call nl_operator_build_symmetric_weights(op(i)) end if - call nl_operator_output_weights(op(i)) end do ! In case of constant weights, we store the weights of the Laplacian on the GPU, as this diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index 71054ef80f..ed6d8be53e 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -838,7 +838,6 @@ contains logical, allocatable :: used(:) integer :: i, j, ndim, s logical :: same - real(real64), allocatable :: abs_wre(:) integer, allocatable :: idx(:) real(real64), parameter :: tol = 1.0e-12_real64 @@ -853,10 +852,8 @@ contains ndim = ubound(offsets, dim=1) - SAFE_ALLOCATE(abs_wre(1:size)) SAFE_ALLOCATE(idx(1:size)) - abs_wre = abs(wre) - call sort(abs_wre, idx) + call robbust_sort_by_abs(wre, offsets, idx) do i = 1, size if (used(i)) cycle @@ -893,7 +890,6 @@ contains ASSERT(npairs == size/2) - SAFE_DEALLOCATE_A(abs_wre) SAFE_DEALLOCATE_A(idx) POP_SUB(group_by_pairs_sym) @@ -914,7 +910,6 @@ contains logical, allocatable :: used(:) integer :: i, j, ndim, s logical :: same - real(real64), allocatable :: abs_wre(:) integer, allocatable :: idx(:) real(real64), parameter :: tol = 1.0e-12_real64 @@ -929,10 +924,8 @@ contains ndim = ubound(offsets, dim=1) - SAFE_ALLOCATE(abs_wre(1:size)) SAFE_ALLOCATE(idx(1:size)) - abs_wre = abs(wre) - call sort(abs_wre, idx) + call robbust_sort_by_abs(wre, offsets, idx) ! Max pairs = n/2 do i = 1, size @@ -964,7 +957,6 @@ contains ASSERT(npairs == size/2) - SAFE_DEALLOCATE_A(abs_wre) SAFE_DEALLOCATE_A(idx) POP_SUB(group_by_pairs_antisym) end subroutine group_by_pairs_antisym @@ -974,7 +966,7 @@ contains type(nl_operator_t), intent(inout) :: op integer, optional, intent(in) :: max_size !< For reallocation - integer :: ldf, start, end + integer :: ldf, start, end, ipair if (op%symmetry == OP_GENERAL) return @@ -1034,6 +1026,16 @@ contains end select end do + if (.not. present(max_size)) then + write(message(1), '(3a)') 'Debug info: Sorted weights for ', trim(op%label), '.' + call messages_info(1, debug_only=.true.) + + do ipair = 1, op%npairs + write(message(1), '(a,i3,f25.10,2(1x,i4))') ' ', ipair, op%wpair(ipair), op%ri_pos(ipair,1,1), op%ri_neg(ipair,1,1) + call messages_info(1, debug_only=.true.) 
+ end do + end if + POP_SUB(nl_operator_build_symmetric_weights) end subroutine nl_operator_build_symmetric_weights diff --git a/src/grid/nl_operator_inc.F90 b/src/grid/nl_operator_inc.F90 index 3229904da8..6ba64178b3 100644 --- a/src/grid/nl_operator_inc.F90 +++ b/src/grid/nl_operator_inc.F90 @@ -196,22 +196,28 @@ contains imin => op%rimap_inv(1:) imax => op%rimap_inv(2:) ri => op%ri - ri_pos => op%ri_pos(:,:,ldf) - ri_neg => op%ri_neg(:,:,ldf) + if (op%symmetry /= OP_GENERAL) then + ri_pos => op%ri_pos(:,:,ldf) + ri_neg => op%ri_neg(:,:,ldf) + end if case (OP_INNER) nri = op%inner%nri imin => op%inner%imin imax => op%inner%imax ri => op%inner%ri - ri_pos => op%inner%ri_pos(:,:,ldf) - ri_neg => op%inner%ri_neg(:,:,ldf) + if (op%symmetry /= OP_GENERAL) then + ri_pos => op%inner%ri_pos(:,:,ldf) + ri_neg => op%inner%ri_neg(:,:,ldf) + end if case (OP_OUTER) nri = op%outer%nri imin => op%outer%imin imax => op%outer%imax ri => op%outer%ri - ri_pos => op%outer%ri_pos(:,:,ldf) - ri_neg => op%outer%ri_neg(:,:,ldf) + if (op%symmetry /= OP_GENERAL) then + ri_pos => op%outer%ri_pos(:,:,ldf) + ri_neg => op%outer%ri_neg(:,:,ldf) + end if case default ASSERT(.false.) end select diff --git a/src/main/test.F90 b/src/main/test.F90 index 0a8d3be31c..dac77f904b 100644 --- a/src/main/test.F90 +++ b/src/main/test.F90 @@ -1304,7 +1304,9 @@ contains call zderivatives_test(sys%gr%der, sys%namespace, param%repetitions, param%min_blocksize, param%max_blocksize) end if - call derivatives_advanced_benchmark(sys%gr%der, sys%namespace) + if (sys%space%dim > 1) then + call derivatives_advanced_benchmark(sys%gr%der, sys%namespace) + end if SAFE_DEALLOCATE_P(sys) -- GitLab
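Taken together, the kernel changes in this series reduce the symmetric stencil application to a per-point recurrence of the form fo[i] = w0*fi[i] + sum over pairs j of wpair[j]*(fi[i + pos_j] + fi[i + neg_j]), with the pairs visited in order of increasing |wpair[j]|. The following is a minimal scalar sketch of that computation in C; the function name, the plain element offsets, and the omission of the ldf state blocking, the unrolling, and the boundary handling are assumptions made for illustration, not the code of the patches.

#include <stddef.h>

/* Scalar sketch of the pairwise symmetric stencil application.
   Visiting the pairs in increasing order of |wpair| keeps the running
   partial sum small, which is the numerical-stability idea of the series. */
void apply_sym_pairs(ptrdiff_t np, ptrdiff_t npairs, double w0,
                     const double *wpair,    /* pair weights, sorted by |wpair| ascending */
                     const ptrdiff_t *pos, const ptrdiff_t *neg, /* mirror-point offsets */
                     const double *fi, double *fo) {
  for (ptrdiff_t i = 0; i < np; i++) {
    double a = 0.0;
    for (ptrdiff_t j = 0; j < npairs; j++)
      a += wpair[j] * (fi[i + pos[j]] + fi[i + neg[j]]); /* one weight per mirror pair */
    fo[i] = a + w0 * fi[i]; /* the center point carries its own weight */
  }
}

The antisymmetric (gradient) kernels are the same loop with fi[i + pos[j]] - fi[i + neg[j]] and no center term, since the central weight of an antisymmetric stencil vanishes.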
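The parenthesization added in PATCH 09 is the standard macro-hygiene fix: without parentheses around each argument, a macro whose argument is itself an expression can expand with the wrong operator precedence. A self-contained C illustration of the failure mode the added parentheses prevent (the _OLD/_NEW suffixes are only for the example):

#include <stdio.h>

/* Old scalar fallback: expands its arguments without protecting them. */
#define VEC_FMA_OLD(aa, bb, cc) aa * (bb) + cc
/* New definition from the patch: every argument parenthesized. */
#define VEC_FMA_NEW(aa, bb, cc) ((aa) * (bb) + (cc))

int main(void) {
  double x = 1.0, y = 2.0, z = 3.0, w = 4.0;
  /* OLD expands to  x + y * (z) + w : the sum x + y is torn apart. */
  printf("old: %g\n", VEC_FMA_OLD(x + y, z, w)); /* prints 11 */
  printf("new: %g\n", VEC_FMA_NEW(x + y, z, w)); /* prints 13, i.e. (x + y)*z + w */
  return 0;
}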
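The ordering that robbust_sort_by_abs establishes in PATCH 11 is deterministic because ties in the quantized magnitude are broken first by the sign and then by the stencil offsets, which are unique per point. Below is a standalone C sketch of an equivalent key-based sort; the struct layout, the fixed three-dimensional offsets, the zero-input guard, and the simple overflow clamp are assumptions made for the example, not the Fortran implementation's API.

#include <stdint.h>
#include <stdlib.h>
#include <math.h>

typedef struct {
  int64_t kabs; /* |w| quantized in units of eps */
  int sgn;      /* 0 = negative weight, 1 = non-negative (negatives first) */
  int off[3];   /* stencil offset, assumed unique per point */
  int idx;      /* original 0-based position */
} sort_key_t;

static int cmp_key(const void *pa, const void *pb) {
  const sort_key_t *a = pa, *b = pb;
  if (a->kabs != b->kabs) return a->kabs < b->kabs ? -1 : 1;
  if (a->sgn != b->sgn) return a->sgn < b->sgn ? -1 : 1;
  for (int d = 0; d < 3; d++)
    if (a->off[d] != b->off[d]) return a->off[d] < b->off[d] ? -1 : 1;
  return 0; /* unreachable when the offsets are unique */
}

/* Fill perm (0-based) with the robust ordering of the n weights v. */
void robust_perm(int n, const double v[], const int off[][3], int perm[]) {
  sort_key_t *key = malloc((size_t)n * sizeof *key);
  if (key == NULL) return;
  double eps = 0.0;
  for (int i = 0; i < n; i++) if (fabs(v[i]) > eps) eps = fabs(v[i]);
  eps = eps > 0.0 ? 1.0e-14 * eps : 1.0; /* quantization tolerance; guard all-zero input */
  for (int i = 0; i < n; i++) {
    double ratio = fabs(v[i]) / eps;
    key[i].kabs = ratio < 9.0e18 ? (int64_t)llround(ratio) : INT64_MAX; /* clamp, as the Fortran does */
    key[i].sgn = v[i] < 0.0 ? 0 : 1;
    for (int d = 0; d < 3; d++) key[i].off[d] = off[i][d];
    key[i].idx = i;
  }
  qsort(key, (size_t)n, sizeof *key, cmp_key);
  for (int i = 0; i < n; i++) perm[i] = key[i].idx;
  free(key);
}

Note that qsort need not be stable here: because the offsets make every key unique, any comparison sort produces the same permutation on every platform, which is the point of the exercise.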