diff --git a/src/basic/sort.F90 b/src/basic/sort.F90 index 22add9d0426f97826297efa62f5ca5d39708176b..3d833f76839621e77d276b8d6f26aea3edfc93b9 100644 --- a/src/basic/sort.F90 +++ b/src/basic/sort.F90 @@ -29,7 +29,8 @@ module sort_oct_m private public :: & - sort + sort, & + robbust_sort_by_abs ! ------------------------------------------------------------------------------ !> This is the common interface to a sorting routine. @@ -171,6 +172,120 @@ contains end subroutine lsort + !>@brief Sorting criterium for the robbust sorting below + pure logical function less_idx(i, j, off, kabs, ksgn) result(less) + integer, intent(in) :: i, j + integer, intent(in) :: off(:, :) ! (ndim, n) + integer(int64), intent(in) :: kabs(:), ksgn(:) + integer :: d, ndim + + if (kabs(i) /= kabs(j)) then + less = kabs(i) < kabs(j) + return + end if + + if (ksgn(i) /= ksgn(j)) then + less = ksgn(i) < ksgn(j) ! 0 before 1 + return + end if + + ndim = size(off, 1) + do d = 1, ndim + if (off(d,i) /= off(d,j)) then + less = off(d,i) < off(d,j) + return + end if + end do + + less = i < j ! final stability (should not matter if offsets are unique) + end function less_idx + + !>@brief Perform the permutations for the sorting + recursive subroutine mergesort_perm(perm, tmp, l, r, off, kabs, ksgn) + integer, intent(inout) :: perm(:), tmp(:) + integer, intent(in) :: l, r + integer, intent(in) :: off(:, :) + integer(int64), intent(in) :: kabs(:), ksgn(:) + + integer :: m, i, j, k + + if (r <= l) return + m = (l + r) / 2 + + call mergesort_perm(perm, tmp, l, m, off, kabs, ksgn) + call mergesort_perm(perm, tmp, m+1, r, off, kabs, ksgn) + + i = l; j = m+1; k = l + do while (i <= m .and. j <= r) + if (less_idx(perm(i), perm(j), off, kabs, ksgn)) then + tmp(k) = perm(i); i = i+1 + else + tmp(k) = perm(j); j = j+1 + end if + k = k+1 + end do + do while (i <= m); tmp(k) = perm(i); i=i+1; k=k+1; end do + do while (j <= r); tmp(k) = perm(j); j=j+1; k=k+1; end do + + perm(l:r) = tmp(l:r) + end subroutine mergesort_perm + + !>@brief Robbust sorting of floating point numbers by absolute values + !! + !! This works as follow: + !! - Convert the floating point number to integers + !! - Put negative number first + !! - Use the offset integer values, assumed to be unique key, to order the degenerated points + subroutine robbust_sort_by_abs(v, off, perm, negative_first) + real(real64), intent(in) :: v(:) + integer, intent(in) :: off(:, :) ! (ndim, n) + integer, intent(out) :: perm(size(v)) + logical, intent(in), optional :: negative_first + + integer :: n, i + logical :: neg_first + integer, allocatable :: tmp(:) + integer(int64), allocatable :: kabs(:), ksgn(:) + real(real64) :: eps, ratio, max_ratio + integer(int64) :: kmax + + PUSH_SUB(robbust_sort_by_abs) + + n = size(v) + ASSERT(size(off, dim=2) == n) + + neg_first = optional_default(negative_first, .true.) + + ! Tolerance for the quantization, to eliminate noise + eps = 1.0e-14_real64 * maxval(abs(v)) + + kmax = huge(0_int64) + max_ratio = real(kmax, real64) - M_ONE + + allocate(tmp(n), kabs(n), ksgn(n)) + + do i = 1, n + perm(i) = i + ratio = abs(v(i)) / eps + if (ratio < max_ratio) then + kabs(i) = nint( abs(v(i)) / eps, int64 ) + else + kabs(i) = kmax + end if + + if (neg_first) then + ksgn(i) = merge(0, 1, v(i) < 0.0_real64) ! neg (0) then pos (1) + else + ksgn(i) = merge(0, 1, v(i) >= 0.0_real64) ! pos (0) then neg (1) + end if + end do + + call mergesort_perm(perm, tmp, 1, n, off, kabs, ksgn) + + deallocate(tmp, kabs, ksgn) + POP_SUB(robbust_sort_by_abs) + end subroutine robbust_sort_by_abs + #include "undef.F90" #include "complex.F90" #include "sort_inc.F90" diff --git a/src/grid/CMakeLists.txt b/src/grid/CMakeLists.txt index 2b277d8025a5ac9ec0c8792dfc85b52b10d23e77..e2557c23953b4b8b564ac18ab816e811f3f9606a 100644 --- a/src/grid/CMakeLists.txt +++ b/src/grid/CMakeLists.txt @@ -16,6 +16,7 @@ target_sources(Octopus_lib PRIVATE curv_gygi.F90 curv_modine.F90 derivatives.F90 + derivatives_test.F90 fourier_shell.F90 fourier_space.F90 grid.F90 diff --git a/src/grid/derivatives.F90 b/src/grid/derivatives.F90 index 6a8c3dcf9fa605f0145d52aa9253667581985a75..1352860a8ef6be74fb6fceb1136dcd6738fe4095 100644 --- a/src/grid/derivatives.F90 +++ b/src/grid/derivatives.F90 @@ -42,10 +42,8 @@ module derivatives_oct_m use, intrinsic :: iso_fortran_env use lalg_basic_oct_m use lalg_adv_oct_m - use loct_oct_m use math_oct_m use mesh_oct_m - use mesh_function_oct_m use messages_oct_m use namespace_oct_m use nl_operator_oct_m @@ -64,13 +62,6 @@ module derivatives_oct_m use utils_oct_m use varinfo_oct_m -! debug purposes -! use io_binary_oct_m -! use io_function_oct_m -! use io_oct_m -! use unit_oct_m -! use unit_system_oct_m - implicit none private @@ -80,8 +71,6 @@ module derivatives_oct_m derivatives_end, & derivatives_build, & derivatives_handle_batch_t, & - dderivatives_test, & - zderivatives_test, & dderivatives_batch_start, & zderivatives_batch_start, & dderivatives_batch_finish, & @@ -430,7 +419,7 @@ contains order_ = optional_default(order, der%order) ! initialize nl operator - call nl_operator_init(lapl, name_) + call nl_operator_init(lapl, name_, symm=OP_SYMMETRIC) ! create stencil select case (der%stencil_type) @@ -484,7 +473,7 @@ contains dir_label = ' ' if (ii < 5) dir_label = index2axis(ii) - call nl_operator_init(der%grad(ii), "Gradient "//dir_label) + call nl_operator_init(der%grad(ii), "Gradient "//dir_label, symm=OP_ANTISYMMETRIC) ! create stencil select case (der%stencil_type) @@ -905,6 +894,9 @@ contains call nl_operator_remove_zero_weight_points(op(i), space, der%mesh) end if call nl_operator_output_weights(op(i)) + if (op(i)%const_w) then + call nl_operator_build_symmetric_weights(op(i)) + end if end do ! In case of constant weights, we store the weights of the Laplacian on the GPU, as this diff --git a/src/grid/derivatives_inc.F90 b/src/grid/derivatives_inc.F90 index 99b247edfd880b20b1f191cb646a31546680566c..ecb628746326f771a75ee013a08d5f71f9e7215f 100644 --- a/src/grid/derivatives_inc.F90 +++ b/src/grid/derivatives_inc.F90 @@ -322,301 +322,6 @@ subroutine X(derivatives_curl)(der, ff, op_ff, ghost_update, set_bc) POP_SUB(X(derivatives_curl)) end subroutine X(derivatives_curl) - -! ---------------------------------------------------------- -!> @brief unit test for derivatives -!! -!! This will be called from the test run mode. -subroutine X(derivatives_test)(this, namespace, repetitions, min_blocksize, max_blocksize) - type(derivatives_t), intent(in) :: this - type(namespace_t), intent(in) :: namespace - integer, intent(in) :: repetitions - integer, intent(in) :: min_blocksize - integer, intent(in) :: max_blocksize - - R_TYPE, allocatable :: ff(:), opff(:, :), gradff(:, :), curlff(:, :), res(:), resgrad(:, :), rescurl(:, :) - R_TYPE :: aa, bb, cc - integer :: ip, idir, ist, ib - type(batch_t) :: ffb, opffb - type(batch_t) :: gradffb(this%mesh%box%dim) - integer :: blocksize, itime - logical :: packstates - real(real64) :: stime, etime - character(len=20) :: type - real(real64) :: norm - - call parse_variable(namespace, 'StatesPack', .true., packstates) - - SAFE_ALLOCATE(ff(1:this%mesh%np_part)) - SAFE_ALLOCATE(opff(1:this%mesh%np, 1:this%mesh%box%dim)) - SAFE_ALLOCATE(gradff(1:this%mesh%np_part, 1:this%mesh%box%dim)) - SAFE_ALLOCATE(res(1:this%mesh%np_part)) - SAFE_ALLOCATE(resgrad(1:this%mesh%np, 1:this%mesh%box%dim)) - SAFE_ALLOCATE(curlff(1:this%mesh%np, 1:this%mesh%box%dim)) - -#ifdef R_TREAL - type = 'real' -#else - type = 'complex' -#endif - - ! Note: here we need to use a constant function or anything that - ! is constant at the borders, since we assume that all boundary - ! points have equal values to optimize the application of the nl-operator. - - aa = R_TOTYPE(1.0)/this%mesh%box%bounding_box_l(1) - bb = R_TOTYPE(10.0) - cc = R_TOTYPE(100.0) - -#ifdef R_TCOMPLEX - ! we make things more "complex" - aa = aa + M_ZI*R_TOTYPE(0.01) - bb = bb*exp(M_ZI*R_TOTYPE(0.345)) - cc = cc - M_ZI*R_TOTYPE(50.0) -#endif - - - do ip = 1, this%mesh%np_part - ff(ip) = bb*exp(-aa*sum(this%mesh%x(:, ip)**2)) + cc - end do - do ip = 1, this%mesh%np - do idir = 1, this%mesh%box%dim - gradff(ip, idir) = -M_TWO*aa*bb*this%mesh%x(idir, ip)*exp(-aa*sum(this%mesh%x(:, ip)**2)) - end do - end do - - ! curl only needed in 3D - if (this%mesh%box%dim == 3) then - do ip = 1, this%mesh%np - curlff(ip, 1) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(2, ip) - this%mesh%x(3, ip)) - curlff(ip, 2) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(3, ip) - this%mesh%x(1, ip)) - curlff(ip, 3) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(1, ip) - this%mesh%x(2, ip)) - end do - end if - - message(1) = "Testing Laplacian" - call messages_info(1, namespace=namespace) - - ! test Laplacian - blocksize = min_blocksize - do - - call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) - call X(batch_init)(opffb, 1, 1, blocksize, this%mesh%np) - - do ist = 1, blocksize - call batch_set_state(ffb, ist, this%mesh%np_part, ff) - end do - - if (packstates) then - call ffb%do_pack() - call opffb%do_pack(copy = .false.) - end if - - if (repetitions > 1) then - call X(derivatives_batch_perform)(this%lapl, this, ffb, opffb, set_bc = .false., factor = M_HALF) - end if - - stime = loct_clock() - do itime = 1, repetitions - call X(derivatives_batch_perform)(this%lapl, this, ffb, opffb, set_bc = .false., factor = M_HALF) - end do - etime = (loct_clock() - stime)/real(repetitions, real64) - - call batch_get_state(opffb, blocksize, this%mesh%np, res) - - call ffb%end() - call opffb%end() - - do ip = 1, this%mesh%np - res(ip) = M_TWO*res(ip) - & - (M_FOUR*aa**2*bb*sum(this%mesh%x(:, ip)**2)*exp(-aa*sum(this%mesh%x(:, ip)**2)) & - - this%mesh%box%dim*M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))) - end do - - write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & - 'Laplacian ', trim(type), & - ' bsize = ', blocksize, & - ' , error = ', X(mf_nrm2)(this%mesh, res), & - ' , Gflops = ', & -#ifdef R_TREAL - blocksize*this%mesh%np*M_TWO*this%lapl%stencil%size/(etime*1.0e9_real64) -#else - blocksize*this%mesh%np*M_FOUR*this%lapl%stencil%size/(etime*1.0e9_real64) -#endif - - call messages_info(1, namespace=namespace) - - blocksize = 2*blocksize - if (blocksize > max_blocksize) exit - - end do - - message(1) = "Testing Gradient" - call messages_info(1, namespace=namespace) - - ! test Gradient - blocksize = min_blocksize - do - call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) - - do ist = 1, blocksize - call batch_set_state(ffb, ist, this%mesh%np_part, ff) - end do - - if (packstates) then - call ffb%do_pack() - end if - - if (repetitions > 1) then - call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) - end if - - stime = loct_clock() - do itime = 1, repetitions - call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) - end do - etime = (loct_clock() - stime)/real(repetitions, real64) - - do idir = 1, this%mesh%box%dim - call batch_get_state(gradffb(idir), blocksize, this%mesh%np, resgrad(:, idir)) - end do - - call ffb%end() - do idir = 1, this%mesh%box%dim - call gradffb(idir)%end() - end do - - do idir = 1, this%mesh%box%dim - do ip = 1, this%mesh%np - resgrad(ip, idir) = resgrad(ip, idir) - gradff(ip, idir) - end do - end do - - write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & - 'Batch gradient ', trim(type), & - ' bsize = ', blocksize, & - ' , error (x direction) = ', X(mf_nrm2)(this%mesh, resgrad(:, 1)), & - ' , Gflops = ', & -#ifdef R_TREAL - blocksize*this%mesh%np*M_TWO*this%grad(1)%stencil%size*this%mesh%box%dim/(etime*1.0e9_real64) -#else - blocksize*this%mesh%np*M_FOUR*this%grad(1)%stencil%size*this%mesh%box%dim/(etime*1.0e9_real64) -#endif - call messages_info(1, namespace=namespace) - - blocksize = 2*blocksize - if (blocksize > max_blocksize) exit - end do - - call X(derivatives_grad)(this, ff, opff, set_bc = .false.) - - do idir = 1, this%mesh%box%dim - do ip = 1, this%mesh%np - opff(ip, idir) = opff(ip, idir) - (-M_TWO*aa*bb*this%mesh%x_t(ip, idir)*exp(-aa*sum(this%mesh%x(:, ip)**2))) - end do - end do - - message(1) = '' - call messages_info(1, namespace=namespace) - - write(message(1), '(3a, es17.10)') 'Gradient ', trim(type), & - ' err = ', X(mf_nrm2)(this%mesh, this%mesh%box%dim, opff) - call messages_info(1, namespace=namespace) - - message(1) = '' - call messages_info(1, namespace=namespace) - - ! test curl for 3D case - if (this%mesh%box%dim == 3) then - message(1) = "Testing curl" - call messages_info(1, namespace=namespace) - - do blocksize = 3, 6, 3 - call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) - - do ist = 1, blocksize - call batch_set_state(ffb, ist, this%mesh%np_part, ff) - end do - - if (packstates) then - call ffb%do_pack() - end if - - ! test for one blocksize without precomputed gradient - if (blocksize == 6) then - call X(derivatives_batch_curl)(this, ffb, set_bc=.false.) - else - call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) - call X(derivatives_batch_curl_from_gradient)(this, ffb, gradffb) - end if - - SAFE_ALLOCATE(rescurl(1:this%mesh%np, 1:blocksize)) - do ist = 1, blocksize - call batch_get_state(ffb, ist, this%mesh%np, rescurl(:, ist)) - end do - - call ffb%end() - do idir = 1, this%mesh%box%dim - call gradffb(idir)%end() - end do - - do ip = 1, this%mesh%np - do ib = 0, blocksize-1, this%mesh%box%dim - do idir = 1, this%mesh%box%dim - rescurl(ip, ib+idir) = rescurl(ip, ib+idir) - curlff(ip, idir) - end do - end do - end do - - norm = M_ZERO - do ist = 1, blocksize - norm = norm + X(mf_nrm2)(this%mesh, rescurl(:, ist)) - end do - - write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & - 'Batch curl ', trim(type), ' bsize = ', blocksize, ' , error = ', norm - call messages_info(1, namespace=namespace) - - SAFE_DEALLOCATE_A(rescurl) - end do - - ! Testing the regular curl with the same logic as above - SAFE_ALLOCATE(rescurl(1:this%mesh%np, 1:this%mesh%box%dim)) - do idir = 1, this%mesh%box%dim - call lalg_copy(this%mesh%np_part, ff, gradff(:, idir)) - end do - call X(derivatives_curl)(this, gradff, rescurl, set_bc = .false.) - - do idir = 1, this%mesh%box%dim - do ip = 1, this%mesh%np - opff(ip, idir) = rescurl(ip, idir) - curlff(ip, idir) - end do - end do - - norm = M_ZERO - do ist = 1, this%mesh%box%dim - norm = norm + X(mf_nrm2)(this%mesh, opff(:, ist)) - end do - - write(message(1), '(3a, es17.10)') 'Curl ', trim(type), ' err = ', norm - call messages_info(1, namespace=namespace) - - SAFE_DEALLOCATE_A(rescurl) - end if - - message(1) = '' - call messages_info(1, namespace=namespace) - - SAFE_DEALLOCATE_A(ff) - SAFE_DEALLOCATE_A(opff) - SAFE_DEALLOCATE_A(gradff) - SAFE_DEALLOCATE_A(res) - SAFE_DEALLOCATE_A(resgrad) - SAFE_DEALLOCATE_A(curlff) - -end subroutine X(derivatives_test) - !===================================================================================== ! Batched versions of the routines: diff --git a/src/grid/derivatives_test.F90 b/src/grid/derivatives_test.F90 new file mode 100644 index 0000000000000000000000000000000000000000..e1762695867b3359dee98eff70b4b2a18a10ea53 --- /dev/null +++ b/src/grid/derivatives_test.F90 @@ -0,0 +1,219 @@ +!! Copyright (C) 2002-2006 M. Marques, A. Castro, A. Rubio, G. Bertsch +!! +!! This program is free software; you can redistribute it and/or modify +!! it under the terms of the GNU General Public License as published by +!! the Free Software Foundation; either version 2, or (at your option) +!! any later version. +!! +!! This program is distributed in the hope that it will be useful, +!! but WITHOUT ANY WARRANTY; without even the implied warranty of +!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +!! GNU General Public License for more details. +!! +!! You should have received a copy of the GNU General Public License +!! along with this program; if not, write to the Free Software +!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +!! 02110-1301, USA. +!! + +#include "global.h" + +!> @brief This module provides unit tests for the derivatives + +module derivatives_test_oct_m + use batch_oct_m + use batch_ops_oct_m + use boundaries_oct_m + use debug_oct_m + use derivatives_oct_m + use global_oct_m + use, intrinsic :: iso_fortran_env + use lalg_basic_oct_m + use lalg_adv_oct_m + use loct_oct_m + use mesh_oct_m + use mesh_function_oct_m + use messages_oct_m + use namespace_oct_m + use parser_oct_m + use profiling_oct_m + use space_oct_m + use utils_oct_m + + implicit none + + private + public :: & + dderivatives_test, & + zderivatives_test, & + derivatives_advanced_benchmark + +contains + +!>@brief Further unit tests design to challenge numerical stability of the finite differences + subroutine derivatives_advanced_benchmark(this, namespace) + type(derivatives_t), intent(in) :: this + type(namespace_t), intent(in) :: namespace + + real(real64), allocatable :: ff(:), grad(:,:), resgrad(:, :), lapl(:), reslapl(:) + real(real64) :: kvec(3) + + SAFE_ALLOCATE(ff(1:this%mesh%np_part)) + SAFE_ALLOCATE(grad(1:this%mesh%np, 1:this%mesh%box%dim)) + SAFE_ALLOCATE(lapl(1:this%mesh%np)) + SAFE_ALLOCATE(reslapl(1:this%mesh%np)) + SAFE_ALLOCATE(resgrad(1:this%mesh%np, 1:this%mesh%box%dim)) + + ! Testing on some planewaves + kvec = [0.5_real64, 0.0_real64, 0.0_real64] ! low frequency + call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) + call test_function("Testing Planewave k=(0.5,0.0,0.0)") + + kvec = [1.0_real64, 1.0_real64, 0.0_real64] ! diagonal + call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) + call test_function("Testing Planewave k=(1.0,1.0,0.0)") + + kvec = [M_PI/this%mesh%spacing(1), 0.0_real64, 0.0_real64] ! Nyquist + call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) + call test_function("Testing Planewave k=(\pi/dr,0.0,0.0)") + + kvec = [M_PI/this%mesh%spacing(1), M_PI/this%mesh%spacing(2), 0.0_real64] ! diagonal Nyquist + call testfunc_plane_wave(this%mesh, ff, grad, lapl, kvec) + call test_function("Testing Planewave k=(\pi/dr,\pi/dr,0.0)") + + ! Testing on some polynomials + call testfunc_polynomial(this%mesh, ff, grad, lapl, p=2) + call test_function("Testing degree-2 polynomial") + + call testfunc_polynomial(this%mesh, ff, grad, lapl, p=3) + call test_function("Testing degree-3 polynomial") + + call testfunc_polynomial(this%mesh, ff, grad, lapl, p=4) + call test_function("Testing degree-4 polynomial") + + call testfunc_polynomial(this%mesh, ff, grad, lapl, p=6) + call test_function("Testing degree-6 polynomial") + + call testfunc_polynomial(this%mesh, ff, grad, lapl, p=8) + call test_function("Testing degree-8 polynomial") + + + SAFE_DEALLOCATE_A(ff) + SAFE_DEALLOCATE_A(grad) + SAFE_DEALLOCATE_A(lapl) + SAFE_DEALLOCATE_A(resgrad) + SAFE_DEALLOCATE_A(reslapl) + + contains + subroutine test_function(label) + character(len=*), intent(in) :: label + + message(1) = trim(label) + + call dderivatives_grad(this, ff, resgrad, set_bc=.false.) + resgrad = resgrad - grad + write(message(2), '(a, es17.10)') 'Gradient err = ', dmf_nrm2(this%mesh, this%mesh%box%dim, resgrad) + + call dderivatives_lapl(this, ff, reslapl, set_bc=.false.) + reslapl = reslapl - lapl + write(message(3), '(a, es17.10)') 'Laplacian err = ', dmf_nrm2(this%mesh, reslapl) + + call messages_info(3, namespace=namespace) + end subroutine test_function + end subroutine derivatives_advanced_benchmark + +!>@brief Generate a plane-wave given its momentum, as well as the gradient and the Laplacian + subroutine testfunc_plane_wave(mesh, ff, grad, lapl, kvec) + type(mesh_t), intent(in) :: mesh + real(real64), intent(out) :: ff(:), grad(:, :), lapl(:) + real(real64), intent(in) :: kvec(3) + + integer :: ip, dim, idir + real(real64) :: phase, kdotx, kk + + dim = mesh%box%dim + + kk = dot_product(kvec(1:dim), kvec(1:dim)) + do ip = 1, mesh%np + kdotx = dot_product(kvec(1:dim), mesh%x(1:dim, ip)) + phase = cos(kdotx) + + ff(ip) = phase + + ! exact gradient + do idir = 1, mesh%box%dim + grad(ip, idir) = -kvec(idir)*sin(kdotx) + end do + + ! exact Laplacian + lapl(ip) = -kk * phase + end do + + do ip = mesh%np+1, mesh%np_part + kdotx = dot_product(kvec(1:dim), mesh%x(1:dim, ip)) + phase = cos(kdotx) + + ff(ip) = phase + end do + end subroutine testfunc_plane_wave + +!>@brief Generate a polyomial of degree p, as well as the gradient and the Laplacian + subroutine testfunc_polynomial(mesh, ff, grad, lapl, p) + type(mesh_t), intent(in) :: mesh + real(real64), intent(out) :: ff(:), grad(:, :), lapl(:) + integer, intent(in) :: p + integer :: ip, d, dim + real(real64) :: x(3), r2 + + dim = mesh%box%dim + + do ip = 1, mesh%np + x = 0.0_real64 + x(1:dim) = mesh%x(1:dim, ip) + + ! f = (x^2 + y^2 + z^2)^(p/2) + r2 = dot_product(x(1:dim), x(1:dim)) + ff(ip) = r2**(0.5_real64*p) + + ! gradient: grad(f) = p * r^(p-2) * x + if (p >= 2) then + do d = 1, dim + grad(ip,d) = p * r2**(0.5_real64*p - 1.0_real64) * x(d) + end do + else + grad(ip,:) = 0.0_real64 + end if + + ! Laplacian: ∇² r^n = n (n + d - 2) r^(n-2) + if (p >= 2) then + lapl(ip) = p * (p + dim - 2) * r2**(0.5_real64*p - 1.0_real64) + else + lapl(ip) = 0.0_real64 + end if + end do + + do ip = mesh%np+1, mesh%np_part + x = 0.0_real64 + x(1:dim) = mesh%x(1:dim, ip) + + ! f = (x^2 + y^2 + z^2)^(p/2) + r2 = dot_product(x(1:dim), x(1:dim)) + ff(ip) = r2**(0.5_real64*p) + end do + end subroutine testfunc_polynomial + + +#include "undef.F90" +#include "real.F90" +#include "derivatives_test_inc.F90" + +#include "undef.F90" +#include "complex.F90" +#include "derivatives_test_inc.F90" + +end module derivatives_test_oct_m + +!! Local Variables: +!! mode: f90 +!! coding: utf-8 +!! End: diff --git a/src/grid/derivatives_test_inc.F90 b/src/grid/derivatives_test_inc.F90 new file mode 100644 index 0000000000000000000000000000000000000000..84028e3362ab4844c0acb3a56729d4f4be8027e9 --- /dev/null +++ b/src/grid/derivatives_test_inc.F90 @@ -0,0 +1,312 @@ +!! Copyright (C) 2002-2006 M. Marques, A. Castro, A. Rubio, G. Bertsch +!! +!! This program is free software; you can redistribute it and/or modify +!! it under the terms of the GNU General Public License as published by +!! the Free Software Foundation; either version 2, or (at your option) +!! any later version. +!! +!! This program is distributed in the hope that it will be useful, +!! but WITHOUT ANY WARRANTY; without even the implied warranty of +!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +!! GNU General Public License for more details. +!! +!! You should have received a copy of the GNU General Public License +!! along with this program; if not, write to the Free Software +!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +!! 02110-1301, USA. +!! + +! ---------------------------------------------------------- +!> @brief unit test for derivatives +!! +!! This will be called from the test run mode. +subroutine X(derivatives_test)(this, namespace, repetitions, min_blocksize, max_blocksize) + type(derivatives_t), intent(in) :: this + type(namespace_t), intent(in) :: namespace + integer, intent(in) :: repetitions + integer, intent(in) :: min_blocksize + integer, intent(in) :: max_blocksize + + R_TYPE, allocatable :: ff(:), opff(:, :), gradff(:, :), curlff(:, :), res(:), resgrad(:, :), rescurl(:, :) + R_TYPE :: aa, bb, cc + integer :: ip, idir, ist, ib + type(batch_t) :: ffb, opffb + type(batch_t) :: gradffb(this%mesh%box%dim) + integer :: blocksize, itime + logical :: packstates + real(real64) :: stime, etime + character(len=20) :: type + real(real64) :: norm + + call parse_variable(namespace, 'StatesPack', .true., packstates) + + SAFE_ALLOCATE(ff(1:this%mesh%np_part)) + SAFE_ALLOCATE(opff(1:this%mesh%np, 1:this%mesh%box%dim)) + SAFE_ALLOCATE(gradff(1:this%mesh%np_part, 1:this%mesh%box%dim)) + SAFE_ALLOCATE(res(1:this%mesh%np_part)) + SAFE_ALLOCATE(resgrad(1:this%mesh%np, 1:this%mesh%box%dim)) + SAFE_ALLOCATE(curlff(1:this%mesh%np, 1:this%mesh%box%dim)) + +#ifdef R_TREAL + type = 'real' +#else + type = 'complex' +#endif + + ! Note: here we need to use a constant function or anything that + ! is constant at the borders, since we assume that all boundary + ! points have equal values to optimize the application of the nl-operator. + + aa = R_TOTYPE(1.0)/this%mesh%box%bounding_box_l(1) + bb = R_TOTYPE(10.0) + cc = R_TOTYPE(100.0) + +#ifdef R_TCOMPLEX + ! we make things more "complex" + aa = aa + M_ZI*R_TOTYPE(0.01) + bb = bb*exp(M_ZI*R_TOTYPE(0.345)) + cc = cc - M_ZI*R_TOTYPE(50.0) +#endif + + + do ip = 1, this%mesh%np_part + ff(ip) = bb*exp(-aa*sum(this%mesh%x(:, ip)**2)) + cc + end do + do ip = 1, this%mesh%np + do idir = 1, this%mesh%box%dim + gradff(ip, idir) = -M_TWO*aa*bb*this%mesh%x(idir, ip)*exp(-aa*sum(this%mesh%x(:, ip)**2)) + end do + end do + + ! curl only needed in 3D + if (this%mesh%box%dim == 3) then + do ip = 1, this%mesh%np + curlff(ip, 1) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(2, ip) - this%mesh%x(3, ip)) + curlff(ip, 2) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(3, ip) - this%mesh%x(1, ip)) + curlff(ip, 3) = -M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))*(this%mesh%x(1, ip) - this%mesh%x(2, ip)) + end do + end if + + message(1) = "Testing Laplacian" + call messages_info(1, namespace=namespace) + + ! test Laplacian + blocksize = min_blocksize + do + + call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) + call X(batch_init)(opffb, 1, 1, blocksize, this%mesh%np) + + do ist = 1, blocksize + call batch_set_state(ffb, ist, this%mesh%np_part, ff) + end do + + if (packstates) then + call ffb%do_pack() + call opffb%do_pack(copy = .false.) + end if + + if (repetitions > 1) then + call X(derivatives_batch_perform)(this%lapl, this, ffb, opffb, set_bc = .false., factor = M_HALF) + end if + + stime = loct_clock() + do itime = 1, repetitions + call X(derivatives_batch_perform)(this%lapl, this, ffb, opffb, set_bc = .false., factor = M_HALF) + end do + etime = (loct_clock() - stime)/real(repetitions, real64) + + call batch_get_state(opffb, blocksize, this%mesh%np, res) + + call ffb%end() + call opffb%end() + + do ip = 1, this%mesh%np + res(ip) = M_TWO*res(ip) - & + (M_FOUR*aa**2*bb*sum(this%mesh%x(:, ip)**2)*exp(-aa*sum(this%mesh%x(:, ip)**2)) & + - this%mesh%box%dim*M_TWO*aa*bb*exp(-aa*sum(this%mesh%x(:, ip)**2))) + end do + + write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & + 'Laplacian ', trim(type), & + ' bsize = ', blocksize, & + ' , error = ', X(mf_nrm2)(this%mesh, res), & + ' , Gflops = ', & +#ifdef R_TREAL + blocksize*this%mesh%np*M_TWO*this%lapl%stencil%size/(etime*1.0e9_real64) +#else + blocksize*this%mesh%np*M_FOUR*this%lapl%stencil%size/(etime*1.0e9_real64) +#endif + + call messages_info(1, namespace=namespace) + + blocksize = 2*blocksize + if (blocksize > max_blocksize) exit + + end do + + message(1) = "Testing Gradient" + call messages_info(1, namespace=namespace) + + ! test Gradient + blocksize = min_blocksize + do + call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) + + do ist = 1, blocksize + call batch_set_state(ffb, ist, this%mesh%np_part, ff) + end do + + if (packstates) then + call ffb%do_pack() + end if + + if (repetitions > 1) then + call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) + end if + + stime = loct_clock() + do itime = 1, repetitions + call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) + end do + etime = (loct_clock() - stime)/real(repetitions, real64) + + do idir = 1, this%mesh%box%dim + call batch_get_state(gradffb(idir), blocksize, this%mesh%np, resgrad(:, idir)) + end do + + call ffb%end() + do idir = 1, this%mesh%box%dim + call gradffb(idir)%end() + end do + + do idir = 1, this%mesh%box%dim + do ip = 1, this%mesh%np + resgrad(ip, idir) = resgrad(ip, idir) - gradff(ip, idir) + end do + end do + + write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & + 'Batch gradient ', trim(type), & + ' bsize = ', blocksize, & + ' , error (x direction) = ', X(mf_nrm2)(this%mesh, resgrad(:, 1)), & + ' , Gflops = ', & +#ifdef R_TREAL + blocksize*this%mesh%np*M_TWO*this%grad(1)%stencil%size*this%mesh%box%dim/(etime*1.0e9_real64) +#else + blocksize*this%mesh%np*M_FOUR*this%grad(1)%stencil%size*this%mesh%box%dim/(etime*1.0e9_real64) +#endif + call messages_info(1, namespace=namespace) + + blocksize = 2*blocksize + if (blocksize > max_blocksize) exit + end do + + call X(derivatives_grad)(this, ff, opff, set_bc = .false.) + + do idir = 1, this%mesh%box%dim + do ip = 1, this%mesh%np + opff(ip, idir) = opff(ip, idir) - (-M_TWO*aa*bb*this%mesh%x(idir, ip)*exp(-aa*sum(this%mesh%x(:, ip)**2))) + end do + end do + + message(1) = '' + call messages_info(1, namespace=namespace) + + write(message(1), '(3a, es17.10)') 'Gradient ', trim(type), & + ' err = ', X(mf_nrm2)(this%mesh, this%mesh%box%dim, opff) + call messages_info(1, namespace=namespace) + + message(1) = '' + call messages_info(1, namespace=namespace) + + ! test curl for 3D case + if (this%mesh%box%dim == 3) then + message(1) = "Testing curl" + call messages_info(1, namespace=namespace) + + do blocksize = 3, 6, 3 + call X(batch_init)(ffb, 1, 1, blocksize, this%mesh%np_part) + + do ist = 1, blocksize + call batch_set_state(ffb, ist, this%mesh%np_part, ff) + end do + + if (packstates) then + call ffb%do_pack() + end if + + ! test for one blocksize without precomputed gradient + if (blocksize == 6) then + call X(derivatives_batch_curl)(this, ffb, set_bc=.false.) + else + call X(derivatives_batch_grad)(this, ffb, gradffb, set_bc=.false.) + call X(derivatives_batch_curl_from_gradient)(this, ffb, gradffb) + end if + + SAFE_ALLOCATE(rescurl(1:this%mesh%np, 1:blocksize)) + do ist = 1, blocksize + call batch_get_state(ffb, ist, this%mesh%np, rescurl(:, ist)) + end do + + call ffb%end() + do idir = 1, this%mesh%box%dim + call gradffb(idir)%end() + end do + + do ip = 1, this%mesh%np + do ib = 0, blocksize-1, this%mesh%box%dim + do idir = 1, this%mesh%box%dim + rescurl(ip, ib+idir) = rescurl(ip, ib+idir) - curlff(ip, idir) + end do + end do + end do + + norm = M_ZERO + do ist = 1, blocksize + norm = norm + X(mf_nrm2)(this%mesh, rescurl(:, ist)) + end do + + write(message(1), '(3a,i3,a,es17.10,a,f8.3)') & + 'Batch curl ', trim(type), ' bsize = ', blocksize, ' , error = ', norm + call messages_info(1, namespace=namespace) + + SAFE_DEALLOCATE_A(rescurl) + end do + + ! Testing the regular curl with the same logic as above + SAFE_ALLOCATE(rescurl(1:this%mesh%np, 1:this%mesh%box%dim)) + do idir = 1, this%mesh%box%dim + call lalg_copy(this%mesh%np_part, ff, gradff(:, idir)) + end do + call X(derivatives_curl)(this, gradff, rescurl, set_bc = .false.) + + do idir = 1, this%mesh%box%dim + do ip = 1, this%mesh%np + opff(ip, idir) = rescurl(ip, idir) - curlff(ip, idir) + end do + end do + + norm = M_ZERO + do ist = 1, this%mesh%box%dim + norm = norm + X(mf_nrm2)(this%mesh, opff(:, ist)) + end do + + write(message(1), '(3a, es17.10)') 'Curl ', trim(type), ' err = ', norm + call messages_info(1, namespace=namespace) + + SAFE_DEALLOCATE_A(rescurl) + end if + + message(1) = '' + call messages_info(1, namespace=namespace) + + SAFE_DEALLOCATE_A(ff) + SAFE_DEALLOCATE_A(opff) + SAFE_DEALLOCATE_A(gradff) + SAFE_DEALLOCATE_A(res) + SAFE_DEALLOCATE_A(resgrad) + SAFE_DEALLOCATE_A(curlff) + +end subroutine X(derivatives_test) + diff --git a/src/grid/grid.F90 b/src/grid/grid.F90 index 5c9004161739ac8f68431861ac6563cd85adac7b..423e9200d71e8d4d29085f46423ad27fc1532b91 100644 --- a/src/grid/grid.F90 +++ b/src/grid/grid.F90 @@ -193,6 +193,7 @@ contains call initialize_coordinate_system(gr, namespace, space, grid_spacing, latt, n_sites, site_position) ! initialize derivatives + call nl_operator_global_init(namespace) call derivatives_init(gr%der, namespace, space, gr%coord_system) ! the stencil used to generate the grid is a union of a cube (for ! multigrid) and the Laplacian @@ -254,6 +255,7 @@ contains call initialize_coordinate_system(gr, namespace, space, gr%spacing, latt, n_sites, site_position) ! initialize derivatives + call nl_operator_global_init(namespace) call derivatives_init(gr%der, namespace, space, gr%coord_system) ! the stencil used to generate the grid is a union of a cube (for ! multigrid) and the Laplacian @@ -383,7 +385,6 @@ contains call mesh_init_stage_3(gr, namespace, space, gr%stencil, mc) - call nl_operator_global_init(namespace) call derivatives_build(gr%der, namespace, space, gr, qvector) ! print info concerning the grid diff --git a/src/grid/nl_operator.F90 b/src/grid/nl_operator.F90 index f5f198c62e0cba8287c86d3120479e467f32c280..ed6d8be53e3d2c496cfb8cc3b233a684b2ad53a6 100644 --- a/src/grid/nl_operator.F90 +++ b/src/grid/nl_operator.F90 @@ -39,6 +39,7 @@ module nl_operator_oct_m use parser_oct_m use profiling_oct_m use space_oct_m + use sort_oct_m use stencil_oct_m use types_oct_m use varinfo_oct_m @@ -66,7 +67,8 @@ module nl_operator_oct_m nl_operator_np_zero_bc, & nl_operator_allocate_gpu_buffers, & nl_operator_update_gpu_buffers, & - nl_operator_remove_zero_weight_points + nl_operator_remove_zero_weight_points, & + nl_operator_build_symmetric_weights !> @brief index type for non-local operators !! @@ -78,8 +80,15 @@ module nl_operator_oct_m integer, allocatable :: imin(:) integer, allocatable :: imax(:) integer, allocatable :: ri(:, :) + integer, allocatable :: ri_pos(:,:,:) + integer, allocatable :: ri_neg(:,:,:) end type nl_operator_index_t + integer, public, parameter :: & + OP_GENERAL = 1, & + OP_SYMMETRIC = 2, & + OP_ANTISYMMETRIC = 3 + !> data type for non local operators type nl_operator_t private @@ -95,6 +104,8 @@ module nl_operator_oct_m type(accel_mem_t), public :: buff_weights !< buffer with constant weights type(accel_mem_t), public :: buff_half_weights !< buffer with weights multiplied by -1/2 + integer :: symmetry = OP_GENERAL !< Symmetry of the weights to be used + character(len=40) :: label !> the compressed index of grid points @@ -103,6 +114,14 @@ module nl_operator_oct_m integer, allocatable, public :: rimap(:) integer, allocatable, public :: rimap_inv(:) + !> For symmetry-based stencils + integer :: npairs = 0 + real(real64),allocatable :: wpair(:) + real(real64) :: wcenter + integer, allocatable :: ri_pos(:,:,:) + integer, allocatable :: ri_neg(:,:,:) + integer :: max_allocated_ri_pair = 0 + integer :: ninner = 0 integer :: nouter = 0 @@ -149,6 +168,7 @@ module nl_operator_oct_m integer :: dfunction_global = -1 integer :: zfunction_global = -1 integer :: function_accel + logical :: use_symmetries = .false. contains @@ -196,6 +216,17 @@ contains call parse_variable(namespace, 'OperateComplex', default, zfunction_global) if (.not. varinfo_valid_option('OperateComplex', zfunction_global)) call messages_input_error(namespace, 'OperateComplex') + !%Variable OperateUseSymmetries + !%Type logical + !%Section Execution::Optimization + !%Default no + !%Description + !% This variable selects if the operators are built using symmetries or not. + !%End + call parse_variable(namespace, 'OperateUseSymmetries', .false., use_symmetries) + if (accel_is_enabled() .and. use_symmetries) then + call messages_not_implemented("OperateUseSymmetries=yes with GPUs") + end if if (accel_is_enabled()) then @@ -232,13 +263,18 @@ contains !> @brief initialize an instance of a non-local operator !! by setting the label !! - subroutine nl_operator_init(op, label) + subroutine nl_operator_init(op, label, symm) type(nl_operator_t), intent(inout) :: op character(len=*), intent(in) :: label + integer, optional, intent(in) :: symm PUSH_SUB(nl_operator_init) op%label = label + op%symmetry = OP_GENERAL + if (use_symmetries) then + op%symmetry = optional_default(symm, OP_GENERAL) + end if POP_SUB(nl_operator_init) end subroutine nl_operator_init @@ -261,6 +297,7 @@ contains opo%np = opi%np opo%mesh => opi%mesh + opo%symmetry = opi%symmetry SAFE_ALLOCATE_SOURCE_A(opo%nn, opi%nn) SAFE_ALLOCATE_SOURCE_A(opo%w, opi%w) @@ -274,6 +311,14 @@ contains SAFE_ALLOCATE_SOURCE_A(opo%rimap, opi%rimap) SAFE_ALLOCATE_SOURCE_A(opo%rimap_inv, opi%rimap_inv) + opo%npairs = opi%npairs + if (opi%symmetry /= OP_GENERAL) then + opo%wcenter = opi%wcenter + SAFE_ALLOCATE_SOURCE_A(opo%wpair, opi%wpair) + SAFE_ALLOCATE_SOURCE_A(opo%ri_pos, opi%ri_pos) + SAFE_ALLOCATE_SOURCE_A(opo%ri_neg, opi%ri_neg) + end if + if (opi%mesh%parallel_in_domains) then opo%inner%nri = opi%inner%nri SAFE_ALLOCATE_SOURCE_A(opo%inner%imin, opi%inner%imin) @@ -284,6 +329,13 @@ contains SAFE_ALLOCATE_SOURCE_A(opo%outer%imin, opi%outer%imin) SAFE_ALLOCATE_SOURCE_A(opo%outer%imax, opi%outer%imax) SAFE_ALLOCATE_SOURCE_A(opo%outer%ri, opi%outer%ri) + + if (opi%symmetry /= OP_GENERAL) then + SAFE_ALLOCATE_SOURCE_A(opo%inner%ri_pos, opi%inner%ri_pos) + SAFE_ALLOCATE_SOURCE_A(opo%inner%ri_neg, opi%inner%ri_neg) + SAFE_ALLOCATE_SOURCE_A(opo%outer%ri_pos, opi%outer%ri_pos) + SAFE_ALLOCATE_SOURCE_A(opo%outer%ri_neg, opi%outer%ri_neg) + end if end if ! We create the corresponding GPU buffers @@ -627,6 +679,14 @@ contains SAFE_DEALLOCATE_A(op%rimap_inv) SAFE_DEALLOCATE_A(op%nn) + SAFE_DEALLOCATE_A(op%wpair) + SAFE_DEALLOCATE_A(op%ri_pos) + SAFE_DEALLOCATE_A(op%ri_neg) + + SAFE_DEALLOCATE_A(op%inner%ri_pos) + SAFE_DEALLOCATE_A(op%inner%ri_neg) + SAFE_DEALLOCATE_A(op%outer%ri_pos) + SAFE_DEALLOCATE_A(op%outer%ri_neg) call stencil_end(op%stencil) POP_SUB(nl_operator_end) @@ -762,6 +822,238 @@ contains POP_SUB(nl_operator_remove_zero_weight_points) end subroutine nl_operator_remove_zero_weight_points +!> Take a list of weights and offsets and build pairs of symmetric points with common weights + subroutine group_by_pairs_sym(size, ldf, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg, wcenter) + integer, intent(in) :: size + integer, intent(in) :: ldf + integer, intent(in) :: offsets(:, :) + real(real64), intent(in) :: wre(:) + integer, intent(in) :: ri(:, :) + integer, intent(in) :: nri + integer, intent(out) :: npairs + real(real64), intent(inout) :: wpair(:) + integer, intent(inout) :: pair_pos(:,:), pair_neg(:,:) + real(real64), intent(out) :: wcenter + + logical, allocatable :: used(:) + integer :: i, j, ndim, s + logical :: same + integer, allocatable :: idx(:) + + real(real64), parameter :: tol = 1.0e-12_real64 + + PUSH_SUB(group_by_pairs_sym) + + ASSERT(mod(size,2) == 1) + + SAFE_ALLOCATE(used(1:size)) + used = .false. + npairs = 0 + + ndim = ubound(offsets, dim=1) + + SAFE_ALLOCATE(idx(1:size)) + call robbust_sort_by_abs(wre, offsets, idx) + + do i = 1, size + if (used(i)) cycle + + if (all(offsets(:, idx(i))==0)) then + wcenter = wre(idx(i)) + used(i) = .true. + cycle + end if + + ! Try to find symmetric partner j + do j = i+1, size + if (used(j)) cycle + + ! Weight equality + same = abs(wre(idx(i)) - wre(idx(j))) <= tol + if (.not. same) cycle + + ! Offsets equal and opposite + if (any(offsets(:,idx(j))+offsets(:, idx(i)) /= 0)) cycle + + npairs = npairs + 1 + do s = 1, nri + pair_pos(npairs, s) = ishft(ri(idx(i), s), ldf) + pair_neg(npairs, s) = ishft(ri(idx(j), s), ldf) + end do + wpair(npairs) = M_HALF*(wre(idx(i)) + wre(idx(j))) + + used(i) = .true. + used(j) = .true. + exit + end do + end do + + ASSERT(npairs == size/2) + + SAFE_DEALLOCATE_A(idx) + + POP_SUB(group_by_pairs_sym) + end subroutine group_by_pairs_sym + +!> Take a list of weights and offsets and build pairs of symmetric points with common weights + subroutine group_by_pairs_antisym(size, ldf, offsets, wre, ri, nri, npairs, wpair, pair_pos, pair_neg) + integer, intent(in) :: size + integer, intent(in) :: ldf + integer, intent(in) :: offsets(:, :) + real(real64), intent(in) :: wre(:) + integer, intent(in) :: ri(:, :) + integer, intent(in) :: nri + integer, intent(out) :: npairs + real(real64), intent(inout) :: wpair(:) + integer, intent(inout) :: pair_pos(:,:), pair_neg(:,:) + + logical, allocatable :: used(:) + integer :: i, j, ndim, s + logical :: same + integer, allocatable :: idx(:) + + real(real64), parameter :: tol = 1.0e-12_real64 + + PUSH_SUB(group_by_pairs_antisym) + + ASSERT(mod(size,2) == 0) + + SAFE_ALLOCATE(used(1:size)) + used = .false. + npairs = 0 + + ndim = ubound(offsets, dim=1) + + SAFE_ALLOCATE(idx(1:size)) + call robbust_sort_by_abs(wre, offsets, idx) + + ! Max pairs = n/2 + do i = 1, size + if (used(i)) cycle + + ! Try to find symmetric partner j + do j = i+1, size + if (used(j)) cycle + + ! Weight equality + same = abs(wre(idx(i)) + wre(idx(j))) <= tol + if (.not. same) cycle + + ! Offsets equal and opposite + if (any(offsets(:,idx(j))+offsets(:, idx(i)) /= 0)) cycle + + npairs = npairs + 1 + do s = 1, nri + pair_pos(npairs, s) = ishft(ri(idx(i), s), ldf) + pair_neg(npairs, s) = ishft(ri(idx(j), s), ldf) + end do + wpair(npairs) = M_HALF*(wre(idx(i)) - wre(idx(j))) + + used(i) = .true. + used(j) = .true. + exit + end do + end do + + ASSERT(npairs == size/2) + + SAFE_DEALLOCATE_A(idx) + POP_SUB(group_by_pairs_antisym) + end subroutine group_by_pairs_antisym + + !>@brief Builds (or rebuild) the necessary arrays for symmetric and antisymmetric stencils + subroutine nl_operator_build_symmetric_weights(op, max_size) + type(nl_operator_t), intent(inout) :: op + integer, optional, intent(in) :: max_size !< For reallocation + + integer :: ldf, start, end, ipair + + if (op%symmetry == OP_GENERAL) return + + PUSH_SUB(nl_operator_build_symmetric_weights) + + ASSERT(op%const_w) + ASSERT(conf%target_states_block_size > 0) + + if(present(max_size)) then + start = op%max_allocated_ri_pair + 1 + end = max_size + call reallocate_array(op%ri_pos, op%stencil%size/2, op%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%ri_neg, op%stencil%size/2, op%nri, op%max_allocated_ri_pair, end) + if (op%mesh%parallel_in_domains) then + call reallocate_array(op%inner%ri_pos, op%stencil%size/2, op%inner%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%inner%ri_neg, op%stencil%size/2, op%inner%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%outer%ri_pos, op%stencil%size/2, op%outer%nri, op%max_allocated_ri_pair, end) + call reallocate_array(op%outer%ri_neg, op%stencil%size/2, op%outer%nri, op%max_allocated_ri_pair, end) + end if + else + ! Max pairs = n/2 + SAFE_ALLOCATE(op%wpair(1:op%stencil%size/2)) + start = 1 + end = log2(conf%target_states_block_size)+1 + + SAFE_ALLOCATE(op%ri_pos(1:op%stencil%size/2, 1:op%nri, 1:end)) + SAFE_ALLOCATE(op%ri_neg(1:op%stencil%size/2, 1:op%nri, 1:end)) + if (op%mesh%parallel_in_domains) then + SAFE_ALLOCATE(op%inner%ri_pos(1:op%stencil%size/2, 1:op%inner%nri, 1:end)) + SAFE_ALLOCATE(op%inner%ri_neg(1:op%stencil%size/2, 1:op%inner%nri, 1:end)) + SAFE_ALLOCATE(op%outer%ri_pos(1:op%stencil%size/2, 1:op%outer%nri, 1:end)) + SAFE_ALLOCATE(op%outer%ri_neg(1:op%stencil%size/2, 1:op%outer%nri, 1:end)) + end if + end if + op%max_allocated_ri_pair = end + + do ldf = start-1, end-1 + select case(op%symmetry) + case(OP_SYMMETRIC) + call group_by_pairs_sym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%ri, op%nri, & + op%npairs, op%wpair, op%ri_pos(:,:,ldf+1), op%ri_neg(:,:,ldf+1), op%wcenter) + if (op%mesh%parallel_in_domains) then + call group_by_pairs_sym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & + op%npairs, op%wpair, op%inner%ri_pos(:,:,ldf+1), op%inner%ri_neg(:,:,ldf+1), op%wcenter) + call group_by_pairs_sym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & + op%npairs, op%wpair, op%outer%ri_pos(:,:,ldf+1), op%outer%ri_neg(:,:,ldf+1), op%wcenter) + end if + case(OP_ANTISYMMETRIC) + call group_by_pairs_antisym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%ri, op%nri, & + op%npairs, op%wpair, op%ri_pos(:,:,ldf+1), op%ri_neg(:,:,ldf+1)) + if (op%mesh%parallel_in_domains) then + call group_by_pairs_antisym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%inner%ri, op%inner%nri, & + op%npairs, op%wpair, op%inner%ri_pos(:,:,ldf+1), op%inner%ri_neg(:,:,ldf+1)) + call group_by_pairs_antisym(op%stencil%size, ldf, op%stencil%points, op%w(:,1), op%outer%ri, op%outer%nri, & + op%npairs, op%wpair, op%outer%ri_pos(:,:,ldf+1), op%outer%ri_neg(:,:,ldf+1)) + end if + end select + end do + + if (.not. present(max_size)) then + write(message(1), '(3a)') 'Debug info: Sorted weights for ', trim(op%label), '.' + call messages_info(1, debug_only=.true.) + + do ipair = 1, op%npairs + write(message(1), '(a,i3,f25.10,2(1x,i4))') ' ', ipair, op%wpair(ipair), op%ri_pos(ipair,1,1), op%ri_neg(ipair,1,1) + call messages_info(1, debug_only=.true.) + end do + end if + + POP_SUB(nl_operator_build_symmetric_weights) + end subroutine nl_operator_build_symmetric_weights + + !>@brief Reallocate an ri array + subroutine reallocate_array(ri, stencil_size, nri, old_size, new_size) + integer, allocatable, intent(inout) :: ri(:,:,:) + integer, intent(in) :: stencil_size, nri + integer, intent(in) :: old_size, new_size + + integer, allocatable :: tmp(:,:,:) + + SAFE_ALLOCATE_SOURCE_A(tmp, ri) + SAFE_DEALLOCATE_A(ri) + SAFE_ALLOCATE(ri(1:stencil_size, 1:nri, 1:new_size)) + ri(:,:,1:old_size) = tmp + SAFE_DEALLOCATE_A(tmp) + end subroutine reallocate_array + #include "undef.F90" #include "real.F90" #include "nl_operator_inc.F90" diff --git a/src/grid/nl_operator_inc.F90 b/src/grid/nl_operator_inc.F90 index d2eb96aec17fe28b7f5c7f4ba3ba4673ddc2fe5f..6ba64178b3a64a48e88a3527757f1df19c9d22bd 100644 --- a/src/grid/nl_operator_inc.F90 +++ b/src/grid/nl_operator_inc.F90 @@ -19,7 +19,7 @@ ! --------------------------------------------------------- subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, points, factor, async) - type(nl_operator_t), target, intent(in) :: op + type(nl_operator_t), target, intent(inout) :: op type(batch_t), target, intent(inout) :: fi type(batch_t), intent(inout) :: fo !< this should be target, but old ifort 9.1 segfaults with it logical, optional, intent(in) :: ghost_update @@ -33,14 +33,17 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point logical :: ghost_update_, profile_, use_accel integer :: nri integer, pointer :: imin(:), imax(:), ri(:, :) -! real(real64), allocatable :: wre(:), wim(:) - R_BASE, allocatable :: wre(:), wim(:) + R_BASE, allocatable :: wre(:) #ifdef R_TREAL integer, parameter :: logldf = 0 #else integer, parameter :: logldf = 1 #endif integer :: nri_loc, ini + !> Symmetry of the stencil can be used to group points by pairs + real(real64), allocatable :: wpair(:) + integer, pointer :: ri_pos(:,:), ri_neg(:,:) + real(real64) :: wcenter PUSH_SUB(X(nl_operator_operate_batch)) @@ -75,6 +78,15 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point if (present(factor)) then wre(1:op%stencil%size) = wre(1:op%stencil%size)*factor end if + + if (op%symmetry /= OP_GENERAL) then + SAFE_ALLOCATE_SOURCE_A(wpair, op%wpair) + wcenter = op%wcenter + if (present(factor)) then + wpair = wpair * factor + wcenter = wcenter * factor + end if + end if end if use_accel = .false. @@ -86,10 +98,19 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point use_accel = .true. call operate_accel() else if (X(function_global) == OP_FORTRAN) then - call operate_const_weights() + select case(op%symmetry) + case(OP_ANTISYMMETRIC) + call operate_const_weights_antisymmetric() + case(OP_SYMMETRIC) + call operate_const_weights_symmetric() + case(OP_GENERAL) + call operate_const_weights() + case default + ASSERT(.false.) + end select else -! for the moment this is not implemented + ! for the moment this is not implemented !$omp parallel private(ini, nri_loc, ist) call multicomm_divide_range_omp(nri, ini, nri_loc) @@ -98,16 +119,38 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point ASSERT(ubound(fi%X(ff_pack), dim = 2) >= op%mesh%np_part) ASSERT(ubound(fo%X(ff_pack), dim = 2) >= op%mesh%np) - call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), & - fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) + select case(op%symmetry) + case(OP_GENERAL) + call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), & + fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) + case(OP_SYMMETRIC) + call X(operate_ri_sym_vec)(op%npairs, wpair(1), wcenter, nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + imin(ini), imax(ini), fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) + case(OP_ANTISYMMETRIC) + call X(operate_ri_antisym_vec)(op%npairs, wpair(1), nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + imin(ini), imax(ini), fi%X(ff_pack)(1, 1), log2(int(fi%pack_size_real(1), int32)), fo%X(ff_pack)(1, 1)) + case default + ASSERT(.false.) + end select else do ist = 1, fi%nst_linear ASSERT(ubound(fi%X(ff_linear), dim=1) == op%mesh%np_part) ASSERT(ubound(fo%X(ff_linear), dim=1) >= op%mesh%np) - call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), & - fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) + select case(op%symmetry) + case(OP_GENERAL) + call X(operate_ri_vec)(op%stencil%size, wre(1), nri_loc, ri(1, ini), imin(ini), imax(ini), & + fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) + case(OP_SYMMETRIC) + call X(operate_ri_sym_vec)(op%npairs, wpair(1), wcenter, nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + imin(ini), imax(ini), fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) + case(OP_ANTISYMMETRIC) + call X(operate_ri_antisym_vec)(op%npairs, wpair(1), nri_loc, ri_pos(1, ini), ri_neg(1, ini), & + imin(ini), imax(ini), fi%X(ff_linear)(1, ist), logldf, fo%X(ff_linear)(1, ist)) + case default + ASSERT(.false.) + end select end do end if !$omp end parallel @@ -121,7 +164,7 @@ subroutine X(nl_operator_operate_batch)(op, fi, fo, ghost_update, profile, point end if SAFE_DEALLOCATE_A(wre) - SAFE_DEALLOCATE_A(wim) + SAFE_DEALLOCATE_A(wpair) if (profile_) call profiling_out(TOSTRING(X(NL_OPERATOR_BATCH))) POP_SUB(X(nl_operator_operate_batch)) @@ -131,24 +174,50 @@ contains ! --------------------------------------------------------- subroutine select_op() + integer :: ldf + PUSH_SUB(X(nl_operator_operate_batch).select_op) + ldf = logldf+1 + if (fi%status() == BATCH_PACKED) then + ldf = log2(int(fi%pack_size_real(1), int32))+1 + end if + + if (X(function_global) == OP_FORTRAN .or. op%symmetry == OP_GENERAL) ldf = 1 + + ! If needed, we realloc the offsets + if (op%symmetry /= OP_GENERAL .and. ldf > op%max_allocated_ri_pair) then + call nl_operator_build_symmetric_weights(op, ldf) + end if + select case (points_) case (OP_ALL) nri = op%nri imin => op%rimap_inv(1:) imax => op%rimap_inv(2:) ri => op%ri + if (op%symmetry /= OP_GENERAL) then + ri_pos => op%ri_pos(:,:,ldf) + ri_neg => op%ri_neg(:,:,ldf) + end if case (OP_INNER) nri = op%inner%nri imin => op%inner%imin imax => op%inner%imax ri => op%inner%ri + if (op%symmetry /= OP_GENERAL) then + ri_pos => op%inner%ri_pos(:,:,ldf) + ri_neg => op%inner%ri_neg(:,:,ldf) + end if case (OP_OUTER) nri = op%outer%nri imin => op%outer%imin imax => op%outer%imax ri => op%outer%ri + if (op%symmetry /= OP_GENERAL) then + ri_pos => op%outer%ri_pos(:,:,ldf) + ri_neg => op%outer%ri_neg(:,:,ldf) + end if case default ASSERT(.false.) end select @@ -203,6 +272,109 @@ contains POP_SUB(X(nl_operator_operate_batch).operate_const_weights) end subroutine operate_const_weights + ! --------------------------------------------------------- + subroutine operate_const_weights_antisymmetric() + integer :: nn, ll, ii, ist, ipair + + PUSH_SUB(X(nl_operator_operate_batch).operate_const_weights_antisymmetric) + + nn = op%stencil%size + + select case (fi%status()) + + case (BATCH_DEVICE_PACKED) + + ASSERT(.false.) + + case (BATCH_NOT_PACKED) + + !$omp parallel do private(ll, ist, ii, ipair) + do ll = 1, nri + do ii = imin(ll) + 1, imax(ll) + do ist = 1, fi%nst_linear + fo%X(ff_linear)(ii, ist) = M_ZERO + do ipair = 1, op%npairs + fo%X(ff_linear)(ii, ist) = fo%X(ff_linear)(ii, ist) + & + wpair(ipair)*(fi%X(ff_linear)(ii + ri_pos(ipair, ll), ist) - fi%X(ff_linear)(ii + ri_neg(ipair, ll), ist)) + end do + end do + end do + end do + !$omp end parallel do + + case (BATCH_PACKED) + + !$omp parallel do private(ll, ist, ii, ipair) + do ll = 1, nri + do ii = imin(ll) + 1, imax(ll) + do ist = 1, fi%nst_linear + fo%X(ff_pack)(ist, ii) = M_ZERO + do ipair = 1, op%npairs + fo%X(ff_pack)(ist, ii) = fo%X(ff_pack)(ist, ii) & + + wpair(ipair) * (fi%X(ff_pack)(ist, ii + ri_pos(ipair, ll)) - fi%X(ff_pack)(ist, ii + ri_neg(ipair, ll))) + end do + end do + end do + end do + !$omp end parallel do + + end select + + POP_SUB(X(nl_operator_operate_batch).operate_const_weights_antisymmetric) + end subroutine operate_const_weights_antisymmetric + + ! --------------------------------------------------------- + subroutine operate_const_weights_symmetric() + integer :: nn, ll, ii, ist, ipair + + PUSH_SUB(X(nl_operator_operate_batch).operate_const_weights_symmetric) + + nn = op%stencil%size + + select case (fi%status()) + + case (BATCH_DEVICE_PACKED) + + ASSERT(.false.) + + case (BATCH_NOT_PACKED) + + !$omp parallel do private(ll, ist, ii, ipair) + do ll = 1, nri + do ii = imin(ll) + 1, imax(ll) + do ist = 1, fi%nst_linear + fo%X(ff_linear)(ii, ist) = M_ZERO + do ipair = 1, op%npairs + fo%X(ff_linear)(ii, ist) = fo%X(ff_linear)(ii, ist) + & + wpair(ipair)*(fi%X(ff_linear)(ii + ri_pos(ipair, ll), ist) + fi%X(ff_linear)(ii + ri_neg(ipair, ll), ist)) + end do + fo%X(ff_linear)(ii, ist) = fo%X(ff_linear)(ii, ist) + wcenter * fi%X(ff_linear)(ii, ist) + end do + end do + end do + !$omp end parallel do + + case (BATCH_PACKED) + + !$omp parallel do private(ll, ist, ii, ipair) + do ll = 1, nri + do ii = imin(ll) + 1, imax(ll) + do ist = 1, fi%nst_linear + fo%X(ff_pack)(ist, ii) = M_ZERO + do ipair = 1, op%npairs + fo%X(ff_pack)(ist, ii) = fo%X(ff_pack)(ist, ii) & + + wpair(ipair) * (fi%X(ff_pack)(ist, ii + ri_pos(ipair, ll)) + fi%X(ff_pack)(ist, ii + ri_neg(ipair, ll))) + end do + fo%X(ff_pack)(ist, ii) = fo%X(ff_pack)(ist, ii) + wcenter * fi%X(ff_pack)(ist, ii) + end do + end do + end do + !$omp end parallel do + + end select + + POP_SUB(X(nl_operator_operate_batch).operate_const_weights_symmetric) + end subroutine operate_const_weights_symmetric ! --------------------------------------------------------- subroutine operate_non_const_weights() @@ -471,7 +643,7 @@ end subroutine X(nl_operator_operate_batch) subroutine X(nl_operator_operate)(op, fi, fo, ghost_update, profile, points) R_TYPE, contiguous, intent(inout) :: fi(:) !< fi(op%np_part) - type(nl_operator_t), intent(in) :: op + type(nl_operator_t), intent(inout) :: op R_TYPE, contiguous, target, intent(out) :: fo(:) logical, optional, intent(in) :: ghost_update logical, optional, intent(in) :: profile diff --git a/src/grid/operate.c b/src/grid/operate.c index 482b8ed3cee6a82435c32b48d15168c9470fc8ea..ce5071e9a6f8b8792b6b62eff12d4b93de59bb9f 100644 --- a/src/grid/operate.c +++ b/src/grid/operate.c @@ -64,6 +64,57 @@ void FC_FUNC_(doperate_ri_vec, } } +void FC_FUNC_(doperate_ri_sym_vec, + DOPERATE_RI_SYM_VEC)(const int *opn, const double *restrict wpair, const double *restrict wcenter, + const int *opnri, const int *opri_pos, const int *opri_neg, + const int *rimap_inv, const int *rimap_inv_max, + const double *restrict fi, const int *ldfp, + double *restrict fo) { + const size_t ldf = ldfp[0]; + + /* check whether we got aligned vectors or not */ + int aligned = 1; + aligned = aligned && (((long long)fi) % (8 * VEC_SIZE) == 0); + aligned = aligned && (((long long)fo) % (8 * VEC_SIZE) == 0); + aligned = aligned && ((1 << ldf) % VEC_SIZE == 0); + + if (aligned) { +#define ALIGNED +#include "operate_sym_inc.c" +#undef ALIGNED + + } else { + /* not aligned */ + #include "operate_sym_inc.c" + } +} + +void FC_FUNC_(doperate_ri_antisym_vec, + DOPERATE_RI_ANTISYM_VEC)(const int *opn, const double *restrict wpair, + const int *opnri, const int *opri_pos, const int *opri_neg, + const int *rimap_inv, const int *rimap_inv_max, + const double *restrict fi, const int *ldfp, + double *restrict fo) { + const size_t ldf = ldfp[0]; + + /* check whether we got aligned vectors or not */ + int aligned = 1; + aligned = aligned && (((long long)fi) % (8 * VEC_SIZE) == 0); + aligned = aligned && (((long long)fo) % (8 * VEC_SIZE) == 0); + aligned = aligned && ((1 << ldf) % VEC_SIZE == 0); + + if (aligned) { +#define ALIGNED +#include "operate_antisym_inc.c" +#undef ALIGNED + + } else { + /* not aligned */ +#include "operate_antisym_inc.c" + } +} + + /* the same as doperate_ri_vec, but allows giving each an appropriate Fortan interface in which fi and fo are actually complex in Fortran Could be inline, but in that case pgcc will not put it in the symbol table. */ @@ -77,6 +128,28 @@ void FC_FUNC_(zoperate_ri_vec, (opn, w, opnri, opri, rimap_inv, rimap_inv_max, fi, ldfp, fo); } +void FC_FUNC_(zoperate_ri_sym_vec, + ZOPERATE_RI_SYM_VEC)(const int *opn, const double *restrict w, + const double *restrict wcenter, + const int *opnri, const int *opri_pos, const int *opri_neg, + const int *rimap_inv, const int *rimap_inv_max, + const double *restrict fi, const int *ldfp, + double *restrict fo) { + FC_FUNC_(doperate_ri_sym_vec, DOPERATE_RI_SYM_VEC) + (opn, w, wcenter, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo); +} + +void FC_FUNC_(zoperate_ri_antisym_vec, + ZOPERATE_RI_ANTISYM_VEC)(const int *opn, const double *restrict w, + const int *opnri, const int *opri_pos, const int *opri_neg, + const int *rimap_inv, const int *rimap_inv_max, + const double *restrict fi, const int *ldfp, + double *restrict fo) { + FC_FUNC_(doperate_ri_antisym_vec, DOPERATE_RI_SYM_VEC) + (opn, w, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo); +} + + void FC_FUNC_(dgauss_seidel, DGAUSS_SEIDEL)(const int *opn, const double *restrict w, const int *opnri, const int *opri, diff --git a/src/grid/operate_antisym_inc.c b/src/grid/operate_antisym_inc.c new file mode 100644 index 0000000000000000000000000000000000000000..efa735b6a8eb84edae8f4e6b30da76bbf553950c --- /dev/null +++ b/src/grid/operate_antisym_inc.c @@ -0,0 +1,312 @@ +/* + Copyright (C) 2006 X. Andrade + Copyright (C) 2025 N. Tancogne-Dejean + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +*/ + +#ifdef ALIGNED +#define LOAD VEC_LD +#define STORE VEC_ST +#else +#define LOAD VEC_LDU +#define STORE VEC_STU +#endif + +{ + const ptrdiff_t npairs = opn[0]; + const ptrdiff_t nri = opnri[0]; +#if DEPTH >= 16 + const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 8 + const ptrdiff_t unroll8 = max1(8 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 4 + const ptrdiff_t unroll4 = max1(4 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 2 + const ptrdiff_t unroll2 = max1(2 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 1 + const ptrdiff_t unroll1 = max1(1 * VEC_SIZE >> ldf); +#endif + + ptrdiff_t l, i, j; + const int *restrict index_pos; + const int *restrict index_neg; + + for (l = 0; l < nri; l++) { + index_pos = opri_pos + npairs * l; + index_neg = opri_neg + npairs * l; + i = rimap_inv[l]; + +#if DEPTH >= 16 + for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 16 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + register VEC_TYPE a4, a5, a6, a7; + register VEC_TYPE a8, a9, aa, ab; + register VEC_TYPE ac, ad, ae, af; + + a0 = a1 = a2 = a3 = VEC_ZERO; + a4 = a5 = a6 = a7 = VEC_ZERO; + a8 = a9 = aa = ab = VEC_ZERO; + ac = ad = ae = af = VEC_ZERO; + + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + + for (j = 0; j < npairs; j++) { + register VEC_TYPE wj = VEC_SCAL(wpair[j]); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); + a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); + a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); + a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE); + a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE); + a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE); + a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE); + a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7); + pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE); + a8 = VEC_FMA(wj, VEC_SUB(pos, neg), a8); + pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE); + a9 = VEC_FMA(wj, VEC_SUB(pos, neg), a9); + pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE); + aa = VEC_FMA(wj, VEC_SUB(pos, neg), aa); + pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE); + ab = VEC_FMA(wj, VEC_SUB(pos, neg), ab); + pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE); + ac = VEC_FMA(wj, VEC_SUB(pos, neg), ac); + pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE); + ad = VEC_FMA(wj, VEC_SUB(pos, neg), ad); + pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE); + ae = VEC_FMA(wj, VEC_SUB(pos, neg), ae); + pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE); + af = VEC_FMA(wj, VEC_SUB(pos, neg), af); + + } + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); + STORE(fo + base_ik + 4 * VEC_SIZE, a4); + STORE(fo + base_ik + 5 * VEC_SIZE, a5); + STORE(fo + base_ik + 6 * VEC_SIZE, a6); + STORE(fo + base_ik + 7 * VEC_SIZE, a7); + STORE(fo + base_ik + 8 * VEC_SIZE, a8); + STORE(fo + base_ik + 9 * VEC_SIZE, a9); + STORE(fo + base_ik + 10 * VEC_SIZE, aa); + STORE(fo + base_ik + 11 * VEC_SIZE, ab); + STORE(fo + base_ik + 12 * VEC_SIZE, ac); + STORE(fo + base_ik + 13 * VEC_SIZE, ad); + STORE(fo + base_ik + 14 * VEC_SIZE, ae); + STORE(fo + base_ik + 15 * VEC_SIZE, af); + } + } +#endif + +#if DEPTH >= 8 + for (; i < (rimap_inv_max[l] - unroll8 + 1); i += unroll8) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 8 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + register VEC_TYPE a4, a5, a6, a7; + + a0 = a1 = a2 = a3 = VEC_ZERO; + a4 = a5 = a6 = a7 = VEC_ZERO; + + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + + for (j = 0; j < npairs; j++) { + register VEC_TYPE wj = VEC_SCAL(wpair[j]); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); + a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); + a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); + a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE); + a4 = VEC_FMA(wj, VEC_SUB(pos, neg), a4); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE); + a5 = VEC_FMA(wj, VEC_SUB(pos, neg), a5); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE); + a6 = VEC_FMA(wj, VEC_SUB(pos, neg), a6); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE); + a7 = VEC_FMA(wj, VEC_SUB(pos, neg), a7); + } + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); + STORE(fo + base_ik + 4 * VEC_SIZE, a4); + STORE(fo + base_ik + 5 * VEC_SIZE, a5); + STORE(fo + base_ik + 6 * VEC_SIZE, a6); + STORE(fo + base_ik + 7 * VEC_SIZE, a7); + } + } +#endif + +#if DEPTH >= 4 + for (; i < (rimap_inv_max[l] - unroll4 + 1); i += unroll4) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 4 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + + a0 = a1 = a2 = a3 = VEC_ZERO; + + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + + for (j = 0; j < npairs; j++) { + register VEC_TYPE wj = VEC_SCAL(wpair[j]); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); + a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); + a2 = VEC_FMA(wj, VEC_SUB(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); + a3 = VEC_FMA(wj, VEC_SUB(pos, neg), a3); + } + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); + } + } +#endif + +#if DEPTH >= 2 + for (; i < (rimap_inv_max[l] - unroll2 + 1); i += unroll2) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 2 * VEC_SIZE) { + register VEC_TYPE a0, a1; + + a0 = a1 = VEC_ZERO; + + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + + for (j = 0; j < npairs; j++) { + register VEC_TYPE wj = VEC_SCAL(wpair[j]); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); + a1 = VEC_FMA(wj, VEC_SUB(pos, neg), a1); + } + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + } + } +#endif + +#if DEPTH >= 1 + for (; i < (rimap_inv_max[l] - unroll1 + 1); i += unroll1) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += VEC_SIZE) { + register VEC_TYPE a0 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { + register VEC_TYPE wj = VEC_SCAL(wpair[j]); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); + a0 = VEC_FMA(wj, VEC_SUB(pos, neg), a0); + } + STORE(fo + base_ik, a0); + } + } +#endif + +#if VEC_SIZE > 1 + + const ptrdiff_t size = (ptrdiff_t) 1 << ldf; + double a; + + for (; i < rimap_inv_max[l]; i++) { + for (ptrdiff_t k = 0; k < size; k++) { + a = 0.0; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + const double diff = fi[indexj_pos] - fi[indexj_neg]; + a += wpair[j] * diff; + } + fo[base_ik] = a; + } + } +#endif + + } /* l */ + + // this fence instruction is needed to ensure correctness when using + // non-temporal stores +#if defined(ALIGNED) && defined(FENCE) + FENCE; +#endif +} + +#undef LOAD +#undef STORE diff --git a/src/grid/operate_f.F90 b/src/grid/operate_f.F90 index f17b4b1a7d0d080f0148c9c2c0a4ba1fa777581b..51c467b6b21a3a8640dc1ba07674e57eb4e78430 100644 --- a/src/grid/operate_f.F90 +++ b/src/grid/operate_f.F90 @@ -61,6 +61,77 @@ module operate_f_oct_m end subroutine zoperate_ri_vec end interface + interface + subroutine doperate_ri_sym_vec(opn, w, wcenter, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo) + import :: real64 + implicit none + integer, intent(in) :: opn + real(real64), intent(in) :: w + real(real64), intent(in) :: wcenter + integer, intent(in) :: opnri + integer, intent(in) :: opri_pos + integer, intent(in) :: opri_neg + integer, intent(in) :: rimap_inv + integer, intent(in) :: rimap_inv_max + real(real64), intent(in) :: fi + integer, intent(in) :: ldfp + real(real64), intent(inout) :: fo + end subroutine doperate_ri_sym_vec + end interface + + interface + subroutine zoperate_ri_sym_vec(opn, w, wcenter, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo) + import :: real64 + implicit none + integer, intent(in) :: opn + real(real64), intent(in) :: w + real(real64), intent(in) :: wcenter + integer, intent(in) :: opnri + integer, intent(in) :: opri_pos + integer, intent(in) :: opri_neg + integer, intent(in) :: rimap_inv + integer, intent(in) :: rimap_inv_max + complex(real64), intent(in) :: fi + integer, intent(in) :: ldfp + complex(real64), intent(inout) :: fo + end subroutine zoperate_ri_sym_vec + end interface + + interface + subroutine doperate_ri_antisym_vec(opn, w, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo) + import :: real64 + implicit none + integer, intent(in) :: opn + real(real64), intent(in) :: w + integer, intent(in) :: opnri + integer, intent(in) :: opri_pos + integer, intent(in) :: opri_neg + integer, intent(in) :: rimap_inv + integer, intent(in) :: rimap_inv_max + real(real64), intent(in) :: fi + integer, intent(in) :: ldfp + real(real64), intent(inout) :: fo + end subroutine doperate_ri_antisym_vec + end interface + + interface + subroutine zoperate_ri_antisym_vec(opn, w, opnri, opri_pos, opri_neg, rimap_inv, rimap_inv_max, fi, ldfp, fo) + import :: real64 + implicit none + integer, intent(in) :: opn + real(real64), intent(in) :: w + integer, intent(in) :: opnri + integer, intent(in) :: opri_pos + integer, intent(in) :: opri_neg + integer, intent(in) :: rimap_inv + integer, intent(in) :: rimap_inv_max + complex(real64), intent(in) :: fi + integer, intent(in) :: ldfp + complex(real64), intent(inout) :: fo + end subroutine zoperate_ri_antisym_vec + end interface + + interface subroutine dgauss_seidel(opn, w, opnri, opri, rimap_inv, rimap_inv_max, factor, pot, rho) import :: real64 diff --git a/src/grid/operate_inc.c b/src/grid/operate_inc.c index d9014ac3670e2ea69aad7c3938ed6bcf9d372404..2eeb3d0ebf8939f06c2ae1219faef470364012e5 100644 --- a/src/grid/operate_inc.c +++ b/src/grid/operate_inc.c @@ -29,9 +29,6 @@ { const ptrdiff_t n = opn[0]; const ptrdiff_t nri = opnri[0]; -#if DEPTH >= 32 - const ptrdiff_t unroll32 = max1(32 * VEC_SIZE >> ldf); -#endif #if DEPTH >= 16 const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf); #endif @@ -55,104 +52,6 @@ index = opri + n * l; i = rimap_inv[l]; -#if DEPTH >= 32 - for (; i < (rimap_inv_max[l] - unroll32 + 1); i += unroll32) { - ptrdiff_t k; - for (k = 0; k < (1 << ldf); k += 32 * VEC_SIZE) { - register VEC_TYPE a0, a1, a2, a3; - register VEC_TYPE a4, a5, a6, a7; - register VEC_TYPE a8, a9, aa, ab; - register VEC_TYPE ac, ad, ae, af; - register VEC_TYPE b0, b1, b2, b3; - register VEC_TYPE b4, b5, b6, b7; - register VEC_TYPE b8, b9, ba, bb; - register VEC_TYPE bc, bd, be, bf; - - a0 = a1 = a2 = a3 = VEC_ZERO; - a4 = a5 = a6 = a7 = VEC_ZERO; - a8 = a9 = aa = ab = VEC_ZERO; - ac = ad = ae = af = VEC_ZERO; - b0 = b1 = b2 = b3 = VEC_ZERO; - b4 = b5 = b6 = b7 = VEC_ZERO; - b8 = b9 = ba = bb = VEC_ZERO; - bc = bd = be = bf = VEC_ZERO; - - for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else - register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif - ptrdiff_t indexj = (index[j] + i) << ldf; - a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); - a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1); - a2 = VEC_FMA(wj, LOAD(fi + indexj + 2 * VEC_SIZE + k), a2); - a3 = VEC_FMA(wj, LOAD(fi + indexj + 3 * VEC_SIZE + k), a3); - a4 = VEC_FMA(wj, LOAD(fi + indexj + 4 * VEC_SIZE + k), a4); - a5 = VEC_FMA(wj, LOAD(fi + indexj + 5 * VEC_SIZE + k), a5); - a6 = VEC_FMA(wj, LOAD(fi + indexj + 6 * VEC_SIZE + k), a6); - a7 = VEC_FMA(wj, LOAD(fi + indexj + 7 * VEC_SIZE + k), a7); - a8 = VEC_FMA(wj, LOAD(fi + indexj + 8 * VEC_SIZE + k), a8); - a9 = VEC_FMA(wj, LOAD(fi + indexj + 9 * VEC_SIZE + k), a9); - aa = VEC_FMA(wj, LOAD(fi + indexj + 10 * VEC_SIZE + k), aa); - ab = VEC_FMA(wj, LOAD(fi + indexj + 11 * VEC_SIZE + k), ab); - ac = VEC_FMA(wj, LOAD(fi + indexj + 12 * VEC_SIZE + k), ac); - ad = VEC_FMA(wj, LOAD(fi + indexj + 13 * VEC_SIZE + k), ad); - ae = VEC_FMA(wj, LOAD(fi + indexj + 14 * VEC_SIZE + k), ae); - af = VEC_FMA(wj, LOAD(fi + indexj + 15 * VEC_SIZE + k), af); - b0 = VEC_FMA(wj, LOAD(fi + indexj + 16 * VEC_SIZE + k), b0); - b1 = VEC_FMA(wj, LOAD(fi + indexj + 17 * VEC_SIZE + k), b1); - b2 = VEC_FMA(wj, LOAD(fi + indexj + 18 * VEC_SIZE + k), b2); - b3 = VEC_FMA(wj, LOAD(fi + indexj + 19 * VEC_SIZE + k), b3); - b4 = VEC_FMA(wj, LOAD(fi + indexj + 20 * VEC_SIZE + k), b4); - b5 = VEC_FMA(wj, LOAD(fi + indexj + 21 * VEC_SIZE + k), b5); - b6 = VEC_FMA(wj, LOAD(fi + indexj + 22 * VEC_SIZE + k), b6); - b7 = VEC_FMA(wj, LOAD(fi + indexj + 23 * VEC_SIZE + k), b7); - b8 = VEC_FMA(wj, LOAD(fi + indexj + 24 * VEC_SIZE + k), b8); - b9 = VEC_FMA(wj, LOAD(fi + indexj + 25 * VEC_SIZE + k), b9); - ba = VEC_FMA(wj, LOAD(fi + indexj + 26 * VEC_SIZE + k), ba); - bb = VEC_FMA(wj, LOAD(fi + indexj + 27 * VEC_SIZE + k), bb); - bc = VEC_FMA(wj, LOAD(fi + indexj + 28 * VEC_SIZE + k), bc); - bd = VEC_FMA(wj, LOAD(fi + indexj + 29 * VEC_SIZE + k), bd); - be = VEC_FMA(wj, LOAD(fi + indexj + 30 * VEC_SIZE + k), be); - bf = VEC_FMA(wj, LOAD(fi + indexj + 31 * VEC_SIZE + k), bf); - } - STORE(fo + (i << ldf) + k, a0); - STORE(fo + (i << ldf) + 1 * VEC_SIZE + k, a1); - STORE(fo + (i << ldf) + 2 * VEC_SIZE + k, a2); - STORE(fo + (i << ldf) + 3 * VEC_SIZE + k, a3); - STORE(fo + (i << ldf) + 4 * VEC_SIZE + k, a4); - STORE(fo + (i << ldf) + 5 * VEC_SIZE + k, a5); - STORE(fo + (i << ldf) + 6 * VEC_SIZE + k, a6); - STORE(fo + (i << ldf) + 7 * VEC_SIZE + k, a7); - STORE(fo + (i << ldf) + 8 * VEC_SIZE + k, a8); - STORE(fo + (i << ldf) + 9 * VEC_SIZE + k, a9); - STORE(fo + (i << ldf) + 10 * VEC_SIZE + k, aa); - STORE(fo + (i << ldf) + 11 * VEC_SIZE + k, ab); - STORE(fo + (i << ldf) + 12 * VEC_SIZE + k, ac); - STORE(fo + (i << ldf) + 13 * VEC_SIZE + k, ad); - STORE(fo + (i << ldf) + 14 * VEC_SIZE + k, ae); - STORE(fo + (i << ldf) + 15 * VEC_SIZE + k, af); - STORE(fo + (i << ldf) + 16 * VEC_SIZE + k, b0); - STORE(fo + (i << ldf) + 17 * VEC_SIZE + k, b1); - STORE(fo + (i << ldf) + 18 * VEC_SIZE + k, b2); - STORE(fo + (i << ldf) + 19 * VEC_SIZE + k, b3); - STORE(fo + (i << ldf) + 20 * VEC_SIZE + k, b4); - STORE(fo + (i << ldf) + 21 * VEC_SIZE + k, b5); - STORE(fo + (i << ldf) + 22 * VEC_SIZE + k, b6); - STORE(fo + (i << ldf) + 23 * VEC_SIZE + k, b7); - STORE(fo + (i << ldf) + 24 * VEC_SIZE + k, b8); - STORE(fo + (i << ldf) + 25 * VEC_SIZE + k, b9); - STORE(fo + (i << ldf) + 26 * VEC_SIZE + k, ba); - STORE(fo + (i << ldf) + 27 * VEC_SIZE + k, bb); - STORE(fo + (i << ldf) + 28 * VEC_SIZE + k, bc); - STORE(fo + (i << ldf) + 29 * VEC_SIZE + k, bd); - STORE(fo + (i << ldf) + 30 * VEC_SIZE + k, be); - STORE(fo + (i << ldf) + 31 * VEC_SIZE + k, bf); - } - } -#endif - #if DEPTH >= 16 for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) { ptrdiff_t k; @@ -168,11 +67,7 @@ ac = ad = ae = af = VEC_ZERO; for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif ptrdiff_t indexj = (index[j] + i) << ldf; a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1); @@ -222,11 +117,7 @@ a4 = a5 = a6 = a7 = VEC_ZERO; for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif ptrdiff_t indexj = (index[j] + i) << ldf; a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1); @@ -258,11 +149,7 @@ a0 = a1 = a2 = a3 = VEC_ZERO; for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif ptrdiff_t indexj = (index[j] + i) << ldf; a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1); @@ -286,11 +173,7 @@ a0 = a1 = VEC_ZERO; for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif ptrdiff_t indexj = (index[j] + i) << ldf; a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); a1 = VEC_FMA(wj, LOAD(fi + indexj + 1 * VEC_SIZE + k), a1); @@ -307,11 +190,7 @@ for (k = 0; k < (1 << ldf); k += VEC_SIZE) { register VEC_TYPE a0 = VEC_ZERO; for (j = 0; j < n; j++) { -#ifdef VEC_SCAL_LD - register VEC_TYPE wj = VEC_SCAL_LD(w + j); -#else register VEC_TYPE wj = VEC_SCAL(w[j]); -#endif ptrdiff_t indexj = (index[j] + i) << ldf; a0 = VEC_FMA(wj, LOAD(fi + indexj + k), a0); } diff --git a/src/grid/operate_sym_inc.c b/src/grid/operate_sym_inc.c new file mode 100644 index 0000000000000000000000000000000000000000..fc5ba4ac88661571229f392cee808014857f9103 --- /dev/null +++ b/src/grid/operate_sym_inc.c @@ -0,0 +1,352 @@ +/* + Copyright (C) 2006 X. Andrade + Copyright (C) 2025 N. Tancogne-Dejean + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + 02110-1301, USA. + +*/ + +#ifdef ALIGNED +#define LOAD VEC_LD +#define STORE VEC_ST +#else +#define LOAD VEC_LDU +#define STORE VEC_STU +#endif + +{ + const ptrdiff_t npairs = opn[0]; + const ptrdiff_t nri = opnri[0]; +#if DEPTH >= 16 + const ptrdiff_t unroll16 = max1(16 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 8 + const ptrdiff_t unroll8 = max1(8 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 4 + const ptrdiff_t unroll4 = max1(4 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 2 + const ptrdiff_t unroll2 = max1(2 * VEC_SIZE >> ldf); +#endif +#if DEPTH >= 1 + const ptrdiff_t unroll1 = max1(1 * VEC_SIZE >> ldf); +#endif + + ptrdiff_t l, i, j; + const int *restrict index_pos; + const int *restrict index_neg; + + // Hoisting wcenter + const double w0 = wcenter[0]; + + for (l = 0; l < nri; l++) { + index_pos = opri_pos + npairs * l; + index_neg = opri_neg + npairs * l; + i = rimap_inv[l]; + +#if DEPTH >= 16 + for (; i < (rimap_inv_max[l] - unroll16 + 1); i += unroll16) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 16 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + register VEC_TYPE a4, a5, a6, a7; + register VEC_TYPE a8, a9, aa, ab; + register VEC_TYPE ac, ad, ae, af; + + a0 = a1 = a2 = a3 = VEC_ZERO; + a4 = a5 = a6 = a7 = VEC_ZERO; + a8 = a9 = aa = ab = VEC_ZERO; + ac = ad = ae = af = VEC_ZERO; + + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + + for (j = 0; j < npairs; j++) { + register VEC_TYPE wj = VEC_SCAL(wpair[j]); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); + a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); + a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); + a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE); + a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE); + a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE); + a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE); + a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7); + pos = LOAD(fi + indexj_pos + 8 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 8 * VEC_SIZE); + a8 = VEC_FMA(wj, VEC_ADD(pos, neg), a8); + pos = LOAD(fi + indexj_pos + 9 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 9 * VEC_SIZE); + a9 = VEC_FMA(wj, VEC_ADD(pos, neg), a9); + pos = LOAD(fi + indexj_pos + 10 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 10 * VEC_SIZE); + aa = VEC_FMA(wj, VEC_ADD(pos, neg), aa); + pos = LOAD(fi + indexj_pos + 11 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 11 * VEC_SIZE); + ab = VEC_FMA(wj, VEC_ADD(pos, neg), ab); + pos = LOAD(fi + indexj_pos + 12 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 12 * VEC_SIZE); + ac = VEC_FMA(wj, VEC_ADD(pos, neg), ac); + pos = LOAD(fi + indexj_pos + 13 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 13 * VEC_SIZE); + ad = VEC_FMA(wj, VEC_ADD(pos, neg), ad); + pos = LOAD(fi + indexj_pos + 14 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 14 * VEC_SIZE); + ae = VEC_FMA(wj, VEC_ADD(pos, neg), ae); + pos = LOAD(fi + indexj_pos + 15 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 15 * VEC_SIZE); + af = VEC_FMA(wj, VEC_ADD(pos, neg), af); + + } + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1); + a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2); + a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3); + a4 = VEC_FMA(wj, LOAD(fi + base_ik + 4 * VEC_SIZE), a4); + a5 = VEC_FMA(wj, LOAD(fi + base_ik + 5 * VEC_SIZE), a5); + a6 = VEC_FMA(wj, LOAD(fi + base_ik + 6 * VEC_SIZE), a6); + a7 = VEC_FMA(wj, LOAD(fi + base_ik + 7 * VEC_SIZE), a7); + a8 = VEC_FMA(wj, LOAD(fi + base_ik + 8 * VEC_SIZE), a8); + a9 = VEC_FMA(wj, LOAD(fi + base_ik + 9 * VEC_SIZE), a9); + aa = VEC_FMA(wj, LOAD(fi + base_ik + 10 * VEC_SIZE), aa); + ab = VEC_FMA(wj, LOAD(fi + base_ik + 11 * VEC_SIZE), ab); + ac = VEC_FMA(wj, LOAD(fi + base_ik + 12 * VEC_SIZE), ac); + ad = VEC_FMA(wj, LOAD(fi + base_ik + 13 * VEC_SIZE), ad); + ae = VEC_FMA(wj, LOAD(fi + base_ik + 14 * VEC_SIZE), ae); + af = VEC_FMA(wj, LOAD(fi + base_ik + 15 * VEC_SIZE), af); + + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); + STORE(fo + base_ik + 4 * VEC_SIZE, a4); + STORE(fo + base_ik + 5 * VEC_SIZE, a5); + STORE(fo + base_ik + 6 * VEC_SIZE, a6); + STORE(fo + base_ik + 7 * VEC_SIZE, a7); + STORE(fo + base_ik + 8 * VEC_SIZE, a8); + STORE(fo + base_ik + 9 * VEC_SIZE, a9); + STORE(fo + base_ik + 10 * VEC_SIZE, aa); + STORE(fo + base_ik + 11 * VEC_SIZE, ab); + STORE(fo + base_ik + 12 * VEC_SIZE, ac); + STORE(fo + base_ik + 13 * VEC_SIZE, ad); + STORE(fo + base_ik + 14 * VEC_SIZE, ae); + STORE(fo + base_ik + 15 * VEC_SIZE, af); + } + } +#endif + +#if DEPTH >= 8 + for (; i < (rimap_inv_max[l] - unroll8 + 1); i += unroll8) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 8 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + register VEC_TYPE a4, a5, a6, a7; + + a0 = a1 = a2 = a3 = VEC_ZERO; + a4 = a5 = a6 = a7 = VEC_ZERO; + + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + + for (j = 0; j < npairs; j++) { + register VEC_TYPE wj = VEC_SCAL(wpair[j]); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); + a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); + a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); + a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); + pos = LOAD(fi + indexj_pos + 4 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 4 * VEC_SIZE); + a4 = VEC_FMA(wj, VEC_ADD(pos, neg), a4); + pos = LOAD(fi + indexj_pos + 5 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 5 * VEC_SIZE); + a5 = VEC_FMA(wj, VEC_ADD(pos, neg), a5); + pos = LOAD(fi + indexj_pos + 6 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 6 * VEC_SIZE); + a6 = VEC_FMA(wj, VEC_ADD(pos, neg), a6); + pos = LOAD(fi + indexj_pos + 7 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 7 * VEC_SIZE); + a7 = VEC_FMA(wj, VEC_ADD(pos, neg), a7); + } + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1); + a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2); + a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3); + a4 = VEC_FMA(wj, LOAD(fi + base_ik + 4 * VEC_SIZE), a4); + a5 = VEC_FMA(wj, LOAD(fi + base_ik + 5 * VEC_SIZE), a5); + a6 = VEC_FMA(wj, LOAD(fi + base_ik + 6 * VEC_SIZE), a6); + a7 = VEC_FMA(wj, LOAD(fi + base_ik + 7 * VEC_SIZE), a7); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); + STORE(fo + base_ik + 4 * VEC_SIZE, a4); + STORE(fo + base_ik + 5 * VEC_SIZE, a5); + STORE(fo + base_ik + 6 * VEC_SIZE, a6); + STORE(fo + base_ik + 7 * VEC_SIZE, a7); + } + } +#endif + +#if DEPTH >= 4 + for (; i < (rimap_inv_max[l] - unroll4 + 1); i += unroll4) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 4 * VEC_SIZE) { + register VEC_TYPE a0, a1, a2, a3; + + a0 = a1 = a2 = a3 = VEC_ZERO; + + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + + for (j = 0; j < npairs; j++) { + register VEC_TYPE wj = VEC_SCAL(wpair[j]); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); + a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); + pos = LOAD(fi + indexj_pos + 2 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 2 * VEC_SIZE); + a2 = VEC_FMA(wj, VEC_ADD(pos, neg), a2); + pos = LOAD(fi + indexj_pos + 3 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 3 * VEC_SIZE); + a3 = VEC_FMA(wj, VEC_ADD(pos, neg), a3); + } + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1); + a2 = VEC_FMA(wj, LOAD(fi + base_ik + 2 * VEC_SIZE), a2); + a3 = VEC_FMA(wj, LOAD(fi + base_ik + 3 * VEC_SIZE), a3); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + STORE(fo + base_ik + 2 * VEC_SIZE, a2); + STORE(fo + base_ik + 3 * VEC_SIZE, a3); + } + } +#endif + +#if DEPTH >= 2 + for (; i < (rimap_inv_max[l] - unroll2 + 1); i += unroll2) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += 2 * VEC_SIZE) { + register VEC_TYPE a0, a1; + + a0 = a1 = VEC_ZERO; + + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + + for (j = 0; j < npairs; j++) { + register VEC_TYPE wj = VEC_SCAL(wpair[j]); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + pos = LOAD(fi + indexj_pos + 1 * VEC_SIZE); + neg = LOAD(fi + indexj_neg + 1 * VEC_SIZE); + a1 = VEC_FMA(wj, VEC_ADD(pos, neg), a1); + } + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + a1 = VEC_FMA(wj, LOAD(fi + base_ik + 1 * VEC_SIZE), a1); + STORE(fo + base_ik, a0); + STORE(fo + base_ik + 1 * VEC_SIZE, a1); + } + } +#endif + +#if DEPTH >= 1 + for (; i < (rimap_inv_max[l] - unroll1 + 1); i += unroll1) { + ptrdiff_t k; + for (k = 0; k < (1 << ldf); k += VEC_SIZE) { + register VEC_TYPE a0 = VEC_ZERO; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { + register VEC_TYPE wj = VEC_SCAL(wpair[j]); + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + register VEC_TYPE pos = LOAD(fi + indexj_pos); + register VEC_TYPE neg = LOAD(fi + indexj_neg); + a0 = VEC_FMA(wj, VEC_ADD(pos, neg), a0); + } + const VEC_TYPE wj = VEC_SCAL(w0); + a0 = VEC_FMA(wj, LOAD(fi + base_ik), a0); + STORE(fo + base_ik, a0); + } + } +#endif + +#if VEC_SIZE > 1 + + const ptrdiff_t size = (ptrdiff_t) 1 << ldf; + double a; + + for (; i < rimap_inv_max[l]; i++) { + for (ptrdiff_t k = 0; k < size; k++) { + a = 0.0; + const ptrdiff_t base_ik = (ptrdiff_t) (i << ldf) + k; + for (j = 0; j < npairs; j++) { + const ptrdiff_t indexj_pos = index_pos[j] + base_ik; + const ptrdiff_t indexj_neg = index_neg[j] + base_ik; + const double sum = fi[indexj_pos] + fi[indexj_neg]; + a += wpair[j] * sum; + } + fo[base_ik] = a + w0 * fi[base_ik]; + } + } +#endif + + } /* l */ + + // this fence instruction is needed to ensure correctness when using + // non-temporal stores +#if defined(ALIGNED) && defined(FENCE) + FENCE; +#endif +} + +#undef LOAD +#undef STORE diff --git a/src/include/vectors.h b/src/include/vectors.h index 4b85e9a7c68a2e3ce01df1ecdc4a19921cc6e7fe..8f735020a625f16e032e3ef5920e4de284ebfda2 100644 --- a/src/include/vectors.h +++ b/src/include/vectors.h @@ -41,6 +41,8 @@ #define VEC_STU(addr, vec) _mm512_storeu_pd(addr, vec) #define VEC_FMA(aa, bb, cc) _mm512_fmadd_pd(aa, bb, cc) #define VEC_SCAL(aa) _mm512_set1_pd(aa) +#define VEC_ADD(aa, bb) _mm512_add_pd(aa, bb) +#define VEC_SUB(aa, bb) _mm512_sub_pd(aa, bb) #define VEC_ZERO _mm512_setzero_pd() #include #define FENCE _mm_mfence() @@ -73,6 +75,8 @@ #define VEC_FMA(aa, bb, cc) _mm256_add_pd(cc, _mm256_mul_pd(aa, bb)) #endif #define VEC_SCAL(aa) _mm256_set1_pd(aa) +#define VEC_ADD(aa, bb) _mm256_add_pd(aa, bb) +#define VEC_SUB(aa, bb) _mm256_sub_pd(aa, bb) #define VEC_ZERO _mm256_setzero_pd() #include #define FENCE _mm_mfence() @@ -101,6 +105,8 @@ #define VEC_FMA(aa, bb, cc) _mm_add_pd(cc, _mm_mul_pd(aa, bb)) #endif #define VEC_SCAL(aa) _mm_set1_pd(aa) +#define VEC_ADD(aa, bb) _mm_add_pd(aa, bb) +#define VEC_SUB(aa, bb) _mm_sub_pd(aa, bb) #define VEC_ZERO _mm_setzero_pd() #define FENCE _mm_mfence() @@ -118,8 +124,10 @@ #define VEC_LDU(addr) VEC_LD(addr) #define VEC_ST(addr, vec) (addr)[0] = vec #define VEC_STU(addr, vec) VEC_ST(addr, vec) -#define VEC_FMA(aa, bb, cc) aa *bb + cc +#define VEC_FMA(aa, bb, cc) ((aa) * (bb) + (cc)) #define VEC_SCAL(aa) aa +#define VEC_ADD(aa, bb) ((aa)+(bb)) +#define VEC_SUB(aa, bb) ((aa)-(bb)) #define VEC_ZERO 0.0 #define DEPTH 8 diff --git a/src/main/test.F90 b/src/main/test.F90 index d33d56c0c01c562f8de182e51eeee320e136cc66..dac77f904b58cab911d826d27072b83084f898a2 100644 --- a/src/main/test.F90 +++ b/src/main/test.F90 @@ -34,6 +34,7 @@ module test_oct_m use debug_oct_m use density_oct_m use derivatives_oct_m + use derivatives_test_oct_m use eigen_chebyshev_oct_m use electron_space_oct_m use electrons_oct_m @@ -1303,6 +1304,10 @@ contains call zderivatives_test(sys%gr%der, sys%namespace, param%repetitions, param%min_blocksize, param%max_blocksize) end if + if (sys%space%dim > 1) then + call derivatives_advanced_benchmark(sys%gr%der, sys%namespace) + end if + SAFE_DEALLOCATE_P(sys) POP_SUB(test_derivatives)