Hi-PACE · AlexanderSinn · Feb 4, 2026 · Feb 9, 2026
diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletFast.cpp
@@ -102,7 +102,7 @@ void ToSine_Transpose_ToComplex (const Array2<amrex::Real> in,
         return to_sine(in, j, i, n_data, sine_factor);
     };
 
-#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
+#if defined(AMREX_USE_GPU)
     constexpr int tile_dim_x = 16;
     constexpr int tile_dim_x_ex = 34;
     constexpr int tile_dim_y = 32;
@@ -118,26 +118,25 @@ void ToSine_Transpose_ToComplex (const Array2<amrex::Real> in,
     const int num_blocks_x = (nx + tile_dim_x - 1)/tile_dim_x;
     const int num_blocks_y = (ny + tile_dim_y - 1)/tile_dim_y;
 
-    amrex::launch<tile_dim_x*block_rows_y>(num_blocks_x*num_blocks_y, amrex::Gpu::gpuStream(),
-        [=] AMREX_GPU_DEVICE() noexcept
+    amrex::LaunchRaw<tile_dim_x*block_rows_y, amrex::Real>(
+        amrex::IntVectND<2>{num_blocks_x, num_blocks_y}, tile_dim_x_ex * tile_dim_y,
+        [=] AMREX_GPU_DEVICE(auto lh) noexcept
         {
-            __shared__ amrex::Real tile_ptr[tile_dim_x_ex * tile_dim_y];
-
-            const int block_y = blockIdx.x / num_blocks_x;
-            const int block_x = blockIdx.x - block_y*num_blocks_x;
+            const auto [block_x, block_y] = lh.blockIdxND();
 
             const int tile_begin_x = 2 * block_x * tile_dim_x - 2;
             const int tile_begin_y = block_y * tile_dim_y;
 
             const int tile_end_x = tile_begin_x + tile_dim_x_ex;
             const int tile_end_y = tile_begin_y + tile_dim_y;
 
-            Array2<amrex::Real> shared{{tile_ptr, {tile_begin_x, tile_begin_y, 0},
-                                                  {tile_end_x, tile_end_y, 1}, 1}};
+            Array2<amrex::Real> shared{{lh.shared_memory(),
+                                        {tile_begin_x, tile_begin_y, 0},
+                                        {tile_end_x, tile_end_y, 1}, 1}};
 
             {
-                const int thread_x = threadIdx.x / tile_dim_y;
-                const int thread_y = threadIdx.x - thread_x*tile_dim_y;
+                const auto [thread_y, thread_x] =
+                    lh.template threadIdxND<tile_dim_y, block_rows_x>();
 
                 for (int tx = thread_x; tx < tile_dim_x_ex; tx += block_rows_x) {
                     const int i = tile_begin_x + tx;
@@ -149,11 +148,11 @@ void ToSine_Transpose_ToComplex (const Array2<amrex::Real> in,
                 }
             }
 
-            __syncthreads();
+            lh.syncthreads();
 
             {
-                const int thread_y = threadIdx.x / tile_dim_x;
-                const int thread_x = threadIdx.x - thread_y*tile_dim_x;
+                const auto [thread_x, thread_y] =
+                    lh.template threadIdxND<tile_dim_x, block_rows_y>();
 
                 for (int ty = thread_y; ty < tile_dim_y; ty += block_rows_y) {
                     const int i = block_x * tile_dim_x + thread_x;

diff --git a/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletQuick.cpp b/src/fields/fft_poisson_solver/FFTPoissonSolverDirichletQuick.cpp
@@ -96,7 +96,7 @@ dst2_out_mult_dst3_in (Array2<amrex::GpuComplex<amrex::Real>> const& inout, int
 inline void
 dst2_out_t_in (Array2<amrex::GpuComplex<amrex::Real>> const& in, Array2<amrex::Real> const& out,
                int nx, int ny, const amrex::GpuComplex<amrex::Real>* omega) {
-#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
+#if defined(AMREX_USE_GPU)
     static constexpr int tile_dim_x = 16;
     static constexpr int tile_dim_y = 64;
     static constexpr int elem_per_thread = 2;
@@ -108,58 +108,60 @@ dst2_out_t_in (Array2<amrex::GpuComplex<amrex::Real>> const& in, Array2<amrex::R
     const int num_blocks_x = (nx - tile_begin_x + tile_dim_x - 1)/tile_dim_x;
     const int num_blocks_y = (ny + tile_dim_y - 1)/tile_dim_y;
 
-    amrex::launch<tile_dim_x*block_rows_y>(num_blocks_x*num_blocks_y, amrex::Gpu::gpuStream(),
-        [=] AMREX_GPU_DEVICE() noexcept
+    amrex::LaunchRaw<tile_dim_x*block_rows_y, amrex::Real>(
+        amrex::IntVectND<2>{num_blocks_y, num_blocks_x}, tile_dim_x * (tile_dim_y+1) * 2,
+        [=] AMREX_GPU_DEVICE(auto lh) noexcept
         {
-            __shared__ amrex::Real tile_real[tile_dim_x][tile_dim_y+1];
-            __shared__ amrex::Real tile_imag[tile_dim_x][tile_dim_y+1];
+            Array3<amrex::Real> shared{{lh.shared_memory(), {0, 0, 0},
+                                        {tile_dim_y+1, tile_dim_x, 1}, 2}};
 
-            const int block_x = blockIdx.x / num_blocks_y;
-            const int block_y = blockIdx.x - block_x*num_blocks_y;
+            const auto [block_y, block_x] = lh.blockIdxND();
 
             const int tile_start_x = tile_begin_x + block_x* tile_dim_x;
             const int tile_start_y = block_y * tile_dim_y;
 
-            int thread_y = threadIdx.x / tile_dim_x;
-            int thread_x = threadIdx.x - thread_y*tile_dim_x;
-
-            #pragma unroll elem_per_thread
-            for (; thread_y < tile_dim_y; thread_y += block_rows_y)
             {
-                const int thread_xr = tile_dim_x - thread_x - 1;
-                const int iout = tile_start_x + thread_xr;
-                const int jout = tile_start_y + thread_y;
-                const int iin = nx-iout-1;
-                const int jin = 2*jout < ny ? 2*jout : 2*(ny-jout)-1;
-
-                if (iout < nx && jout < ny) {
-                    auto val = in(iin, jin) * omega[iin];
-                    if (2*jout < ny) {
-                        val = -val;
+                auto [thread_x, thread_y] = lh.template threadIdxND<tile_dim_x, block_rows_y>();
+
+                #pragma unroll elem_per_thread
+                for (; thread_y < tile_dim_y; thread_y += block_rows_y)
+                {
+                    const int thread_xr = tile_dim_x - thread_x - 1;
+                    const int iout = tile_start_x + thread_xr;
+                    const int jout = tile_start_y + thread_y;
+                    const int iin = nx-iout-1;
+                    const int jin = 2*jout < ny ? 2*jout : 2*(ny-jout)-1;
+
+                    if (iout < nx && jout < ny) {
+                        auto val = in(iin, jin) * omega[iin];
+                        if (2*jout < ny) {
+                            val = -val;
+                        }
+                        shared(thread_y, thread_xr, 0) = val.real();
+                        shared(thread_y, thread_xr, 1) = val.imag();
                     }
-                    tile_real[thread_xr][thread_y] = val.real();
-                    tile_imag[thread_xr][thread_y] = val.imag();
                 }
             }
 
-            __syncthreads();
-
-            thread_x = threadIdx.x / tile_dim_y;
-            thread_y = threadIdx.x - thread_x*tile_dim_y;
+            lh.syncthreads();
 
-            #pragma unroll elem_per_thread
-            for (; thread_x < tile_dim_x; thread_x += block_rows_x)
             {
-                const int iout = tile_start_x + thread_x;
-                const int jout = tile_start_y + thread_y;
-                const int iin = nx-iout-1;
-                const int iout2 = iin-1;
-                const bool do_iout2 = iout2 >= 0 && iout2 != iout;
-
-                if (iout < nx && jout < ny) {
-                    out(jout, iout) = -tile_real[thread_x][thread_y];
-                    if (do_iout2) {
-                        out(jout, iout2) = tile_imag[thread_x][thread_y];
+                auto [thread_y, thread_x] = lh.template threadIdxND<tile_dim_y, block_rows_x>();
+
+                #pragma unroll elem_per_thread
+                for (; thread_x < tile_dim_x; thread_x += block_rows_x)
+                {
+                    const int iout = tile_start_x + thread_x;
+                    const int jout = tile_start_y + thread_y;
+                    const int iin = nx-iout-1;
+                    const int iout2 = iin-1;
+                    const bool do_iout2 = iout2 >= 0 && iout2 != iout;
+
+                    if (iout < nx && jout < ny) {
+                        out(jout, iout) = -shared(thread_y, thread_x, 0);
+                        if (do_iout2) {
+                            out(jout, iout2) = shared(thread_y, thread_x, 1);
+                        }
                     }
                 }
             }
@@ -180,7 +182,7 @@ dst2_out_t_in (Array2<amrex::GpuComplex<amrex::Real>> const& in, Array2<amrex::R
 inline void
 dst3_out_t_in (Array2<amrex::Real> const& in, Array2<amrex::GpuComplex<amrex::Real>> const& out,
                int nx, int ny, const amrex::GpuComplex<amrex::Real>* omega) {
-#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
+#if defined(AMREX_USE_GPU)
     static constexpr int tile_dim_x = 16;
     static constexpr int tile_dim_y = 64;
     static constexpr int elem_per_thread = 2;
@@ -192,63 +194,65 @@ dst3_out_t_in (Array2<amrex::Real> const& in, Array2<amrex::GpuComplex<amrex::Re
     const int num_blocks_x = (tile_end_x + tile_dim_x - 1)/tile_dim_x;
     const int num_blocks_y = (ny + tile_dim_y - 1)/tile_dim_y;
 
-    amrex::launch<tile_dim_x*block_rows_y>(num_blocks_x*num_blocks_y, amrex::Gpu::gpuStream(),
-        [=] AMREX_GPU_DEVICE() noexcept
+    amrex::LaunchRaw<tile_dim_x*block_rows_y, amrex::Real>(
+        amrex::IntVectND<2>{num_blocks_x, num_blocks_y}, tile_dim_y * (tile_dim_x+1) * 2,
+        [=] AMREX_GPU_DEVICE(auto lh) noexcept
         {
-            __shared__ amrex::Real tile_real[tile_dim_y][tile_dim_x+1];
-            __shared__ amrex::Real tile_imag[tile_dim_y][tile_dim_x+1];
+            Array3<amrex::Real> shared{{lh.shared_memory(), {0, 0, 0},
+                                        {tile_dim_x+1, tile_dim_y, 1}, 2}};
 
-            const int block_y = blockIdx.x / num_blocks_x;
-            const int block_x = blockIdx.x - block_y*num_blocks_x;
+            const auto [block_x, block_y] = lh.blockIdxND();
 
             const int tile_start_x = block_x * tile_dim_x;
             const int tile_start_y = block_y * tile_dim_y;
 
-            int thread_x = threadIdx.x / tile_dim_y;
-            int thread_y = threadIdx.x - thread_x*tile_dim_y;
-
-            #pragma unroll elem_per_thread
-            for (; thread_x < tile_dim_x; thread_x += block_rows_x)
             {
-                const int thread_yp = (thread_y*2) % tile_dim_y + (thread_y*2) / tile_dim_y;
-                const int iout = tile_start_x + thread_x;
-                const int jout = tile_start_y + thread_yp;
-                const int jin = jout%2 == 0 ? jout/2 : ny-1-jout/2;
-                const int iin1 = nx-iout-1;
-                const int iin2 = iout - 1;
-
-                if (iout < tile_end_x && jout < ny) {
-                    amrex::GpuComplex<amrex::Real> val {
-                        in(jin, iin1),
-                        iout != 0 ? -in(jin, iin2) : 0
-                    };
-                    auto o = omega[iout];
-                    o.m_imag = - o.m_imag;
-                    if (jout%2 != 0) {
-                        val = -val;
+                auto [thread_y, thread_x] = lh.template threadIdxND<tile_dim_y, block_rows_x>();
+
+                #pragma unroll elem_per_thread
+                for (; thread_x < tile_dim_x; thread_x += block_rows_x)
+                {
+                    const int thread_yp = (thread_y*2) % tile_dim_y + (thread_y*2) / tile_dim_y;
+                    const int iout = tile_start_x + thread_x;
+                    const int jout = tile_start_y + thread_yp;
+                    const int jin = jout%2 == 0 ? jout/2 : ny-1-jout/2;
+                    const int iin1 = nx-iout-1;
+                    const int iin2 = iout - 1;
+
+                    if (iout < tile_end_x && jout < ny) {
+                        amrex::GpuComplex<amrex::Real> val {
+                            in(jin, iin1),
+                            iout != 0 ? -in(jin, iin2) : 0
+                        };
+                        auto o = omega[iout];
+                        o.m_imag = - o.m_imag;
+                        if (jout%2 != 0) {
+                            val = -val;
+                        }
+                        val *= o;
+                        shared(thread_x, thread_yp, 0) = val.real();
+                        shared(thread_x, thread_yp, 1) = val.imag();
                     }
-                    val *= o;
-                    tile_real[thread_yp][thread_x] = val.real();
-                    tile_imag[thread_yp][thread_x] = val.imag();
                 }
             }
 
-            __syncthreads();
+            lh.syncthreads();
 
-            thread_y = threadIdx.x / tile_dim_x;
-            thread_x = threadIdx.x - thread_y*tile_dim_x;
-
-            #pragma unroll elem_per_thread
-            for (; thread_y < tile_dim_y; thread_y += block_rows_y)
             {
-                const int iout = tile_start_x + thread_x;
-                const int jout = tile_start_y + thread_y;
-
-                if (iout < tile_end_x && jout < ny) {
-                    out(iout, jout) = {
-                        tile_real[thread_y][thread_x],
-                        tile_imag[thread_y][thread_x]
-                    };
+                auto [thread_x, thread_y] = lh.template threadIdxND<tile_dim_x, block_rows_y>();
+
+                #pragma unroll elem_per_thread
+                for (; thread_y < tile_dim_y; thread_y += block_rows_y)
+                {
+                    const int iout = tile_start_x + thread_x;
+                    const int jout = tile_start_y + thread_y;
+
+                    if (iout < tile_end_x && jout < ny) {
+                        out(iout, jout) = {
+                            shared(thread_x, thread_y, 0),
+                            shared(thread_x, thread_y, 1)
+                        };
+                    }
                 }
             }
         });