anlsys · perarnau · Mar 12, 2025 · Mar 10, 2025 · Mar 5, 2025 · Mar 10, 2025
diff --git a/.github/workflows/builds.yml b/.github/workflows/builds.yml
@@ -26,7 +26,7 @@ jobs:
       - run: make
       - run: make check
       - run: make install
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v4
         if: failure()
         with:
           name: generic
@@ -55,7 +55,7 @@ jobs:
         working-directory: out
       - run: make check
         working-directory: out
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v4
         if: failure()
         with:
           name: out-of-tree
@@ -85,7 +85,7 @@ jobs:
           HWLOC_CPUID_PATH: ${{ github.workspace }}/cpuid
           VALGRIND_SUPPRESSIONS_FILES: ${{ github.workspace }}/.valgrind.supp
           OMP_NUM_THREADS: 1
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v4
         if: failure()
         with:
           name: valgrind
@@ -114,7 +114,7 @@ jobs:
           HWLOC_CPUID_PATH: ${{ github.workspace }}/cpuid
           VALGRIND_SUPPRESSIONS_FILES: ${{ github.workspace }}/.valgrind.supp
           OMP_NUM_THREADS: 1
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v4
         if: failure()
         with:
           name: valgrind
@@ -147,7 +147,7 @@ jobs:
           mkdir build
           ./configure --prefix=`pwd`/build --without-rocm
       - run: make distcheck
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v4
         if: failure()
         with:
           name: distcheck
@@ -173,7 +173,7 @@ jobs:
       - run: make CFLAGS=-std=c99
       - run: make check
       - run: make install
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v4
         if: failure()
         with:
           name: rocm
@@ -182,3 +182,40 @@ jobs:
             tests/*.log
             benchmarks/*.log
             doc/tutorials/*.log
+  nvhpc:
+    env:
+      CFLAGS: "-std=c99"
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: nvcr.io/nvidia/nvhpc:24.7-devel-cuda12.5-ubuntu22.04
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          submodules: true
+          fetch-depth: 0
+      - run: apt-get update
+      - run: apt-get install -y make autoconf automake libtool pkgconf libhwloc-dev
+      - run: |
+          echo "0.8.0" > .tarball-version
+      - name: build
+        run: |
+          source /usr/share/lmod/6.6/init/bash
+          module load nvhpc
+          ./autogen.sh
+          mkdir build
+          ./configure --prefix=`pwd`/build --with-cuda CUDA_HOME=$NVHPC_ROOT/cuda
+          make
+          make check
+          make install
+      - uses: actions/upload-artifact@v4
+        if: failure()
+        with:
+          name: nvhpc
+          path: |
+            config.log
+            tests/*.log
+            benchmarks/*.log
+            doc/tutorials/*.log
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
@@ -41,7 +41,7 @@ jobs:
           export LD_LIBRARY_PATH=$GITHUB_WORKSPACE/$INSTALL_PATH/lib:$LD_LIBRARY_PATH
           cd xsbench/openmp-threading
           test -n "$(./XSBench -s small | grep 'Verification checksum' | grep -i valid)"
-      - uses: actions/upload-artifact@v2
+      - uses: actions/upload-artifact@v4
         if: failure()
         with:
           name: generic

diff --git a/benchmarks/blas/l1_kernel.c b/benchmarks/blas/l1_kernel.c
@@ -56,15 +56,18 @@ double ddot(size_t n, double *a, double *b, double *c, double scalar)
 	(void)*c;
 	(void)scalar;
 	size_t i;
-	long double dot = 0.0;
+	/* should be a long double for overflow checks, but some compilers (nvc)
+	 * don't support OpenMP reductions on long double as of 2024.
+	 */
+	double dot = 0.0;
 
 #pragma omp parallel for reduction(+ : dot)
 	for (i = 0; i < n; i++) {
-		long double temp;
+		double temp;
 		temp = a[i] * b[i];
 		dot += temp;
 	}
-	return (double)dot;
+	return dot;
 }
 
 double dnrm2(size_t n, double *a, double *b, double *c, double scalar)

diff --git a/doc/tutorials/area/1_custom_interleave_area.c b/doc/tutorials/area/1_custom_interleave_area.c
@@ -191,6 +191,16 @@ test_custom_area(const size_t size)
 int
 main(void)
 {
+	/* These checks cannot be performed consistently in a CI environment:
+	 * exit with status 77 (presumably reported as a skip by the test
+	 * harness) when CI is "true". getenv() returns NULL when the variable
+	 * is unset, so check the pointer before handing it to strcmp().
+	 */
+	const char *ci = getenv("CI");
+
+	if (ci != NULL && !strcmp(ci, "true"))
+		exit(77);
+
 	const size_t size = (2 << 16); // 16 pages
 
 	test_custom_area(size);

diff --git a/doc/tutorials/area/2_aml_area_hwloc.c b/doc/tutorials/area/2_aml_area_hwloc.c
@@ -72,6 +72,16 @@ int max_bandwidth_area()
 
 int main(int argc, char **argv)
 {
+	/* These checks cannot be performed consistently in a CI environment:
+	 * exit with status 77 (presumably reported as a skip by the test
+	 * harness) when CI is "true". getenv() returns NULL when the variable
+	 * is unset, so check the pointer before handing it to strcmp().
+	 */
+	const char *ci = getenv("CI");
+
+	if (ci != NULL && !strcmp(ci, "true"))
+		exit(77);
+
 	if (aml_init(&argc, &argv) != 0)
 		return 1;
 

diff --git a/include/aml/higher/allocator.h b/include/aml/higher/allocator.h
@@ -40,6 +40,18 @@ struct aml_allocator {
 	struct aml_allocator_ops *ops;
 };
 
+/** Allocator's internal chunk information */
+struct aml_allocator_chunk {
+	/** memory allocated for the user (read-only) */
+	void *ptr;
+	/** size of the chunk, greater than or equal to the size requested by
+	 * the user (read-only) */
+	size_t size;
+	/** an opaque object that the user can attach to the chunk (read/write)
+	 */
+	void *user_data;
+};
+
 /**
  * Allocator methods.
  * The design pattern of aml allocator is design to meet simplicity and
@@ -104,6 +116,20 @@ struct aml_allocator_ops {
 	 * @return AML_SUCCESS on success or an appropriate aml error code.
 	 */
 	int (*free)(struct aml_allocator_data *data, void *ptr);
+
+	/**
+	 *  Optional method.
+	 *  @see aml_allocator_alloc_chunk()
+	 */
+	struct aml_allocator_chunk *(*alloc_chunk)(
+	        struct aml_allocator_data *data, size_t size);
+
+	/**
+	 *  Optional method.
+	 *  @see aml_allocator_free_chunk()
+	 */
+	int (*free_chunk)(struct aml_allocator_data *data,
+	                  struct aml_allocator_chunk *chunk);
 };
 
 /**
@@ -118,14 +144,38 @@ struct aml_allocator_ops {
 void *aml_alloc(struct aml_allocator *allocator, size_t size);
 
 /**
- * Release memory associated with a pointer obtained with an
- * allocator.
+ * Release memory associated with a pointer obtained from a call to
+ * aml_alloc().
  *
  * @param[in, out] allocator: The allocator used to allocate pointer.
+ * @param[in, out] ptr: The pointer allocated with the same allocator.
  * @return AML_SUCCESS on success or an appropriate aml error code.
  */
 int aml_free(struct aml_allocator *allocator, void *ptr);
 
+/**
+ * Allocate a chunk of memory with an allocator.
+ *
+ * @param[in, out] allocator: The allocator to use.
+ * @param[in] size: The minimum allocation size.
+ * @return NULL on error with aml_errno set to the appropriate error
+ * code.
+ * @return The chunk of memory allocated.
+ */
+struct aml_allocator_chunk *
+aml_allocator_alloc_chunk(struct aml_allocator *allocator, size_t size);
+
+/**
+ * Release memory associated with the chunk obtained from a call to
+ * aml_allocator_alloc_chunk().
+ *
+ * @param[in, out] allocator: The allocator used to allocate the chunk.
+ * @param[in, out] chunk: The chunk allocated with the same allocator.
+ * @return AML_SUCCESS on success or an appropriate aml error code.
+ */
+int aml_allocator_free_chunk(struct aml_allocator *allocator,
+                             struct aml_allocator_chunk *chunk);
+
 /**
  * @}
  **/

diff --git a/src/Makefile.am b/src/Makefile.am
@@ -90,11 +90,10 @@ endif
 # Cuda sources
 
 if HAVE_CUDA
-libcuda_la_SOURCES=area/cuda.c dma/cuda.c
-noinst_LTLIBRARIES+=libcuda.la
-libcuda_la_CPPFLAGS=$(AM_CPPFLAGS) $(CUDA_CFLAGS)
-libcuda_la_LDFLAGS=$(AM_LDFLAGS) $(CUDA_LIBS)
-libaml_la_LIBADD=libcuda.la
+AM_CPPFLAGS += $(CUDA_CFLAGS)
+AM_LDFLAGS += $(CUDA_LIBS)
+libaml_la_SOURCES+=area/cuda.c
+libaml_la_SOURCES+=dma/cuda.c
 endif
 
 #############################################

diff --git a/src/allocator/allocator.c b/src/allocator/allocator.c
@@ -8,29 +8,58 @@
  * SPDX-License-Identifier: BSD-3-Clause
  ******************************************************************************/
 
+#include <assert.h>
+
 #include "aml.h"
 
 #include "aml/higher/allocator.h"
 
 void *aml_alloc(struct aml_allocator *allocator, size_t size)
 {
 	if (allocator == NULL || allocator->data == NULL ||
-	    allocator->ops == NULL || allocator->ops->alloc == NULL) {
+	    allocator->ops == NULL) {
 		aml_errno = AML_EINVAL;
 		return NULL;
 	}
-
+	assert(allocator->ops->alloc);
 	return allocator->ops->alloc(allocator->data, size);
 }
 
-int aml_free(struct aml_allocator *allocator, void *ptr)
+struct aml_allocator_chunk *
+aml_allocator_alloc_chunk(struct aml_allocator *allocator, size_t size)
 {
 	if (allocator == NULL || allocator->data == NULL ||
-	    allocator->ops == NULL || allocator->ops->free == NULL)
-		return -AML_EINVAL;
+	    allocator->ops == NULL) {
+		aml_errno = AML_EINVAL;
+		return NULL;
+	}
+	if (allocator->ops->alloc_chunk == NULL) {
+		aml_errno = AML_ENOTSUP;
+		return NULL;
+	}
+	return allocator->ops->alloc_chunk(allocator->data, size);
+}
 
+int aml_free(struct aml_allocator *allocator, void *ptr)
+{
 	if (ptr == NULL)
 		return AML_SUCCESS;
-
+	if (allocator == NULL || allocator->data == NULL ||
+	    allocator->ops == NULL)
+		return -AML_EINVAL;
+	assert(allocator->ops->free);
 	return allocator->ops->free(allocator->data, ptr);
 }
+
+int aml_allocator_free_chunk(struct aml_allocator *allocator,
+                             struct aml_allocator_chunk *chunk)
+{
+	if (chunk == NULL)
+		return AML_SUCCESS;
+	if (allocator == NULL || allocator->data == NULL ||
+	    allocator->ops == NULL)
+		return -AML_EINVAL;
+	if (allocator->ops->free_chunk == NULL)
+		return -AML_ENOTSUP;
+	return allocator->ops->free_chunk(allocator->data, chunk);
+}
diff --git a/src/allocator/area.c b/src/allocator/area.c
@@ -18,7 +18,8 @@
 struct aml_allocator_ops aml_allocator_area_ops = {
         .alloc = aml_allocator_area_alloc,
         .free = aml_allocator_area_free,
-};
+        .alloc_chunk = NULL,
+        .free_chunk = NULL};
 
 struct aml_allocator_area_chunk {
 	void *ptr;