arch: Use the node's max memory transaction size

FabioLuporini · FabioLuporini · commit 37dae7f4d5f2 · 2025-05-27T10:48:52.000+01:00
diff --git a/devito/arch/archinfo.py b/devito/arch/archinfo.py
@@ -179,7 +179,6 @@ def homogenise_gpus(gpu_infos):
         homogeneous, otherwise None.
         """
         if gpu_infos == []:
-            warning('No graphics cards detected')
             return {}
 
         # Check must ignore physical IDs as they may differ
@@ -324,7 +323,9 @@ def cbk(deviceid=0):
 
             gpu_info[f'mem.{i}'] = make_cbk(i)
 
-        gpu_infos['architecture'] = 'AMD'
+        gpu_info['architecture'] = 'unspecified'
+        gpu_info['vendor'] = 'AMD'
+
         return gpu_info
 
     except OSError:
@@ -387,13 +388,15 @@ def cbk(deviceid=0):
 
             gpu_info['mem.%s' % i] = make_cbk(i)
 
-        gpu_infos['architecture'] = 'Intel'
+        gpu_info['architecture'] = 'unspecified'
+        gpu_info['vendor'] = 'INTEL'
+
         return gpu_info
 
     except OSError:
         pass
 
-    # *** Second try: `lshw`
+    # *** Fourth try: `lshw`
     try:
         info_cmd = ['lshw', '-C', 'video']
         proc = Popen(info_cmd, stdout=PIPE, stderr=DEVNULL)
@@ -438,7 +441,7 @@ def parse_product_arch():
     except OSError:
         pass
 
-    # Third try: `lspci`, which is more readable but less detailed than `lshw`
+    # Fifth try: `lspci`, which is more readable but less detailed than `lshw`
     try:
         info_cmd = ['lspci']
         proc = Popen(info_cmd, stdout=PIPE, stderr=DEVNULL)
@@ -756,8 +759,15 @@ def max_mem_trans_size(self, dtype):
         Number of items of type `dtype` that can be transferred in a single
         memory transaction.
         """
-        assert self.max_mem_trans_nbytes % np.dtype(dtype).itemsize == 0
-        return int(self.max_mem_trans_nbytes / np.dtype(dtype).itemsize)
+        itemsize = np.dtype(dtype).itemsize
+
+        # NOTE: This method conservatively uses the node's `max_mem_trans_size`,
+        # instead of self's, so that we always pad by a compatible amount should
+        # the user switch target platforms dynamically
+        mmtb = node_max_mem_trans_nbytes(self)
+        assert mmtb % itemsize == 0
+
+        return int(mmtb / itemsize)
 
     def limits(self, compiler=None, language=None):
         """
@@ -1101,6 +1111,42 @@ def march(cls):
             return fallback
 
 
+@memoized_func
+def node_max_mem_trans_nbytes(platform):
+    """
+    Return the maximum memory transaction size in bytes for the underlying
+    node, which, as such, takes into account all available platforms.
+    """
+    mmtb0 = platform.max_mem_trans_nbytes
+
+    if isinstance(platform, Cpu64):
+        gpu_info = get_gpu_info()
+        if not gpu_info:
+            # This node may simply not have a GPU
+            return mmtb0
+
+        mapper = {
+            'NVIDIA': NvidiaDevice,
+            'AMD': AmdDevice,
+            'INTEL': IntelDevice,
+        }
+        try:
+            mmtb1 = mapper[gpu_info['vendor']].max_mem_trans_nbytes
+            return max(mmtb0, mmtb1)
+        except KeyError:
+            # Fallback -- act even more conservatively
+            mmtb1 = max(p.max_mem_trans_nbytes for p in
+                        [NvidiaDevice, AmdDevice, IntelDevice])
+            mmtb = max(mmtb0, mmtb1)
+            warning("Unable to determine GPU type, assuming a maximum memory "
+                    f"transaction size of {mmtb} bytes")
+            return mmtb
+    elif isinstance(platform, Device):
+        return max(Cpu64.max_mem_trans_nbytes, mmtb0)
+    else:
+        assert False, f"Unknown platform type: {type(platform)}"
+
+
 # CPUs
 ANYCPU = Cpu64('cpu64')
 CPU64_DUMMY = Intel64('cpu64-dummy', cores_logical=2, cores_physical=1, isa='sse')
diff --git a/tests/test_gpu_common.py b/tests/test_gpu_common.py
@@ -57,6 +57,22 @@ def test_host_threads(self):
         assert nth == get_cpu_info()['physical']
         assert nth == Cpu64("test").cores_physical
 
+    @switchconfig(platform='intel64', autopadding=True)
+    def test_autopad_with_platform_switch(self):
+        grid = Grid(shape=(10, 10))
+
+        f = Function(name='f', grid=grid, space_order=0)
+
+        assert f.shape_allocated[0] == 10
+
+        info = get_gpu_info()
+        if info['vendor'] == 'INTEL':
+            assert f.shape_allocated[1] == 16
+        elif info['vendor'] == 'NVIDIA':
+            assert f.shape_allocated[1] == 32
+        elif info['vendor'] == 'AMD':
+            assert f.shape_allocated[1] == 64
+
 
 class TestCodeGeneration: