@@ -121,7 +121,7 @@ def _set_gpu_cli_option(self, wresources, extra_args, gpu_setting_name, gpu_valu
def _set_gpu_env_var(self, wresources, task, gpus_per_node, gpus_env):
    """Add GPU environment variable setting to the task's environment

    Builds a slot string from ``wresources`` and stores it on ``task``
    under the environment variable name ``gpus_env``.

    Parameters
    ----------
    wresources:
        Worker resources object; must provide ``matching_slots``, ``slots``,
        ``gpus_per_rset_per_node`` and ``get_slots_as_string``.
    task:
        Task object whose ``_add_to_env`` receives the variable name and value.
    gpus_per_node:
        Passed as the ``limit`` argument when building the slot string.
    gpus_env:
        Name of the environment variable to set.
    """
    # Slots must be the same on every node before they can be mapped to
    # devices (jassert is presumably the module's assertion helper - confirm).
    jassert(
        wresources.matching_slots,
        f"Cannot assign CPUs/GPUs to non-matching slots per node {wresources.slots}",
    )

    gpus_per_slot = wresources.gpus_per_rset_per_node
    gpu_ids = wresources.get_slots_as_string(multiplier=gpus_per_slot, limit=gpus_per_node)
    task._add_to_env(gpus_env, gpu_ids)
126126
127127 def _local_runner_set_gpus (self , task , wresources , extra_args , gpus_per_node , ppn ):
@@ -171,7 +171,7 @@ def _assign_gpus(self, task, resources, nprocs, nnodes, ppn, ngpus, extra_args,
171171
172172 # gpus per node for this worker.
173173 if wresources .doihave_gpus ():
174- gpus_avail_per_node = wresources .slot_count * wresources .gpus_per_rset
174+ gpus_avail_per_node = wresources .slot_count * wresources .gpus_per_rset_per_node
175175 else :
176176 gpus_avail_per_node = 0
177177
@@ -224,6 +224,35 @@ def _assign_gpus(self, task, resources, nprocs, nnodes, ppn, ngpus, extra_args,
224224
225225 return nprocs , nnodes , ppn , extra_args
226226
227+ def _get_min_nodes (self , nprocs , ppn , nnodes , ngpus , resources ):
228+ """Get minimum nodes needed to match configuration"""
229+ if nnodes is not None :
230+ return nnodes
231+ if ppn :
232+ return None # nnodes gets processed later.
233+ if resources is not None :
234+ wresources = resources .worker_resources
235+ total_nodes = wresources .local_node_count
236+ procs_on_node = wresources .slot_count * wresources .procs_per_rset_per_node
237+
238+ if not nprocs and ngpus is None :
239+ # Delay node evaluation to GPU assignment code
240+ return None
241+ proc_min_nodes = 1
242+ gpu_min_nodes = 1
243+ if nprocs :
244+ proc_min_nodes = (nprocs + procs_on_node - 1 ) // procs_on_node
245+ if ngpus :
246+ gpus_on_node = wresources .slot_count * wresources .gpus_per_rset_per_node
247+ gpu_min_nodes = (ngpus + gpus_on_node - 1 ) // gpus_on_node
248+
249+ min_nodes = max (proc_min_nodes , gpu_min_nodes )
250+ nnodes = min (min_nodes , total_nodes )
251+ # Must have atleast one processor per node to use GPUs
252+ if nprocs :
253+ nnodes = min (nnodes , nprocs )
254+ return nnodes
255+
227256 def _adjust_procs (self , nprocs , ppn , nnodes , ngpus , resources ):
228257 """Adjust an invalid config"""
229258
@@ -241,8 +270,8 @@ def adjust_resource(n_units, units_attr, units_name):
241270
242271 if resources is not None :
243272 wresources = resources .worker_resources
244- ngpus = adjust_resource (ngpus , "gpus_per_rset " , "ngpus" )
245- nprocs = adjust_resource (nprocs , "procs_per_rset " , "nprocs" )
273+ ngpus = adjust_resource (ngpus , "gpus_per_rset_per_node " , "ngpus" )
274+ nprocs = adjust_resource (nprocs , "procs_per_rset_per_node " , "nprocs" )
246275 return nprocs , ngpus
247276
248277 def get_mpi_specs (
@@ -284,6 +313,8 @@ def get_mpi_specs(
284313
285314 if match_procs_to_gpus :
286315 jassert (no_config_set , "match_procs_to_gpus is mutually exclusive with either of nprocs/ppn" )
316+
317+ nnodes = self ._get_min_nodes (nprocs , ppn , nnodes , ngpus , resources )
287318 nprocs , ngpus = self ._adjust_procs (nprocs , ppn , nnodes , ngpus , resources )
288319
289320 if auto_assign_gpus or ngpus is not None :
@@ -294,7 +325,7 @@ def get_mpi_specs(
294325 task , resources , nprocs , nnodes , ppn , ngpus , extra_args , match_procs_to_gpus
295326 )
296327
297- rm_rpn = True if self .rm_rpn and ppn is None and nnodes is None else False
328+ rm_rpn = self .rm_rpn and ppn is None and nnodes is None
298329
299330 hostlist = None
300331 if machinefile and not self .mfile_support :
0 commit comments