Skip to content

Commit 2892c86

Browse files
authored
Merge pull request #1401 from Libensemble/release/v_1.4.2
Release/v 1.4.2
2 parents 33118cc + 63314c1 commit 2892c86

26 files changed

+293
-69
lines changed

.github/workflows/basic.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,4 +163,4 @@ jobs:
163163
runs-on: ubuntu-latest
164164
steps:
165165
- uses: actions/checkout@v4
166-
- uses: crate-ci/typos@v1.23.4
166+
- uses: crate-ci/typos@v1.23.6

.github/workflows/extra.yml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,12 @@ jobs:
229229
rm ./libensemble/tests/unit_tests/test_ufunc_runners.py
230230
rm ./libensemble/tests/unit_tests/test_executor_balsam.py
231231
232+
- name: Start Redis
233+
if: matrix.os == 'ubuntu-latest'
234+
uses: supercharge/redis-github-action@1.7.0
235+
with:
236+
redis-version: 7
237+
232238
- name: Run extensive tests, Ubuntu
233239
if: matrix.os == 'ubuntu-latest'
234240
run: |
@@ -254,4 +260,4 @@ jobs:
254260
runs-on: ubuntu-latest
255261
steps:
256262
- uses: actions/checkout@v4
257-
- uses: crate-ci/typos@v1.23.4
263+
- uses: crate-ci/typos@v1.23.6

.wci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ description: |
1616
language: Python
1717

1818
release:
19-
version: 1.4.1
20-
date: 2024-07-29
19+
version: 1.4.2
20+
date: 2024-08-14
2121

2222
documentation:
2323
general: https://libensemble.readthedocs.io

CHANGELOG.rst

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,32 @@ GitHub issues are referenced, and can be viewed with hyperlinks on the `github r
88

99
.. _`github releases page`: https://github.com/Libensemble/libensemble/releases
1010

11+
Release 1.4.2
12+
--------------
13+
14+
:Date: August 14, 2024
15+
16+
* Fix under-utilized resource usage. #1398
17+
* Fixes bug causing executor to wrongly increase processor counts when not all nodes are utilized.
18+
* Fixes case where setting `num_gpus` to zero was treated as `None`.
19+
* Add missing PerlmutterGPU specs (these were detected anyway). #1393
20+
* Handle case where Perlmutter finds no partition. #1391
21+
* Launch environment scripts in shell. #1392
22+
23+
:Examples:
24+
25+
* Add proxystore example (uses a proxy in history array). #1326
26+
27+
:Note:
28+
29+
* Tests were run on Linux and macOS with Python versions 3.9, 3.10, 3.11, 3.12.
30+
* Heterogeneous workflows tested on Frontier (OLCF), Polaris (ALCF), and Perlmutter (NERSC).
31+
* Note that tests have been recently run on Aurora (ALCF), but the system was unavailable at the time of release.
32+
33+
:Known Issues:
34+
35+
* See known issues section in the documentation.
36+
1137
Release 1.4.1
1238
--------------
1339

@@ -25,7 +51,6 @@ Release 1.4.1
2551

2652
* See known issues section in the documentation.
2753

28-
2954
Release 1.4.0
3055
--------------
3156

docs/requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
sphinx<8
1+
sphinx<9
22
sphinxcontrib-bibtex
33
sphinxcontrib-spelling
44
autodoc_pydantic
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
globus-compute-sdk==2.24.0
1+
globus-compute-sdk==2.25.0
2+
proxystore==0.7.0

install/testing_requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
1-
flake8==7.1.0
1+
flake8==7.1.1
22
coverage==7.3.1
33
pytest==8.3.2
44
pytest-cov==5.0.0
55
pytest-timeout==2.3.1
66
mock==5.1.0
77
python-dateutil==2.9.0.post0
88
anyio==4.4.0
9-
matplotlib==3.9.1
9+
matplotlib==3.9.2

libensemble/ensemble.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
from libensemble.specs import AllocSpecs, ExitCriteria, GenSpecs, LibeSpecs, SimSpecs
1313
from libensemble.tools import add_unique_random_streams
1414
from libensemble.tools import parse_args as parse_args_f
15-
from libensemble.tools.parse_args import mpi_init
1615
from libensemble.tools import save_libE_output
16+
from libensemble.tools.parse_args import mpi_init
1717
from libensemble.utils.misc import specs_dump
1818

1919
ATTR_ERR_MSG = 'Unable to load "{}". Is the function or submodule correctly named?'

libensemble/executors/mpi_executor.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ def set_resources(self, resources: Resources) -> None:
138138
self.resources = resources
139139

140140
def _launch_with_retries(
141-
self, task: Task, subgroup_launch: bool, wait_on_start: Union[bool, int], run_cmd: List[str]
141+
self, task: Task, subgroup_launch: bool, wait_on_start: Union[bool, int], run_cmd: List[str], use_shell: bool
142142
) -> None:
143143
"""Launch task with retry mechanism"""
144144
retry_count = 0
@@ -156,6 +156,7 @@ def _launch_with_retries(
156156
stdout=out,
157157
stderr=err,
158158
start_new_session=subgroup_launch,
159+
shell=use_shell,
159160
)
160161
except Exception as e:
161162
logger.warning(f"task {task.name} submit command failed on try {retry_count} with error {e}")
@@ -325,12 +326,9 @@ def submit(
325326
if not num_procs and not match_procs_to_gpus:
326327
num_procs = self.gen_nprocs
327328

328-
if not num_gpus:
329+
if num_gpus is None:
329330
num_gpus = self.gen_ngpus
330331

331-
if not num_nodes and (self.gen_ngpus or self.gen_nprocs):
332-
num_nodes = self.resources.worker_resources.local_node_count
333-
334332
if mpi_runner_type is not None:
335333
if isinstance(mpi_runner_type, str):
336334
mpi_config = {"mpi_runner": mpi_runner_type}
@@ -367,8 +365,10 @@ def submit(
367365

368366
if env_script is not None:
369367
run_cmd = Executor._process_env_script(task, runline, env_script)
368+
use_shell = True
370369
else:
371370
run_cmd = runline
371+
use_shell = False
372372

373373
if dry_run:
374374
logger.info(f"Test (No submit) Runline: {' '.join(run_cmd)}")
@@ -378,7 +378,7 @@ def submit(
378378
task._implement_env()
379379

380380
# Launch Task
381-
self._launch_with_retries(task, sglaunch, wait_on_start, run_cmd)
381+
self._launch_with_retries(task, sglaunch, wait_on_start, run_cmd, use_shell)
382382

383383
if not task.timer.timing and not task.finished:
384384
task.timer.start()

libensemble/executors/mpi_runner.py

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def _set_gpu_cli_option(self, wresources, extra_args, gpu_setting_name, gpu_valu
121121
def _set_gpu_env_var(self, wresources, task, gpus_per_node, gpus_env):
122122
"""Add GPU environment variable setting to the tasks environment"""
123123
jassert(wresources.matching_slots, f"Cannot assign CPUs/GPUs to non-matching slots per node {wresources.slots}")
124-
slot_list = wresources.get_slots_as_string(multiplier=wresources.gpus_per_rset, limit=gpus_per_node)
124+
slot_list = wresources.get_slots_as_string(multiplier=wresources.gpus_per_rset_per_node, limit=gpus_per_node)
125125
task._add_to_env(gpus_env, slot_list)
126126

127127
def _local_runner_set_gpus(self, task, wresources, extra_args, gpus_per_node, ppn):
@@ -171,7 +171,7 @@ def _assign_gpus(self, task, resources, nprocs, nnodes, ppn, ngpus, extra_args,
171171

172172
# gpus per node for this worker.
173173
if wresources.doihave_gpus():
174-
gpus_avail_per_node = wresources.slot_count * wresources.gpus_per_rset
174+
gpus_avail_per_node = wresources.slot_count * wresources.gpus_per_rset_per_node
175175
else:
176176
gpus_avail_per_node = 0
177177

@@ -224,6 +224,35 @@ def _assign_gpus(self, task, resources, nprocs, nnodes, ppn, ngpus, extra_args,
224224

225225
return nprocs, nnodes, ppn, extra_args
226226

227+
def _get_min_nodes(self, nprocs, ppn, nnodes, ngpus, resources):
228+
"""Get minimum nodes needed to match configuration"""
229+
if nnodes is not None:
230+
return nnodes
231+
if ppn:
232+
return None # nnodes gets processed later.
233+
if resources is not None:
234+
wresources = resources.worker_resources
235+
total_nodes = wresources.local_node_count
236+
procs_on_node = wresources.slot_count * wresources.procs_per_rset_per_node
237+
238+
if not nprocs and ngpus is None:
239+
# Delay node evaluation to GPU assignment code
240+
return None
241+
proc_min_nodes = 1
242+
gpu_min_nodes = 1
243+
if nprocs:
244+
proc_min_nodes = (nprocs + procs_on_node - 1) // procs_on_node
245+
if ngpus:
246+
gpus_on_node = wresources.slot_count * wresources.gpus_per_rset_per_node
247+
gpu_min_nodes = (ngpus + gpus_on_node - 1) // gpus_on_node
248+
249+
min_nodes = max(proc_min_nodes, gpu_min_nodes)
250+
nnodes = min(min_nodes, total_nodes)
251+
# Must have at least one processor per node to use GPUs
252+
if nprocs:
253+
nnodes = min(nnodes, nprocs)
254+
return nnodes
255+
227256
def _adjust_procs(self, nprocs, ppn, nnodes, ngpus, resources):
228257
"""Adjust an invalid config"""
229258

@@ -241,8 +270,8 @@ def adjust_resource(n_units, units_attr, units_name):
241270

242271
if resources is not None:
243272
wresources = resources.worker_resources
244-
ngpus = adjust_resource(ngpus, "gpus_per_rset", "ngpus")
245-
nprocs = adjust_resource(nprocs, "procs_per_rset", "nprocs")
273+
ngpus = adjust_resource(ngpus, "gpus_per_rset_per_node", "ngpus")
274+
nprocs = adjust_resource(nprocs, "procs_per_rset_per_node", "nprocs")
246275
return nprocs, ngpus
247276

248277
def get_mpi_specs(
@@ -284,6 +313,8 @@ def get_mpi_specs(
284313

285314
if match_procs_to_gpus:
286315
jassert(no_config_set, "match_procs_to_gpus is mutually exclusive with either of nprocs/ppn")
316+
317+
nnodes = self._get_min_nodes(nprocs, ppn, nnodes, ngpus, resources)
287318
nprocs, ngpus = self._adjust_procs(nprocs, ppn, nnodes, ngpus, resources)
288319

289320
if auto_assign_gpus or ngpus is not None:
@@ -294,7 +325,7 @@ def get_mpi_specs(
294325
task, resources, nprocs, nnodes, ppn, ngpus, extra_args, match_procs_to_gpus
295326
)
296327

297-
rm_rpn = True if self.rm_rpn and ppn is None and nnodes is None else False
328+
rm_rpn = self.rm_rpn and ppn is None and nnodes is None
298329

299330
hostlist = None
300331
if machinefile and not self.mfile_support:

0 commit comments

Comments
 (0)