Skip to content

Commit 5e323ef

Browse files
Copiloteeholmes
andcommitted
feat: open_dataset respects open_method, removes open_dataset_kwargs, adds silent param and integer index support
Co-authored-by: eeholmes <[email protected]>
1 parent aaf46e9 commit 5e323ef

2 files changed

Lines changed: 176 additions & 45 deletions

File tree

src/point_collocation/core/plan.py

Lines changed: 56 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -188,32 +188,47 @@ def __getitem__(self, idx: int | slice) -> "Plan | Any":
188188

189189
def open_dataset(
190190
self,
191-
result: Any,
192-
open_method: str | dict | None = None,
193-
open_dataset_kwargs: dict[str, Any] | None = None,
191+
result: "int | Any",
192+
open_method: "str | dict | None" = None,
193+
*,
194+
silent: bool = False,
194195
) -> "xr.Dataset":
195196
"""Open a single granule result as an :class:`xarray.Dataset`.
196197
197198
Parameters
198199
----------
199200
result:
200-
A single earthaccess result object, typically obtained via
201-
``plan[n]``.
201+
An integer index into ``plan.results`` (e.g. ``0``), or a
202+
single earthaccess result object obtained via ``plan[n]``.
203+
Using an integer is preferred: ``plan.open_dataset(0)`` is
204+
equivalent to ``plan.open_dataset(plan[0])``.
202205
open_method:
203206
How to open the granule. Accepts the same string presets or
204207
dict spec as :func:`~point_collocation.matchup`. Defaults to
205208
``"auto"`` (try dataset first, fall back to datatree merge).
206-
open_dataset_kwargs:
207-
Keyword arguments forwarded to the xarray open function.
208-
``chunks`` defaults to ``{}`` (lazy/dask loading). ``engine``
209-
defaults to ``"h5netcdf"`` when not specified.
209+
Pass open-function kwargs via the ``"open_kwargs"`` key of a
210+
dict spec, e.g.
211+
``open_method={"open_kwargs": {"engine": "netcdf4"}}``.
212+
silent:
213+
When ``False`` (default), print the effective open_method spec
214+
actually used (after normalization and defaults are applied).
215+
Set to ``True`` to suppress this output.
210216
211217
Returns
212218
-------
213219
xarray.Dataset
214220
The caller is responsible for closing the dataset when finished
215221
(e.g. ``ds.close()`` or ``with plan.open_dataset(...) as ds:``).
216222
"""
223+
if isinstance(result, int):
224+
n = len(self.results)
225+
if result < 0 or result >= n:
226+
raise IndexError(
227+
f"result index {result} is out of range for a plan with {n} result(s). "
228+
f"Valid indices are 0 to {n - 1}."
229+
)
230+
result = self.results[result]
231+
217232
from point_collocation.core._open_method import (
218233
_apply_coords,
219234
_build_effective_open_kwargs,
@@ -234,7 +249,14 @@ def open_dataset(
234249
import xarray as xr
235250

236251
effective_open_method = "auto" if open_method is None else open_method
237-
spec = _normalize_open_method(effective_open_method, open_dataset_kwargs)
252+
spec = _normalize_open_method(effective_open_method)
253+
254+
xarray_open = spec.get("xarray_open", "dataset")
255+
effective_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))
256+
257+
if not silent:
258+
display_spec = {**spec, "open_kwargs": effective_kwargs}
259+
print(f"open_method: {display_spec!r}")
238260

239261
file_objs = earthaccess.open([result], pqdm_kwargs={"disable": True})
240262
if len(file_objs) != 1:
@@ -243,9 +265,6 @@ def open_dataset(
243265
)
244266
file_obj = file_objs[0]
245267

246-
xarray_open = spec.get("xarray_open", "dataset")
247-
effective_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))
248-
249268
if xarray_open == "datatree":
250269
dt = _open_datatree_fn(file_obj, effective_kwargs)
251270
try:
@@ -283,8 +302,9 @@ def open_dataset(
283302
def open_mfdataset(
284303
self,
285304
results: "list[Any] | Plan",
286-
open_method: str | dict | None = None,
287-
open_dataset_kwargs: dict[str, Any] | None = None,
305+
open_method: "str | dict | None" = None,
306+
*,
307+
silent: bool = False,
288308
) -> "xr.Dataset":
289309
"""Open multiple granule results as a single :class:`xarray.Dataset`.
290310
@@ -300,10 +320,13 @@ def open_mfdataset(
300320
``"datatree-merge"`` opens each granule as a DataTree, merges
301321
its groups into a flat dataset, then concatenates all granules
302322
along a new ``granule`` dimension. Defaults to ``"auto"``.
303-
open_dataset_kwargs:
304-
Keyword arguments forwarded to the xarray open function.
305-
``chunks`` defaults to ``{}`` (lazy/dask loading). ``engine``
306-
defaults to ``"h5netcdf"`` when not specified.
323+
Pass open-function kwargs via the ``"open_kwargs"`` key of a
324+
dict spec, e.g.
325+
``open_method={"open_kwargs": {"engine": "netcdf4"}}``.
326+
silent:
327+
When ``False`` (default), print the effective open_method spec
328+
actually used (after normalization and defaults are applied).
329+
Set to ``True`` to suppress this output.
307330
308331
Returns
309332
-------
@@ -329,17 +352,21 @@ def open_mfdataset(
329352
import xarray as xr
330353

331354
effective_open_method = "auto" if open_method is None else open_method
332-
spec = _normalize_open_method(effective_open_method, open_dataset_kwargs)
355+
spec = _normalize_open_method(effective_open_method)
356+
357+
xarray_open = spec.get("xarray_open", "dataset")
358+
effective_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))
359+
360+
if not silent:
361+
display_spec = {**spec, "open_kwargs": effective_kwargs}
362+
print(f"open_method: {display_spec!r}")
333363

334364
result_list = results.results if isinstance(results, Plan) else list(results)
335365
file_objs = earthaccess.open(result_list, pqdm_kwargs={"disable": True})
336366

337-
xarray_open = spec.get("xarray_open", "dataset")
338-
339367
if xarray_open == "datatree":
340368
# Open each granule as a DataTree, merge its groups, then
341369
# concatenate all granule datasets along a new "granule" dim.
342-
effective_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))
343370
merged_datasets: list[xr.Dataset] = []
344371
for file_obj in file_objs:
345372
dt = _open_datatree_fn(file_obj, effective_kwargs)
@@ -357,7 +384,6 @@ def open_mfdataset(
357384
# separate datasets and merge them, then concatenate all granules
358385
# along a new "granule" dimension.
359386
# Without merge, use xr.open_mfdataset for simplicity.
360-
effective_kwargs = _build_effective_open_kwargs(spec.get("open_kwargs", {}))
361387
merge = spec.get("merge")
362388
if merge is not None:
363389
merged_datasets = []
@@ -380,8 +406,7 @@ def open_mfdataset(
380406

381407
def show_variables(
382408
self,
383-
open_method: str | dict | None = None,
384-
open_dataset_kwargs: dict[str, Any] | None = None,
409+
open_method: "str | dict | None" = None,
385410
) -> None:
386411
"""Print the first granule's groups, dimensions, and variables.
387412
@@ -395,19 +420,16 @@ def show_variables(
395420
the merged flat dataset.
396421
397422
To obtain the full xarray representation programmatically use
398-
``plan.open_dataset(plan[0])`` instead.
423+
``plan.open_dataset(0)`` instead.
399424
400425
Parameters
401426
----------
402427
open_method:
403428
How to open the granule. Accepts the same string presets or
404429
dict spec as :func:`~point_collocation.matchup`. Defaults to
405-
``"auto"``.
406-
open_dataset_kwargs:
407-
Keyword arguments forwarded to the xarray open function when
408-
the h5py fast path is unavailable.
409-
``chunks`` defaults to ``{}`` (lazy/dask loading). ``engine``
410-
defaults to ``"h5netcdf"`` when not specified.
430+
``"auto"``. Pass open-function kwargs via the ``"open_kwargs"``
431+
key of a dict spec, e.g.
432+
``open_method={"open_kwargs": {"engine": "netcdf4"}}``.
411433
412434
Raises
413435
------
@@ -430,7 +452,7 @@ def show_variables(
430452
raise ValueError("No granules in plan — cannot show variables.")
431453

432454
effective_open_method = "auto" if open_method is None else open_method
433-
spec = _normalize_open_method(effective_open_method, open_dataset_kwargs)
455+
spec = _normalize_open_method(effective_open_method)
434456
xarray_open = spec.get("xarray_open", "dataset")
435457

436458
try:

tests/test_plan.py

Lines changed: 120 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -2916,7 +2916,7 @@ def test_open_dataset_returns_dataset(
29162916
time_buffer=pd.Timedelta(0),
29172917
)
29182918

2919-
ds = p.open_dataset(p[0], open_dataset_kwargs={"engine": "netcdf4"})
2919+
ds = p.open_dataset(0, open_method={"open_kwargs": {"engine": "netcdf4"}}, silent=True)
29202920
assert isinstance(ds, xr.Dataset)
29212921
assert "sst" in ds
29222922
ds.close()
@@ -2954,7 +2954,7 @@ def test_open_mfdataset_returns_dataset(
29542954
fake_ds = xr.Dataset({"sst": (["lat", "lon"], [[1.0]])}, coords={"lat": [0.0], "lon": [0.0]})
29552955
with patch("xarray.open_mfdataset", return_value=fake_ds) as mock_mfdataset:
29562956
# Pass a list of results directly (backward-compatible path)
2957-
ds = p.open_mfdataset(fake_results, open_dataset_kwargs={"engine": "netcdf4"})
2957+
ds = p.open_mfdataset(fake_results, open_method={"open_kwargs": {"engine": "netcdf4"}}, silent=True)
29582958

29592959
assert ds is fake_ds
29602960
mock_ea.open.assert_called_once_with(fake_results, pqdm_kwargs={"disable": True})
@@ -3016,7 +3016,7 @@ def test_open_mfdataset_accepts_subset_plan(
30163016

30173017
fake_ds = xr.Dataset({"sst": (["lat", "lon"], [[1.0]])}, coords={"lat": [0.0], "lon": [0.0]})
30183018
with patch("xarray.open_mfdataset", return_value=fake_ds) as mock_mfdataset:
3019-
ds = p.open_mfdataset(subset, open_dataset_kwargs={"engine": "netcdf4"})
3019+
ds = p.open_mfdataset(subset, open_method={"open_kwargs": {"engine": "netcdf4"}}, silent=True)
30203020

30213021
assert ds is fake_ds
30223022
mock_ea.open.assert_called_once_with(fake_results, pqdm_kwargs={"disable": True})
@@ -3047,7 +3047,7 @@ def test_open_dataset_default_uses_auto(
30473047
time_buffer=pd.Timedelta(0),
30483048
)
30493049

3050-
ds = p.open_dataset(p[0], open_method="dataset", open_dataset_kwargs={"engine": "netcdf4"})
3050+
ds = p.open_dataset(0, open_method={"xarray_open": "dataset", "open_kwargs": {"engine": "netcdf4"}}, silent=True)
30513051
assert isinstance(ds, xr.Dataset)
30523052
assert "sst" in ds
30533053
ds.close()
@@ -3073,7 +3073,7 @@ def test_open_dataset_invalid_open_method_raises(
30733073
)
30743074

30753075
with pytest.raises(ValueError, match="open_method"):
3076-
p.open_dataset(p[0], open_method="bad")
3076+
p.open_dataset(0, open_method="bad", silent=True)
30773077

30783078
def test_open_mfdataset_with_open_method_dataset_uses_open_mfdataset(
30793079
self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
@@ -3106,7 +3106,7 @@ def test_open_mfdataset_with_open_method_dataset_uses_open_mfdataset(
31063106

31073107
fake_ds = xr.Dataset({"sst": (["lat", "lon"], [[1.0]])}, coords={"lat": [0.0], "lon": [0.0]})
31083108
with patch("xarray.open_mfdataset", return_value=fake_ds) as mock_mfd:
3109-
ds = p.open_mfdataset(fake_results, open_method="dataset", open_dataset_kwargs={"engine": "netcdf4"})
3109+
ds = p.open_mfdataset(fake_results, open_method={"xarray_open": "dataset", "open_kwargs": {"engine": "netcdf4"}}, silent=True)
31103110

31113111
assert ds is fake_ds
31123112
mock_mfd.assert_called_once_with([nc_a, nc_b], chunks={}, engine="netcdf4", decode_timedelta=False)
@@ -3139,14 +3139,123 @@ def test_open_mfdataset_datatree_merge_concatenates(
31393139

31403140
ds = p.open_mfdataset(
31413141
fake_results,
3142-
open_method="datatree-merge",
3143-
open_dataset_kwargs={"engine": "netcdf4"},
3142+
open_method={"xarray_open": "datatree", "merge": "all", "open_kwargs": {"engine": "netcdf4"}},
3143+
silent=True,
31443144
)
31453145
# Result is a Dataset with a "granule" dimension from concatenation.
31463146
assert isinstance(ds, xr.Dataset)
31473147
assert ds.sizes["granule"] == 2
31483148
assert "sst" in ds
31493149

3150+
def test_open_dataset_prints_effective_spec_when_not_silent(
3151+
self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture
3152+
) -> None:
3153+
"""open_dataset prints the effective open_method spec when silent=False."""
3154+
nc_path = str(tmp_path / "test.nc")
3155+
_make_l3_dataset([-90.0, 0.0, 90.0], [-180.0, 0.0, 180.0]).to_netcdf(
3156+
nc_path, engine="netcdf4"
3157+
)
3158+
mock_ea = MagicMock()
3159+
mock_ea.open.return_value = [nc_path]
3160+
monkeypatch.setitem(__import__("sys").modules, "earthaccess", mock_ea)
3161+
3162+
pts = pd.DataFrame(
3163+
{"lat": [0.0], "lon": [0.0], "time": pd.to_datetime(["2023-06-01"])}
3164+
)
3165+
p = Plan(
3166+
points=pts,
3167+
results=[object()],
3168+
granules=[],
3169+
point_granule_map={0: []},
3170+
source_kwargs={"short_name": "TEST"},
3171+
time_buffer=pd.Timedelta(0),
3172+
)
3173+
3174+
ds = p.open_dataset(0, open_method={"open_kwargs": {"engine": "netcdf4"}}, silent=False)
3175+
ds.close()
3176+
captured = capsys.readouterr()
3177+
assert "open_method:" in captured.out
3178+
# The effective spec includes the resolved defaults (chunks, decode_timedelta)
3179+
assert "engine" in captured.out
3180+
3181+
def test_open_dataset_silent_suppresses_output(
3182+
self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch, capsys: pytest.CaptureFixture
3183+
) -> None:
3184+
"""open_dataset prints nothing when silent=True."""
3185+
nc_path = str(tmp_path / "test.nc")
3186+
_make_l3_dataset([-90.0, 0.0, 90.0], [-180.0, 0.0, 180.0]).to_netcdf(
3187+
nc_path, engine="netcdf4"
3188+
)
3189+
mock_ea = MagicMock()
3190+
mock_ea.open.return_value = [nc_path]
3191+
monkeypatch.setitem(__import__("sys").modules, "earthaccess", mock_ea)
3192+
3193+
pts = pd.DataFrame(
3194+
{"lat": [0.0], "lon": [0.0], "time": pd.to_datetime(["2023-06-01"])}
3195+
)
3196+
p = Plan(
3197+
points=pts,
3198+
results=[object()],
3199+
granules=[],
3200+
point_granule_map={0: []},
3201+
source_kwargs={"short_name": "TEST"},
3202+
time_buffer=pd.Timedelta(0),
3203+
)
3204+
3205+
ds = p.open_dataset(0, open_method={"open_kwargs": {"engine": "netcdf4"}}, silent=True)
3206+
ds.close()
3207+
captured = capsys.readouterr()
3208+
assert captured.out == ""
3209+
3210+
def test_open_dataset_integer_index(
3211+
self, tmp_path: pathlib.Path, monkeypatch: pytest.MonkeyPatch
3212+
) -> None:
3213+
"""open_dataset(0) resolves the integer to plan.results[0]."""
3214+
nc_path = str(tmp_path / "test.nc")
3215+
_make_l3_dataset([-90.0, 0.0, 90.0], [-180.0, 0.0, 180.0]).to_netcdf(
3216+
nc_path, engine="netcdf4"
3217+
)
3218+
mock_ea = MagicMock()
3219+
mock_ea.open.return_value = [nc_path]
3220+
monkeypatch.setitem(__import__("sys").modules, "earthaccess", mock_ea)
3221+
3222+
fake_result = object()
3223+
pts = pd.DataFrame(
3224+
{"lat": [0.0], "lon": [0.0], "time": pd.to_datetime(["2023-06-01"])}
3225+
)
3226+
p = Plan(
3227+
points=pts,
3228+
results=[fake_result],
3229+
granules=[],
3230+
point_granule_map={0: []},
3231+
source_kwargs={"short_name": "TEST"},
3232+
time_buffer=pd.Timedelta(0),
3233+
)
3234+
3235+
# Using integer index 0 should behave identically to plan[0]
3236+
ds = p.open_dataset(0, open_method={"open_kwargs": {"engine": "netcdf4"}}, silent=True)
3237+
assert isinstance(ds, xr.Dataset)
3238+
assert "sst" in ds
3239+
ds.close()
3240+
mock_ea.open.assert_called_once_with([fake_result], pqdm_kwargs={"disable": True})
3241+
3242+
def test_open_dataset_integer_index_out_of_range_raises(self) -> None:
3243+
"""open_dataset raises IndexError for an out-of-range integer index."""
3244+
pts = pd.DataFrame(
3245+
{"lat": [0.0], "lon": [0.0], "time": pd.to_datetime(["2023-06-01"])}
3246+
)
3247+
p = Plan(
3248+
points=pts,
3249+
results=[object()],
3250+
granules=[],
3251+
point_granule_map={0: []},
3252+
source_kwargs={"short_name": "TEST"},
3253+
time_buffer=pd.Timedelta(0),
3254+
)
3255+
3256+
with pytest.raises(IndexError, match="out of range"):
3257+
p.open_dataset(5, silent=True)
3258+
31503259

31513260
# ---------------------------------------------------------------------------
31523261
# Helper: create a grouped NetCDF4 file (HDF5 with subgroups)
@@ -3426,7 +3535,7 @@ def test_show_variables_prints_dims_and_vars(
34263535
time_buffer=pd.Timedelta(0),
34273536
)
34283537

3429-
p.show_variables(open_dataset_kwargs={"engine": "netcdf4"})
3538+
p.show_variables(open_method={"open_kwargs": {"engine": "netcdf4"}})
34303539
captured = capsys.readouterr()
34313540
assert "Dimensions" in captured.out
34323541
assert "Variables" in captured.out
@@ -4575,7 +4684,7 @@ def test_show_variables_dataset_layout_prints_dims_and_vars(
45754684
time_buffer=pd.Timedelta(0),
45764685
)
45774686

4578-
p.show_variables(open_dataset_kwargs={"engine": "netcdf4"})
4687+
p.show_variables(open_method={"open_kwargs": {"engine": "netcdf4"}})
45794688
captured = capsys.readouterr()
45804689
assert "Dimensions" in captured.out
45814690
assert "Variables" in captured.out
@@ -4613,7 +4722,7 @@ def test_show_variables_geo_detection_none_warns(
46134722
time_buffer=pd.Timedelta(0),
46144723
)
46154724

4616-
p.show_variables(open_dataset_kwargs={"engine": "netcdf4"})
4725+
p.show_variables(open_method={"open_kwargs": {"engine": "netcdf4"}})
46174726
captured = capsys.readouterr()
46184727
assert "NONE" in captured.out or "no geolocation" in captured.out.lower()
46194728

0 commit comments

Comments
 (0)