Skip to content

Commit 0d1a4ae

Browse files
authored
Merge pull request #80 from fish-pace/copilot/investigate-cf-xarray-lat-lon-coords
Use cf_xarray for lat/lon coordinate detection with name-based fallback
2 parents c33437a + 61c8f10 commit 0d1a4ae

4 files changed

Lines changed: 208 additions & 26 deletions

File tree

pyproject.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,14 @@ dependencies = [
1818
[project.optional-dependencies]
1919
earthaccess = ["earthaccess>=0.9.0"]
2020
swath = ["xoak>=0.2.0", "scikit-learn>=1.0.0"]
21+
cf = ["cf-xarray>=0.7.0"]
2122
dev = [
2223
"pytest>=7.0",
2324
"pytest-cov",
2425
"ruff",
2526
"mypy",
2627
"pyarrow>=10.0",
28+
"cf-xarray>=0.7.0",
2729
]
2830
docs = [
2931
"mkdocs>=1.5,<2.0",
@@ -32,7 +34,7 @@ docs = [
3234
"nbconvert>=7.0",
3335
]
3436
all = [
35-
"point-collocation[earthaccess,swath,dev]",
37+
"point-collocation[earthaccess,swath,cf,dev]",
3638
]
3739

3840
[project.urls]

src/point_collocation/core/engine.py

Lines changed: 59 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,9 @@
3737
if TYPE_CHECKING:
3838
from point_collocation.core.plan import Plan
3939

40-
# Geolocation name pairs (case-sensitive, tried in order).
41-
# Each element is (lon_name, lat_name).
40+
# Geolocation name pairs used as a fallback when cf_xarray is not installed or
41+
# when the dataset lacks CF-convention attributes (standard_name, units, etc.).
42+
# Each element is (lon_name, lat_name), tried in order (case-sensitive).
4243
_GEOLOC_PAIRS = [
4344
("lon", "lat"),
4445
("longitude", "latitude"),
@@ -245,11 +246,44 @@ def matchup(
245246
# ---------------------------------------------------------------------------
246247

247248

249+
def _cf_geoloc_names(ds: xr.Dataset, key: str) -> list[str]:
250+
"""Return variable names that match a CF *key* (e.g. ``"longitude"``) in *ds*.
251+
252+
Searches both ``ds.coords`` and ``ds.data_vars`` via the ``cf_xarray``
253+
accessor. Returns an empty list when ``cf_xarray`` is not installed or
254+
when no variables match the key.
255+
256+
Parameters
257+
----------
258+
ds:
259+
Dataset to inspect.
260+
key:
261+
CF coordinate key, e.g. ``"longitude"`` or ``"latitude"``.
262+
"""
263+
try:
264+
import cf_xarray # noqa: F401 (registers the .cf accessor)
265+
except ImportError:
266+
return []
267+
268+
try:
269+
matched = ds.cf[[key]]
270+
except KeyError:
271+
return []
272+
273+
return list(matched.coords) + list(matched.data_vars)
274+
275+
248276
def _find_geoloc_pair(ds: xr.Dataset) -> tuple[str, str]:
249277
"""Find exactly one ``(lon_name, lat_name)`` pair in *ds*.
250278
251-
Searches both ``ds.coords`` and ``ds.data_vars`` for each pair in
252-
:data:`_GEOLOC_PAIRS`.
279+
Detection strategy
280+
------------------
281+
1. **cf_xarray** (primary, if installed): inspects CF-convention attributes
282+
such as ``standard_name``, ``units``, and ``long_name`` in both
283+
``ds.coords`` and ``ds.data_vars``.
284+
2. **Name-based fallback**: if ``cf_xarray`` is not installed or the
285+
dataset lacks CF attributes, searches ``ds.coords`` and ``ds.data_vars``
286+
for each ``(lon_name, lat_name)`` pair in :data:`_GEOLOC_PAIRS`.
253287
254288
Returns
255289
-------
@@ -262,6 +296,27 @@ def _find_geoloc_pair(ds: xr.Dataset) -> tuple[str, str]:
262296
If zero pairs are found ("no geolocation variables found") or
263297
more than one pair is found ("ambiguous geolocation variables").
264298
"""
299+
# --- primary: cf_xarray (CF-conventions-aware) ---
300+
lon_names = _cf_geoloc_names(ds, "longitude")
301+
lat_names = _cf_geoloc_names(ds, "latitude")
302+
303+
if lon_names or lat_names:
304+
# cf_xarray found at least one candidate — commit to its results.
305+
if not lon_names or not lat_names:
306+
raise ValueError(
307+
"no geolocation variables found. "
308+
f"cf_xarray detected longitude={lon_names}, latitude={lat_names}; "
309+
"expected exactly one variable for each."
310+
)
311+
if len(lon_names) > 1 or len(lat_names) > 1:
312+
raise ValueError(
313+
f"ambiguous geolocation variables; "
314+
f"cf_xarray detected longitude={lon_names}, latitude={lat_names}. "
315+
"Rename or drop the extra coordinates before running matchup."
316+
)
317+
return lon_names[0], lat_names[0]
318+
319+
# --- fallback: name-based search (no cf_xarray or no CF attributes) ---
265320
found: list[tuple[str, str]] = []
266321
for lon_name, lat_name in _GEOLOC_PAIRS:
267322
has_lon = lon_name in ds.coords or lon_name in ds.data_vars

src/point_collocation/core/plan.py

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,7 @@ def show_variables(
409409
If the plan contains no granules.
410410
"""
411411
from point_collocation.core.engine import (
412-
_GEOLOC_PAIRS,
412+
_find_geoloc_pair,
413413
_VALID_GEOMETRIES,
414414
_VALID_OPEN_METHODS,
415415
_merge_datatree,
@@ -472,30 +472,25 @@ def show_variables(
472472
print(f"Variables : {list(ds_flat.data_vars)}")
473473

474474
# Geolocation detection results.
475-
found_pairs: list[tuple[str, str]] = []
476-
for lon_name, lat_name in _GEOLOC_PAIRS:
477-
has_lon = lon_name in ds_flat.coords or lon_name in ds_flat.data_vars
478-
has_lat = lat_name in ds_flat.coords or lat_name in ds_flat.data_vars
479-
if has_lon and has_lat:
480-
found_pairs.append((lon_name, lat_name))
481-
482-
if len(found_pairs) == 0:
483-
alt_open_method = "datatree-merge" if open_method == "dataset" else "dataset"
484-
alt = f"plan.show_variables(geometry={geometry!r}, open_method={alt_open_method!r})"
485-
print(
486-
f"\nGeolocation: NONE detected with open_method={open_method!r}. "
487-
f"Try {alt}."
488-
)
489-
elif len(found_pairs) == 1:
490-
lon_n, lat_n = found_pairs[0]
475+
try:
476+
lon_n, lat_n = _find_geoloc_pair(ds_flat)
491477
lon_var = ds_flat.coords[lon_n] if lon_n in ds_flat.coords else ds_flat[lon_n]
492478
lat_var = ds_flat.coords[lat_n] if lat_n in ds_flat.coords else ds_flat[lat_n]
493479
print(
494480
f"\nGeolocation: ({lon_n!r}, {lat_n!r}) — "
495481
f"lon dims={tuple(lon_var.dims)}, lat dims={tuple(lat_var.dims)}"
496482
)
497-
else:
498-
print(f"\nGeolocation: ambiguous — detected pairs: {found_pairs}")
483+
except ValueError as exc:
484+
msg = str(exc)
485+
if "no geolocation variables found" in msg:
486+
alt_open_method = "datatree-merge" if open_method == "dataset" else "dataset"
487+
alt = f"plan.show_variables(geometry={geometry!r}, open_method={alt_open_method!r})"
488+
print(
489+
f"\nGeolocation: NONE detected with open_method={open_method!r}. "
490+
f"Try {alt}."
491+
)
492+
else:
493+
print(f"\nGeolocation: {msg}")
499494

500495
# For datatree-merge, print group details at the end.
501496
if open_method == "datatree-merge":

tests/test_plan.py

Lines changed: 132 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3425,10 +3425,36 @@ def test_no_geoloc_raises(self) -> None:
34253425
_find_geoloc_pair(ds)
34263426

34273427
def test_ambiguous_geoloc_raises(self) -> None:
3428-
"""Multiple recognised pairs should raise with all pairs listed."""
3428+
"""Multiple recognised pairs should raise with all pairs listed.
3429+
3430+
Uses a dataset where cf_xarray cannot resolve the ambiguity (no CF
3431+
attributes, and variable names are not recognised CF standard names).
3432+
"""
3433+
from point_collocation.core.engine import _find_geoloc_pair
3434+
3435+
# Has both (lon, lat) and (Longitude, Latitude) — neither pair uses the
3436+
# exact CF standard name spellings that cf_xarray can resolve by name,
3437+
# so the name-based fallback detects two pairs and raises.
3438+
ds = xr.Dataset(
3439+
coords={
3440+
"lon": [0.0],
3441+
"lat": [0.0],
3442+
"Longitude": [0.0],
3443+
"Latitude": [0.0],
3444+
}
3445+
)
3446+
with pytest.raises(ValueError, match="ambiguous geolocation variables"):
3447+
_find_geoloc_pair(ds)
3448+
3449+
def test_lon_lat_plus_longitude_latitude_resolves_via_cf(self) -> None:
3450+
"""When cf_xarray is installed, (lon,lat)+(longitude,latitude) resolves.
3451+
3452+
Without cf_xarray this would be ambiguous, but cf_xarray correctly
3453+
picks the CF standard-name pair ``(longitude, latitude)``.
3454+
"""
3455+
cf_xarray = pytest.importorskip("cf_xarray") # noqa: F841
34293456
from point_collocation.core.engine import _find_geoloc_pair
34303457

3431-
# Has both (lon, lat) and (longitude, latitude)
34323458
ds = xr.Dataset(
34333459
coords={
34343460
"lon": [0.0],
@@ -3437,9 +3463,113 @@ def test_ambiguous_geoloc_raises(self) -> None:
34373463
"latitude": [0.0],
34383464
}
34393465
)
3466+
lon_name, lat_name = _find_geoloc_pair(ds)
3467+
assert lon_name == "longitude"
3468+
assert lat_name == "latitude"
3469+
3470+
3471+
class TestGeolocDetectionCfXarray:
3472+
"""Tests for _find_geoloc_pair() using cf_xarray CF-convention detection."""
3473+
3474+
def test_finds_via_standard_name(self) -> None:
3475+
pytest.importorskip("cf_xarray")
3476+
from point_collocation.core.engine import _find_geoloc_pair
3477+
3478+
ds = xr.Dataset(
3479+
coords={
3480+
"myX": xr.DataArray(
3481+
[0.0], attrs={"standard_name": "longitude", "units": "degrees_east"}
3482+
),
3483+
"myY": xr.DataArray(
3484+
[0.0], attrs={"standard_name": "latitude", "units": "degrees_north"}
3485+
),
3486+
}
3487+
)
3488+
lon_name, lat_name = _find_geoloc_pair(ds)
3489+
assert lon_name == "myX"
3490+
assert lat_name == "myY"
3491+
3492+
def test_finds_via_units(self) -> None:
3493+
pytest.importorskip("cf_xarray")
3494+
from point_collocation.core.engine import _find_geoloc_pair
3495+
3496+
ds = xr.Dataset(
3497+
coords={
3498+
"mylon": xr.DataArray([0.0], attrs={"units": "degrees_east"}),
3499+
"mylat": xr.DataArray([0.0], attrs={"units": "degrees_north"}),
3500+
}
3501+
)
3502+
lon_name, lat_name = _find_geoloc_pair(ds)
3503+
assert lon_name == "mylon"
3504+
assert lat_name == "mylat"
3505+
3506+
def test_finds_via_long_name(self) -> None:
3507+
pytest.importorskip("cf_xarray")
3508+
from point_collocation.core.engine import _find_geoloc_pair
3509+
3510+
ds = xr.Dataset(
3511+
coords={
3512+
"x": xr.DataArray([0.0], attrs={"long_name": "longitude"}),
3513+
"y": xr.DataArray([0.0], attrs={"long_name": "latitude"}),
3514+
}
3515+
)
3516+
lon_name, lat_name = _find_geoloc_pair(ds)
3517+
assert lon_name == "x"
3518+
assert lat_name == "y"
3519+
3520+
def test_finds_cf_attrs_in_data_vars(self) -> None:
3521+
"""CF-detected geoloc vars stored as data_vars (not coords) should be found."""
3522+
pytest.importorskip("cf_xarray")
3523+
from point_collocation.core.engine import _find_geoloc_pair
3524+
3525+
ds = xr.Dataset(
3526+
{
3527+
"nav_lon": xr.DataArray(
3528+
[[0.0, 1.0]],
3529+
dims=["y", "x"],
3530+
attrs={"standard_name": "longitude"},
3531+
),
3532+
"nav_lat": xr.DataArray(
3533+
[[0.0, 0.0]],
3534+
dims=["y", "x"],
3535+
attrs={"standard_name": "latitude"},
3536+
),
3537+
"sst": xr.DataArray([[25.0, 26.0]], dims=["y", "x"]),
3538+
}
3539+
)
3540+
lon_name, lat_name = _find_geoloc_pair(ds)
3541+
assert lon_name == "nav_lon"
3542+
assert lat_name == "nav_lat"
3543+
3544+
def test_cf_ambiguous_raises(self) -> None:
3545+
"""Multiple CF-detected longitude vars should raise ambiguous error."""
3546+
pytest.importorskip("cf_xarray")
3547+
from point_collocation.core.engine import _find_geoloc_pair
3548+
3549+
ds = xr.Dataset(
3550+
coords={
3551+
"lon1": xr.DataArray([0.0], attrs={"standard_name": "longitude"}),
3552+
"lon2": xr.DataArray([0.0], attrs={"standard_name": "longitude"}),
3553+
"lat1": xr.DataArray([0.0], attrs={"standard_name": "latitude"}),
3554+
}
3555+
)
34403556
with pytest.raises(ValueError, match="ambiguous geolocation variables"):
34413557
_find_geoloc_pair(ds)
34423558

3559+
def test_cf_partial_detection_raises(self) -> None:
3560+
"""CF detects longitude but not latitude — should raise 'no geolocation'."""
3561+
pytest.importorskip("cf_xarray")
3562+
from point_collocation.core.engine import _find_geoloc_pair
3563+
3564+
ds = xr.Dataset(
3565+
coords={
3566+
"myX": xr.DataArray([0.0], attrs={"standard_name": "longitude"}),
3567+
"temperature": xr.DataArray([20.0]),
3568+
}
3569+
)
3570+
with pytest.raises(ValueError, match="no geolocation variables found"):
3571+
_find_geoloc_pair(ds)
3572+
34433573

34443574
class TestGeometryEnforcement:
34453575
"""Tests for _check_geometry()."""

0 commit comments

Comments
 (0)