diff --git a/changelog/486.feature.md b/changelog/486.feature.md new file mode 100644 index 000000000..f5b3dc6a4 --- /dev/null +++ b/changelog/486.feature.md @@ -0,0 +1,2 @@ +Added support for CMIP7 datasets including database tables, dataset adapters, and ESGF integration. +CMIP7 datasets can now be ingested and queried, with a proxy mechanism that translates CMIP7 requests to CMIP6 queries. diff --git a/packages/climate-ref-core/src/climate_ref_core/cmip6_to_cmip7.py b/packages/climate-ref-core/src/climate_ref_core/cmip6_to_cmip7.py index b1175fb6b..00a39e098 100644 --- a/packages/climate-ref-core/src/climate_ref_core/cmip6_to_cmip7.py +++ b/packages/climate-ref-core/src/climate_ref_core/cmip6_to_cmip7.py @@ -299,11 +299,16 @@ class CMIP7Metadata: This captures the additional/modified attributes needed for CMIP7 format. Based on CMIP7 Global Attributes V1.0 (DOI: 10.5281/zenodo.17250297). + + Mandatory attributes per spec: + - mip_era, region, drs_specs, data_specs_version, product, license_id + - temporal_label, vertical_label, horizontal_label, area_label + - branding_suffix (derived from labels) """ - # Required new attributes + # Required new attributes (mandatory per CMIP7 V1.0 spec) mip_era: str = "CMIP7" - region: str = "glb" + region: str = "glb" # lowercase per spec drs_specs: str = "MIP-DRS7" data_specs_version: str = "MIP-DS7.1.0.0" product: str = "model-output" @@ -609,3 +614,58 @@ def create_cmip7_path(attrs: dict[str, Any], version: str | None = None) -> str: version_str, ] return "/".join(str(c) for c in components) + + +def create_cmip7_instance_id(attrs: dict[str, Any]) -> str: + """ + Create a CMIP7 instance_id following MIP-DRS7 spec. + + The instance_id uniquely identifies a dataset using the DRS components + separated by periods. + + Parameters + ---------- + attrs + Dictionary containing CMIP7 attributes + + Returns + ------- + str + The CMIP7 instance_id + + Examples + -------- + >>> attrs = { + ... "drs_specs": "MIP-DRS7", + ... "mip_era": "CMIP7", + ... "activity_id": "CMIP", + ... "institution_id": "CCCma", + ... "source_id": "CanESM6-MR", + ... "experiment_id": "historical", + ... "variant_label": "r2i1p1f1", + ... "region": "glb", + ... "frequency": "mon", + ... "variable_id": "tas", + ... "branding_suffix": "tavg-h2m-hxy-u", + ... "grid_label": "g13s", + ... "version": "v20250622", + ... } + >>> create_cmip7_instance_id(attrs) + 'MIP-DRS7.CMIP7.CMIP.CCCma.CanESM6-MR.historical.r2i1p1f1.glb.mon.tas.tavg-h2m-hxy-u.g13s.v20250622' + """ + components = [ + attrs.get("drs_specs", "MIP-DRS7"), + attrs.get("mip_era", "CMIP7"), + attrs.get("activity_id", "CMIP"), + attrs.get("institution_id", ""), + attrs.get("source_id", ""), + attrs.get("experiment_id", ""), + attrs.get("variant_label", ""), + attrs.get("region", "glb"), + attrs.get("frequency", "mon"), + attrs.get("variable_id", ""), + attrs.get("branding_suffix", ""), + attrs.get("grid_label", "gn"), + attrs.get("version", "v1"), + ] + return ".".join(str(c) for c in components) diff --git a/packages/climate-ref-core/src/climate_ref_core/esgf/__init__.py b/packages/climate-ref-core/src/climate_ref_core/esgf/__init__.py index 99df77449..08b6dcfd9 100644 --- a/packages/climate-ref-core/src/climate_ref_core/esgf/__init__.py +++ b/packages/climate-ref-core/src/climate_ref_core/esgf/__init__.py @@ -7,11 +7,13 @@ from climate_ref_core.esgf.base import ESGFRequest, IntakeESGFMixin from climate_ref_core.esgf.cmip6 import CMIP6Request +from climate_ref_core.esgf.cmip7 import CMIP7Request from climate_ref_core.esgf.fetcher import ESGFFetcher from climate_ref_core.esgf.obs4mips import Obs4MIPsRequest __all__ = [ "CMIP6Request", + "CMIP7Request", "ESGFFetcher", "ESGFRequest", "IntakeESGFMixin", diff --git a/packages/climate-ref-core/src/climate_ref_core/esgf/cmip7.py b/packages/climate-ref-core/src/climate_ref_core/esgf/cmip7.py new file mode 100644 index 000000000..85bd9ac16 --- /dev/null +++ b/packages/climate-ref-core/src/climate_ref_core/esgf/cmip7.py @@ -0,0 +1,291 @@ +""" +CMIP7 dataset request implementation. + +This module provides a CMIP7Request class that wraps CMIP6 data from ESGF +and converts it to CMIP7 format for testing diagnostics with CMIP7-style data +before actual CMIP7 data becomes available on ESGF. +""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, Any, cast + +import pandas as pd +import xarray as xr +from loguru import logger + +from climate_ref_core.cmip6_to_cmip7 import ( + convert_cmip6_dataset, + create_cmip7_filename, + create_cmip7_instance_id, + create_cmip7_path, + get_branding_suffix, + get_frequency_from_table, + get_realm_from_table, +) +from climate_ref_core.esgf.cmip6 import CMIP6Request + +if TYPE_CHECKING: + import xarray as xr + + +def _convert_file( + cmip6_path: Path, + output_dir: Path, + rename_variables: bool = False, +) -> tuple[Path, dict[str, Any]] | None: + """ + Convert a single CMIP6 file to CMIP7 format. + + Parameters + ---------- + cmip6_path + Path to the CMIP6 file + output_dir + Root directory for converted CMIP7 files + rename_variables + If True, rename variables to CMIP7 branded format + + Returns + ------- + tuple[Path, dict[str, Any]] | None + Tuple of (cmip7_path, cmip7_attrs) or None if conversion failed + """ + try: + ds: xr.Dataset = xr.open_dataset(cmip6_path) + except Exception as e: + logger.warning(f"Failed to open {cmip6_path}: {e}") + return None + + try: + ds_cmip7 = convert_cmip6_dataset(ds, rename_variables=rename_variables) + + # Build output path using CMIP7 DRS + # TODO: extract time range from data for filename + cmip7_subpath = create_cmip7_path(ds_cmip7.attrs) + cmip7_filename = create_cmip7_filename(ds_cmip7.attrs, time_range=None) + cmip7_dir = output_dir / cmip7_subpath + cmip7_dir.mkdir(parents=True, exist_ok=True) + + cmip7_path = cmip7_dir / cmip7_filename + + # Only write if file doesn't already exist + if not cmip7_path.exists(): + ds_cmip7.to_netcdf(cmip7_path) + logger.debug(f"Wrote CMIP7 file: {cmip7_path}") + else: + logger.debug(f"CMIP7 file already exists: {cmip7_path}") + + return cmip7_path, dict(ds_cmip7.attrs) + + except Exception as e: + logger.warning(f"Failed to convert {cmip6_path}: {e}") + return None + finally: + ds.close() + + +class CMIP7Request: + """ + CMIP7 dataset request that wraps CMIP6 data with conversion. + + Fetches CMIP6 data from ESGF and converts to CMIP7 format. + Converted files are written to disk with CMIP7 directory structure. + + Parameters + ---------- + slug + Unique identifier for this request + facets + ESGF search facets (e.g., source_id, variable_id, experiment_id) + output_dir + Directory to write converted CMIP7 files. + Defaults to ~/.cache/climate-ref/cmip7 + remove_ensembles + If True, keep only one ensemble member per model + time_span + Optional time range filter (start, end) in YYYY-MM format + rename_variables + If True, rename variables to CMIP7 branded format (e.g., tas -> tas_tavg-h2m-hxy-u) + + Examples + -------- + >>> from climate_ref_core.esgf import CMIP7Request + >>> request = CMIP7Request( + ... slug="tas-historical", + ... facets={ + ... "source_id": "ACCESS-ESM1-5", + ... "experiment_id": "historical", + ... "variable_id": "tas", + ... "member_id": "r1i1p1f1", + ... "table_id": "Amon", + ... }, + ... time_span=("2000-01", "2014-12"), + ... ) + >>> df = request.fetch_datasets() + """ + + source_type = "CMIP7" + + def __init__( # noqa: PLR0913 + self, + slug: str, + facets: dict[str, Any], + output_dir: Path | None = None, + remove_ensembles: bool = False, + time_span: tuple[str, str] | None = None, + rename_variables: bool = False, + ): + self.slug = slug + self.facets = facets + self.output_dir = output_dir or Path.home() / ".cache" / "climate-ref" / "cmip7" + self.rename_variables = rename_variables + self.time_span = time_span + + # Internal CMIP6 request for actual fetching + self._cmip6_request = CMIP6Request( + slug=f"{slug}_cmip6_source", + facets=facets, + remove_ensembles=remove_ensembles, + time_span=time_span, + ) + + def __repr__(self) -> str: + return f"CMIP7Request(slug={self.slug!r}, facets={self.facets!r})" + + def fetch_datasets(self) -> pd.DataFrame: + """ + Fetch CMIP6 data from ESGF and convert to CMIP7 format. + + Returns + ------- + pd.DataFrame + DataFrame containing CMIP7 metadata and file paths. + Contains columns: key, files, path, source_type, and all CMIP7 metadata fields. + """ + # Fetch CMIP6 data + cmip6_df = self._cmip6_request.fetch_datasets() + + if cmip6_df.empty: + logger.warning(f"No CMIP6 datasets found for {self.slug}") + return cmip6_df + + converted_rows = [] + + for _, row in cmip6_df.iterrows(): + files_list = row.get("files", []) + if not files_list: + continue + + cmip7_files = [] + cmip7_attrs: dict[str, Any] | None = None + + for _file_path in files_list: + # Convert each CMIP6 file to CMIP7-like + file_path = Path(_file_path) + if not file_path.exists(): + logger.warning(f"CMIP6 file does not exist: {file_path}") + continue + + result = _convert_file(file_path, self.output_dir, self.rename_variables) + if result is not None: + cmip7_path, attrs = result + cmip7_files.append(str(cmip7_path)) + # Use attributes from first successful conversion + if cmip7_attrs is None: + cmip7_attrs = attrs + + if cmip7_files and cmip7_attrs is not None: + converted_row = self._build_cmip7_metadata(row, cmip7_files, cmip7_attrs) + converted_rows.append(converted_row) + + if not converted_rows: + logger.warning(f"No CMIP7 files converted for {self.slug}") + return pd.DataFrame() + + return pd.DataFrame(converted_rows) + + def _build_cmip7_metadata( + self, + cmip6_row: pd.Series, + cmip7_files: list[str], + cmip7_attrs: dict[str, Any], + ) -> dict[str, Any]: + """ + Build CMIP7 metadata row from CMIP6 source and converted attributes. + + Based on CMIP7 Global Attributes V1.0 (DOI: 10.5281/zenodo.17250297). + + Parameters + ---------- + cmip6_row + Original CMIP6 metadata row + cmip7_files + List of converted CMIP7 file paths + cmip7_attrs + Attributes from the converted CMIP7 dataset + + Returns + ------- + dict[str, Any] + CMIP7 metadata dictionary + """ + table_id = cmip6_row.get("table_id", "Amon") + variable_id = cmip6_row.get("variable_id", "tas") + branding_suffix = get_branding_suffix(variable_id) + branding_suffix_str = str(branding_suffix) + + # Start with CMIP6 row as base + result: dict[str, Any] = cast(dict[str, Any], cmip6_row.to_dict()) + + # Remove CMIP6-only fields + for field in ["member_id", "table_id", "grid", "sub_experiment", "sub_experiment_id"]: + result.pop(field, None) + + # Override with CMIP7-specific values per V1.0 spec + result.update( + { + "path": cmip7_files[0], # Primary path + "files": cmip7_files, + # Mandatory CMIP7 fields + "mip_era": "CMIP7", + "realm": get_realm_from_table(table_id), + "frequency": get_frequency_from_table(table_id), + "region": cmip7_attrs.get("region", "glb"), # lowercase per spec + "branding_suffix": branding_suffix_str, + "branded_variable": f"{variable_id}_{branding_suffix_str}", + "drs_specs": cmip7_attrs.get("drs_specs", "MIP-DRS7"), + "data_specs_version": cmip7_attrs.get("data_specs_version", "MIP-DS7.1.0.0"), + "product": cmip7_attrs.get("product", "model-output"), + "license_id": cmip7_attrs.get("license_id", "CC-BY-4.0"), + # Branding suffix components + "temporal_label": cmip7_attrs.get("temporal_label", branding_suffix.temporal_label), + "vertical_label": cmip7_attrs.get("vertical_label", branding_suffix.vertical_label), + "horizontal_label": cmip7_attrs.get("horizontal_label", branding_suffix.horizontal_label), + "area_label": cmip7_attrs.get("area_label", branding_suffix.area_label), + # Variant indices (convert from CMIP6 integers to CMIP7 strings) + "realization_index": cmip7_attrs.get("realization_index", ""), + "initialization_index": cmip7_attrs.get("initialization_index", ""), + "physics_index": cmip7_attrs.get("physics_index", ""), + "forcing_index": cmip7_attrs.get("forcing_index", ""), + # Optional fields + "nominal_resolution": cmip7_attrs.get("nominal_resolution", ""), + "tracking_id": cmip7_attrs.get("tracking_id", ""), + # Parent fields + "branch_time_in_child": cmip7_attrs.get("branch_time_in_child"), + "branch_time_in_parent": cmip7_attrs.get("branch_time_in_parent"), + "parent_activity_id": cmip7_attrs.get("parent_activity_id", ""), + "parent_experiment_id": cmip7_attrs.get("parent_experiment_id", ""), + "parent_mip_era": cmip7_attrs.get("parent_mip_era", ""), + "parent_source_id": cmip7_attrs.get("parent_source_id", ""), + "parent_time_units": cmip7_attrs.get("parent_time_units", ""), + "parent_variant_label": cmip7_attrs.get("parent_variant_label", ""), + "external_variables": cmip7_attrs.get("external_variables", ""), + } + ) + + # Generate CMIP7 instance_id + result["key"] = create_cmip7_instance_id(result) + + return result diff --git a/packages/climate-ref-core/tests/unit/esgf/test_cmip7.py b/packages/climate-ref-core/tests/unit/esgf/test_cmip7.py new file mode 100644 index 000000000..bdbda2231 --- /dev/null +++ b/packages/climate-ref-core/tests/unit/esgf/test_cmip7.py @@ -0,0 +1,210 @@ +from pathlib import Path +from unittest.mock import MagicMock, patch + +import numpy as np +import pandas as pd +import pytest +import xarray as xr + +from climate_ref_core.esgf.cmip7 import CMIP7Request, _convert_file + + +class TestCMIP7Request: + def test_init_default_output_dir(self): + request = CMIP7Request( + slug="test", + facets={"variable_id": "tas"}, + ) + assert request.output_dir == Path.home() / ".cache" / "climate-ref" / "cmip7" + + def test_init_custom_output_dir(self, tmp_path): + request = CMIP7Request( + slug="test", + facets={"variable_id": "tas"}, + output_dir=tmp_path / "cmip7", + ) + assert request.output_dir == tmp_path / "cmip7" + + def test_source_type(self): + request = CMIP7Request( + slug="test", + facets={"variable_id": "tas"}, + ) + assert request.source_type == "CMIP7" + + def test_repr(self): + request = CMIP7Request( + slug="test", + facets={"variable_id": "tas", "source_id": "ACCESS-ESM1-5"}, + ) + repr_str = repr(request) + assert "CMIP7Request" in repr_str + assert "test" in repr_str + + def test_internal_cmip6_request_created(self): + request = CMIP7Request( + slug="test-request", + facets={"variable_id": "tas", "experiment_id": "historical"}, + remove_ensembles=True, + time_span=("2000-01", "2010-12"), + ) + assert request._cmip6_request.slug == "test-request_cmip6_source" + assert request._cmip6_request.facets == {"variable_id": "tas", "experiment_id": "historical"} + assert request._cmip6_request.remove_ensembles is True + assert request._cmip6_request.time_span == ("2000-01", "2010-12") + + def test_build_cmip7_metadata(self): + request = CMIP7Request( + slug="test", + facets={"variable_id": "tas"}, + ) + + cmip6_row = pd.Series( + { + "variable_id": "tas", + "table_id": "Amon", + "source_id": "ACCESS-ESM1-5", + "experiment_id": "historical", + "variant_label": "r1i1p1f1", + "institution_id": "CSIRO", + "activity_id": "CMIP", + "grid_label": "gn", + "version": "v20191115", + } + ) + cmip7_files = ["/path/to/converted/file.nc"] + cmip7_attrs = { + "region": "glb", + "drs_specs": "MIP-DRS7", + "data_specs_version": "MIP-DS7.1.0.0", + "temporal_label": "tavg", + "vertical_label": "h2m", + "horizontal_label": "hxy", + "area_label": "u", + } + + result = request._build_cmip7_metadata(cmip6_row, cmip7_files, cmip7_attrs) + + # Core CMIP7 attributes per V1.0 spec + assert result["mip_era"] == "CMIP7" + assert result["realm"] == "atmos" # Converted from Amon table_id + assert result["frequency"] == "mon" # Extracted from Amon + assert result["region"] == "glb" # lowercase per CMIP7 spec + assert result["branding_suffix"] == "tavg-h2m-hxy-u" + assert result["branded_variable"] == "tas_tavg-h2m-hxy-u" + assert result["path"] == "/path/to/converted/file.nc" + assert result["files"] == cmip7_files + # Key should be CMIP7 instance_id format (MIP-DRS7.CMIP7....) + assert "MIP-DRS7.CMIP7" in result["key"] + # CMIP6-only fields should be removed + assert "table_id" not in result + assert "member_id" not in result + + @patch("climate_ref_core.esgf.cmip7.CMIP6Request") + def test_fetch_datasets_empty(self, mock_cmip6_request_cls): + mock_cmip6_request = MagicMock() + mock_cmip6_request.fetch_datasets.return_value = pd.DataFrame() + mock_cmip6_request_cls.return_value = mock_cmip6_request + + request = CMIP7Request( + slug="test", + facets={"variable_id": "tas"}, + ) + request._cmip6_request = mock_cmip6_request + + result = request.fetch_datasets() + + assert result.empty + + +class TestConvertFile: + def test_convert_file_missing_file(self, tmp_path): + result = _convert_file( + cmip6_path=tmp_path / "nonexistent.nc", + output_dir=tmp_path / "output", + rename_variables=False, + ) + assert result is None + + def test_convert_file_success(self, tmp_path, sample_cmip6_dataset): + # Write sample dataset to file + input_file = tmp_path / "input" / "tas_Amon_test.nc" + input_file.parent.mkdir(parents=True, exist_ok=True) + sample_cmip6_dataset.to_netcdf(input_file) + + output_dir = tmp_path / "output" + + result = _convert_file( + cmip6_path=input_file, + output_dir=output_dir, + rename_variables=False, + ) + + assert result is not None + cmip7_path, cmip7_attrs = result + assert cmip7_path.exists() + assert cmip7_attrs["mip_era"] == "CMIP7" + assert cmip7_attrs["branding_suffix"] == "tavg-h2m-hxy-u" + + # Verify the converted file has correct attributes + ds = xr.open_dataset(cmip7_path) + assert ds.attrs["mip_era"] == "CMIP7" + ds.close() + + def test_convert_file_already_exists(self, tmp_path, sample_cmip6_dataset): + # Write sample dataset to file + input_file = tmp_path / "input" / "tas_Amon_test.nc" + input_file.parent.mkdir(parents=True, exist_ok=True) + sample_cmip6_dataset.to_netcdf(input_file) + + output_dir = tmp_path / "output" + + # First conversion + result1 = _convert_file(input_file, output_dir, rename_variables=False) + assert result1 is not None + cmip7_path1, _ = result1 + mtime1 = cmip7_path1.stat().st_mtime + + # Second conversion - should skip writing + result2 = _convert_file(input_file, output_dir, rename_variables=False) + assert result2 is not None + cmip7_path2, _ = result2 + mtime2 = cmip7_path2.stat().st_mtime + + # File should not have been rewritten + assert mtime1 == mtime2 + + +@pytest.fixture +def sample_cmip6_dataset() -> xr.Dataset: + """Create a minimal CMIP6-style dataset for testing.""" + + time = np.arange(12) + lat = np.linspace(-90, 90, 5) + lon = np.linspace(0, 360, 10) + rng = np.random.default_rng(42) + + data = rng.random((len(time), len(lat), len(lon))) + + ds = xr.Dataset( + {"tas": (["time", "lat", "lon"], data)}, + coords={ + "time": time, + "lat": lat, + "lon": lon, + }, + attrs={ + "variable_id": "tas", + "table_id": "Amon", + "source_id": "ACCESS-ESM1-5", + "experiment_id": "historical", + "variant_label": "r1i1p1f1", + "member_id": "r1i1p1f1", + "institution_id": "CSIRO", + "activity_id": "CMIP", + "grid_label": "gn", + "version": "v20191115", + "Conventions": "CF-1.6", + }, + ) + return ds diff --git a/packages/climate-ref-core/tests/unit/test_cmip6_to_cmip7.py b/packages/climate-ref-core/tests/unit/test_cmip6_to_cmip7.py index a0d94802e..332338e42 100644 --- a/packages/climate-ref-core/tests/unit/test_cmip6_to_cmip7.py +++ b/packages/climate-ref-core/tests/unit/test_cmip6_to_cmip7.py @@ -12,6 +12,7 @@ convert_cmip6_to_cmip7_attrs, convert_variant_index, create_cmip7_filename, + create_cmip7_instance_id, create_cmip7_path, get_branding_suffix, get_cmip7_variable_name, @@ -434,3 +435,73 @@ class TestVariableBrandingCoverage: ) def test_common_variables_have_branding(self, variable_id: str): assert variable_id in VARIABLE_BRANDING + + +class TestCreateCmip7InstanceId: + """Test CMIP7 instance_id generation per MIP-DRS7 spec.""" + + def test_creates_valid_instance_id(self): + """Test instance_id generation with all attributes.""" + attrs = { + "drs_specs": "MIP-DRS7", + "mip_era": "CMIP7", + "activity_id": "CMIP", + "institution_id": "CCCma", + "source_id": "CanESM6-MR", + "experiment_id": "historical", + "variant_label": "r2i1p1f1", + "region": "glb", + "frequency": "mon", + "variable_id": "tas", + "branding_suffix": "tavg-h2m-hxy-u", + "grid_label": "g13s", + "version": "v20250622", + } + instance_id = create_cmip7_instance_id(attrs) + + expected = ( + "MIP-DRS7.CMIP7.CMIP.CCCma.CanESM6-MR.historical.r2i1p1f1." + "glb.mon.tas.tavg-h2m-hxy-u.g13s.v20250622" + ) + assert instance_id == expected + + def test_uses_defaults_for_missing_attributes(self): + """Test that defaults are used for missing attributes.""" + attrs = { + "institution_id": "TestInst", + "source_id": "TestModel", + "experiment_id": "piControl", + "variant_label": "r1i1p1f1", + "variable_id": "pr", + "branding_suffix": "tavg-u-hxy-u", + } + instance_id = create_cmip7_instance_id(attrs) + + # Should use defaults: MIP-DRS7, CMIP7, CMIP, glb, mon, gn, v1 + assert instance_id.startswith("MIP-DRS7.CMIP7.CMIP.") + assert ".glb." in instance_id + assert ".mon." in instance_id + assert instance_id.endswith(".gn.v1") + + def test_instance_id_matches_path_components(self): + """Test that instance_id components match path components.""" + attrs = { + "drs_specs": "MIP-DRS7", + "mip_era": "CMIP7", + "activity_id": "CMIP", + "institution_id": "CSIRO", + "source_id": "ACCESS-ESM1-5", + "experiment_id": "historical", + "variant_label": "r1i1p1f1", + "region": "glb", + "frequency": "mon", + "variable_id": "tas", + "branding_suffix": "tavg-h2m-hxy-u", + "grid_label": "gn", + "version": "v20240101", + } + instance_id = create_cmip7_instance_id(attrs) + path = create_cmip7_path(attrs) + + # Instance_id uses "." separator, path uses "/" + assert instance_id.replace(".", "/") == path diff --git a/packages/climate-ref/src/climate_ref/cli/test_cases.py b/packages/climate-ref/src/climate_ref/cli/test_cases.py index 1b33fed22..305ff6e8f 100644 --- a/packages/climate-ref/src/climate_ref/cli/test_cases.py +++ b/packages/climate-ref/src/climate_ref/cli/test_cases.py @@ -15,7 +15,12 @@ from rich.table import Table from climate_ref.config import Config -from climate_ref.datasets import CMIP6DatasetAdapter, DatasetAdapter, Obs4MIPsDatasetAdapter +from climate_ref.datasets import ( + CMIP6DatasetAdapter, + CMIP7DatasetAdapter, + DatasetAdapter, + Obs4MIPsDatasetAdapter, +) from climate_ref.provider_registry import ProviderRegistry from climate_ref.testing import TEST_DATA_DIR, TestCaseRunner, get_catalog_path from climate_ref_core.datasets import ExecutionDatasetCollection, SourceDatasetType @@ -110,8 +115,13 @@ def _fetch_and_build_catalog( if source_type == "CMIP6": data_catalog[SourceDatasetType.CMIP6] = _build_catalog(CMIP6DatasetAdapter(), file_paths) + elif source_type == "CMIP7": + data_catalog[SourceDatasetType.CMIP7] = _build_catalog(CMIP7DatasetAdapter(), file_paths) + elif source_type == "obs4MIPs": data_catalog[SourceDatasetType.obs4MIPs] = _build_catalog(Obs4MIPsDatasetAdapter(), file_paths) + else: + logger.warning(f"Unsupported source type for test case: {source_type}") if not data_catalog: raise DatasetResolutionError( diff --git a/packages/climate-ref/src/climate_ref/datasets/__init__.py b/packages/climate-ref/src/climate_ref/datasets/__init__.py index d074be4a0..1da62faf0 100644 --- a/packages/climate-ref/src/climate_ref/datasets/__init__.py +++ b/packages/climate-ref/src/climate_ref/datasets/__init__.py @@ -6,6 +6,7 @@ from climate_ref.datasets.base import DatasetAdapter from climate_ref.datasets.cmip6 import CMIP6DatasetAdapter +from climate_ref.datasets.cmip7 import CMIP7DatasetAdapter from climate_ref.datasets.obs4mips import Obs4MIPsDatasetAdapter from climate_ref.datasets.pmp_climatology import PMPClimatologyDatasetAdapter from climate_ref_core.datasets import SourceDatasetType @@ -27,6 +28,8 @@ def get_dataset_adapter(source_type: str, **kwargs: Any) -> DatasetAdapter: """ if source_type.lower() == SourceDatasetType.CMIP6.value: return CMIP6DatasetAdapter(**kwargs) + elif source_type.lower() == SourceDatasetType.CMIP7.value: + return CMIP7DatasetAdapter(**kwargs) elif source_type.lower() == SourceDatasetType.obs4MIPs.value.lower(): return Obs4MIPsDatasetAdapter(**kwargs) elif source_type.lower() == SourceDatasetType.PMPClimatology.value.lower(): @@ -37,6 +40,7 @@ def get_dataset_adapter(source_type: str, **kwargs: Any) -> DatasetAdapter: __all__ = [ "CMIP6DatasetAdapter", + "CMIP7DatasetAdapter", "DatasetAdapter", "Obs4MIPsDatasetAdapter", "PMPClimatologyDatasetAdapter", diff --git a/packages/climate-ref/src/climate_ref/datasets/cmip7.py b/packages/climate-ref/src/climate_ref/datasets/cmip7.py new file mode 100644 index 000000000..54a60aeeb --- /dev/null +++ b/packages/climate-ref/src/climate_ref/datasets/cmip7.py @@ -0,0 +1,307 @@ +""" +CMIP7 dataset adapter for parsing and registering CMIP7 datasets. +""" + +from __future__ import annotations + +import warnings +from datetime import datetime +from pathlib import Path +from typing import Any + +import pandas as pd +import xarray as xr +from ecgtools import Builder +from loguru import logger + +from climate_ref.config import Config +from climate_ref.datasets.base import DatasetAdapter, DatasetParsingFunction +from climate_ref.models.dataset import CMIP7Dataset +from climate_ref_core.cmip6_to_cmip7 import create_cmip7_instance_id + + +def _parse_datetime(dt_str: pd.Series[str]) -> pd.Series[datetime | Any]: + """ + Parse datetime strings to datetime objects. + """ + + def _inner(date_string: str | None) -> datetime | None: + if not date_string or pd.isnull(date_string): + return None + + for fmt in ("%Y-%m-%d", "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M:%S.%f"): + try: + return datetime.strptime(date_string, fmt) + except ValueError: + continue + + logger.error(f"Failed to parse date string: {date_string}") + return None + + return pd.Series( + [_inner(dt) for dt in dt_str], + index=dt_str.index, + dtype="object", + ) + + +def parse_cmip7_file(file: str, **kwargs: Any) -> dict[str, Any]: + """ + Parse a CMIP7 netCDF file and extract metadata. + + Parameters + ---------- + file + Path to the netCDF file + kwargs + Additional keyword arguments (unused, for protocol compatibility) + + Returns + ------- + dict[str, Any] + Dictionary of metadata extracted from the file + """ + try: + time_coder = xr.coders.CFDatetimeCoder(use_cftime=True) + ds = xr.open_dataset(file, decode_times=time_coder) + except Exception as e: + logger.warning(f"Failed to open {file}: {e}") + return {} + + attrs = ds.attrs + + # Extract time range + start_time = None + end_time = None + if "time" in ds.dims: + try: + # Use cf accessor if available, or fall back to direct access + start_time = str(ds.cf["T"][0].data) + end_time = str(ds.cf["T"][-1].data) + except (KeyError, AttributeError, ValueError): + time_values = ds["time"].values + if len(time_values) > 0: + start_time = str(time_values[0]) + end_time = str(time_values[-1]) + + ds.close() + + # Build metadata dictionary per CMIP7 V1.0 spec + variable_id = attrs.get("variable_id", "") + branding_suffix = attrs.get("branding_suffix", "") + branded_variable = attrs.get("branded_variable", "") + if not branded_variable and variable_id and branding_suffix: + branded_variable = f"{variable_id}_{branding_suffix}" + + result = { + "path": file, + "start_time": start_time, + "end_time": end_time, + # Core identification (mandatory) + "activity_id": attrs.get("activity_id", "CMIP"), + "institution_id": attrs.get("institution_id", ""), + "source_id": attrs.get("source_id", ""), + "experiment_id": attrs.get("experiment_id", ""), + "variant_label": attrs.get("variant_label", ""), + "variable_id": variable_id, + "grid_label": attrs.get("grid_label", "gn"), + "version": attrs.get("version", "v1"), + # CMIP7-specific mandatory fields + "mip_era": attrs.get("mip_era", "CMIP7"), + "region": attrs.get("region", "glb"), # lowercase per spec + "frequency": attrs.get("frequency", "mon"), + "branding_suffix": branding_suffix, + "branded_variable": branded_variable, + "temporal_label": attrs.get("temporal_label", ""), + "vertical_label": attrs.get("vertical_label", ""), + "horizontal_label": attrs.get("horizontal_label", ""), + "area_label": attrs.get("area_label", ""), + # DRS and spec info + "drs_specs": attrs.get("drs_specs", "MIP-DRS7"), + "data_specs_version": attrs.get("data_specs_version", "MIP-DS7.1.0.0"), + "product": attrs.get("product", "model-output"), + "license_id": attrs.get("license_id", "CC-BY-4.0"), + # Variant indices (CMIP7 uses prefixed strings) + "realization_index": attrs.get("realization_index", ""), + "initialization_index": attrs.get("initialization_index", ""), + "physics_index": attrs.get("physics_index", ""), + "forcing_index": attrs.get("forcing_index", ""), + # Optional metadata + "realm": attrs.get("realm", ""), + "nominal_resolution": attrs.get("nominal_resolution", ""), + "tracking_id": attrs.get("tracking_id", ""), + "standard_name": attrs.get("standard_name", ""), + "long_name": attrs.get("long_name", ""), + "units": attrs.get("units", ""), + # Parent fields (conditionally required) + "branch_time_in_child": attrs.get("branch_time_in_child"), + "branch_time_in_parent": attrs.get("branch_time_in_parent"), + "parent_activity_id": attrs.get("parent_activity_id", ""), + "parent_experiment_id": attrs.get("parent_experiment_id", ""), + "parent_mip_era": attrs.get("parent_mip_era", ""), + "parent_source_id": attrs.get("parent_source_id", ""), + "parent_time_units": attrs.get("parent_time_units", ""), + "parent_variant_label": attrs.get("parent_variant_label", ""), + "external_variables": attrs.get("external_variables", ""), + } + + return result + + +class CMIP7DatasetAdapter(DatasetAdapter): + """ + Adapter for CMIP7 datasets. + + CMIP7 datasets follow the MIP-DRS7 specification with additional + branding suffix information and new metadata fields. + """ + + dataset_cls = CMIP7Dataset + slug_column = "instance_id" + + dataset_specific_metadata = ( + # Core identification (mandatory per CMIP7 V1.0 spec) + "activity_id", + "institution_id", + "source_id", + "experiment_id", + "variant_label", + "variable_id", + "grid_label", + "version", + # CMIP7-specific mandatory fields + "mip_era", + "region", + "frequency", + "branding_suffix", + "branded_variable", + "temporal_label", + "vertical_label", + "horizontal_label", + "area_label", + # DRS and spec info + "drs_specs", + "data_specs_version", + "product", + "license_id", + # Variant indices (CMIP7 uses prefixed strings) + "realization_index", + "initialization_index", + "physics_index", + "forcing_index", + # Optional metadata + "realm", + "nominal_resolution", + "tracking_id", + "standard_name", + "long_name", + "units", + # Parent fields (conditionally required) + "branch_time_in_child", + "branch_time_in_parent", + "parent_activity_id", + "parent_experiment_id", + "parent_mip_era", + "parent_source_id", + "parent_time_units", + "parent_variant_label", + "external_variables", + # System fields + "finalised", + slug_column, + ) + + file_specific_metadata = ("start_time", "end_time", "path") + + version_metadata = "version" + + # CMIP7 DRS components (excluding version) per MIP-DRS7 spec + # Order matches directory structure: activity/institution/source/experiment/ + # variant/region/frequency/variable/branding/grid + dataset_id_metadata = ( + "activity_id", + "institution_id", + "source_id", + "experiment_id", + "variant_label", + "region", + "frequency", + "variable_id", + "branding_suffix", + "grid_label", + ) + + def __init__(self, n_jobs: int = 1, config: Config | None = None): + self.n_jobs = n_jobs + self.config = config or Config.default() + + def get_parsing_function(self) -> DatasetParsingFunction: + """ + Get the parsing function for CMIP7 datasets. + + Returns + ------- + DatasetParsingFunction + The CMIP7 file parsing function + """ + return parse_cmip7_file + + def find_local_datasets(self, file_or_directory: Path) -> pd.DataFrame: + """ + Generate a data catalog from the specified file or directory. + + Parameters + ---------- + file_or_directory + File or directory containing CMIP7 datasets + + Returns + ------- + pd.DataFrame + Data catalog containing metadata for the datasets + """ + parsing_function = self.get_parsing_function() + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", DeprecationWarning) + + builder = Builder( + paths=[str(file_or_directory)], + depth=15, # CMIP7 DRS is deeper than CMIP6 + include_patterns=["*.nc"], + joblib_parallel_kwargs={"n_jobs": self.n_jobs}, + ) + + # Check if there are any assets before building + builder.get_assets() # type: ignore[attr-defined] + if not builder.assets: # type: ignore[attr-defined] + return pd.DataFrame(columns=self.dataset_specific_metadata + self.file_specific_metadata) + + builder.build(parsing_func=parsing_function) + + datasets: pd.DataFrame = builder.df + + if datasets.empty: + return pd.DataFrame(columns=self.dataset_specific_metadata + self.file_specific_metadata) + + # Convert time columns + if "start_time" in datasets.columns: + datasets["start_time"] = _parse_datetime(datasets["start_time"]) + if "end_time" in datasets.columns: + datasets["end_time"] = _parse_datetime(datasets["end_time"]) + + # Generate instance_id + datasets["instance_id"] = datasets.apply(lambda row: create_cmip7_instance_id(row.to_dict()), axis=1) + + # Ensure finalised column exists + if "finalised" not in datasets.columns: + datasets["finalised"] = True + + # Add any missing metadata columns + missing_columns = set(self.dataset_specific_metadata + self.file_specific_metadata) - set( + datasets.columns + ) + for column in missing_columns: + datasets[column] = pd.NA + + return datasets diff --git a/packages/climate-ref/src/climate_ref/migrations/versions/2026-01-13T2038_ed30a1d12209_add_cmip7_tables.py b/packages/climate-ref/src/climate_ref/migrations/versions/2026-01-13T2038_ed30a1d12209_add_cmip7_tables.py new file mode 100644 index 000000000..90a6f93b0 --- /dev/null +++ b/packages/climate-ref/src/climate_ref/migrations/versions/2026-01-13T2038_ed30a1d12209_add_cmip7_tables.py @@ -0,0 +1,87 @@ +"""add CMIP7 tables + +Revision ID: ed30a1d12209 +Revises: 20cd136a5b04 +Create Date: 2026-01-13 20:38:09.188101 + +""" + +from collections.abc import Sequence +from typing import Union + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "ed30a1d12209" +down_revision: Union[str, None] = "20cd136a5b04" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.create_table( + "cmip7_dataset", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("activity_id", sa.String(), nullable=False), + sa.Column("institution_id", sa.String(), nullable=False), + sa.Column("source_id", sa.String(), nullable=False), + sa.Column("experiment_id", sa.String(), nullable=False), + sa.Column("variant_label", sa.String(), nullable=False), + sa.Column("variable_id", sa.String(), nullable=False), + sa.Column("grid_label", sa.String(), nullable=False), + sa.Column("version", sa.String(), nullable=False), + sa.Column("mip_era", sa.String(), nullable=False), + sa.Column("region", sa.String(), nullable=False), + sa.Column("frequency", sa.String(), nullable=False), + sa.Column("branding_suffix", sa.String(), nullable=False), + sa.Column("branded_variable", sa.String(), nullable=False), + sa.Column("temporal_label", sa.String(), nullable=True), + sa.Column("vertical_label", sa.String(), nullable=True), + sa.Column("horizontal_label", sa.String(), nullable=True), + sa.Column("area_label", sa.String(), nullable=True), + sa.Column("drs_specs", sa.String(), nullable=False), + sa.Column("data_specs_version", sa.String(), nullable=False), + sa.Column("product", sa.String(), nullable=False), + sa.Column("license_id", sa.String(), nullable=False), + sa.Column("realization_index", sa.String(), nullable=True), + sa.Column("initialization_index", sa.String(), nullable=True), + sa.Column("physics_index", sa.String(), nullable=True), + sa.Column("forcing_index", sa.String(), nullable=True), + sa.Column("realm", sa.String(), nullable=True), + sa.Column("nominal_resolution", sa.String(), nullable=True), + sa.Column("tracking_id", sa.String(), nullable=True), + sa.Column("standard_name", sa.String(), nullable=True), + sa.Column("long_name", sa.String(), nullable=True), + sa.Column("units", sa.String(), nullable=True), + sa.Column("branch_time_in_child", sa.Float(), nullable=True), + sa.Column("branch_time_in_parent", sa.Float(), nullable=True), + sa.Column("parent_activity_id", sa.String(), nullable=True), + sa.Column("parent_experiment_id", sa.String(), nullable=True), + sa.Column("parent_mip_era", sa.String(), nullable=True), + sa.Column("parent_source_id", sa.String(), nullable=True), + sa.Column("parent_time_units", sa.String(), nullable=True), + sa.Column("parent_variant_label", sa.String(), nullable=True), + sa.Column("external_variables", sa.String(), nullable=True), + sa.Column("instance_id", sa.String(), nullable=False), + sa.ForeignKeyConstraint(["id"], ["dataset.id"], name=op.f("fk_cmip7_dataset_id_dataset")), + sa.PrimaryKeyConstraint("id", name=op.f("pk_cmip7_dataset")), + ) + with op.batch_alter_table("cmip7_dataset", schema=None) as batch_op: + batch_op.create_index(batch_op.f("ix_cmip7_dataset_experiment_id"), ["experiment_id"], unique=False) + batch_op.create_index(batch_op.f("ix_cmip7_dataset_instance_id"), ["instance_id"], unique=False) + batch_op.create_index(batch_op.f("ix_cmip7_dataset_source_id"), ["source_id"], unique=False) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("cmip7_dataset", schema=None) as batch_op: + batch_op.drop_index(batch_op.f("ix_cmip7_dataset_source_id")) + batch_op.drop_index(batch_op.f("ix_cmip7_dataset_instance_id")) + batch_op.drop_index(batch_op.f("ix_cmip7_dataset_experiment_id")) + + op.drop_table("cmip7_dataset") + # ### end Alembic commands ### diff --git a/packages/climate-ref/src/climate_ref/models/dataset.py b/packages/climate-ref/src/climate_ref/models/dataset.py index b488dc342..5350746a8 100644 --- a/packages/climate-ref/src/climate_ref/models/dataset.py +++ b/packages/climate-ref/src/climate_ref/models/dataset.py @@ -148,6 +148,80 @@ class CMIP6Dataset(Dataset): __mapper_args__: ClassVar[Any] = {"polymorphic_identity": SourceDatasetType.CMIP6} # type: ignore +class CMIP7Dataset(Dataset): + """ + Represents a CMIP7 dataset. + + Based on CMIP7 Global Attributes V1.0 (DOI: 10.5281/zenodo.17250297). + CMIP7 datasets have branding suffix components, variant indices as strings, + and additional DRS version information. + """ + + __tablename__ = "cmip7_dataset" + id: Mapped[int] = mapped_column(ForeignKey("dataset.id"), primary_key=True) + + # Core identification (mandatory per CMIP7 spec) + activity_id: Mapped[str] = mapped_column() + institution_id: Mapped[str] = mapped_column() + source_id: Mapped[str] = mapped_column(index=True) + experiment_id: Mapped[str] = mapped_column(index=True) + variant_label: Mapped[str] = mapped_column() + variable_id: Mapped[str] = mapped_column() + grid_label: Mapped[str] = mapped_column() + version: Mapped[str] = mapped_column() + + # CMIP7-specific mandatory fields + mip_era: Mapped[str] = mapped_column(default="CMIP7") + region: Mapped[str] = mapped_column(default="glb") # lowercase per spec + frequency: Mapped[str] = mapped_column() + branding_suffix: Mapped[str] = mapped_column() + branded_variable: Mapped[str] = mapped_column() + + # Branding suffix components (mandatory per spec) + temporal_label: Mapped[str] = mapped_column(nullable=True) + vertical_label: Mapped[str] = mapped_column(nullable=True) + horizontal_label: Mapped[str] = mapped_column(nullable=True) + area_label: Mapped[str] = mapped_column(nullable=True) + + # DRS and spec info (mandatory per spec) + drs_specs: Mapped[str] = mapped_column(default="MIP-DRS7") + data_specs_version: Mapped[str] = mapped_column(default="MIP-DS7.1.0.0") + product: Mapped[str] = mapped_column(default="model-output") + license_id: Mapped[str] = mapped_column(default="CC-BY-4.0") + + # Variant indices (CMIP7 uses prefixed strings, e.g., "r1", "i1", "p1", "f1") + realization_index: Mapped[str] = mapped_column(nullable=True) + initialization_index: Mapped[str] = mapped_column(nullable=True) + physics_index: Mapped[str] = mapped_column(nullable=True) + forcing_index: Mapped[str] = mapped_column(nullable=True) + + # Optional metadata + realm: Mapped[str] = mapped_column(nullable=True) + nominal_resolution: Mapped[str] = mapped_column(nullable=True) + tracking_id: Mapped[str] = mapped_column(nullable=True) + standard_name: Mapped[str] = mapped_column(nullable=True) + long_name: Mapped[str] = mapped_column(nullable=True) + units: Mapped[str] = mapped_column(nullable=True) + + # Conditionally required parent fields (when parent exists) + branch_time_in_child: Mapped[float] = mapped_column(nullable=True) + branch_time_in_parent: Mapped[float] = mapped_column(nullable=True) + parent_activity_id: Mapped[str] = mapped_column(nullable=True) + parent_experiment_id: Mapped[str] = mapped_column(nullable=True) + parent_mip_era: Mapped[str] = mapped_column(nullable=True) + parent_source_id: Mapped[str] = mapped_column(nullable=True) + parent_time_units: Mapped[str] = mapped_column(nullable=True) + parent_variant_label: Mapped[str] = mapped_column(nullable=True) + external_variables: Mapped[str] = mapped_column(nullable=True) + + instance_id: Mapped[str] = mapped_column(index=True) + """ + Unique identifier for the dataset (CMIP7 DRS format). + """ + + __mapper_args__: ClassVar[Any] = {"polymorphic_identity": SourceDatasetType.CMIP7} # type: ignore + + class Obs4MIPsDataset(Dataset): """ Represents a obs4mips dataset diff --git a/packages/climate-ref/src/climate_ref/solver.py b/packages/climate-ref/src/climate_ref/solver.py index 15f8df1b1..277d3a324 100644 --- a/packages/climate-ref/src/climate_ref/solver.py +++ b/packages/climate-ref/src/climate_ref/solver.py @@ -15,10 +15,13 @@ from climate_ref.config import Config from climate_ref.database import Database -from climate_ref.datasets import get_dataset_adapter -from climate_ref.datasets.cmip6 import CMIP6DatasetAdapter -from climate_ref.datasets.obs4mips import Obs4MIPsDatasetAdapter -from climate_ref.datasets.pmp_climatology import PMPClimatologyDatasetAdapter +from climate_ref.datasets import ( + CMIP6DatasetAdapter, + CMIP7DatasetAdapter, + Obs4MIPsDatasetAdapter, + PMPClimatologyDatasetAdapter, + get_dataset_adapter, +) from climate_ref.models import Diagnostic as DiagnosticModel from climate_ref.models import ExecutionGroup from climate_ref.models import Provider as ProviderModel @@ -324,6 +327,7 @@ def build_from_db(config: Config, db: Database) -> "ExecutionSolver": provider_registry=ProviderRegistry.build_from_config(config, db), data_catalog={ SourceDatasetType.CMIP6: CMIP6DatasetAdapter().load_catalog(db), + SourceDatasetType.CMIP7: CMIP7DatasetAdapter().load_catalog(db), SourceDatasetType.obs4MIPs: Obs4MIPsDatasetAdapter().load_catalog(db), SourceDatasetType.PMPClimatology: PMPClimatologyDatasetAdapter().load_catalog(db), }, diff --git a/packages/climate-ref/tests/unit/datasets/test_cmip7.py b/packages/climate-ref/tests/unit/datasets/test_cmip7.py new file mode 100644 index 000000000..69e7b184c --- /dev/null +++ b/packages/climate-ref/tests/unit/datasets/test_cmip7.py @@ -0,0 +1,155 @@ +"""Tests for CMIP7DatasetAdapter.""" + +import numpy as np +import pandas as pd +import pytest +import xarray as xr + +from climate_ref.datasets.cmip7 import CMIP7DatasetAdapter, parse_cmip7_file +from climate_ref_core.cmip6_to_cmip7 import convert_cmip6_dataset, create_cmip7_path + + +@pytest.fixture +def sample_cmip7_dataset(): + """Create a minimal CMIP7-style dataset for testing.""" + time = np.arange(12) + lat = np.linspace(-90, 90, 5) + lon = np.linspace(0, 360, 10) + rng = np.random.default_rng(42) + + data = rng.random((len(time), len(lat), len(lon))) + + # Create a CMIP6 dataset and convert it to CMIP7 + ds_cmip6 = xr.Dataset( + {"tas": (["time", "lat", "lon"], data)}, + coords={ + "time": pd.date_range("2000-01-01", periods=12, freq="ME"), + "lat": lat, + "lon": lon, + }, + attrs={ + "variable_id": "tas", + "table_id": "Amon", + "source_id": "ACCESS-ESM1-5", + "experiment_id": "historical", + "variant_label": "r1i1p1f1", + "member_id": "r1i1p1f1", + "institution_id": "CSIRO", + "activity_id": "CMIP", + "grid_label": "gn", + "version": "v20191115", + "Conventions": "CF-1.6", + }, + ) + + ds_cmip7 = convert_cmip6_dataset(ds_cmip6, rename_variables=False) + return ds_cmip7 + + +@pytest.fixture +def cmip7_file(tmp_path, sample_cmip7_dataset): + """Write a CMIP7 dataset to a file.""" + # Build CMIP7 DRS path + cmip7_subpath = create_cmip7_path(sample_cmip7_dataset.attrs) + cmip7_dir = tmp_path / cmip7_subpath + cmip7_dir.mkdir(parents=True, exist_ok=True) + + filepath = cmip7_dir / "tas_Amon_ACCESS-ESM1-5_historical_r1i1p1f1_gn_200001-200012.nc" + sample_cmip7_dataset.to_netcdf(filepath) + return filepath + + +class TestParseCmip7File: + def test_parse_valid_file(self, cmip7_file, sample_cmip7_dataset): + result = parse_cmip7_file(str(cmip7_file)) + + assert result["path"] == str(cmip7_file) + assert result["mip_era"] == "CMIP7" + assert result["variable_id"] == "tas" + assert result["source_id"] == "ACCESS-ESM1-5" + assert result["experiment_id"] == "historical" + assert result["variant_label"] == "r1i1p1f1" + assert result["institution_id"] == "CSIRO" + assert result["activity_id"] == "CMIP" + assert result["grid_label"] == "gn" + assert result["region"] == "glb" # lowercase per CMIP7 V1.0 spec + assert "branding_suffix" in result + assert "branded_variable" in result + assert result["start_time"] is not None + assert result["end_time"] is not None + # New mandatory CMIP7 fields + assert result["drs_specs"] == "MIP-DRS7" + assert result["data_specs_version"] == "MIP-DS7.1.0.0" + assert result["product"] == "model-output" + assert result["license_id"] == "CC-BY-4.0" + + def test_parse_missing_file(self, tmp_path): + result = parse_cmip7_file(str(tmp_path / "nonexistent.nc")) + assert result == {} + + +class TestCMIP7DatasetAdapter: + def test_adapter_attributes(self): + adapter = CMIP7DatasetAdapter() + assert adapter.slug_column == "instance_id" + assert adapter.version_metadata == "version" + assert "mip_era" in adapter.dataset_specific_metadata + assert "region" in adapter.dataset_specific_metadata + assert "branding_suffix" in adapter.dataset_specific_metadata + + def test_catalog_empty(self, db): + adapter = CMIP7DatasetAdapter() + df = adapter.load_catalog(db) + assert df.empty + + def test_find_local_datasets(self, tmp_path, cmip7_file): + adapter = CMIP7DatasetAdapter() + df = adapter.find_local_datasets(tmp_path) + + assert len(df) == 1 + assert df["mip_era"].iloc[0] == "CMIP7" + assert df["variable_id"].iloc[0] == "tas" + assert df["source_id"].iloc[0] == "ACCESS-ESM1-5" + assert "instance_id" in df.columns + # Instance_id follows MIP-DRS7 format per CMIP7 V1.0 spec + assert df["instance_id"].iloc[0].startswith("MIP-DRS7.CMIP7.") + + def test_find_local_datasets_empty_dir(self, tmp_path): + adapter = CMIP7DatasetAdapter() + df = adapter.find_local_datasets(tmp_path) + assert df.empty + + def test_dataset_id_metadata(self): + adapter = CMIP7DatasetAdapter() + # CMIP7 DRS components per MIP-DRS7 spec (using variant_label, not member_id) + expected = ( + "activity_id", + "institution_id", + "source_id", + "experiment_id", + "variant_label", + "region", + "frequency", + "variable_id", + "branding_suffix", + "grid_label", + ) + assert adapter.dataset_id_metadata == expected + + def test_register_and_load_dataset(self, config, db, tmp_path, cmip7_file): + adapter = CMIP7DatasetAdapter() + + # Find local datasets + data_catalog = adapter.find_local_datasets(tmp_path) + assert len(data_catalog) == 1 + + # Register the dataset + with db.session.begin(): + for instance_id, data_catalog_dataset in data_catalog.groupby(adapter.slug_column): + adapter.register_dataset(config, db, data_catalog_dataset) + + # Load and verify + loaded_catalog = adapter.load_catalog(db) + assert len(loaded_catalog) == 1 + assert loaded_catalog["mip_era"].iloc[0] == "CMIP7" + assert loaded_catalog["variable_id"].iloc[0] == "tas"