diff --git a/src/scmdata/netcdf.py b/src/scmdata/netcdf.py
index 12edd964..1f4d7aa9 100644
--- a/src/scmdata/netcdf.py
+++ b/src/scmdata/netcdf.py
@@ -16,6 +16,7 @@ from logging import getLogger

 import numpy as np
+from xarray.coding.times import decode_cf_datetime, encode_cf_datetime

 from . import __version__

@@ -78,22 +79,46 @@ def _get_nc_type(np_type):
     return {"datatype": str, "fill_value": None}


-def _write_nc(ds, df, dimensions, extras):
+def _create_time_variable(ds, run):
+    """
+    Create a CF-compliant time variable
+
+    Note that the CF conventions dictate the use of "units" for this variable,
+    rather than the "unit" we use elsewhere
+    """
+    ds.createDimension("time", run.shape[1])
+    ds.createVariable(
+        "time", "i8", "time",
+    )
+
+    num, units, calendar = encode_cf_datetime(run.times)
+    ds.variables["time"][:] = num
+    ds.variables["time"].setncatts({"calendar": calendar, "units": units})
+
+
+def _read_time_variable(time_var):
+    # If times use the f8 datatype, convert to datetime64[s]
+    if time_var.dtype == np.dtype("f8"):
+        return time_var[:].astype("datetime64[s]")
+    else:
+        # Use CF-compliant time handling
+        attrs = time_var.ncattrs()
+        units = time_var.units if "units" in attrs else None
+        calendar = time_var.calendar if "calendar" in attrs else None
+
+        return decode_cf_datetime(time_var[:], units, calendar)
+
+
+def _write_nc(ds, run, dimensions, extras):
     """
     Low level function to write the dimensions, variables and metadata to disk
     """
     all_dims = list(dimensions) + ["time"]

-    # Create the dimensions
-    ds.createDimension("time", len(df.time_points))
-    ds.createVariable(
-        "time", "f8", "time",
-    )
-    ds.variables["time"][:] = df.time_points.values
+    _create_time_variable(ds, run)

     dims = {}
     for d in dimensions:
-        vals = sorted(df.meta[d].unique())
+        vals = sorted(run.meta[d].unique())
         if not all([isinstance(v, str) for v in vals]) and np.isnan(vals).any():
             raise AssertionError("nan in dimension: `{}`".format(d))

@@ -104,11 +129,11 @@ def _write_nc(ds, df, dimensions, extras):
             ds.variables[d][i] = v
         dims[d] = np.asarray(vals)

-    var_shape = [len(dims[d]) for d in dimensions] + [len(df.time_points)]
+    var_shape = [len(dims[d]) for d in dimensions] + [run.shape[1]]

     # Write any extra variables
     for e in extras:
-        metadata = df.meta[[e, *dimensions]].drop_duplicates()
+        metadata = run.meta[[e, *dimensions]].drop_duplicates()

         if metadata[dimensions].duplicated().any():
             raise ValueError(
@@ -132,7 +157,7 @@ def _write_nc(ds, df, dimensions, extras):

         ds.variables[e][:] = data_to_write

-    for var_df in df.groupby("variable"):
+    for var_df in run.groupby("variable"):
         v = var_df.get_unique_meta("variable", True)
         meta = var_df.meta.copy().drop("variable", axis=1)

@@ -173,7 +198,7 @@

 def _read_nc(cls, ds):
     dims = {d: ds.variables[d][:] for d in ds.dimensions}
-    dims["time"] = dims["time"].astype("datetime64[s]")
+    dims["time"] = _read_time_variable(ds.variables["time"])

     data = []
     columns = defaultdict(list)
diff --git a/src/scmdata/ops.py b/src/scmdata/ops.py
index cd89765e..27a9064e 100644
--- a/src/scmdata/ops.py
+++ b/src/scmdata/ops.py
@@ -825,9 +825,7 @@ def linear_regression_scmrun(self):

 def _calculate_linear_regression(in_scmrun):
     time_unit = "s"
-    times_numpy = in_scmrun.time_points.values.astype(
-        "datetime64[{}]".format(time_unit)
-    )
+    times_numpy = in_scmrun.times.to_numpy().astype("datetime64[{}]".format(time_unit))
     times_in_s = times_numpy.astype("int")

     ts = in_scmrun.timeseries()
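For reviewers unfamiliar with xarray's CF time coders, below is a minimal sketch of the round trip that `_create_time_variable` and `_read_time_variable` now rely on instead of raw epoch seconds. The example values are illustrative, and the exact inferred `units` string can vary between xarray versions:

```python
import numpy as np
from xarray.coding.times import decode_cf_datetime, encode_cf_datetime

# Encode: datetimes -> integer offsets plus CF "units"/"calendar" attributes
times = np.array(["2000-01-01", "2010-01-01"], dtype="datetime64[s]")
num, units, calendar = encode_cf_datetime(times)
# num is something like array([0, 3653]), units something like
# "days since 2000-01-01", calendar "proleptic_gregorian"

# Decode: offsets plus attributes -> datetimes again
roundtripped = decode_cf_datetime(num, units, calendar)
```

Storing the offsets as `i8` alongside `units`/`calendar` attributes is what makes the files readable by other CF-aware tools, and is also what allows non-standard calendars and out-of-range dates to survive serialisation.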
diff --git a/src/scmdata/run.py b/src/scmdata/run.py
index 9dc1d3f3..cb533993 100644
--- a/src/scmdata/run.py
+++ b/src/scmdata/run.py
@@ -13,12 +13,14 @@ from logging import getLogger
 from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union

+import cftime
 import numpy as np
 import numpy.testing as npt
 import openscm_units.unit_registry as ur
 import pandas as pd
 import pint
 from dateutil import parser
+from xarray import CFTimeIndex
 from xarray.core.ops import inject_binary_ops

 from .errors import MissingRequiredColumnError, NonUniqueMetadataError
@@ -37,7 +39,12 @@
 from .ops import inject_ops_methods
 from .plotting import inject_plotting_methods
 from .pyam_compat import IamDataFrame, LongDatetimeIamDataFrame
-from .time import _TARGET_DTYPE, TimePoints, TimeseriesConverter
+from .time import (
+    _TARGET_DTYPE,
+    TimePoints,
+    TimeseriesConverter,
+    decode_datetimes_to_index,
+)
 from .units import UnitConverter

 _logger = getLogger(__name__)
@@ -202,7 +209,7 @@ def _format_wide_data(df, required_cols):
     time_cols, extra_cols = False, []
     for i in cols:
         # if in wide format, check if columns are years (int) or datetime
-        if isinstance(i, dt.datetime):
+        if isinstance(i, (dt.datetime, cftime.datetime)):
             time_cols = True
         else:
             try:
@@ -235,10 +242,7 @@

 def _from_ts(
-    df: Any,
-    required_cols: Tuple[str],
-    index: Any = None,
-    **columns: Union[str, bool, float, int, List],
+    df: Any, required_cols: Tuple[str], **columns: Union[str, bool, float, int, List],
 ) -> Tuple[pd.DataFrame, pd.DataFrame]:
     """
     Prepare data to initialize :class:`ScmRun` from wide timeseries.
     """
     if not isinstance(df, pd.DataFrame):
         df = pd.DataFrame(df)
-    if index is not None:
-        if isinstance(index, np.ndarray):
-            df.index = TimePoints(index).to_index()
-        elif isinstance(index, TimePoints):
-            df.index = index.to_index()
-        else:
-            df.index = index

     # format columns to lower-case and check that all required columns exist
     if not set(required_cols).issubset(columns.keys()):
         missing = list(set(required_cols) - set(columns.keys()))
         raise MissingRequiredColumnError(missing)

-    df.index.name = "time"
-
     num_ts = len(df.columns)
     for c_name, col in columns.items():
         col_list = (
@@ -423,7 +418,6 @@ def __init__(
         if isinstance(data, ScmRun):
             self._df = data._df.copy() if copy_data else data._df
             self._meta = data._meta
-            self._time_points = TimePoints(data.time_points.values)
             if metadata is None:
                 metadata = data.metadata.copy()
             else:
@@ -451,9 +445,7 @@ def _init_timeseries(
                 raise ValueError("`index` argument is required")

             if columns is not None:
-                (_df, _meta) = _from_ts(
-                    data, index=index, required_cols=self.required_cols, **columns
-                )
+                (_df, _meta) = _from_ts(data, required_cols=self.required_cols, **columns)
             elif isinstance(data, (pd.DataFrame, pd.Series)):
                 (_df, _meta) = _format_data(data, self.required_cols)
             elif (IamDataFrame is not None) and isinstance(data, IamDataFrame):
@@ -472,13 +464,17 @@
             (_df, _meta) = _read_file(data, required_cols=self.required_cols, **kwargs)

-        self._time_points = TimePoints(_df.index.values)
+        self._df = _df.astype(float)

-        _df = _df.astype(float)
-        self._df = _df
-        self._df.index = self._time_points.to_index()
+        self._set_time_index(index if index is not None else self._df.index)

         self._meta = pd.MultiIndex.from_frame(_meta.astype("category"))

+    def _set_time_index(self, index):
+        if not isinstance(index, (pd.DatetimeIndex, CFTimeIndex)):
+            index = decode_datetimes_to_index(index)
+        self._df.index = index
+        self.time_points = TimePoints(self.times.to_numpy().astype("datetime64[s]"))
+
     def copy(self):
         """
         Return a :func:`copy.deepcopy` of self.
@@ -514,9 +510,9 @@ def __getitem__(self, key: Any) -> Any:
             [key] if isinstance(key, str) or not isinstance(key, Iterable) else key
         )
         if key == "time":
-            return pd.Series(self._time_points.to_index(), dtype="object")
+            return pd.Series(self.times, dtype="object")  # converted to datetimes
         if key == "year":
-            return pd.Series(self._time_points.years())
+            return pd.Series(self.times.year)
         if set(_key_check).issubset(self.meta_attributes):
             try:
                 return self._meta_column(key).astype(
@@ -600,8 +596,7 @@ def __setitem__(
         """
         meta = np.atleast_1d(value)
         if key == "time":
-            self._time_points = TimePoints(meta)
-            self._df.index = self._time_points.to_index()
+            self._set_time_index(meta)
         else:
             if len(meta) == 1:
                 new_meta = self._meta.to_frame()
@@ -628,12 +623,12 @@ def _indent(s):

         meta_str = _indent(self.meta.__repr__())
         time_str = [
-            "Start: {}".format(self.time_points.values[0]),
-            "End: {}".format(self.time_points.values[-1]),
+            "Start: {}".format(self.times.values[0]),
+            "End: {}".format(self.times.values[-1]),
         ]
         time_str = _indent("\n".join(time_str))
         return "<ScmRun (timeseries: {}, timepoints: {})>\nTime:\n{}\nMeta:\n{}".format(
             len(self), len(self.times), time_str, meta_str
         )

     @staticmethod
@@ -744,15 +739,8 @@ def meta_attributes(self):
         return sorted(list(self._meta.names))

     @property
-    def time_points(self):
-        """
-        Time points of the data
-
-        Returns
-        -------
-        :obj:`scmdata.time.TimePoints`
-        """
-        return self._time_points
+    def times(self):
+        """
+        Time index of the data
+
+        Returns
+        -------
+        :class:`pandas.DatetimeIndex` or :class:`xarray.CFTimeIndex`
+        """
+        return self._df.index

     def timeseries(
         self, meta=None, check_duplicated=True, time_axis=None, drop_all_nan_times=False
@@ -809,13 +797,11 @@ def timeseries(
             raise NonUniqueMetadataError(_meta)

         if time_axis is None:
-            columns = self._time_points.to_index()
+            columns = self.times
         elif time_axis == "year":
-            columns = self._time_points.years()
+            columns = self.times.year
         elif time_axis == "year-month":
-            columns = (
-                self._time_points.years() + (self._time_points.months() - 0.5) / 12
-            )
+            columns = self.times.year + (self.times.month - 0.5) / 12
         elif time_axis == "days since 1970-01-01":

             def calc_days(x):
@@ -823,16 +809,20 @@ def calc_days(x):

                 return (x - ref).astype("timedelta64[D]")

-            columns = calc_days(self._time_points.values).astype(int)
+            columns = calc_days(self.times.to_numpy().astype("datetime64[s]")).astype(
+                int
+            )

         elif time_axis == "seconds since 1970-01-01":

             def calc_seconds(x):
                 ref = np.array(["1970-01-01"], dtype=_TARGET_DTYPE)[0]

-                return x - ref
+                return (x - ref).astype("timedelta64[s]")

-            columns = calc_seconds(self._time_points.values).astype(int)
+            columns = calc_seconds(
+                self.times.to_numpy().astype("datetime64[s]")
+            ).astype(int)
         else:
             raise NotImplementedError("time_axis = '{}'".format(time_axis))

@@ -1067,8 +1057,8 @@ def filter(
             _keep_rows = _keep_rows * False

         ret._df = ret._df.loc[_keep_times, _keep_rows]
+        ret._set_time_index(ret.times)
         ret._meta = ret._meta[_keep_rows]
-        ret["time"] = self.time_points.values[_keep_times]

         if log_if_empty and ret.empty:
             _logger.warning("Filtered ScmRun is empty!")
@@ -1104,7 +1094,7 @@ def _apply_filters(  # pylint: disable=missing-return-doc
             Filtering cannot be performed on requested column
         """
         regexp = filters.pop("regexp", False)
-        keep_ts = np.array([True] * len(self.time_points))
+        keep_ts = np.array([True] * len(self.times))
         keep_meta = np.array([True] * len(self))

         # filter by columns and list of values
@@ -1133,21 +1123,20 @@ def _apply_filters(  # pylint: disable=missing-return-doc
                     separator=self.data_hierarchy_separator,
                 )
                 # else do nothing as level handled in variable filtering
             elif col == "year":
-                keep_ts &= years_match(self._time_points.years(), values)
+                keep_ts &= years_match(self.time_points.years(), values)
             elif col == "month":
-                keep_ts &= month_match(self._time_points.months(), values)
+                keep_ts &= month_match(self.time_points.months(), values)
             elif col == "day":
                 keep_ts &= self._day_match(values)
             elif col == "hour":
-                keep_ts &= hour_match(self._time_points.hours(), values)
+                keep_ts &= hour_match(self.time_points.hours(), values)
             elif col == "time":
-                keep_ts &= datetime_match(self._time_points.values, values)
+                keep_ts &= datetime_match(self.time_points.values, values)
             else:
                 raise ValueError("filter by `{}` not supported".format(col))

@@ -1163,9 +1152,9 @@ def _day_match(self, values):
             wday = False

         if wday:
-            days = self._time_points.weekdays()
+            days = self.time_points.weekdays()
         else:  # ints or list of ints
-            days = self._time_points.days()
+            days = self.time_points.days()

         return day_match(days, values)

@@ -1275,11 +1264,11 @@ def interpolate(

         res = self.copy()

-        target_times = TimePoints(target_times)
+        target_index = decode_datetimes_to_index(target_times)

         timeseries_converter = TimeseriesConverter(
-            self.time_points.values,
-            target_times.values,
+            self.times.values,
+            target_times,
             interpolation_type=interpolation_type,
             extrapolation_type=extrapolation_type,
         )
@@ -1290,10 +1279,7 @@
             target_data[:, i] = timeseries_converter.convert_from(
                 res._df.iloc[:, i].values
             )
-        res._df = pd.DataFrame(
-            target_data, columns=res._df.columns, index=target_times.to_index()
-        )
-        res._time_points = target_times
+        res._df = pd.DataFrame(target_data, columns=res._df.columns, index=target_index)

         return res

@@ -1902,12 +1888,10 @@ def reduce(self, func, dim=None, axis=None, **kwargs):
         data = func(input_data, **kwargs)

         if getattr(data, "shape", ()) == self.shape:
-            return type(self)(
-                data, index=self.time_points, columns=self.meta.to_dict("list")
-            )
+            return type(self)(data, index=self.times, columns=self.meta.to_dict("list"))
         else:
             removed_axes = range(2) if axis is None else np.atleast_1d(axis) % 2
-            index = self.time_points
+            index = self.times
             meta = self.meta.to_dict("list")
             if 0 in removed_axes and len(meta):
                 # Reduced the timeseries
@@ -2030,6 +2014,8 @@ def run_append(
     to_join_metas = []
     overlapping_times = False

+    any_cftimes = isinstance(ret.times, CFTimeIndex)
+
     ind = range(ret._df.shape[1])
     ret._df.columns = ind
     ret._meta.index = ind
@@ -2038,6 +2024,8 @@ def run_append(
     for run in runs[1:]:
         run_to_join_df = run._df

+        any_cftimes = any_cftimes or isinstance(run.times, CFTimeIndex)
+
         max_idx = min_idx + run_to_join_df.shape[1]
         ind = range(min_idx, max_idx)
         min_idx = max_idx
@@ -2052,7 +2040,7 @@ def run_append(
         # check for overlap
         idx_to_check = run_to_join_df.index
         if not overlapping_times and (
-            idx_to_check.isin(ret._df.index).any()
+            idx_to_check.isin(ret.times.values.astype("datetime64[s]")).any()
             or any([idx_to_check.isin(df.index).any() for df in to_join_dfs])
         ):
             overlapping_times = True

         to_join_dfs.append(run_to_join_df)
         to_join_metas.append(run_to_join_meta)

+    if any_cftimes:
+        # If any cftimes are present cast everything to cftime
+        ret._df.index = decode_datetimes_to_index(ret._df.index, use_cftime=True)
+        for df in to_join_dfs:
+            df.index = decode_datetimes_to_index(df.index, use_cftime=True)
+
     ret._df = pd.concat([ret._df] + to_join_dfs, axis="columns").sort_index()
-    ret._time_points = TimePoints(ret._df.index.values)
-    ret._df.index = ret._time_points.to_index()
+    ret._set_time_index(
+        decode_datetimes_to_index(ret.times.values, use_cftime=any_cftimes)
+    )

     ret._meta = pd.MultiIndex.from_frame(
         pd.concat([ret._meta.to_frame()] + to_join_metas).astype("category")
     )
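The behavioural contract these run.py changes lean on is `decode_datetimes_to_index` (added in time.py below): every index handed to an `ScmRun` is normalised to either a `pandas.DatetimeIndex` or an `xarray.CFTimeIndex`. A minimal sketch of the intended behaviour, assuming the patched package is installed (the metadata values are illustrative):

```python
import scmdata
from scmdata.time import decode_datetimes_to_index

# The normalisation helper used by ScmRun._set_time_index
decode_datetimes_to_index([2000, 2010])  # -> pandas.DatetimeIndex (fast path)
decode_datetimes_to_index([1000, 3000])  # -> xarray.CFTimeIndex (outside ns range)
decode_datetimes_to_index([2000, 2010], calendar="360_day")  # -> CFTimeIndex

meta = {
    "model": "a_model",
    "scenario": "a_scenario",
    "region": "World",
    "variable": "Surface Air Temperature Change",
    "unit": "K",
}
run = scmdata.ScmRun([[1.0], [2.0]], index=[1000, 3000], columns=meta)
print(type(run.times))  # xarray.CFTimeIndex; run.time_points stays a TimePoints
```

`run_append` follows the same rule: if any input carries a `CFTimeIndex`, everything is coerced to cftime before concatenation, otherwise the pandas index is preserved.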
diff --git a/src/scmdata/time.py b/src/scmdata/time.py
index f2b2cf24..70de37e5 100644
--- a/src/scmdata/time.py
+++ b/src/scmdata/time.py
@@ -10,9 +10,24 @@
 import numpy as np
 import pandas as pd
 from dateutil import parser
+from pandas.errors import OutOfBoundsDatetime
+from xarray import CFTimeIndex

 _TARGET_TYPE = np.int64
 _TARGET_DTYPE = "datetime64[s]"
+STANDARD_CALENDARS = {"standard", "gregorian", "proleptic_gregorian"}
+"""
+Over the span of a ``datetime64[ns]`` these calendars are all equivalent
+"""
+
+_CFTIME_CALENDARS = {
+    "standard": cftime.datetime,
+    "360_day": cftime.Datetime360Day,
+    "gregorian": cftime.DatetimeGregorian,
+    "proleptic_gregorian": cftime.DatetimeProlepticGregorian,
+    "noleap": cftime.DatetimeNoLeap,
+    "julian": cftime.DatetimeJulian,
+}


 class InsufficientDataError(Exception):
@@ -25,6 +40,10 @@ class InsufficientDataError(Exception):

 def _float_year_to_datetime(inp: float) -> np.datetime64:
     year = int(inp)
+
+    if year < 0:
+        raise OutOfBoundsDatetime("Cannot convert negative decimal year")
+
     fractional_part = inp - year
     return np.datetime64(  # pylint: disable=too-many-function-args
         year - 1970, "Y"
     ) + np.timedelta64(  # pylint: disable=too-many-function-args
         int(fractional_part * 365.25 * 24 * 60 * 60), "s"
     )
@@ -37,29 +56,58 @@
+def _str_to_cftime(inp: str, calendar: str):
+    cls = _CFTIME_CALENDARS[calendar]
+
+    negative_year = False
+    if inp.startswith("-"):
+        negative_year = True
+        inp = inp[1:]
+
+    assert len(inp) == 19
+
+    y = int(inp[:4])
+    if negative_year:
+        y = -y
+    mon = int(inp[5:7])
+    d = int(inp[8:10])
+    h = int(inp[11:13])
+    m = int(inp[14:16])
+    s = int(inp[17:])
+
+    return cls(y, mon, d, h, m, s)
+
+
 _ufunc_float_year_to_datetime = np.frompyfunc(_float_year_to_datetime, 1, 1)
-_ufunc_str_to_datetime = np.frompyfunc(parser.parse, 1, 1)
+_ufunc_str_to_datetime = np.frompyfunc(np.datetime64, 1, 1)
+_ufunc_str_to_datetime_parser = np.frompyfunc(parser.parse, 1, 1)
+_ufunc_str_to_cftime = np.frompyfunc(_str_to_cftime, 2, 1)


 def _parse_datetime(inp: np.ndarray) -> np.ndarray:
     try:
         return _ufunc_float_year_to_datetime(inp.astype(float))
     except (TypeError, ValueError):
-        return _ufunc_str_to_datetime(inp)
+        try:
+            return _ufunc_str_to_datetime(inp)
+        except ValueError:
+            return _ufunc_str_to_datetime_parser(inp)

-def _format_datetime(dts: np.ndarray) -> np.ndarray:
+def _format_datetime(dts) -> np.ndarray:
     """
-    Convert an array to an array of :class:`np.datetime64`.
+    Convert a list of times to numpy datetimes
+
+    This truncates the datetimes to second resolution

     Parameters
     ----------
-    dts
+    dts : np.array or list
         Input to attempt to convert

     Returns
     -------
-    :class:`np.ndarray` of :class:`np.datetime64`
+    :class:`np.ndarray` with dtype ``datetime64[s]``
         Converted array

     Raises
@@ -67,6 +115,9 @@ def _format_datetime(dts: np.ndarray) -> np.ndarray:
     ValueError
         If one of the values in :obj:`dts` cannot be converted to :class:`np.datetime64`
     """
+
+    dts = np.asarray(dts)
+
     if len(dts) <= 0:  # pylint: disable=len-as-condition
         return np.array([], dtype=_TARGET_DTYPE)

@@ -84,6 +135,104 @@

     return np.asarray(dts, dtype=_TARGET_DTYPE)


+def _to_cftimes(np_dates, calendar):
+    # This would be faster, but results in calendar issues
+    # return cftime.num2date(
+    #     np_dates.astype(int), "seconds since 1970-01-01", calendar=calendar
+    # )
+
+    if calendar not in _CFTIME_CALENDARS:
+        raise ValueError("Unknown calendar: {}".format(calendar))
+
+    return _ufunc_str_to_cftime(np.datetime_as_string(np_dates), calendar)
+
+
+def decode_datetimes_to_index(dates, calendar=None, use_cftime=None):
+    """
+    Decode a list of dates to an index
+
+    Uses a :class:`pandas.DatetimeIndex` where possible. When a non-standard
+    calendar is used, or for dates before year 1678 or after year 2262, the dates
+    are converted to :mod:`cftime` datetimes and a :class:`xarray.CFTimeIndex`
+    is used instead.
+
+    A wide range of date formats is supported. The following are all equivalent:
+
+    * str ("2000" or "2000-01-01")
+    * int (2000)
+    * decimal years (2000.0)
+    * python datetimes (``datetime.datetime(2000, 1, 1)``)
+    * cftime datetimes (``cftime.datetime(2000, 1, 1)``)
+    * numpy datetimes (``np.datetime64("2000-01-01", "Y")``)
+
+    Parameters
+    ----------
+    dates
+        Dates to be converted
+
+    calendar: str
+        Describes the calendar used in the time calculations. All values currently
+        defined in the [CF metadata conventions](http://cfconventions.org) and
+        implemented in [cftime](https://unidata.github.io/cftime) are supported.
+
+        Valid calendars are ``'standard', 'gregorian', 'proleptic_gregorian',
+        'noleap', '360_day', 'julian'``. Default is ``'standard'``, which is a
+        mixed Julian/Gregorian calendar.
+
+        If a calendar other than ``'standard', 'gregorian'`` or
+        ``'proleptic_gregorian'`` is selected, the dates are converted to
+        ``cftime`` datetimes.
+
+    use_cftime: bool
+        If None (default), try to determine the appropriate time index to use.
+        Attempts to use a :class:`pandas.DatetimeIndex`, but falls back to a
+        :class:`xarray.CFTimeIndex` if the conversion fails.
+
+        If True, dates are explicitly converted to ``cftime`` datetimes and a
+        :class:`xarray.CFTimeIndex` is returned.
+
+        If False, a :class:`pandas.DatetimeIndex` will always be returned (if
+        possible). In this case a :class:`pandas.errors.OutOfBoundsDatetime`
+        is raised if a date falls before year 1678 or after year 2262.
+
+    Returns
+    -------
+    :class:`pandas.DatetimeIndex` or :class:`xarray.CFTimeIndex`
+        The return type depends on the value of ``calendar`` and the dates provided
+
+    Raises
+    ------
+    :class:`pandas.errors.OutOfBoundsDatetime`
+        ``use_cftime == False`` and a date before year 1678 or after year 2262 is
+        provided
+
+    ValueError
+        ``use_cftime == False`` and a non-standard calendar is requested
+    """
+    dates = np.asarray(dates)
+    dates = _format_datetime(dates)
+
+    if calendar is None:
+        calendar = "standard"
+
+    if use_cftime is None:
+        try:
+            if calendar not in STANDARD_CALENDARS:
+                raise ValueError(
+                    "Cannot use pandas indexes with a non-standard calendar"
+                )
+            index = pd.DatetimeIndex(dates)
+        except (OutOfBoundsDatetime, ValueError):
+            index = CFTimeIndex(_to_cftimes(dates, calendar))
+    elif use_cftime:
+        index = CFTimeIndex(_to_cftimes(dates, calendar))
+    else:
+        if calendar not in STANDARD_CALENDARS:
+            raise ValueError("Cannot use pandas indexes with a non-standard calendar")
+        index = pd.DatetimeIndex(dates)
+
+    index.name = "time"
+    return index
+
+
 class TimePoints:
     """
     Handles time points by wrapping :class:`np.ndarray` of :class:`np.datetime64`.
@@ -123,7 +272,7 @@ def to_index(self) -> pd.Index:
         :class:`pd.Index` of :class:`np.dtype` :class:`object` with name ``"time"``
             made from the time points represented as :class:`datetime.datetime`.
         """
-        return pd.Index(self._values.astype(object), dtype=object, name="time")
+        return CFTimeIndex(self.as_cftime(), name="time")

     def as_cftime(self) -> list:
         """
diff --git a/tests/conftest.py b/tests/conftest.py
index 89ff6632..fb28f204 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -256,6 +256,11 @@ def scm_run():
     yield ScmRun(TEST_DF.copy())


+@pytest.fixture(scope="function")
+def long_scm_run():
+    yield ScmRun(TEST_DF_LONG_TIMES.copy())
+
+
 @pytest.fixture(scope="function")
 def base_scm_run():
     yield BaseScmRun(
@@ -543,6 +548,16 @@ def rcp26():
         interpolation_type="linear",
         extrapolation_type=None,
     ),
+    dict(
+        source_start_time=np.datetime64("1000-01-06"),
+        source_period_length=np.timedelta64(3, "D"),
+        target_start_time=np.datetime64("1000-01-07"),
+        target_period_length=np.timedelta64(4, "D"),
+        source_values=possible_source_values[0],
+        target_values=[2.33333333, 3.6666667],
+        interpolation_type="linear",
+        extrapolation_type=None,
+    ),
 ]

 test_combinations = []
diff --git a/tests/integration/test_scmrun_speed.py b/tests/integration/test_scmrun_speed.py
index 70c377ea..b1cfe513 100644
--- a/tests/integration/test_scmrun_speed.py
+++ b/tests/integration/test_scmrun_speed.py
@@ -6,9 +6,7 @@

 import scmdata


-@pytest.fixture(params=[10, 10 ** 2, 10 ** 3, 10 ** 3.5, 10 ** 4, 10 ** 4.5])
-def big_scmrun(request):
-    length = int(request.param)
+def make_example_run(length):
     t_steps = 750
     variables = [
         "Surface Air Temperature Change",
@@ -45,6 +43,12 @@
     )


+@pytest.fixture(params=[10, 10 ** 2, 10 ** 3, 10 ** 3.5, 10 ** 4, 10 ** 4.5])
+def big_scmrun(request):
+    length = int(request.param)
+    return make_example_run(length)
+
+
 def test_recreate_from_timeseries(benchmark, big_scmrun):
     def recreate():
         return scmdata.ScmRun(big_scmrun.timeseries())
@@ -169,3 +173,18 @@

     assert res.shape[0] == big_scmrun.shape[0] * n_to_append
     assert res.shape[1] == big_scmrun.shape[1]
+
+
+def test_append_example_magicc_ensemble(benchmark):
+    n_to_append = 600
+
+    to_append = []
+    for i in range(n_to_append):
+        tmp = make_example_run(20 * 5)  # 20 variables x 5 scenarios
+        tmp["run_number"] = i
+        to_append.append(tmp)
+
+    def append():
+        return scmdata.run_append(to_append)
+
+    benchmark.pedantic(append, iterations=1, rounds=5)
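The binary fixture added below exercises the backwards-compatibility branch in `_read_time_variable`. A hedged sketch of how such a legacy file could be produced (older scmdata wrote time as raw `f8` epoch seconds with no CF attributes; the filename and values here are illustrative, not how the checked-in fixture was generated):

```python
import netCDF4 as nc
import numpy as np

# Write a file in the legacy layout: f8 time, raw seconds since the epoch
ds = nc.Dataset("legacy_example.nc", "w")  # illustrative path
ds.createDimension("time", 2)
ds.createVariable("time", "f8", "time")
times = np.array(["2005-01-01", "2010-01-01"], dtype="datetime64[s]")
ds.variables["time"][:] = times.astype(float)
ds.close()

# The new reader detects the f8 dtype and casts straight back
ds = nc.Dataset("legacy_example.nc")
read_times = ds.variables["time"][:].astype("datetime64[s]")
ds.close()
```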
diff --git a/tests/test_data/legacy_datetimes.nc b/tests/test_data/legacy_datetimes.nc
new file mode 100644
index 00000000..9658108c
Binary files /dev/null and b/tests/test_data/legacy_datetimes.nc differ
diff --git a/tests/unit/test_netcdf.py b/tests/unit/test_netcdf.py
index 9df5e58a..eebe01c3 100644
--- a/tests/unit/test_netcdf.py
+++ b/tests/unit/test_netcdf.py
@@ -24,7 +24,7 @@ def test_run_to_nc(scm_run):

         ds = nc.Dataset(out_fname)

-        assert ds.dimensions["time"].size == len(scm_run.time_points)
+        assert ds.dimensions["time"].size == scm_run.shape[1]
         assert ds.dimensions["scenario"].size == 2

         assert ds.variables["scenario"][0] == "a_scenario"
@@ -403,3 +403,9 @@
     msg = "metadata for model is not unique for requested dimensions"
     with pytest.raises(ValueError, match=msg):
         start.to_nc(out_fname, extras=("model",))
+
+
+def test_read_legacy_datetimes_nc(scm_run, test_data_path):
+    old_datetimes_run = ScmRun.from_nc(join(test_data_path, "legacy_datetimes.nc"))
+
+    assert_scmdf_almost_equal(old_datetimes_run, scm_run, check_ts_names=False)
diff --git a/tests/unit/test_run.py b/tests/unit/test_run.py
index 30f4d46a..5ffdb4e8 100644
--- a/tests/unit/test_run.py
+++ b/tests/unit/test_run.py
@@ -17,6 +17,7 @@
 from scmdata.errors import MissingRequiredColumnError, NonUniqueMetadataError
 from scmdata.run import BaseScmRun, ScmRun, run_append
 from scmdata.testing import _check_pandas_less_110, assert_scmdf_almost_equal
+from scmdata.time import decode_datetimes_to_index


 def test_init_df_year_converted_to_datetime(test_pd_df):
@@ -986,12 +987,6 @@ def test_append_long_times(
             "unit": "unit_1",
         },
     )
-    if try_start_1_from_df_with_datetime_index:
-        scmrun_1_ts = scmrun_1.timeseries()
-        try:
-            scmrun_1_ts.columns = pd.DatetimeIndex(scmrun_1_ts.columns.values)
-        except pd.errors.OutOfBoundsDatetime:
-            pytest.skip("pandas datetime error")

     scmrun_2 = ScmRun(
         data=np.arange(len(time_2)),
@@ -1004,18 +999,29 @@
             "unit": "unit_2",
         },
     )
-    if try_start_2_from_df_with_datetime_index:
-        scmrun_2_ts = scmrun_2.timeseries()
-        try:
-            scmrun_2_ts.columns = pd.DatetimeIndex(scmrun_2_ts.columns.values)
-            scmrun_2 = ScmRun(scmrun_2_ts)
-        except pd.errors.OutOfBoundsDatetime:
-            pytest.skip("pandas datetime error")
+
+    def _ensure_dateindex_type(run, ensure_datetime_index):
+        index = run.times
+
+        if ensure_datetime_index and not isinstance(index, pd.DatetimeIndex):
+            try:
+                run._df.index = index.to_datetimeindex()
+            except ValueError:
+                pytest.skip("pandas datetime error")
+        elif not ensure_datetime_index and isinstance(index, pd.DatetimeIndex):
+            run._df.index = decode_datetimes_to_index(index, use_cftime=True)
+
+    _ensure_dateindex_type(scmrun_1, try_start_1_from_df_with_datetime_index)
+    _ensure_dateindex_type(scmrun_2, try_start_2_from_df_with_datetime_index)

     res = scmrun_1.append(scmrun_2)

-    assert not isinstance(res._df.index, pd.DatetimeIndex)
+    if isinstance(scmrun_1.times, pd.DatetimeIndex) and isinstance(
+        scmrun_2.times, pd.DatetimeIndex
+    ):
+        assert isinstance(res.times, pd.DatetimeIndex)
+    else:
+        assert not isinstance(res.times, pd.DatetimeIndex)

     exp_years = set(time_1).union(set(time_2))
     assert set(res["year"]) == exp_years
@@ -1446,9 +1452,7 @@ def test_append_duplicates_order_doesnt_matter(scm_run):

     obs = res.filter(scenario="a_scenario2").timeseries().squeeze()
     exp = [2.0, 7.0, 7.0, 2.0, 7.0, 5.0]
-    npt.assert_array_equal(
-        res._time_points.years(), [2005, 2010, 2015, 2020, 2030, 2040]
-    )
+    npt.assert_array_equal(res["year"], [2005, 2010, 2015, 2020, 2030, 2040])
     npt.assert_almost_equal(obs, exp)


@@ -2585,6 +2589,13 @@ def test_timeseries_time_axis(scm_run, time_axis, mod_func):
     assert (res.columns == (scm_run["time"].apply(mod_func))).all()


+@time_axis_checks
+def test_timeseries_time_axis_long_run(long_scm_run, time_axis, mod_func):
+    res = long_scm_run.timeseries(time_axis=time_axis)
+
+    assert (res.columns == (long_scm_run["time"].apply(mod_func))).all()
+
+
 @time_axis_checks
 def test_long_data_time_axis(scm_run, time_axis, mod_func):
     res = scm_run.long_data(time_axis=time_axis)

     assert (res["time"] == (scm_run.long_data()["time"].apply(mod_func))).all()


@@ -2592,6 +2603,13 @@
+@time_axis_checks
+def test_long_data_time_axis_long_run(long_scm_run, time_axis, mod_func):
+    res = long_scm_run.long_data(time_axis=time_axis)
+
+    assert (res["time"] == (long_scm_run.long_data()["time"].apply(mod_func))).all()
+
+
 @time_axis_checks
 @patch("scmdata.plotting.sns.lineplot")
 @patch.object(ScmRun, "long_data")
diff --git a/tests/unit/test_time.py b/tests/unit/test_time.py
new file mode 100644
index 00000000..e68b6518
--- /dev/null
+++ b/tests/unit/test_time.py
@@ -0,0 +1,169 @@
+import datetime as dt
+
+import cftime
+import numpy as np
+import pandas as pd
+import pandas.testing as pdt
+import pytest
+from pandas import DatetimeIndex
+from pandas.errors import OutOfBoundsDatetime
+from xarray import CFTimeIndex
+
+from scmdata.time import _CFTIME_CALENDARS, _format_datetime, decode_datetimes_to_index
+
+input_type = pytest.mark.parametrize(
+    "input_type",
+    [
+        "int-year",
+        "decimal-year",
+        "str-year",
+        "str-year-month-day",
+        "numpy-ns",
+        "numpy-s",
+        "numpy-y",
+        "datetime",
+        "cftime",
+    ],
+)
+
+
+def convert_input(dates_as_int, input_type):
+    if input_type == "int-year":
+        return [int(d) for d in dates_as_int]
+    elif input_type == "decimal-year":
+        if min(dates_as_int) < 0:
+            pytest.skip("datetime out of range")
+        return [float(d) for d in dates_as_int]
+    elif input_type == "str-year":
+        return [str(d) for d in dates_as_int]
+    elif input_type == "str-year-month-day":
+        return [str(d) + "-01-01" for d in dates_as_int]
+    elif input_type == "numpy-ns":
+        if min(dates_as_int) <= 1678 or max(dates_as_int) >= 2262:
+            pytest.skip("datetime out of range")
+        return np.asarray([str(d) + "-01-01" for d in dates_as_int]).astype(
+            "datetime64[ns]"
+        )
+    elif input_type == "numpy-s":
+        return np.asarray([str(d) + "-01-01" for d in dates_as_int]).astype(
+            "datetime64[s]"
+        )
+    elif input_type == "numpy-y":
+        return np.asarray([str(d) + "-01-01" for d in dates_as_int]).astype(
+            "datetime64[Y]"
+        )
+    elif input_type == "pandas":
+        try:
+            return [pd.Timestamp(dt.datetime(d, 1, 1)) for d in dates_as_int]
+        except OutOfBoundsDatetime:
+            pytest.skip("datetime out of range")
+    elif input_type == "datetime":
+        try:
+            return [dt.datetime(d, 1, 1) for d in dates_as_int]
+        except ValueError:
+            pytest.skip("datetime out of range")
+    elif input_type == "cftime":
+        return [cftime.datetime(d, 1, 1) for d in dates_as_int]
+
+
+@input_type
+def test_format(input_type):
+    dates = [2000, 2010, 2020]
+
+    inp_dates = convert_input(dates, input_type)
+    res = _format_datetime(inp_dates)
+    exp = np.asarray(["2000-01-01", "2010-01-01", "2020-01-01"]).astype(
+        "datetime64[s]"
+    )
+
+    np.testing.assert_array_equal(res, exp)
+
+
+@input_type
+def test_format_wide_range(input_type):
+    dates = [-100, 0, 1000, 5000]
+
+    inp_dates = convert_input(dates, input_type)
+    res = _format_datetime(inp_dates)
+    exp = np.asarray(["-100-01-01", "0-01-01", "1000-01-01", "5000-01-01"]).astype(
+        "datetime64[s]"
+    )
+
+    np.testing.assert_array_equal(res, exp)
+
+
+@pytest.mark.parametrize("use_cftime", [True, None])
+@input_type
+def test_to_cftime_index(input_type, use_cftime):
+    years = [-1000, 1000, 2000, 3000]
+    inp_dates = convert_input(years, input_type)
+
+    res = decode_datetimes_to_index(inp_dates, use_cftime=use_cftime)
+
+    cftime_dts = [cftime.datetime(y, 1, 1) for y in years]
+    exp = CFTimeIndex(cftime_dts, name="time")
+
+    assert isinstance(res, CFTimeIndex)
+    assert all(res.year == years)
+    pdt.assert_index_equal(res, exp)
+
+
+@pytest.mark.parametrize("calendar", _CFTIME_CALENDARS.keys())
+@pytest.mark.parametrize("use_cftime", [True, None])
+@input_type
+def test_decode_index_with_calendar(input_type, calendar, use_cftime):
+    years = [-1000, 1000, 2000, 3000]
+    inp_dates = convert_input(years, input_type)
+
+    res = decode_datetimes_to_index(inp_dates, calendar=calendar, use_cftime=use_cftime)
+
+    cls = _CFTIME_CALENDARS[calendar]
+    cftime_dts = [cls(y, 1, 1) for y in years]
+    exp = CFTimeIndex(cftime_dts, name="time")
+
+    assert isinstance(res, CFTimeIndex)
+    assert all(res.year == years)
+    pdt.assert_index_equal(res, exp)
+
+
+def test_decode_index_with_invalid_calendar():
+    years = [-1000, 1000, 2000, 3000]
+
+    with pytest.raises(ValueError, match="Unknown calendar: not-a-cal"):
+        decode_datetimes_to_index(years, calendar="not-a-cal")
+
+
+def test_decode_index_with_nonstandard_calendar():
+    years = [-1000, 1000, 2000, 3000]
+
+    res = decode_datetimes_to_index(years, calendar="360_day")
+    assert isinstance(res, CFTimeIndex)
+
+    with pytest.raises(
+        ValueError, match="Cannot use pandas indexes with a non-standard calendar"
+    ):
+        decode_datetimes_to_index(years, calendar="360_day", use_cftime=False)
+
+
+@pytest.mark.parametrize("use_cftime", [False, None])
+@input_type
+def test_to_pd_index(input_type, use_cftime):
+    years = [2000, 2050, 2100]
+    inp_dates = convert_input(years, input_type)
+
+    res = decode_datetimes_to_index(inp_dates, use_cftime=use_cftime)
+
+    exp = DatetimeIndex([str(y) for y in years], name="time")
+
+    # Pandas datetimes are coerced to ns
+    assert res.values.dtype == "datetime64[ns]"
+    assert all(res.year == years)
+    pdt.assert_index_equal(res, exp)
+
+
+@input_type
+def test_to_pd_index_with_overflow(input_type):
+    years = [1500, 2050, 2100]
+    inp_dates = convert_input(years, input_type)
+
+    with pytest.raises(OutOfBoundsDatetime):
+        decode_datetimes_to_index(inp_dates, use_cftime=False)
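Taken together, the changes mean an `ScmRun` whose times fall outside pandas' `datetime64[ns]` range should now survive a netCDF round trip. A hedged end-to-end sketch of the intended behaviour (not taken from the test suite; `out.nc` and the metadata values are illustrative):

```python
import numpy as np
import scmdata

run = scmdata.ScmRun(
    np.arange(2.0).reshape(2, 1),
    index=[1000, 3000],  # outside the datetime64[ns] range
    columns={
        "model": "a_model",
        "scenario": "a_scenario",
        "region": "World",
        "variable": "Surface Air Temperature Change",
        "unit": "K",
    },
)

# Times are written as CF-encoded integers, then decoded back on read
run.to_nc("out.nc", dimensions=("region",))
roundtripped = scmdata.ScmRun.from_nc("out.nc")
assert roundtripped["year"].tolist() == [1000, 3000]
```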