def get_ranges(session, experiment, variable, frequency, cellmethods=None):
    """Return a DataFrame of the distinct ranges of contiguous data.

    Each row describes one run of files whose time extents abut exactly
    (a file's ``time_start`` equals its predecessor's ``time_end``); gaps
    in the data therefore show up as breaks between rows. For example::

        >>> cc.querying.get_ranges(session, "01deg_jra55v13_ryf9091", "u", "1 daily")
                         start                  end
        0  1950-01-01 00:00:00  1950-02-01 00:00:00
        1  1950-02-01 00:00:00  1971-01-01 00:00:00

    Parameters
    ----------
    session :
        Database session for the indexed experiments.
    experiment : str
        Experiment name to restrict the query to.
    variable : str
        Name of the variable whose files are inspected.
    frequency : str
        Output frequency (e.g. ``"1 daily"``) to restrict the query to.
    cellmethods : str, optional
        NOTE(review): accepted but not yet applied as a filter — confirm
        the intended semantics (cell_methods is stored as a file attribute,
        not a column) before relying on this parameter.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``start`` and ``end``, one row per contiguous range,
        ordered by start time.
    """
    # First, query the files with a flag marking rows that are NOT
    # contiguous with their predecessor (i.e. rows that open a new range).
    # lag()'s default of "" makes the first row compare unequal, so the
    # very first file always starts a range.
    flag_q = (
        session.query(
            NCFile.time_start,
            NCFile.time_end,
            (
                NCFile.time_start
                != func.lag(NCFile.time_end, 1, "").over(order_by=NCFile.time_start)
            ).label("flag"),
        )
        .join(NCFile.experiment)
        .join(NCFile.ncvars)
        .join(NCVar.variable)
        .filter(NCExperiment.experiment == experiment)
        .filter(NCFile.frequency == frequency)
        # BUG FIX: `variable` is a name (string); compare it against
        # CFVariable.name, as the other query functions in this module do,
        # rather than against the NCVar.variable ORM relationship (which
        # SQLAlchemy cannot compare to a string).
        .filter(CFVariable.name == variable)
        .order_by(NCFile.time_start)
    ).subquery()

    # A running sum of the flag (cast to an integer) assigns the same group
    # number to every file in a run of consecutive files.
    group_q = session.query(
        flag_q,
        func.sum(cast(flag_q.c.flag, sa.Integer))
        .over(order_by=flag_q.c.time_start)
        .label("grp"),
    ).subquery()

    # The extent of each group is its smallest start time and largest end
    # time; one row per group gives the contiguous ranges.
    q = (
        session.query(
            func.min(group_q.c.time_start),
            func.max(group_q.c.time_end),
        )
        .group_by(group_q.c.grp)
        .order_by(group_q.c.time_start)
    )

    return pd.DataFrame(q, columns=["start", "end"])