def get_ranges(session, experiment, variable, frequency, cellmethods=None):
    """Return a DataFrame of the distinct ranges of contiguous data.

    Each row describes one run of files whose time extents abut exactly
    (a file's ``time_start`` equals its predecessor's ``time_end``); gaps
    in the data therefore show up as breaks between rows. For example::

        >>> cc.querying.get_ranges(session, "01deg_jra55v13_ryf9091", "u", "1 daily")
                         start                  end
        0  1950-01-01 00:00:00  1950-02-01 00:00:00
        1  1950-02-01 00:00:00  1971-01-01 00:00:00

    Parameters
    ----------
    session :
        Database session for the indexed experiments.
    experiment : str
        Experiment name to restrict the query to.
    variable : str
        Name of the variable whose files are inspected.
    frequency : str
        Output frequency (e.g. ``"1 daily"``) to restrict the query to.
    cellmethods : str, optional
        NOTE(review): accepted but not yet applied as a filter — confirm
        the intended semantics (cell_methods is stored as a file attribute,
        not a column) before relying on this parameter.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``start`` and ``end``, one row per contiguous range,
        ordered by start time.
    """
    # First, query the files with a flag marking rows that are NOT
    # contiguous with their predecessor (i.e. rows that open a new range).
    # lag()'s default of "" makes the first row compare unequal, so the
    # very first file always starts a range.
    flag_q = (
        session.query(
            NCFile.time_start,
            NCFile.time_end,
            (
                NCFile.time_start
                != func.lag(NCFile.time_end, 1, "").over(order_by=NCFile.time_start)
            ).label("flag"),
        )
        .join(NCFile.experiment)
        .join(NCFile.ncvars)
        .join(NCVar.variable)
        .filter(NCExperiment.experiment == experiment)
        .filter(NCFile.frequency == frequency)
        # BUG FIX: `variable` is a name (string); compare it against
        # CFVariable.name, as the other query functions in this module do,
        # rather than against the NCVar.variable ORM relationship (which
        # SQLAlchemy cannot compare to a string).
        .filter(CFVariable.name == variable)
        .order_by(NCFile.time_start)
    ).subquery()

    # A running sum of the flag (cast to an integer) assigns the same group
    # number to every file in a run of consecutive files.
    group_q = session.query(
        flag_q,
        func.sum(cast(flag_q.c.flag, sa.Integer))
        .over(order_by=flag_q.c.time_start)
        .label("grp"),
    ).subquery()

    # The extent of each group is its smallest start time and largest end
    # time; one row per group gives the contiguous ranges.
    q = (
        session.query(
            func.min(group_q.c.time_start),
            func.max(group_q.c.time_end),
        )
        .group_by(group_q.c.grp)
        .order_by(group_q.c.time_start)
    )

    return pd.DataFrame(q, columns=["start", "end"])