Skip to content

Commit 97706bb

Browse files
authored
Preprocess/era5 land (#78)
* initial commit * flake8 error * update the final test * add variable parameter to process model * add docstring to process() * add variable to base * update merge files * update merge files * update era5_land tests to work with variable names * add year functionality to preprocessor * Update preprocess.py * add type ignore to fix mypy errors * fix tests because merged file doesn't have variable name * update black * update
1 parent 90e7b43 commit 97706bb

4 files changed

Lines changed: 303 additions & 0 deletions

File tree

scripts/preprocess.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
ERA5HourlyPreprocessor,
1414
BokuNDVIPreprocessor,
1515
KenyaASALMask,
16+
ERA5LandPreprocessor,
1617
)
1718

1819
from src.preprocess.admin_boundaries import KenyaAdminPreprocessor
@@ -54,6 +55,25 @@ def process_era5POS_2018():
5455
)
5556

5657

58+
def process_era5_land(variable: str):
    """Run the ERA5-Land preprocessor for a single variable.

    :param variable: name of a downloaded ERA5-Land variable, i.e. a
        directory under ``data/raw/reanalysis-era5-land`` created by the
        ERA5LandExporter.
    """
    # Resolve the data directory: the script may be run either from the
    # repository root ("ml_drought") or from the scripts/ folder.
    if Path(".").absolute().as_posix().split("/")[-1] == "ml_drought":
        data_path = Path("data")
    else:
        data_path = Path("../data")
    # Reference grid produced by the CHIRPS preprocessor.
    regrid_path = data_path / "interim/chirps_preprocessed/chirps_kenya.nc"
    assert regrid_path.exists(), f"{regrid_path} not available"

    processor = ERA5LandPreprocessor(data_path)

    processor.preprocess(
        subset_str="kenya",
        # BUGFIX: the reference grid was asserted to exist but never used
        # (regrid=None), leaving the output on its native grid. Pass the
        # CHIRPS grid so the data is actually regridded.
        regrid=regrid_path,
        resample_time="M",
        upsampling=False,
        variable=variable,
    )
75+
76+
5777
def process_gleam():
5878
data_path = get_data_path()
5979

src/preprocess/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,15 @@
22
from .chirps import CHIRPSPreprocessor
33
from .planetOS import PlanetOSPreprocessor
44
from .gleam import GLEAMPreprocessor
5+
from .era5_land import ERA5LandPreprocessor
56
from .seas5 import S5Preprocessor
67
from .era5 import ERA5MonthlyMeanPreprocessor, ERA5HourlyPreprocessor
78
from .esa_cci import ESACCIPreprocessor
89
from .srtm import SRTMPreprocessor
910
from .admin_boundaries import KenyaAdminPreprocessor, KenyaASALMask
1011
from .boku_ndvi import BokuNDVIPreprocessor
1112

13+
1214
__all__ = [
1315
"VHIPreprocessor",
1416
"CHIRPSPreprocessor",
@@ -22,4 +24,5 @@
2224
"KenyaAdminPreprocessor",
2325
"BokuNDVIPreprocessor",
2426
"KenyaASALMask",
27+
"ERA5LandPreprocessor",
2528
]

src/preprocess/era5_land.py

Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
from pathlib import Path
2+
import xarray as xr
3+
import multiprocessing
4+
from functools import partial
5+
from typing import Optional, List
6+
from shutil import rmtree
7+
8+
from .base import BasePreProcessor
9+
10+
11+
class ERA5LandPreprocessor(BasePreProcessor):
    """Preprocess the raw ERA5-Land netcdf files downloaded by the
    ERA5LandExporter to ``data/raw/reanalysis-era5-land``.
    """

    # name of the raw-data folder written by the exporter
    dataset = "reanalysis-era5-land"

    @staticmethod
    def create_filename(
        netcdf_filepath: Path, subset_name: Optional[str] = None
    ) -> str:
        """Build the interim filename from the raw file's path components.

        The raw files are laid out as ``.../<variable>/<year>/<months>.nc``
        so variable / year / months are recovered from the path itself.

        :param netcdf_filepath: path to a raw ERA5-Land ``.nc`` file
        :param subset_name: optional region suffix (e.g. ``"kenya"``)
        :return: filename of the form ``<year>_<months>_<variable>[_<subset>].nc``
        """
        var_name = netcdf_filepath.parts[-3]
        months = netcdf_filepath.parts[-1][:-3]  # strip the ".nc" suffix
        year = netcdf_filepath.parts[-2]

        stem = f"{year}_{months}_{var_name}"
        if subset_name is not None:
            stem = f"{stem}_{subset_name}"
        return f"{stem}.nc"

    def _preprocess_single(
        self,
        netcdf_filepath: Path,
        subset_str: Optional[str] = "kenya",
        regrid: Optional[xr.Dataset] = None,
    ) -> None:
        """ Preprocess a single netcdf file (run in parallel if
        `parallel_processes` arg > 1)

        Process:
        -------
        * rename latitude/longitude -> lat/lon
        * chop region of interest (ROI)
        * regrid to same spatial grid as a reference dataset (`regrid`)
        * Save the output file to new folder / filename
        """
        print(f"Processing {netcdf_filepath.name}")

        # 1. read in the dataset and normalise the dimension names
        ds = xr.open_dataset(netcdf_filepath).rename(
            {"longitude": "lon", "latitude": "lat"}
        )

        # 2. chop out EastAfrica (raw data has latitude descending,
        # hence inverse_lat)
        if subset_str is not None:
            ds = self.chop_roi(ds, subset_str, inverse_lat=True)

        # 3. optionally regrid onto the reference grid
        if regrid is not None:
            ds = self.regrid(ds, regrid)

        # create_filename already handles subset_name=None, so no need
        # for a conditional here
        filename = self.create_filename(netcdf_filepath, subset_name=subset_str)
        print(f"Saving to {self.interim}/{filename}")
        ds.to_netcdf(self.interim / filename)

        print(f"Done for ERA5-Land {netcdf_filepath.name}")

    def preprocess(
        self,
        subset_str: Optional[str] = "kenya",
        regrid: Optional[Path] = None,
        resample_time: Optional[str] = "M",
        upsampling: bool = False,
        parallel_processes: int = 1,
        variable: Optional[str] = None,
        years: Optional[List[int]] = None,
        cleanup: bool = True,
    ) -> None:
        """Preprocess all of the ERA5-Land .nc files to produce
        one subset file.

        Arguments
        ----------
        :param: subset_str: Optional[str] = 'kenya'
            Whether to subset Kenya when preprocessing
        :param: regrid: Optional[Path] = None
            If a Path is passed, the ERA5-Land files will be regridded to have
            the same grid as the dataset at that Path. If None, no regridding
            happens
        :param: resample_time: str = 'M'
            If not None, defines the time length to which the data will be resampled
        :param: upsampling: bool = False
            If true, tells the class the time-sampling will be upsampling. In this case,
            nearest instead of mean is used for the resampling
        :param: variable: Optional[str] = None
            the variable that you want to preprocess. If None then will
            process ALL variables that have been downloaded to the
            `data/raw/reanalysis-era5-land` by the ERA5LandExporter
        :param: parallel_processes: int = 1
            If > 1, run the preprocessing in parallel
        :param: years: Optional[List[int]] = None
            preprocess a subset of the years from the raw data
        :param: cleanup: bool = True
            If true, delete interim files created by the class

        Note:
        ----
        - the raw data is downloaded at annual resolution by default
        """
        print(f"Reading data from {self.raw_folder}. Writing to {self.interim}")
        nc_files = self.get_filepaths()
        if years is not None:
            # BUGFIX: int() cannot be applied to a Path object — use the
            # *name* of the parent directory (the year folder, see
            # create_filename for the raw layout) for the comparison.
            nc_files = [f for f in nc_files if int(f.parents[0].name) in years]

        # run for one variable or all variables?
        if variable is not None:
            variables = [d.name for d in (self.raw_folder / self.dataset).iterdir()]
            assert variable in variables, (
                f"Expect the variable provided to be in {variables}"
            )
            print(f"Running preprocessor for var: {variable}")
            nc_files = [f for f in nc_files if f.parents[1].name == variable]

        # keep the Path argument and the loaded reference dataset in
        # separate names (the original rebound `regrid` from Path to Dataset)
        reference_grid: Optional[xr.Dataset] = None
        if regrid is not None:
            reference_grid = self.load_reference_grid(regrid)

        # parallel processing ?
        if parallel_processes <= 1:  # sequential
            for file in nc_files:
                self._preprocess_single(file, subset_str, reference_grid)
        else:
            # context manager ensures the worker pool is always closed
            with multiprocessing.Pool(processes=parallel_processes) as pool:
                outputs = pool.map(
                    partial(
                        self._preprocess_single,
                        subset_str=subset_str,
                        regrid=reference_grid,
                    ),
                    nc_files,
                )
            print("\nOutputs (errors):\n\t", outputs)

        # merge and resample files
        # NOTE: `variable` is deliberately NOT forwarded — the merged file is
        # written without a variable name in its filename
        self.merge_files(
            subset_str=subset_str,
            resample_time=resample_time,
            upsampling=upsampling,
        )

        if cleanup:
            rmtree(self.interim)

tests/preprocess/test_era5_land.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
import xarray as xr
2+
import numpy as np
3+
from datetime import datetime
4+
from pathlib import Path
5+
6+
from src.preprocess import ERA5LandPreprocessor
7+
from src.utils import get_kenya
8+
9+
from ..utils import _make_dataset
10+
11+
12+
class TestERA5LandPreprocessor:
    @staticmethod
    def _make_era5_dataset(
        size, lonmin=33.75, lonmax=42.25, latmin=6.0, latmax=-5.0, add_times=True
    ):
        # Same as make_chirps_dataset, except already truncated
        # since we can just download Kenya from the cds api
        lat_len, lon_len = size
        # create the coordinate vectors
        longitudes = np.linspace(lonmin, lonmax, lon_len)
        latitudes = np.linspace(latmin, latmax, lat_len)

        # BUGFIX: the data array is built (lat, lon) so the dims must be
        # latitude-first; the previous longitude-first order only worked by
        # accident for square arrays and attached the latitude coordinate
        # to the longitude axis.
        dims = ["latitude", "longitude"]
        coords = {"latitude": latitudes, "longitude": longitudes}

        if add_times:
            size = (2, size[0], size[1])
            dims.insert(0, "time")
            coords["time"] = [datetime(2019, 1, 1), datetime(2019, 1, 2)]
        t2m = np.random.randint(100, size=size)

        return xr.Dataset({"t2m": (dims, t2m)}, coords=coords)

    def test_init(self, tmp_path):
        # constructing the preprocessor should create the interim /
        # preprocessed folder structure
        ERA5LandPreprocessor(tmp_path)

        assert (tmp_path / "interim/reanalysis-era5-land_interim").exists()
        assert (tmp_path / "interim/reanalysis-era5-land_preprocessed").exists()

    @staticmethod
    def test_make_filename():
        # raw layout is <dataset>/<variable>/<year>/<months>.nc
        path = Path("reanalysis-era5-land" "/2m_temperature/1979_2019/01_12.nc")

        name = ERA5LandPreprocessor.create_filename(path, "kenya")
        expected_name = "1979_2019_01_12_2m_temperature_kenya.nc"
        assert name == expected_name, f"{name} generated, expected {expected_name}"

    @staticmethod
    def test_get_filenames(tmp_path):
        (tmp_path / "raw/reanalysis-era5-land/" "2m_temperature/1979_2019").mkdir(
            parents=True
        )

        test_file = (
            tmp_path / "raw/reanalysis-era5-land" "/2m_temperature/1979_2019.01_12.nc"
        )
        test_file.touch()

        processor = ERA5LandPreprocessor(tmp_path)

        files = processor.get_filepaths()
        assert files[0] == test_file, f"Expected {test_file} to be retrieved"

    def test_preprocess(self, tmp_path):
        # set up a single raw file in the exporter's folder layout
        (tmp_path / "raw/reanalysis-era5-land/" "2m_temperature/1979_2019").mkdir(
            parents=True
        )
        data_path = (
            tmp_path / "raw/reanalysis-era5-land/" "2m_temperature/1979_2019/01_12.nc"
        )
        dataset = self._make_era5_dataset(size=(100, 100))
        dataset.to_netcdf(path=data_path)

        # build a 20x20 reference grid over Kenya to regrid onto
        kenya = get_kenya()
        regrid_dataset, _, _ = _make_dataset(
            size=(20, 20),
            latmin=kenya.latmin,
            latmax=kenya.latmax,
            lonmin=kenya.lonmin,
            lonmax=kenya.lonmax,
        )

        regrid_path = tmp_path / "regridder.nc"
        regrid_dataset.to_netcdf(regrid_path)

        processor = ERA5LandPreprocessor(tmp_path)
        processor.preprocess(
            subset_str="kenya",
            regrid=regrid_path,
            parallel_processes=1,
            variable="2m_temperature",
        )

        expected_out_path = (
            tmp_path / "interim/reanalysis-era5"
            "-land_preprocessed/reanalysis-era5-land_kenya.nc"
        )
        assert (
            expected_out_path.exists()
        ), f"Expected processed file to be saved to {expected_out_path}"

        # check the subsetting happened correctly
        out_data = xr.open_dataset(expected_out_path)
        expected_dims = ["lat", "lon", "time"]
        assert len(list(out_data.dims)) == len(expected_dims)
        for dim in expected_dims:
            assert dim in list(
                out_data.dims
            ), f"Expected {dim} to be in the processed dataset dims"

        lons = out_data.lon.values
        assert (lons.min() >= kenya.lonmin) and (
            lons.max() <= kenya.lonmax
        ), "Longitudes not correctly subset"

        lats = out_data.lat.values
        assert (lats.min() >= kenya.latmin) and (
            lats.max() <= kenya.latmax
        ), "Latitudes not correctly subset"

        # regridded onto the 20x20 reference grid
        assert out_data.t2m.values.shape[1:] == (20, 20)

        # cleanup=True (the default) should remove the interim folder
        # (plain string: the previous f-string had no placeholders)
        assert (
            not processor.interim.exists()
        ), "Interim era5 folder should have been deleted"

0 commit comments

Comments (0)