Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
610f926
Initial plan
Copilot Feb 25, 2026
3ee90bc
Fix memory leak by clearing array cache after data retrieval
Copilot Feb 25, 2026
a1924d1
Add documentation for array cache clearing and more comprehensive tests
Copilot Feb 25, 2026
7031c62
Clarify that TensorStore cache is preserved when clearing array wrapper
Copilot Feb 25, 2026
df15e57
Add documentation and demo for memory leak fix
Copilot Feb 25, 2026
5ea10f2
Update demo_memory_fix.py
rhoadesScholar Feb 25, 2026
b29b4ec
fix: resolve memory leak by clearing array cache in CellMapImage
rhoadesScholar Feb 26, 2026
aae7e30
fix: optimize device handling and refactor properties in CellMapDatas…
rhoadesScholar Feb 26, 2026
ae9d561
black format
rhoadesScholar Feb 26, 2026
35b3665
fix: optimize bounding box and sampling box computations in CellMapDa…
rhoadesScholar Feb 26, 2026
ab0fbea
black format
rhoadesScholar Feb 26, 2026
ecf1631
Update tests/test_memory_management.py
rhoadesScholar Feb 26, 2026
5577a12
Update src/cellmap_data/multidataset.py
rhoadesScholar Feb 26, 2026
709451f
fix: address PR review feedback - add try/finally for cache clearing …
Copilot Feb 26, 2026
24fb1b6
black format
rhoadesScholar Feb 26, 2026
8438cde
fix: implement _ImmediateExecutor to prevent crashes on Windows+Tenso…
rhoadesScholar Feb 26, 2026
a6e96e8
black format
rhoadesScholar Feb 26, 2026
ce9054a
fix: properly emit warnings using logger instead of constructing unus…
Copilot Feb 26, 2026
b065568
Update multidataset.py
rhoadesScholar Feb 26, 2026
0bae283
Update multidataset.py
rhoadesScholar Feb 26, 2026
907b28a
black format
rhoadesScholar Feb 27, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -116,4 +116,7 @@ clean/
.pytest_cache/
__pycache__/
mypy_cache/
.claude/
.claude/
*.out
*.log
*.err
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ dev = [
"twine",
"hatch",
"python-semantic-release",
"objgraph",
]
all = [
"cellmap-data[dev,test]",
Expand Down
2 changes: 1 addition & 1 deletion src/cellmap_data/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from .empty_image import EmptyImage
from .image import CellMapImage
from .mutable_sampler import MutableSubsetRandomSampler
from .read_limiter import MAX_CONCURRENT_READS, limit_tensorstore_reads
from .utils.read_limiter import MAX_CONCURRENT_READS, limit_tensorstore_reads
from .utils import get_sliced_shape, is_array_2D, min_redundant_inds, split_target_path

logger = logging.getLogger(__name__)
Expand Down
8 changes: 2 additions & 6 deletions src/cellmap_data/dataset_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@ def __init__(
self.target_array_writers[array_name] = self.get_target_array_writer(
array_name, array_info
)
self._device: str | torch.device = device if device is not None else "cpu"
if device is not None:
self._device = device
self.to(device, non_blocking=True)

@cached_property
Expand Down Expand Up @@ -237,11 +237,7 @@ def loader(
@property
def device(self) -> str | torch.device:
"""Returns the device for the dataset."""
try:
return self._device
except AttributeError:
self._device = "cpu"
return self._device
return self._device

def get_center(self, idx: int) -> dict[str, float]:
"""
Expand Down
71 changes: 54 additions & 17 deletions src/cellmap_data/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,11 @@ def __getitem__(self, center: Mapping[str, float]) -> torch.Tensor:
if self.value_transform is not None:
data = self.value_transform(data)

# Clear cached array property to prevent memory accumulation from xarray
# operations (interp/reindex/sel) during training iterations. The array
# will be reopened on next access if needed.
self._clear_array_cache()

# Return data on CPU - let the DataLoader handle device transfer with streams
# This avoids redundant transfers and allows for optimized batch transfers
return data
Expand All @@ -138,6 +143,18 @@ def __repr__(self) -> str:
"""Returns a string representation of the CellMapImage object."""
return f"CellMapImage({self.array_path})"

def _clear_array_cache(self) -> None:
"""
Clear the cached xarray DataArray to release intermediate objects.

xarray operations (interp, reindex, sel) create intermediate arrays that
remain referenced through the DataArray. Clearing the cache after each
__getitem__ releases those references without closing the underlying
TensorStore handle, which is separately cached in _ts_store and reused.
"""
if "array" in self.__dict__:
del self.__dict__["array"]

@property
def coord_offsets(self) -> Mapping[str, np.ndarray]:
"""
Expand Down Expand Up @@ -223,9 +240,43 @@ def array_path(self) -> str:
"""Returns the path to the single-scale image array."""
return os.path.join(self.path, self.scale_level)

@cached_property
def _ts_store(self) -> ts.TensorStore:  # type: ignore
    """
    Open the TensorStore array once and keep the handle for this instance.

    The handle returned by ``ts.open(...).result()`` is cached for the
    lifetime of the object; it is cheap to hold (a reference to the shared
    context and chunk cache) and safe to reuse across many ``__getitem__``
    calls. Keeping it separate from the ``array`` cached_property means
    that evicting ``array`` (to free xarray intermediates) never triggers
    another ``ts.open()`` on the next access.

    Falls back to the ``zarr3`` driver when the default driver rejects the
    spec with a ``ValueError``.
    """
    store_spec = xt._zarr_spec_from_path(self.array_path)
    pending = ts.open(store_spec, read=True, write=False, context=self.context)
    try:
        return pending.result()
    except ValueError as err:
        # Default driver could not interpret the spec; retry as zarr v3.
        logger.warning(
            "Failed to open with default driver: %s. Falling back to zarr3 driver.",
            err,
        )
        store_spec["driver"] = "zarr3"
        retry = ts.open(store_spec, read=True, write=False, context=self.context)
        return retry.result()

@cached_property
def array(self) -> xarray.DataArray:
"""Returns the image data as an xarray DataArray."""
"""
Returns the image data as an xarray DataArray.

This property is cached but is explicitly cleared after each __getitem__
call to release xarray intermediate objects (from interp/reindex/sel)
that would otherwise accumulate during training. Clearing it is cheap
because the underlying TensorStore handle is separately cached in
_ts_store and is not reopened.
"""
if (
os.environ.get("CELLMAP_DATA_BACKEND", "tensorstore").lower()
!= "tensorstore"
Expand All @@ -235,22 +286,7 @@ def array(self) -> xarray.DataArray:
chunks="auto",
)
else:
# Construct an xarray with Tensorstore backend
spec = xt._zarr_spec_from_path(self.array_path)
array_future = ts.open(spec, read=True, write=False, context=self.context)
try:
array = array_future.result()
except ValueError as e:
logger.warning(
"Failed to open with default driver: %s. Falling back to zarr3 driver.",
e,
)
spec["driver"] = "zarr3"
array_future = ts.open(
spec, read=True, write=False, context=self.context
)
array = array_future.result()
data = xt._TensorStoreAdapter(array)
data = xt._TensorStoreAdapter(self._ts_store)
return xarray.DataArray(data=data, coords=self.full_coords)

@cached_property
Expand Down Expand Up @@ -324,6 +360,7 @@ def class_counts(self) -> float:
else:
raise ValueError("s0_scale not found")
except Exception as e:
# TODO: This fallback is very expensive, and ideally should be avoided. We should add a script to precompute class counts for all images and save them to the metadata to avoid this in the future.
logger.warning(
"Unable to get class counts for %s from metadata, "
"falling back to calculating from array. Error: %s, %s",
Expand Down
Loading