Skip to content

Commit 54a24a0

Browse files
committed
use zarr zipstores in xenium
1 parent 92a09ae commit 54a24a0

File tree

1 file changed

+73
-96
lines changed

1 file changed

+73
-96
lines changed

src/spatialdata_io/readers/xenium.py

Lines changed: 73 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,6 @@ def xenium(
7373
cells_table: bool = True,
7474
n_jobs: int = 1,
7575
gex_only: bool = True,
76-
cleanup_labels_zarr_tmpdir: bool = True,
7776
imread_kwargs: Mapping[str, Any] = MappingProxyType({}),
7877
image_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
7978
labels_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
@@ -202,86 +201,62 @@ def xenium(
202201
else:
203202
table = None
204203

205-
tmpdir = tempfile.TemporaryDirectory()
206-
if not cleanup_labels_zarr_tmpdir:
207-
logging.info(
208-
f"Extracting cells zarr in the temporary directory {tmpdir.name}. Since `cleanup_labels_zarr_tmpdir` "
209-
f"is set to `False`, this directory cleanup will be deferred (up to the end of the process). "
210-
f"If the process is interrupted aburptly cleanup may not occurr. Use with care to avoid uncleaned up "
211-
f"temporary directories."
204+
if version is not None and version >= packaging.version.parse("2.0.0") and table is not None:
205+
cell_summary_table = _get_cells_metadata_table_from_zarr(path, XeniumKeys.CELLS_ZARR, specs)
206+
if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]):
207+
warnings.warn(
208+
'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation'
209+
" table. This could be due to trying to read a new version that is not supported yet. Please "
210+
"report this issue.",
211+
UserWarning,
212+
stacklevel=2,
213+
)
214+
table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL]
215+
table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT]
216+
217+
polygons = {}
218+
labels = {}
219+
tables = {}
220+
points = {}
221+
images = {}
222+
223+
# From the public release notes here:
224+
# https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa
225+
# we see that for distinguishing between the nuclei of multinucleated cells, the `label_id` column is used.
226+
# This column is currently not found in the preview data, while I think it is needed in order to unambiguously match
227+
# nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus
228+
# labels.
229+
if nucleus_labels:
230+
labels["nucleus_labels"], _ = _get_labels_and_indices_mapping(
231+
path,
232+
XeniumKeys.CELLS_ZARR,
233+
specs,
234+
mask_index=0,
235+
labels_name="nucleus_labels",
236+
labels_models_kwargs=labels_models_kwargs,
212237
)
213-
zip_file = path / XeniumKeys.CELLS_ZARR
214-
with zipfile.ZipFile(zip_file, "r") as zip_ref:
215-
zip_ref.extractall(tmpdir.name)
216-
try:
217-
cells_zarr = zarr.open(str(tmpdir.name), mode="r")
218-
if version is not None and version >= packaging.version.parse("2.0.0") and table is not None:
219-
cell_summary_table = _get_cells_metadata_table_from_zarr(cells_zarr=cells_zarr)
220-
if not cell_summary_table[XeniumKeys.CELL_ID].equals(table.obs[XeniumKeys.CELL_ID]):
238+
if cells_labels:
239+
labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
240+
path,
241+
XeniumKeys.CELLS_ZARR,
242+
specs,
243+
mask_index=1,
244+
labels_name="cell_labels",
245+
labels_models_kwargs=labels_models_kwargs,
246+
)
247+
if cell_labels_indices_mapping is not None and table is not None:
248+
if not pd.DataFrame.equals(cell_labels_indices_mapping["cell_id"], table.obs[str(XeniumKeys.CELL_ID)]):
221249
warnings.warn(
222-
'The "cell_id" column in the cells metadata table does not match the "cell_id" column in the annotation'
223-
" table. This could be due to trying to read a new version that is not supported yet. Please "
224-
"report this issue.",
250+
"The cell_id column in the cell_labels_table does not match the cell_id column derived from the "
251+
"cell labels data. This could be due to trying to read a new version that is not supported yet. "
252+
"Please report this issue.",
225253
UserWarning,
226254
stacklevel=2,
227255
)
228-
table.obs[XeniumKeys.Z_LEVEL] = cell_summary_table[XeniumKeys.Z_LEVEL]
229-
table.obs[XeniumKeys.NUCLEUS_COUNT] = cell_summary_table[XeniumKeys.NUCLEUS_COUNT]
230-
231-
polygons = {}
232-
labels = {}
233-
tables = {}
234-
points = {}
235-
images = {}
236-
237-
# From the public release notes here:
238-
# https://www.10xgenomics.com/support/software/xenium-onboard-analysis/latest/release-notes/release-notes-for-xoa
239-
# we see that for distinguishing between the nuclei of multinucleated cells, the `label_id` column is used.
240-
# This column is currently not found in the preview data, while I think it is needed in order to unambiguously match
241-
# nuclei to cells. Therefore for the moment we only link the table to the cell labels, and not to the nucleus
242-
# labels.
243-
244-
if nucleus_labels:
245-
labels["nucleus_labels"], _ = _get_labels_and_indices_mapping(
246-
cells_zarr,
247-
cleanup_labels_zarr_tmpdir,
248-
specs,
249-
mask_index=0,
250-
labels_name="nucleus_labels",
251-
labels_models_kwargs=labels_models_kwargs,
252-
)
253-
if cells_labels:
254-
labels["cell_labels"], cell_labels_indices_mapping = _get_labels_and_indices_mapping(
255-
cells_zarr,
256-
cleanup_labels_zarr_tmpdir,
257-
specs,
258-
mask_index=1,
259-
labels_name="cell_labels",
260-
labels_models_kwargs=labels_models_kwargs,
261-
)
262-
if cell_labels_indices_mapping is not None and table is not None:
263-
if not pd.DataFrame.equals(
264-
cell_labels_indices_mapping["cell_id"],
265-
table.obs[str(XeniumKeys.CELL_ID)],
266-
):
267-
warnings.warn(
268-
"The cell_id column in the cell_labels_table does not match the cell_id column derived from the "
269-
"cell labels data. This could be due to trying to read a new version that is not supported yet. "
270-
"Please report this issue.",
271-
UserWarning,
272-
stacklevel=2,
273-
)
274-
else:
275-
table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"]
276-
if not cells_as_circles:
277-
table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels"
278-
except Exception as e:
279-
tmpdir.cleanup()
280-
raise e
281-
282-
# we cleanup now if we don't have lazy data
283-
if not cells_labels and not nucleus_labels or cleanup_labels_zarr_tmpdir:
284-
tmpdir.cleanup()
256+
else:
257+
table.obs["cell_labels"] = cell_labels_indices_mapping["label_index"]
258+
if not cells_as_circles:
259+
table.uns[TableModel.ATTRS_KEY][TableModel.INSTANCE_KEY] = "cell_labels"
285260

286261
if nucleus_boundaries:
287262
polygons["nucleus_boundaries"] = _get_polygons(
@@ -480,8 +455,8 @@ def _poly(arr: ArrayLike) -> Polygon:
480455

481456

482457
def _get_labels_and_indices_mapping(
483-
cells_zarr: zarr.Group,
484-
cleanup_labels_zarr_tmpdir: bool,
458+
path: Path,
459+
file: str,
485460
specs: dict[str, Any],
486461
mask_index: int,
487462
labels_name: str,
@@ -490,17 +465,12 @@ def _get_labels_and_indices_mapping(
490465
if mask_index not in [0, 1]:
491466
raise ValueError(f"mask_index must be 0 or 1, found {mask_index}.")
492467

468+
zip_file = path / XeniumKeys.CELLS_ZARR
469+
store = zarr.storage.ZipStore(zip_file, read_only=True)
470+
z = zarr.open(store, mode="r")
493471
# get the labels
494-
if cleanup_labels_zarr_tmpdir:
495-
masks = cells_zarr["masks"][f"{mask_index}"][...]
496-
else:
497-
masks = da.from_array(cells_zarr["masks"][f"{mask_index}"])
498-
labels = Labels2DModel.parse(
499-
masks,
500-
dims=("y", "x"),
501-
transformations={"global": Identity()},
502-
**labels_models_kwargs,
503-
)
472+
masks = da.from_array(z["masks"][f"{mask_index}"])
473+
labels = Labels2DModel.parse(masks, dims=("y", "x"), transformations={"global": Identity()}, **labels_models_kwargs)
504474

505475
# build the matching table
506476
version = _parse_version_of_xenium_analyzer(specs)
@@ -512,7 +482,7 @@ def _get_labels_and_indices_mapping(
512482
# supported in versions < 1.3.0
513483
return labels, None
514484

515-
cell_id, dataset_suffix = cells_zarr["cell_id"][...].T
485+
cell_id, dataset_suffix = z["cell_id"][...].T
516486
cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id, dataset_suffix)
517487

518488
# this information will probably be available in the `label_id` column for version > 2.0.0 (see public
@@ -524,7 +494,7 @@ def _get_labels_and_indices_mapping(
524494
real_label_index = real_label_index[1:]
525495

526496
if version < packaging.version.parse("2.0.0"):
527-
expected_label_index = cells_zarr["seg_mask_value"][...]
497+
expected_label_index = z["seg_mask_value"][...]
528498

529499
if not np.array_equal(expected_label_index, real_label_index):
530500
raise ValueError(
@@ -533,7 +503,7 @@ def _get_labels_and_indices_mapping(
533503
f"{expected_label_index}."
534504
)
535505
else:
536-
labels_positional_indices = cells_zarr["polygon_sets"][f"{mask_index}"]["cell_index"][...]
506+
labels_positional_indices = z["polygon_sets"][f"{mask_index}"]["cell_index"][...]
537507
if not np.array_equal(labels_positional_indices, np.arange(len(labels_positional_indices))):
538508
raise ValueError(
539509
"The positional indices of the labels do not match the expected range. Please report this issue."
@@ -554,19 +524,26 @@ def _get_labels_and_indices_mapping(
554524

555525
@inject_docs(xx=XeniumKeys)
556526
def _get_cells_metadata_table_from_zarr(
557-
cells_zarr: zarr.Group,
527+
path: Path,
528+
file: str,
529+
specs: dict[str, Any],
558530
) -> AnnData:
559531
"""Read cells metadata from ``{xx.CELLS_ZARR}``.
560532
561533
Read the cells summary table, which contains the z_level information for versions < 2.0.0, and also the
562534
nucleus_count for versions >= 2.0.0.
563535
"""
564536
# for version >= 2.0.0, in this function we could also parse the segmentation method used to obtain the masks
565-
x = cells_zarr["cell_summary"][...]
566-
column_names = cells_zarr["cell_summary"].attrs["column_names"]
537+
zip_file = path / XeniumKeys.CELLS_ZARR
538+
store = zarr.storage.ZipStore(zip_file, read_only=True)
539+
540+
z = zarr.open(store, mode="r")
541+
x = z["cell_summary"][...]
542+
column_names = z["cell_summary"].attrs["column_names"]
567543
df = pd.DataFrame(x, columns=column_names)
568-
cell_id_prefix = cells_zarr["cell_id"][:, 0]
569-
dataset_suffix = cells_zarr["cell_id"][:, 1]
544+
cell_id_prefix = z["cell_id"][:, 0]
545+
dataset_suffix = z["cell_id"][:, 1]
546+
store.close()
570547

571548
cell_id_str = cell_id_str_from_prefix_suffix_uint32(cell_id_prefix, dataset_suffix)
572549
df[XeniumKeys.CELL_ID] = cell_id_str

0 commit comments

Comments
 (0)