Skip to content

Commit 778f2b8

Browse files
authored
Merge pull request #7 from openproblems-bio/add_pciseq
Add pciseq
2 parents 7a0d863 + 1dfd101 commit 778f2b8

File tree

6 files changed

+331
-4
lines changed

6 files changed

+331
-4
lines changed

.gitignore

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,4 +18,6 @@ singularity_container/
1818
/resources
1919
/.vscode
2020
/.nextflow*
21-
/work
21+
/work
22+
23+
.DS_STORE

src/api/comp_method_transcript_assignment.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,11 @@ arguments:
1919
required: false
2020
direction: input
2121
__merge__: file_scrnaseq_reference.yaml
22+
- name: "--sc_cell_type_key"
23+
type: string
24+
required: false
25+
direction: input
26+
default: cell_type
2227
- name: "--output"
2328
__merge__: file_transcript_assignments.yaml
2429
direction: output

src/api/file_transcript_assignments.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,4 +50,4 @@ info:
5050
- type: string
5151
name: region
5252
description: Region
53-
required: true
53+
required: false

src/methods_transcript_assignment/basic_transcript_assignment/script.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import dask
33
import spatialdata as sd
44
import anndata as ad
5+
import pandas as pd
56
import os
67
import shutil
78

@@ -11,7 +12,7 @@
1112
'input_segmentation': 'resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr',
1213
'transcripts_key': 'transcripts',
1314
'coordinate_system': 'global',
14-
'output': 'assigned_transcripts.zarr',
15+
'output': 'basic_assigned_transcripts.zarr',
1516
}
1617
meta = {
1718
'name': 'basic'
@@ -49,6 +50,17 @@
4950
)
5051
sdata[par['transcripts_key']]["cell_id"] = cell_id_dask_series
5152

53+
#create new .obs for cells based on the segmentation output (corresponding with the transcripts 'cell_id')
54+
unique_cells = np.unique(cell_id_dask_series)
55+
56+
# check if a '0' (noise/background) cell is in cell_id and remove
57+
zero_idx = np.where(unique_cells == 0)
58+
if len(zero_idx[0]): unique_cells=np.delete(unique_cells, zero_idx[0][0])
59+
60+
#transform into pandas series and check
61+
cell_id_col = pd.Series(unique_cells, name='cell_id', index=unique_cells)
62+
assert 0 not in cell_id_col, "Found '0' in cell_id column of assingment output cell matrix"
63+
5264
# TODO: Also take care of the following cases:
5365
# - segmentation 3D, transcripts 3D
5466
# - segmentation 3D, transcripts 2D
@@ -61,7 +73,7 @@
6173
},
6274
tables={
6375
"table": ad.AnnData(
64-
obs=sdata.tables["table"].obs[["cell_id", "region"]],
76+
obs=pd.DataFrame(cell_id_col),
6577
var=sdata.tables["table"].var[[]]
6678
)
6779
}
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
__merge__: /src/api/comp_method_transcript_assignment.yaml
2+
3+
name: pciseq_transcript_assignment
4+
label: "pciSeq Transcript Assignment"
5+
summary: "Assign transcripts to cells using the pciSeq method from Qian et. al. (2020)"
6+
description: "Uses a reference sc-RNAseq dataset to probabalistically assign cell types and transcripts to cells ."
7+
links:
8+
documentation: "https://github.com/acycliq/pciSeq"
9+
repository: "https://github.com/acycliq/pciSeq"
10+
references:
11+
doi: "10.1038/s41592-019-0631-4"
12+
13+
arguments:
14+
- name: --transcripts_key
15+
type: string
16+
description: The key of the transcripts within the points of the spatial data
17+
default: transcripts
18+
- name: --coordinate_system
19+
type: string
20+
description: The key of the pixel space coordinate system within the spatial data
21+
default: global
22+
# - name: --sc_cell_type_key
23+
# type: string
24+
# default: cell_type
25+
# required: true
26+
# direction: input
27+
# description: The name of column in the SC-RNAseq AnnData .obs with the cell type of each cell
28+
29+
# - name: --exclude_genes
30+
# type: string
31+
# required: false
32+
# description: "list of genes to be excluded during cell-typing, e.g ['Aldoc', 'Id2'] to exclude all spots from Aldoc and Id2"
33+
# direction: input
34+
# default: None
35+
36+
- name: --max_iter
37+
type: integer
38+
required: false
39+
description: "Maximum number of loops allowed for the Variational Bayes to run"
40+
direction: input
41+
default: 1000
42+
43+
- name: --CellCallTolerance
44+
type: double
45+
required: false
46+
description: "Convergence achieved if assignment probabilities between two successive loops is less than the tolerance"
47+
direction: input
48+
default: 0.02
49+
50+
- name: --rGene
51+
type: double
52+
required: false
53+
description: |
54+
"A gamma distribution expresses the efficiency of the in-situ sequencing for each gene. It tries to capture
55+
the ratio of the observed over the theoretical counts for a given gene. rGene controls the variance and
56+
Inefficiency is the average of this assumed Gamma distribution"
57+
direction: input
58+
default: 20
59+
60+
- name: --Inefficiency
61+
type: double
62+
required: false
63+
description: " "
64+
direction: input
65+
default: 0.2
66+
67+
- name: --InsideCellBonus
68+
type: double
69+
required: false
70+
description: |
71+
"If a spot is inside the cell boundaries this bonus will give the likelihood an extra boost
72+
in order to make the spot more probable to get assigned to the cell than another spot positioned
73+
outside the cell boundaries"
74+
direction: input
75+
default: 2
76+
77+
- name: --MisreadDensity
78+
type: double
79+
required: false
80+
description: |
81+
"To account for spots far from the some a uniform distribution is introduced to describe those misreads.
82+
By default this uniform distribution has a density of 1e-5 misreads per pixel."
83+
direction: input
84+
default: 0.00001
85+
86+
- name: --SpotReg
87+
type: double
88+
required: false
89+
description: |
90+
"Gene detection might come with irregularities due to technical errors. A small value is introduced
91+
here to account for these errors. It is an additive factor, applied to the single cell expression
92+
counts when the mean counts per class and per gene are calculated."
93+
direction: input
94+
default: 0.1
95+
96+
- name: --nNeighbors
97+
type: integer
98+
required: false
99+
description: |
100+
"By default only the 3 nearest cells will be considered as possible parent cells for any given spot.
101+
There is also one extra 'super-neighbor', which is always a neighbor to the spots so we can assign
102+
the misreads to. Could be seen as the background. Hence, by default the algorithm tries examines
103+
whether any of the 3 nearest cells is a possible parent cell to a given cell or whether the spot is
104+
a misread"
105+
direction: input
106+
default: 3
107+
108+
#
109+
- name: --rSpot
110+
type: double
111+
required: false
112+
description: |
113+
"A gamma distributed variate from Gamma(rSpot, 1) is applied to the mean expression, hence the counts
114+
are distributed according to a Negative Binomial distribution.
115+
The value for rSpot will control the variance/dispersion of the counts"
116+
direction: input
117+
default: 2
118+
119+
- name: --save_data
120+
type: boolean
121+
required: false
122+
description: "Boolean, if True the output will be saved as tsv files in a folder named 'pciSeq' in your system's temp dir."
123+
direction: input
124+
default: False
125+
126+
# output directory 'default' will save to temp location
127+
# - name: output_path
128+
# default: ['default']
129+
130+
#
131+
# - name: --dtype
132+
# type: string
133+
# required: false
134+
# description: |
135+
# "Use either np.float16 or np.float32 to reduce memory usage. In most cases RAM consumption shouldnt
136+
# need more than 32Gb RAM. If you have a dataset from a full coronal mouse slice with a high number of
137+
# segmented cells (around 150,000) a gene panel of more than 250 genes and 100 or more different
138+
# cell types (aka clusters, aka classes) in the single cell data then you might need at least 64GB on
139+
# your machine. Changing the datatype to a float16 or float32 will help keeping RAM usage to a lower
140+
# level"
141+
# direction: input
142+
# default: np.float64
143+
144+
145+
146+
resources:
147+
- type: python_script
148+
path: script.py
149+
150+
engines:
151+
- type: docker
152+
image: openproblems/base_python:1.0.0
153+
__merge__:
154+
- /src/base/setup_spatialdata_partial.yaml
155+
- /src/base/setup_txsim_partial.yaml
156+
setup:
157+
- type: python
158+
pypi: [pciseq]
159+
- type: native
160+
161+
runners:
162+
- type: executable
163+
- type: nextflow
164+
directives:
165+
label: [ midtime, midcpu, midmem ]
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
import numpy as np
2+
import dask
3+
import spatialdata as sd
4+
import txsim as tx
5+
import anndata as ad
6+
import os
7+
import shutil
8+
9+
## VIASH START
10+
# Note: this section is auto-generated by viash at runtime. To edit it, make changes
11+
# in config.vsh.yaml and then run `viash config inject config.vsh.yaml`.
12+
par = {
13+
'input_ist': 'resources_test/task_ist_preprocessing/mouse_brain_combined/raw_ist.zarr',
14+
'input_segmentation': 'resources_test/task_ist_preprocessing/mouse_brain_combined/segmentation.zarr',
15+
'transcripts_key': 'transcripts',
16+
'coordinate_system': 'global',
17+
'output': '../pciSeq_assigned_transcripts.zarr',
18+
19+
'input_scrnaseq': 'resources_test/task_ist_preprocessing/mouse_brain_combined/scrnaseq_reference.h5ad',
20+
'sc_cell_type_key': 'cell_type',
21+
22+
'exclude_genes': None,
23+
'max_iter': 1000,
24+
'CellCallTolerance': 0.02,
25+
'rGene': 20,
26+
'Inefficiency': 0.2,
27+
'InsideCellBonus': 2,
28+
'MisreadDensity': 0.00001,
29+
'SpotReg': 0.1,
30+
'nNeighbors': 3,
31+
'rSpot': 2,
32+
'save_data': False,
33+
'dtype': np.float64
34+
}
35+
meta = {
36+
'name': 'pciSeq_transcript_assignment'
37+
}
38+
## VIASH END
39+
40+
# Read input
41+
print('Reading input files', flush=True)
42+
sdata = sd.read_zarr(par['input_ist'])
43+
sdata_segm = sd.read_zarr(par['input_segmentation'])
44+
45+
# Check if coordinate system is available in input data
46+
transcripts_coord_systems = sd.transformations.get_transformation(sdata[par["transcripts_key"]], get_all=True).keys()
47+
assert par['coordinate_system'] in transcripts_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data."
48+
segmentation_coord_systems = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True).keys()
49+
assert par['coordinate_system'] in segmentation_coord_systems, f"Coordinate system '{par['coordinate_system']}' not found in input data."
50+
51+
# Transform transcript coordinates to the coordinate system
52+
print('Transforming transcripts coordinates', flush=True)
53+
transcripts = sd.transform(sdata[par['transcripts_key']], to_coordinate_system=par['coordinate_system'])
54+
55+
# In case of a translation transformation of the segmentation (e.g. crop of the data), we need to adjust the transcript coordinates
56+
trans = sd.transformations.get_transformation(sdata_segm["segmentation"], get_all=True)[par['coordinate_system']].inverse()
57+
transcripts = sd.transform(transcripts, trans, par['coordinate_system'])
58+
59+
# Assign cell ids to transcripts
60+
print('Assigning transcripts to cell ids', flush=True)
61+
y_coords = transcripts.y.compute().to_numpy()
62+
x_coords = transcripts.x.compute().to_numpy()
63+
64+
#Added for pciSeq
65+
#TODO this will immediately break when the name of the gene isn't feature_name
66+
transcripts_dataframe = sdata[par['transcripts_key']].compute()[['feature_name']]
67+
transcripts_dataframe['x'] = x_coords
68+
transcripts_dataframe['y'] = y_coords
69+
70+
#same as before
71+
label_image = sdata_segm["segmentation"]["scale0"].image.to_numpy() #TODO: mabye this line needs generalization (DataTree vs DataArray)
72+
73+
# Grab all the pciSeq parameters
74+
opts_keys = [#'exclude_genes',
75+
'max_iter',
76+
'CellCallTolerance',
77+
'rGene',
78+
'Inefficiency',
79+
'InsideCellBonus',
80+
'MisreadDensity',
81+
'SpotReg',
82+
'nNeighbors',
83+
'rSpot',
84+
'save_data']
85+
86+
opts = {k: par[k] for k in opts_keys}
87+
88+
input_scrnaseq = ad.read_h5ad(par['input_scrnaseq'])
89+
input_scrnaseq.X = input_scrnaseq.layers['counts']
90+
91+
assignments, cell_types = tx.preprocessing.run_pciSeq(
92+
transcripts_dataframe,
93+
label_image,
94+
input_scrnaseq,
95+
par['sc_cell_type_key'],
96+
opts
97+
)
98+
99+
#assign transcript -> cell
100+
cell_id_dask_series = dask.dataframe.from_dask_array(
101+
dask.array.from_array(
102+
assignments['cell'].to_numpy(), chunks=tuple(sdata[par['transcripts_key']].map_partitions(len).compute())
103+
),
104+
index=sdata[par['transcripts_key']].index
105+
)
106+
107+
sdata[par['transcripts_key']]["cell_id"] = cell_id_dask_series
108+
109+
# create new .obs for cells based on the segmentation output (corresponding with the transcripts 'cell_id')
110+
cell_types['type'] = cell_types['type'].replace({'None':'None_sp'})
111+
cell_types.insert(0, 'cell_id', cell_types.index)
112+
cell_types.rename(columns={'type':'cell_type','prob':'cell_type_prob'}, inplace=True)
113+
114+
assert 0 not in cell_types['cell_id'], "Found '0' in cell_id column of assingment output cell matrix"
115+
116+
output_table = ad.AnnData(
117+
obs=cell_types[['cell_id','cell_type','cell_type_prob']],
118+
var=sdata.tables["table"].var[[]]
119+
)
120+
121+
# TODO: Also take care of the following cases:
122+
# - segmentation 3D, transcripts 3D
123+
# - segmentation 3D, transcripts 2D
124+
# - segmentation 2D, transcripts 3D
125+
126+
# Subset sdata to transcripts with cell ids
127+
128+
print('Subsetting to transcripts cell id and cell type data', flush=True)
129+
sdata_transcripts_only = sd.SpatialData(
130+
points={
131+
"transcripts": sdata[par['transcripts_key']]
132+
},
133+
tables={
134+
"table": output_table
135+
}
136+
)
137+
138+
print('Write transcripts with cell ids and cell types', flush=True)
139+
if os.path.exists(par["output"]):
140+
shutil.rmtree(par["output"])
141+
sdata_transcripts_only.write(par['output'])
142+
143+

0 commit comments

Comments
 (0)