|
from pathlib import Path
import urllib.request

import anndata as ad
import pandas as pd
import scanpy as sc
| 6 | + |
## VIASH START
# Development-time defaults for running this script directly.
par = {
    # Whether to keep the intermediate (downloaded) files; False deletes them
    # once the data has been loaded.
    "keep_files": True,
    "output": "./temp/datasets/2020Travaglini_human_lung_sc.h5ad",
    "dataset_id": "2020Travaglini_human_lung_sc",
    "dataset_name": "2020Travaglini_human_lung_sc",
    "dataset_url": "https://cellxgene.cziscience.com/collections/6f6d381a-7701-4781-935c-db10d30de293",
    "dataset_reference": "https://doi.org/10.1038/s41586-020-2922-4",  # TODO: bibtex not doi, also adjust config.vsh.yaml
    "dataset_summary": "This dataset contains scRNA-seq data from human lung cells.",
    "dataset_description": "This dataset contains scRNA-seq data from human lung cells.",
    "dataset_organism": "Homo sapiens",
}
meta = {
    # Directory used for the downloaded intermediate file.
    "temp_dir": "./temp/datasets/2020Travaglini_human_lung_sc",
}
## VIASH END
| 23 | + |
# helper variables
TMP_DIR = Path(meta["temp_dir"] or "/tmp")
TMP_DIR.mkdir(parents=True, exist_ok=True)
FILE_URL = "https://datasets.cellxgene.cziscience.com/060e8716-9f0e-4773-9417-582ddc9ba7ab.h5ad"
FILE_PATH = TMP_DIR / "060e8716-9f0e-4773-9417-582ddc9ba7ab.h5ad"


# Download the data, unless an earlier run (with keep_files enabled) already
# left the file in TMP_DIR -- re-fetching ~5.5GB on every run is wasteful.
if FILE_PATH.exists():
    print(f"Reusing existing download: {FILE_PATH}", flush=True)
else:
    print("Downloading data (~5.5GB)", flush=True)
    # Download to a side file and rename only after the transfer finished, so
    # an interrupted download is never mistaken for a complete file later on.
    partial_path = FILE_PATH.with_name(FILE_PATH.name + ".part")
    urllib.request.urlretrieve(FILE_URL, partial_path)
    partial_path.replace(FILE_PATH)

adata = ad.read_h5ad(FILE_PATH)
| 36 | + |
# Subset to Travaglini/Krasnow dataset
adata = adata[adata.obs["dataset"] == "Krasnow_2020"]

# Filter out cell types with less than 30 cells, i.e. keep those with >= 30.
min_cells_per_cell_type = 30
ct_value_count = adata.obs["ann_finest_level"].value_counts()
cts_to_keep = ct_value_count[ct_value_count >= min_cells_per_cell_type].index.tolist()
# Materialise the subset explicitly: the statements below modify the object,
# which would otherwise trigger an implicit copy of the view.
adata = adata[adata.obs["ann_finest_level"].isin(cts_to_keep)].copy()

# Store the raw matrix as the "counts" layer; .raw is not needed afterwards.
adata.layers["counts"] = adata.raw.X
del adata.raw

# Keep genes with at least one count. The filter is evaluated on the counts
# layer rather than on adata.X, so that min_counts refers to the stored counts.
gene_mask, counts_per_gene = sc.pp.filter_genes(
    adata.layers["counts"], min_counts=1, inplace=False
)
adata.var["n_counts"] = counts_per_gene
adata = adata[:, gene_mask].copy()
del adata.X
| 50 | + |
# Remove cell_type obs column as we'll assign ann_finest_level as cell_type
del adata.obs["cell_type"]
# NOTE(review): cell_type_ontology_term_id is kept unchanged below; it presumably
# still describes the removed cell_type column rather than ann_finest_level -- confirm.

# Rename fields (target name -> source column in the downloaded file)
rename_obs_keys = {
    "cell_type": "ann_finest_level",
    "batch": "sample",
    "assay": "assay",
    "assay_ontology_term_id": "assay_ontology_term_id",
    "cell_type_ontology_term_id": "cell_type_ontology_term_id",
    "development_stage": "development_stage",
    "development_stage_ontology_term_id": "development_stage_ontology_term_id",
    "disease": "disease",
    "disease_ontology_term_id": "disease_ontology_term_id",
    "is_primary_data": "is_primary_data",
    "self_reported_ethnicity": "self_reported_ethnicity",
    "self_reported_ethnicity_ontology_term_id": "self_reported_ethnicity_ontology_term_id",
    "sex": "sex",
    "sex_ontology_term_id": "sex_ontology_term_id",
    "suspension_type": "suspension_type",
    "tissue": "tissue",
    "tissue_ontology_term_id": "tissue_ontology_term_id",
}
adata.obs = adata.obs.rename(columns={old: new for new, old in rename_obs_keys.items()})

# Add additional information to obs
#TODO: Finish up the terms according to the ontology
#Ontology schema currently (13.03.2025) used in openproblems (CELLxGENE schema v4.0.0):
#https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md
#(mentioned here: https://openproblems.bio/documentation/reference/openproblems/src-datasets#file-format:-raw-dataset)
# Values are taken from `par` so that obs always agrees with what is written to uns.
store_info = {
    "dataset_id": par["dataset_id"],
    "organism": par["dataset_organism"],
    # "organism_ontology_term_id": "NCBITaxon:9606", #TODO: ontology (9606 = Homo sapiens)
}
for key, value in store_info.items():
    adata.obs[key] = pd.Categorical([value] * adata.n_obs, categories=[value])

# Remove undesired columns: everything that is neither a renamed field nor a
# constant added above. The list is built first so that no column is deleted
# while obs.columns is being iterated.
columns_to_keep = rename_obs_keys.keys() | store_info.keys()
columns_to_drop = [key for key in adata.obs.columns if key not in columns_to_keep]
for key in columns_to_drop:
    print(f"Removing .obs['{key}']")
adata.obs = adata.obs.drop(columns=columns_to_drop)
| 94 | + |
# Var
# Expose the feature names as gene symbols and keep the previous var index
# as feature_id before the index is replaced.
feature_names = adata.var["feature_name"]
adata.var["gene_symbol"] = feature_names
adata.var["feature_id"] = adata.var_names

# Index var by feature name: plain strings, de-duplicated, unnamed index.
adata.var_names = feature_names
adata.var_names = adata.var_names.astype(str)
adata.var_names_make_unique()
adata.var.index.name = None
| 102 | + |
# Uns
# Copy the dataset-level metadata from the component parameters into .uns.
uns_keys = (
    "dataset_id",
    "dataset_name",
    "dataset_url",
    "dataset_reference",
    "dataset_summary",
    "dataset_description",
    "dataset_organism",
)
for key in uns_keys:
    adata.uns[key] = par[key]

# Delete files if requested
remove_download = not par["keep_files"]
if remove_download:
    print("Removing files", flush=True)
    if FILE_PATH.exists():
        print("\t...", FILE_PATH, flush=True)
        FILE_PATH.unlink()

# Write the curated dataset to the requested output path.
print("Writing adata", flush=True)
adata.write_h5ad(par["output"], compression="gzip")
0 commit comments