mcvickerlab
diff --git a/docs/source/api.md b/docs/source/api.md
Lines changed: 4 additions & 0 deletions
diff --git a/python/genvarloader/__init__.py b/python/genvarloader/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/python/genvarloader/_dataset/_impl.py b/python/genvarloader/_dataset/_impl.py
Lines changed: 16 additions & 6 deletions
diff --git a/python/genvarloader/_dataset/_reconstruct.py b/python/genvarloader/_dataset/_reconstruct.py
Lines changed: 36 additions & 10 deletions
diff --git a/python/genvarloader/_dataset/_reference.py b/python/genvarloader/_dataset/_reference.py
Lines changed: 0 additions & 59 deletions
diff --git a/python/genvarloader/_dummy.py b/python/genvarloader/_dummy.py
Lines changed: 9 additions & 6 deletions
diff --git a/python/genvarloader/_ragged.py b/python/genvarloader/_ragged.py
Lines changed: 11 additions & 0 deletions
@@ -31,6 +31,10 @@
 
 .. autofunction:: get_dummy_dataset
 
+.. autoclass:: Reference
+    :members:
+    :exclude-members: __new__, __init__
+
 .. autoclass:: RaggedDataset
     :exclude-members: __new__, __init__
 
 
@@ -2,6 +2,7 @@
 
 from ._bigwig import BigWigs
 from ._dataset._impl import ArrayDataset, Dataset, RaggedDataset
+from ._dataset._reconstruct import Reference
 from ._dataset._write import write
 from ._dummy import get_dummy_dataset
 from ._ragged import Ragged
@@ -19,4 +20,5 @@
     "get_dummy_dataset",
     "ArrayDataset",
     "RaggedDataset",
+    "Reference",
 ]
@@ -107,7 +107,7 @@ def open(
     @staticmethod
     def open(
         path: str | Path,
-        reference: str | Path,
+        reference: str | Path | Reference,
         jitter: int = 0,
         rng: int | np.random.Generator | None = False,
         deterministic: bool = True,
@@ -116,7 +116,7 @@ def open(
     @staticmethod
     def open(
         path: str | Path,
-        reference: str | Path | None = None,
+        reference: str | Path | Reference | None = None,
         jitter: int = 0,
         rng: int | np.random.Generator | None = False,
         deterministic: bool = True,
@@ -202,7 +202,10 @@ def open(
                 logger.info(
                     "Loading reference genome into memory. This typically has a modest memory footprint (a few GB) and greatly improves performance."
                 )
-                _reference = Reference.from_path_and_contigs(reference, contigs)
+                if isinstance(reference, Reference):
+                    _reference = reference
+                else:
+                    _reference = Reference.from_path(reference, contigs)
                 seqs = Seqs(reference=_reference)
                 tracks = Tracks.from_path(path, regions, len(samples))
                 tracks = tracks.with_tracks(list(tracks.intervals))
@@ -211,7 +214,10 @@ def open(
                 logger.info(
                     "Loading reference genome into memory. This typically has a modest memory footprint (a few GB) and greatly improves performance."
                 )
-                _reference = Reference.from_path_and_contigs(reference, contigs)
+                if isinstance(reference, Reference):
+                    _reference = reference
+                else:
+                    _reference = Reference.from_path(reference, contigs)
                 assert phased is not None
                 assert ploidy is not None
                 seqs = Haps.from_path(
@@ -228,7 +234,10 @@ def open(
                 logger.info(
                     "Loading reference genome into memory. This typically has a modest memory footprint (a few GB) and greatly improves performance."
                 )
-                _reference = Reference.from_path_and_contigs(reference, contigs)
+                if isinstance(reference, Reference):
+                    _reference = reference
+                else:
+                    _reference = Reference.from_path(reference, contigs)
                 assert phased is not None
                 assert ploidy is not None
                 seqs = Haps.from_path(
@@ -253,7 +262,8 @@ def open(
             )
             out_of_bounds = bed.select(
                 (
-                    pl.col("chromStart") >= pl.col("chrom").replace_strict(contig_lengths)
+                    pl.col("chromStart")
+                    >= pl.col("chrom").replace_strict(contig_lengths)
                 ).any()
             ).item()
             if out_of_bounds:
 
@@ -34,7 +34,7 @@
     RaggedIntervals,
 )
 from .._utils import _lengths_to_offsets, _normalize_contig_name
-from .._variants._records import VLenAlleles
+from .._variants._records import RaggedAlleles
 from ._genotypes import (
     SparseGenotypes,
     SparseSomaticGenotypes,
@@ -52,23 +52,49 @@
 
 @define
 class Reference:
+    """A reference genome kept in memory. Typically this is only instantiated so that it can be
+    passed to :meth:`Dataset.open <genvarloader.Dataset.open>`, which avoids data duplication.
+
+    .. note::
+        Do not instantiate this class directly. Use :meth:`Reference.from_path` instead.
+    """
+
     reference: NDArray[np.uint8]
     contigs: List[str]
     offsets: NDArray[np.uint64]
     pad_char: int
 
     @classmethod
-    def from_path_and_contigs(cls, fasta: Union[str, Path], contigs: List[str]):
+    def from_path(cls, fasta: Union[str, Path], contigs: List[str] | None = None):
+        """Load a reference genome from a FASTA file.
+
+        Parameters
+        ----------
+        fasta
+            Path to the FASTA file.
+        contigs
+            List of contig names to load. If None, all contigs in the FASTA file are loaded.
+            Names may be either UCSC or Ensembl style (i.e. with or without the "chr" prefix);
+            they are normalized to match the naming used by the underlying FASTA.
+
+        Raises
+        ------
+        ValueError
+            If any of the given contig names is not present in the reference file.
+        """
         _fasta = Fasta("ref", fasta, "N")
 
         if not _fasta.cache_path.exists():
             logger.info("Memory-mapping FASTA file for faster access.")
             _fasta._write_to_cache()
 
-        contigs = cast(
-            List[str],
-            [_normalize_contig_name(c, _fasta.contigs) for c in contigs],
-        )
+        if contigs is None:
+            contigs = list(_fasta.contigs)
+
+        _contigs = [_normalize_contig_name(c, _fasta.contigs) for c in contigs]
+        if unmapped := [
+            source for source, mapped in zip(contigs, _contigs) if mapped is None
+        ]:
+            raise ValueError(
+                f"Some of the given contig names are not present in reference file: {unmapped}"
+            )
+        contigs = cast(list[str], _contigs)
+
         _fasta.sequences = _fasta._get_sequences(contigs)
         if TYPE_CHECKING:
             assert _fasta.sequences is not None
@@ -95,7 +121,7 @@ def from_path_and_contigs(cls, fasta: Union[str, Path], contigs: List[str]):
 class _Variants:
     positions: NDArray[np.int32]
     sizes: NDArray[np.int32]
-    alts: VLenAlleles
+    alts: RaggedAlleles
 
     @classmethod
     def from_table(cls, variants: Union[str, Path, pl.DataFrame]):
@@ -104,7 +130,7 @@ def from_table(cls, variants: Union[str, Path, pl.DataFrame]):
         return cls(
             variants["POS"].to_numpy(),
             variants["ILEN"].to_numpy(),
-            VLenAlleles.from_polars(variants["ALT"]),
+            RaggedAlleles.from_polars(variants["ALT"]),
         )
 
 
@@ -268,7 +294,7 @@ def from_path(
             variants = _Variants(
                 svar_index["POS"].to_numpy() - 1,
                 svar_index["ILEN"].to_numpy(),
-                VLenAlleles.from_polars(svar_index["ALT"]),
+                RaggedAlleles.from_polars(svar_index["ALT"]),
             )
         return cls(
             reference=reference,
@@ -547,7 +573,7 @@ def _get_haplotypes(
             geno_v_idxs=self.genotypes.data,
             positions=self.variants.positions,
             sizes=self.variants.sizes,
-            alt_alleles=self.variants.alts.alleles.view(np.uint8),
+            alt_alleles=self.variants.alts.data.view(np.uint8),
             alt_offsets=self.variants.alts.offsets,
             ref=self.reference.reference,
             ref_offsets=self.reference.offsets,
 
@@ -14,7 +14,7 @@
 from ._dataset._utils import bed_to_regions
 from ._ragged import Ragged, RaggedIntervals
 from ._utils import _lengths_to_offsets
-from ._variants._records import VLenAlleles
+from ._variants._records import RaggedAlleles
 
 
 def get_dummy_dataset():
@@ -25,6 +25,7 @@ def get_dummy_dataset():
     max_jitter = 2
 
     dummy_samples = ["Aang", "Katara", "Sokka", "Toph"]
+    n_samples = len(dummy_samples)
 
     dummy_contigs = [str(i) for i in range(1, 23)] + ["X", "Y", "MT"]
     dummy_bed = pl.DataFrame(
@@ -35,6 +36,7 @@ def get_dummy_dataset():
             "strand": ["+", "-", "+", "+"],
         }
     )
+    n_regions = len(dummy_bed)
 
     with pl.StringCache():
         pl.Series(natsorted(dummy_contigs), dtype=pl.Categorical())
@@ -61,12 +63,13 @@ def get_dummy_dataset():
     )
 
     dummy_vars = _Variants(
-        positions=repeat(dummy_regions[:, 1], "r -> (r s)", s=4),
-        sizes=repeat(np.array([-2, -1, 0, 1], np.int32), "s -> (r s)", r=4),
-        alts=VLenAlleles(
-            alleles=repeat(sp.cast_seqs("ACGTT"), "a -> (r a)", r=4),
+        positions=repeat(dummy_regions[:, 1], "r -> (r s)", s=n_samples),
+        sizes=repeat(np.array([-2, -1, 0, 1], np.int32), "s -> (r s)", r=n_regions),
+        alts=RaggedAlleles.from_offsets(
+            data=repeat(sp.cast_seqs("ACGTT"), "a -> (r a)", r=n_regions),
+            shape=n_regions*n_samples,
             offsets=_lengths_to_offsets(
-                repeat(np.array([1, 1, 1, 2]), "s -> (r s)", r=4)
+                repeat(np.array([1, 1, 1, 2]), "s -> (r s)", r=n_regions)
             ),
         ),
     )
 
@@ -2,6 +2,7 @@
 
 from typing import Any, Optional, Tuple, TypeGuard, TypeVar, Union
 
+import awkward as ak
 import numba as nb
 import numpy as np
 from attrs import define
@@ -54,6 +55,16 @@ def to_fixed_shape(self, shape: tuple[int, ...]) -> AnnotatedHaps:
         return AnnotatedHaps(haps, var_idxs, ref_coords)
 
 
+@define
+class RaggedVariants:
+    """Typically contains ragged arrays with shape (batch, ploidy, ~variants)"""
+
+    alts: ak.Array  # (batch, ploidy, ~variants, ~length)
+    pos: Ragged[np.int32]  # typically (batch, ploidy, ~variants); presumably variant positions (TODO confirm)
+    ilens: Ragged[np.int32]  # typically (batch, ploidy, ~variants); presumably indel lengths, cf. ILEN (TODO confirm)
+    ccfs: Ragged[np.float32]  # typically (batch, ploidy, ~variants); meaning of "ccfs" not shown here (TODO document)
+
+
 def is_rag_dtype(rag: Ragged, dtype: type[DTYPE]) -> TypeGuard[Ragged[DTYPE]]:
     return np.issubdtype(rag.data.dtype, dtype)