3434 RaggedIntervals ,
3535)
3636from .._utils import _lengths_to_offsets , _normalize_contig_name
37- from .._variants ._records import VLenAlleles
37+ from .._variants ._records import RaggedAlleles
3838from ._genotypes import (
3939 SparseGenotypes ,
4040 SparseSomaticGenotypes ,
5252
5353@define
5454class Reference :
55+ """A reference genome kept in memory. Typically this is only instantiated so that it can be
56+ passed to :meth:`Dataset.open <genvarloader.Dataset.open>` to avoid data duplication.
57+
58+ .. note::
59+ Do not instantiate this class directly. Use :meth:`Reference.from_path` instead.
60+ """
61+
5562 reference : NDArray [np .uint8 ]
5663 contigs : List [str ]
5764 offsets : NDArray [np .uint64 ]
5865 pad_char : int
5966
6067 @classmethod
61- def from_path_and_contigs (cls , fasta : Union [str , Path ], contigs : List [str ]):
68+ def from_path (cls , fasta : Union [str , Path ], contigs : List [str ] | None = None ):
69+ """Load a reference genome from a FASTA file.
70+
71+ Parameters
72+ ----------
73+ fasta
74+ Path to the FASTA file.
75+ contigs
76+ List of contig names to load. If None, all contigs in the FASTA file are loaded.
77+ Can be either UCSC or Ensembl style (i.e. with or without the "chr" prefix) and
78+ will be handled appropriately to match the underlying FASTA.
79+ """
6280 _fasta = Fasta ("ref" , fasta , "N" )
6381
6482 if not _fasta .cache_path .exists ():
6583 logger .info ("Memory-mapping FASTA file for faster access." )
6684 _fasta ._write_to_cache ()
6785
68- contigs = cast (
69- List [str ],
70- [_normalize_contig_name (c , _fasta .contigs ) for c in contigs ],
71- )
86+ if contigs is None :
87+ contigs = list (_fasta .contigs )
88+
89+ _contigs = [_normalize_contig_name (c , _fasta .contigs ) for c in contigs ]
90+ if unmapped := [
91+ source for source , mapped in zip (contigs , _contigs ) if mapped is None
92+ ]:
93+ raise ValueError (
94+ f"Some of the given contig names are not present in reference file: { unmapped } "
95+ )
96+ contigs = cast (list [str ], _contigs )
97+
7298 _fasta .sequences = _fasta ._get_sequences (contigs )
7399 if TYPE_CHECKING :
74100 assert _fasta .sequences is not None
@@ -95,7 +121,7 @@ def from_path_and_contigs(cls, fasta: Union[str, Path], contigs: List[str]):
95121class _Variants :
96122 positions : NDArray [np .int32 ]
97123 sizes : NDArray [np .int32 ]
98- alts : VLenAlleles
124+ alts : RaggedAlleles
99125
100126 @classmethod
101127 def from_table (cls , variants : Union [str , Path , pl .DataFrame ]):
@@ -104,7 +130,7 @@ def from_table(cls, variants: Union[str, Path, pl.DataFrame]):
104130 return cls (
105131 variants ["POS" ].to_numpy (),
106132 variants ["ILEN" ].to_numpy (),
107- VLenAlleles .from_polars (variants ["ALT" ]),
133+ RaggedAlleles .from_polars (variants ["ALT" ]),
108134 )
109135
110136
@@ -268,7 +294,7 @@ def from_path(
268294 variants = _Variants (
269295 svar_index ["POS" ].to_numpy () - 1 ,
270296 svar_index ["ILEN" ].to_numpy (),
271- VLenAlleles .from_polars (svar_index ["ALT" ]),
297+ RaggedAlleles .from_polars (svar_index ["ALT" ]),
272298 )
273299 return cls (
274300 reference = reference ,
@@ -547,7 +573,7 @@ def _get_haplotypes(
547573 geno_v_idxs = self .genotypes .data ,
548574 positions = self .variants .positions ,
549575 sizes = self .variants .sizes ,
550- alt_alleles = self .variants .alts .alleles .view (np .uint8 ),
576+ alt_alleles = self .variants .alts .data .view (np .uint8 ),
551577 alt_offsets = self .variants .alts .offsets ,
552578 ref = self .reference .reference ,
553579 ref_offsets = self .reference .offsets ,
0 commit comments