Skip to content

Commit c9663fa

Browse files
bpiwowar and claude
committed
feat: improve Dataset API ergonomics and document new patterns
Add Dataset.data_path class property as shortcut for __dataset__.datapath, make reference() accept dataset class as first positional arg, and add reference.config() alias for prepare(). Update documentation accordingly. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 24d5d82 commit c9663fa

3 files changed

Lines changed: 130 additions & 8 deletions

File tree

docs/source/datasets.rst

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,28 @@ Advantages of class-based definitions:
7272
3. **Auto-naming** --- resource names are auto-detected from class attribute names
7373
4. **Two-path safety** --- incomplete downloads never appear at the final path
7474

75+
Dataset Utilities
76+
-----------------
77+
78+
``Dataset.data_path``
79+
A class property that returns the ``Path`` where this dataset's data is
80+
stored on disk. This is a shortcut for ``MyDataset.__dataset__.datapath``:
81+
82+
.. code-block:: python
83+
84+
@dataset(url="http://example.com")
85+
class Documents(Dataset):
86+
...
87+
88+
@dataset(url="http://example.com")
89+
class MyTask(Dataset):
90+
DOCS = reference(Documents)
91+
92+
def config(self) -> TaskData:
93+
# Use Documents.data_path to locate sibling data
94+
store_path = Documents.data_path / "docstore"
95+
return TaskData.C(path=store_path)
96+
7597
Resource Pipelines
7698
------------------
7799

@@ -221,10 +243,40 @@ HuggingFace Integration
221243
class Squad(QADataset):
222244
HF_DATA = HFDownloader("squad_data", "squad")
223245
224-
Links to Other Datasets
225-
------------------------
246+
Referencing Other Datasets
247+
--------------------------
248+
249+
Use :py:class:`~datamaestro.download.reference` to declare a dependency on
250+
another dataset class. The referenced dataset is prepared automatically when
251+
needed:
252+
253+
.. code-block:: python
254+
255+
from datamaestro.download import reference
256+
257+
@dataset(url="http://example.com")
258+
class MyTask(Dataset):
259+
DOCUMENTS = reference(DocumentsDataset)
260+
261+
def config(self) -> TaskData:
262+
return TaskData.C(
263+
documents=self.DOCUMENTS.config(),
264+
)
265+
266+
Call ``.config()`` on the reference resource to obtain the referenced dataset's
267+
configuration object (this is equivalent to ``.prepare()``).
268+
269+
.. note::
270+
271+
``reference()`` accepts the dataset class as its first positional argument.
272+
The older keyword form ``reference(reference=DocumentsDataset)`` still works
273+
but is no longer necessary.
274+
275+
Links to Other Datasets (by ID)
276+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
226277

227-
Use :py:func:`~datamaestro.download.links.links` to reference other datasets:
278+
Use :py:func:`~datamaestro.download.links.links` to reference datasets by their
279+
string ID rather than by class:
228280

229281
.. code-block:: python
230282

src/datamaestro/definitions.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -949,6 +949,10 @@ def config(self) -> ImageClassification:
949949
train=IDX(path=self.TRAIN_IMAGES.path),
950950
test=IDX(path=self.TEST_IMAGES.path),
951951
)
952+
953+
Class-level convenience properties:
954+
955+
- ``MyDataset.data_path`` — shortcut for ``MyDataset.__dataset__.datapath``
952956
"""
953957

954958
@abstractmethod
@@ -964,6 +968,32 @@ def config(self) -> "Base":
964968
"""
965969
...
966970

971+
class _DataPath:
972+
"""Descriptor that provides ``Dataset.data_path`` as a class property.
973+
974+
Returns the ``datapath`` of the dataset wrapper (``__dataset__``),
975+
which is the directory where downloaded data is stored.
976+
"""
977+
978+
def __set_name__(self, owner, name):
979+
self.name = name
980+
981+
def __get__(self, obj, objtype=None):
982+
cls = objtype if objtype is not None else type(obj)
983+
dw = getattr(cls, "__dataset__", None)
984+
if dw is None:
985+
raise AttributeError(
986+
f"{cls.__name__} has no __dataset__; "
987+
"is the @dataset decorator applied?"
988+
)
989+
return dw.datapath
990+
991+
data_path: Path = _DataPath()
992+
"""Path to the directory where this dataset's downloaded data is stored.
993+
994+
Shortcut for ``MyDataset.__dataset__.datapath``.
995+
"""
996+
967997

968998
class metadataset(AbstractDataset):
969999
"""Annotation for object/functions which are abstract dataset definitions

src/datamaestro/download/__init__.py

Lines changed: 45 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -675,13 +675,32 @@ def __init_subclass__(cls):
675675

676676

677677
class reference(Resource):
678-
"""References another dataset instead of downloading."""
678+
"""References another dataset instead of downloading.
679679
680-
def __init__(self, varname=None, reference=None):
680+
Usage::
681+
682+
# Positional form (preferred):
683+
DOCS = reference(Documents)
684+
685+
# Keyword form:
686+
DOCS = reference(reference=Documents)
687+
688+
# With explicit varname (rarely needed — auto-set from attribute name):
689+
DOCS = reference(Documents, varname="docs")
690+
691+
In the ``config()`` method, call ``.config()`` (or ``.prepare()``)
692+
to obtain the referenced dataset's prepared configuration::
693+
694+
def config(self) -> Adhoc:
695+
return Adhoc.C(documents=self.DOCS.config())
696+
"""
697+
698+
def __init__(self, reference=None, *, varname=None):
681699
"""
682700
Args:
683-
varname: The name of the variable.
684-
reference: Another dataset to reference.
701+
reference: The dataset class (or wrapper) to reference.
702+
varname: Explicit resource name (auto-set from class attribute
703+
name if omitted).
685704
"""
686705
super().__init__(varname=varname)
687706
assert reference is not None, "Reference cannot be null"
@@ -691,7 +710,7 @@ def _resolve_reference(self):
691710
"""Resolve the reference to a DatasetWrapper.
692711
693712
For class-based datasets, the reference is the class itself with
694-
a __dataset__ attribute pointing to the DatasetWrapper.
713+
a ``__dataset__`` attribute pointing to the DatasetWrapper.
695714
For function-based datasets, the reference is already a DatasetWrapper.
696715
"""
697716
ref = self.reference
@@ -700,11 +719,32 @@ def _resolve_reference(self):
700719
return ref
701720

702721
def prepare(self):
722+
"""Return the referenced dataset's prepared configuration.
723+
724+
Resolves the reference and prepares the target: ``_prepare()`` is
724+
called on an ``AbstractDataset``, ``prepare()`` on anything else.
726+
727+
Returns:
728+
A Config instance — the result of the referenced dataset's
729+
``config()`` method.
730+
"""
703731
resolved = self._resolve_reference()
704732
if isinstance(resolved, AbstractDataset):
705733
return resolved._prepare()
706734
return resolved.prepare()
707735

736+
def config(self):
737+
"""Alias for :meth:`prepare` — returns the referenced dataset's config.
738+
739+
Preferred over ``prepare()`` for readability when used with
740+
``reference`` resources, since the return value is a configuration
741+
object (not a file path)::
742+
743+
def config(self) -> Adhoc:
744+
return Adhoc.C(documents=self.DOCS.config())
745+
"""
746+
return self.prepare()
747+
708748
def download(self, force=False):
709749
resolved = self._resolve_reference()
710750
if isinstance(resolved, AbstractDataset):

0 commit comments

Comments
 (0)