Separate HF types and add data

martindurant · martindurant · commit 4b2374c6e44d · 2026-03-31T11:29:46.000-04:00
diff --git a/src/projspec/content/__init__.py b/src/projspec/content/__init__.py
@@ -1,7 +1,7 @@
 """Contents classes - information declared in project specs"""
 
 from projspec.content.base import BaseContent
-from projspec.content.data import FrictionlessData, IntakeSource
+from projspec.content.data import TabularData, IntakeSource
 from projspec.content.env_var import EnvironmentVariables
 from projspec.content.environment import Environment, Stack, Precision
 from projspec.content.executable import Command
@@ -12,7 +12,7 @@
 
 __all__ = [
     "BaseContent",
-    "FrictionlessData",
+    "TabularData",
     "IntakeSource",
     "EnvironmentVariables",
     "Command",
diff --git a/src/projspec/content/data.py b/src/projspec/content/data.py
@@ -5,8 +5,8 @@
 
 
 @dataclass
-class FrictionlessData(BaseContent):
-    """A datapackage spec, as defined by frictionlessdata
+class TabularData(BaseContent):
+    """A tabular dataset, i.e. data arranged in columns and rows
 
     This lists loadable tabular files with defined schema, typically from formats such as
     JSON, CSV, and parquet.
@@ -15,7 +15,13 @@ class FrictionlessData(BaseContent):
     """
 
     name: str
-    schema: dict = field(default_factory=dict)
+    metadata: dict = field(default_factory=dict)
+    # Allowed schema formats:
+    #  - dtype-like: {fieldname: string-type}
+    #  - dtype-complex: {fieldname: {...}}
+    #  - list-like: [{"name": ..., ...}], one dict per field
+    # We may choose to normalise to just one of these eventually.
+    schema: dict | list = field(default_factory=dict)
 
 
 @dataclass
diff --git a/src/projspec/content/metadata.py b/src/projspec/content/metadata.py
@@ -15,6 +15,13 @@ class DescriptiveMetadata(BaseContent):
     meta: dict[str, str] = field(default_factory=dict)
 
 
+@dataclass
+class Citation(BaseContent):
+    """A citation for the project, or associated publication"""
+
+    meta: dict[str, str] = field(default_factory=dict)
+
+
 @dataclass
 class License(BaseContent):
     """A legal description of what the given project (code and other assets) can be used for.
diff --git a/src/projspec/proj/__init__.py b/src/projspec/proj/__init__.py
@@ -16,7 +16,7 @@
 from projspec.proj.node import JLabExtension, Node, Yarn
 from projspec.proj.pixi import Pixi
 from projspec.proj.poetry import Poetry
-from projspec.proj.published import Citation, Zenodo
+from projspec.proj.published import Cited, Zenodo
 from projspec.proj.pyscript import PyScript
 from projspec.proj.python_code import PythonCode, PythonLibrary
 from projspec.proj.rust import Rust, RustPython
@@ -30,7 +30,7 @@
     "ProjectSpec",
     "AIEnabled",
     "Briefcase",
-    "Citation",
+    "Cited",
     "CondaRecipe",
     "CondaProject",
     "Golang",
diff --git a/src/projspec/proj/datapackage.py b/src/projspec/proj/datapackage.py
@@ -15,7 +15,7 @@ def match(self) -> bool:
         return "datapackage.json" in self.proj.basenames
 
     def parse(self) -> None:
-        from projspec.content import DescriptiveMetadata, License, FrictionlessData
+        from projspec.content import DescriptiveMetadata, License, TabularData
 
         import json
 
@@ -36,7 +36,7 @@ def parse(self) -> None:
             )
         if "resources" in conf:
             self.contents["frictionless_data"] = [
-                FrictionlessData(
+                TabularData(
                     proj=self.proj,
                     name=_["name"],
                     schema=_.get("schema", {}),
diff --git a/src/projspec/proj/hf.py b/src/projspec/proj/hf.py
@@ -29,11 +29,10 @@ class HuggingFaceRepo(ProjectSpec):
     # Dataset names are the same as the repo names in HF.
 
     def match(self) -> bool:
-        readme = f"{self.proj.url}/README.md"
-        return self.proj.fs.exists(readme)
+        return "README.md" in self.proj.basenames
 
     def parse(self) -> None:
-        from projspec.content.metadata import DescriptiveMetadata, License
+        from projspec.content.metadata import DescriptiveMetadata, License, Citation
         import yaml
 
         with self.get_file("README.md") as f:
@@ -45,17 +44,41 @@ def parse(self) -> None:
             meta = yaml.safe_load(StringIO(meta))
         except yaml.YAMLError:
             raise ParseFailed
+        if {
+            "dataset_info",
+            "source_datasets",
+            "task_categories",
+            "task_ids",
+        }.intersection(meta):
+            raise ParseFailed("README.md is a dataset card")
+
         if "licence" in meta:
             self.contents["license"] = License(
                 proj=self.proj,
                 shortname=meta["licence"],
                 fullname=meta.get("license_name"),
                 url=meta.get("license_link"),
             )
+        for tag in meta.get("tags", []):
+            if tag.startswith("arxiv:"):
+                self._contents.setdefault("citations", []).append(
+                    Citation(
+                        proj=self.proj, meta=dict(arxiv=tag.removeprefix("arxiv:"))
+                    )
+                )
+        # TODO: datasets are links to other repos
         self.contents["descriptive_metadata"] = DescriptiveMetadata(
             proj=self.proj,
             meta={
-                k: meta[k] for k in ["language", "library_name", "tags"] if k in meta
+                k: meta[k]
+                for k in [
+                    "language",
+                    "library_name",
+                    "tags",
+                    "base_model",
+                    "new_version",
+                ]
+                if k in meta
             },
         )
 
@@ -85,11 +108,7 @@ class HuggingFaceDataset(ProjectSpec):
 
     A HuggingFace dataset repo is identified by a ``README.md`` whose YAML
     front-matter contains at least one dataset-specific key (e.g.
-    ``task_categories``, ``dataset_info``, ``size_categories``).  The card
-    format is defined at
-    https://huggingface.co/docs/hub/datasets-cards and the full metadata
-    specification is at
-    https://github.com/huggingface/hub-docs/blob/main/datasetcard.md
+    ``task_categories``, ``dataset_info``, ``size_categories``).
 
     Parsed contents
     ---------------
@@ -109,7 +128,8 @@ def match(self) -> bool:
 
     def parse(self) -> None:
         import yaml
-        from projspec.content.metadata import DescriptiveMetadata, License
+        from projspec.content.metadata import DescriptiveMetadata, License, Citation
+        from projspec.content.data import TabularData
 
         try:
             with self.get_file("README.md") as f:
@@ -125,6 +145,8 @@ def parse(self) -> None:
             raise ParseFailed(f"Invalid YAML front-matter: {exc}") from exc
         if not isinstance(meta, dict):
             raise ParseFailed("YAML front-matter did not parse to a mapping")
+        if {"library_name", "base_model", "new_version"}.intersection(meta):
+            raise ParseFailed("README.md is a dataset card")
 
         if "license" in meta:
             self._contents["license"] = License(
@@ -133,7 +155,15 @@ def parse(self) -> None:
                 fullname=meta.get("license_name", "unknown"),
                 url=meta.get("license_link", ""),
             )
-
+        for tag in meta.get("tags", []):
+            if tag.startswith("arxiv:"):
+                self._contents.setdefault("citations", []).append(
+                    Citation(
+                        proj=self.proj, meta=dict(arxiv=tag.removeprefix("arxiv:"))
+                    )
+                )
+
+        # TODO: source_datasets are links to other datasets
         descriptive_keys = [
             "pretty_name",
             "language",
@@ -151,6 +181,35 @@ def parse(self) -> None:
             proj=self.proj,
             meta=card_meta,
         )
+        if datasets := meta.get("dataset_info"):
+            # only including configured tabular data for now
+            if "config_name" in datasets[0]:
+                self._contents["tabular_data"] = [
+                    TabularData(
+                        name=data["config_name"],
+                        proj=self.proj,
+                        schema=data["features"],
+                        metadata={
+                            k: data[k]
+                            for k in ("splits", "download_size", "dataset_size")
+                            if k in data
+                        },
+                    )
+                    for data in datasets
+                    if "features" in data
+                ]
+            else:
+                if "features" in datasets:
+                    self._contents["tabular_data"] = TabularData(
+                        name="data",
+                        proj=self.proj,
+                        schema=datasets["features"],
+                        metadata={
+                            k: datasets[k]
+                            for k in ("splits", "download_size", "dataset_size")
+                            if k in datasets
+                        },
+                    )
 
     @staticmethod
     def _create(path: str) -> None:
diff --git a/src/projspec/proj/published.py b/src/projspec/proj/published.py
@@ -1,9 +1,9 @@
 import yaml
 
-from projspec.proj.base import ProjectSpec
+from projspec.proj.base import ProjectExtra
 
 
-class Citation(ProjectSpec):
+class Cited(ProjectExtra):
     """A github-specified format to say how this project should be cited."""
 
     spec_doc = "https://citation-file-format.github.io/"
@@ -12,7 +12,7 @@ def match(self):
         return "CITATION.cff" in self.proj.basenames
 
     def parse(self) -> None:
-        from projspec.content.metadata import DescriptiveMetadata
+        from projspec.content.metadata import Citation
 
         with self.proj.fs.open(self.proj.basenames["CITATION.cff"], "rt") as f:
             meta = yaml.safe_load(f)
@@ -21,7 +21,7 @@ def parse(self) -> None:
         )
 
 
-class Zenodo(ProjectSpec):
+class Zenodo(ProjectExtra):
     """This project has been published on Zenodo"""
 
     spec_doc = "https://help.zenodo.org/docs/github/describe-software/zenodo-json/"