Skip to content

Commit 4b2374c

Browse files
committed
Separate HF types and add data
1 parent 37f0782 commit 4b2374c

7 files changed

Lines changed: 96 additions & 24 deletions

File tree

src/projspec/content/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Contents classes - information declared in project specs"""
22

33
from projspec.content.base import BaseContent
4-
from projspec.content.data import FrictionlessData, IntakeSource
4+
from projspec.content.data import TabularData, IntakeSource
55
from projspec.content.env_var import EnvironmentVariables
66
from projspec.content.environment import Environment, Stack, Precision
77
from projspec.content.executable import Command
@@ -12,7 +12,7 @@
1212

1313
__all__ = [
1414
"BaseContent",
15-
"FrictionlessData",
15+
"TabularData",
1616
"IntakeSource",
1717
"EnvironmentVariables",
1818
"Command",

src/projspec/content/data.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55

66

77
@dataclass
8-
class FrictionlessData(BaseContent):
9-
"""A datapackage spec, as defined by frictionlessdata
8+
class TabularData(BaseContent):
9+
"""A tabular dataset, columns and rows
1010
1111
This lists loadable tabular files with defined schema, typically from formats such as
1212
JSON, CSV, and parquet.
@@ -15,7 +15,13 @@ class FrictionlessData(BaseContent):
1515
"""
1616

1717
name: str
18-
schema: dict = field(default_factory=dict)
18+
metadata: dict = field(default_factory=dict)
19+
# allowed schema formats:
20+
# - dtype-like {fieldname: string-type}
21+
# - dtype-complex {fieldname: {...}}
22+
# - list-like [{"name": ..., ...}], one mapping per field
23+
# We may choose to normalise to just one of these eventually
24+
schema: dict | list = field(default_factory=dict)
1925

2026

2127
@dataclass

src/projspec/content/metadata.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,13 @@ class DescriptiveMetadata(BaseContent):
1515
meta: dict[str, str] = field(default_factory=dict)
1616

1717

18+
@dataclass
19+
class Citation(BaseContent):
20+
"""A citation for the project, or associated publication"""
21+
22+
meta: dict[str, str] = field(default_factory=dict)
23+
24+
1825
@dataclass
1926
class License(BaseContent):
2027
"""A legal description of what the given project (code and other assets) can be used for.

src/projspec/proj/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from projspec.proj.node import JLabExtension, Node, Yarn
1717
from projspec.proj.pixi import Pixi
1818
from projspec.proj.poetry import Poetry
19-
from projspec.proj.published import Citation, Zenodo
19+
from projspec.proj.published import Cited, Zenodo
2020
from projspec.proj.pyscript import PyScript
2121
from projspec.proj.python_code import PythonCode, PythonLibrary
2222
from projspec.proj.rust import Rust, RustPython
@@ -30,7 +30,7 @@
3030
"ProjectSpec",
3131
"AIEnabled",
3232
"Briefcase",
33-
"Citation",
33+
"Cited",
3434
"CondaRecipe",
3535
"CondaProject",
3636
"Golang",

src/projspec/proj/datapackage.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def match(self) -> bool:
1515
return "datapackage.json" in self.proj.basenames
1616

1717
def parse(self) -> None:
18-
from projspec.content import DescriptiveMetadata, License, FrictionlessData
18+
from projspec.content import DescriptiveMetadata, License, TabularData
1919

2020
import json
2121

@@ -36,7 +36,7 @@ def parse(self) -> None:
3636
)
3737
if "resources" in conf:
3838
self.contents["frictionless_data"] = [
39-
FrictionlessData(
39+
TabularData(
4040
proj=self.proj,
4141
name=_["name"],
4242
schema=_.get("schema", {}),

src/projspec/proj/hf.py

Lines changed: 70 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,10 @@ class HuggingFaceRepo(ProjectSpec):
2929
# Dataset names are the same as the repo names in HF.
3030

3131
def match(self) -> bool:
32-
readme = f"{self.proj.url}/README.md"
33-
return self.proj.fs.exists(readme)
32+
return "README.md" in self.proj.basenames
3433

3534
def parse(self) -> None:
36-
from projspec.content.metadata import DescriptiveMetadata, License
35+
from projspec.content.metadata import DescriptiveMetadata, License, Citation
3736
import yaml
3837

3938
with self.get_file("README.md") as f:
@@ -45,17 +44,41 @@ def parse(self) -> None:
4544
meta = yaml.safe_load(StringIO(meta))
4645
except yaml.YAMLError:
4746
raise ParseFailed
47+
if {
48+
"dataset_info",
49+
"source_datasets",
50+
"task_categories",
51+
"task_ids",
52+
}.intersection(meta):
53+
raise ParseFailed("README.md is a dataset card")
54+
4855
if "licence" in meta:
4956
self.contents["license"] = License(
5057
proj=self.proj,
5158
shortname=meta["licence"],
5259
fullname=meta.get("license_name"),
5360
url=meta.get("license_link"),
5461
)
62+
for tag in meta.get("tags", []):
63+
if tag.startswith("arxiv:"):
64+
self._contents.setdefault("citations", []).append(
65+
Citation(
66+
proj=self.proj, meta=dict(arxiv=tag.removeprefix("arxiv:"))
67+
)
68+
)
69+
# TODO: datasets are links to other repos
5570
self.contents["descriptive_metadata"] = DescriptiveMetadata(
5671
proj=self.proj,
5772
meta={
58-
k: meta[k] for k in ["language", "library_name", "tags"] if k in meta
73+
k: meta[k]
74+
for k in [
75+
"language",
76+
"library_name",
77+
"tags",
78+
"base_model",
79+
"new_version",
80+
]
81+
if k in meta
5982
},
6083
)
6184

@@ -85,11 +108,7 @@ class HuggingFaceDataset(ProjectSpec):
85108
86109
A HuggingFace dataset repo is identified by a ``README.md`` whose YAML
87110
front-matter contains at least one dataset-specific key (e.g.
88-
``task_categories``, ``dataset_info``, ``size_categories``). The card
89-
format is defined at
90-
https://huggingface.co/docs/hub/datasets-cards and the full metadata
91-
specification is at
92-
https://github.com/huggingface/hub-docs/blob/main/datasetcard.md
111+
``task_categories``, ``dataset_info``, ``size_categories``).
93112
94113
Parsed contents
95114
---------------
@@ -109,7 +128,8 @@ def match(self) -> bool:
109128

110129
def parse(self) -> None:
111130
import yaml
112-
from projspec.content.metadata import DescriptiveMetadata, License
131+
from projspec.content.metadata import DescriptiveMetadata, License, Citation
132+
from projspec.content.data import TabularData
113133

114134
try:
115135
with self.get_file("README.md") as f:
@@ -125,6 +145,8 @@ def parse(self) -> None:
125145
raise ParseFailed(f"Invalid YAML front-matter: {exc}") from exc
126146
if not isinstance(meta, dict):
127147
raise ParseFailed("YAML front-matter did not parse to a mapping")
148+
if {"library_name", "base_model", "new_version"}.intersection(meta):
149+
raise ParseFailed("README.md is a model card")
128150

129151
if "license" in meta:
130152
self._contents["license"] = License(
@@ -133,7 +155,15 @@ def parse(self) -> None:
133155
fullname=meta.get("license_name", "unknown"),
134156
url=meta.get("license_link", ""),
135157
)
136-
158+
for tag in meta.get("tags", []):
159+
if tag.startswith("arxiv:"):
160+
self._contents.setdefault("citations", []).append(
161+
Citation(
162+
proj=self.proj, meta=dict(arxiv=tag.removeprefix("arxiv:"))
163+
)
164+
)
165+
166+
# TODO: source_datasets are links to other datasets
137167
descriptive_keys = [
138168
"pretty_name",
139169
"language",
@@ -151,6 +181,35 @@ def parse(self) -> None:
151181
proj=self.proj,
152182
meta=card_meta,
153183
)
184+
if datasets := meta.get("dataset_info"):
185+
# only including configured tabular data for now
186+
if "config_name" in datasets[0]:
187+
self._contents["tabular_data"] = [
188+
TabularData(
189+
name=data["config_name"],
190+
proj=self.proj,
191+
schema=data["features"],
192+
metadata={
193+
k: data[k]
194+
for k in ("splits", "download_size", "dataset_size")
195+
if k in data
196+
},
197+
)
198+
for data in datasets
199+
if "features" in data
200+
]
201+
else:
202+
if "features" in datasets:
203+
self._contents["tabular_data"] = TabularData(
204+
name="data",
205+
proj=self.proj,
206+
schema=datasets["features"],
207+
metadata={
208+
k: datasets[k]
209+
for k in ("splits", "download_size", "dataset_size")
210+
if k in datasets
211+
},
212+
)
154213

155214
@staticmethod
156215
def _create(path: str) -> None:

src/projspec/proj/published.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
import yaml
22

3-
from projspec.proj.base import ProjectSpec
3+
from projspec.proj.base import ProjectExtra
44

55

6-
class Citation(ProjectSpec):
6+
class Cited(ProjectExtra):
77
"""A github-specified format to say how this project should be cited."""
88

99
spec_doc = "https://citation-file-format.github.io/"
@@ -12,7 +12,7 @@ def match(self):
1212
return "CITATION.cff" in self.proj.basenames
1313

1414
def parse(self) -> None:
15-
from projspec.content.metadata import DescriptiveMetadata
15+
from projspec.content.metadata import Citation
1616

1717
with self.proj.fs.open(self.proj.basenames["CITATION.cff"], "rt") as f:
1818
meta = yaml.safe_load(f)
@@ -21,7 +21,7 @@ def parse(self) -> None:
2121
)
2222

2323

24-
class Zenodo(ProjectSpec):
24+
class Zenodo(ProjectExtra):
2525
"""This project has been published on Zenodo"""
2626

2727
spec_doc = "https://help.zenodo.org/docs/github/describe-software/zenodo-json/"

0 commit comments

Comments
 (0)