@@ -29,11 +29,10 @@ class HuggingFaceRepo(ProjectSpec):
2929 # Dataset names are the same as the repo names in HF.
3030
def match(self) -> bool:
    """Report whether this project looks like a HuggingFace repo.

    The check is a membership test for ``"README.md"`` in
    ``self.proj.basenames``. It replaces the earlier
    ``self.proj.fs.exists(f"{self.proj.url}/README.md")`` probe.

    NOTE(review): ``basenames`` is presumably the collection of entry
    names at the project root -- confirm against the project class.

    Returns:
        ``True`` when ``"README.md"`` is among ``self.proj.basenames``.
    """
    return "README.md" in self.proj.basenames
3433
3534 def parse (self ) -> None :
36- from projspec .content .metadata import DescriptiveMetadata , License
35+ from projspec .content .metadata import DescriptiveMetadata , License , Citation
3736 import yaml
3837
3938 with self .get_file ("README.md" ) as f :
@@ -45,17 +44,41 @@ def parse(self) -> None:
4544 meta = yaml .safe_load (StringIO (meta ))
4645 except yaml .YAMLError :
4746 raise ParseFailed
# --- HuggingFaceRepo.parse: interpret the loaded front-matter ``meta`` ---
if not isinstance(meta, dict):
    # Empty or scalar front-matter cannot be inspected key by key; fail
    # the parse cleanly instead of crashing on the set/lookup calls below.
    raise ParseFailed("YAML front-matter did not parse to a mapping")
if {
    "dataset_info",
    "source_datasets",
    "task_categories",
    "task_ids",
}.intersection(meta):
    # Any of these keys means the card is not handled by this spec.
    raise ParseFailed("README.md is a dataset card")

# The companion keys are spelled ``license_name`` / ``license_link`` and the
# dataset parser in this file reads ``license``; prefer that spelling and
# keep ``licence`` as a fallback so cards using it are still picked up.
license_key = next((k for k in ("license", "licence") if k in meta), None)
if license_key is not None:
    self.contents["license"] = License(
        proj=self.proj,
        shortname=meta[license_key],
        fullname=meta.get("license_name"),
        url=meta.get("license_link"),
    )

# ``tags`` may be present but null, and YAML can yield non-string items;
# only string tags of the form ``arxiv:<id>`` become citations.
for tag in meta.get("tags") or []:
    if isinstance(tag, str) and tag.startswith("arxiv:"):
        self._contents.setdefault("citations", []).append(
            Citation(proj=self.proj, meta=dict(arxiv=tag.removeprefix("arxiv:")))
        )
# TODO: datasets are links to other repos
# Copy through only the descriptive keys that the card actually provides.
self.contents["descriptive_metadata"] = DescriptiveMetadata(
    proj=self.proj,
    meta={
        k: meta[k]
        for k in [
            "language",
            "library_name",
            "tags",
            "base_model",
            "new_version",
        ]
        if k in meta
    },
)
6184
@@ -85,11 +108,7 @@ class HuggingFaceDataset(ProjectSpec):
85108
86109 A HuggingFace dataset repo is identified by a ``README.md`` whose YAML
87110 front-matter contains at least one dataset-specific key (e.g.
88- ``task_categories``, ``dataset_info``, ``size_categories``). The card
89- format is defined at
90- https://huggingface.co/docs/hub/datasets-cards and the full metadata
91- specification is at
92- https://github.com/huggingface/hub-docs/blob/main/datasetcard.md
111+ ``task_categories``, ``dataset_info``, ``size_categories``).
93112
94113 Parsed contents
95114 ---------------
@@ -109,7 +128,8 @@ def match(self) -> bool:
109128
110129 def parse (self ) -> None :
111130 import yaml
112- from projspec .content .metadata import DescriptiveMetadata , License
131+ from projspec .content .metadata import DescriptiveMetadata , License , Citation
132+ from projspec .content .data import TabularData
113133
114134 try :
115135 with self .get_file ("README.md" ) as f :
@@ -125,6 +145,8 @@ def parse(self) -> None:
125145 raise ParseFailed (f"Invalid YAML front-matter: { exc } " ) from exc
126146 if not isinstance (meta , dict ):
127147 raise ParseFailed ("YAML front-matter did not parse to a mapping" )
# ``library_name``, ``base_model`` and ``new_version`` are the keys the
# HuggingFaceRepo parser copies into its descriptive metadata, so their
# presence means this README is not handled by the dataset spec.
if {"library_name", "base_model", "new_version"}.intersection(meta):
    raise ParseFailed("README.md is a model card")
128150
129151 if "license" in meta :
130152 self ._contents ["license" ] = License (
@@ -133,7 +155,15 @@ def parse(self) -> None:
133155 fullname = meta .get ("license_name" , "unknown" ),
134156 url = meta .get ("license_link" , "" ),
135157 )
136-
# Record every ``arxiv:<id>`` tag as a citation. ``tags`` may be present
# but null, and YAML can yield non-string items, so both are skipped
# rather than allowed to raise.
for tag in meta.get("tags") or []:
    if isinstance(tag, str) and tag.startswith("arxiv:"):
        self._contents.setdefault("citations", []).append(
            Citation(proj=self.proj, meta=dict(arxiv=tag.removeprefix("arxiv:")))
        )
165+
166+ # TODO: source_datasets are links to other datasets
137167 descriptive_keys = [
138168 "pretty_name" ,
139169 "language" ,
@@ -151,6 +181,35 @@ def parse(self) -> None:
151181 proj = self .proj ,
152182 meta = card_meta ,
153183 )
# ``dataset_info`` is handled in two shapes: a list of per-config mappings,
# or a single mapping describing one config.
if datasets := meta.get("dataset_info"):
    size_keys = ("splits", "download_size", "dataset_size")
    if isinstance(datasets, dict):
        # Indexing a mapping with ``[0]`` raised KeyError before the
        # single-config branch could run; dispatch on the type instead.
        if "features" in datasets:
            self._contents["tabular_data"] = TabularData(
                # Fall back to "data" when the mapping names no config.
                name=datasets.get("config_name", "data"),
                proj=self.proj,
                schema=datasets["features"],
                metadata={k: datasets[k] for k in size_keys if k in datasets},
            )
    elif isinstance(datasets, list):
        # only including configured tabular data for now
        tables = [
            TabularData(
                name=data["config_name"],
                proj=self.proj,
                schema=data["features"],
                metadata={k: data[k] for k in size_keys if k in data},
            )
            for data in datasets
            # Skip entries that lack a name or a schema instead of raising.
            if isinstance(data, dict)
            and "config_name" in data
            and "features" in data
        ]
        if tables:
            # Leave the key unset when no entry was usable.
            self._contents["tabular_data"] = tables
154213
155214 @staticmethod
156215 def _create (path : str ) -> None :
0 commit comments