Extract authors/contributors as objects with name and orcid.

fbacall · fbacall · commit 135ce24ef909 · 2026-03-05T10:31:30.000Z
diff --git a/lib/tess/rdf/event_extractor.rb b/lib/tess/rdf/event_extractor.rb
@@ -22,7 +22,7 @@ def extract_params
 
         params[:capacity] = extract_value(RDF::Vocab::SCHEMA.maximumAttendeeCapacity)
 
-        contact = extract_person(RDF::Vocabulary::Term.new('http://schema.org/contact', attributes: {}))
+        contact = extract_contact(RDF::Vocabulary::Term.new('http://schema.org/contact', attributes: {}))
         params[:contact] = [contact[:name], contact[:email]].compact.join(' - ') if contact
 
         legacy_topics = extract_values(RDF::Vocabulary::Term.new('http://schema.org/topic', attributes: {}))
diff --git a/lib/tess/rdf/extraction.rb b/lib/tess/rdf/extraction.rb
@@ -105,7 +105,34 @@ def extract_names(predicate, subject: resource)
               [:thing, RDF::Vocab::SCHEMA.name, :name]).map { |r| r[:name] }.compact.uniq.sort
       end
 
-      def extract_person(predicate, subject: resource)
+      # Extract the people linked from +subject+ by +predicate+.
+      #
+      # Every entry carries :name; :orcid is added when an ORCID iD is found in the
+      # person node's own identifier (@id) or in its schema:identifier value. When
+      # both contain one, the schema:identifier match wins. Exact duplicates are dropped.
+      #
+      # @param predicate property pointing at the people, e.g. RDF::Vocab::SCHEMA.author
+      # @param subject node to start from (defaults to +resource+)
+      # @return [Array<Hash>] e.g. [{ name: 'Jane Doe', orcid: '0000-0002-1825-0097' }]
+      def extract_people(predicate, subject: resource)
+        query([subject, predicate, :person],
+              [:person, RDF::Vocab::SCHEMA.name, :name],
+              [:person, RDF::Vocab::SCHEMA.identifier, :identifier, { optional: true }]).map do |solution|
+          person = { name: solution[:name] }
+          # @id is checked first and identifier second, so identifier overrides @id.
+          %i[person identifier].each do |attr|
+            next unless solution[attr]
+
+            # to_s so matching does not depend on the bound value being a String.
+            match = solution[attr].to_s.match(/(\d{4}-\d{4}-\d{4}-\d{3}[\dx])/i)
+            # The regex tolerates a lower-case checksum; store the canonical upper-case "X".
+            person[:orcid] = match[1].upcase if match
+          end
+          person
+        end.uniq
+      end
+
+      def extract_contact(predicate, subject: resource)
         query([subject, predicate, :person],
               [:person, RDF::Vocab::SCHEMA.name, :name, { optional: true }],
               [:person, RDF::Vocab::SCHEMA.email, :email, { optional: true }]).first
diff --git a/lib/tess/rdf/material_extractor.rb b/lib/tess/rdf/material_extractor.rb
@@ -20,8 +20,8 @@ def extract_params
         params[:date_published] = extract_value(RDF::Vocab::SCHEMA.datePublished)
         params[:status] = extract_value(RDF::Vocab::SCHEMA.creativeWorkStatus)
 
-        params[:authors] = (extract_names(RDF::Vocab::SCHEMA.author) | extract_names(RDF::Vocab::SIOC.has_creator)).sort
-        params[:contributors] = extract_names(RDF::Vocab::SCHEMA.contributor)
+        params[:authors] = (extract_people(RDF::Vocab::SCHEMA.author) | extract_names(RDF::Vocab::SIOC.has_creator).sort)
+        params[:contributors] = extract_people(RDF::Vocab::SCHEMA.contributor)
         params[:target_audience] = extract_audience
         params[:resource_type] = extract_values(RDF::Vocab::SCHEMA.learningResourceType)
         params[:external_resources] = extract_mentions
diff --git a/test/extraction_test.rb b/test/extraction_test.rb
@@ -84,10 +84,13 @@ class ExtractionTest < Test::Unit::TestCase
     refute params.key?(:scientific_topic_names)
     refute params.key?(:scientific_topic_uris)
     assert_equal ["assembly"], params[:keywords]
-    assert_equal ["Helena Rasche", "Saskia Hiltemann", "Simon Gladman"], params[:authors]
+    people = [{ name: 'Simon Gladman'},
+              { name: 'Helena Rasche', orcid: '0000-0001-9760-8992'},
+              { name: 'Saskia Hiltemann', orcid: '0000-0003-3803-468X'}].sort_by { |p| p[:name] }
+    assert_equal people, params[:authors].sort_by { |p| p[:name] }
     assert_equal ["Students"], params[:target_audience]
     assert_equal ["hands-on tutorial"], params[:resource_type]
-    assert_equal ["Helena Rasche", "Saskia Hiltemann", "Simon Gladman"], params[:contributors]
+    assert_equal people, params[:contributors].sort_by { |p| p[:name] }
     assert_equal "Beginner", params[:difficulty_level]
   end
 
@@ -107,10 +110,12 @@ class ExtractionTest < Test::Unit::TestCase
     refute params.key?(:scientific_topic_names)
     assert_equal ["http://edamontology.org/topic_3174"], params[:scientific_topic_uris]
     refute params.key?(:keywords)
-    assert_equal ["Bérénice Batut", "Saskia Hiltemann"], params[:authors]
+    people = [{ name: 'Bérénice Batut', orcid: '0000-0001-9852-1987'},
+              { name: 'Saskia Hiltemann', orcid: '0000-0003-3803-468X'}].sort_by { |p| p[:name] }
+    assert_equal people, params[:authors].sort_by { |p| p[:name] }
     assert_equal ["Students"], params[:target_audience]
     assert_equal ["slides"], params[:resource_type]
-    assert_equal ["Bérénice Batut", "Saskia Hiltemann"], params[:contributors]
+    assert_equal people, params[:contributors].sort_by { |p| p[:name] }
     assert params[:node_names].include?('Belgium')
   end
 
@@ -285,7 +290,7 @@ class ExtractionTest < Test::Unit::TestCase
     assert_equal ['ELIXIR RIR', 'BridgeDb'], params[:keywords]
     assert_equal 'https://bioconductor.org/packages/release/bioc/vignettes/BridgeDbR/inst/doc/AGPL-3', params[:licence]
     assert_equal '1.17.5', params[:version]
-    assert_equal ['Egon Willighagen'], params[:authors]
+    assert_equal [{ name: 'Egon Willighagen', orcid: '0000-0001-7542-0286' }], params[:authors]
   end
 
   test 'extract event from legacy Edinburgh Genomics Event markup' do
diff --git a/test/field_test.rb b/test/field_test.rb
@@ -280,6 +280,92 @@ class FieldTest < Test::Unit::TestCase
     assert_equal 'de', course_instance_extractor(json).send(:extract_language)
   end
 
+  test 'extract authors' do # extract_people: where an ORCID is (and is not) picked up from
+    json = %(
+[{
+  "@context": "https://schema.org/",
+  "@type": "LearningResource",
+  "http://purl.org/dc/terms/conformsTo": {
+    "@id": "https://bioschemas.org/profiles/TrainingMaterial/1.0-RELEASE",
+    "@type": "CreativeWork"
+  },
+  "name": "Dummy Material",
+  "author": {
+      "@type": "Person",
+      "@id": "https://orcid.org/0000-0001-9062-6303",
+      "name": "Patricia Palagi"
+  }
+}])
+    authors = learning_resource_extractor(json).send(:extract_people, RDF::Vocab::SCHEMA.author)
+    assert_equal 1, authors.length
+    assert_equal '0000-0001-9062-6303', authors.first[:orcid] # taken from the person's @id
+    assert_equal 'Patricia Palagi', authors.first[:name]
+
+    json = %(
+[{
+  "@context": "https://schema.org/",
+  "@type": "LearningResource",
+  "http://purl.org/dc/terms/conformsTo": {
+    "@id": "https://bioschemas.org/profiles/TrainingMaterial/1.0-RELEASE",
+    "@type": "CreativeWork"
+  },
+  "name": "Dummy Material",
+  "author": {
+      "@type": "Person",
+      "@id": "#aperson",
+      "name": "Patricia Palagi",
+      "identifier": "https://orcid.org/0000-0001-9062-6303"
+  }
+}])
+    authors = learning_resource_extractor(json).send(:extract_people, RDF::Vocab::SCHEMA.author)
+    assert_equal 1, authors.length
+    assert_equal '0000-0001-9062-6303', authors.first[:orcid] # taken from an `identifier` URL
+    assert_equal 'Patricia Palagi', authors.first[:name]
+
+    json = %(
+[{
+  "@context": "https://schema.org/",
+  "@type": "LearningResource",
+  "http://purl.org/dc/terms/conformsTo": {
+    "@id": "https://bioschemas.org/profiles/TrainingMaterial/1.0-RELEASE",
+    "@type": "CreativeWork"
+  },
+  "name": "Dummy Material",
+  "author": {
+      "@type": "Person",
+      "@id": "#aperson",
+      "name": "Thomas B. Hickey",
+      "identifier": "  0000-0002-1694-233X  "
+  }
+}])
+    authors = learning_resource_extractor(json).send(:extract_people, RDF::Vocab::SCHEMA.author)
+    assert_equal 1, authors.length
+    assert_equal '0000-0002-1694-233X', authors.first[:orcid] # bare iD padded with whitespace
+    assert_equal 'Thomas B. Hickey', authors.first[:name]
+
+
+    json = %(
+[{
+  "@context": "https://schema.org/",
+  "@type": "LearningResource",
+  "http://purl.org/dc/terms/conformsTo": {
+    "@id": "https://bioschemas.org/profiles/TrainingMaterial/1.0-RELEASE",
+    "@type": "CreativeWork"
+  },
+  "name": "Dummy Material",
+  "author": {
+      "@type": "Person",
+      "@id": "#aperson",
+      "name": "Patricia Palagi",
+      "identifier": "https://something.that.isnt.an.orcid"
+  }
+}])
+    authors = learning_resource_extractor(json).send(:extract_people, RDF::Vocab::SCHEMA.author)
+    assert_equal 1, authors.length
+    assert_nil authors.first[:orcid] # neither @id nor identifier contains an ORCID
+    assert_equal 'Patricia Palagi', authors.first[:name]
+  end
+
   private
 
   def course_extractor(fixture, format: :jsonld, base_uri: 'https://example.com/my.json')