Skip to content

Commit e5c27db

Browse files
committed
Add support for Bioschemas types and draft types namespaces
1 parent e66e4c1 commit e5c27db

File tree

10 files changed

+6430
-1
lines changed

10 files changed

+6430
-1
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ vendor
1111
.jekyll-metadata
1212
workspace.code-workspace
1313
/Gemfile
14-
/Gemfile.lock
14+
/Gemfile.lock
15+
.ipynb_checkpoints

_includes/head.html

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,4 +35,14 @@
3535
<script src="{{ 'assets/js/bootstrap.bundle.min.js' | relative_url }}"></script>
3636
<script src="{{ 'assets/js/toc.js' | relative_url }}"></script>
3737
<script defer src="{{'assets/js/all.min.js' | relative_url }}"></script>
38+
<!-- Signposting -->
39+
{% if page.signposting %}
40+
{% for item in page.signposting %}
41+
{% if item.rel == "type" %}
42+
<link rel="{{item.rel}}" type="application/ld+json" href="{{item.link}}">
43+
{% else %}
44+
<link rel="{{item.rel}}" href="{{item.link}}">
45+
{% endif %}
46+
{% endfor %}
47+
{% endif %}
3848
</head>

notebooks/BioschemasTypes.ipynb

Lines changed: 355 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,355 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 1,
6+
"id": "9a444332-ef86-46cf-a110-3f8cf66c1c2b",
7+
"metadata": {},
8+
"outputs": [
9+
{
10+
"name": "stdout",
11+
"output_type": "stream",
12+
"text": [
13+
"Requirement already satisfied: rdflib in c:\\software\\iniforge3\\lib\\site-packages (7.1.3)\n",
14+
"Requirement already satisfied: pyparsing<4,>=2.1.0 in c:\\software\\iniforge3\\lib\\site-packages (from rdflib) (3.2.1)\n"
15+
]
16+
}
17+
],
18+
"source": [
19+
"import sys\n",
20+
"!{sys.executable} -m pip install rdflib"
21+
]
22+
},
23+
{
24+
"cell_type": "markdown",
25+
"id": "06db8876-5d45-4cf2-9251-4cf2a83287fc",
26+
"metadata": {},
27+
"source": [
28+
"## Modify context and remove unnecessary keys from the JSON-LD file"
29+
]
30+
},
31+
{
32+
"cell_type": "code",
33+
"execution_count": 122,
34+
"id": "26fc24a1-fa8d-4de5-b57f-36fa8fb0c935",
35+
"metadata": {},
36+
"outputs": [],
37+
"source": [
38+
"import urllib.request, json"
39+
]
40+
},
41+
{
42+
"cell_type": "code",
43+
"execution_count": 123,
44+
"id": "0080eaab-f809-43ba-a6dd-989b8809dac2",
45+
"metadata": {},
46+
"outputs": [],
47+
"source": [
def remove_key(key_to_remove, data):
    """Delete ``key_to_remove`` from ``data`` in place, if it is present.

    Args:
        key_to_remove: Key to drop.
        data: Mapping to modify. Anything that is not a mutable mapping
            (a string, a list, ``None``, ...) is left untouched.

    Returns:
        None. ``data`` is modified in place.
    """
    # Imported here so the cell stays runnable on its own.
    from collections.abc import MutableMapping

    # On a str or a list, `key in data` is a substring/element test and the
    # `del data[key]` that would follow raises TypeError, so only real
    # mappings are touched.
    if isinstance(data, MutableMapping):
        data.pop(key_to_remove, None)
52+
]
53+
},
54+
{
55+
"cell_type": "code",
56+
"execution_count": 124,
57+
"id": "302448cc-c8e8-4e45-bef9-951002e9f43b",
58+
"metadata": {},
59+
"outputs": [],
60+
"source": [
def replace_string_in_json(search_value, replace_value, data):
    """Replace ``search_value`` with ``replace_value`` in every string value of a JSON tree.

    The tree is walked recursively and modified in place. String values of
    dictionaries and string elements of lists are rewritten; dictionary keys
    are never changed. Non-container input (str, number, ``None``) is ignored.

    Args:
        search_value: Substring to look for.
        replace_value: Text that replaces each occurrence of ``search_value``.
        data: Decoded JSON (dict or list, arbitrarily nested).

    Returns:
        None. ``data`` is modified in place.
    """
    if isinstance(data, dict):
        entries = data.items()
    elif isinstance(data, list):
        # Index the list so that plain string elements can be written back;
        # recursing into a bare string would silently do nothing.
        entries = enumerate(data)
    else:
        return

    for key, value in entries:
        if isinstance(value, str):
            if search_value in value:
                # Assigning to an existing key/index does not resize the
                # container, so this is safe while iterating.
                data[key] = value.replace(search_value, replace_value)
        elif isinstance(value, (dict, list)):
            replace_string_in_json(search_value, replace_value, value)
71+
]
72+
},
73+
{
74+
"cell_type": "code",
75+
"execution_count": 148,
76+
"id": "a16b30ca-a972-4ca2-89d0-89d07ced82c9",
77+
"metadata": {},
78+
"outputs": [],
79+
"source": [
def process_context(types_data, bs_ns, bs_url):
    """Rewrite the ``@context`` of a DDE types document in place.

    All DDE-specific Bioschemas prefixes and the ``schema`` prefix are removed,
    then the canonical ``bioschemas`` prefix, the requested ``bs_ns`` prefix and
    an ``https`` ``schema`` prefix are added.

    Args:
        types_data: Decoded JSON-LD document; must hold a JSON object under
            ``"@context"``.
        bs_ns: Prefix to register. It is only added when it is still missing
            after the canonical ``bioschemas`` prefix has been set.
        bs_url: IRI bound to ``bs_ns`` when that prefix is added.

    Returns:
        None. ``types_data["@context"]`` is modified in place.

    Raises:
        KeyError: ``types_data`` has no ``"@context"`` entry.
        TypeError: ``"@context"`` is not a JSON object (dict).
    """
    context = types_data["@context"]
    if not isinstance(context, dict):
        raise TypeError("@context must be a JSON object, got " + type(context).__name__)

    # Remove all DDE bioschemas namespaces. "schema" is removed as well because
    # it should point to https://schema.org rather than http://schema.org; it
    # is added back at the end.
    stale_prefixes = (
        "bioschemas",
        "bioschemasdrafts",
        "bioschemastypes",
        "bioschemastypesdrafts",
        "bioschemasdeprecated",
        "schema",
    )
    for prefix in stale_prefixes:
        context.pop(prefix, None)

    # Add the canonical Bioschemas types namespace and its URL.
    context["bioschemas"] = "https://bioschemas.org/terms/"
    # Register the requested prefix only when it is not defined yet.
    context.setdefault(bs_ns, bs_url)

    # Add schema back, now pointing to https://schema.org.
    context["schema"] = "https://schema.org/"
100+
]
101+
},
102+
{
103+
"cell_type": "code",
104+
"execution_count": 141,
105+
"id": "fc96c453-ad88-4c2b-a6be-816df8309041",
106+
"metadata": {},
107+
"outputs": [],
108+
"source": [
def process_graph(types_data, bs_ns, dde_ns):
    """Clean every node of ``types_data["@graph"]`` and move it to the new namespace.

    Args:
        types_data: Decoded JSON-LD document with an ``"@graph"`` list.
        bs_ns: New Bioschemas prefix that replaces ``dde_ns``.
        dde_ns: Old DDE prefix to be replaced.

    Returns:
        None. The graph is modified in place.
    """
    graph = types_data["@graph"]

    # additional_type is used on the website for navigation but is not needed
    # in the JSON-LD, so it is dropped from every element of the graph.
    for node in graph:
        for navigation_key in ("additional_type", "schema:additionalType"):
            remove_key(navigation_key, node)

    # Now use the new bioschemas namespace instead of the old one.
    # dde_ns is handled first: a draft prefix such as "bioschemastypesdrafts"
    # contains "bioschemastypes" and must be rewritten before the generic pass.
    renames = ((dde_ns, bs_ns), ("bioschemastypes", "bioschemas"))
    for old_ns, new_ns in renames:
        replace_string_in_json(old_ns, new_ns, graph)
118+
]
119+
},
120+
{
121+
"cell_type": "code",
122+
"execution_count": 133,
123+
"id": "5f7464e0-a992-41a7-8970-4f12dd2d4661",
124+
"metadata": {},
125+
"outputs": [],
126+
"source": [
def process_json(types_data, bs_ns, bs_url, dde_ns):
    """Rewrite a DDE types document in place: first its context, then its graph.

    Args:
        types_data: Decoded JSON-LD document to modify.
        bs_ns: New Bioschemas prefix.
        bs_url: IRI registered for ``bs_ns`` in the context.
        dde_ns: Old DDE prefix replaced inside the graph.

    Returns:
        None. ``types_data`` is modified in place.
    """
    process_context(types_data=types_data, bs_ns=bs_ns, bs_url=bs_url)
    process_graph(types_data=types_data, bs_ns=bs_ns, dde_ns=dde_ns)
130+
]
131+
},
132+
{
133+
"cell_type": "code",
134+
"execution_count": 134,
135+
"id": "259dc83d-6037-4788-9f46-8d9ffa40a3a3",
136+
"metadata": {},
137+
"outputs": [],
138+
"source": [
def process_all_json(origin, destination, bs_ns, bs_url, dde_ns):
    """Download a DDE types file, rewrite its namespaces and save it locally.

    Args:
        origin: URL of the JSON file to fetch (opened with ``urllib.request.urlopen``).
        destination: Path of the local file that receives the modified JSON.
        bs_ns: New Bioschemas prefix.
        bs_url: IRI registered for ``bs_ns``.
        dde_ns: Old DDE prefix to replace.

    Returns:
        None. The result is written to ``destination``.
    """
    # Get the bioschemas types file from the Bioschemas DDE GitHub repository.
    with urllib.request.urlopen(origin) as types_file:
        types_data = json.load(types_file)

    process_json(types_data, bs_ns, bs_url, dde_ns)

    # Save the modified JSON to a local temp file. The encoding is explicit so
    # the file does not depend on the platform's locale default.
    with open(destination, "w", encoding="utf-8") as temp_types_file:
        json.dump(types_data, temp_types_file)
149+
]
150+
},
151+
{
152+
"cell_type": "markdown",
153+
"id": "bbd8fffc-e218-407a-8e13-04ddc2c7620b",
154+
"metadata": {},
155+
"source": [
156+
"### Process JSON-LD file for Bioschemas types"
157+
]
158+
},
159+
{
160+
"cell_type": "code",
161+
"execution_count": 149,
162+
"id": "710ea645-7525-4009-bd5e-b122c475bb5c",
163+
"metadata": {},
164+
"outputs": [],
165+
"source": [
166+
"origin = \"https://github.com/BioSchemas/bioschemas-dde/raw/main/bioschemastypes.json\"\n",
167+
"destination = \"bioschemas_types_temp.json\"\n",
168+
"bs_ns = \"bioschemas\"\n",
169+
"bs_url = \"https://bioschemas.org/terms/\"\n",
170+
"dde_ns = \"bioschemastypes\"\n",
171+
"dde_url = \"https://discovery.biothings.io/view/bioschemastypes/\"\n",
172+
"w3id_url = \"https://w3id.org/bioschemas/terms/\"\n",
173+
"process_all_json(origin, destination, bs_ns, bs_url, dde_ns)"
174+
]
175+
},
176+
{
177+
"cell_type": "markdown",
178+
"id": "a4cc29b8-92ab-4473-97f4-5934780a56fc",
179+
"metadata": {},
180+
"source": [
181+
"### Process JSON-LD file for Bioschemas draft types"
182+
]
183+
},
184+
{
185+
"cell_type": "code",
186+
"execution_count": 150,
187+
"id": "7705bc06-06dc-4b53-9a35-4894c594ebd0",
188+
"metadata": {},
189+
"outputs": [],
190+
"source": [
191+
"origin = \"https://github.com/BioSchemas/bioschemas-dde/raw/main/bioschemastypesdrafts.json\"\n",
192+
"destination = \"bioschemas_draft_types_temp.json\"\n",
193+
"bs_ns = \"bioschemas_draft\"\n",
194+
"bs_url = \"https://bioschemas.org/draft_terms/\"\n",
195+
"dde_ns = \"bioschemastypesdrafts\"\n",
196+
"dde_url = \"https://discovery.biothings.io/view/bioschemastypesdrafts/\"\n",
197+
"w3id_url = \"https://w3id.org/bioschemas/draft_terms/\"\n",
198+
"process_all_json(origin, destination, bs_ns, bs_url, dde_ns)"
199+
]
200+
},
201+
{
202+
"cell_type": "markdown",
203+
"id": "4b4b51d5-7200-40f4-9844-b094507caa2b",
204+
"metadata": {},
205+
"source": [
206+
"## Add same-as to DDE and w3id"
207+
]
208+
},
209+
{
210+
"cell_type": "code",
211+
"execution_count": 2,
212+
"id": "90246e73-5b99-42b6-9c54-28a2fc147ea0",
213+
"metadata": {},
214+
"outputs": [],
215+
"source": [
216+
"from rdflib import Graph\n",
217+
"from rdflib import URIRef\n",
218+
"from rdflib.namespace import RDF, RDFS, OWL"
219+
]
220+
},
221+
{
222+
"cell_type": "code",
223+
"execution_count": 18,
224+
"id": "8f5d0652-2029-4cfd-b9cc-9faa9964214f",
225+
"metadata": {},
226+
"outputs": [],
227+
"source": [
def _add_equivalents(g, subjects, eq_predicate, names_in_schema,
                     eq_dde_type, eq_w3id_type, eq_dde_type_draft, eq_w3id_type_draft,
                     bs_ns, bs_ns_draft):
    """Add DDE, w3id and schema.org equivalents of each subject, using ``eq_predicate``."""
    eq_schema = URIRef("https://schema.org/")
    for s in subjects:
        # Local name of the term: whatever follows the last "/" of its IRI.
        sufix = str(s).split('/')[-1]
        if bs_ns in s:
            g.add((s, eq_predicate, eq_dde_type + sufix))
            g.add((s, eq_predicate, eq_w3id_type + sufix))
            # NOTE(review): the indentation of this check is not recoverable
            # from the rendered diff; it is assumed to apply only to terms in
            # the released namespace. Confirm against the notebook.
            if sufix in names_in_schema:
                g.add((s, eq_predicate, eq_schema + sufix))
        if bs_ns_draft in s:
            g.add((s, eq_predicate, eq_dde_type_draft + sufix))
            g.add((s, eq_predicate, eq_w3id_type_draft + sufix))


def add_equivalence(g, eq_dde_type, eq_w3id_type, eq_dde_type_draft, eq_w3id_type_draft, bs_ns, bs_ns_draft):
    """Add ``owl:equivalentClass`` / ``owl:equivalentProperty`` triples to ``g``.

    Subjects that have ``rdfs:Class`` as an object are linked with
    ``owl:equivalentClass``; subjects that have a ``schema:domainIncludes``
    statement are linked with ``owl:equivalentProperty``. A subject whose IRI
    contains ``bs_ns`` gets its DDE and w3id counterparts, plus the schema.org
    term when its local name is listed as already included in schema.org. A
    subject whose IRI contains ``bs_ns_draft`` gets the draft DDE and w3id
    counterparts.

    Args:
        g: rdflib graph to read from and add triples to.
        eq_dde_type: Base ``URIRef`` of the DDE released types.
        eq_w3id_type: Base ``URIRef`` of the w3id released terms.
        eq_dde_type_draft: Base ``URIRef`` of the DDE draft types.
        eq_w3id_type_draft: Base ``URIRef`` of the w3id draft terms.
        bs_ns: Bioschemas namespace IRI of released terms (plain string).
        bs_ns_draft: Bioschemas namespace IRI of draft terms (plain string).

    Returns:
        None. ``g`` is modified in place.
    """
    # Classes and properties already included in schema.org
    types_in_schema = frozenset([
        "BioChemEntity", "ChemicalSubstance", "Gene", "MolecularEntity", "Protein", "Taxon",
    ])
    properties_in_schema = frozenset([
        "bioChemInteraction", "bioChemSimilarity", "biologicalRole", "hasBioChemEntityPart",
        "hasMolecularFunction", "hasRepresentation", "isEncodedByBioChemEntity",
        "isInvolvedInBiologicalProcess", "isLocatedInSubcellularLocation", "isPartOfBioChemEntity",
        "taxonomicRange", "alternativeOf", "encodesBioChemEntity", "expressedIn",
        "hasBioPolymerSequence", "chemicalComposition", "chemicalRole", "potentialUse",
        "inChi", "inChiKey", "iupacName", "molecularFormula", "molecularWeight",
        "monoisotopicMolecularWeight", "smiles", "childTaxon", "parentTaxon", "taxonRank",
    ])

    # Classes: every subject that has rdfs:Class as an object. The subjects are
    # collected first so the graph is not modified while it is being iterated.
    rdfs_class = URIRef("http://www.w3.org/2000/01/rdf-schema#Class")
    eq_class = URIRef("http://www.w3.org/2002/07/owl#equivalentClass")
    classes = list(g.subjects(object=rdfs_class, unique=True))
    _add_equivalents(g, classes, eq_class, types_in_schema,
                     eq_dde_type, eq_w3id_type, eq_dde_type_draft, eq_w3id_type_draft,
                     bs_ns, bs_ns_draft)

    # Properties: every subject that has a schema:domainIncludes statement.
    schema_domain = URIRef("https://schema.org/domainIncludes")
    eq_prop = URIRef("http://www.w3.org/2002/07/owl#equivalentProperty")
    properties = list(g.subjects(predicate=schema_domain, unique=True))
    _add_equivalents(g, properties, eq_prop, properties_in_schema,
                     eq_dde_type, eq_w3id_type, eq_dde_type_draft, eq_w3id_type_draft,
                     bs_ns, bs_ns_draft)
264+
]
265+
},
266+
{
267+
"cell_type": "code",
268+
"execution_count": 14,
269+
"id": "c47ba42a-79a9-416c-bd63-32d72df73210",
270+
"metadata": {},
271+
"outputs": [],
272+
"source": [
def process_equivalences(g_file, json_file, ttl_file):
    """Load a graph, add the equivalence triples and save it as JSON-LD and Turtle.

    Args:
        g_file: File to load into the graph; no explicit format is passed to
            ``Graph.parse``.
        json_file: Destination path of the JSON-LD serialization.
        ttl_file: Destination path of the Turtle serialization.

    Returns:
        None. Two files are written.
    """
    # Load the graph.
    graph = Graph()
    graph.parse(g_file)

    # Add the equivalences for released types and for draft types.
    add_equivalence(
        graph,
        eq_dde_type=URIRef("https://discovery.biothings.io/view/bioschemastypes/"),
        eq_w3id_type=URIRef("https://w3id.org/bioschemas/terms/"),
        eq_dde_type_draft=URIRef("https://discovery.biothings.io/view/bioschemastypesdrafts/"),
        eq_w3id_type_draft=URIRef("https://w3id.org/bioschemas/draft_terms/"),
        # Bioschemas namespaces for types and draft types.
        bs_ns="https://bioschemas.org/terms/",
        bs_ns_draft="https://bioschemas.org/draft_terms/",
    )

    # Save as JSON-LD, then as Turtle.
    graph.serialize(destination=json_file, format="json-ld", auto_compact=True, indent=2)
    graph.serialize(destination=ttl_file, format="turtle")
297+
]
298+
},
299+
{
300+
"cell_type": "code",
301+
"execution_count": 21,
302+
"id": "7515e020-7489-4238-b879-156fe1d61003",
303+
"metadata": {},
304+
"outputs": [],
305+
"source": [
306+
"g_file = \"bioschemas_types_temp.json\"\n",
307+
"json_file=\"bioschemas_types_temp.jsonld\"\n",
308+
"ttl_file=\"bioschemas_types_temp.ttl\"\n",
309+
"process_equivalences(g_file, json_file, ttl_file)"
310+
]
311+
},
312+
{
313+
"cell_type": "code",
314+
"execution_count": 20,
315+
"id": "f02a0c63-c83c-476a-86e9-5b3e07c8142c",
316+
"metadata": {},
317+
"outputs": [],
318+
"source": [
319+
"g_file = \"bioschemas_draft_types_temp.json\"\n",
320+
"json_file=\"bioschemas_draft_types_temp.jsonld\"\n",
321+
"ttl_file=\"bioschemas_draft_types_temp.ttl\"\n",
322+
"process_equivalences(g_file, json_file, ttl_file)"
323+
]
324+
},
325+
{
326+
"cell_type": "code",
327+
"execution_count": null,
328+
"id": "fcd72bdf-31c1-4d07-bac2-941b8b00703f",
329+
"metadata": {},
330+
"outputs": [],
331+
"source": []
332+
}
333+
],
334+
"metadata": {
335+
"kernelspec": {
336+
"display_name": "Python 3 (ipykernel)",
337+
"language": "python",
338+
"name": "python3"
339+
},
340+
"language_info": {
341+
"codemirror_mode": {
342+
"name": "ipython",
343+
"version": 3
344+
},
345+
"file_extension": ".py",
346+
"mimetype": "text/x-python",
347+
"name": "python",
348+
"nbconvert_exporter": "python",
349+
"pygments_lexer": "ipython3",
350+
"version": "3.12.7"
351+
}
352+
},
353+
"nbformat": 4,
354+
"nbformat_minor": 5
355+
}

notebooks/bioschemas_draft_types_temp.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)