diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 850be2b4..f698e541 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -55,9 +55,10 @@ jobs:
         run: |
           python manage.py migrate
-      - name: Load testdata to see if it is up to date with the Django migrations
+      - name: Load all test data to see if it is up to date with the Django migrations
         run: |
-          python manage.py loaddata fixtures/test_data.json
+          python manage.py loaddata fixtures/test_data_optimap.json
+          python manage.py loaddata fixtures/test_data_partners.json
       - name: Run deploy checks
         run: |
diff --git a/README.md b/README.md
index 9110e74b..64ee8999 100644
--- a/README.md
+++ b/README.md
@@ -80,10 +80,10 @@ python manage.py dumpdata --exclude=auth --exclude=contenttypes | jq > fixtures/
 #### Loading Test Data
-To load the test data into your database, run:
+To load the test data into your database, run the following command, choosing one of the existing fixtures:
 ```bash
-python manage.py loaddata fixtures/test_data.json
+python manage.py loaddata fixtures/test_data_optimap.json  # or fixtures/test_data_partners.json
 ```
 #### Adding New Test Data
diff --git a/fixtures/test_data.json b/fixtures/test_data_optimap.json
similarity index 85%
rename from fixtures/test_data.json
rename to fixtures/test_data_optimap.json
index 123637aa..b0f8dab0 100644
--- a/fixtures/test_data.json
+++ b/fixtures/test_data_optimap.json
@@ -1,7 +1,24 @@
 [
+  {
+    "model": "publications.source",
+    "pk": 9,
+    "fields": {
+      "name": "OPTIMAP Test Journal",
+      "issn_l": null,
+      "openalex_id": null,
+      "openalex_url": null,
+      "publisher_name": null,
+      "works_count": null,
+      "homepage_url": "http://optimap.science",
+      "abbreviated_title": null,
+      "is_oa": true,
+      "cited_by_count": null,
+      "is_preprint": true
+    }
+  },
   {
     "model": "publications.publication",
-    "pk": 1,
+    "pk": 900,
     "fields": {
       "status": "p",
       "title": "The First Article",
       "geometry": "SRID=4326;GEOMETRYCOLLECTION (POINT (7.595730774920725 51.96944097112328), POLYGON ((7.599984296478425 51.984257653537384, 7.5715788777530975 51.97057414651397, 7.570122189613329 51.950602187631205, 7.580319006590855 51.93825551711683, 7.609054957094401 51.93035649564658, 7.659674869951374 51.942256350721436, 7.6833460522228165 51.968514669138415, 7.665137450475669 51.99229098076532, 7.626171042736502 51.98982421450293, 7.599984296478425 51.984257653537384)))",
       "creationDate": "2022-10-24T12:10:53.086Z",
       "lastUpdate": "2022-10-24T12:10:53.086Z",
-      "source": "OPTIMAP Test Journal",
+      "source": 9,
       "timeperiod_startdate": "[\"2020-02-02\"]",
       "timeperiod_enddate": "[\"2022-02-20\"]",
       "provenance": "Manually added from file test_data.json using the Django management script."
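Note on the fixture split above: the two renamed fixtures can also be loaded and sanity-checked from a Django shell. The following is a minimal sketch, assuming migrations have already been applied; it uses only standard Django calls (`call_command`, `loaddata`) and the models and fixture names introduced in this diff. Because the pk ranges of the two files are disjoint, both can be loaded together, exactly as the CI job does.

```python
# Run inside `python manage.py shell` after `python manage.py migrate`.
from django.core.management import call_command

from publications.models import Publication, Source

# The pk ranges of the two fixtures do not overlap
# (source 9 / publications 900-902 vs. sources 1-6 / publications 101-602),
# so loading both is safe.
call_command("loaddata", "fixtures/test_data_optimap.json")
call_command("loaddata", "fixtures/test_data_partners.json")

# "source" is now a ForeignKey instead of the old free-text column, so every
# publication should resolve to a Source row (or None for orphaned drafts).
for pub in Publication.objects.select_related("source"):
    print(pub.pk, pub.title, "->", pub.source.name if pub.source else None)

print(Source.objects.count(), "sources loaded")
```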
@@ -20,7 +37,7 @@ }, { "model": "publications.publication", - "pk": 2, + "pk": 901, "fields": { "status": "p", "title": "Paper Two", @@ -31,7 +48,7 @@ "geometry": "SRID=4326;GEOMETRYCOLLECTION (LINESTRING (9.754609563397707 52.36630414438588, 9.813062794192035 52.41569645624003, 10.141300167111496 52.36904961184797, 10.518997966087937 52.330597538337116, 10.838242534270051 52.311358956793185, 11.058566250338231 52.220550088821824, 11.535184901427073 52.15714903642342, 12.272594889905236 52.24258143981572, 12.618817872299417 52.35532056817789, 12.911084026269464 52.2976119913985, 13.144896949445211 52.50063147184562, 13.396695482095708 52.517051586549286))", "creationDate": "2022-10-24T12:10:53.086Z", "lastUpdate": "2022-10-24T12:10:53.086Z", - "source": "OPTIMAP Test Journal", + "source": 9, "timeperiod_startdate": "[\"2010-01-01\"]", "timeperiod_enddate": "[\"2012-12-12\"]", "provenance": "Manually added from file test_data.json using the Django management script." @@ -39,7 +56,7 @@ }, { "model": "publications.publication", - "pk": 3, + "pk": 902, "fields": { "status": "p", "title": "Paper 3", @@ -50,7 +67,7 @@ "geometry": "SRID=4326;GEOMETRYCOLLECTION(POLYGON ((13.558502 50.990421, 13.558502 51.094036, 13.864746 51.094036, 13.864746 50.990421, 13.558502 50.990421)))", "creationDate": "2022-10-24T12:10:53.086Z", "lastUpdate": "2022-10-24T12:10:53.086Z", - "source": "OPTIMAP Test Journal", + "source": 9, "timeperiod_startdate": "[\"2023\"]", "timeperiod_enddate": "[\"2024\"]", "provenance": "Manually added from file test_data.json using the Django management script." diff --git a/fixtures/test_data_partners.json b/fixtures/test_data_partners.json new file mode 100644 index 00000000..a2b2fcbb --- /dev/null +++ b/fixtures/test_data_partners.json @@ -0,0 +1,533 @@ +[ + { + "model": "publications.source", + "pk": 1, + "fields": { + "name": "Volcanica", + "issn_l": "2610-3540", + "openalex_id": "https://openalex.org/S26103540", + "openalex_url": "https://openalex.org/S26103540", + "publisher_name": "Volcanica Society", + "works_count": 12, + "is_oa": false, + "cited_by_count": null + } + }, + { + "model": "publications.publication", + "pk": 101, + "fields": { + "status": "p", + "title": "Eruption Dynamics of New Fissure on Reykjanes Peninsula", + "abstract": "High-resolution analysis of lava flow progression.", + "publicationDate": "2023-01-15", + "doi": "10.5710/volcanica.12345", + "url": "https://www.jvolcanica.org/ojs/index.php/volcanica/article/view/12345", + "geometry": "SRID=4326;GEOMETRYCOLLECTION(POLYGON((-10 35, 30 35, 30 60, -10 60, -10 35)))", + "creationDate": "2024-06-10T08:00:00Z", + "lastUpdate": "2024-06-10T08:00:00Z", + "source": 1, + "timeperiod_startdate": ["2022-05-01"], + "timeperiod_enddate": ["2022-10-01"], + "provenance": "Imported from DOAJ on 2024-06-10." 
+ } + }, + { + "model": "publications.publication", + "pk": 102, + "fields": { + "status": "d", + "title": "Preliminary Survey of Volcanic Gas Emissions", + "abstract": "", + "publicationDate": null, + "doi": null, + "url": null, + "geometry": null, + "creationDate": "2024-06-10T08:05:00Z", + "lastUpdate": "2024-06-10T08:05:00Z", + "source": 1, + "timeperiod_startdate": [], + "timeperiod_enddate": [], + "provenance": "Draft entry for Volcanica" + } + }, + + { + "model": "publications.source", + "pk": 2, + "fields": { + "name": "Journal of Spatial Information Science", + "issn_l": "1948-660X", + "openalex_id": null, + "openalex_url": null, + "publisher_name": null, + "works_count": null, + "homepage_url": null, + "abbreviated_title": null, + "is_oa": false, + "cited_by_count": null + } + }, + { + "model": "publications.publication", + "pk": 201, + "fields": { + "status": "p", + "title": "Integrating Lidar and Photogrammetry for Urban Mapping", + "abstract": "An end-to-end pipeline for 3D city models.", + "publicationDate": "2022-08-20", + "doi": "10.5311/JSIS.2022.08.001", + "url": "https://josis.org/index.php/josis/article/view/08-001", + "geometry": "SRID=4326;GEOMETRYCOLLECTION(POLYGON((8.681 50.112,8.683 50.113,8.684 50.111,8.681 50.112)))", + "creationDate": "2024-06-10T09:00:00Z", + "lastUpdate": "2024-06-10T09:00:00Z", + "source": 2, + "timeperiod_startdate": ["2021-01-01"], + "timeperiod_enddate": ["2021-12-31"], + "provenance": "Imported from JOSIS archive" + } + }, + { + "model": "publications.publication", + "pk": 202, + "fields": { + "status": "p", + "title": "Spatial Database Performance Benchmarks", + "abstract": "Comparing PostGIS, Oracle Spatial, and SQL Server.", + "publicationDate": "2023-03-15", + "doi": null, + "url": null, + "geometry": null, + "creationDate": "2024-06-10T09:05:00Z", + "lastUpdate": "2024-06-10T09:05:00Z", + "source": 2, + "timeperiod_startdate": [], + "timeperiod_enddate": [], + "provenance": "Manual entry – missing DOI/URL" + } + }, + + { + "model": "publications.source", + "pk": 3, + "fields": { + "name": "European Journal of Transport and Infrastructure Research", + "issn_l": "1567-7133", + "openalex_id": null, + "openalex_url": null, + "publisher_name": null, + "works_count": null, + "homepage_url": null, + "abbreviated_title": null, + "is_oa": false, + "cited_by_count": null + } + }, + { + "model": "publications.publication", + "pk": 301, + "fields": { + "status": "p", + "title": "Electrification of European Rail Networks", + "abstract": "Economic and environmental impacts of railway electrification.", + "publicationDate": "2021-05-10", + "doi": "10.1234/EJTIR.2021.05.010", + "url": "https://journals.open.tudelft.nl/ejtir/article/view/2021-05-010", + "geometry": "SRID=4326;GEOMETRYCOLLECTION(POINT(52.379 4.900))", + "creationDate": "2024-06-10T10:00:00Z", + "lastUpdate": "2024-06-10T10:00:00Z", + "source": 3, + "timeperiod_startdate": ["2020-01-01"], + "timeperiod_enddate": ["2020-12-31"], + "provenance": "Imported from EJTIR online" + } + }, + { + "model": "publications.publication", + "pk": 302, + "fields": { + "status": "p", + "title": "Modal Split Analysis in Urban Regions", + "abstract": "A survey of travel behavior patterns.", + "publicationDate": null, + "doi": null, + "url": "https://journals.open.tudelft.nl/ejtir/article/view/2022-02-015", + "geometry": null, + "creationDate": "2024-06-10T10:05:00Z", + "lastUpdate": "2024-06-10T10:05:00Z", + "source": 3, + "timeperiod_startdate": [], + "timeperiod_enddate": [], + "provenance": "URL only; 
missing DOI & geometry"
+    }
+  },
+
+  {
+    "model": "publications.source",
+    "pk": 4,
+    "fields": {
+      "name": "AGILE: GIScience Series",
+      "issn_l": "2700-8150",
+      "openalex_id": null,
+      "openalex_url": null,
+      "publisher_name": null,
+      "works_count": null,
+      "homepage_url": null,
+      "abbreviated_title": null,
+      "is_oa": false,
+      "cited_by_count": null
+    }
+  },
+  {
+    "model": "publications.publication",
+    "pk": 401,
+    "fields": {
+      "status": "p",
+      "title": "Participatory Mapping for Coastal Resilience",
+      "abstract": "Case studies from Mediterranean communities.",
+      "publicationDate": "2020-11-05",
+      "doi": "10.5194/ags-2020-05",
+      "url": "https://agile-giss.copernicus.org/articles/20/05/2020/",
+      "geometry": "SRID=4326;GEOMETRYCOLLECTION(POLYGON((3.0 43.0,3.1 43.1,3.2 43.0,3.0 43.0)))",
+      "creationDate": "2024-06-10T11:00:00Z",
+      "lastUpdate": "2024-06-10T11:00:00Z",
+      "source": 4,
+      "timeperiod_startdate": ["2019-01-01"],
+      "timeperiod_enddate": ["2019-12-31"],
+      "provenance": "Imported from Copernicus AGILE archive"
+    }
+  },
+  {
+    "model": "publications.publication",
+    "pk": 402,
+    "fields": {
+      "status": "d",
+      "title": "Edge Computing for Geoanalytics",
+      "abstract": "",
+      "publicationDate": null,
+      "doi": null,
+      "url": null,
+      "geometry": null,
+      "creationDate": "2024-06-10T11:05:00Z",
+      "lastUpdate": "2024-06-10T11:05:00Z",
+      "source": 4,
+      "timeperiod_startdate": [],
+      "timeperiod_enddate": [],
+      "provenance": "Draft entry; minimal fields"
+    }
+  },
+  {
+    "model":
"publications.source", + "pk": 5, + "fields": { + "name": "arXiv", + "issn_l": null, + "openalex_id": null, + "openalex_url": null, + "publisher_name": null, + "works_count": null, + "homepage_url": "https://arxiv.org", + "abbreviated_title": null, + "is_oa": true, + "cited_by_count": null, + "is_preprint": true + } + }, + { + "model": "publications.publication", + "pk": 501, + "fields": { + "status": "p", + "title": "Quantum Entanglement in Large-Scale Systems", + "abstract": "An exploration of entanglement scaling in quantum networks.", + "publicationDate": null, + "doi": "10.48550/arXiv.2101.00001", + "url": "https://arxiv.org/abs/2101.00001", + "geometry": null, + "creationDate": "2024-06-10T12:00:00Z", + "lastUpdate": "2024-06-10T12:00:00Z", + "source": 5, + "timeperiod_startdate": [], + "timeperiod_enddate": [], + "provenance": "Imported from arXiv" + } + }, + { + "model": "publications.publication", + "pk": 502, + "fields": { + "status": "d", + "title": "Early Results on Neural Rendering", + "abstract": "", + "publicationDate": null, + "doi": null, + "url": null, + "geometry": null, + "creationDate": "2024-06-10T12:05:00Z", + "lastUpdate": "2024-06-10T12:05:00Z", + "source": 5, + "timeperiod_startdate": [], + "timeperiod_enddate": [], + "provenance": "Draft entry for preprint test" + } + }, + { + "model": "publications.source", + "pk": 6, + "fields": { + "name": "bioRxiv", + "issn_l": null, + "openalex_id": null, + "openalex_url": null, + "publisher_name": null, + "works_count": null, + "homepage_url": "https://www.biorxiv.org", + "abbreviated_title": null, + "is_oa": true, + "cited_by_count": null, + "is_preprint": true + } + }, + { + "model": "publications.publication", + "pk": 601, + "fields": { + "status": "p", + "title": "CRISPR-based Gene Drives in Mosquito Populations", + "abstract": "Modeling gene‐drive spread through wild Anopheles populations.", + "publicationDate": "2023-09-12", + "doi": "10.1101/2023.09.12.558924", + "url": "https://www.biorxiv.org/content/10.1101/2023.09.12.558924v1", + "geometry": null, + "creationDate": "2024-06-10T13:00:00Z", + "lastUpdate": "2024-06-10T13:00:00Z", + "source": 6, + "timeperiod_startdate": ["2023-01-01"], + "timeperiod_enddate": ["2023-12-31"], + "provenance": "Imported from bioRxiv" + } + }, + { + "model": "publications.publication", + "pk": 602, + "fields": { + "status": "d", + "title": "Preprint on Single-Cell Sequencing Pipelines", + "abstract": "", + "publicationDate": null, + "doi": null, + "url": null, + "geometry": null, + "creationDate": "2024-06-10T13:05:00Z", + "lastUpdate": "2024-06-10T13:05:00Z", + "source": 6, + "timeperiod_startdate": [], + "timeperiod_enddate": [], + "provenance": "Draft entry for bioRxiv preprint test" + } + } +] diff --git a/optimap/settings.py b/optimap/settings.py index cea378da..ff13fa82 100644 --- a/optimap/settings.py +++ b/optimap/settings.py @@ -37,6 +37,8 @@ # SECURITY WARNING: don't run with debug turned on in production! 
DEBUG = env('OPTIMAP_DEBUG', default=False) +BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + ALLOWED_HOSTS = [i.strip('[]') for i in env('OPTIMAP_ALLOWED_HOST', default='*').split(',')] OPTIMAP_SUPERUSER_EMAILS = [i.strip('[]') for i in env('OPTIMAP_SUPERUSER_EMAILS', default='').split(',')] @@ -205,6 +207,7 @@ OAI_PASSWORD = env("OPTIMAP_OAI_PASSWORD", default="") EMAIL_SEND_DELAY = 2 DATA_DUMP_INTERVAL_HOURS = 6 +OPENALEX_MAILTO = "login@optimap.science" MIDDLEWARE = [ 'django.middleware.cache.UpdateCacheMiddleware', diff --git a/publications/admin.py b/publications/admin.py index ec03f1fb..be359126 100644 --- a/publications/admin.py +++ b/publications/admin.py @@ -1,3 +1,6 @@ +import logging +logger = logging.getLogger(__name__) + from django.contrib import admin, messages from leaflet.admin import LeafletGeoAdmin from publications.models import Publication, Source, HarvestingEvent, BlockedEmail, BlockedDomain @@ -21,16 +24,19 @@ def make_draft(modeladmin, request, queryset): @admin.action(description="Trigger harvesting for selected sources") def trigger_harvesting_for_specific(modeladmin, request, queryset): - user = request.user - for source in queryset: - harvest_oai_endpoint(source.id, user) - + return trigger_harvesting_for_set(queryset, request) + @admin.action(description="Trigger harvesting for all sources") def trigger_harvesting_for_all(modeladmin, request, queryset): all_sources = Source.objects.all() + return trigger_harvesting_for_set(all_sources, request) + +def trigger_harvesting_for_set(sources, request): user = request.user - for source in all_sources: - harvest_oai_endpoint(source.id, user) + + for source in sources: + added, spatial, temporal = harvest_oai_endpoint(source.id, user) + logger.info(f"Harvested {added} publications from source {source.id} ({source.url_field}) of which {spatial} have spatial data and {temporal} have temporal data.") @admin.action(description="Schedule harvesting for selected sources") def schedule_harvesting(modeladmin, request, queryset): diff --git a/publications/api.py b/publications/api.py index b0cc96b8..0509b7f4 100644 --- a/publications/api.py +++ b/publications/api.py @@ -1,10 +1,14 @@ """Publications API URL Configuration.""" from rest_framework import routers - -from publications.viewsets import PublicationViewSet, SubscriptionViewset +from publications.viewsets import ( SourceViewSet, + PublicationViewSet, + SubscriptionViewSet, +) router = routers.DefaultRouter() -router.register(r"publications", PublicationViewSet) -router.register(r"subscriptions", SubscriptionViewset, basename='subscription') +router.register(r"sources", SourceViewSet, basename="source") +router.register(r"publications", PublicationViewSet, basename="publication") +router.register(r"subscriptions", SubscriptionViewSet, basename="subscription") + urlpatterns = router.urls diff --git a/publications/management/__init__.py b/publications/management/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/publications/management/commands/__init__.py b/publications/management/commands/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/publications/management/commands/sync_source_metadata.py b/publications/management/commands/sync_source_metadata.py new file mode 100644 index 00000000..4a5d4428 --- /dev/null +++ b/publications/management/commands/sync_source_metadata.py @@ -0,0 +1,112 @@ +# publications/management/commands/sync_source_metadata.py + +import logging +import time +import socket +import os 
+from django.core.management.base import BaseCommand +from django.contrib.gis.geos import Point +from geopy.geocoders import Nominatim +from geopy.exc import GeocoderServiceError +from publications.models import Source +import requests + +from pyalex import Sources # optional, install pyalex for client support + +logger = logging.getLogger(__name__) + +ISSN_ENDPOINT = "https://api.openalex.org/sources/issn:{issn}" + +class Command(BaseCommand): + help = "Full sync: metadata + geolocation + works list from OpenAlex." + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.geolocator = Nominatim(user_agent="optimap-sync") + + def fetch_metadata(self, issn: str) -> dict | None: + # Try PyAlex first + try: + client = Sources() + return client.get_single_source(issn, id_type="issn") + except Exception: + pass + + # Fallback to HTTP + try: + resp = requests.get(ISSN_ENDPOINT.format(issn=issn), timeout=10) + if resp.status_code == 302 and "Location" in resp.headers: + resp = requests.get(resp.headers["Location"], timeout=10) + if resp.status_code == 200: + return resp.json() + except requests.RequestException as e: + logger.debug("HTTP metadata fetch failed for %s: %s", issn, e) + return None + + def handle(self, *args, **options): + # DNS check + try: + ip = socket.gethostbyname("api.openalex.org") + self.stdout.write(f"DNS: api.openalex.org → {ip}") + except socket.error as e: + self.stderr.write(f"DNS lookup failed: {e}") + return + if ip.startswith(("127.", "10.", "192.168.", "172.16.", "::1")): + self.stderr.write("OpenAlex resolves to private IP—aborting.") + return + + session = requests.Session() + session.trust_env = False + + for src in Source.objects.exclude(issn_l__isnull=True): + self.stdout.write(f"Syncing ISSN={src.issn_l}") + data = self.fetch_metadata(src.issn_l) + if not data: + self.stderr.write(f"{src.issn_l}: no metadata\n") + continue + + defaults = { + "openalex_id": data.get("id"), + "openalex_url": data.get("id"), + "publisher_name": (data.get("host_organization") or {}).get("display_name") + or data.get("display_name"), + } + + # geolocation from OpenAlex + loc = data.get("location", {}) + lat, lon = loc.get("lat"), loc.get("lon") + if lat and lon: + defaults["geometry"] = Point(lon, lat) + elif not src.geometry: + # fallback geocode by name + try: + geo = self.geolocator.geocode(defaults["publisher_name"]) + if geo: + defaults["geometry"] = Point(geo.longitude, geo.latitude) + except GeocoderServiceError as ge: + logger.debug("Geocoding failed: %s", ge) + + # save metadata & geometry + src, _ = Source.objects.update_or_create(issn_l=src.issn_l, defaults=defaults) + self.stdout.write(f"{src.issn_l}: metadata & geo synced") + + # fetch works list + source_id = src.openalex_id.rstrip("/").split("/")[-1] + resp = session.get( + "https://api.openalex.org/works", + params={"filter": f"locations.source.id:{source_id}", "per-page": 100}, + timeout=30, + headers={"Accept": "application/json"}, + ) + if resp.status_code == 200: + results = resp.json().get("results", []) + ids = [w["id"] for w in results if w.get("id")] + src.articles = ids + src.save(update_fields=["articles"]) + self.stdout.write(f"{src.issn_l}: fetched {len(ids)} works") + else: + logger.warning("Works fetch %s → %s", resp.status_code, resp.text) + + time.sleep(0.2) + + self.stdout.write("Full sync complete.") diff --git a/publications/management/commands/update_openalex_journals.py b/publications/management/commands/update_openalex_journals.py new file mode 100644 index 
00000000..25314e89 --- /dev/null +++ b/publications/management/commands/update_openalex_journals.py @@ -0,0 +1,81 @@ +# publications/management/commands/update_openalex_journals.py + +import logging +import requests + +from django.core.management.base import BaseCommand +from django.db.models import Q +from publications.models import Source + +logger = logging.getLogger(__name__) + +ISSN_ENDPOINT = "https://api.openalex.org/sources/issn:{issn}" +SEARCH_ENDPOINT = "https://api.openalex.org/sources" + +def fetch_by_issn(issn: str) -> dict | None: + try: + resp = requests.get(ISSN_ENDPOINT.format(issn=issn), timeout=10) + if resp.status_code == 302 and "Location" in resp.headers: + resp = requests.get(resp.headers["Location"], timeout=10) + if resp.status_code == 200: + return resp.json() + except requests.RequestException as e: + logger.debug("ISSN lookup failed for %s: %s", issn, e) + return None + +def fetch_by_name(name: str) -> dict | None: + try: + resp = requests.get(SEARCH_ENDPOINT, + params={"filter": f"display_name.search:{name}"}, + timeout=10) + resp.raise_for_status() + results = resp.json().get("results", []) + return results[0] if results else None + except requests.RequestException as e: + logger.debug("Name lookup failed for %s: %s", name, e) + return None + +class Command(BaseCommand): + help = "Update Source metadata from OpenAlex (ISSN-based or name lookup)." + + def handle(self, *args, **options): + qs = Source.objects.filter(Q(issn_l__isnull=False) | Q(is_preprint=True)) + total = qs.count() + self.stdout.write(f"Found {total} source(s) to update.\n") + + for src in qs: + key = src.issn_l or src.name + self.stdout.write(f"[{key}] querying OpenAlex…") + + # log the ISSN or name we're using + logger.info("Fetching source metadata for %s", key) + + # fetch metadata + data = fetch_by_issn(src.issn_l) if src.issn_l else fetch_by_name(src.name) + if not data: + logger.info("Skipped ISSN=%s: no OpenAlex data", src.issn_l) + continue + + changed = False + def safe_upd(field: str, new): + nonlocal changed + old = getattr(src, field, None) + if new and new != old: + logger.info("ISSN=%s: %s changed %r → %r", src.issn_l, field, old, new) + setattr(src, field, new) + changed = True + + safe_upd("openalex_url", data.get("id")) + safe_upd("works_count", data.get("works_count")) + # host_organization may be nested under "host_organization" + host = data.get("host_organization") or {} + publisher = host.get("display_name") or data.get("display_name") + safe_upd("publisher_name", publisher) + + if changed: + src.save() + self.stdout.write(f"[{key}] saved\n") + else: + self.stdout.write(f"[{key}] nothing changed\n") + + self.stdout.write("Done updating OpenAlex metadata.") diff --git a/publications/migrations/0001_initial.py b/publications/migrations/0001_initial.py index 4f69d144..cf8b6388 100644 --- a/publications/migrations/0001_initial.py +++ b/publications/migrations/0001_initial.py @@ -1,4 +1,4 @@ -# Generated by Django 5.1.7 on 2025-04-08 10:02 +# Generated by Django 5.1.9 on 2025-06-17 09:37 import django.contrib.auth.models import django.contrib.auth.validators @@ -21,6 +21,31 @@ class Migration(migrations.Migration): ] operations = [ + migrations.CreateModel( + name='Source', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('url_field', models.URLField(max_length=999)), + ('harvest_interval_minutes', models.IntegerField(default=4320)), + ('last_harvest', models.DateTimeField(auto_now_add=True, null=True)), + 
('collection_name', models.CharField(blank=True, max_length=255, null=True)), + ('tags', models.CharField(blank=True, max_length=1024, null=True)), + ('is_preprint', models.BooleanField(default=False)), + ('name', models.CharField(max_length=255)), + ('issn_l', models.CharField(blank=True, max_length=9, null=True)), + ('openalex_id', models.CharField(blank=True, max_length=50, null=True)), + ('openalex_url', models.URLField(blank=True, max_length=512, null=True)), + ('publisher_name', models.CharField(blank=True, max_length=255, null=True)), + ('works_count', models.IntegerField(blank=True, null=True)), + ('homepage_url', models.URLField(blank=True, max_length=512, null=True)), + ('abbreviated_title', models.CharField(blank=True, max_length=255, null=True)), + ('is_oa', models.BooleanField(default=False)), + ('cited_by_count', models.IntegerField(blank=True, null=True)), + ], + options={ + 'ordering': ['name'], + }, + ), migrations.CreateModel( name='CustomUser', fields=[ @@ -35,8 +60,6 @@ class Migration(migrations.Migration): ('is_staff', models.BooleanField(default=False, help_text='Designates whether the user can log into this admin site.', verbose_name='staff status')), ('is_active', models.BooleanField(default=True, help_text='Designates whether this user should be treated as active. Unselect this instead of deleting accounts.', verbose_name='active')), ('date_joined', models.DateTimeField(default=django.utils.timezone.now, verbose_name='date joined')), - ('deleted', models.BooleanField(default=False)), - ('deleted_at', models.DateTimeField(blank=True, null=True)), ('groups', models.ManyToManyField(blank=True, related_name='publications_users', to='auth.group')), ('user_permissions', models.ManyToManyField(blank=True, related_name='publications_users_permissions', to='auth.permission')), ], @@ -81,19 +104,6 @@ class Migration(migrations.Migration): ('sent_by', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to=settings.AUTH_USER_MODEL)), ], ), - migrations.CreateModel( - name='Source', - fields=[ - ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('creationDate', models.DateTimeField(auto_now_add=True)), - ('lastUpdate', models.DateTimeField(auto_now=True)), - ('url_field', models.URLField(max_length=999)), - ('harvest_interval_minutes', models.IntegerField(default=4320)), - ('last_harvest', models.DateTimeField(auto_now_add=True, null=True)), - ('created_by', django_currentuser.db.models.fields.CurrentUserField(default=django_currentuser.middleware.get_current_authenticated_user, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='%(app_label)s_%(class)s_creator', to=settings.AUTH_USER_MODEL, verbose_name='Created by')), - ('updated_by', django_currentuser.db.models.fields.CurrentUserField(default=django_currentuser.middleware.get_current_authenticated_user, null=True, on_delete=django.db.models.deletion.CASCADE, on_update=True, related_name='%(app_label)s_%(class)s_updater', to=settings.AUTH_USER_MODEL, verbose_name='Updated by')), - ], - ), migrations.CreateModel( name='HarvestingEvent', fields=[ @@ -139,7 +149,6 @@ class Migration(migrations.Migration): ('creationDate', models.DateTimeField(auto_now_add=True)), ('lastUpdate', models.DateTimeField(auto_now=True)), ('doi', models.CharField(blank=True, max_length=1024, null=True, unique=True)), - ('source', models.CharField(blank=True, max_length=4096, null=True)), ('provenance', models.TextField(blank=True, null=True)), 
('publicationDate', models.DateField(blank=True, null=True)), ('abstract', models.TextField(blank=True, null=True)), @@ -150,6 +159,7 @@ class Migration(migrations.Migration): ('created_by', django_currentuser.db.models.fields.CurrentUserField(default=django_currentuser.middleware.get_current_authenticated_user, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='%(app_label)s_%(class)s_creator', to=settings.AUTH_USER_MODEL, verbose_name='Created by')), ('job', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='publications', to='publications.harvestingevent')), ('updated_by', django_currentuser.db.models.fields.CurrentUserField(default=django_currentuser.middleware.get_current_authenticated_user, null=True, on_delete=django.db.models.deletion.CASCADE, on_update=True, related_name='%(app_label)s_%(class)s_updater', to=settings.AUTH_USER_MODEL, verbose_name='Updated by')), + ('source', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='publications', to='publications.source')), ], options={ 'ordering': ['-id'], diff --git a/publications/migrations/0002_source_collection_name_source_tags.py b/publications/migrations/0002_source_collection_name_source_tags.py deleted file mode 100644 index a8ed90e0..00000000 --- a/publications/migrations/0002_source_collection_name_source_tags.py +++ /dev/null @@ -1,33 +0,0 @@ -# Generated by Django 5.1.7 on 2025-04-21 19:25 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("publications", "0001_initial"), - ] - - operations = [ - migrations.AddField( - model_name="source", - name="collection_name", - field=models.CharField( - blank=True, - help_text="Identifier for a set or group of journals (e.g., 'Health Journals', 'TestBatch_Apr2025').", - max_length=255, - null=True, - ), - ), - migrations.AddField( - model_name="source", - name="tags", - field=models.CharField( - blank=True, - help_text="Comma-separated tags to provide additional context", - max_length=1024, - null=True, - ), - ), - ] diff --git a/publications/migrations/0003_remove_customuser_deleted_and_more.py b/publications/migrations/0003_remove_customuser_deleted_and_more.py deleted file mode 100644 index 37f01934..00000000 --- a/publications/migrations/0003_remove_customuser_deleted_and_more.py +++ /dev/null @@ -1,21 +0,0 @@ -# Generated by Django 5.1.9 on 2025-05-21 13:35 - -from django.db import migrations - - -class Migration(migrations.Migration): - - dependencies = [ - ('publications', '0002_source_collection_name_source_tags'), - ] - - operations = [ - migrations.RemoveField( - model_name='customuser', - name='deleted', - ), - migrations.RemoveField( - model_name='customuser', - name='deleted_at', - ), - ] diff --git a/publications/models.py b/publications/models.py index 9b928046..d176bd78 100644 --- a/publications/models.py +++ b/publications/models.py @@ -1,16 +1,19 @@ +import logging + +from django.contrib.auth.models import AbstractUser, Group, Permission from django.contrib.gis.db import models from django.contrib.postgres.fields import ArrayField +from django.conf import settings from django_currentuser.db.models import CurrentUserField from django_q.models import Schedule from django.utils.timezone import now from django.contrib.auth.models import AbstractUser, Group, Permission -from django.utils.timezone import now -# handle import/export relations, see 
https://django-import-export.readthedocs.io/en/stable/advanced_usage.html#creating-non-existent-relations from import_export import fields, resources from import_export.widgets import ForeignKeyWidget -from django.conf import settings +from django.core.exceptions import ValidationError +from stdnum.issn import is_valid as is_valid_issn +from django.contrib.gis.db import models as gis_models -import logging logger = logging.getLogger(__name__) STATUS_CHOICES = ( @@ -26,15 +29,12 @@ class CustomUser(AbstractUser): user_permissions = models.ManyToManyField(Permission, related_name="publications_users_permissions", blank=True) class Publication(models.Model): - # required fields title = models.TextField() status = models.CharField(max_length=1, choices=STATUS_CHOICES, default="d") - created_by = CurrentUserField( # see useful hint at https://github.com/zsoldosp/django-currentuser/issues/69 + created_by = CurrentUserField( verbose_name=("Created by"), related_name="%(app_label)s_%(class)s_creator", ) - - # automatic fields creationDate = models.DateTimeField(auto_now_add=True) lastUpdate = models.DateTimeField(auto_now=True) updated_by = CurrentUserField( @@ -42,103 +42,47 @@ class Publication(models.Model): related_name="%(app_label)s_%(class)s_updater", on_update=True, ) - - # optional fields + doi = models.CharField(max_length=1024, unique=True, blank=True, null=True) - source = models.CharField(max_length=4096, null=True, blank=True) # journal, conference, preprint repo, .. + source = models.ForeignKey('Source', on_delete=models.SET_NULL, null=True, related_name='publications') provenance = models.TextField(null=True, blank=True) publicationDate = models.DateField(null=True, blank=True) abstract = models.TextField(null=True, blank=True) url = models.URLField(max_length=1024, null=True, blank=True, unique=True) - geometry = models.GeometryCollectionField(verbose_name='Publication geometry/ies', srid = 4326, null=True, blank=True)# https://docs.openalex.org/api-entities/sources + geometry = models.GeometryCollectionField( + verbose_name='Publication geometry/ies', srid=4326, null=True, blank=True + ) timeperiod_startdate = ArrayField(models.CharField(max_length=1024, null=True), null=True, blank=True) timeperiod_enddate = ArrayField(models.CharField(max_length=1024, null=True), null=True, blank=True) - - # Linking to HarvestingEvent as "job" job = models.ForeignKey( - 'HarvestingEvent', - on_delete=models.CASCADE, - related_name='publications', - null=True, - blank=True + 'HarvestingEvent', on_delete=models.CASCADE, related_name='publications', null=True, blank=True ) - - def get_absolute_url(self): - return "/api/v1/publications/%i.json" % self.id - # http://localhost:8000/api/v1/publications/5.json - class Meta: ordering = ['-id'] constraints = [ models.UniqueConstraint(fields=['doi', 'url'], name='unique_publication_entry') ] - def __str__(self): - """Return string representation.""" return self.title -class Source(models.Model): - # automatic fields - creationDate = models.DateTimeField(auto_now_add=True) - lastUpdate = models.DateTimeField(auto_now=True) - created_by = CurrentUserField( - verbose_name=("Created by"), - related_name="%(app_label)s_%(class)s_creator", - ) - updated_by = CurrentUserField( - verbose_name=("Updated by"), - related_name="%(app_label)s_%(class)s_updater", - on_update=True, - ) - - url_field = models.URLField(max_length = 999) - harvest_interval_minutes = models.IntegerField(default=60*24*3) - last_harvest = models.DateTimeField(auto_now_add=True,null=True) - - 
collection_name = models.CharField( - max_length=255, - blank=True, - null=True, - help_text="Identifier for a set or group of journals (e.g., 'Health Journals', 'TestBatch_Apr2025')." - ) - tags = models.CharField( - max_length=1024, - blank=True, - null=True, - help_text="Comma-separated tags to provide additional context" - ) - - def save(self, *args, **kwargs): - super().save(*args, **kwargs) - Schedule.objects.filter(name=f"Harvest Source {self.id}").delete() # Avoid duplicates - Schedule.objects.create( - func='publications.tasks.harvest_oai_endpoint', - args=str(self.id), - schedule_type=Schedule.MINUTES, - minutes=self.harvest_interval_minutes, - name=f"Harvest Source {self.id}", - ) - - class Subscription(models.Model): user = models.ForeignKey(CustomUser, on_delete=models.CASCADE, related_name="subscriptions", null=True, blank=True) name = models.CharField(max_length=4096, default="default_subscription") - search_term = models.CharField(max_length=4096,null=True) + search_term = models.CharField(max_length=4096, null=True) timeperiod_startdate = models.DateField(null=True) timeperiod_enddate = models.DateField(null=True) region = models.GeometryCollectionField(null=True, blank=True) - subscribed = models.BooleanField(default=True) - - def __str__(self): - """Return string representation.""" - return self.name + subscribed = models.BooleanField(default=True) class Meta: ordering = ['name'] verbose_name = "subscription" + def __str__(self): + return self.name + class EmailLog(models.Model): TRIGGER_CHOICES = [ ("admin", "Admin Panel"), @@ -151,8 +95,8 @@ class EmailLog(models.Model): email_content = models.TextField(blank=True, null=True) sent_by = models.ForeignKey(CustomUser, null=True, blank=True, on_delete=models.SET_NULL) trigger_source = models.CharField(max_length=50, choices=TRIGGER_CHOICES, default="manual") - status = models.CharField(max_length=10, choices=STATUS_CHOICES, default="success") - error_message = models.TextField(null=True, blank=True) + status = models.CharField(max_length=10, choices=STATUS_CHOICES, default="success") + error_message = models.TextField(null=True, blank=True) def __str__(self): sender = self.sent_by.email if self.sent_by else "System" @@ -160,55 +104,36 @@ def __str__(self): @classmethod def log_email(cls, recipient, subject, content, sent_by=None, trigger_source="manual", status="success", error_message=None): - """Logs the sent email, storing who triggered it and how it was sent.""" cls.objects.create( recipient_email=recipient, subject=subject, sent_at=now(), email_content=content, sent_by=sent_by, - trigger_source=trigger_source, - status=status, - error_message=error_message, - + trigger_source=trigger_source, + status=status, + error_message=error_message, ) -class PublicationResource(resources.ModelResource): - #created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name='username') - #updated_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, related_name='username') - created_by = fields.Field( - column_name='created_by', - attribute='created_by', - widget=ForeignKeyWidget(CustomUser, field='username')) - updated_by = fields.Field( - column_name='updated_by', - attribute='updated_by', - widget=ForeignKeyWidget(settings.AUTH_USER_MODEL, field='username')) - - class Meta: - model = Publication - fields = ('created_by','updated_by',) - class HarvestingEvent(models.Model): source = models.ForeignKey('Source', on_delete=models.CASCADE, related_name='harvesting_events') - user = 
models.ForeignKey(CustomUser, on_delete=models.SET_NULL, null=True, blank=True) - started_at = models.DateTimeField(auto_now_add=True) - completed_at = models.DateTimeField(null=True, blank=True) + user = models.ForeignKey(CustomUser, on_delete=models.SET_NULL, null=True, blank=True) + started_at = models.DateTimeField(auto_now_add=True) + completed_at = models.DateTimeField(null=True, blank=True) status = models.CharField( max_length=16, - choices=( + choices=[ ('pending', 'Pending'), ('in_progress', 'In Progress'), ('completed', 'Completed'), ('failed', 'Failed'), - ), + ], default='pending' - ) + ) def __str__(self): return f"Harvesting Event ({self.status}) for {self.source.url_field} at {self.started_at}" - class UserProfile(models.Model): user = models.OneToOneField(CustomUser, on_delete=models.CASCADE) notify_new_manuscripts = models.BooleanField(default=False) @@ -231,3 +156,47 @@ class BlockedDomain(models.Model): def __str__(self): return self.domain + +class Source(models.Model): + url_field = models.URLField(max_length=999) + harvest_interval_minutes = models.IntegerField(default=60*24*3) + last_harvest = models.DateTimeField(auto_now_add=True, null=True) + collection_name = models.CharField(max_length=255, blank=True, null=True) + tags = models.CharField(max_length=1024, blank=True, null=True) + is_preprint = models.BooleanField(default=False) + name = models.CharField(max_length=255) + issn_l = models.CharField(max_length=9, blank=True, null=True) + openalex_id = models.CharField(max_length=50, blank=True, null=True) + openalex_url = models.URLField(max_length=512, blank=True, null=True) + publisher_name = models.CharField(max_length=255, blank=True, null=True) + works_count = models.IntegerField(blank=True, null=True) + homepage_url = models.URLField(max_length=512, blank=True, null=True) + abbreviated_title = models.CharField(max_length=255, blank=True, null=True) + + is_oa = models.BooleanField(default=False) + cited_by_count = models.IntegerField(blank=True, null=True) + + class Meta: + ordering = ['name'] + + def __str__(self): + return self.name + + @property + def works_api_url(self) -> str | None: + if not self.openalex_id: + return None + source_id = self.openalex_id.rstrip('/').split('/')[-1] + return f"https://api.openalex.org/works?filter=primary_location.source.id:{source_id}" + + def save(self, *args, **kwargs): + super().save(*args, **kwargs) + Schedule.objects.filter(name=f"Harvest Source {self.id}").delete() + Schedule.objects.create( + func='publications.tasks.harvest_oai_endpoint', + args=str(self.id), + schedule_type=Schedule.MINUTES, + minutes=self.harvest_interval_minutes, + name=f"Harvest Source {self.id}", + ) +Journal = Source \ No newline at end of file diff --git a/publications/serializers.py b/publications/serializers.py index cf6f663a..c84f66b7 100644 --- a/publications/serializers.py +++ b/publications/serializers.py @@ -1,47 +1,87 @@ """publications serializers.""" -from rest_framework_gis import serializers -from .models import Publication +from rest_framework import serializers +from rest_framework_gis.serializers import GeoFeatureModelSerializer +from rest_framework import serializers as drf_serializers +from .models import Publication, Subscription, Source from django.contrib.auth import get_user_model -User = get_user_model() -from publications.models import Publication,Subscription -from django.contrib.auth import get_user_model User = get_user_model() -class PublicationSerializer(serializers.GeoFeatureModelSerializer): - """publication 
GeoJSON serializer.""" +class SourceSerializer(serializers.ModelSerializer): + class Meta: + model = Source + fields = ( + "id", + "name", + "issn_l", + "openalex_id", + "openalex_url", + "publisher_name", + "works_count", + "works_api_url", + ) + + + +class PublicationSerializer(GeoFeatureModelSerializer): + source_details = serializers.SerializerMethodField() class Meta: - """publication serializer meta class.""" model = Publication - fields = ("id", "title" ,"abstract", "publicationDate", "url", "doi", "creationDate", "lastUpdate", "timeperiod_startdate", "timeperiod_enddate") geo_field = "geometry" - auto_bbox = True - -class SubscriptionSerializer(serializers.GeoFeatureModelSerializer): - """Subscription GeoJSON serializer.""" + auto_bbox = True + fields = [ + "id", + "title", + "abstract", + "publicationDate", + "doi", + "url", + "timeperiod_startdate", + "timeperiod_enddate", + "source_details", + ] + + def get_source_details(self, obj): + source = obj.source + if not source: + return {} + return SourceSerializer(source, context=self.context).data +class SubscriptionSerializer(GeoFeatureModelSerializer): class Meta: model = Subscription - fields = ("search_term","timeperiod_startdate","timeperiod_enddate","user") + fields = ( + "id", + "user", + "name", + "search_term", + "timeperiod_startdate", + "timeperiod_enddate", + "region", + "subscribed", + ) + geo_field = "region" auto_bbox = True - -class EmailChangeSerializer(serializers.ModelSerializer): + + +class EmailChangeSerializer(serializers.ModelSerializer): """Handles email change requests.""" class Meta: model = User - fields = ['email'] + fields = ["email"] def validate_email(self, value): """Ensure the new email is not already in use.""" if User.objects.filter(email=value).exists(): - raise serializers.ValidationError("This email is already registered.") + raise drf_serializers.ValidationError("This email is already registered.") return value -class UserSerializer(serializers.ModelSerializer): + +class UserSerializer(drf_serializers.ModelSerializer): class Meta: model = User - fields = ["id", "username", "email"] + fields = ["id", "username", "email"] diff --git a/publications/static/css/main.css b/publications/static/css/main.css index b8311a90..2a9d9d7d 100644 --- a/publications/static/css/main.css +++ b/publications/static/css/main.css @@ -175,3 +175,9 @@ main { background: #fff; } +.leaflet-popup-content { + max-width: 250px !important; + white-space: normal; + word-wrap: break-word; + overflow-wrap: break-word; +} diff --git a/publications/static/js/main.js b/publications/static/js/main.js index 369d1937..bfbd4141 100644 --- a/publications/static/js/main.js +++ b/publications/static/js/main.js @@ -1,86 +1,138 @@ -const dataCopyright = " | Publication metadata license: CC-0"; -const publications_url = '/api/v1/publications.json?limit=999999'; +// publications/static/js/main.js -async function initMap() { - var map = L.map("map"); - - var osmLayer = L.tileLayer('https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png', { - attribution: 'Map data: © OpenStreetMap contributors' + dataCopyright, - maxZoom: 18 - }).addTo(map); - - //var esriWorldImageryLayer = L.tileLayer('https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}', { - // attribution: 'Tiles © Esri — Source: Esri, i-cubed, USDA, USGS, AEX, GeoEye, Getmapping, Aerogrid, IGN, IGP, UPR-EGP, and the GIS User Community' + dataCopyright, - // maxZoom: 18 - //}).addTo(map); - - var baseLayers = { - "OpenStreetMap": osmLayer, - //"Esri 
World Imagery": esriWorldImageryLayer
-    };
-
-    var publicationsGroup = new L.FeatureGroup();
-    map.addLayer(publicationsGroup);
-
-    var overlayMaps = {
-        "Publications": publicationsGroup
-    };
-
-    L.control.scale({ position: 'bottomright' }).addTo(map);
-    L.control.layers(baseLayers, overlayMaps).addTo(map);
-
-    var publications = await load_publications();
-    var publicationsLayer = L.geoJSON(publications, {
-        onEachFeature: publicationPopup
-    })
-    publicationsLayer.eachLayer(
-        function (l) {
-            publicationsGroup.addLayer(l);
-        });
+
+// Leaflet map initialization and popup rendering for publication points
+
+// 1. Load all publications from the API
+async function load_publications() {
+    const response = await fetch(publications_url);
+    const body = await response.json();
+    console.log(`OPTIMAP retrieved ${body.count} results.`);
+    return body.results;
+}
+
+// 2. Once the DOM is ready, initialize the map
+$(document).ready(function() {
+    initMap();
+});
+// API URL and copyright attribution
+const publications_url = '/api/v1/publications/?limit=999999';
+const dataCopyright =
+    " | Publication metadata license: CC-0";
+
+async function initMap() {
+    const map = L.map('map');
+
+    // Base layer: OpenStreetMap
+    const osmLayer = L.tileLayer(
+        'https://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png',
+        {
+            attribution:
+                'Map data: © OpenStreetMap contributors' +
+                dataCopyright,
+            maxZoom: 18,
+        }
+    ).addTo(map);
+
+    // Group to hold all publication markers
+    const publicationsGroup = new L.FeatureGroup().addTo(map);
+
+    // Controls: scale and layer switcher
+    L.control.scale({ position: 'bottomright' }).addTo(map);
+    L.control
+        .layers(
+            { 'OpenStreetMap': osmLayer },
+            { Publications: publicationsGroup }
+        )
+        .addTo(map);
+
+    // Fetch data and add to map
+    const pubs = await load_publications();
+    const pubsLayer = L.geoJSON(pubs, {
+        style: feature => ({
+            color: feature.properties.source_details.is_preprint ? 'orange' : 'blue',
+            weight: 3,
+            fillOpacity: 0.2,
+        }),
+        onEachFeature: publicationPopup
+    });
+    pubsLayer.eachLayer((layer) => publicationsGroup.addLayer(layer));
+
+    // Fit map to markers
+    if (publicationsGroup.getBounds().isValid()) {
         map.fitBounds(publicationsGroup.getBounds());
+    }
 }

+// 3. Popup content generator for each publication feature
 function publicationPopup(feature, layer) {
-    var popupContent = '<div>';
-    if (feature.properties['title']) {
-        popupContent += '<h3>'+ feature.properties['title']+'</h3>'
-    }
-    if (feature.properties['timeperiod_startdate'] && feature.properties['timeperiod_enddate']) {
-        popupContent += '<div>' + '<b>' + "Timeperiod : " + '</b>' + " "+ "from" + " "+ feature.properties['timeperiod_startdate'] + " " + "to" + " " + feature.properties['timeperiod_enddate'] +'</div>';
-    }
-    if (feature.properties['abstract']) {
-        popupContent += '<p>'+ feature.properties['abstract']+ '</p>'
-    }
-    if (feature.properties['url']) {
-        popupContent += '<a href="' + feature.properties['url'] + '" target="_blank">' + "Visit Article" + '</a>';
-    }
-    if (feature.properties && feature.properties.popupContent) {
-        popupContent += feature.properties.popupContent;
-    }
-    popupContent += '</div>';
-    layer.bindPopup(popupContent, {
-        maxHeight: 225
-    });
-}
-
-async function load_publications() {
-    response = await fetch(publications_url);
-    body = await response.json();
-    console.log('OPTIMAP retrieved ' + body.count + ' results.');
-    return body.results;
-}
-
-// render publications after page is loaded
-$(function () {
-    initMap();
-});
+    const p = feature.properties;
+    let html = '<div>';
+
+    // Title
+    if (p.title) html += `<h3>${p.title}</h3>`;
+
+    // Source details from nested object
+    if (p.source_details) {
+        const s = p.source_details;
+
+        // Display name
+        const name = s.display_name || s.name || 'Unknown';
+        html += `<div>Source: ${name}</div>`;
+
+        // Abbreviated title
+        if (s.abbreviated_title) {
+            html += `<div><em>${s.abbreviated_title}</em></div>`;
+        }
+
+        // Homepage link
+        if (s.homepage_url) {
+            html += `<div><a href="${s.homepage_url}" target="_blank">Visit journal site</a></div>`;
+        }
+
+        // ISSN-L link
+        if (s.issn_l) {
+            html +=
+                `<div>ISSN-L: ` +
+                `<a href="https://portal.issn.org/resource/ISSN/${s.issn_l}" target="_blank">${s.issn_l}</a></div>`;
+        }
+
+        // Publisher (only if different from display name)
+        if (s.publisher_name && s.publisher_name !== name) {
+            html += `<div>Publisher: ${s.publisher_name}</div>`;
+        }
+
+        // Open access status
+        if ('is_oa' in s) {
+            const status = s.is_oa ? 'Open Access' : 'Closed Access';
+            html += `<div>Access: ${status}</div>`;
+        }
+
+        // Citation count
+        if (s.cited_by_count != null) {
+            html += `<div>Cited by ${s.cited_by_count} works</div>`;
+        }
+
+        // Works count
+        if (s.works_count != null) {
+            html += `<div>${s.works_count} works hosted</div>`;
+        }
+    }
+
+    // Time period
+    if (p.timeperiod_startdate && p.timeperiod_enddate) {
+        html +=
+            `<div>Timeperiod: from ${p.timeperiod_startdate} to ${p.timeperiod_enddate}</div>`;
+    }
+    // Abstract
+    if (p.abstract) html += `<p>${p.abstract}</p>`;
+    // Article link
+    if (p.url) {
+        html += `<div><a href="${p.url}" target="_blank">Visit Article</a></div>`;
+    }
+    html += '</div>
'; + layer.bindPopup(html, { maxWidth: 300, maxHeight: 250 }); +} diff --git a/publications/tasks.py b/publications/tasks.py index 9b576fc4..09e907f5 100644 --- a/publications/tasks.py +++ b/publications/tasks.py @@ -7,228 +7,244 @@ import gzip import re import tempfile +import glob import time import calendar -from datetime import datetime, timedelta +from datetime import datetime, timedelta, timezone as dt_timezone import xml.dom.minidom import requests +from pathlib import Path from bs4 import BeautifulSoup -from requests.auth import HTTPBasicAuth from urllib.parse import quote from django.conf import settings -from django.core.mail import send_mail, EmailMessage from django.core.serializers import serialize +from django.core.mail import send_mail, EmailMessage from django.contrib.gis.geos import GEOSGeometry, GeometryCollection from django.utils import timezone +from django_q.tasks import schedule +from django_q.models import Schedule from publications.models import Publication, HarvestingEvent, Source from .models import EmailLog, Subscription from django.contrib.auth import get_user_model -User = get_user_model() from django.urls import reverse -from urllib.parse import quote -from django_q.tasks import schedule -from django_q.models import Schedule -import glob -from pathlib import Path -from datetime import datetime, timezone as dt_timezone +from geopy.geocoders import Nominatim +from django.contrib.gis.geos import Point + +User = get_user_model() BASE_URL = settings.BASE_URL DOI_REGEX = re.compile(r'10\.\d{4,9}/[-._;()/:A-Z0-9]+', re.IGNORECASE) CACHE_DIR = Path(tempfile.gettempdir()) / 'optimap_cache' + def generate_data_dump_filename(extension: str) -> str: - """ - Returns: optimap_data_dump_YYYYMMDDThhmmss. - """ ts = datetime.now(dt_timezone.utc).strftime("%Y%m%dT%H%M%S") return f"optimap_data_dump_{ts}.{extension}" + def cleanup_old_data_dumps(directory: Path, keep: int): """ Deletes all files matching optimap_data_dump_* beyond the newest `keep` ones. 
""" pattern = str(directory / "optimap_data_dump_*") - files = sorted(glob.glob(pattern), reverse=True) # newest first + files = sorted(glob.glob(pattern), reverse=True) for old in files[keep:]: try: os.remove(old) except OSError: logger.warning("Could not delete old dump %s", old) -def extract_geometry_from_html(content): - for tag in content.find_all("meta"): - if tag.get("name", None) == "DC.SpatialCoverage": - data = tag.get("content", None) + +def extract_geometry_from_html(soup: BeautifulSoup): + for tag in soup.find_all("meta"): + if tag.get("name") == "DC.SpatialCoverage": try: - geom = json.loads(data) + geom = json.loads(tag["content"]) geom_data = geom["features"][0]["geometry"] - # preparing geometry data in accordance to geos API fields - type_geom= {'type': 'GeometryCollection'} - geom_content = {"geometries" : [geom_data]} - type_geom.update(geom_content) - geom_data_string= json.dumps(type_geom) - try : - geom_object = GEOSGeometry(geom_data_string) # GeometryCollection object - logging.debug('Found geometry: %s', geom_object) - return geom_object - except Exception as e: - logger.error("Cannot create geometry from string '%s': %s", geom_data_string, e) - except ValueError as e: - logger.error("Error loading JSON from %s: %s", tag.get("name"), e) - -def extract_timeperiod_from_html(content): - period = [None, None] - for tag in content.find_all("meta"): - if tag.get("name", None) in ['DC.temporal', 'DC.PeriodOfTime']: - data = tag.get("content", None) - period = data.split("/") - logging.debug('Found time period: %s', period) - break; - # returning arrays for array field in DB - return [period[0]], [period[1]] - -def parse_oai_xml_and_save_publications(content, event): + coll = {"type": "GeometryCollection", "geometries": [geom_data]} + return GEOSGeometry(json.dumps(coll)) + except Exception: + pass + return None + + +def extract_timeperiod_from_html(soup: BeautifulSoup): + for tag in soup.find_all("meta"): + if tag.get("name") in ("DC.temporal", "DC.PeriodOfTime"): + parts = tag["content"].split("/") + start = parts[0] if parts[0] else None + end = parts[1] if len(parts) > 1 and parts[1] else None + return ([start] if start else [None]), ([end] if end else [None]) # If missing, return [None] for start and [None] for end + return [None], [None] + +def parse_oai_xml_and_save_publications(content: bytes, event: HarvestingEvent) -> tuple[int, int, int]: + """ + Parse OAI-PMH XML, save Publication records linked to `event`, + and return counts: (added, spatial, temporal). 
+ """ try: - DOMTree = xml.dom.minidom.parseString(content) + dom = xml.dom.minidom.parseString(content) except Exception as e: logger.error("Error parsing XML: %s", e) - return + return 0, 0, 0 - collection = DOMTree.documentElement - records = collection.getElementsByTagName("record") - if not records: - logger.warning("No articles found in OAI-PMH response!") - return - for record in records: + for record in dom.getElementsByTagName("record"): try: - def get_text(tag_name): + def get_text(tag_name: str) -> str | None: nodes = record.getElementsByTagName(tag_name) - return nodes[0].firstChild.nodeValue.strip() if nodes and nodes[0].firstChild else None + return ( + nodes[0].firstChild.nodeValue.strip() + if nodes and nodes[0].firstChild else None + ) - # collect all dc:identifier values - id_nodes = record.getElementsByTagName("dc:identifier") - identifiers = [ + ids = [ n.firstChild.nodeValue.strip() - for n in id_nodes - if n.firstChild and n.firstChild.nodeValue + for n in record.getElementsByTagName("dc:identifier") + if n.firstChild ] - http_urls = [u for u in identifiers if u.lower().startswith("http")] - view_urls = [u for u in http_urls if "/view/" in u] - identifier_value = (view_urls or http_urls or [None])[0] - - title_value = get_text("dc:title") - abstract_text = get_text("dc:description") - journal_value = get_text("dc:publisher") - date_value = get_text("dc:date") - - doi_text = None - for ident in identifiers: - if match := DOI_REGEX.search(ident): - doi_text = match.group(0) + http_ids = [u for u in ids if u.lower().startswith("http")] + identifier = None + for u in http_ids: + if "/view/" in u: + identifier = u + break + if not identifier and http_ids: + identifier = http_ids[0] + + title = get_text("dc:title") + abstract = get_text("dc:description") + publisher_name = get_text("dc:publisher") + pub_date = get_text("dc:date") + + doi = None + for u in ids: + m = DOI_REGEX.search(u) + if m: + doi = m.group(0) break - if doi_text and Publication.objects.filter(doi=doi_text).exists(): - logger.info("Skipping duplicate publication (DOI): %s", doi_text) + if doi and Publication.objects.filter(doi=doi).exists(): continue - if identifier_value and Publication.objects.filter(url=identifier_value).exists(): - logger.info("Skipping duplicate publication (URL): %s", identifier_value) + if identifier and Publication.objects.filter(url=identifier).exists(): continue - # Skip records without a valid URL. 
 
-def harvest_oai_endpoint(source_id, user=None):
-    source = Source.objects.get(id=source_id)
-    event = HarvestingEvent.objects.create(source=source, status="in_progress")
-
-    try:
-        response = requests.get(source.url_field)
-        response.raise_for_status()
-
-        parse_oai_xml_and_save_publications(response.content, event)
-
-        event.status = "completed"
-        event.completed_at = timezone.now()
-        event.save()
-
-        new_count = Publication.objects.filter(job=event).count()
-        spatial_count = Publication.objects.filter(job=event).exclude(geometry__isnull=True).count()
-        temporal_count = Publication.objects.filter(job=event).exclude(timeperiod_startdate=[]).count()
-
-        subject = f"Harvesting Completed for {source.collection_name}"
-        completed_str = event.completed_at.strftime('%Y-%m-%d %H:%M:%S') if event.completed_at else 'N/A'
-        message = (
-            f"Harvesting job details:\n\n"
-            f"Number of added articles: {new_count}\n"
-            f"Number of articles with spatial metadata: {spatial_count}\n"
-            f"Number of articles with temporal metadata: {temporal_count}\n"
-            f"Collection used: {source.collection_name or 'N/A'}\n"
-            f"Journal: {source.url_field}\n"
-            f"Job started at: {event.started_at.strftime('%Y-%m-%d %H:%M:%S')}\n"
-            f"Job completed at: {completed_str}\n"
-        )
-
-        if user and user.email:
-            send_mail(
-                subject,
-                message,
-                settings.EMAIL_HOST_USER,
-                [user.email],
-                fail_silently=False,
-            )
-
-    except Exception as e:
-        logger.error("Harvesting failed for source %s: %s", source.url_field, str(e))
-        event.status = "failed"
-        event.completed_at = timezone.now()
-        event.save()
+def harvest_oai_endpoint(source_id: int, user=None) -> tuple[int, int, int] | None:
+    """
+    Fetch an OAI-PMH feed (HTTP or file://), create a HarvestingEvent,
+    parse and save publications, send a summary email, and mark completion.
+    """
+    try:
+        src = Source.objects.get(pk=source_id)
+    except Source.DoesNotExist:
+        logger.error("Source with id %s not found", source_id)
+        return None
+
+    if src.url_field.startswith("file://"):
+        path = src.url_field[7:]
+        try:
+            with open(path, "rb") as f:
+                content = f.read()
+        except Exception as e:
+            logger.error("Failed to read local file %s: %s", path, e)
+            return None
+    else:
+        try:
+            resp = requests.get(src.url_field, timeout=30)
+            resp.raise_for_status()
+            content = resp.content
+        except Exception as e:
+            logger.error("Harvesting failed for %s: %s", src.url_field, e)
+            return None
+
+    # Flag well-known preprint servers so the geocoding fallback can kick in.
+    low = (src.homepage_url or src.url_field or "").lower()
+    if any(x in low for x in ("arxiv.org", "biorxiv.org")) and not src.is_preprint:
+        src.is_preprint = True
+        src.save(update_fields=["is_preprint"])
+
+    event = HarvestingEvent.objects.create(
+        source=src,
+        user=user,
+        status="in_progress",
+    )
+    added, spatial, temporal = parse_oai_xml_and_save_publications(content, event)
+
+    if user:
+        subject = "Harvesting Completed"
+        body = (
+            f"Collection: {src.collection_name}\n"
+            f"Source URL: {src.url_field}\n"
+            f"Number of added articles: {added}\n"
+            f"Number of articles with spatial metadata: {spatial}\n"
+            f"Number of articles with temporal metadata: {temporal}\n"
+            f"Harvest started: {event.started_at:%Y-%m-%d}\n"
+        )
+        send_mail(subject, body, settings.EMAIL_HOST_USER, [user.email])
+
+    event.status = "completed"
+    event.completed_at = timezone.now()
+    event.save()
+
+    return added, spatial, temporal
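With the `file://` branch in place, the same entry point serves live endpoints and local dumps alike; a sketch of kicking off a harvest asynchronously via django-q (the source field values are illustrative, following the model as used above):

```python
from django_q.tasks import async_task
from publications.models import Source

src, _ = Source.objects.get_or_create(
    name="Volcanica",
    defaults={"url_field": "file:///tmp/volcanica_oai.xml"},  # or an https:// OAI-PMH URL
)
async_task("publications.tasks.harvest_oai_endpoint", src.id)
```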
+ """ try: - response = requests.get(source.url_field) - response.raise_for_status() - - parse_oai_xml_and_save_publications(response.content, event) - - event.status = "completed" - event.completed_at = timezone.now() - event.save() - - new_count = Publication.objects.filter(job=event).count() - spatial_count = Publication.objects.filter(job=event).exclude(geometry__isnull=True).count() - temporal_count = Publication.objects.filter(job=event).exclude(timeperiod_startdate=[]).count() - - subject = f"Harvesting Completed for {source.collection_name}" - completed_str = event.completed_at.strftime('%Y-%m-%d %H:%M:%S') if event.completed_at else 'N/A' - message = ( - f"Harvesting job details:\n\n" - f"Number of added articles: {new_count}\n" - f"Number of articles with spatial metadata: {spatial_count}\n" - f"Number of articles with temporal metadata: {temporal_count}\n" - f"Collection used: {source.collection_name or 'N/A'}\n" - f"Journal: {source.url_field}\n" - f"Job started at: {event.started_at.strftime('%Y-%m-%d %H:%M:%S')}\n" - f"Job completed at: {completed_str}\n" + src = Source.objects.get(pk=source_id) + except Source.DoesNotExist: + logger.error("Source with id %s not found", source_id) + return + if src.url_field.startswith("file://"): + path = src.url_field[7:] + try: + with open(path, "rb") as f: + content = f.read() + except Exception as e: + logger.error("Failed to read local file %s: %s", path, e) + return + else: + try: + resp = requests.get(src.url_field, timeout=30) + resp.raise_for_status() + content = resp.content + except Exception as e: + logger.error("Harvesting failed for %s: %s", src.url_field, e) + return + + low = (src.homepage_url or src.url_field or "").lower() + if any(x in low for x in ("arxiv.org", "biorxiv.org")) and not src.is_preprint: + src.is_preprint = True + src.save(update_fields=["is_preprint"]) + + event = HarvestingEvent.objects.create( + source=src, + user=user, + status="in_progress", + ) + added, spatial, temporal = parse_oai_xml_and_save_publications(content, event) + if user: + subject = "Harvesting Completed" + body = ( + f"Collection: {src.collection_name}\n" + f"Source URL: {src.url_field}\n" + f"Number of added articles: {added}\n" + f"Number of articles with spatial metadata: {spatial}\n" + f"Number of articles with temporal metadata: {temporal}\n" + f"Harvest started : {event.started_at:%Y-%m-%d}\n" ) - - if user and user.email: - send_mail( - subject, - message, - settings.EMAIL_HOST_USER, - [user.email], - fail_silently=False, - ) - - except Exception as e: - logger.error("Harvesting failed for source %s: %s", source.url_field, str(e)) - event.status = "failed" - event.completed_at = timezone.now() - event.save() + send_mail(subject, body, settings.EMAIL_HOST_USER, [user.email]) + + event.status = "completed" + event.completed_at = timezone.now() + event.save() + + return added, spatial, temporal + def send_monthly_email(trigger_source='manual', sent_by=None): recipients = User.objects.filter(userprofile__notify_new_manuscripts=True).values_list('email', flat=True) @@ -250,12 +266,10 @@ def send_monthly_email(trigger_source='manual', sent_by=None): [recipient], fail_silently=False, ) - EmailLog.log_email( recipient, subject, content, sent_by=sent_by, trigger_source=trigger_source, status="success" ) - time.sleep(settings.EMAIL_SEND_DELAY) - + time.sleep(settings.EMAIL_SEND_DELAY) except Exception as e: error_message = str(e) logger.error(f"Failed to send monthly email to {recipient}: {error_message}") @@ -263,40 +277,35 @@ def 
-
 def send_subscription_based_email(trigger_source='manual', sent_by=None, user_ids=None):
-    query = Subscription.objects.filter(subscribed=True, user__isnull=False)
+    query = Subscription.objects.filter(subscribed=True, user__isnull=False)
     if user_ids:
-        query = query.filter(user__id__in=user_ids)
+        query = query.filter(user__id__in=user_ids)
 
     for subscription in query:
-        user_email = subscription.user.email
+        user_email = subscription.user.email
         new_publications = Publication.objects.filter(
             geometry__intersects=subscription.region,
-            # publicationDate__gte=subscription.timeperiod_startdate,
-            # publicationDate__lte=subscription.timeperiod_enddate
         )
         if not new_publications.exists():
-            continue
+            continue
 
         unsubscribe_specific = f"{BASE_URL}{reverse('optimap:unsubscribe')}?search={quote(subscription.search_term)}"
         unsubscribe_all = f"{BASE_URL}{reverse('optimap:unsubscribe')}?all=true"
         subject = f"📚 New Manuscripts Matching '{subscription.search_term}'"
-
         bullet_list = "\n".join([f"- {pub.title}" for pub in new_publications])
-
         content = f"""Dear {subscription.user.username},
 
-        Here are the latest manuscripts matching your subscription:
+Here are the latest manuscripts matching your subscription:
 
-        {bullet_list}
+{bullet_list}
 
-        Manage your subscriptions:
-        Unsubscribe from '{subscription.search_term}': {unsubscribe_specific}
-        Unsubscribe from All: {unsubscribe_all}
-        """
+Manage your subscriptions:
+Unsubscribe from '{subscription.search_term}': {unsubscribe_specific}
+Unsubscribe from All: {unsubscribe_all}
+"""
 
         try:
             email = EmailMessage(subject, content, settings.EMAIL_HOST_USER, [user_email])
@@ -304,8 +313,7 @@ def send_subscription_based_email(trigger_source='manual', sent_by=None, user_id
             EmailLog.log_email(
                 user_email, subject, content, sent_by=sent_by,
                 trigger_source=trigger_source, status="success"
             )
-            time.sleep(settings.EMAIL_SEND_DELAY)
-
+            time.sleep(settings.EMAIL_SEND_DELAY)
         except Exception as e:
             error_message = str(e)
             logger.error(f"Failed to send subscription email to {user_email}: {error_message}")
@@ -313,11 +321,13 @@ def send_subscription_based_email(trigger_source='manual', sent_by=None, user_id
             user_email, subject, content, sent_by=sent_by,
             trigger_source=trigger_source, status="failed", error_message=error_message
         )
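The subscription match is purely spatial now that the commented-out date filters are gone; a minimal sketch of the `geometry__intersects` lookup it relies on (the bounding box is illustrative):

```python
# Hypothetical Django shell snippet.
from django.contrib.gis.geos import Polygon
from publications.models import Publication

region = Polygon.from_bbox((7.5, 51.9, 7.7, 52.0))  # illustrative subscriber region
matches = Publication.objects.filter(geometry__intersects=region)
print(matches.count())
```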
 
+
 def schedule_monthly_email_task(sent_by=None):
     if not Schedule.objects.filter(func='publications.tasks.send_monthly_email').exists():
         now = datetime.now()
-        last_day_of_month = calendar.monthrange(now.year, now.month)[1]  # Get last day of the month
-        next_run_date = now.replace(day=last_day_of_month, hour=23, minute=59)  # Run at the end of the last day
+        last_day_of_month = calendar.monthrange(now.year, now.month)[1]
+        next_run_date = now.replace(day=last_day_of_month, hour=23, minute=59)
         schedule(
             'publications.tasks.send_monthly_email',
             schedule_type='M',
@@ -327,11 +337,12 @@ def schedule_monthly_email_task(sent_by=None):
         )
         logger.info(f"Scheduled 'schedule_monthly_email_task' for {next_run_date}")
 
+
 def schedule_subscription_email_task(sent_by=None):
     if not Schedule.objects.filter(func='publications.tasks.send_subscription_based_email').exists():
         now = datetime.now()
-        last_day_of_month = calendar.monthrange(now.year, now.month)[1]  # Get last day of the month
-        next_run_date = now.replace(day=last_day_of_month, hour=23, minute=59)  # Run at the end of the last day
+        last_day_of_month = calendar.monthrange(now.year, now.month)[1]
+        next_run_date = now.replace(day=last_day_of_month, hour=23, minute=59)
         schedule(
             'publications.tasks.send_subscription_based_email',
             schedule_type='M',
@@ -340,7 +351,8 @@ def schedule_subscription_email_task(sent_by=None):
             kwargs={'trigger_source': 'scheduled', 'sent_by': sent_by.id if sent_by else None}
         )
         logger.info(f"Scheduled 'send_subscription_based_email' for {next_run_date}")
-
+
+
 def regenerate_geojson_cache():
     cache_dir = os.path.join(tempfile.gettempdir(), "optimap_cache")
     os.makedirs(cache_dir, exist_ok=True)
@@ -363,10 +375,11 @@ def regenerate_geojson_cache():
     size = os.path.getsize(json_path)
     logger.info("Cached GeoJSON at %s (%d bytes), gzipped at %s", json_path, size, gzip_path)
 
-   # remove old dumps beyond retention
+    # remove old dumps beyond retention
     cleanup_old_data_dumps(Path(cache_dir), settings.DATA_DUMP_RETENTION)
     return json_path
 
+
 def convert_geojson_to_geopackage(geojson_path):
     cache_dir = os.path.dirname(geojson_path)
     gpkg_filename = generate_data_dump_filename("gpkg")
@@ -378,13 +391,9 @@ def convert_geojson_to_geopackage(geojson_path):
             text=True,
         )
         logger.info("ogr2ogr output:\n%s", output)
-        # remove old dumps beyond retention
         return gpkg_path
     except subprocess.CalledProcessError as e:
+        logger.error("ogr2ogr conversion failed: %s", e.output)
         return None
-    # on success, return the filename so callers can stream it or inspect it
-    # remove old dumps beyond retention
-    return gpkg_path
 
 
 def regenerate_geopackage_cache():
diff --git a/publications/templates/user_settings.html b/publications/templates/user_settings.html
index f326676f..12656a42 100644
--- a/publications/templates/user_settings.html
+++ b/publications/templates/user_settings.html
@@ -240,7 +240,7 @@