Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
2c1d6e5
init consolidator
jgarciab Apr 11, 2023
75ffb6d
temp dev
jgarciab Apr 21, 2023
3c39a76
update crawler
jgarciab Mar 14, 2024
c96522f
Merge branch 'consolidator' of github.com:sodascience/websweep into c…
jgarciab Mar 14, 2024
7da71a6
convert to google-re2
jgarciab Mar 19, 2024
4b2281f
clean up
jgarciab Mar 19, 2024
5005768
doctring
jgarciab Mar 19, 2024
72b8171
remove duplicates
jgarciab Mar 19, 2024
d299371
add google-re2
jgarciab Mar 19, 2024
c21583b
clean up unused libraries
jgarciab Mar 19, 2024
28e6ec6
remove unused dependencies
jgarciab Mar 19, 2024
9d84e97
add temporary folder to main
jgarciab May 7, 2024
ed8a89c
Merge branch 'consolidator' into increase_robustness
jgarciab Feb 22, 2026
63d67ae
feat: harden crawling and extraction pipeline
jgarciab Feb 23, 2026
377189f
docs: overhaul README, Sphinx docs, and featured notebook
jgarciab Feb 23, 2026
386af70
ci: add uv-based test, lint, docs, and publish workflows
jgarciab Feb 23, 2026
a7bdc3c
docs: refresh generated HTML and doctree artifacts
jgarciab Feb 23, 2026
3164b92
docs: update generated Sphinx environment cache
jgarciab Feb 23, 2026
a7faabb
chore: finalize robustness updates and docs sync
jgarciab Feb 23, 2026
f3ef3f2
ci(docs): install pandoc for nbsphinx notebook build
jgarciab Feb 23, 2026
ef575c4
docs+cli: clarify library vs CLI workflows and simplify usage guidance
jgarciab Feb 23, 2026
21a25ae
feat: streamline consolidator defaults and harden config restore/init
jgarciab Feb 23, 2026
e8b4d9b
Configure extractor add-on at init and copy into instance
jgarciab Feb 24, 2026
d2d8232
Stabilize storage-path pipeline and complement retries
jgarciab Feb 24, 2026
aae4ed9
Simplify temp/target storage model and align docs
jgarciab Feb 24, 2026
575aa8a
Stop tracking Sphinx build artifacts
jgarciab Mar 7, 2026
bca9e33
Improve docs clarity and crawler defaults
jgarciab Mar 9, 2026
a168818
Clarify README research workflow
jgarciab Mar 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Continuous integration: lint, test, build and metadata-check the package
# on every supported Python version.
name: CI

on:
  push:
  pull_request:
  workflow_dispatch:

# Least privilege: every step below only needs to read the repository, so do
# not inherit the (possibly write-enabled) repository default token scope.
permissions:
  contents: read

jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      # Let every Python version finish so a failure on one interpreter does
      # not cancel (and hide the result of) the others.
      fail-fast: false
      matrix:
        # Versions are quoted on purpose: an unquoted 3.10 is the float 3.1.
        python-version: ["3.10", "3.11", "3.12", "3.13"]

    steps:
      - name: Check out repository
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}

      - name: Set up uv
        uses: astral-sh/setup-uv@v3
        with:
          enable-cache: true

      # Installs the project plus the "test" and "dev" dependency groups.
      - name: Install dependencies
        run: uv sync --group test --group dev

      - name: Lint with ruff
        run: uv run ruff check src tests

      - name: Run tests with pytest
        run: uv run pytest -q

      # Build into a scratch directory outside the checkout, then validate
      # the metadata of whatever was built there.
      - name: Build package
        run: uv build --out-dir /tmp/dist

      - name: Validate package metadata
        run: uv run twine check /tmp/dist/*
85 changes: 85 additions & 0 deletions .github/workflows/docs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Builds the Sphinx documentation and checks the featured notebook whenever
# documentation inputs change.
name: Docs

on:
  # push and pull_request share the same path filter: the workflow only runs
  # when a file that feeds the docs build is touched.
  push:
    paths:
      - "src/**"
      - "docs/source/**"
      - "examples/**"
      - "README.md"
      - "Makefile"
      - "make.bat"
      - "pyproject.toml"
      - ".readthedocs.yml"
      - ".github/workflows/docs.yml"
  pull_request:
    paths:
      - "src/**"
      - "docs/source/**"
      - "examples/**"
      - "README.md"
      - "Makefile"
      - "make.bat"
      - "pyproject.toml"
      - ".readthedocs.yml"
      - ".github/workflows/docs.yml"
  workflow_dispatch:

# Least privilege: the job only reads the repository; artifact uploads do not
# need additional GITHUB_TOKEN scopes.
permissions:
  contents: read

jobs:
  build-docs:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Set up uv
        uses: astral-sh/setup-uv@v3
        with:
          enable-cache: true

      - name: Install dependencies
        run: uv sync --group docs

      # pandoc is a system package, so it is installed with apt rather than uv.
      - name: Install pandoc
        run: sudo apt-get update && sudo apt-get install -y pandoc

      # Guards the featured notebook: it must exist, mention each required
      # real-world URL, and contain no 127.0.0.1 reference.
      - name: Validate featured notebook example
        run: |
          python - <<'PY'
          from pathlib import Path
          p = Path("examples/example_scraper_extractor.ipynb")
          if not p.exists():
              raise SystemExit("Missing featured notebook: examples/example_scraper_extractor.ipynb")
          text = p.read_text(encoding="utf-8")
          required_urls = [
              "https://www.dggrootverbruik.nl/",
              "https://www.gosliga.nl/",
              "https://www.heeren2.nl/",
          ]
          for url in required_urls:
              if url not in text:
                  raise SystemExit(f"Featured notebook does not contain required URL: {url}")
          if "127.0.0.1" in text:
              raise SystemExit("Featured notebook still references localhost; expected real websites.")
          print("Notebook validation passed.")
          PY

      - name: Build docs
        run: uv run make docs

      - name: Upload docs artifact
        uses: actions/upload-artifact@v4
        with:
          name: docs-html
          path: docs/build/html
          # Fail instead of warning if the build produced nothing at this
          # path; otherwise a broken or relocated build output goes unnoticed.
          if-no-files-found: error

      - name: Upload featured notebook artifact
        uses: actions/upload-artifact@v4
        with:
          name: notebook-example
          path: examples/example_scraper_extractor.ipynb
54 changes: 21 additions & 33 deletions .github/workflows/pylint.yml
Original file line number Diff line number Diff line change
@@ -1,45 +1,33 @@
name: Verify build
name: Lint

on: [push]
on:
push:
pull_request:
workflow_dispatch:

jobs:
build:
lint:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9"]
python-version: ["3.11"]

steps:
- name: Checkout code
uses: actions/checkout@v3
- name: Checkout code
uses: actions/checkout@v4

- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}

- name: Install pip dependencies
run: |
python -m pip install --upgrade pip
pip install flake8
- name: Set up uv
uses: astral-sh/setup-uv@v3
with:
enable-cache: true

- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Install dependencies
run: uv sync --group dev

- name: Install Poetry
uses: snok/install-poetry@v1
with:
version: latest # Upgrade to the latest version of Poetry

- name: Install dependencies
run: poetry install --no-interaction --no-root

- name: Install project
run: poetry install --no-interaction

- name: Run tests
run: poetry run pytest
- name: Run ruff
run: uv run ruff check src tests
40 changes: 40 additions & 0 deletions .github/workflows/python-publish.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Builds the distribution and publishes it when a GitHub release is published
# (or when the workflow is dispatched manually).
name: Upload Python Package

on:
  release:
    types: [published]
  workflow_dispatch:

# Workflow-wide default is read-only; the OIDC token permission is granted on
# the deploy job only, so any job added later does not inherit it.
permissions:
  contents: read

jobs:
  deploy:
    runs-on: ubuntu-latest
    environment:
      name: pypi
    # Job-level permissions replace the workflow default, so contents: read
    # must be restated for checkout.
    # NOTE(review): id-token: write is presumably for PyPI trusted publishing
    # (no API token is passed to the publish step) - confirm.
    permissions:
      contents: read
      id-token: write
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      - name: Set up uv
        uses: astral-sh/setup-uv@v3
        with:
          # Do not restore a shared cache in the release job: a cache written
          # by another workflow run could influence the published artifacts.
          enable-cache: false

      - name: Install build tools
        run: uv sync --group dev

      # uv build writes to dist/ by default, which is also what the following
      # twine check inspects.
      - name: Build package
        run: uv build

      - name: Validate package metadata
        run: uv run twine check dist/*

      # Pinned to a full commit SHA rather than a mutable tag.
      - name: Publish package
        uses: pypa/gh-action-pypi-publish@db8f07d3871a0a180efa06b95d467625c19d5d5f
78 changes: 65 additions & 13 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,16 +1,68 @@
.scrapy/*
*.db
*.log
*.pyc
*.key
data/*
data*
dist/*
.vscode/*
# OS / editor
.DS_Store
**/.DS_Store
sidn_test_kopie.csv
init files/*
examples/data/*
Thumbs.db
.vscode/
.idea/

# Python caches
__pycache__/
*.py[cod]
*.pyo
*.pyd
.pytest_cache/
.mypy_cache/
.ruff_cache/
.pyre/
.hypothesis/
.coverage
.coverage.*
htmlcov/

# Virtual environments / local libs
.venv/
venv/
env/
ENV/
lib/

# Build / packaging
build/
dist/
*.egg-info/
.eggs/
pip-wheel-metadata/

# Docs build outputs
docs/build/
site/

# Logs / local outputs
*.log
logs/
tmp/
*.db
*.sqlite
*.sqlite3
*.ndjson
scraper_data_actually/*

# Project-specific generated data
data/
examples/data/
examples/.ipynb_checkpoints/
crawled_data/
extracted_data/
consolidated_data/
overview_urls.db
overview_urls.duckdb
overview_urls.tsv
scraper_data_actually/
examples/test*

# Legacy local files
sidn_test_kopie.csv
init files/
.scrapy/

# Internal tooling
/scripts/
16 changes: 16 additions & 0 deletions .readthedocs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Read the Docs build configuration.
version: 2

# Build image and interpreter used for the hosted docs build.
build:
  os: ubuntu-22.04
  tools:
    # Quoted so the version stays a string.
    python: '3.11'

# Sphinx entry point.
sphinx:
  configuration: docs/source/conf.py

# Install the project itself (path ".") with pip, including its "docs" extra.
# NOTE(review): the GitHub workflows install docs dependencies with
# `uv sync --group docs` (a dependency group); confirm pyproject.toml also
# exposes them as a `docs` extra, otherwise this install will not pull them in.
python:
  install:
    - method: pip
      path: .
      extra_requirements:
        - docs
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 ODISSEI Social Data Science Team

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
12 changes: 12 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Top-level project files shipped in the source distribution.
include LICENSE README.md pyproject.toml

# Non-Python data files bundled with the utils package.
recursive-include src/websweep/utils *.json *.dat

# Local build output, environments and tool caches never belong in the sdist.
prune .venv
prune .pytest_cache
prune docs/build
prune examples/.ipynb_checkpoints

# Drop bytecode and OS residue wherever they appear in the tree.
global-exclude __pycache__ *.py[cod] .DS_Store
13 changes: 12 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,26 @@
# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
PYTHON ?= python3
SPHINXBUILD ?= sphinx-build
SOURCEDIR = docs/source
BUILDDIR = docs/build
APIPKGDIR = src/websweep

# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile
apidoc:
@$(PYTHON) docs/scripts/sync_featured_notebook.py
@$(PYTHON) -m sphinx.ext.apidoc -o "$(SOURCEDIR)" "$(APIPKGDIR)" --force

docs: apidoc html

docs-clean:
@rm -rf "$(BUILDDIR)"

.PHONY: help apidoc docs docs-clean Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
Expand Down
Loading
Loading