Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
81 commits
Select commit Hold shift + click to select a range
d3b4fe1
fix(test): prevent tests from accessing user's operational indexes
titusz Nov 20, 2025
000c6f9
ci: add Docker build and publish workflow
titusz Nov 20, 2025
2d89cf3
chore(deps): move iscc-sct to main dependencies and add psycopg dev d…
titusz Nov 20, 2025
1aeb981
ci(docker): run tests before building and publishing image
titusz Nov 20, 2025
e6c3e5f
feat(logging): add timestamps and loguru support to production logs
titusz Nov 20, 2025
ba37ca2
fix(ci): pre-download iscc-sct model to prevent race conditions
titusz Nov 20, 2025
dee145b
feat(simprint): implement soft-boundary usearch index with exponentia…
titusz Nov 15, 2025
d5e5653
feat(simprint): expose HNSW tuning parameters for index optimization
titusz Nov 15, 2025
55ef9b6
fix(simprint): prevent division by zero in confidence weighting
titusz Nov 16, 2025
975d550
build: add lancedb dependency for disk-based vector storage
titusz Nov 16, 2025
94b7f2b
feat(simprint): add LanceDB-based disk storage implementation
titusz Nov 16, 2025
2db9702
feat(simprint): add multi-backend index coordinator
titusz Nov 16, 2025
7a05716
refactor(simprint): use standard threshold parameter for match filtering
titusz Nov 16, 2025
de73865
feat(simprint): implement configurable coverage weighting for soft-bo…
titusz Nov 20, 2025
1386c0e
build: replace custom usearch wheels with iscc-usearch package
titusz Feb 16, 2026
f578ccb
fix(test): override settings directly in remote test server fixture
titusz Feb 16, 2026
59cd64f
refactor: replace local NphdIndex/metrics/timer with iscc-usearch re-…
titusz Feb 16, 2026
53b287b
fix(simprint): use list_tables().tables for LanceDB table discovery
titusz Feb 16, 2026
014a5f0
test: replace skipped segfault test with count=0 ValueError assertion
titusz Feb 16, 2026
da91d08
chore: remove unimplemented ShardedUsearchIndex stub
titusz Feb 18, 2026
10181e5
refactor(schema): use StrEnum and fix mutable default argument
titusz Feb 18, 2026
28c30d3
build: bump iscc-usearch to >=0.3.0
titusz Feb 18, 2026
82adda2
build: update ruff-pre-commit from v0.14.0 to v0.15.1
titusz Feb 18, 2026
248f65e
refactor: remove obsolete numba code, upgrade to iscc-usearch v0.5.0
titusz Feb 20, 2026
962ab6b
refactor: replace NphdIndex with ShardedNphdIndex in UsearchIndex
titusz Feb 20, 2026
e2b585d
feat: add LMDB simprint storage with exact search and idempotent updates
titusz Feb 20, 2026
61a7aec
feat: add ShardedIndex128 approximate simprint search with IDF scoring
titusz Feb 20, 2026
77a1295
feat: Session 6 backend consolidation - remove legacy simprint backends
titusz Feb 20, 2026
7d0939e
refactor: rename settings to options to match iscc-sct conventions
titusz Feb 22, 2026
f12cd61
docs: update stale settings.py references in config.py docstring
titusz Feb 22, 2026
84e76eb
docs: add command execution guidelines to CLAUDE.md
titusz Feb 22, 2026
602ee82
docs: track .env.example for configuration reference
titusz Feb 22, 2026
aabbc6b
refactor: externalize hardcoded index parameters into SearchOptions
titusz Feb 22, 2026
6b14845
upgrade iscc-usearch to 0.6.0, simplify NPHD vector handling
titusz Feb 22, 2026
f235af8
feat: add configurable auto-flush and skip clean sub-indexes on flush…
titusz Feb 22, 2026
7439f3f
chore: update uv.lock with latest dependency versions
titusz Mar 1, 2026
2ea8056
docs: add sandbox deployment reference and flush interval guidance
titusz Mar 1, 2026
0749f32
fix: use plain string for cors_origins to avoid pydantic-settings JSO…
titusz Mar 1, 2026
3805047
fix: remove nested progress bar from RemoteIndex.add_assets
titusz Mar 1, 2026
79f3c49
fix: show indexing status in progress bar during batch upload
titusz Mar 1, 2026
f3e314d
docs: update sandbox compose with current flush_interval and shard sizes
titusz Mar 1, 2026
0262c5b
fix: disable auto-rebuild on startup to prevent OOM restart loop
titusz Mar 2, 2026
bbbf494
fix: switch LMDB writemap=False to reduce memory footprint
titusz Mar 2, 2026
9390802
fix: update test assertion for writemap=False small file sizes
titusz Mar 2, 2026
d20d368
perf: batch LMDB simprint writes and skip contains-check on HNSW add
titusz Mar 2, 2026
bffa526
docs: sync sandbox compose example with live config
titusz Mar 2, 2026
63359d4
fix: clear batch on flush failure and tune LMDB for low memory
titusz Mar 2, 2026
5ac4ccb
fix: serialize all LMDB write transactions with RLock
titusz Mar 2, 2026
c4a29bd
docs: reformat deployment.md line wrapping
titusz Mar 2, 2026
1a66832
fix: use stable callback reference for atexit register/unregister
titusz Mar 5, 2026
56c5a23
fix: preserve NPHD indexes on LMDB MapFullError resize
titusz Mar 5, 2026
dab3389
fix: harden close/shutdown for exception safety
titusz Mar 5, 2026
d712da4
fix: single-simprint text search crash and playground error display
titusz Mar 10, 2026
20c52cc
chore: ignore local cauldron/ workspace
titusz Apr 15, 2026
860df99
fix: repopulate simprint db caches after LMDB resize
titusz Apr 15, 2026
4adc6f4
chore: remove legacy modules and unused dependencies
titusz Apr 15, 2026
c55e883
chore: remove postgres backend placeholder
titusz Apr 15, 2026
8d23d2c
refactor(schema): use default value instead of default_factory for ch…
titusz Apr 15, 2026
1ea2c68
docs: refresh CLAUDE.md and README.md for current architecture
titusz Apr 15, 2026
f6abeef
fix: guard serve CLI against multi-worker with usearch backend
titusz Apr 15, 2026
c8d99e8
fix: never auto-rebuild simprint indexes during search
titusz Apr 15, 2026
e75ea40
feat: add /healthz and /readyz probe endpoints
titusz Apr 15, 2026
633c02b
feat: add env-gated Sentry error tracking
titusz Apr 15, 2026
906987f
docs: add sizing profiles and production defaults for deployment
titusz Apr 15, 2026
556f310
chore: update uv.lock (exceptiongroup marker drift)
titusz Apr 16, 2026
586dc2c
ci: opt into Node.js 24 for GitHub Actions
titusz Apr 16, 2026
887f61f
docs: remove legacy documentation for rewrite
titusz Apr 16, 2026
2326d80
docs: add zensical documentation site with full Divio framework
titusz Apr 16, 2026
009a3ac
docs: clarify iscc-search vs iscc-usearch relationship
titusz Apr 16, 2026
bb8d252
chore: add mdformat pre-commit hook, remove legacy deps
titusz Apr 16, 2026
1f90ef2
chore: add ruff and uv cache dirs to .gitignore
titusz Apr 16, 2026
220fe58
chore: drop Python 3.10, add 3.14, remove unused dependencies
titusz Apr 16, 2026
78c8776
ci: upgrade all GitHub Actions to Node.js 24 native versions
titusz Apr 16, 2026
1dae4b7
fix(ci): pin astral-sh/setup-uv to v8.0.0 (no major tag published)
titusz Apr 16, 2026
95f5578
chore: switch scratch workspace to cauldron, document defaults
titusz Apr 16, 2026
caf9ad6
ci: rebuild pipeline with dynamic versioning and wheel-based Docker
titusz Apr 16, 2026
01ae605
feat(cli): add hub + datasets commands for HuggingFace ingestion
titusz Apr 16, 2026
125b989
ci: cache and pre-download iscc-sct model to avoid race in parallel t…
titusz Apr 16, 2026
1dca6a4
test: widen timer tolerance to 1s to stop flaking on macOS CI
titusz Apr 16, 2026
8c01ca6
chore: quiet CLI logs and HuggingFace progress bars
titusz Apr 16, 2026
06ea467
docs: add CHANGELOG.md for 0.1.0 release
titusz Apr 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 9 additions & 55 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,57 +1,11 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
*.egg-info/
dist/
build/
*.egg
# Allowlist-style .dockerignore: ignore everything, then explicitly allow the build inputs.
# The Docker image installs from a pre-built wheel, so no source files are required in the context.
# This prevents dev artifacts (cauldron/, .claude/, scratch/, tests/, docs/, .git/, secrets) from
# ever entering the image layers or being cached by BuildKit.

# Virtual environments
.venv/
venv/
ENV/
env/
# Ignore everything
**

# IDE
.vscode/
.idea/
*.swp
*.swo
*~

# Testing
.pytest_cache/
.coverage
htmlcov/
.tox/
*.cover

# Git
.git/
.gitignore
.gitattributes

# Documentation
docs/
*.md
!README.md

# Development tools
.ruff_cache/
.mypy_cache/
.pre-commit-config.yaml

# Local development
scratch/
.env
.env.*

# CI/CD
.github/

# Misc
*.log
.DS_Store
# Allow the Dockerfile and the pre-built wheel produced by `uv build`
!Dockerfile
!dist/*.whl
85 changes: 85 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# ISCC-Search Configuration
# Copy to .env and customize as needed. Values shown are defaults unless the comment above a setting says otherwise.

# --- Server ---

# Index backend URI (memory://, lmdb:///path, usearch:///path)
# Default: usearch:/// + platform-specific user data dir (e.g. ~/.local/share/iscc-search)
# ISCC_SEARCH_INDEX_URI=usearch:///path/to/data

# API secret for authentication (unset = public API)
# ISCC_SEARCH_API_SECRET=

# CORS allowed origins (comma-separated, or * for all)
# ISCC_SEARCH_CORS_ORIGINS=*

# Server host and port
# ISCC_SEARCH_HOST=0.0.0.0
# ISCC_SEARCH_PORT=8000

# Number of worker processes (production only, unset = single worker)
# NOTE: workers > 1 is REJECTED with the usearch:// backend — concurrent writers
# corrupt .usearch files. Leave unset (single worker) for usearch-backed deployments.
# ISCC_SEARCH_WORKERS=

# Auto-flush sub-indexes after N dirty key mutations (0 = disabled).
# Only safe with a single writer process. Production recommendation: 100000.
# Reduces blast radius of hard crashes (OOM / SIGKILL / power loss).
# ISCC_SEARCH_FLUSH_INTERVAL=0

# Log level (debug, info, warning, error, critical)
# ISCC_SEARCH_LOG_LEVEL=info

# --- Shard Sizes (MB) ---

# Maximum shard file size for ISCC-UNIT indexes
# ISCC_SEARCH_SHARD_SIZE_UNITS=1024

# Maximum shard file size for simprint indexes
# ISCC_SEARCH_SHARD_SIZE_SIMPRINTS=1024

# --- HNSW Parameters (Units) ---

# Build-time search depth (efConstruction) for unit indexes
# ISCC_SEARCH_HNSW_EXPANSION_ADD_UNITS=128

# Query-time search depth (ef) for unit indexes
# ISCC_SEARCH_HNSW_EXPANSION_SEARCH_UNITS=64

# Graph connectivity (M) for unit indexes
# ISCC_SEARCH_HNSW_CONNECTIVITY_UNITS=16

# --- HNSW Parameters (Simprints) ---

# Build-time search depth (efConstruction) for simprint indexes
# ISCC_SEARCH_HNSW_EXPANSION_ADD_SIMPRINTS=16

# Query-time search depth (ef) for simprint indexes
# ISCC_SEARCH_HNSW_EXPANSION_SEARCH_SIMPRINTS=512

# Graph connectivity (M) for simprint indexes
# ISCC_SEARCH_HNSW_CONNECTIVITY_SIMPRINTS=8

# --- Match Thresholds ---

# Minimum score for unit similarity matches (0.0-1.0)
# ISCC_SEARCH_MATCH_THRESHOLD_UNITS=0.75

# Minimum score for simprint matches (0.0-1.0)
# ISCC_SEARCH_MATCH_THRESHOLD_SIMPRINTS=0.75

# --- Scoring ---

# Exponent for confidence-weighted score aggregation
# ISCC_SEARCH_CONFIDENCE_EXPONENT=4

# Oversampling multiplier for simprint search diversity
# ISCC_SEARCH_OVERSAMPLING_FACTOR=20

# --- Error Tracking (Sentry) ---

# Sentry DSN for error tracking (unset = disabled)
# ISCC_SEARCH_SENTRY_DSN=

# Sentry performance sampling rate (0.0-1.0)
# ISCC_SEARCH_SENTRY_TRACES_SAMPLE_RATE=0.05
4 changes: 2 additions & 2 deletions .github/actions/setup-python-env/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ inputs:
runs:
using: "composite"
steps:
- uses: actions/setup-python@v5
- uses: actions/setup-python@v6
with:
python-version: ${{ inputs.python-version }}

- name: Install uv
uses: astral-sh/setup-uv@v2
uses: astral-sh/setup-uv@v8.0.0
with:
version: ${{ inputs.uv-version }}
enable-cache: 'true'
Expand Down
223 changes: 223 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
# Workflow display name.
name: CI

# Run for every pull request, and for pushes to develop and main.
on:
  pull_request:
  push:
    branches:
      - develop
      - main

# One concurrency group per workflow + ref. A newer run only cancels an
# in-flight run when the event is a pull_request; push runs are left to
# finish so a publish in progress is not cancelled.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

# Workflow-level token permissions: read-only access to repository contents.
permissions:
  contents: read

jobs:
# Test job on ubuntu-latest, run once per Python version in the matrix.
test-full:
  runs-on: ubuntu-latest
  name: Test (ubuntu-latest, py${{ matrix.python-version }})
  strategy:
    # Let the remaining matrix entries finish when one of them fails.
    fail-fast: false
    matrix:
      python-version:
        - "3.11"
        - "3.12"
        - "3.13"
        - "3.14"
  steps:
    - uses: actions/checkout@v6
      with:
        # Fetch full history: hatch-vcs needs it to derive the version.
        fetch-depth: 0

    - uses: ./.github/actions/setup-python-env
      with:
        python-version: ${{ matrix.python-version }}

    # Keep the iscc-sct ONNX model cached between runs. Without it, parallel
    # pytest-xdist workers race on the download and can leave a corrupt
    # ONNX file behind.
    - name: Cache iscc-sct model
      uses: actions/cache@v5
      with:
        key: iscc-sct-model-${{ runner.os }}-${{ hashFiles('uv.lock') }}
        restore-keys: |
          iscc-sct-model-${{ runner.os }}-
        path: |
          ~/.local/share/iscc-sct
          ~/Library/Application Support/iscc-sct
          ~\AppData\Local\iscc\iscc-sct

    # Fetch the model once, serially, before the parallel test run starts.
    - name: Pre-download iscc-sct model (serial, before parallel tests)
      run: uv run python -c "import iscc_sct.code_semantic_text as sct; sct.model()"

    - name: OpenAPI build + validation
      run: uv run poe build

    - name: Run tests
      run: uv run poe test

    # Coverage is uploaded from the Python 3.11 matrix entry only.
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v6
      if: matrix.python-version == '3.11'
      env:
        CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

# Test job on Windows and macOS runners, pinned to Python 3.12.
test-smoke:
  runs-on: ${{ matrix.os }}
  name: Test (${{ matrix.os }}, py3.12)
  strategy:
    # Let the other OS finish when one of them fails.
    fail-fast: false
    matrix:
      os:
        - windows-latest
        - macos-latest
  # Every `run` step in this job uses bash, on both operating systems.
  defaults:
    run:
      shell: bash
  steps:
    - uses: actions/checkout@v6
      with:
        fetch-depth: 0

    - uses: ./.github/actions/setup-python-env
      with:
        python-version: "3.12"

    # Same model cache as the Linux test job; the key includes runner.os.
    - name: Cache iscc-sct model
      uses: actions/cache@v5
      with:
        key: iscc-sct-model-${{ runner.os }}-${{ hashFiles('uv.lock') }}
        restore-keys: |
          iscc-sct-model-${{ runner.os }}-
        path: |
          ~/.local/share/iscc-sct
          ~/Library/Application Support/iscc-sct
          ~\AppData\Local\iscc\iscc-sct

    - name: Pre-download iscc-sct model
      run: uv run python -c "import iscc_sct.code_semantic_text as sct; sct.model()"

    - name: Run tests
      run: uv run poe test

# Builds the distributions with `uv build` and uploads dist/ as the "dist"
# artifact for the jobs that declare `needs: wheel-build`.
wheel-build:
  runs-on: ubuntu-latest
  name: Build wheel
  steps:
    - uses: actions/checkout@v6
      with:
        fetch-depth: 0

    - uses: ./.github/actions/setup-python-env

    - name: Build wheel + sdist
      run: uv build

    # Artifact is kept for 7 days.
    - name: Upload dist artifact
      uses: actions/upload-artifact@v6
      with:
        retention-days: 7
        name: dist
        path: dist/

# Installs the built wheel into a fresh virtualenv and checks that the
# package imports and the CLI entry point starts. This job has no checkout
# step; it works from the downloaded "dist" artifact only.
wheel-smoke:
  runs-on: ubuntu-latest
  needs: wheel-build
  name: Install wheel and smoke-test
  steps:
    - uses: actions/setup-python@v6
      with:
        python-version: "3.12"

    - uses: actions/download-artifact@v7
      with:
        path: dist
        name: dist

    - name: Install wheel into clean venv
      run: |
        python -m venv .smoke-venv
        .smoke-venv/bin/pip install --upgrade pip
        .smoke-venv/bin/pip install dist/*.whl

    # Print the package version and confirm `iscc-search --help` runs.
    - name: Verify import and CLI
      run: |
        .smoke-venv/bin/python -c "import iscc_search; print('version:', iscc_search.__version__)"
        .smoke-venv/bin/iscc-search --help

# Builds the Docker image from the pre-built wheel artifact, starts a
# container, and probes the /healthz and /readyz endpoints.
docker-smoke:
  name: Build + smoke-test Docker image
  needs: wheel-build
  runs-on: ubuntu-latest
  steps:
    - uses: actions/checkout@v6

    - uses: actions/download-artifact@v7
      with:
        name: dist
        path: dist

    - uses: docker/setup-buildx-action@v4

    # `load: true` puts the built image into the local Docker daemon so the
    # next step can `docker run` it.
    - name: Build image (load into local daemon)
      uses: docker/build-push-action@v7
      with:
        context: .
        file: ./Dockerfile
        load: true
        tags: iscc-search:smoke
        cache-from: type=gha
        cache-to: type=gha,mode=max

    # The polling loop only waits; the two curl calls after it are the actual
    # assertions and fail the step if either endpoint does not answer.
    - name: Run container and probe health endpoints
      run: |
        docker run -d --name iscc-search-smoke -p 8000:8000 iscc-search:smoke
        # Wait up to 60s for /readyz to return 200
        for i in $(seq 1 30); do
          if curl -sf http://localhost:8000/readyz > /dev/null; then
            echo "Ready after ${i} attempts"
            break
          fi
          sleep 2
        done
        curl -sf http://localhost:8000/healthz
        curl -sf http://localhost:8000/readyz

    # Kept as a separate `always()` step: the default bash shell stops at the
    # first failing command, so logs placed after the probes in the same
    # script would never be printed when a probe fails. `|| true` tolerates
    # the container not existing (e.g. the image build failed).
    - name: Dump container logs and stop container
      if: always()
      run: |
        echo "--- container logs ---"
        docker logs iscc-search-smoke || true
        docker stop iscc-search-smoke || true

# Builds the Docker image and pushes it to ghcr.io. Runs only for push
# events on refs/heads/develop, and only after every job listed under
# `needs` has completed.
publish-develop:
  runs-on: ubuntu-latest
  name: Publish Docker image (develop)
  if: github.event_name == 'push' && github.ref == 'refs/heads/develop'
  needs:
    - test-full
    - test-smoke
    - wheel-smoke
    - docker-smoke
  # Job-level permissions: adds package write access for the registry push.
  permissions:
    contents: read
    packages: write
  steps:
    - uses: actions/checkout@v6

    - uses: actions/download-artifact@v7
      with:
        path: dist
        name: dist

    - uses: docker/setup-buildx-action@v4

    # Log in to ghcr.io as the triggering actor with the workflow token.
    - uses: docker/login-action@v4
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ secrets.GITHUB_TOKEN }}

    # Produces two tags: a fixed "develop" tag and a commit-SHA tag with the
    # "develop-" prefix.
    - uses: docker/metadata-action@v6
      id: meta
      with:
        images: ghcr.io/${{ github.repository }}
        tags: |
          type=raw,value=develop
          type=sha,prefix=develop-

    - name: Build and push
      uses: docker/build-push-action@v7
      with:
        push: true
        context: .
        file: ./Dockerfile
        labels: ${{ steps.meta.outputs.labels }}
        tags: ${{ steps.meta.outputs.tags }}
        cache-from: type=gha
Loading