Merged
9 changes: 8 additions & 1 deletion .env.example
@@ -30,9 +30,16 @@
# COCOSEARCH_EMBEDDING_PROVIDER=ollama

# API key for remote embedding providers (OpenAI, OpenRouter)
# Required when COCOSEARCH_EMBEDDING_PROVIDER is not "ollama"
# Required when COCOSEARCH_EMBEDDING_PROVIDER is not "ollama" (unless baseUrl is set)
# COCOSEARCH_EMBEDDING_API_KEY=sk-...

# Base URL for any embedding provider. Use this to point any provider at a local
# OpenAI-API-compatible server (Infinity, text-embeddings-inference, vLLM, etc.)
# instead of the default endpoint. When set for a remote provider, the API key
# requirement is relaxed (local servers typically don't need one).
# For the "ollama" provider, this overrides COCOSEARCH_OLLAMA_URL.
# COCOSEARCH_EMBEDDING_BASE_URL=http://localhost:8080

# =============================================================================
# Optional (default: auto-detected from cocosearch.yaml, git root, or cwd)
# =============================================================================
4 changes: 2 additions & 2 deletions CLAUDE.md
@@ -111,7 +111,7 @@ uv run cocosearch mcp --project-from-cwd
- **`indexer/flow.py`** — CocoIndex flow definition (the indexing pipeline)
- **`search/`** — Hybrid search engine: RRF fusion of vector + keyword results, two-level LRU query cache (`cache.py` — exact + semantic similarity at cosine > 0.92), context expansion via Tree-sitter boundaries for 10 languages (`context_expander.py`, exports `CONTEXT_EXPANSION_LANGUAGES`), symbol/language filtering (`filters.py`), auto-detection of code identifiers for hybrid mode (`query_analyzer.py`), optional dependency enrichment (`include_deps` attaches direct dependencies/dependents to search results), interactive REPL (`repl.py`), result formatting (`formatter.py`), pipeline analysis with stage-by-stage diagnostics (`analyze.py`)
- **`search/db.py`** — PostgreSQL connection pool (singleton) and query execution
- **`config/`** — YAML config with 4-level precedence resolution (CLI > env > file > defaults), `${VAR}` substitution (`env_substitution.py`), Pydantic schema validation (`schema.py` with `extra="forbid"`, `strict=True`, `EmbeddingSection` with `provider` field and provider-aware model defaults, `LoggingSection` with `file` toggle), user-friendly error formatting with fuzzy field suggestions (`errors.py`), env var validation (`env_validation.py`)
- **`config/`** — YAML config with 4-level precedence resolution (CLI > env > file > defaults), `${VAR}` substitution (`env_substitution.py`), Pydantic schema validation (`schema.py` with `extra="forbid"`, `strict=True`, `EmbeddingSection` with `provider` field, provider-aware model defaults, and optional `baseUrl` for custom endpoints, `LoggingSection` with `file` toggle), user-friendly error formatting with fuzzy field suggestions (`errors.py`), env var validation (`env_validation.py`)
- **`management/`** — Index lifecycle: discovery (`discovery.py`), stats (`stats.py` — includes `check_deps_staleness()` for dependency freshness checks), clearing (`clear.py`), git-based naming (`git.py`), metadata with collision detection, status tracking, embedding provider/model tracking, and `deps_extracted_at` timestamp (`metadata.py`), project root detection (`context.py`)
- **`deps/`** — Dependency graph framework: pluggable extractors (`extractors/`), pluggable module resolvers (`resolver.py`), edge storage (`db.py`), extraction orchestrator (`extractor.py`), query API with transitive BFS traversal (`query.py`), data models (`models.py`), autodiscovery registry (`registry.py`). 11 extractors: Python imports, JavaScript/TypeScript (ES6 + CommonJS + re-exports), Go imports, ArgoCD (Application/ApplicationSet/AppProject — project refs, source repos/charts/paths, destinations, generator repos; multi-document YAML via `safe_load_all`), Docker Compose (image/depends_on/extends), GitHub Actions (uses refs with parsed owner/repo/version, needs inter-job deps), GitLab CI (include local/project/remote/template, extends template inheritance, needs DAG deps, trigger child/multi-project pipelines, image/service refs), Terraform (module sources with version, required_providers, remote_state, tfvars associations), Helm (template includes, Chart.yaml subcharts, chart membership ownership with `is_subchart` indicator, subchart-to-parent links), Markdown (documentation references: frontmatter depends, links, inline code, code blocks). 5 module resolvers: Python (dotted modules, `__init__.py`, relative imports, `src/`/`lib/` prefix stripping), JavaScript (extension probing, index files), Go (import path suffix matching), Terraform (local module sources), Markdown (relative path normalization, directory reference matching). Query layer supports direct lookups (`get_dependencies`/`get_dependents`), transitive BFS trees (`get_dependency_tree`/`get_impact` with cycle detection and depth limits), batch-aware multi-root BFS (`get_dependency_tree_batch`/`get_impact_batch` with shared visited set), and detailed stats (`get_dep_stats_detailed`). 
Three edge types: "import" (code imports), "call" (symbol calls), "reference" (grammar-level refs with `metadata.kind` for specifics — Helm uses `chart_member` for template/values→Chart.yaml ownership and `subchart_of` for subchart→parent chart links).
- **`handlers/`** — Language-specific chunking (HCL, Go Template, Dockerfile, Bash, Scala, Groovy) and grammar handlers (`handlers/grammars/` — ArgoCD, Helm Chart, Helm Template, Helm Values, GitHub Actions, GitLab CI, Docker Compose, Kubernetes, Terraform) with autodiscovery registry
@@ -187,7 +187,7 @@ Project config via `cocosearch.yaml` (no leading dot) in project root. The `inde

**Logging:** Log file output is disabled by default. Enable via `logging.file: true` in `cocosearch.yaml` or `COCOSEARCH_LOG_FILE=true` env var. Logs are written to `~/.cocosearch/logs/cocosearch.log` with 10MB rotation and 3 backups. The web dashboard log panel supports category filtering (search, index, mcp, cache, infra, system, deps) and level filtering (DEBUG+, INFO+, WARN+, ERROR+).

**Embedding providers:** CocoSearch supports multiple embedding providers: `ollama` (default, local), `openai`, and `openrouter`. Provider selection is via `COCOSEARCH_EMBEDDING_PROVIDER` env var or the `embedding.provider` field in `cocosearch.yaml`. Remote providers require `COCOSEARCH_EMBEDDING_API_KEY`. Default models: ollama→`nomic-embed-text`, openai→`text-embedding-3-small`, openrouter→`openai/text-embedding-3-small`. Index metadata tracks which provider/model was used; switching requires `--fresh` reindex.
**Embedding providers:** CocoSearch supports multiple embedding providers: `ollama` (default, local), `openai`, and `openrouter`. Provider selection is via `COCOSEARCH_EMBEDDING_PROVIDER` env var or the `embedding.provider` field in `cocosearch.yaml`. Remote providers require `COCOSEARCH_EMBEDDING_API_KEY` (unless `baseUrl` is set for local OpenAI-compatible servers). `COCOSEARCH_EMBEDDING_BASE_URL` (or `embedding.baseUrl` in config) overrides the provider's default endpoint — use it with local OpenAI-API-compatible servers (Infinity, text-embeddings-inference, vLLM). For the `ollama` provider, `baseUrl` overrides `COCOSEARCH_OLLAMA_URL`. Default models: ollama→`nomic-embed-text`, openai→`text-embedding-3-small`, openrouter→`openai/text-embedding-3-small`. Index metadata tracks which provider/model was used; switching requires `--fresh` reindex.

**Docker / client mode env vars:**
- `COCOSEARCH_SERVER_URL` — When set, CLI forwards commands to the remote server instead of running locally (e.g., `http://localhost:3000`)
18 changes: 16 additions & 2 deletions README.md
@@ -575,6 +575,7 @@ indexing:
embedding:
provider: ollama # ollama (default), openai, openrouter
model: nomic-embed-text # default depends on provider
# baseUrl: http://localhost:8080 # custom OpenAI-compatible endpoint
```

### Remote Embedding Providers
@@ -599,11 +600,24 @@ uv run cocosearch config check
| Provider | Default Model | API Key Required |
|----------|--------------|-----------------|
| `ollama` | `nomic-embed-text` | No (local) |
| `openai` | `text-embedding-3-small` | Yes |
| `openrouter` | `openai/text-embedding-3-small` | Yes |
| `openai` | `text-embedding-3-small` | Yes (optional with `baseUrl`) |
| `openrouter` | `openai/text-embedding-3-small` | Yes (optional with `baseUrl`) |

Switching providers on an existing index requires `--fresh` to reindex with the new embedding model.

#### Custom Endpoints

Use `embedding.baseUrl` (or `COCOSEARCH_EMBEDDING_BASE_URL`) to point any provider at a local OpenAI-compatible server such as [Infinity](https://github.com/michaelfeil/infinity), [text-embeddings-inference](https://github.com/huggingface/text-embeddings-inference), or [vLLM](https://github.com/vllm-project/vllm):

```yaml
embedding:
provider: openai
model: BAAI/bge-small-en-v1.5
baseUrl: http://localhost:8080
```

When `baseUrl` is set, the API key is not required. For the `ollama` provider, `baseUrl` overrides `COCOSEARCH_OLLAMA_URL`.

## Testing

Tests use [pytest](https://docs.pytest.org/). All tests are unit tests, fully mocked, and require no infrastructure. Markers are auto-applied based on directory -- no need to add them manually.
4 changes: 2 additions & 2 deletions docs/architecture.md
@@ -12,7 +12,7 @@ CocoSearch is a local-first hybrid semantic code search system. This document pr

## System Components

**Embedding Provider:** Generates 768-dimensional vectors from code chunks. By default, Ollama runs `nomic-embed-text` locally — no API keys, no network calls. Optional remote providers (OpenAI with `text-embedding-3-small`, OpenRouter) are available for teams that prefer managed infrastructure. Implementation: `src/cocosearch/indexer/embedder.py`
**Embedding Provider:** Generates 768-dimensional vectors from code chunks. By default, Ollama runs `nomic-embed-text` locally — no API keys, no network calls. Optional remote providers (OpenAI with `text-embedding-3-small`, OpenRouter) are available for teams that prefer managed infrastructure. Any provider can be pointed at a custom endpoint via `baseUrl` for local OpenAI-compatible servers (Infinity, text-embeddings-inference, vLLM). Implementation: `src/cocosearch/indexer/embedder.py`

**PostgreSQL + pgvector:** Database storing code chunks with their vector embeddings. The pgvector extension enables efficient cosine similarity search over embedding vectors. Also provides full-text search via tsvector columns for keyword matching. Implementation: `src/cocosearch/search/db.py`

@@ -90,7 +90,7 @@ See [MCP Tools Reference](mcp-tools.md) for complete parameter documentation, re

## Key Design Decisions

**Local-first:** All processing happens on your machine by default. Ollama runs the embedding model locally, PostgreSQL stores data locally. Optional remote embedding providers (OpenAI, OpenRouter) send only chunk text for embedding — all indexing, storage, and search remain local. Your code never leaves your environment.
**Local-first:** All processing happens on your machine by default. Ollama runs the embedding model locally, PostgreSQL stores data locally. Optional remote embedding providers (OpenAI, OpenRouter) send only chunk text for embedding — all indexing, storage, and search remain local. Any provider can also target a local OpenAI-compatible server via `baseUrl`, keeping embeddings fully on-machine without Ollama. Your code never leaves your environment.

**Infra-only Docker:** Docker provides PostgreSQL+pgvector and Ollama infrastructure only. CocoSearch runs natively via `uvx` for faster iteration and simpler updates. This keeps the Docker image lightweight and avoids Python dependency management inside containers.

2 changes: 1 addition & 1 deletion docs/how-it-works.md
@@ -134,7 +134,7 @@ By default, everything described above runs on your machine:

The only external dependencies are Docker (to run Postgres and Ollama) and the embedding model weights (downloaded once by Ollama). After that, you could run CocoSearch on an airplane.

**Optional remote embeddings:** If you prefer managed infrastructure or don't want to run Ollama locally, CocoSearch supports OpenAI and OpenRouter as embedding providers. When using a remote provider, only chunk text is sent for embedding — all indexing logic, storage, and search remain fully local. Configure via `embedding.provider` in `cocosearch.yaml` or the `COCOSEARCH_EMBEDDING_PROVIDER` environment variable.
**Optional remote embeddings:** If you prefer managed infrastructure or don't want to run Ollama locally, CocoSearch supports OpenAI and OpenRouter as embedding providers. When using a remote provider, only chunk text is sent for embedding — all indexing logic, storage, and search remain fully local. Configure via `embedding.provider` in `cocosearch.yaml` or the `COCOSEARCH_EMBEDDING_PROVIDER` environment variable. You can also use `embedding.baseUrl` (or `COCOSEARCH_EMBEDDING_BASE_URL`) to point any provider at a local OpenAI-compatible server (Infinity, text-embeddings-inference, vLLM) — in that case, no API key is required and embeddings stay fully local.

## Beyond Search: Dependency Graph

22 changes: 21 additions & 1 deletion docs/mcp-configuration.md
@@ -197,6 +197,14 @@ claude mcp add --scope user \
--env COCOSEARCH_EMBEDDING_API_KEY=sk-... \
cocosearch -- \
uvx --from cocosearch cocosearch mcp --project-from-cwd

# Or with a local OpenAI-compatible server (no API key needed):
claude mcp add --scope user \
--env COCOSEARCH_EMBEDDING_PROVIDER=openai \
--env COCOSEARCH_EMBEDDING_BASE_URL=http://localhost:8080 \
--env COCOSEARCH_EMBEDDING_MODEL=BAAI/bge-small-en-v1.5 \
cocosearch -- \
uvx --from cocosearch cocosearch mcp --project-from-cwd
```

**Claude Desktop / OpenCode (JSON config):**
@@ -212,7 +220,19 @@ Add to your server's `"env"` block (or `"environment"` for OpenCode):
}
```

Supported providers: `ollama` (default), `openai`, `openrouter`. With a remote provider, you do not need Ollama running — only PostgreSQL is required.
For a local OpenAI-compatible server, use `COCOSEARCH_EMBEDDING_BASE_URL` instead of an API key:

```json
{
"env": {
"COCOSEARCH_EMBEDDING_PROVIDER": "openai",
"COCOSEARCH_EMBEDDING_BASE_URL": "http://localhost:8080",
"COCOSEARCH_EMBEDDING_MODEL": "BAAI/bge-small-en-v1.5"
}
}
```

Supported providers: `ollama` (default), `openai`, `openrouter`. With a remote provider, you do not need Ollama running — only PostgreSQL is required. Use `COCOSEARCH_EMBEDDING_BASE_URL` (or `embedding.baseUrl` in config) to point any provider at a custom endpoint.

### Project Detection

2 changes: 1 addition & 1 deletion docs/retrieval.md
@@ -73,7 +73,7 @@ The indexing pipeline transforms raw code files into searchable chunks with embe
- Uses CocoIndex's shared transform — embedding function evaluated once and reused across all chunks in the flow
- The filename prefix is only used for embedding input — the stored `content_text` column retains the raw chunk text
- Same embedding function used during search queries to ensure consistency (search queries are NOT prefixed with filenames — intentional asymmetry: document embeddings are enriched, queries stay natural)
- Ollama server address configured via `COCOSEARCH_OLLAMA_URL` environment variable (defaults to http://localhost:11434)
- Server address configured via `COCOSEARCH_EMBEDDING_BASE_URL` (or `embedding.baseUrl` in config) for any provider, overriding the default endpoint. For the `ollama` provider specifically, `COCOSEARCH_OLLAMA_URL` is the fallback (defaults to http://localhost:11434)

**Implementation:** `src/cocosearch/indexer/embedder.py` — `add_filename_context`, `code_to_embedding`

8 changes: 8 additions & 0 deletions src/cocosearch/cli.py
@@ -1873,6 +1873,14 @@ def _source_label(source: str) -> str:
)
table.add_row("COCOSEARCH_OLLAMA_URL", ollama_url, ollama_url_source)

# EMBEDDING_BASE_URL (optional, any provider)
base_url, base_url_source = check_resolver.resolve(
"embedding.baseUrl", None, "COCOSEARCH_EMBEDDING_BASE_URL"
)
if base_url is not None:
base_url_source = _source_label(base_url_source)
table.add_row("COCOSEARCH_EMBEDDING_BASE_URL", base_url, base_url_source)

console.print(table)
console.print()

12 changes: 9 additions & 3 deletions src/cocosearch/config/resolver.py
@@ -253,9 +253,9 @@ def _get_default_value(self, field_path: str) -> Any:
def bridge_embedding_config(self) -> tuple[str, str]:
"""Resolve embedding config and bridge to env vars.

Ensures COCOSEARCH_EMBEDDING_PROVIDER, COCOSEARCH_EMBEDDING_MODEL, and
COCOSEARCH_EMBEDDING_OUTPUT_DIMENSION env vars reflect the full
precedence chain (env > config file > default).
Ensures COCOSEARCH_EMBEDDING_PROVIDER, COCOSEARCH_EMBEDDING_MODEL,
COCOSEARCH_EMBEDDING_OUTPUT_DIMENSION, and COCOSEARCH_EMBEDDING_BASE_URL
env vars reflect the full precedence chain (env > config file > default).

Returns:
Tuple of (provider, model).
@@ -268,10 +268,16 @@ def bridge_embedding_config(self) -> tuple[str, str]:
"embedding.outputDimension", None, "COCOSEARCH_EMBEDDING_OUTPUT_DIMENSION"
)

base_url, _ = self.resolve(
"embedding.baseUrl", None, "COCOSEARCH_EMBEDDING_BASE_URL"
)

os.environ["COCOSEARCH_EMBEDDING_PROVIDER"] = str(provider)
os.environ["COCOSEARCH_EMBEDDING_MODEL"] = str(model)
if dim is not None:
os.environ["COCOSEARCH_EMBEDDING_OUTPUT_DIMENSION"] = str(dim)
if base_url is not None:
os.environ["COCOSEARCH_EMBEDDING_BASE_URL"] = str(base_url)

return provider, model

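The precedence chain `bridge_embedding_config` applies (env > config file > default) can be sketched in isolation. This is a simplified stand-in, not the real `resolve` implementation — the helper name and signature are assumptions for illustration:

```python
import os


def resolve_setting(env_var: str, file_value=None, default=None):
    """Illustrative precedence chain (env > config file > default),
    as bridge_embedding_config applies it before exporting env vars."""
    env_value = os.environ.get(env_var)
    if env_value is not None:
        return env_value
    if file_value is not None:
        return file_value
    return default
```

Bridging then amounts to calling this per setting and writing the winner back into `os.environ` so downstream code sees one consistent value.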
1 change: 1 addition & 0 deletions src/cocosearch/config/schema.py
@@ -50,6 +50,7 @@ class EmbeddingSection(BaseModel):
provider: str = Field(default="ollama")
model: str | None = Field(default=None)
outputDimension: int | None = Field(default=None)
baseUrl: str | None = Field(default=None)

@model_validator(mode="after")
def _validate_provider_and_defaults(self) -> "EmbeddingSection":
19 changes: 11 additions & 8 deletions src/cocosearch/indexer/embedder.py
@@ -144,14 +144,17 @@ def code_to_embedding(

kwargs: dict = {"api_type": api_type, "model": model}

if provider == "ollama":
kwargs["address"] = os.environ.get("COCOSEARCH_OLLAMA_URL")
else:
api_key = os.environ.get("COCOSEARCH_EMBEDDING_API_KEY")
if api_key:
kwargs["api_key"] = cocoindex.auth_registry.add_transient_auth_entry(
api_key
)
# Resolve address: COCOSEARCH_EMBEDDING_BASE_URL (universal) > COCOSEARCH_OLLAMA_URL (ollama fallback)
address = os.environ.get("COCOSEARCH_EMBEDDING_BASE_URL")
if address is None and provider == "ollama":
address = os.environ.get("COCOSEARCH_OLLAMA_URL")
if address:
kwargs["address"] = address

# API key (any provider — local servers just won't set it)
api_key = os.environ.get("COCOSEARCH_EMBEDDING_API_KEY")
if api_key:
kwargs["api_key"] = cocoindex.auth_registry.add_transient_auth_entry(api_key)

output_dim = _resolve_output_dimension(model)
if output_dim is not None:
1 change: 1 addition & 0 deletions src/cocosearch/indexer/flow.py
@@ -248,6 +248,7 @@ def run_index(
ollama_url=os.environ.get("COCOSEARCH_OLLAMA_URL"),
embedding_model=embedding_model,
provider=embedding_provider,
base_url=os.environ.get("COCOSEARCH_EMBEDDING_BASE_URL"),
)

_get_cs_log().infra(
6 changes: 5 additions & 1 deletion src/cocosearch/indexer/preflight.py
@@ -24,19 +24,23 @@ def check_infrastructure(
ollama_url: str | None = None,
embedding_model: str = "nomic-embed-text",
provider: str = "ollama",
base_url: str | None = None,
) -> None:
"""Check infrastructure is reachable. Raises ConnectionError if not.

For Ollama provider: checks PostgreSQL, Ollama server, and model availability.
For remote providers (OpenAI, OpenRouter): checks PostgreSQL and API key.
When base_url is set for remote providers, API key check is skipped
(local OpenAI-compatible server, key optional).
"""
check_postgres(db_url)
if provider == "ollama":
resolved_url = ollama_url or DEFAULT_OLLAMA_URL
check_ollama(resolved_url)
check_ollama_model(resolved_url, embedding_model)
else:
check_api_key(provider)
if base_url is None:
check_api_key(provider)


def check_postgres(db_url: str) -> None: