Fix get_usage_stats: use per-tool cost multipliers instead of full codebase per query

RecoDemo · claude · RecoDemo · commit 66b956b6aa44 · 2026-02-20T22:02:58.000Z
The naive "without indexer" estimate was massively overstated — it assumed
every query would require reading the entire codebase. Now each tool has a
realistic multiplier (e.g. find_symbol=5%, get_change_impact=30%) reflecting
what fraction of the codebase you'd actually need to read without the indexer.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "mcp-codebase-index"
-version = "0.4.4"
+version = "0.4.5"
 description = "Structural codebase indexer with MCP server for AI-assisted development"
 requires-python = ">=3.11"
 readme = "README.md"
diff --git a/src/mcp_codebase_index/server.py b/src/mcp_codebase_index/server.py
@@ -60,6 +60,27 @@
 _tool_call_counts: dict[str, int] = {}
 _total_chars_returned: int = 0
 
+# Realistic estimate of the fraction (0.0-1.0) of the codebase you'd need to read per call without the indexer
+_TOOL_COST_MULTIPLIERS: dict[str, float] = {
+    "get_project_summary": 0.10,
+    "list_files": 0.01,
+    "get_structure_summary": 0.05,
+    "get_functions": 0.05,
+    "get_classes": 0.05,
+    "get_imports": 0.03,
+    "get_function_source": 0.02,
+    "get_class_source": 0.03,
+    "find_symbol": 0.05,
+    "get_dependencies": 0.10,
+    "get_dependents": 0.15,
+    "get_change_impact": 0.30,
+    "get_call_chain": 0.20,
+    "get_file_dependencies": 0.02,
+    "get_file_dependents": 0.10,
+    "search_codebase": 0.15,
+    "reindex": 0.0,
+}
+
 
 def _format_result(value: object) -> str:
     """Format a query result as readable text."""
@@ -101,8 +122,13 @@ def _format_usage_stats() -> str:
     if source_chars > 0:
         lines.append(f"Total source in index: {source_chars:,} chars")
         if query_calls > 0 and source_chars > _total_chars_returned:
-            # Each query could have required reading the full source
-            naive_chars = source_chars * query_calls
+            # Per-tool estimate of what you'd read without the indexer
+            naive_chars = 0
+            for tool_name, count in _tool_call_counts.items():
+                if tool_name == "get_usage_stats":
+                    continue
+                multiplier = _TOOL_COST_MULTIPLIERS.get(tool_name, 0.10)
+                naive_chars += int(source_chars * multiplier * count)
             reduction = (1 - _total_chars_returned / naive_chars) * 100 if naive_chars > 0 else 0
             lines.append(
                 f"Estimated without indexer: {naive_chars:,} chars "
diff --git a/tests/test_usage_stats.py b/tests/test_usage_stats.py
@@ -88,7 +88,8 @@ def test_with_indexed_project(self, tmp_path):
         assert "Total source in index:" in result
         assert "Estimated token savings:" in result
 
-    def test_token_savings_calculation(self, tmp_path):
+    def test_token_savings_uses_per_tool_multipliers(self, tmp_path):
+        """Naive estimate should use per-tool cost multipliers, not full codebase per query."""
         import mcp_codebase_index.server as srv
         from mcp_codebase_index.project_indexer import ProjectIndexer
 
@@ -99,15 +100,64 @@ def test_token_savings_calculation(self, tmp_path):
         indexer.index()
         srv._indexer = indexer
 
+        source_chars = sum(m.total_chars for m in indexer._project_index.files.values())
+
+        # find_symbol has multiplier 0.05, so 10 calls = source_chars * 0.05 * 10
         srv._tool_call_counts["find_symbol"] = 10
         srv._total_chars_returned = 500
 
         result = srv._format_usage_stats()
         assert "Estimated without indexer:" in result
         assert "Estimated with indexer:" in result
-        # 500 chars returned vs 6000 * 10 = 60000 naive
         assert "tokens" in result
 
+        # The naive estimate should be source_chars * 0.05 * 10, NOT source_chars * 10
+        expected_naive = int(source_chars * 0.05 * 10)
+        assert f"{expected_naive:,} chars" in result
+
+    def test_different_tools_produce_different_costs(self, tmp_path):
+        """Tools with different multipliers should produce different naive estimates."""
+        import mcp_codebase_index.server as srv
+        from mcp_codebase_index.project_indexer import ProjectIndexer
+
+        (tmp_path / "code.py").write_text("x = 1\n" * 1000)
+
+        indexer = ProjectIndexer(str(tmp_path), include_patterns=["**/*.py"])
+        indexer.index()
+        srv._indexer = indexer
+
+        source_chars = sum(m.total_chars for m in indexer._project_index.files.values())
+
+        # Test with a cheap tool (list_files: 0.01)
+        srv._tool_call_counts["list_files"] = 1
+        srv._total_chars_returned = 50
+        result_cheap = srv._format_usage_stats()
+
+        # Reset and test with an expensive tool (get_change_impact: 0.30)
+        srv._tool_call_counts.clear()
+        srv._total_chars_returned = 50
+        srv._tool_call_counts["get_change_impact"] = 1
+        result_expensive = srv._format_usage_stats()
+
+        # Extract the "Estimated without indexer" numbers
+        def extract_naive(text: str) -> int:
+            for line in text.splitlines():
+                if "Estimated without indexer:" in line:
+                    # Format: "Estimated without indexer: N chars (M tokens) over Q queries"
+                    num_str = line.split(":")[1].split("chars")[0].strip().replace(",", "")
+                    return int(num_str)
+            return 0
+
+        cheap_naive = extract_naive(result_cheap)
+        expensive_naive = extract_naive(result_expensive)
+
+        assert cheap_naive > 0
+        assert expensive_naive > 0
+        assert expensive_naive > cheap_naive
+        # Verify exact values based on multipliers
+        assert cheap_naive == int(source_chars * 0.01)
+        assert expensive_naive == int(source_chars * 0.30)
+
     def test_no_savings_section_without_index(self):
         import mcp_codebase_index.server as srv