Commit 5887ac7
fix(builder): address agent review findings for resume feature

H1: Annotate the _load_resume_state() call in _build_async with a comment clarifying that the return value is intentionally discarded; the async path derives ground-truth state from the filesystem, not from metadata.
M1: Replace the fragile split("_", 1)[1] filename parsing in _find_completed_row_group_ids with re.fullmatch(r"batch_(\d+)", stem), making it immune to unexpected filename shapes.
L2: Remove the unused _ResumeState.buffer_size field; it was set but never read, and _build_with_resume uses the buffer_size parameter directly.
L4: Move the mid-file imports (json, Path, ArtifactStorage) used by the resume tests to the top of test_dataset_builder.py and drop the underscore aliases.
L1: Update plans/525/resume-interrupted-runs.md to reflect that async-engine resume is fully implemented (not deferred) and that missing metadata triggers a fresh restart instead of raising DatasetGenerationError.
1 parent c496e7e commit 5887ac7

3 files changed: 16 additions & 19 deletions

packages/data-designer-engine/src/data_designer/engine/dataset_builders/dataset_builder.py

Lines changed: 6 additions & 6 deletions
@@ -7,6 +7,7 @@
 import functools
 import logging
 import os
+import re
 import time
 import uuid
 from dataclasses import dataclass
@@ -88,7 +89,6 @@
 class _ResumeState:
     num_completed_batches: int
     actual_num_records: int
-    buffer_size: int
 
 
 class DatasetBuilder:
@@ -255,7 +255,6 @@ def _load_resume_state(self, num_records: int, buffer_size: int) -> _ResumeState
         return _ResumeState(
             num_completed_batches=metadata["num_completed_batches"],
             actual_num_records=metadata["actual_num_records"],
-            buffer_size=buffer_size,
         )
 
     def _build_with_resume(
@@ -377,10 +376,9 @@ def _find_completed_row_group_ids(self) -> set[int]:
             return set()
         ids: set[int] = set()
         for p in final_path.glob("batch_*.parquet"):
-            try:
-                ids.add(int(p.stem.split("_", 1)[1]))
-            except (ValueError, IndexError):
-                continue
+            m = re.fullmatch(r"batch_(\d+)", p.stem)
+            if m:
+                ids.add(int(m.group(1)))
         return ids
 
     def _build_async(
@@ -408,6 +406,8 @@ def _build_async(
         initial_total_num_batches = 0
 
         if resume:
+            # Validate run-parameter compatibility only; the async path derives
+            # ground-truth state from the filesystem, not from the returned state object.
             self._load_resume_state(num_records, buffer_size)
             completed_ids = self._find_completed_row_group_ids()
             skip_row_groups = frozenset(completed_ids)
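
The M1 fix is easiest to see on edge-case filenames. Below is a minimal standalone sketch (the stems list is hypothetical) contrasting the old split-based parsing with the committed re.fullmatch pattern. Note that Python's int() accepts sign prefixes and PEP 515 underscore separators, which is exactly the fragility the review flagged:

import re

stems = ["batch_3", "batch_1_000", "batch_+7", "batch_final"]

for stem in stems:
    # Old parsing: everything after the first "_" goes to int(), which
    # tolerates "+7" and even "1_000" (underscore digit separators).
    try:
        old = int(stem.split("_", 1)[1])
    except (ValueError, IndexError):
        old = None

    # New parsing: the stem must be exactly batch_<digits>.
    m = re.fullmatch(r"batch_(\d+)", stem)
    new = int(m.group(1)) if m else None
    print(f"{stem!r}: old={old}, new={new}")

# batch_3      -> old=3,    new=3
# batch_1_000  -> old=1000, new=None  (old silently mis-parsed the ID)
# batch_+7     -> old=7,    new=None
# batch_final  -> old=None, new=None  (both skip it)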

packages/data-designer-engine/tests/engine/dataset_builders/test_dataset_builder.py

Lines changed: 7 additions & 10 deletions
@@ -3,7 +3,9 @@
 
 from __future__ import annotations
 
+import json
 import logging
+from pathlib import Path
 from typing import TYPE_CHECKING
 from unittest.mock import Mock, patch
 
@@ -32,6 +34,7 @@
 from data_designer.engine.processing.processors.base import Processor
 from data_designer.engine.registry.data_designer_registry import DataDesignerRegistry
 from data_designer.engine.resources.seed_reader import DataFrameSeedReader
+from data_designer.engine.storage.artifact_storage import ArtifactStorage
 
 if TYPE_CHECKING:
     import pandas as pd
@@ -944,22 +947,16 @@ def test_allow_resize_multiple_batches(
 # ---------------------------------------------------------------------------
 
 
-import json as _json
-from pathlib import Path as _Path
-
-from data_designer.engine.storage.artifact_storage import ArtifactStorage as _ArtifactStorage
-
-
-def _write_metadata(dataset_dir: _Path, **fields) -> None:
+def _write_metadata(dataset_dir: Path, **fields) -> None:
     """Write a metadata.json into an existing dataset folder."""
     dataset_dir.mkdir(parents=True, exist_ok=True)
     (dataset_dir / "sentinel.txt").write_text("x")  # make folder non-empty for resolved_dataset_name
-    (dataset_dir / "metadata.json").write_text(_json.dumps(fields))
+    (dataset_dir / "metadata.json").write_text(json.dumps(fields))
 
 
 def _make_resume_builder(stub_resource_provider, stub_test_config_builder, tmp_path, *, buffer_size: int = 2):
     """Return a DatasetBuilder whose ArtifactStorage has resume=True."""
-    storage = _ArtifactStorage(artifact_path=tmp_path, resume=True)
+    storage = ArtifactStorage(artifact_path=tmp_path, resume=True)
     stub_resource_provider.artifact_storage = storage
     stub_resource_provider.run_config = RunConfig(buffer_size=buffer_size)
     return DatasetBuilder(
@@ -1113,7 +1110,7 @@ def test_find_completed_row_group_ids_ignores_non_batch_files(
 # ---------------------------------------------------------------------------
 
 
-def _write_parquet_files(parquet_dir: _Path, row_group_ids: list[int]) -> None:
+def _write_parquet_files(parquet_dir: Path, row_group_ids: list[int]) -> None:
     """Create stub batch_*.parquet files for the given row group IDs."""
     parquet_dir.mkdir(parents=True, exist_ok=True)
     for rg_id in row_group_ids:
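
Together these helpers let a test stage an interrupted run on disk before exercising resume. A minimal sketch of that staging step, assuming pytest's tmp_path fixture; the folder names and metadata values here are illustrative, not taken from the real tests:

from pathlib import Path


def test_stage_interrupted_run(tmp_path: Path) -> None:
    dataset_dir = tmp_path / "my-dataset"  # hypothetical dataset folder

    # Checkpoint metadata as it might look after two completed batches.
    _write_metadata(
        dataset_dir,
        num_completed_batches=2,
        actual_num_records=4,
        target_num_records=10,
        buffer_size=2,
    )

    # Stub parquet output for the two completed batches.
    _write_parquet_files(dataset_dir / "parquet-files", [0, 1])

    assert (dataset_dir / "metadata.json").exists()
    stems = sorted(p.stem for p in (dataset_dir / "parquet-files").glob("batch_*.parquet"))
    assert stems == ["batch_0", "batch_1"]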

plans/525/resume-interrupted-runs.md

Lines changed: 3 additions & 3 deletions
@@ -38,9 +38,9 @@ results = dd.create(config_builder, num_records=10_000, resume=True)
 | Resume state source | Read `metadata.json` written after each completed batch | Already contains `num_completed_batches`, `target_num_records`, `buffer_size`, `actual_num_records`. No new persistence needed. |
 | Partial batch at crash time | Clear `tmp-partial-parquet-files/` at resume start | Simpler and safer than merging an incomplete parquet; losing one batch is acceptable since the user is already recovering from a crash. |
 | Compatibility validation | Raise `DatasetGenerationError` if `num_records` or `buffer_size` changed | Different `num_records` changes which rows land in which batch file, breaking the numbering invariant. `buffer_size` changes the file-per-batch mapping. Both must match. |
-| Async engine | Raise `DatasetGenerationError` if `DATA_DESIGNER_ASYNC_ENGINE=1` with `resume=True` | The async path uses a row-group scheduler rather than an indexed batch loop; resume would require a different strategy. Out of scope for v1. |
+| Async engine | Supported: derives ground-truth state from the filesystem via `_find_completed_row_group_ids()` | The async path scans completed `batch_*.parquet` files rather than reading `metadata.json`, avoiding the metadata-lag crash window. Both `initial_actual_num_records` and `initial_total_num_batches` are sourced from the filesystem. Incremental `write_metadata` calls after each row group enable resumability. |
 | Already-complete runs | Detect and warn, return existing path | If `num_completed_batches == total_num_batches` the dataset is already complete; the user may have re-run by mistake. |
-| No metadata → error | Raise `DatasetGenerationError` | Resuming without a checkpoint is impossible; a clear error is better than silent fallback to a fresh run. |
+| No metadata → restart fresh | Log an info message and restart from batch 0 | If `metadata.json` is missing, the run was interrupted before any batch completed. Restarting silently is safer UX than forcing the user to remove `resume=True`. |
 
 ## Affected Files
 
@@ -168,7 +168,7 @@ def build(self, *, num_records, on_batch_complete=None, save_multimedia_to_disk=
 ## Trade-offs Considered
 
 - **Automatic resume detection** (no flag, detect existing folder automatically): rejected; it removes user intent. A user re-running a pipeline from scratch would be surprised by silent resumption.
-- **Resume support for async engine**: deferred to a follow-up. The async scheduler's row-group model doesn't map 1:1 to batch indices; implementing it would require a separate mechanism.
+- **Resume support for async engine**: implemented (diverges from the original plan). The async path scans the filesystem for completed row groups instead of relying on potentially stale `metadata.json`, handling the crash window between `move_partial_result_to_final_file_path` and `write_metadata`.
 - **Per-column resume** (resume from column N within an interrupted batch): out of scope. Requires per-column checkpointing and state reconstruction, significantly higher complexity.
 
 ## Delivery
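
The plan's decision table composes into a small branch at resume time. The sketch below is illustrative only, under stated assumptions: the field names (num_completed_batches, target_num_records, buffer_size) and the error type follow the plan, but plan_resume itself and the ceiling-division batch count are hypothetical, not the shipped implementation:

import json
import logging
from pathlib import Path

logger = logging.getLogger(__name__)


class DatasetGenerationError(Exception):
    """Stand-in for the engine's real exception type."""


def plan_resume(dataset_dir: Path, num_records: int, buffer_size: int) -> int:
    """Return the batch index to resume from, per the plan's decision table."""
    metadata_path = dataset_dir / "metadata.json"

    # No metadata -> restart fresh: the run died before any batch completed.
    if not metadata_path.exists():
        logger.info("No metadata.json found; restarting from batch 0.")
        return 0

    metadata = json.loads(metadata_path.read_text())

    # Compatibility validation: changed parameters break the numbering invariant.
    if metadata["target_num_records"] != num_records or metadata["buffer_size"] != buffer_size:
        raise DatasetGenerationError(
            "num_records/buffer_size differ from the interrupted run; cannot resume"
        )

    # Already-complete runs: warn, then return the existing checkpoint position.
    total_num_batches = -(-num_records // buffer_size)  # ceiling division (assumed mapping)
    if metadata["num_completed_batches"] == total_num_batches:
        logger.warning("Dataset already complete; returning existing path.")

    return metadata["num_completed_batches"]

On the async path, the same compatibility check runs via _load_resume_state, but the starting state comes from the batch_*.parquet filesystem scan rather than from the metadata fields.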
