1313from typing import TYPE_CHECKING , Callable
1414
1515from data_designer .config .column_types import ColumnConfigT
16+ from data_designer .config .config_builder import BuilderConfig
17+ from data_designer .config .data_designer_config import DataDesignerConfig
1618from data_designer .config .dataset_builders import BuildStage
1719from data_designer .config .processors import (
1820 DropColumnsProcessorConfig ,
2527 GenerationStrategy ,
2628)
2729from data_designer .engine .column_generators .utils .generator_classification import column_type_is_model_generated
28- from data_designer .engine .dataset_builders .artifact_storage import ArtifactStorage
30+ from data_designer .engine .compiler import compile_data_designer_config
31+ from data_designer .engine .dataset_builders .artifact_storage import SDG_CONFIG_FILENAME , ArtifactStorage
2932from data_designer .engine .dataset_builders .errors import DatasetGenerationError , DatasetProcessingError
30- from data_designer .engine .dataset_builders .multi_column_configs import DatasetBuilderColumnConfigT , MultiColumnConfig
33+ from data_designer .engine .dataset_builders .multi_column_configs import MultiColumnConfig
3134from data_designer .engine .dataset_builders .utils .concurrency import (
3235 MAX_CONCURRENCY_PER_NON_LLM_GENERATOR ,
3336 ConcurrentThreadExecutor ,
3437)
38+ from data_designer .engine .dataset_builders .utils .config_compiler import compile_dataset_builder_column_configs
3539from data_designer .engine .dataset_builders .utils .dataset_batch_manager import DatasetBatchManager
3640from data_designer .engine .models .telemetry import InferenceEvent , NemoSourceEnum , TaskStatusEnum , TelemetryHandler
3741from data_designer .engine .processing .processors .base import Processor
5458class ColumnWiseDatasetBuilder :
def __init__(
    self,
    data_designer_config: DataDesignerConfig,
    resource_provider: ResourceProvider,
    registry: DataDesignerRegistry | None = None,
):
    """Compile the supplied configuration and set up the builder's state.

    Args:
        data_designer_config: Configuration that is run through
            ``compile_data_designer_config`` before anything else uses it.
        resource_provider: Passed to the compile step; its
            ``artifact_storage`` backs the batch manager.
        registry: Registry to use. A new ``DataDesignerRegistry`` is
            created when this argument is falsy.
    """
    self.batch_manager = DatasetBatchManager(resource_provider.artifact_storage)
    self._resource_provider = resource_provider
    self._records_to_drop: set[int] = set()
    self._registry = DataDesignerRegistry() if not registry else registry

    # Everything below works from the compiled configuration, never the raw input.
    compiled_config = compile_data_designer_config(data_designer_config, resource_provider)
    self._data_designer_config = compiled_config
    self._column_configs = compile_dataset_builder_column_configs(compiled_config)

    # A missing/empty processor list is treated as "no processors".
    processor_configs = compiled_config.processors or []
    self._processors: dict[BuildStage, list[Processor]] = self._initialize_processors(processor_configs)

    self._validate_column_configs()
6976
7077 @property
@@ -91,9 +98,8 @@ def build(
9198 num_records : int ,
9299 on_batch_complete : Callable [[Path ], None ] | None = None ,
93100 ) -> Path :
94- self ._write_configs ()
95101 self ._run_model_health_check_if_needed ()
96-
102+ self . _write_builder_config ()
97103 generators = self ._initialize_generators ()
98104 start_time = time .perf_counter ()
99105 group_id = uuid .uuid4 ().hex
@@ -152,6 +158,12 @@ def _initialize_generators(self) -> list[ColumnGenerator]:
152158 for config in self ._column_configs
153159 ]
154160
def _write_builder_config(self) -> None:
    """Write the compiled configuration as JSON under the base dataset path.

    The base dataset directory is created first if needed; the file name is
    ``SDG_CONFIG_FILENAME``.
    """
    storage = self.artifact_storage
    storage.mkdir_if_needed(storage.base_dataset_path)

    builder_config = BuilderConfig(data_designer=self._data_designer_config)
    config_path = storage.base_dataset_path / SDG_CONFIG_FILENAME
    builder_config.to_json(config_path)
166+
155167 def _run_batch (
156168 self , generators : list [ColumnGenerator ], * , batch_mode : str , save_partial_results : bool = True , group_id : str
157169 ) -> None :
@@ -303,16 +315,6 @@ def _worker_error_callback(self, exc: Exception, *, context: dict | None = None)
303315 def _worker_result_callback (self , result : dict , * , context : dict | None = None ) -> None :
304316 self .batch_manager .update_record (context ["index" ], result )
305317
306- def _write_configs (self ) -> None :
307- self .artifact_storage .write_configs (
308- json_file_name = "column_configs.json" ,
309- configs = self ._column_configs ,
310- )
311- self .artifact_storage .write_configs (
312- json_file_name = "model_configs.json" ,
313- configs = self ._resource_provider .model_registry .model_configs .values (),
314- )
315-
316318 def _emit_batch_inference_events (
317319 self , batch_mode : str , usage_deltas : dict [str , ModelUsageStats ], group_id : str
318320 ) -> None :
0 commit comments