From b546082ab16037084665b4f6656b2282f28ea8c5 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 5 Feb 2026 15:57:41 -0300 Subject: [PATCH 01/14] refactor: processors use callback-based design with multiple stages Replace single process() method with stage-specific callbacks: - preprocess(): PRE_GENERATION, on full seed data, writes to disk - process_before_batch(): PRE_BATCH, after seed columns, before dependent columns - process_after_batch(): POST_BATCH, after each batch completes - postprocess(): POST_GENERATION, on final combined dataset Key changes: - Remove build_stage config field; stages determined by implemented callbacks - Add implements() method to check if processor overrides a callback - Only run processors that implement each stage - Preprocessed seed data written to disk for memory efficiency - Update docs and tests --- docs/concepts/processors.md | 12 +- .../src/data_designer/config/processors.py | 24 +-- .../tests/config/test_processors.py | 43 +--- .../generators/seed_dataset.py | 6 +- .../dataset_builders/column_wise_builder.py | 147 ++++++++++--- .../engine/processing/processors/base.py | 67 +++++- .../processing/processors/drop_columns.py | 20 +- .../processing/processors/schema_transform.py | 30 ++- .../engine/resources/resource_provider.py | 1 + .../tests/engine/conftest.py | 1 + .../test_column_wise_builder.py | 203 ++++++++++++++++-- .../processors/test_drop_columns.py | 55 ++--- .../processors/test_schema_transform.py | 50 +++-- .../tests/engine/test_validation.py | 3 - .../tests/interface/test_data_designer.py | 7 +- 15 files changed, 487 insertions(+), 182 deletions(-) diff --git a/docs/concepts/processors.md b/docs/concepts/processors.md index 46773ecb..5a2b340a 100644 --- a/docs/concepts/processors.md +++ b/docs/concepts/processors.md @@ -13,7 +13,16 @@ Each processor: - Applies its transformation - Passes the result to the next processor (or to output) -Currently, processors run only at the `POST_BATCH` stage, i.e., after column generation completes for each batch. +Processors can run at four stages, determined by which callback methods they implement: + +| Stage | When it runs | Callback method | Use cases | +|-------|--------------|-----------------|-----------| +| Pre-generation | Once, on full seed data before batching | `preprocess()` | Filter seed data, validate inputs, normalize data | +| Pre-batch | After seed columns, before dependent columns | `process_before_batch()` | Transform seed data before other columns are generated | +| Post-batch | After each batch completes | `process_after_batch()` | Drop columns, transform schema per batch | +| Post-generation | Once, on final dataset after all batches | `postprocess()` | Deduplicate, aggregate statistics, final cleanup | + +A processor can implement any combination of these callbacks. The built-in processors use `process_after_batch()` by default. ## Processor Types @@ -134,7 +143,6 @@ Processors execute in the order they're added. 
Plan accordingly when one process | Parameter | Type | Description | |-----------|------|-------------| | `name` | str | Identifier for the processor, used in output directory names | -| `build_stage` | BuildStage | When to run (default: `POST_BATCH`) | ### DropColumnsProcessorConfig diff --git a/packages/data-designer-config/src/data_designer/config/processors.py b/packages/data-designer-config/src/data_designer/config/processors.py index db7bb9ce..21d94b78 100644 --- a/packages/data-designer-config/src/data_designer/config/processors.py +++ b/packages/data-designer-config/src/data_designer/config/processors.py @@ -12,11 +12,8 @@ from typing_extensions import TypeAlias from data_designer.config.base import ConfigBase -from data_designer.config.dataset_builders import BuildStage from data_designer.config.errors import InvalidConfigError -SUPPORTED_STAGES = [BuildStage.POST_BATCH] - class ProcessorType(str, Enum): """Enumeration of available processor types. @@ -33,33 +30,22 @@ class ProcessorType(str, Enum): class ProcessorConfig(ConfigBase, ABC): """Abstract base class for all processor configuration types. - Processors are transformations that run before or after columns are generated. - They can modify, reshape, or augment the dataset before it's saved. + Processors are transformations that run at different stages of the generation + pipeline. They can modify, reshape, or augment the dataset. + + The processor implementation determines which stages it handles by overriding + the appropriate callback methods (preprocess, process_after_batch, postprocess). Attributes: name: Unique name of the processor, used to identify the processor in results and to name output artifacts on disk. - build_stage: The stage at which the processor runs. Currently only `POST_BATCH` - is supported, meaning processors run after each batch of columns is generated. """ name: str = Field( description="The name of the processor, used to identify the processor in the results and to write the artifacts to disk.", ) - build_stage: BuildStage = Field( - default=BuildStage.POST_BATCH, - description=f"The stage at which the processor will run. Supported stages: {', '.join(SUPPORTED_STAGES)}", - ) processor_type: str - @field_validator("build_stage") - def validate_build_stage(cls, v: BuildStage) -> BuildStage: - if v not in SUPPORTED_STAGES: - raise ValueError( - f"Invalid dataset builder stage: {v}. Only these stages are supported: {', '.join(SUPPORTED_STAGES)}" - ) - return v - def get_processor_config_from_kwargs(processor_type: ProcessorType, **kwargs: Any) -> ProcessorConfig: """Create a processor configuration from a processor type and keyword arguments. 
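For context on the callback-based design introduced above, here is a minimal sketch of a custom processor written against the new `Processor` base class. The `DeduplicateRowsProcessorConfig` and `DeduplicateRowsProcessor` names, the `"deduplicate_rows"` type string, and the `subset` field are illustrative only and are not part of this patch; wiring a new config into `ProcessorType` and the processor registry is not shown.

```python
# Illustrative sketch only: the DeduplicateRows* names are hypothetical, not in this patch.
from __future__ import annotations

import pandas as pd

from data_designer.config.processors import ProcessorConfig
from data_designer.engine.processing.processors.base import Processor


class DeduplicateRowsProcessorConfig(ProcessorConfig):
    processor_type: str = "deduplicate_rows"  # hypothetical processor type
    subset: list[str] | None = None  # columns used to detect duplicate rows


class DeduplicateRowsProcessor(Processor[DeduplicateRowsProcessorConfig]):
    """Removes duplicate rows once the full dataset has been generated."""

    def postprocess(self, data: pd.DataFrame) -> pd.DataFrame:
        # Only this callback is overridden, so implements("postprocess") is True
        # and the builder runs this processor at the POST_GENERATION stage only.
        return data.drop_duplicates(subset=self.config.subset).reset_index(drop=True)
```

Because none of the other callbacks are overridden, the builder's `implements()` checks skip this processor at the pre-generation, pre-batch, and post-batch stages.
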
diff --git a/packages/data-designer-config/tests/config/test_processors.py b/packages/data-designer-config/tests/config/test_processors.py index b18814e6..e688be15 100644 --- a/packages/data-designer-config/tests/config/test_processors.py +++ b/packages/data-designer-config/tests/config/test_processors.py @@ -4,7 +4,6 @@ import pytest from pydantic import ValidationError -from data_designer.config.dataset_builders import BuildStage from data_designer.config.errors import InvalidConfigError from data_designer.config.processors import ( DropColumnsProcessorConfig, @@ -16,92 +15,64 @@ def test_drop_columns_processor_config_creation(): - config = DropColumnsProcessorConfig( - name="drop_columns_processor", build_stage=BuildStage.POST_BATCH, column_names=["col1", "col2"] - ) + config = DropColumnsProcessorConfig(name="drop_columns_processor", column_names=["col1", "col2"]) - assert config.build_stage == BuildStage.POST_BATCH assert config.column_names == ["col1", "col2"] assert config.processor_type == ProcessorType.DROP_COLUMNS assert isinstance(config, ProcessorConfig) def test_drop_columns_processor_config_validation(): - # Test unsupported stage raises error - with pytest.raises(ValidationError, match="Invalid dataset builder stage"): - DropColumnsProcessorConfig( - name="drop_columns_processor", build_stage=BuildStage.PRE_BATCH, column_names=["col1"] - ) - # Test missing required field raises error with pytest.raises(ValidationError, match="Field required"): - DropColumnsProcessorConfig(name="drop_columns_processor", build_stage=BuildStage.POST_BATCH) + DropColumnsProcessorConfig(name="drop_columns_processor") def test_drop_columns_processor_config_serialization(): - config = DropColumnsProcessorConfig( - name="drop_columns_processor", build_stage=BuildStage.POST_BATCH, column_names=["col1", "col2"] - ) + config = DropColumnsProcessorConfig(name="drop_columns_processor", column_names=["col1", "col2"]) # Serialize to dict config_dict = config.model_dump() - assert config_dict["build_stage"] == "post_batch" assert config_dict["column_names"] == ["col1", "col2"] # Deserialize from dict config_restored = DropColumnsProcessorConfig.model_validate(config_dict) - assert config_restored.build_stage == config.build_stage assert config_restored.column_names == config.column_names def test_schema_transform_processor_config_creation(): config = SchemaTransformProcessorConfig( name="output_format_processor", - build_stage=BuildStage.POST_BATCH, template={"text": "{{ col1 }}"}, ) - assert config.build_stage == BuildStage.POST_BATCH assert config.template == {"text": "{{ col1 }}"} assert config.processor_type == ProcessorType.SCHEMA_TRANSFORM assert isinstance(config, ProcessorConfig) def test_schema_transform_processor_config_validation(): - # Test unsupported stage raises error - with pytest.raises(ValidationError, match="Invalid dataset builder stage"): - SchemaTransformProcessorConfig( - name="schema_transform_processor", - build_stage=BuildStage.PRE_BATCH, - template={"text": "{{ col1 }}"}, - ) - # Test missing required field raises error with pytest.raises(ValidationError, match="Field required"): - SchemaTransformProcessorConfig(name="schema_transform_processor", build_stage=BuildStage.POST_BATCH) + SchemaTransformProcessorConfig(name="schema_transform_processor") # Test invalid template raises error with pytest.raises(InvalidConfigError, match="Template must be JSON serializable"): - SchemaTransformProcessorConfig( - name="schema_transform_processor", build_stage=BuildStage.POST_BATCH, 
template={"text": {1, 2, 3}} - ) + SchemaTransformProcessorConfig(name="schema_transform_processor", template={"text": {1, 2, 3}}) def test_schema_transform_processor_config_serialization(): config = SchemaTransformProcessorConfig( name="schema_transform_processor", - build_stage=BuildStage.POST_BATCH, template={"text": "{{ col1 }}"}, ) # Serialize to dict config_dict = config.model_dump() - assert config_dict["build_stage"] == "post_batch" assert config_dict["template"] == {"text": "{{ col1 }}"} # Deserialize from dict config_restored = SchemaTransformProcessorConfig.model_validate(config_dict) - assert config_restored.build_stage == config.build_stage assert config_restored.template == config.template @@ -110,7 +81,6 @@ def test_get_processor_config_from_kwargs(): config_drop_columns = get_processor_config_from_kwargs( ProcessorType.DROP_COLUMNS, name="drop_columns_processor", - build_stage=BuildStage.POST_BATCH, column_names=["col1"], ) assert isinstance(config_drop_columns, DropColumnsProcessorConfig) @@ -120,7 +90,6 @@ def test_get_processor_config_from_kwargs(): config_schema_transform = get_processor_config_from_kwargs( ProcessorType.SCHEMA_TRANSFORM, name="output_format_processor", - build_stage=BuildStage.POST_BATCH, template={"text": "{{ col1 }}"}, ) assert isinstance(config_schema_transform, SchemaTransformProcessorConfig) @@ -134,6 +103,6 @@ class UnknownProcessorType(str, Enum): UNKNOWN = "unknown" result = get_processor_config_from_kwargs( - UnknownProcessorType.UNKNOWN, name="unknown_processor", build_stage=BuildStage.POST_BATCH, column_names=["col1"] + UnknownProcessorType.UNKNOWN, name="unknown_processor", column_names=["col1"] ) assert result is None diff --git a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/seed_dataset.py b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/seed_dataset.py index 22c8b1ec..94b206a1 100644 --- a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/seed_dataset.py +++ b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/seed_dataset.py @@ -53,7 +53,11 @@ def _initialize(self) -> None: self._num_records_sampled = 0 self._batch_reader = None self._df_remaining = None - self._dataset_uri = self.resource_provider.seed_reader.get_dataset_uri() + # Use preprocessed seed if available, otherwise use original + if self.resource_provider.preprocessed_seed_uri is not None: + self._dataset_uri = self.resource_provider.preprocessed_seed_uri + else: + self._dataset_uri = self.resource_provider.seed_reader.get_dataset_uri() self._seed_dataset_size = self.duckdb_conn.execute(f"SELECT COUNT(*) FROM '{self._dataset_uri}'").fetchone()[0] self._index_range = self._resolve_index_range() diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py index 781b0673..19b58457 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py @@ -5,6 +5,7 @@ import functools import logging +import shutil import time import uuid from pathlib import Path @@ -14,7 +15,6 @@ from data_designer.config.column_types import ColumnConfigT from data_designer.config.config_builder import BuilderConfig from data_designer.config.data_designer_config import 
DataDesignerConfig -from data_designer.config.dataset_builders import BuildStage from data_designer.config.processors import ( DropColumnsProcessorConfig, ProcessorConfig, @@ -28,7 +28,7 @@ ) from data_designer.engine.column_generators.utils.generator_classification import column_type_is_model_generated from data_designer.engine.compiler import compile_data_designer_config -from data_designer.engine.dataset_builders.artifact_storage import SDG_CONFIG_FILENAME, ArtifactStorage +from data_designer.engine.dataset_builders.artifact_storage import SDG_CONFIG_FILENAME, ArtifactStorage, BatchStage from data_designer.engine.dataset_builders.errors import DatasetGenerationError, DatasetProcessingError from data_designer.engine.dataset_builders.multi_column_configs import MultiColumnConfig from data_designer.engine.dataset_builders.utils.concurrency import ConcurrentThreadExecutor @@ -67,9 +67,7 @@ def __init__( self._data_designer_config = compile_data_designer_config(data_designer_config, resource_provider) self._column_configs = compile_dataset_builder_column_configs(self._data_designer_config) - self._processors: dict[BuildStage, list[Processor]] = self._initialize_processors( - self._data_designer_config.processors or [] - ) + self._processors: list[Processor] = self._initialize_processors(self._data_designer_config.processors or []) self._validate_column_configs() @property @@ -98,6 +96,7 @@ def build( ) -> Path: self._run_model_health_check_if_needed() self._run_mcp_tool_check_if_needed() + self._run_pre_generation_processors() self._write_builder_config() generators = self._initialize_generators() start_time = time.perf_counter() @@ -108,14 +107,12 @@ def build( for batch_idx in range(self.batch_manager.num_batches): logger.info(f"âŗ Processing batch {batch_idx + 1} of {self.batch_manager.num_batches}") self._run_batch(generators, batch_mode="batch", group_id=group_id) - df_batch = self._run_processors( - stage=BuildStage.POST_BATCH, - dataframe=self.batch_manager.get_current_batch(as_dataframe=True), - current_batch_number=batch_idx, - ) + df_batch = self.batch_manager.get_current_batch(as_dataframe=True) + df_batch = self._run_post_batch_processors(df_batch, current_batch_number=batch_idx) self._write_processed_batch(df_batch) self.batch_manager.finish_batch(on_batch_complete) self.batch_manager.finish() + self._run_post_generation_processors() self._resource_provider.model_registry.log_model_usage(time.perf_counter() - start_time) @@ -124,6 +121,7 @@ def build( def build_preview(self, *, num_records: int) -> pd.DataFrame: self._run_model_health_check_if_needed() self._run_mcp_tool_check_if_needed() + self._run_pre_generation_processors() generators = self._initialize_generators() group_id = uuid.uuid4().hex @@ -133,16 +131,16 @@ def build_preview(self, *, num_records: int) -> pd.DataFrame: dataset = self.batch_manager.get_current_batch(as_dataframe=True) self.batch_manager.reset() + # Reset preprocessed_seed_uri to avoid affecting subsequent build() calls + self._resource_provider.preprocessed_seed_uri = None + self._resource_provider.model_registry.log_model_usage(time.perf_counter() - start_time) return dataset def process_preview(self, dataset: pd.DataFrame) -> pd.DataFrame: - return self._run_processors( - stage=BuildStage.POST_BATCH, - dataframe=dataset.copy(), - current_batch_number=None, # preview mode does not have a batch number - ) + df = self._run_post_batch_processors(dataset.copy(), current_batch_number=None) + return self._run_post_generation_processors_on_df(df) def 
_initialize_generators(self) -> list[ColumnGenerator]: return [ @@ -162,12 +160,17 @@ def _run_batch( self, generators: list[ColumnGenerator], *, batch_mode: str, save_partial_results: bool = True, group_id: str ) -> None: pre_batch_snapshot = self._resource_provider.model_registry.get_model_usage_snapshot() + ran_pre_batch = False for generator in generators: generator.log_pre_generation() try: generation_strategy = generator.get_generation_strategy() if generator.can_generate_from_scratch and self.batch_manager.buffer_is_empty: self._run_from_scratch_column_generator(generator) + # Run PRE_BATCH after seed generator, before other columns + if not ran_pre_batch: + self._apply_pre_batch_processors() + ran_pre_batch = True elif generation_strategy == GenerationStrategy.CELL_BY_CELL: self._run_cell_by_cell_generator(generator) elif generation_strategy == GenerationStrategy.FULL_COLUMN: @@ -288,20 +291,20 @@ def _validate_column_configs(self) -> None: ).can_generate_from_scratch: raise DatasetGenerationError("🛑 The first column config must be a from-scratch column generator.") - def _initialize_processors(self, processor_configs: list[ProcessorConfig]) -> dict[BuildStage, list[Processor]]: + def _initialize_processors(self, processor_configs: list[ProcessorConfig]) -> list[Processor]: # Check columns marked for drop columns_to_drop = [config.name for config in self.single_column_configs if config.drop] - processors: dict[BuildStage, list[Processor]] = {stage: [] for stage in BuildStage} + processors: list[Processor] = [] for config in processor_configs: - processors[config.build_stage].append( + processors.append( self._registry.processors.get_for_config_type(type(config))( config=config, resource_provider=self._resource_provider, ) ) - # Manually included "drop columns" processor takes precedence (can e.g., pick stages other than post-batch) + # Manually included "drop columns" processor takes precedence if config.processor_type == ProcessorType.DROP_COLUMNS: for column in config.column_names: if column in columns_to_drop: @@ -309,12 +312,11 @@ def _initialize_processors(self, processor_configs: list[ProcessorConfig]) -> di # If there are still columns marked for drop, add the "drop columns" processor to drop them if len(columns_to_drop) > 0: - processors[BuildStage.POST_BATCH].append( # as post-batch by default + processors.append( DropColumnsProcessor( config=DropColumnsProcessorConfig( name="default_drop_columns_processor", column_names=columns_to_drop, - build_stage=BuildStage.POST_BATCH, ), resource_provider=self._resource_provider, ) @@ -322,18 +324,109 @@ def _initialize_processors(self, processor_configs: list[ProcessorConfig]) -> di return processors - def _run_processors( - self, stage: BuildStage, dataframe: pd.DataFrame, current_batch_number: int | None = None - ) -> pd.DataFrame: - for processor in self._processors[stage]: + def _run_pre_generation_processors(self) -> None: + """Run preprocess() on processors that implement it.""" + processors = [p for p in self._processors if p.implements("preprocess")] + if not processors: + return + if self._resource_provider.seed_reader is None: + return + + logger.info("âŗ Running preprocess on seed data...") + df = self._load_seed_dataframe() + original_len = len(df) + + df = self._run_preprocess_on_df(df, processors) + + self._save_preprocessed_seed(df) + logger.info(f"✅ Preprocess complete. 
Seed data has {len(df)} rows (was {original_len}).") + + def _load_seed_dataframe(self) -> pd.DataFrame: + """Load full seed dataset as DataFrame.""" + seed_reader = self._resource_provider.seed_reader + conn = seed_reader.create_duckdb_connection() + try: + return conn.execute(f"SELECT * FROM '{seed_reader.get_dataset_uri()}'").fetchdf() + finally: + conn.close() + + def _run_preprocess_on_df(self, df: pd.DataFrame, processors: list[Processor]) -> pd.DataFrame: + """Run preprocess() on given processors.""" + for processor in processors: + try: + df = processor.preprocess(df) + except Exception as e: + raise DatasetProcessingError(f"🛑 Failed in preprocess for processor {processor.name}: {e}") from e + return df + + def _save_preprocessed_seed(self, df: pd.DataFrame) -> None: + """Write preprocessed seed to disk and update URI.""" + preprocessed_path = self.artifact_storage.base_dataset_path / "preprocessed_seed.parquet" + self.artifact_storage.mkdir_if_needed(self.artifact_storage.base_dataset_path) + df.to_parquet(preprocessed_path, index=False) + self._resource_provider.preprocessed_seed_uri = str(preprocessed_path) + + def _apply_pre_batch_processors(self) -> None: + """Get batch, run PRE_BATCH processors, update batch manager.""" + processors = [p for p in self._processors if p.implements("process_before_batch")] + if not processors: + return + + df = self.batch_manager.get_current_batch(as_dataframe=True) + for processor in processors: try: - dataframe = processor.process(dataframe, current_batch_number=current_batch_number) + df = processor.process_before_batch(df) except Exception as e: raise DatasetProcessingError( - f"🛑 Failed to process dataset with processor {processor.name} in stage {stage}: {e}" + f"🛑 Failed in process_before_batch for processor {processor.name}: {e}" + ) from e + self.batch_manager.update_records(df.to_dict(orient="records")) + + def _run_post_batch_processors(self, dataframe: pd.DataFrame, current_batch_number: int | None) -> pd.DataFrame: + """Run process_after_batch() on processors that implement it.""" + for processor in self._processors: + if not processor.implements("process_after_batch"): + continue + try: + dataframe = processor.process_after_batch(dataframe, current_batch_number=current_batch_number) + except Exception as e: + raise DatasetProcessingError( + f"🛑 Failed in process_after_batch for processor {processor.name}: {e}" ) from e return dataframe + def _run_post_generation_processors_on_df(self, df: pd.DataFrame) -> pd.DataFrame: + """Run postprocess() on processors that implement it.""" + for processor in self._processors: + if not processor.implements("postprocess"): + continue + try: + df = processor.postprocess(df) + except Exception as e: + raise DatasetProcessingError(f"🛑 Failed in postprocess for processor {processor.name}: {e}") from e + return df + + def _run_post_generation_processors(self) -> None: + """Run postprocess() on processors that implement it.""" + processors = [p for p in self._processors if p.implements("postprocess")] + if not processors: + return + + logger.info("âŗ Running postprocess on final dataset...") + original_df = self.artifact_storage.load_dataset() + + df = self._run_post_generation_processors_on_df(original_df) + + # Always rewrite since processors may modify values + if self.artifact_storage.final_dataset_path.exists(): + shutil.rmtree(self.artifact_storage.final_dataset_path) + self.artifact_storage.write_batch_to_parquet_file( + batch_number=0, + dataframe=df, + batch_stage=BatchStage.FINAL_RESULT, + ) 
+ logger.info(f"✅ Postprocess complete. Final dataset has {len(df)} rows.") + def _worker_error_callback(self, exc: Exception, *, context: dict | None = None) -> None: """If a worker fails, we can handle the exception here.""" logger.warning( diff --git a/packages/data-designer-engine/src/data_designer/engine/processing/processors/base.py b/packages/data-designer-engine/src/data_designer/engine/processing/processors/base.py index 8dd47132..b3fe7a1f 100644 --- a/packages/data-designer-engine/src/data_designer/engine/processing/processors/base.py +++ b/packages/data-designer-engine/src/data_designer/engine/processing/processors/base.py @@ -3,11 +3,72 @@ from __future__ import annotations -from abc import ABC, abstractmethod +from abc import ABC from data_designer.engine.configurable_task import ConfigurableTask, DataT, TaskConfigT class Processor(ConfigurableTask[TaskConfigT], ABC): - @abstractmethod - def process(self, data: DataT, *, current_batch_number: int | None = None) -> DataT: ... + """Base class for dataset processors. + + Processors transform data at different stages of the generation pipeline. + Override the callback methods for the stages you want to handle. + """ + + def implements(self, method_name: str) -> bool: + """Check if subclass overrides a callback method.""" + return getattr(type(self), method_name) is not getattr(Processor, method_name) + + def preprocess(self, data: DataT) -> DataT: + """Called at PRE_GENERATION stage on seed data before batching. + + Override to filter or transform seed data before generation begins. + + Args: + data: The full seed dataset. + + Returns: + Transformed seed dataset. + """ + return data + + def process_before_batch(self, data: DataT) -> DataT: + """Called at PRE_BATCH stage before each batch is generated. + + Override to transform batch data before generation. Unlike preprocess, + this operates on in-memory batch data without disk I/O. + + Args: + data: The batch data before generation. + + Returns: + Transformed batch data. + """ + return data + + def process_after_batch(self, data: DataT, *, current_batch_number: int | None) -> DataT: + """Called at POST_BATCH stage after each batch is generated. + + Override to process each batch of generated data. + + Args: + data: The generated batch data. + current_batch_number: The current batch number (0-indexed), or None in preview mode. + + Returns: + Transformed batch data. + """ + return data + + def postprocess(self, data: DataT) -> DataT: + """Called at POST_GENERATION stage on the final combined dataset. + + Override to transform the complete generated dataset. + + Args: + data: The final combined dataset. + + Returns: + Transformed final dataset. 
+ """ + return data diff --git a/packages/data-designer-engine/src/data_designer/engine/processing/processors/drop_columns.py b/packages/data-designer-engine/src/data_designer/engine/processing/processors/drop_columns.py index 98369a6b..bb26af2a 100644 --- a/packages/data-designer-engine/src/data_designer/engine/processing/processors/drop_columns.py +++ b/packages/data-designer-engine/src/data_designer/engine/processing/processors/drop_columns.py @@ -18,10 +18,15 @@ class DropColumnsProcessor(Processor[DropColumnsProcessorConfig]): - def process(self, data: pd.DataFrame, *, current_batch_number: int | None = None) -> pd.DataFrame: + """Drops specified columns from the dataset after each batch.""" + + def process_after_batch(self, data: pd.DataFrame, *, current_batch_number: int | None) -> pd.DataFrame: logger.info(f"🙈 Dropping columns: {self.config.column_names}") - if current_batch_number is not None: # not in preview mode - self._save_dropped_columns_if_needed(data, current_batch_number) + if current_batch_number is not None: + self._save_dropped_columns(data, current_batch_number) + return self._drop_columns(data) + + def _drop_columns(self, data: pd.DataFrame) -> pd.DataFrame: for column in self.config.column_names: if column in data.columns: data.drop(columns=[column], inplace=True) @@ -29,7 +34,12 @@ def process(self, data: pd.DataFrame, *, current_batch_number: int | None = None logger.warning(f"âš ī¸ Cannot drop column: `{column}` not found in the dataset.") return data - def _save_dropped_columns_if_needed(self, data: pd.DataFrame, current_batch_number: int) -> None: + def _save_dropped_columns(self, data: pd.DataFrame, current_batch_number: int) -> None: + # Only save columns that actually exist + existing_columns = [col for col in self.config.column_names if col in data.columns] + if not existing_columns: + return + logger.debug("đŸ“Ļ Saving dropped columns to dropped-columns directory") dropped_column_parquet_file_name = self.artifact_storage.create_batch_file_path( batch_number=current_batch_number, @@ -37,6 +47,6 @@ def _save_dropped_columns_if_needed(self, data: pd.DataFrame, current_batch_numb ).name self.artifact_storage.write_parquet_file( parquet_file_name=dropped_column_parquet_file_name, - dataframe=data[self.config.column_names], + dataframe=data[existing_columns], batch_stage=BatchStage.DROPPED_COLUMNS, ) diff --git a/packages/data-designer-engine/src/data_designer/engine/processing/processors/schema_transform.py b/packages/data-designer-engine/src/data_designer/engine/processing/processors/schema_transform.py index b84339e6..e71267c1 100644 --- a/packages/data-designer-engine/src/data_designer/engine/processing/processors/schema_transform.py +++ b/packages/data-designer-engine/src/data_designer/engine/processing/processors/schema_transform.py @@ -41,19 +41,14 @@ def escape_for_json_string(s: str) -> str: class SchemaTransformProcessor(WithJinja2UserTemplateRendering, Processor[SchemaTransformProcessorConfig]): + """Transforms dataset schema using Jinja2 templates after each batch.""" + @property def template_as_str(self) -> str: return json.dumps(self.config.template) - def process(self, data: pd.DataFrame, *, current_batch_number: int | None = None) -> pd.DataFrame: - self.prepare_jinja2_template_renderer(self.template_as_str, data.columns.to_list()) - formatted_records = [] - for record in data.to_dict(orient="records"): - deserialized = deserialize_json_values(record) - escaped = _json_escape_record(deserialized) - rendered = self.render_template(escaped) - 
formatted_records.append(json.loads(rendered)) - formatted_data = pd.DataFrame(formatted_records) + def process_after_batch(self, data: pd.DataFrame, *, current_batch_number: int | None) -> pd.DataFrame: + formatted_data = self._transform(data) if current_batch_number is not None: self.artifact_storage.write_batch_to_parquet_file( batch_number=current_batch_number, @@ -61,11 +56,14 @@ def process(self, data: pd.DataFrame, *, current_batch_number: int | None = None batch_stage=BatchStage.PROCESSORS_OUTPUTS, subfolder=self.config.name, ) - else: - self.artifact_storage.write_parquet_file( - parquet_file_name=f"{self.config.name}.parquet", - dataframe=formatted_data, - batch_stage=BatchStage.PROCESSORS_OUTPUTS, - ) - return data + + def _transform(self, data: pd.DataFrame) -> pd.DataFrame: + self.prepare_jinja2_template_renderer(self.template_as_str, data.columns.to_list()) + formatted_records = [] + for record in data.to_dict(orient="records"): + deserialized = deserialize_json_values(record) + escaped = _json_escape_record(deserialized) + rendered = self.render_template(escaped) + formatted_records.append(json.loads(rendered)) + return pd.DataFrame(formatted_records) diff --git a/packages/data-designer-engine/src/data_designer/engine/resources/resource_provider.py b/packages/data-designer-engine/src/data_designer/engine/resources/resource_provider.py index 1fcb8e02..3d756a04 100644 --- a/packages/data-designer-engine/src/data_designer/engine/resources/resource_provider.py +++ b/packages/data-designer-engine/src/data_designer/engine/resources/resource_provider.py @@ -37,6 +37,7 @@ class ResourceProvider(ConfigBase): mcp_registry: MCPRegistry | None = None run_config: RunConfig = RunConfig() seed_reader: SeedReader | None = None + preprocessed_seed_uri: str | None = None def get_dataset_metadata(self) -> DatasetMetadata: """Get metadata about the dataset being generated. 
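The stage dispatch in `column_wise_builder.py` above hinges on `Processor.implements()`, which reports whether a subclass overrides a given callback. A small sketch of that check follows, using `DropColumnsProcessor`, which after this patch overrides only `process_after_batch()`. Constructing the processor with a `Mock` resource provider is an assumption made for the sketch (mirroring the builder's `config=` / `resource_provider=` keyword arguments), not code from the patch.

```python
# Sketch of the implements() dispatch check; the Mock resource provider is an assumption.
from unittest.mock import Mock

from data_designer.config.processors import DropColumnsProcessorConfig
from data_designer.engine.processing.processors.drop_columns import DropColumnsProcessor

processor = DropColumnsProcessor(
    config=DropColumnsProcessorConfig(name="drop_example", column_names=["col1"]),
    resource_provider=Mock(),
)

# implements() compares the attribute resolved on the subclass with Processor's own,
# so only the overridden callback reports True and the builder skips the other stages.
assert processor.implements("process_after_batch") is True
assert processor.implements("preprocess") is False
assert processor.implements("process_before_batch") is False
assert processor.implements("postprocess") is False
```

This is the same check `_run_pre_generation_processors()`, `_apply_pre_batch_processors()`, `_run_post_batch_processors()`, and `_run_post_generation_processors()` use to decide which processors participate at each stage.
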
diff --git a/packages/data-designer-engine/tests/engine/conftest.py b/packages/data-designer-engine/tests/engine/conftest.py index b04580b9..33d74a3f 100644 --- a/packages/data-designer-engine/tests/engine/conftest.py +++ b/packages/data-designer-engine/tests/engine/conftest.py @@ -45,6 +45,7 @@ def stub_resource_provider(tmp_path, stub_model_facade): mock_provider.seed_reader = Mock() mock_provider.seed_reader.get_column_names.return_value = [] mock_provider.run_config = RunConfig() + mock_provider.preprocessed_seed_uri = None return mock_provider diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py index a2d6174a..582b08bf 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py @@ -10,7 +10,6 @@ from data_designer.config.column_configs import LLMTextColumnConfig, SamplerColumnConfig from data_designer.config.config_builder import DataDesignerConfigBuilder -from data_designer.config.dataset_builders import BuildStage from data_designer.config.processors import DropColumnsProcessorConfig from data_designer.config.run_config import RunConfig from data_designer.config.sampler_params import SamplerType, UUIDSamplerParams @@ -37,11 +36,7 @@ def stub_test_column_configs(): @pytest.fixture def stub_test_processor_configs(): - return [ - DropColumnsProcessorConfig( - name="drop_columns_processor", build_stage=BuildStage.POST_BATCH, column_names=["column_to_drop"] - ) - ] + return [DropColumnsProcessorConfig(name="drop_columns_processor", column_names=["column_to_drop"])] @pytest.fixture @@ -52,7 +47,6 @@ def stub_test_config_builder(stub_test_column_configs, stub_model_configs): config_builder.add_processor( processor_type="drop_columns", name="drop_columns_processor", - build_stage=BuildStage.POST_BATCH, column_names=["column_to_drop"], ) return config_builder @@ -169,6 +163,7 @@ def test_column_wise_dataset_builder_build_method_basic_flow( stub_resource_provider, ): stub_resource_provider.run_config = RunConfig(buffer_size=50) + stub_resource_provider.seed_reader = None # No seed data for this basic flow test stub_resource_provider.model_registry.run_health_check = Mock() stub_resource_provider.model_registry.get_model_usage_stats = Mock(return_value={"test": "stats"}) stub_resource_provider.model_registry.models = {} @@ -183,6 +178,7 @@ def test_column_wise_dataset_builder_build_method_basic_flow( stub_batch_manager.iter_current_batch.return_value = [(0, {"test": "data"})] stub_column_wise_builder.batch_manager = stub_batch_manager + stub_column_wise_builder._processors = [] # No processors for basic flow test result_path = stub_column_wise_builder.build(num_records=100) @@ -233,12 +229,9 @@ def test_column_wise_dataset_builder_validate_column_configs( def test_column_wise_dataset_builder_initialize_processors(stub_column_wise_builder): processors = stub_column_wise_builder._processors - assert processors.keys() == set(BuildStage) - assert len(processors[BuildStage.PRE_BATCH]) == 0 - assert len(processors[BuildStage.POST_BATCH]) == 1 - assert len(processors[BuildStage.PRE_GENERATION]) == 0 - assert len(processors[BuildStage.POST_GENERATION]) == 0 - assert processors[BuildStage.POST_BATCH][0].config.column_names == ["column_to_drop"] + assert isinstance(processors, list) + assert len(processors) == 1 + assert 
processors[0].config.column_names == ["column_to_drop"] def test_run_config_default_non_inference_max_parallel_workers() -> None: @@ -390,3 +383,187 @@ def test_fan_out_with_threads_uses_early_shutdown_settings_from_resource_provide assert call_kwargs["shutdown_error_rate"] == expected_rate assert call_kwargs["shutdown_error_window"] == shutdown_error_window assert call_kwargs["disable_early_shutdown"] == disable_early_shutdown + + +def test_run_pre_generation_processors_filters_seed_data(stub_resource_provider, stub_model_configs, tmp_path): + """Test that PRE_GENERATION processors are applied to seed data before generation.""" + from pathlib import Path + + from data_designer.config.seed_source import DataFrameSeedSource, LocalFileSeedSource + from data_designer.engine.processing.processors.base import Processor + from data_designer.engine.resources.seed_reader import DataFrameSeedReader + + # Set up seed reader with test data + seed_df = pd.DataFrame({"seed_id": [1, 2, 3, 4, 5], "value": ["a", "b", "c", "d", "e"]}) + seed_source = DataFrameSeedSource(df=seed_df) + seed_reader = DataFrameSeedReader() + seed_reader.attach(seed_source, Mock()) + stub_resource_provider.seed_reader = seed_reader + + # Create a mock processor that filters rows during preprocess + mock_processor = Mock(spec=Processor) + mock_processor.name = "filter_processor" + mock_processor.implements.side_effect = lambda m: m == "preprocess" + mock_processor.preprocess.side_effect = lambda df: df[df["seed_id"] > 2].reset_index(drop=True) + + # Write seed file to tmp_path + seed_path = tmp_path / "seed.parquet" + seed_df.to_parquet(seed_path, index=False) + + config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) + config_builder.with_seed_dataset(LocalFileSeedSource(path=str(seed_path))) + config_builder.add_column(SamplerColumnConfig(name="uuid", sampler_type="uuid", params=UUIDSamplerParams())) + + builder = ColumnWiseDatasetBuilder( + data_designer_config=config_builder.build(), + resource_provider=stub_resource_provider, + ) + builder._processors = [mock_processor] + + builder._run_pre_generation_processors() + + # Verify preprocess was called + mock_processor.preprocess.assert_called_once() + + # Verify preprocessed_seed_uri was set and points to a valid file + assert stub_resource_provider.preprocessed_seed_uri is not None + preprocessed_path = Path(stub_resource_provider.preprocessed_seed_uri) + assert preprocessed_path.exists() + + # Verify the preprocessed file contains filtered data (3 rows with seed_id > 2) + preprocessed_df = pd.read_parquet(preprocessed_path) + assert len(preprocessed_df) == 3 + assert list(preprocessed_df["seed_id"]) == [3, 4, 5] + + +def test_run_post_generation_processors_modifies_final_dataset(stub_resource_provider, stub_model_configs): + """Test that postprocess callbacks are applied to the final dataset.""" + from data_designer.engine.processing.processors.base import Processor + + # Create test parquet files + final_df = pd.DataFrame({"id": [1, 2, 3, 4, 5], "value": ["a", "b", "c", "d", "e"]}) + stub_resource_provider.artifact_storage.mkdir_if_needed(stub_resource_provider.artifact_storage.final_dataset_path) + final_df.to_parquet(stub_resource_provider.artifact_storage.final_dataset_path / "batch_00000.parquet", index=False) + + # Create a mock processor that filters rows during postprocess + mock_processor = Mock(spec=Processor) + mock_processor.name = "dedup_processor" + mock_processor.implements.side_effect = lambda m: m == "postprocess" + 
mock_processor.postprocess.return_value = final_df[final_df["id"] > 2] + + config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) + config_builder.add_column(SamplerColumnConfig(name="id", sampler_type="uuid", params=UUIDSamplerParams())) + + builder = ColumnWiseDatasetBuilder( + data_designer_config=config_builder.build(), + resource_provider=stub_resource_provider, + ) + builder._processors = [mock_processor] + + builder._run_post_generation_processors() + + # Verify postprocess was called + mock_processor.postprocess.assert_called_once() + + # Verify final dataset was rewritten with fewer rows + result_df = stub_resource_provider.artifact_storage.load_dataset() + assert len(result_df) == 3 + + +def test_run_pre_generation_processors_skips_when_no_seed_reader(stub_resource_provider, stub_model_configs): + """Test that preprocess is skipped when no seed reader is configured.""" + from data_designer.engine.processing.processors.base import Processor + + stub_resource_provider.seed_reader = None + + mock_processor = Mock(spec=Processor) + mock_processor.name = "filter_processor" + mock_processor.implements.side_effect = lambda m: m == "preprocess" + + config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) + config_builder.add_column(SamplerColumnConfig(name="id", sampler_type="uuid", params=UUIDSamplerParams())) + + builder = ColumnWiseDatasetBuilder( + data_designer_config=config_builder.build(), + resource_provider=stub_resource_provider, + ) + builder._processors = [mock_processor] + + builder._run_pre_generation_processors() + + # Preprocess should not be called when no seed reader + mock_processor.preprocess.assert_not_called() + + +def test_build_preview_runs_pre_generation_processors(stub_resource_provider, stub_model_configs, tmp_path): + """Test that build_preview runs PRE_GENERATION processors.""" + from data_designer.config.seed_source import DataFrameSeedSource, LocalFileSeedSource + from data_designer.engine.resources.seed_reader import DataFrameSeedReader + + # Set up seed reader with test data + seed_df = pd.DataFrame({"seed_id": [1, 2, 3, 4, 5], "text": ["a", "b", "c", "d", "e"]}) + seed_source = DataFrameSeedSource(df=seed_df) + seed_reader = DataFrameSeedReader() + seed_reader.attach(seed_source, Mock()) + stub_resource_provider.seed_reader = seed_reader + + # Write seed file to tmp_path + seed_path = tmp_path / "seed.parquet" + seed_df.to_parquet(seed_path, index=False) + + config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) + config_builder.with_seed_dataset(LocalFileSeedSource(path=str(seed_path))) + config_builder.add_column(SamplerColumnConfig(name="uuid", sampler_type="uuid", params=UUIDSamplerParams())) + + builder = ColumnWiseDatasetBuilder( + data_designer_config=config_builder.build(), + resource_provider=stub_resource_provider, + ) + + # Mock everything to isolate the test + builder._run_model_health_check_if_needed = Mock() + builder._run_mcp_tool_check_if_needed = Mock() + builder._run_pre_generation_processors = Mock() + builder._initialize_generators = Mock(return_value=[]) + builder.batch_manager.start = Mock() + builder._run_batch = Mock() + builder.batch_manager.get_current_batch = Mock(return_value=pd.DataFrame()) + builder.batch_manager.reset = Mock() + builder._resource_provider.model_registry.get_model_usage_stats = Mock(return_value={}) + + builder.build_preview(num_records=5) + + builder._run_pre_generation_processors.assert_called_once() + + +def 
test_process_preview_runs_both_callbacks(stub_resource_provider, stub_model_configs): + """Test that process_preview runs process_after_batch and postprocess callbacks.""" + from data_designer.engine.processing.processors.base import Processor + + config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) + config_builder.add_column(SamplerColumnConfig(name="id", sampler_type="uuid", params=UUIDSamplerParams())) + + builder = ColumnWiseDatasetBuilder( + data_designer_config=config_builder.build(), + resource_provider=stub_resource_provider, + ) + + # Create a mock processor with both callbacks + mock_processor = Mock(spec=Processor) + mock_processor.name = "test_processor" + mock_processor.implements.side_effect = lambda m: m in ("process_after_batch", "postprocess") + mock_processor.process_after_batch.side_effect = lambda df, **kwargs: df.assign(post_batch_applied=True) + mock_processor.postprocess.side_effect = lambda df: df.assign(post_gen_applied=True) + + builder._processors = [mock_processor] + + input_df = pd.DataFrame({"id": [1, 2, 3]}) + result = builder.process_preview(input_df) + + # Both callbacks should have been called + mock_processor.process_after_batch.assert_called_once() + mock_processor.postprocess.assert_called_once() + + # Result should have both columns added + assert "post_batch_applied" in result.columns + assert "post_gen_applied" in result.columns diff --git a/packages/data-designer-engine/tests/engine/processing/processors/test_drop_columns.py b/packages/data-designer-engine/tests/engine/processing/processors/test_drop_columns.py index 53da3e4a..97662e98 100644 --- a/packages/data-designer-engine/tests/engine/processing/processors/test_drop_columns.py +++ b/packages/data-designer-engine/tests/engine/processing/processors/test_drop_columns.py @@ -8,7 +8,6 @@ import pytest -from data_designer.config.dataset_builders import BuildStage from data_designer.config.processors import DropColumnsProcessorConfig from data_designer.engine.dataset_builders.artifact_storage import BatchStage from data_designer.engine.processing.processors.drop_columns import DropColumnsProcessor @@ -20,9 +19,7 @@ @pytest.fixture def stub_processor_config(): - return DropColumnsProcessorConfig( - name="drop_columns_processor", build_stage=BuildStage.POST_BATCH, column_names=["col1", "col2"] - ) + return DropColumnsProcessorConfig(name="drop_columns_processor", column_names=["col1", "col2"]) @pytest.fixture @@ -84,34 +81,34 @@ def stub_empty_dataframe(): ), ], ) -def test_process_scenarios( +def test_process_after_batch_scenarios( stub_processor, stub_sample_dataframe, test_case, column_names, expected_result, expected_warning ): stub_processor.config.column_names = column_names if expected_warning: with patch("data_designer.engine.processing.processors.drop_columns.logger") as mock_logger: - result = stub_processor.process(stub_sample_dataframe.copy()) + result = stub_processor.process_after_batch(stub_sample_dataframe.copy(), current_batch_number=0) pd.testing.assert_frame_equal(result, pd.DataFrame(expected_result)) mock_logger.warning.assert_called_once_with(expected_warning) else: - result = stub_processor.process(stub_sample_dataframe.copy()) + result = stub_processor.process_after_batch(stub_sample_dataframe.copy(), current_batch_number=0) pd.testing.assert_frame_equal(result, pd.DataFrame(expected_result)) -def test_process_logging(stub_processor, stub_sample_dataframe): +def test_process_after_batch_logging(stub_processor, stub_sample_dataframe): with 
patch("data_designer.engine.processing.processors.drop_columns.logger") as mock_logger: - stub_processor.process(stub_sample_dataframe.copy()) + stub_processor.process_after_batch(stub_sample_dataframe.copy(), current_batch_number=0) mock_logger.info.assert_called_once_with("🙈 Dropping columns: ['col1', 'col2']") -def test_save_dropped_columns_without_preview(stub_processor, stub_sample_dataframe): +def test_save_dropped_columns(stub_processor, stub_sample_dataframe): stub_processor.config.column_names = ["col1", "col2"] with patch("data_designer.engine.processing.processors.drop_columns.logger") as mock_logger: - stub_processor.process(stub_sample_dataframe.copy(), current_batch_number=0) + stub_processor.process_after_batch(stub_sample_dataframe.copy(), current_batch_number=0) stub_processor.artifact_storage.write_parquet_file.assert_called_once() call_args = stub_processor.artifact_storage.write_parquet_file.call_args @@ -126,24 +123,19 @@ def test_save_dropped_columns_without_preview(stub_processor, stub_sample_datafr mock_logger.debug.assert_called_once_with("đŸ“Ļ Saving dropped columns to dropped-columns directory") -def test_save_dropped_columns_with_preview(stub_processor, stub_sample_dataframe): - stub_processor.config.column_names = ["col1", "col2"] - - stub_processor.process(stub_sample_dataframe.copy()) - stub_processor.artifact_storage.write_parquet_file.assert_not_called() - - def test_save_dropped_columns_with_nonexistent_columns(stub_processor, stub_sample_dataframe): + """When columns don't exist, no file is written but warnings are logged.""" stub_processor.config.column_names = ["nonexistent1", "nonexistent2"] with patch("data_designer.engine.processing.processors.drop_columns.logger"): - with pytest.raises(KeyError): - stub_processor.process(stub_sample_dataframe.copy(), current_batch_number=0) + stub_processor.process_after_batch(stub_sample_dataframe.copy(), current_batch_number=0) + # No file is written for nonexistent columns + stub_processor.artifact_storage.write_parquet_file.assert_not_called() -def test_process_inplace_modification(stub_processor, stub_sample_dataframe): +def test_process_after_batch_inplace_modification(stub_processor, stub_sample_dataframe): original_df = stub_sample_dataframe.copy() - result = stub_processor.process(original_df) + result = stub_processor.process_after_batch(original_df, current_batch_number=0) assert result is original_df @@ -152,11 +144,26 @@ def test_process_inplace_modification(stub_processor, stub_sample_dataframe): assert "col3" in result.columns -def test_process_empty_dataframe(stub_processor, stub_empty_dataframe): +def test_process_after_batch_empty_dataframe(stub_processor, stub_empty_dataframe): stub_processor.config.column_names = ["col1"] with patch("data_designer.engine.processing.processors.drop_columns.logger") as mock_logger: - result = stub_processor.process(stub_empty_dataframe) + result = stub_processor.process_after_batch(stub_empty_dataframe, current_batch_number=0) pd.testing.assert_frame_equal(result, stub_empty_dataframe) mock_logger.warning.assert_called_once_with("âš ī¸ Cannot drop column: `col1` not found in the dataset.") + + +def test_process_after_batch_preview_mode_does_not_save(stub_processor, stub_sample_dataframe): + """In preview mode (current_batch_number=None), columns are dropped but not saved to disk.""" + stub_processor.config.column_names = ["col1", "col2"] + + result = stub_processor.process_after_batch(stub_sample_dataframe.copy(), current_batch_number=None) + + # Columns should 
still be dropped + assert "col1" not in result.columns + assert "col2" not in result.columns + assert "col3" in result.columns + + # But no file should be written + stub_processor.artifact_storage.write_parquet_file.assert_not_called() diff --git a/packages/data-designer-engine/tests/engine/processing/processors/test_schema_transform.py b/packages/data-designer-engine/tests/engine/processing/processors/test_schema_transform.py index 520d67da..b481b1d0 100644 --- a/packages/data-designer-engine/tests/engine/processing/processors/test_schema_transform.py +++ b/packages/data-designer-engine/tests/engine/processing/processors/test_schema_transform.py @@ -9,7 +9,6 @@ import pytest -from data_designer.config.dataset_builders import BuildStage from data_designer.config.processors import SchemaTransformProcessorConfig from data_designer.engine.dataset_builders.artifact_storage import BatchStage from data_designer.engine.processing.processors.schema_transform import SchemaTransformProcessor @@ -23,7 +22,6 @@ @pytest.fixture def stub_processor_config() -> SchemaTransformProcessorConfig: return SchemaTransformProcessorConfig( - build_stage=BuildStage.POST_BATCH, template={"text": "{{ col1 }}", "value": "{{ col2 }}"}, name="test_schema_transform", ) @@ -53,20 +51,20 @@ def stub_simple_dataframe() -> pd.DataFrame: ) -def test_process_returns_original_dataframe( +def test_process_after_batch_returns_original_dataframe( stub_processor: SchemaTransformProcessor, stub_sample_dataframe: pd.DataFrame ) -> None: - result = stub_processor.process(stub_sample_dataframe, current_batch_number=0) + result = stub_processor.process_after_batch(stub_sample_dataframe, current_batch_number=0) pd.testing.assert_frame_equal(result, stub_sample_dataframe) -def test_process_writes_formatted_output_to_parquet( +def test_process_after_batch_writes_formatted_output_to_parquet( stub_processor: SchemaTransformProcessor, stub_sample_dataframe: pd.DataFrame ) -> None: # Process the dataframe - result = stub_processor.process(stub_sample_dataframe, current_batch_number=0) + result = stub_processor.process_after_batch(stub_sample_dataframe, current_batch_number=0) - # Verify the original dataframe is returned + # Verify the original dataframe is returned (formatted data is artifact-only) pd.testing.assert_frame_equal(result, stub_sample_dataframe) # Verify write_batch_to_parquet_file was called with correct parameters @@ -97,20 +95,7 @@ def test_process_writes_formatted_output_to_parquet( assert json.loads(actual) == json.loads(expected), f"Row {i} mismatch: {actual} != {expected}" -def test_process_without_batch_number_does_not_write( - stub_processor: SchemaTransformProcessor, stub_sample_dataframe: pd.DataFrame -) -> None: - # Process without batch number (preview mode) - result = stub_processor.process(stub_sample_dataframe, current_batch_number=None) - - # Verify the original dataframe is returned - pd.testing.assert_frame_equal(result, stub_sample_dataframe) - - # Verify write_batch_to_parquet_file was NOT called - stub_processor.artifact_storage.write_batch_to_parquet_file.assert_not_called() - - -def test_process_with_json_serialized_values(stub_processor: SchemaTransformProcessor) -> None: +def test_process_after_batch_with_json_serialized_values(stub_processor: SchemaTransformProcessor) -> None: # Test with JSON-serialized values in dataframe df_with_json = pd.DataFrame( { @@ -120,7 +105,7 @@ def test_process_with_json_serialized_values(stub_processor: SchemaTransformProc ) # Process the dataframe - 
stub_processor.process(df_with_json, current_batch_number=0) + stub_processor.process_after_batch(df_with_json, current_batch_number=0) written_dataframe: pd.DataFrame = stub_processor.artifact_storage.write_batch_to_parquet_file.call_args.kwargs[ "dataframe" ] @@ -136,7 +121,7 @@ def test_process_with_json_serialized_values(stub_processor: SchemaTransformProc assert first_output["value"] == '{"nested": "value1"}' -def test_process_with_special_characters_in_llm_output(stub_processor: SchemaTransformProcessor) -> None: +def test_process_after_batch_with_special_characters_in_llm_output(stub_processor: SchemaTransformProcessor) -> None: """Test that LLM outputs with special characters are properly escaped for JSON. This addresses GitHub issue #227 where SchemaTransformProcessor fails with JSONDecodeError @@ -155,7 +140,7 @@ def test_process_with_special_characters_in_llm_output(stub_processor: SchemaTra ) # Process should not raise JSONDecodeError - stub_processor.process(df_with_special_chars, current_batch_number=0) + stub_processor.process_after_batch(df_with_special_chars, current_batch_number=0) written_dataframe: pd.DataFrame = stub_processor.artifact_storage.write_batch_to_parquet_file.call_args.kwargs[ "dataframe" ] @@ -172,7 +157,7 @@ def test_process_with_special_characters_in_llm_output(stub_processor: SchemaTra assert outputs[3]["text"] == "Tab\there" -def test_process_with_mixed_special_characters(stub_processor: SchemaTransformProcessor) -> None: +def test_process_after_batch_with_mixed_special_characters(stub_processor: SchemaTransformProcessor) -> None: """Test complex LLM output with multiple types of special characters.""" df_complex = pd.DataFrame( { @@ -183,7 +168,7 @@ def test_process_with_mixed_special_characters(stub_processor: SchemaTransformPr } ) - stub_processor.process(df_complex, current_batch_number=0) + stub_processor.process_after_batch(df_complex, current_batch_number=0) written_dataframe: pd.DataFrame = stub_processor.artifact_storage.write_batch_to_parquet_file.call_args.kwargs[ "dataframe" ] @@ -191,3 +176,16 @@ def test_process_with_mixed_special_characters(stub_processor: SchemaTransformPr assert len(written_dataframe) == 1 output = written_dataframe.iloc[0].to_dict() assert output["text"] == 'She replied: "I\'m not sure about that\\nLet me think..."' + + +def test_process_after_batch_preview_mode_does_not_write( + stub_processor: SchemaTransformProcessor, stub_sample_dataframe: pd.DataFrame +) -> None: + """In preview mode (current_batch_number=None), no parquet file is written.""" + result = stub_processor.process_after_batch(stub_sample_dataframe, current_batch_number=None) + + # Original dataframe should be returned (formatted data is artifact-only) + pd.testing.assert_frame_equal(result, stub_sample_dataframe) + + # No file should be written + stub_processor.artifact_storage.write_batch_to_parquet_file.assert_not_called() diff --git a/packages/data-designer-engine/tests/engine/test_validation.py b/packages/data-designer-engine/tests/engine/test_validation.py index c0cc4bc0..97f795b5 100644 --- a/packages/data-designer-engine/tests/engine/test_validation.py +++ b/packages/data-designer-engine/tests/engine/test_validation.py @@ -12,7 +12,6 @@ Score, ValidationColumnConfig, ) -from data_designer.config.dataset_builders import BuildStage from data_designer.config.models import ImageContext, ModalityDataType from data_designer.config.processors import ( DropColumnsProcessorConfig, @@ -104,12 +103,10 @@ DropColumnsProcessorConfig( 
name="drop_columns_processor", column_names=["inexistent_column"], - build_stage=BuildStage.POST_BATCH, ), SchemaTransformProcessorConfig( name="schema_transform_processor_invalid_reference", template={"text": "{{ invalid_reference }}"}, - build_stage=BuildStage.POST_BATCH, ), ] ALLOWED_REFERENCE = [c.name for c in COLUMNS] diff --git a/packages/data-designer/tests/interface/test_data_designer.py b/packages/data-designer/tests/interface/test_data_designer.py index 84636e59..692bfc13 100644 --- a/packages/data-designer/tests/interface/test_data_designer.py +++ b/packages/data-designer/tests/interface/test_data_designer.py @@ -12,7 +12,6 @@ from data_designer.config.column_configs import SamplerColumnConfig from data_designer.config.config_builder import DataDesignerConfigBuilder -from data_designer.config.dataset_builders import BuildStage from data_designer.config.errors import InvalidConfigError from data_designer.config.models import ModelProvider from data_designer.config.processors import DropColumnsProcessorConfig @@ -323,11 +322,7 @@ def test_preview_with_dropped_columns( SamplerColumnConfig(name="uniform", sampler_type="uniform", params={"low": 1, "high": 100}) ) - config_builder.add_processor( - DropColumnsProcessorConfig( - name="drop_columns_processor", build_stage=BuildStage.POST_BATCH, column_names=["category"] - ) - ) + config_builder.add_processor(DropColumnsProcessorConfig(name="drop_columns_processor", column_names=["category"])) data_designer = DataDesigner( artifact_path=stub_artifact_path, From 0347798a8d669a85c26c7475549dbd3649565813 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 5 Feb 2026 17:33:27 -0300 Subject: [PATCH 02/14] add edge case handling and tests for processor stages - Add warning when PRE_BATCH changes row count - Clean up preprocessed seed file after preview - Add parametrized test verifying all 4 stages run in order --- .../dataset_builders/column_wise_builder.py | 23 +++++-- .../test_column_wise_builder.py | 63 +++++++++++++++++++ 2 files changed, 80 insertions(+), 6 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py index 19b58457..b81b8461 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py @@ -131,8 +131,8 @@ def build_preview(self, *, num_records: int) -> pd.DataFrame: dataset = self.batch_manager.get_current_batch(as_dataframe=True) self.batch_manager.reset() - # Reset preprocessed_seed_uri to avoid affecting subsequent build() calls - self._resource_provider.preprocessed_seed_uri = None + # Clean up preprocessed seed file and reset URI to avoid affecting subsequent build() calls + self._cleanup_preprocessed_seed() self._resource_provider.model_registry.log_model_usage(time.perf_counter() - start_time) @@ -345,10 +345,7 @@ def _load_seed_dataframe(self) -> pd.DataFrame: """Load full seed dataset as DataFrame.""" seed_reader = self._resource_provider.seed_reader conn = seed_reader.create_duckdb_connection() - try: - return conn.execute(f"SELECT * FROM '{seed_reader.get_dataset_uri()}'").fetchdf() - finally: - conn.close() + return conn.execute(f"SELECT * FROM '{seed_reader.get_dataset_uri()}'").fetchdf() def _run_preprocess_on_df(self, df: pd.DataFrame, processors: list[Processor]) -> pd.DataFrame: """Run preprocess() on 
given processors.""" @@ -366,6 +363,14 @@ def _save_preprocessed_seed(self, df: pd.DataFrame) -> None: df.to_parquet(preprocessed_path, index=False) self._resource_provider.preprocessed_seed_uri = str(preprocessed_path) + def _cleanup_preprocessed_seed(self) -> None: + """Remove preprocessed seed file and reset URI.""" + if self._resource_provider.preprocessed_seed_uri is not None: + preprocessed_path = Path(self._resource_provider.preprocessed_seed_uri) + if preprocessed_path.exists(): + preprocessed_path.unlink() + self._resource_provider.preprocessed_seed_uri = None + def _apply_pre_batch_processors(self) -> None: """Get batch, run PRE_BATCH processors, update batch manager.""" processors = [p for p in self._processors if p.implements("process_before_batch")] @@ -373,6 +378,7 @@ def _apply_pre_batch_processors(self) -> None: return df = self.batch_manager.get_current_batch(as_dataframe=True) + original_len = len(df) for processor in processors: try: df = processor.process_before_batch(df) @@ -380,6 +386,11 @@ def _apply_pre_batch_processors(self) -> None: raise DatasetProcessingError( f"🛑 Failed in process_before_batch for processor {processor.name}: {e}" ) from e + if len(df) != original_len: + logger.warning( + f"âš ī¸ PRE_BATCH processors changed row count from {original_len} to {len(df)}. " + "This may cause unexpected behavior in downstream generators." + ) self.batch_manager.update_records(df.to_dict(orient="records")) def _run_post_batch_processors(self, dataframe: pd.DataFrame, current_batch_number: int | None) -> pd.DataFrame: diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py index 582b08bf..d0d9f984 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py @@ -567,3 +567,66 @@ def test_process_preview_runs_both_callbacks(stub_resource_provider, stub_model_ # Result should have both columns added assert "post_batch_applied" in result.columns assert "post_gen_applied" in result.columns + + +@pytest.mark.parametrize("mode", ["preview", "build"]) +def test_all_processor_stages_run_in_order(stub_resource_provider, stub_model_configs, tmp_path, mode): + """Test that all 4 processor stages run in correct order for both preview and build modes.""" + from data_designer.config.seed_source import DataFrameSeedSource, LocalFileSeedSource + from data_designer.engine.processing.processors.base import Processor + from data_designer.engine.resources.seed_reader import DataFrameSeedReader + + # Set up seed reader with test data + seed_df = pd.DataFrame({"seed_id": [1, 2, 3], "text": ["a", "b", "c"]}) + seed_source = DataFrameSeedSource(df=seed_df) + seed_reader = DataFrameSeedReader() + seed_reader.attach(seed_source, Mock()) + stub_resource_provider.seed_reader = seed_reader + + # Write seed file to tmp_path + seed_path = tmp_path / "seed.parquet" + seed_df.to_parquet(seed_path, index=False) + + config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) + config_builder.with_seed_dataset(LocalFileSeedSource(path=str(seed_path))) + config_builder.add_column(SamplerColumnConfig(name="extra", sampler_type="uuid", params=UUIDSamplerParams())) + + builder = ColumnWiseDatasetBuilder( + data_designer_config=config_builder.build(), + resource_provider=stub_resource_provider, + ) + + # Create a processor that 
implements all 4 stages to track calls + call_order = [] + + mock_processor = Mock(spec=Processor) + mock_processor.name = "all_stages_processor" + mock_processor.implements.side_effect = lambda m: m in ( + "preprocess", + "process_before_batch", + "process_after_batch", + "postprocess", + ) + mock_processor.preprocess.side_effect = lambda df: (call_order.append("preprocess"), df)[1] + mock_processor.process_before_batch.side_effect = lambda df: (call_order.append("process_before_batch"), df)[1] + mock_processor.process_after_batch.side_effect = lambda df, **kw: (call_order.append("process_after_batch"), df)[1] + mock_processor.postprocess.side_effect = lambda df: (call_order.append("postprocess"), df)[1] + + builder._processors = [mock_processor] + + if mode == "preview": + # Preview flow: build_preview() + process_preview() + raw_dataset = builder.build_preview(num_records=3) + builder.process_preview(raw_dataset) + else: + # Build flow: build() runs all stages internally + builder.build(num_records=3) + + # Verify all 4 stages were called + mock_processor.preprocess.assert_called_once() + mock_processor.process_before_batch.assert_called_once() + mock_processor.process_after_batch.assert_called_once() + mock_processor.postprocess.assert_called_once() + + # Verify call order matches the pipeline stages + assert call_order == ["preprocess", "process_before_batch", "process_after_batch", "postprocess"] From 8c5296bb33e6632ee4c1d41fe6e113887bfb843c Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 5 Feb 2026 17:46:55 -0300 Subject: [PATCH 03/14] remove duplicate processor stage tests The comprehensive test_all_processor_stages_run_in_order covers both preview and build modes, making the individual tests redundant. --- .../test_column_wise_builder.py | 74 ------------------- 1 file changed, 74 deletions(-) diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py index d0d9f984..81c8c705 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py @@ -495,80 +495,6 @@ def test_run_pre_generation_processors_skips_when_no_seed_reader(stub_resource_p mock_processor.preprocess.assert_not_called() -def test_build_preview_runs_pre_generation_processors(stub_resource_provider, stub_model_configs, tmp_path): - """Test that build_preview runs PRE_GENERATION processors.""" - from data_designer.config.seed_source import DataFrameSeedSource, LocalFileSeedSource - from data_designer.engine.resources.seed_reader import DataFrameSeedReader - - # Set up seed reader with test data - seed_df = pd.DataFrame({"seed_id": [1, 2, 3, 4, 5], "text": ["a", "b", "c", "d", "e"]}) - seed_source = DataFrameSeedSource(df=seed_df) - seed_reader = DataFrameSeedReader() - seed_reader.attach(seed_source, Mock()) - stub_resource_provider.seed_reader = seed_reader - - # Write seed file to tmp_path - seed_path = tmp_path / "seed.parquet" - seed_df.to_parquet(seed_path, index=False) - - config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) - config_builder.with_seed_dataset(LocalFileSeedSource(path=str(seed_path))) - config_builder.add_column(SamplerColumnConfig(name="uuid", sampler_type="uuid", params=UUIDSamplerParams())) - - builder = ColumnWiseDatasetBuilder( - data_designer_config=config_builder.build(), - 
resource_provider=stub_resource_provider, - ) - - # Mock everything to isolate the test - builder._run_model_health_check_if_needed = Mock() - builder._run_mcp_tool_check_if_needed = Mock() - builder._run_pre_generation_processors = Mock() - builder._initialize_generators = Mock(return_value=[]) - builder.batch_manager.start = Mock() - builder._run_batch = Mock() - builder.batch_manager.get_current_batch = Mock(return_value=pd.DataFrame()) - builder.batch_manager.reset = Mock() - builder._resource_provider.model_registry.get_model_usage_stats = Mock(return_value={}) - - builder.build_preview(num_records=5) - - builder._run_pre_generation_processors.assert_called_once() - - -def test_process_preview_runs_both_callbacks(stub_resource_provider, stub_model_configs): - """Test that process_preview runs process_after_batch and postprocess callbacks.""" - from data_designer.engine.processing.processors.base import Processor - - config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) - config_builder.add_column(SamplerColumnConfig(name="id", sampler_type="uuid", params=UUIDSamplerParams())) - - builder = ColumnWiseDatasetBuilder( - data_designer_config=config_builder.build(), - resource_provider=stub_resource_provider, - ) - - # Create a mock processor with both callbacks - mock_processor = Mock(spec=Processor) - mock_processor.name = "test_processor" - mock_processor.implements.side_effect = lambda m: m in ("process_after_batch", "postprocess") - mock_processor.process_after_batch.side_effect = lambda df, **kwargs: df.assign(post_batch_applied=True) - mock_processor.postprocess.side_effect = lambda df: df.assign(post_gen_applied=True) - - builder._processors = [mock_processor] - - input_df = pd.DataFrame({"id": [1, 2, 3]}) - result = builder.process_preview(input_df) - - # Both callbacks should have been called - mock_processor.process_after_batch.assert_called_once() - mock_processor.postprocess.assert_called_once() - - # Result should have both columns added - assert "post_batch_applied" in result.columns - assert "post_gen_applied" in result.columns - - @pytest.mark.parametrize("mode", ["preview", "build"]) def test_all_processor_stages_run_in_order(stub_resource_provider, stub_model_configs, tmp_path, mode): """Test that all 4 processor stages run in correct order for both preview and build modes.""" From 4ea55c49476343ec95b584eee94d5c72a7c4761c Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 5 Feb 2026 18:03:41 -0300 Subject: [PATCH 04/14] refactor processor tests with fixtures and add edge cases - Move imports to top of file - Add seed_data_setup and builder_with_seed fixtures - Add create_mock_processor helper function - Add edge case tests for exceptions, no-op processors, ordering --- .../test_column_wise_builder.py | 242 +++++++++++------- 1 file changed, 146 insertions(+), 96 deletions(-) diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py index 81c8c705..b8854770 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py @@ -3,6 +3,7 @@ from __future__ import annotations +from pathlib import Path from typing import TYPE_CHECKING from unittest.mock import Mock, patch @@ -13,12 +14,15 @@ from data_designer.config.processors import DropColumnsProcessorConfig from 
data_designer.config.run_config import RunConfig from data_designer.config.sampler_params import SamplerType, UUIDSamplerParams +from data_designer.config.seed_source import DataFrameSeedSource, LocalFileSeedSource from data_designer.engine.column_generators.generators.base import GenerationStrategy from data_designer.engine.dataset_builders.column_wise_builder import ColumnWiseDatasetBuilder -from data_designer.engine.dataset_builders.errors import DatasetGenerationError +from data_designer.engine.dataset_builders.errors import DatasetGenerationError, DatasetProcessingError from data_designer.engine.models.telemetry import InferenceEvent, NemoSourceEnum, TaskStatusEnum from data_designer.engine.models.usage import ModelUsageStats, TokenUsageStats +from data_designer.engine.processing.processors.base import Processor from data_designer.engine.registry.data_designer_registry import DataDesignerRegistry +from data_designer.engine.resources.seed_reader import DataFrameSeedReader from data_designer.lazy_heavy_imports import pd if TYPE_CHECKING: @@ -80,6 +84,46 @@ def stub_column_wise_builder(stub_resource_provider, stub_test_config_builder): ) +@pytest.fixture +def seed_data_setup(stub_resource_provider, tmp_path): + """Set up seed reader with test data and write seed file to disk.""" + seed_df = pd.DataFrame({"seed_id": [1, 2, 3, 4, 5], "text": ["a", "b", "c", "d", "e"]}) + seed_source = DataFrameSeedSource(df=seed_df) + seed_reader = DataFrameSeedReader() + seed_reader.attach(seed_source, Mock()) + stub_resource_provider.seed_reader = seed_reader + + seed_path = tmp_path / "seed.parquet" + seed_df.to_parquet(seed_path, index=False) + + return {"seed_df": seed_df, "seed_path": seed_path} + + +@pytest.fixture +def builder_with_seed(stub_resource_provider, stub_model_configs, seed_data_setup): + """Create a builder with seed dataset configured.""" + config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) + config_builder.with_seed_dataset(LocalFileSeedSource(path=str(seed_data_setup["seed_path"]))) + config_builder.add_column(SamplerColumnConfig(name="extra", sampler_type="uuid", params=UUIDSamplerParams())) + + return ColumnWiseDatasetBuilder( + data_designer_config=config_builder.build(), + resource_provider=stub_resource_provider, + ) + + +def create_mock_processor(name: str, stages: list[str]) -> Mock: + """Create a mock processor that implements specified stages.""" + mock_processor = Mock(spec=Processor) + mock_processor.name = name + mock_processor.implements.side_effect = lambda m: m in stages + mock_processor.preprocess.side_effect = lambda df: df + mock_processor.process_before_batch.side_effect = lambda df: df + mock_processor.process_after_batch.side_effect = lambda df, **kw: df + mock_processor.postprocess.side_effect = lambda df: df + return mock_processor + + def test_column_wise_dataset_builder_creation(stub_resource_provider, stub_test_config_builder): builder = ColumnWiseDatasetBuilder( data_designer_config=stub_test_config_builder.build(), @@ -344,8 +388,6 @@ def test_fan_out_with_threads_uses_early_shutdown_settings_from_resource_provide shutdown_error_window: int, ) -> None: """Test that _fan_out_with_threads uses run settings from resource_provider.""" - from data_designer.config.run_config import RunConfig - stub_resource_provider.run_config = RunConfig( disable_early_shutdown=disable_early_shutdown, shutdown_error_rate=configured_rate, @@ -385,52 +427,20 @@ def test_fan_out_with_threads_uses_early_shutdown_settings_from_resource_provide assert 
call_kwargs["disable_early_shutdown"] == disable_early_shutdown -def test_run_pre_generation_processors_filters_seed_data(stub_resource_provider, stub_model_configs, tmp_path): +def test_run_pre_generation_processors_filters_seed_data(stub_resource_provider, builder_with_seed, seed_data_setup): """Test that PRE_GENERATION processors are applied to seed data before generation.""" - from pathlib import Path - - from data_designer.config.seed_source import DataFrameSeedSource, LocalFileSeedSource - from data_designer.engine.processing.processors.base import Processor - from data_designer.engine.resources.seed_reader import DataFrameSeedReader - - # Set up seed reader with test data - seed_df = pd.DataFrame({"seed_id": [1, 2, 3, 4, 5], "value": ["a", "b", "c", "d", "e"]}) - seed_source = DataFrameSeedSource(df=seed_df) - seed_reader = DataFrameSeedReader() - seed_reader.attach(seed_source, Mock()) - stub_resource_provider.seed_reader = seed_reader - - # Create a mock processor that filters rows during preprocess - mock_processor = Mock(spec=Processor) - mock_processor.name = "filter_processor" - mock_processor.implements.side_effect = lambda m: m == "preprocess" + mock_processor = create_mock_processor("filter_processor", ["preprocess"]) mock_processor.preprocess.side_effect = lambda df: df[df["seed_id"] > 2].reset_index(drop=True) - # Write seed file to tmp_path - seed_path = tmp_path / "seed.parquet" - seed_df.to_parquet(seed_path, index=False) - - config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) - config_builder.with_seed_dataset(LocalFileSeedSource(path=str(seed_path))) - config_builder.add_column(SamplerColumnConfig(name="uuid", sampler_type="uuid", params=UUIDSamplerParams())) - - builder = ColumnWiseDatasetBuilder( - data_designer_config=config_builder.build(), - resource_provider=stub_resource_provider, - ) - builder._processors = [mock_processor] - - builder._run_pre_generation_processors() + builder_with_seed._processors = [mock_processor] + builder_with_seed._run_pre_generation_processors() - # Verify preprocess was called mock_processor.preprocess.assert_called_once() - # Verify preprocessed_seed_uri was set and points to a valid file assert stub_resource_provider.preprocessed_seed_uri is not None preprocessed_path = Path(stub_resource_provider.preprocessed_seed_uri) assert preprocessed_path.exists() - # Verify the preprocessed file contains filtered data (3 rows with seed_id > 2) preprocessed_df = pd.read_parquet(preprocessed_path) assert len(preprocessed_df) == 3 assert list(preprocessed_df["seed_id"]) == [3, 4, 5] @@ -438,18 +448,12 @@ def test_run_pre_generation_processors_filters_seed_data(stub_resource_provider, def test_run_post_generation_processors_modifies_final_dataset(stub_resource_provider, stub_model_configs): """Test that postprocess callbacks are applied to the final dataset.""" - from data_designer.engine.processing.processors.base import Processor - - # Create test parquet files final_df = pd.DataFrame({"id": [1, 2, 3, 4, 5], "value": ["a", "b", "c", "d", "e"]}) stub_resource_provider.artifact_storage.mkdir_if_needed(stub_resource_provider.artifact_storage.final_dataset_path) final_df.to_parquet(stub_resource_provider.artifact_storage.final_dataset_path / "batch_00000.parquet", index=False) - # Create a mock processor that filters rows during postprocess - mock_processor = Mock(spec=Processor) - mock_processor.name = "dedup_processor" - mock_processor.implements.side_effect = lambda m: m == "postprocess" - 
mock_processor.postprocess.return_value = final_df[final_df["id"] > 2] + mock_processor = create_mock_processor("dedup_processor", ["postprocess"]) + mock_processor.postprocess.side_effect = lambda df: df[df["id"] > 2] config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) config_builder.add_column(SamplerColumnConfig(name="id", sampler_type="uuid", params=UUIDSamplerParams())) @@ -462,23 +466,17 @@ def test_run_post_generation_processors_modifies_final_dataset(stub_resource_pro builder._run_post_generation_processors() - # Verify postprocess was called mock_processor.postprocess.assert_called_once() - # Verify final dataset was rewritten with fewer rows result_df = stub_resource_provider.artifact_storage.load_dataset() assert len(result_df) == 3 def test_run_pre_generation_processors_skips_when_no_seed_reader(stub_resource_provider, stub_model_configs): """Test that preprocess is skipped when no seed reader is configured.""" - from data_designer.engine.processing.processors.base import Processor - stub_resource_provider.seed_reader = None - mock_processor = Mock(spec=Processor) - mock_processor.name = "filter_processor" - mock_processor.implements.side_effect = lambda m: m == "preprocess" + mock_processor = create_mock_processor("filter_processor", ["preprocess"]) config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) config_builder.add_column(SamplerColumnConfig(name="id", sampler_type="uuid", params=UUIDSamplerParams())) @@ -491,68 +489,120 @@ def test_run_pre_generation_processors_skips_when_no_seed_reader(stub_resource_p builder._run_pre_generation_processors() - # Preprocess should not be called when no seed reader mock_processor.preprocess.assert_not_called() @pytest.mark.parametrize("mode", ["preview", "build"]) -def test_all_processor_stages_run_in_order(stub_resource_provider, stub_model_configs, tmp_path, mode): +def test_all_processor_stages_run_in_order(builder_with_seed, mode): """Test that all 4 processor stages run in correct order for both preview and build modes.""" - from data_designer.config.seed_source import DataFrameSeedSource, LocalFileSeedSource - from data_designer.engine.processing.processors.base import Processor - from data_designer.engine.resources.seed_reader import DataFrameSeedReader + call_order = [] + all_stages = ["preprocess", "process_before_batch", "process_after_batch", "postprocess"] - # Set up seed reader with test data - seed_df = pd.DataFrame({"seed_id": [1, 2, 3], "text": ["a", "b", "c"]}) - seed_source = DataFrameSeedSource(df=seed_df) - seed_reader = DataFrameSeedReader() - seed_reader.attach(seed_source, Mock()) - stub_resource_provider.seed_reader = seed_reader + mock_processor = create_mock_processor("all_stages_processor", all_stages) + mock_processor.preprocess.side_effect = lambda df: (call_order.append("preprocess"), df)[1] + mock_processor.process_before_batch.side_effect = lambda df: (call_order.append("process_before_batch"), df)[1] + mock_processor.process_after_batch.side_effect = lambda df, **kw: (call_order.append("process_after_batch"), df)[1] + mock_processor.postprocess.side_effect = lambda df: (call_order.append("postprocess"), df)[1] + + builder_with_seed._processors = [mock_processor] + + if mode == "preview": + raw_dataset = builder_with_seed.build_preview(num_records=3) + builder_with_seed.process_preview(raw_dataset) + else: + builder_with_seed.build(num_records=3) + + mock_processor.preprocess.assert_called_once() + mock_processor.process_before_batch.assert_called_once() + 
mock_processor.process_after_batch.assert_called_once() + mock_processor.postprocess.assert_called_once() + + assert call_order == all_stages - # Write seed file to tmp_path - seed_path = tmp_path / "seed.parquet" - seed_df.to_parquet(seed_path, index=False) + +# --- Edge Case Tests --- + + +def test_processor_exception_in_preprocess_raises_error(builder_with_seed): + """Test that processor exceptions during preprocess are properly wrapped.""" + mock_processor = create_mock_processor("failing_processor", ["preprocess"]) + mock_processor.preprocess.side_effect = ValueError("Preprocessing failed") + + builder_with_seed._processors = [mock_processor] + + with pytest.raises(DatasetProcessingError, match="Failed in preprocess"): + builder_with_seed._run_pre_generation_processors() + + +def test_processor_exception_in_process_after_batch_raises_error(stub_resource_provider, stub_model_configs): + """Test that processor exceptions during process_after_batch are properly wrapped.""" + mock_processor = create_mock_processor("failing_processor", ["process_after_batch"]) + mock_processor.process_after_batch.side_effect = ValueError("Post-batch processing failed") config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) - config_builder.with_seed_dataset(LocalFileSeedSource(path=str(seed_path))) - config_builder.add_column(SamplerColumnConfig(name="extra", sampler_type="uuid", params=UUIDSamplerParams())) + config_builder.add_column(SamplerColumnConfig(name="id", sampler_type="uuid", params=UUIDSamplerParams())) builder = ColumnWiseDatasetBuilder( data_designer_config=config_builder.build(), resource_provider=stub_resource_provider, ) + builder._processors = [mock_processor] + + with pytest.raises(DatasetProcessingError, match="Failed in process_after_batch"): + builder._run_post_batch_processors(pd.DataFrame({"id": [1, 2, 3]}), current_batch_number=0) + + +def test_processor_with_no_implemented_stages_is_skipped(builder_with_seed): + """Test that a processor implementing no stages doesn't cause errors.""" + mock_processor = create_mock_processor("noop_processor", []) + + builder_with_seed._processors = [mock_processor] - # Create a processor that implements all 4 stages to track calls + # Should complete without errors + result = builder_with_seed.build_preview(num_records=3) + + assert len(result) == 3 + mock_processor.preprocess.assert_not_called() + mock_processor.process_before_batch.assert_not_called() + mock_processor.process_after_batch.assert_not_called() + mock_processor.postprocess.assert_not_called() + + +def test_multiple_processors_run_in_definition_order(builder_with_seed): + """Test that multiple processors run in the order they were defined.""" call_order = [] - mock_processor = Mock(spec=Processor) - mock_processor.name = "all_stages_processor" - mock_processor.implements.side_effect = lambda m: m in ( - "preprocess", - "process_before_batch", - "process_after_batch", - "postprocess", - ) - mock_processor.preprocess.side_effect = lambda df: (call_order.append("preprocess"), df)[1] - mock_processor.process_before_batch.side_effect = lambda df: (call_order.append("process_before_batch"), df)[1] - mock_processor.process_after_batch.side_effect = lambda df, **kw: (call_order.append("process_after_batch"), df)[1] - mock_processor.postprocess.side_effect = lambda df: (call_order.append("postprocess"), df)[1] + processor_a = create_mock_processor("processor_a", ["preprocess"]) + processor_a.preprocess.side_effect = lambda df: (call_order.append("a"), df)[1] + + 
processor_b = create_mock_processor("processor_b", ["preprocess"]) + processor_b.preprocess.side_effect = lambda df: (call_order.append("b"), df)[1] + + processor_c = create_mock_processor("processor_c", ["preprocess"]) + processor_c.preprocess.side_effect = lambda df: (call_order.append("c"), df)[1] + + builder_with_seed._processors = [processor_a, processor_b, processor_c] + builder_with_seed._run_pre_generation_processors() + + assert call_order == ["a", "b", "c"] + +def test_process_preview_with_empty_dataframe(stub_resource_provider, stub_model_configs): + """Test that process_preview handles empty DataFrames gracefully.""" + mock_processor = create_mock_processor("test_processor", ["process_after_batch", "postprocess"]) + + config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) + config_builder.add_column(SamplerColumnConfig(name="id", sampler_type="uuid", params=UUIDSamplerParams())) + + builder = ColumnWiseDatasetBuilder( + data_designer_config=config_builder.build(), + resource_provider=stub_resource_provider, + ) builder._processors = [mock_processor] - if mode == "preview": - # Preview flow: build_preview() + process_preview() - raw_dataset = builder.build_preview(num_records=3) - builder.process_preview(raw_dataset) - else: - # Build flow: build() runs all stages internally - builder.build(num_records=3) + empty_df = pd.DataFrame() + result = builder.process_preview(empty_df) - # Verify all 4 stages were called - mock_processor.preprocess.assert_called_once() - mock_processor.process_before_batch.assert_called_once() + assert len(result) == 0 mock_processor.process_after_batch.assert_called_once() mock_processor.postprocess.assert_called_once() - - # Verify call order matches the pipeline stages - assert call_order == ["preprocess", "process_before_batch", "process_after_batch", "postprocess"] From 4663818d3a4634b16280b36728903b2c9850878b Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Thu, 5 Feb 2026 18:48:44 -0300 Subject: [PATCH 05/14] extract processor execution to ProcessorRunner class - Move all processor stage logic to new ProcessorRunner in utils/ - ProcessorRunner takes dependencies and provides complete stage methods - Builder now calls runner methods directly instead of wrapper methods - Remove unused imports from column_wise_builder --- .../dataset_builders/column_wise_builder.py | 157 ++++-------------- .../utils/processor_runner.py | 121 ++++++++++++++ .../test_column_wise_builder.py | 12 +- 3 files changed, 158 insertions(+), 132 deletions(-) create mode 100644 packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py index b81b8461..67f27197 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py @@ -5,7 +5,6 @@ import functools import logging -import shutil import time import uuid from pathlib import Path @@ -28,12 +27,13 @@ ) from data_designer.engine.column_generators.utils.generator_classification import column_type_is_model_generated from data_designer.engine.compiler import compile_data_designer_config -from data_designer.engine.dataset_builders.artifact_storage import SDG_CONFIG_FILENAME, ArtifactStorage, BatchStage -from 
data_designer.engine.dataset_builders.errors import DatasetGenerationError, DatasetProcessingError +from data_designer.engine.dataset_builders.artifact_storage import SDG_CONFIG_FILENAME, ArtifactStorage +from data_designer.engine.dataset_builders.errors import DatasetGenerationError from data_designer.engine.dataset_builders.multi_column_configs import MultiColumnConfig from data_designer.engine.dataset_builders.utils.concurrency import ConcurrentThreadExecutor from data_designer.engine.dataset_builders.utils.config_compiler import compile_dataset_builder_column_configs from data_designer.engine.dataset_builders.utils.dataset_batch_manager import DatasetBatchManager +from data_designer.engine.dataset_builders.utils.processor_runner import ProcessorRunner from data_designer.engine.dataset_builders.utils.progress_tracker import ProgressTracker from data_designer.engine.models.telemetry import InferenceEvent, NemoSourceEnum, TaskStatusEnum, TelemetryHandler from data_designer.engine.processing.processors.base import Processor @@ -67,13 +67,32 @@ def __init__( self._data_designer_config = compile_data_designer_config(data_designer_config, resource_provider) self._column_configs = compile_dataset_builder_column_configs(self._data_designer_config) - self._processors: list[Processor] = self._initialize_processors(self._data_designer_config.processors or []) + processors = self._initialize_processors(self._data_designer_config.processors or []) + self._processor_runner = ProcessorRunner( + processors=processors, + resource_provider=resource_provider, + artifact_storage=resource_provider.artifact_storage, + ) self._validate_column_configs() @property def artifact_storage(self) -> ArtifactStorage: return self._resource_provider.artifact_storage + @property + def _processors(self) -> list[Processor]: + """Expose processors for test compatibility.""" + return self._processor_runner._processors + + @_processors.setter + def _processors(self, processors: list[Processor]) -> None: + """Allow setting processors for test compatibility.""" + self._processor_runner = ProcessorRunner( + processors=processors, + resource_provider=self._resource_provider, + artifact_storage=self.artifact_storage, + ) + @functools.cached_property def single_column_configs(self) -> list[ColumnConfigT]: configs = [] @@ -96,7 +115,7 @@ def build( ) -> Path: self._run_model_health_check_if_needed() self._run_mcp_tool_check_if_needed() - self._run_pre_generation_processors() + self._processor_runner.run_preprocess() self._write_builder_config() generators = self._initialize_generators() start_time = time.perf_counter() @@ -108,11 +127,11 @@ def build( logger.info(f"âŗ Processing batch {batch_idx + 1} of {self.batch_manager.num_batches}") self._run_batch(generators, batch_mode="batch", group_id=group_id) df_batch = self.batch_manager.get_current_batch(as_dataframe=True) - df_batch = self._run_post_batch_processors(df_batch, current_batch_number=batch_idx) + df_batch = self._processor_runner.run_post_batch(df_batch, current_batch_number=batch_idx) self._write_processed_batch(df_batch) self.batch_manager.finish_batch(on_batch_complete) self.batch_manager.finish() - self._run_post_generation_processors() + self._processor_runner.run_postprocess() self._resource_provider.model_registry.log_model_usage(time.perf_counter() - start_time) @@ -121,7 +140,7 @@ def build( def build_preview(self, *, num_records: int) -> pd.DataFrame: self._run_model_health_check_if_needed() self._run_mcp_tool_check_if_needed() - 
self._run_pre_generation_processors() + self._processor_runner.run_preprocess() generators = self._initialize_generators() group_id = uuid.uuid4().hex @@ -132,15 +151,15 @@ def build_preview(self, *, num_records: int) -> pd.DataFrame: self.batch_manager.reset() # Clean up preprocessed seed file and reset URI to avoid affecting subsequent build() calls - self._cleanup_preprocessed_seed() + self._processor_runner.cleanup_preprocessed_seed() self._resource_provider.model_registry.log_model_usage(time.perf_counter() - start_time) return dataset def process_preview(self, dataset: pd.DataFrame) -> pd.DataFrame: - df = self._run_post_batch_processors(dataset.copy(), current_batch_number=None) - return self._run_post_generation_processors_on_df(df) + df = self._processor_runner.run_post_batch(dataset.copy(), current_batch_number=None) + return self._processor_runner.run_postprocess_on_df(df) def _initialize_generators(self) -> list[ColumnGenerator]: return [ @@ -169,7 +188,7 @@ def _run_batch( self._run_from_scratch_column_generator(generator) # Run PRE_BATCH after seed generator, before other columns if not ran_pre_batch: - self._apply_pre_batch_processors() + self._processor_runner.run_pre_batch(self.batch_manager) ran_pre_batch = True elif generation_strategy == GenerationStrategy.CELL_BY_CELL: self._run_cell_by_cell_generator(generator) @@ -324,120 +343,6 @@ def _initialize_processors(self, processor_configs: list[ProcessorConfig]) -> li return processors - def _run_pre_generation_processors(self) -> None: - """Run preprocess() on processors that implement it.""" - processors = [p for p in self._processors if p.implements("preprocess")] - if not processors: - return - if self._resource_provider.seed_reader is None: - return - - logger.info("âŗ Running preprocess on seed data...") - df = self._load_seed_dataframe() - original_len = len(df) - - df = self._run_preprocess_on_df(df, processors) - - self._save_preprocessed_seed(df) - logger.info(f"✅ Preprocess complete. 
Seed data has {len(df)} rows (was {original_len}).") - - def _load_seed_dataframe(self) -> pd.DataFrame: - """Load full seed dataset as DataFrame.""" - seed_reader = self._resource_provider.seed_reader - conn = seed_reader.create_duckdb_connection() - return conn.execute(f"SELECT * FROM '{seed_reader.get_dataset_uri()}'").fetchdf() - - def _run_preprocess_on_df(self, df: pd.DataFrame, processors: list[Processor]) -> pd.DataFrame: - """Run preprocess() on given processors.""" - for processor in processors: - try: - df = processor.preprocess(df) - except Exception as e: - raise DatasetProcessingError(f"🛑 Failed in preprocess for processor {processor.name}: {e}") from e - return df - - def _save_preprocessed_seed(self, df: pd.DataFrame) -> None: - """Write preprocessed seed to disk and update URI.""" - preprocessed_path = self.artifact_storage.base_dataset_path / "preprocessed_seed.parquet" - self.artifact_storage.mkdir_if_needed(self.artifact_storage.base_dataset_path) - df.to_parquet(preprocessed_path, index=False) - self._resource_provider.preprocessed_seed_uri = str(preprocessed_path) - - def _cleanup_preprocessed_seed(self) -> None: - """Remove preprocessed seed file and reset URI.""" - if self._resource_provider.preprocessed_seed_uri is not None: - preprocessed_path = Path(self._resource_provider.preprocessed_seed_uri) - if preprocessed_path.exists(): - preprocessed_path.unlink() - self._resource_provider.preprocessed_seed_uri = None - - def _apply_pre_batch_processors(self) -> None: - """Get batch, run PRE_BATCH processors, update batch manager.""" - processors = [p for p in self._processors if p.implements("process_before_batch")] - if not processors: - return - - df = self.batch_manager.get_current_batch(as_dataframe=True) - original_len = len(df) - for processor in processors: - try: - df = processor.process_before_batch(df) - except Exception as e: - raise DatasetProcessingError( - f"🛑 Failed in process_before_batch for processor {processor.name}: {e}" - ) from e - if len(df) != original_len: - logger.warning( - f"âš ī¸ PRE_BATCH processors changed row count from {original_len} to {len(df)}. " - "This may cause unexpected behavior in downstream generators." 
- ) - self.batch_manager.update_records(df.to_dict(orient="records")) - - def _run_post_batch_processors(self, dataframe: pd.DataFrame, current_batch_number: int | None) -> pd.DataFrame: - """Run process_after_batch() on processors that implement it.""" - for processor in self._processors: - if not processor.implements("process_after_batch"): - continue - try: - dataframe = processor.process_after_batch(dataframe, current_batch_number=current_batch_number) - except Exception as e: - raise DatasetProcessingError( - f"🛑 Failed in process_after_batch for processor {processor.name}: {e}" - ) from e - return dataframe - - def _run_post_generation_processors_on_df(self, df: pd.DataFrame) -> pd.DataFrame: - """Run postprocess() on processors that implement it.""" - for processor in self._processors: - if not processor.implements("postprocess"): - continue - try: - df = processor.postprocess(df) - except Exception as e: - raise DatasetProcessingError(f"🛑 Failed in postprocess for processor {processor.name}: {e}") from e - return df - - def _run_post_generation_processors(self) -> None: - """Run postprocess() on processors that implement it.""" - processors = [p for p in self._processors if p.implements("postprocess")] - if not processors: - return - - logger.info("âŗ Running postprocess on final dataset...") - original_df = self.artifact_storage.load_dataset() - - df = self._run_post_generation_processors_on_df(original_df) - - # Always rewrite since processors may modify values - if self.artifact_storage.final_dataset_path.exists(): - shutil.rmtree(self.artifact_storage.final_dataset_path) - self.artifact_storage.write_batch_to_parquet_file( - batch_number=0, - dataframe=df, - batch_stage=BatchStage.FINAL_RESULT, - ) - logger.info(f"✅ Postprocess complete. Final dataset has {len(df)} rows.") - def _worker_error_callback(self, exc: Exception, *, context: dict | None = None) -> None: """If a worker fails, we can handle the exception here.""" logger.warning( diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py new file mode 100644 index 00000000..d20d3f4e --- /dev/null +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py @@ -0,0 +1,121 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import logging
+import shutil
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+from data_designer.engine.dataset_builders.artifact_storage import BatchStage
+from data_designer.engine.dataset_builders.errors import DatasetProcessingError
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+    from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage
+    from data_designer.engine.dataset_builders.utils.dataset_batch_manager import DatasetBatchManager
+    from data_designer.engine.processing.processors.base import Processor
+    from data_designer.engine.resources.resource_provider import ResourceProvider
+
+logger = logging.getLogger(__name__)
+
+
+class ProcessorRunner:
+    """Runs processor callbacks at various stages of dataset generation."""
+
+    def __init__(
+        self,
+        processors: list[Processor],
+        resource_provider: ResourceProvider,
+        artifact_storage: ArtifactStorage,
+    ):
+        self._processors = processors
+        self._resource_provider = resource_provider
+        self._artifact_storage = artifact_storage
+
+    def has_processors_for(self, method_name: str) -> bool:
+        """Check if any processor implements the given method."""
+        return any(p.implements(method_name) for p in self._processors)
+
+    def _run_stage(self, df: pd.DataFrame, method_name: str, **kwargs) -> pd.DataFrame:
+        """Run a processor callback on all processors that implement it."""
+        for processor in self._processors:
+            if not processor.implements(method_name):
+                continue
+            try:
+                df = getattr(processor, method_name)(df, **kwargs)
+            except Exception as e:
+                raise DatasetProcessingError(f"🛑 Failed in {method_name} for {processor.name}: {e}") from e
+        return df
+
+    def run_preprocess(self) -> None:
+        """Load seed data, run preprocess(), save preprocessed seed."""
+        if not self.has_processors_for("preprocess"):
+            return
+        if self._resource_provider.seed_reader is None:
+            return
+
+        logger.info("⏳ Running preprocess on seed data...")
+        seed_reader = self._resource_provider.seed_reader
+        conn = seed_reader.create_duckdb_connection()
+        df = conn.execute(f"SELECT * FROM '{seed_reader.get_dataset_uri()}'").fetchdf()
+        original_len = len(df)
+
+        df = self._run_stage(df, "preprocess")
+
+        preprocessed_path = self._artifact_storage.base_dataset_path / "preprocessed_seed.parquet"
+        self._artifact_storage.mkdir_if_needed(self._artifact_storage.base_dataset_path)
+        df.to_parquet(preprocessed_path, index=False)
+        self._resource_provider.preprocessed_seed_uri = str(preprocessed_path)
+        logger.info(f"✅ Preprocess complete. Seed data has {len(df)} rows (was {original_len}).")
+
+    def cleanup_preprocessed_seed(self) -> None:
+        """Remove preprocessed seed file and reset URI."""
+        if self._resource_provider.preprocessed_seed_uri is not None:
+            preprocessed_path = Path(self._resource_provider.preprocessed_seed_uri)
+            if preprocessed_path.exists():
+                preprocessed_path.unlink()
+            self._resource_provider.preprocessed_seed_uri = None
+
+    def run_pre_batch(self, batch_manager: DatasetBatchManager) -> None:
+        """Run process_before_batch() on current batch."""
+        if not self.has_processors_for("process_before_batch"):
+            return
+
+        df = batch_manager.get_current_batch(as_dataframe=True)
+        original_len = len(df)
+        df = self._run_stage(df, "process_before_batch")
+        if len(df) != original_len:
+            logger.warning(
+                f"⚠️ PRE_BATCH processors changed row count from {original_len} to {len(df)}. "
+                "This may cause unexpected behavior in downstream generators."
+            )
+        batch_manager.update_records(df.to_dict(orient="records"))
+
+    def run_post_batch(self, df: pd.DataFrame, current_batch_number: int | None) -> pd.DataFrame:
+        """Run process_after_batch() on processors that implement it."""
+        return self._run_stage(df, "process_after_batch", current_batch_number=current_batch_number)
+
+    def run_postprocess_on_df(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Run postprocess() on a DataFrame (for preview mode)."""
+        return self._run_stage(df, "postprocess")
+
+    def run_postprocess(self) -> None:
+        """Load final dataset, run postprocess(), rewrite dataset."""
+        if not self.has_processors_for("postprocess"):
+            return
+
+        logger.info("⏳ Running postprocess on final dataset...")
+        df = self._artifact_storage.load_dataset()
+        df = self._run_stage(df, "postprocess")
+
+        if self._artifact_storage.final_dataset_path.exists():
+            shutil.rmtree(self._artifact_storage.final_dataset_path)
+        self._artifact_storage.write_batch_to_parquet_file(
+            batch_number=0,
+            dataframe=df,
+            batch_stage=BatchStage.FINAL_RESULT,
+        )
+        logger.info(f"✅ Postprocess complete. Final dataset has {len(df)} rows.")
diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py
index b8854770..bfc3d329 100644
--- a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py
+++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py
@@ -433,7 +433,7 @@ def test_run_pre_generation_processors_filters_seed_data(stub_resource_provider,
     mock_processor.preprocess.side_effect = lambda df: df[df["seed_id"] > 2].reset_index(drop=True)
 
     builder_with_seed._processors = [mock_processor]
-    builder_with_seed._run_pre_generation_processors()
+    builder_with_seed._processor_runner.run_preprocess()
 
     mock_processor.preprocess.assert_called_once()
 
@@ -464,7 +464,7 @@ def test_run_post_generation_processors_modifies_final_dataset(stub_resource_pro
     )
     builder._processors = [mock_processor]
 
-    builder._run_post_generation_processors()
+    builder._processor_runner.run_postprocess()
 
     mock_processor.postprocess.assert_called_once()
 
@@ -487,7 +487,7 @@ def test_run_pre_generation_processors_skips_when_no_seed_reader(stub_resource_p
     )
     builder._processors = [mock_processor]
 
-    builder._run_pre_generation_processors()
+    builder._processor_runner.run_preprocess()
 
     mock_processor.preprocess.assert_not_called()
 
@@ -531,7 +531,7 @@ def test_processor_exception_in_preprocess_raises_error(builder_with_seed):
     builder_with_seed._processors = [mock_processor]
 
     with pytest.raises(DatasetProcessingError, match="Failed in preprocess"):
-        builder_with_seed._run_pre_generation_processors()
+        builder_with_seed._processor_runner.run_preprocess()
 
 
 def test_processor_exception_in_process_after_batch_raises_error(stub_resource_provider, stub_model_configs):
@@ -549,7 +549,7 @@ def test_processor_exception_in_process_after_batch_raises_error(stub_resource_p
     builder._processors = [mock_processor]
 
     with pytest.raises(DatasetProcessingError, match="Failed in process_after_batch"):
-        builder._run_post_batch_processors(pd.DataFrame({"id": [1, 2, 3]}), current_batch_number=0)
+        builder._processor_runner.run_post_batch(pd.DataFrame({"id": [1, 2, 3]}), current_batch_number=0)
 
 
 def test_processor_with_no_implemented_stages_is_skipped(builder_with_seed):
@@ -582,7 +582,7 @@ def test_multiple_processors_run_in_definition_order(builder_with_seed):
processor_c.preprocess.side_effect = lambda df: (call_order.append("c"), df)[1] builder_with_seed._processors = [processor_a, processor_b, processor_c] - builder_with_seed._run_pre_generation_processors() + builder_with_seed._processor_runner.run_preprocess() assert call_order == ["a", "b", "c"] From b25f83b893e9a41fc95f392083245a2d37162abb Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Mon, 9 Feb 2026 11:17:12 -0300 Subject: [PATCH 06/14] address PR review feedback on processor refactor - Add ProcessorStage enum to replace raw strings in processor_runner - Rename _processors property to public processors + set_processor_runner - Fix docstring to include process_before_batch callback - Add info box to processors docs about full schema in batches - Add clarifying comment about pre-batch processing location Co-authored-by: Cursor --- docs/concepts/processors.md | 3 ++ .../src/data_designer/config/processors.py | 2 +- .../dataset_builders/column_wise_builder.py | 9 ++--- .../utils/processor_runner.py | 40 ++++++++++++------- .../test_column_wise_builder.py | 22 +++++----- 5 files changed, 44 insertions(+), 32 deletions(-) diff --git a/docs/concepts/processors.md b/docs/concepts/processors.md index 5a2b340a..82fadb48 100644 --- a/docs/concepts/processors.md +++ b/docs/concepts/processors.md @@ -22,6 +22,9 @@ Processors can run at four stages, determined by which callback methods they imp | Post-batch | After each batch completes | `process_after_batch()` | Drop columns, transform schema per batch | | Post-generation | Once, on final dataset after all batches | `postprocess()` | Deduplicate, aggregate statistics, final cleanup | +!!! info "Full Schema Available in Every Batch" + Each batch carries the full dataset schema throughout generation. Column dropping and other schema changes happen post-generation, so all columns remain accessible to processors and generators during the build. + A processor can implement any combination of these callbacks. The built-in processors use `process_after_batch()` by default. ## Processor Types diff --git a/packages/data-designer-config/src/data_designer/config/processors.py b/packages/data-designer-config/src/data_designer/config/processors.py index 21d94b78..0c14fade 100644 --- a/packages/data-designer-config/src/data_designer/config/processors.py +++ b/packages/data-designer-config/src/data_designer/config/processors.py @@ -34,7 +34,7 @@ class ProcessorConfig(ConfigBase, ABC): pipeline. They can modify, reshape, or augment the dataset. The processor implementation determines which stages it handles by overriding - the appropriate callback methods (preprocess, process_after_batch, postprocess). + the appropriate callback methods (preprocess, process_before_batch, process_after_batch, postprocess). 
Attributes: name: Unique name of the processor, used to identify the processor in results diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py index 67f27197..14eb7a71 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py @@ -80,13 +80,11 @@ def artifact_storage(self) -> ArtifactStorage: return self._resource_provider.artifact_storage @property - def _processors(self) -> list[Processor]: - """Expose processors for test compatibility.""" + def processors(self) -> list[Processor]: return self._processor_runner._processors - @_processors.setter - def _processors(self, processors: list[Processor]) -> None: - """Allow setting processors for test compatibility.""" + def set_processor_runner(self, processors: list[Processor]) -> None: + """Replace the processor runner with a new one using the given processors.""" self._processor_runner = ProcessorRunner( processors=processors, resource_provider=self._resource_provider, @@ -125,6 +123,7 @@ def build( self.batch_manager.start(num_records=num_records, buffer_size=buffer_size) for batch_idx in range(self.batch_manager.num_batches): logger.info(f"âŗ Processing batch {batch_idx + 1} of {self.batch_manager.num_batches}") + # Note: pre-batch processing runs inside _run_batch, after seed columns are populated self._run_batch(generators, batch_mode="batch", group_id=group_id) df_batch = self.batch_manager.get_current_batch(as_dataframe=True) df_batch = self._processor_runner.run_post_batch(df_batch, current_batch_number=batch_idx) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py index d20d3f4e..a3ba5485 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py @@ -5,6 +5,7 @@ import logging import shutil +from enum import Enum from pathlib import Path from typing import TYPE_CHECKING @@ -22,6 +23,15 @@ logger = logging.getLogger(__name__) +class ProcessorStage(str, Enum): + """Valid processor callback stages.""" + + PREPROCESS = "preprocess" + PRE_BATCH = "process_before_batch" + POST_BATCH = "process_after_batch" + POSTPROCESS = "postprocess" + + class ProcessorRunner: """Runs processor callbacks at various stages of dataset generation.""" @@ -35,24 +45,24 @@ def __init__( self._resource_provider = resource_provider self._artifact_storage = artifact_storage - def has_processors_for(self, method_name: str) -> bool: - """Check if any processor implements the given method.""" - return any(p.implements(method_name) for p in self._processors) + def has_processors_for(self, stage: ProcessorStage) -> bool: + """Check if any processor implements the given stage.""" + return any(p.implements(stage.value) for p in self._processors) - def _run_stage(self, df: pd.DataFrame, method_name: str, **kwargs) -> pd.DataFrame: + def _run_stage(self, df: pd.DataFrame, stage: ProcessorStage, **kwargs) -> pd.DataFrame: """Run a processor callback on all processors that implement it.""" for processor in self._processors: - if not processor.implements(method_name): 
+ if not processor.implements(stage.value): continue try: - df = getattr(processor, method_name)(df, **kwargs) + df = getattr(processor, stage.value)(df, **kwargs) except Exception as e: - raise DatasetProcessingError(f"🛑 Failed in {method_name} for {processor.name}: {e}") from e + raise DatasetProcessingError(f"🛑 Failed in {stage.value} for {processor.name}: {e}") from e return df def run_preprocess(self) -> None: """Load seed data, run preprocess(), save preprocessed seed.""" - if not self.has_processors_for("preprocess"): + if not self.has_processors_for(ProcessorStage.PREPROCESS): return if self._resource_provider.seed_reader is None: return @@ -63,7 +73,7 @@ def run_preprocess(self) -> None: df = conn.execute(f"SELECT * FROM '{seed_reader.get_dataset_uri()}'").fetchdf() original_len = len(df) - df = self._run_stage(df, "preprocess") + df = self._run_stage(df, ProcessorStage.PREPROCESS) preprocessed_path = self._artifact_storage.base_dataset_path / "preprocessed_seed.parquet" self._artifact_storage.mkdir_if_needed(self._artifact_storage.base_dataset_path) @@ -81,12 +91,12 @@ def cleanup_preprocessed_seed(self) -> None: def run_pre_batch(self, batch_manager: DatasetBatchManager) -> None: """Run process_before_batch() on current batch.""" - if not self.has_processors_for("process_before_batch"): + if not self.has_processors_for(ProcessorStage.PRE_BATCH): return df = batch_manager.get_current_batch(as_dataframe=True) original_len = len(df) - df = self._run_stage(df, "process_before_batch") + df = self._run_stage(df, ProcessorStage.PRE_BATCH) if len(df) != original_len: logger.warning( f"âš ī¸ PRE_BATCH processors changed row count from {original_len} to {len(df)}. " @@ -96,20 +106,20 @@ def run_pre_batch(self, batch_manager: DatasetBatchManager) -> None: def run_post_batch(self, df: pd.DataFrame, current_batch_number: int | None) -> pd.DataFrame: """Run process_after_batch() on processors that implement it.""" - return self._run_stage(df, "process_after_batch", current_batch_number=current_batch_number) + return self._run_stage(df, ProcessorStage.POST_BATCH, current_batch_number=current_batch_number) def run_postprocess_on_df(self, df: pd.DataFrame) -> pd.DataFrame: """Run postprocess() on a DataFrame (for preview mode).""" - return self._run_stage(df, "postprocess") + return self._run_stage(df, ProcessorStage.POSTPROCESS) def run_postprocess(self) -> None: """Load final dataset, run postprocess(), rewrite dataset.""" - if not self.has_processors_for("postprocess"): + if not self.has_processors_for(ProcessorStage.POSTPROCESS): return logger.info("âŗ Running postprocess on final dataset...") df = self._artifact_storage.load_dataset() - df = self._run_stage(df, "postprocess") + df = self._run_stage(df, ProcessorStage.POSTPROCESS) if self._artifact_storage.final_dataset_path.exists(): shutil.rmtree(self._artifact_storage.final_dataset_path) diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py index bfc3d329..b58bd560 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py @@ -222,7 +222,7 @@ def test_column_wise_dataset_builder_build_method_basic_flow( stub_batch_manager.iter_current_batch.return_value = [(0, {"test": "data"})] stub_column_wise_builder.batch_manager = stub_batch_manager - 
stub_column_wise_builder._processors = [] # No processors for basic flow test + stub_column_wise_builder.set_processor_runner([]) # No processors for basic flow test result_path = stub_column_wise_builder.build(num_records=100) @@ -272,7 +272,7 @@ def test_column_wise_dataset_builder_validate_column_configs( def test_column_wise_dataset_builder_initialize_processors(stub_column_wise_builder): - processors = stub_column_wise_builder._processors + processors = stub_column_wise_builder.processors assert isinstance(processors, list) assert len(processors) == 1 assert processors[0].config.column_names == ["column_to_drop"] @@ -432,7 +432,7 @@ def test_run_pre_generation_processors_filters_seed_data(stub_resource_provider, mock_processor = create_mock_processor("filter_processor", ["preprocess"]) mock_processor.preprocess.side_effect = lambda df: df[df["seed_id"] > 2].reset_index(drop=True) - builder_with_seed._processors = [mock_processor] + builder_with_seed.set_processor_runner([mock_processor]) builder_with_seed._processor_runner.run_preprocess() mock_processor.preprocess.assert_called_once() @@ -462,7 +462,7 @@ def test_run_post_generation_processors_modifies_final_dataset(stub_resource_pro data_designer_config=config_builder.build(), resource_provider=stub_resource_provider, ) - builder._processors = [mock_processor] + builder.set_processor_runner([mock_processor]) builder._processor_runner.run_postprocess() @@ -485,7 +485,7 @@ def test_run_pre_generation_processors_skips_when_no_seed_reader(stub_resource_p data_designer_config=config_builder.build(), resource_provider=stub_resource_provider, ) - builder._processors = [mock_processor] + builder.set_processor_runner([mock_processor]) builder._processor_runner.run_preprocess() @@ -504,7 +504,7 @@ def test_all_processor_stages_run_in_order(builder_with_seed, mode): mock_processor.process_after_batch.side_effect = lambda df, **kw: (call_order.append("process_after_batch"), df)[1] mock_processor.postprocess.side_effect = lambda df: (call_order.append("postprocess"), df)[1] - builder_with_seed._processors = [mock_processor] + builder_with_seed.set_processor_runner([mock_processor]) if mode == "preview": raw_dataset = builder_with_seed.build_preview(num_records=3) @@ -528,7 +528,7 @@ def test_processor_exception_in_preprocess_raises_error(builder_with_seed): mock_processor = create_mock_processor("failing_processor", ["preprocess"]) mock_processor.preprocess.side_effect = ValueError("Preprocessing failed") - builder_with_seed._processors = [mock_processor] + builder_with_seed.set_processor_runner([mock_processor]) with pytest.raises(DatasetProcessingError, match="Failed in preprocess"): builder_with_seed._processor_runner.run_preprocess() @@ -546,7 +546,7 @@ def test_processor_exception_in_process_after_batch_raises_error(stub_resource_p data_designer_config=config_builder.build(), resource_provider=stub_resource_provider, ) - builder._processors = [mock_processor] + builder.set_processor_runner([mock_processor]) with pytest.raises(DatasetProcessingError, match="Failed in process_after_batch"): builder._processor_runner.run_post_batch(pd.DataFrame({"id": [1, 2, 3]}), current_batch_number=0) @@ -556,7 +556,7 @@ def test_processor_with_no_implemented_stages_is_skipped(builder_with_seed): """Test that a processor implementing no stages doesn't cause errors.""" mock_processor = create_mock_processor("noop_processor", []) - builder_with_seed._processors = [mock_processor] + builder_with_seed.set_processor_runner([mock_processor]) # Should complete 
without errors result = builder_with_seed.build_preview(num_records=3) @@ -581,7 +581,7 @@ def test_multiple_processors_run_in_definition_order(builder_with_seed): processor_c = create_mock_processor("processor_c", ["preprocess"]) processor_c.preprocess.side_effect = lambda df: (call_order.append("c"), df)[1] - builder_with_seed._processors = [processor_a, processor_b, processor_c] + builder_with_seed.set_processor_runner([processor_a, processor_b, processor_c]) builder_with_seed._processor_runner.run_preprocess() assert call_order == ["a", "b", "c"] @@ -598,7 +598,7 @@ def test_process_preview_with_empty_dataframe(stub_resource_provider, stub_model data_designer_config=config_builder.build(), resource_provider=stub_resource_provider, ) - builder._processors = [mock_processor] + builder.set_processor_runner([mock_processor]) empty_df = pd.DataFrame() result = builder.process_preview(empty_df) From 50749818c2db0c3dfdee104bf0026ff02025ce5f Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Mon, 9 Feb 2026 11:27:59 -0300 Subject: [PATCH 07/14] reword docs info box about schema availability during generation Co-authored-by: Cursor --- docs/concepts/processors.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/concepts/processors.md b/docs/concepts/processors.md index 82fadb48..442353c9 100644 --- a/docs/concepts/processors.md +++ b/docs/concepts/processors.md @@ -22,8 +22,8 @@ Processors can run at four stages, determined by which callback methods they imp | Post-batch | After each batch completes | `process_after_batch()` | Drop columns, transform schema per batch | | Post-generation | Once, on final dataset after all batches | `postprocess()` | Deduplicate, aggregate statistics, final cleanup | -!!! info "Full Schema Available in Every Batch" - Each batch carries the full dataset schema throughout generation. Column dropping and other schema changes happen post-generation, so all columns remain accessible to processors and generators during the build. +!!! info "Full Schema Available During Generation" + Each batch carries the full dataset schema during generation. Post-batch schema changes such as column dropping only alter past batches, so all columns remain accessible to generators while building follow-up batches. A processor can implement any combination of these callbacks. The built-in processors use `process_after_batch()` by default. From 56b29adbd55091112cf6567d4fe27962de054467 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Mon, 9 Feb 2026 11:37:22 -0300 Subject: [PATCH 08/14] support row-count changes in PRE_BATCH processors Add replace_records() to DatasetBatchManager that replaces the buffer without requiring matching length. Use it in run_pre_batch() so processors that filter or expand rows don't crash. 
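For illustration, a minimal sketch of the kind of PRE_BATCH processor this change enables (the class name and the "prompt" column are hypothetical; it assumes a custom processor only needs to override the callback it uses, as the built-in processors do):

    import pandas as pd

    from data_designer.engine.processing.processors.base import Processor


    class KeepNonEmptyPromptsProcessor(Processor):
        """Drops seed rows with empty prompts before dependent columns are generated."""

        def process_before_batch(self, data: pd.DataFrame) -> pd.DataFrame:
            # The returned frame may have fewer rows than the input; run_pre_batch()
            # now stores it via replace_records() instead of update_records().
            return data[data["prompt"].str.len() > 0].reset_index(drop=True)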
--- .../engine/dataset_builders/utils/dataset_batch_manager.py | 6 ++++++ .../engine/dataset_builders/utils/processor_runner.py | 7 ++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py index e277088a..663afc77 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py @@ -198,3 +198,9 @@ def update_records(self, records: list[dict]) -> None: f"the number of records in the buffer ({len(self._buffer)})." ) self._buffer = records + + def replace_records(self, records: list[dict]) -> None: + """Replace the buffer contents, updating the current batch size.""" + self._buffer = records + if self._num_records_list is not None: + self._num_records_list[self._current_batch_number] = len(records) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py index a3ba5485..aac7c58d 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py @@ -98,11 +98,8 @@ def run_pre_batch(self, batch_manager: DatasetBatchManager) -> None: original_len = len(df) df = self._run_stage(df, ProcessorStage.PRE_BATCH) if len(df) != original_len: - logger.warning( - f"âš ī¸ PRE_BATCH processors changed row count from {original_len} to {len(df)}. " - "This may cause unexpected behavior in downstream generators." 
- ) - batch_manager.update_records(df.to_dict(orient="records")) + logger.info(f"â„šī¸ PRE_BATCH processors changed row count from {original_len} to {len(df)}.") + batch_manager.replace_records(df.to_dict(orient="records")) def run_post_batch(self, df: pd.DataFrame, current_batch_number: int | None) -> pd.DataFrame: """Run process_after_batch() on processors that implement it.""" From fec7af5f9a91d60db2ac91289d17d69cd5f2b676 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Mon, 9 Feb 2026 11:51:28 -0300 Subject: [PATCH 09/14] fix preview artifact output and harden processors API - Restore schema transform preview write so processor artifacts are available in preview mode (read back by data_designer.py) - Return tuple from processors property to prevent mutation --- .../engine/dataset_builders/column_wise_builder.py | 4 ++-- .../engine/processing/processors/schema_transform.py | 6 ++++++ .../dataset_builders/test_column_wise_builder.py | 2 +- .../processing/processors/test_schema_transform.py | 11 +++++++---- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py index 14eb7a71..9405d111 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py @@ -80,8 +80,8 @@ def artifact_storage(self) -> ArtifactStorage: return self._resource_provider.artifact_storage @property - def processors(self) -> list[Processor]: - return self._processor_runner._processors + def processors(self) -> tuple[Processor, ...]: + return tuple(self._processor_runner._processors) def set_processor_runner(self, processors: list[Processor]) -> None: """Replace the processor runner with a new one using the given processors.""" diff --git a/packages/data-designer-engine/src/data_designer/engine/processing/processors/schema_transform.py b/packages/data-designer-engine/src/data_designer/engine/processing/processors/schema_transform.py index e71267c1..349afddc 100644 --- a/packages/data-designer-engine/src/data_designer/engine/processing/processors/schema_transform.py +++ b/packages/data-designer-engine/src/data_designer/engine/processing/processors/schema_transform.py @@ -56,6 +56,12 @@ def process_after_batch(self, data: pd.DataFrame, *, current_batch_number: int | batch_stage=BatchStage.PROCESSORS_OUTPUTS, subfolder=self.config.name, ) + else: + self.artifact_storage.write_parquet_file( + parquet_file_name=f"{self.config.name}.parquet", + dataframe=formatted_data, + batch_stage=BatchStage.PROCESSORS_OUTPUTS, + ) return data def _transform(self, data: pd.DataFrame) -> pd.DataFrame: diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py index b58bd560..cae82614 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py @@ -273,7 +273,7 @@ def test_column_wise_dataset_builder_validate_column_configs( def test_column_wise_dataset_builder_initialize_processors(stub_column_wise_builder): processors = stub_column_wise_builder.processors - assert isinstance(processors, list) + assert isinstance(processors, tuple) assert 
len(processors) == 1 assert processors[0].config.column_names == ["column_to_drop"] diff --git a/packages/data-designer-engine/tests/engine/processing/processors/test_schema_transform.py b/packages/data-designer-engine/tests/engine/processing/processors/test_schema_transform.py index b481b1d0..69b5b357 100644 --- a/packages/data-designer-engine/tests/engine/processing/processors/test_schema_transform.py +++ b/packages/data-designer-engine/tests/engine/processing/processors/test_schema_transform.py @@ -178,14 +178,17 @@ def test_process_after_batch_with_mixed_special_characters(stub_processor: Schem assert output["text"] == 'She replied: "I\'m not sure about that\\nLet me think..."' -def test_process_after_batch_preview_mode_does_not_write( +def test_process_after_batch_preview_mode_writes_single_file( stub_processor: SchemaTransformProcessor, stub_sample_dataframe: pd.DataFrame ) -> None: - """In preview mode (current_batch_number=None), no parquet file is written.""" + """In preview mode (current_batch_number=None), transformed output is written as a single file.""" + stub_processor.artifact_storage.write_parquet_file = Mock() result = stub_processor.process_after_batch(stub_sample_dataframe, current_batch_number=None) - # Original dataframe should be returned (formatted data is artifact-only) pd.testing.assert_frame_equal(result, stub_sample_dataframe) - # No file should be written stub_processor.artifact_storage.write_batch_to_parquet_file.assert_not_called() + stub_processor.artifact_storage.write_parquet_file.assert_called_once() + call_args = stub_processor.artifact_storage.write_parquet_file.call_args + assert call_args.kwargs["parquet_file_name"] == "test_schema_transform.parquet" + assert call_args.kwargs["batch_stage"] == BatchStage.PROCESSORS_OUTPUTS From c1515507d10be2cadd6b53cd3f10305e428cfd81 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Wed, 11 Feb 2026 09:48:48 -0300 Subject: [PATCH 10/14] remove preprocessor stage and rename postprocess to process_after_generation - Remove preprocess() callback, run_preprocess(), cleanup_preprocessed_seed(), preprocessed_seed_uri, and all related plumbing - Rename postprocess() to process_after_generation() across base class, ProcessorRunner, builder, config enums, docs, and tests - Drop resource_provider parameter from ProcessorRunner (no longer needed) --- docs/concepts/processors.md | 5 +- .../data_designer/config/dataset_builders.py | 3 +- .../src/data_designer/config/processors.py | 2 +- .../generators/seed_dataset.py | 6 +- .../dataset_builders/column_wise_builder.py | 11 +-- .../utils/processor_runner.py | 54 ++-------- .../engine/processing/processors/base.py | 20 +--- .../engine/resources/resource_provider.py | 1 - .../tests/engine/conftest.py | 1 - .../test_column_wise_builder.py | 98 +++++-------------- 10 files changed, 43 insertions(+), 158 deletions(-) diff --git a/docs/concepts/processors.md b/docs/concepts/processors.md index 442353c9..26dbd2eb 100644 --- a/docs/concepts/processors.md +++ b/docs/concepts/processors.md @@ -13,14 +13,13 @@ Each processor: - Applies its transformation - Passes the result to the next processor (or to output) -Processors can run at four stages, determined by which callback methods they implement: +Processors can run at three stages, determined by which callback methods they implement: | Stage | When it runs | Callback method | Use cases | |-------|--------------|-----------------|-----------| -| Pre-generation | Once, on full seed data before batching | `preprocess()` | Filter seed data, 
validate inputs, normalize data | | Pre-batch | After seed columns, before dependent columns | `process_before_batch()` | Transform seed data before other columns are generated | | Post-batch | After each batch completes | `process_after_batch()` | Drop columns, transform schema per batch | -| Post-generation | Once, on final dataset after all batches | `postprocess()` | Deduplicate, aggregate statistics, final cleanup | +| After generation | Once, on final dataset after all batches | `process_after_generation()` | Deduplicate, aggregate statistics, final cleanup | !!! info "Full Schema Available During Generation" Each batch carries the full dataset schema during generation. Post-batch schema changes such as column dropping only alter past batches, so all columns remain accessible to generators while building follow-up batches. diff --git a/packages/data-designer-config/src/data_designer/config/dataset_builders.py b/packages/data-designer-config/src/data_designer/config/dataset_builders.py index bbfbb2fb..67607388 100644 --- a/packages/data-designer-config/src/data_designer/config/dataset_builders.py +++ b/packages/data-designer-config/src/data_designer/config/dataset_builders.py @@ -9,5 +9,4 @@ class BuildStage(str, Enum): PRE_BATCH = "pre_batch" POST_BATCH = "post_batch" - PRE_GENERATION = "pre_generation" - POST_GENERATION = "post_generation" + AFTER_GENERATION = "after_generation" diff --git a/packages/data-designer-config/src/data_designer/config/processors.py b/packages/data-designer-config/src/data_designer/config/processors.py index 0c14fade..0435457b 100644 --- a/packages/data-designer-config/src/data_designer/config/processors.py +++ b/packages/data-designer-config/src/data_designer/config/processors.py @@ -34,7 +34,7 @@ class ProcessorConfig(ConfigBase, ABC): pipeline. They can modify, reshape, or augment the dataset. The processor implementation determines which stages it handles by overriding - the appropriate callback methods (preprocess, process_before_batch, process_after_batch, postprocess). + the appropriate callback methods (process_before_batch, process_after_batch, process_after_generation). 
Attributes: name: Unique name of the processor, used to identify the processor in results diff --git a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/seed_dataset.py b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/seed_dataset.py index 94b206a1..22c8b1ec 100644 --- a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/seed_dataset.py +++ b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/seed_dataset.py @@ -53,11 +53,7 @@ def _initialize(self) -> None: self._num_records_sampled = 0 self._batch_reader = None self._df_remaining = None - # Use preprocessed seed if available, otherwise use original - if self.resource_provider.preprocessed_seed_uri is not None: - self._dataset_uri = self.resource_provider.preprocessed_seed_uri - else: - self._dataset_uri = self.resource_provider.seed_reader.get_dataset_uri() + self._dataset_uri = self.resource_provider.seed_reader.get_dataset_uri() self._seed_dataset_size = self.duckdb_conn.execute(f"SELECT COUNT(*) FROM '{self._dataset_uri}'").fetchone()[0] self._index_range = self._resolve_index_range() diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py index 9405d111..121d89bb 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py @@ -70,7 +70,6 @@ def __init__( processors = self._initialize_processors(self._data_designer_config.processors or []) self._processor_runner = ProcessorRunner( processors=processors, - resource_provider=resource_provider, artifact_storage=resource_provider.artifact_storage, ) self._validate_column_configs() @@ -87,7 +86,6 @@ def set_processor_runner(self, processors: list[Processor]) -> None: """Replace the processor runner with a new one using the given processors.""" self._processor_runner = ProcessorRunner( processors=processors, - resource_provider=self._resource_provider, artifact_storage=self.artifact_storage, ) @@ -113,7 +111,6 @@ def build( ) -> Path: self._run_model_health_check_if_needed() self._run_mcp_tool_check_if_needed() - self._processor_runner.run_preprocess() self._write_builder_config() generators = self._initialize_generators() start_time = time.perf_counter() @@ -130,7 +127,7 @@ def build( self._write_processed_batch(df_batch) self.batch_manager.finish_batch(on_batch_complete) self.batch_manager.finish() - self._processor_runner.run_postprocess() + self._processor_runner.run_after_generation() self._resource_provider.model_registry.log_model_usage(time.perf_counter() - start_time) @@ -139,7 +136,6 @@ def build( def build_preview(self, *, num_records: int) -> pd.DataFrame: self._run_model_health_check_if_needed() self._run_mcp_tool_check_if_needed() - self._processor_runner.run_preprocess() generators = self._initialize_generators() group_id = uuid.uuid4().hex @@ -149,16 +145,13 @@ def build_preview(self, *, num_records: int) -> pd.DataFrame: dataset = self.batch_manager.get_current_batch(as_dataframe=True) self.batch_manager.reset() - # Clean up preprocessed seed file and reset URI to avoid affecting subsequent build() calls - self._processor_runner.cleanup_preprocessed_seed() - self._resource_provider.model_registry.log_model_usage(time.perf_counter() - start_time) 
return dataset def process_preview(self, dataset: pd.DataFrame) -> pd.DataFrame: df = self._processor_runner.run_post_batch(dataset.copy(), current_batch_number=None) - return self._processor_runner.run_postprocess_on_df(df) + return self._processor_runner.run_after_generation_on_df(df) def _initialize_generators(self) -> list[ColumnGenerator]: return [ diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py index aac7c58d..fb7e0ad5 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py @@ -6,7 +6,6 @@ import logging import shutil from enum import Enum -from pathlib import Path from typing import TYPE_CHECKING from data_designer.engine.dataset_builders.artifact_storage import BatchStage @@ -18,7 +17,6 @@ from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage from data_designer.engine.dataset_builders.utils.dataset_batch_manager import DatasetBatchManager from data_designer.engine.processing.processors.base import Processor - from data_designer.engine.resources.resource_provider import ResourceProvider logger = logging.getLogger(__name__) @@ -26,10 +24,9 @@ class ProcessorStage(str, Enum): """Valid processor callback stages.""" - PREPROCESS = "preprocess" PRE_BATCH = "process_before_batch" POST_BATCH = "process_after_batch" - POSTPROCESS = "postprocess" + AFTER_GENERATION = "process_after_generation" class ProcessorRunner: @@ -38,11 +35,9 @@ class ProcessorRunner: def __init__( self, processors: list[Processor], - resource_provider: ResourceProvider, artifact_storage: ArtifactStorage, ): self._processors = processors - self._resource_provider = resource_provider self._artifact_storage = artifact_storage def has_processors_for(self, stage: ProcessorStage) -> bool: @@ -60,35 +55,6 @@ def _run_stage(self, df: pd.DataFrame, stage: ProcessorStage, **kwargs) -> pd.Da raise DatasetProcessingError(f"🛑 Failed in {stage.value} for {processor.name}: {e}") from e return df - def run_preprocess(self) -> None: - """Load seed data, run preprocess(), save preprocessed seed.""" - if not self.has_processors_for(ProcessorStage.PREPROCESS): - return - if self._resource_provider.seed_reader is None: - return - - logger.info("âŗ Running preprocess on seed data...") - seed_reader = self._resource_provider.seed_reader - conn = seed_reader.create_duckdb_connection() - df = conn.execute(f"SELECT * FROM '{seed_reader.get_dataset_uri()}'").fetchdf() - original_len = len(df) - - df = self._run_stage(df, ProcessorStage.PREPROCESS) - - preprocessed_path = self._artifact_storage.base_dataset_path / "preprocessed_seed.parquet" - self._artifact_storage.mkdir_if_needed(self._artifact_storage.base_dataset_path) - df.to_parquet(preprocessed_path, index=False) - self._resource_provider.preprocessed_seed_uri = str(preprocessed_path) - logger.info(f"✅ Preprocess complete. 
Seed data has {len(df)} rows (was {original_len}).") - - def cleanup_preprocessed_seed(self) -> None: - """Remove preprocessed seed file and reset URI.""" - if self._resource_provider.preprocessed_seed_uri is not None: - preprocessed_path = Path(self._resource_provider.preprocessed_seed_uri) - if preprocessed_path.exists(): - preprocessed_path.unlink() - self._resource_provider.preprocessed_seed_uri = None - def run_pre_batch(self, batch_manager: DatasetBatchManager) -> None: """Run process_before_batch() on current batch.""" if not self.has_processors_for(ProcessorStage.PRE_BATCH): @@ -105,18 +71,18 @@ def run_post_batch(self, df: pd.DataFrame, current_batch_number: int | None) -> """Run process_after_batch() on processors that implement it.""" return self._run_stage(df, ProcessorStage.POST_BATCH, current_batch_number=current_batch_number) - def run_postprocess_on_df(self, df: pd.DataFrame) -> pd.DataFrame: - """Run postprocess() on a DataFrame (for preview mode).""" - return self._run_stage(df, ProcessorStage.POSTPROCESS) + def run_after_generation_on_df(self, df: pd.DataFrame) -> pd.DataFrame: + """Run process_after_generation() on a DataFrame (for preview mode).""" + return self._run_stage(df, ProcessorStage.AFTER_GENERATION) - def run_postprocess(self) -> None: - """Load final dataset, run postprocess(), rewrite dataset.""" - if not self.has_processors_for(ProcessorStage.POSTPROCESS): + def run_after_generation(self) -> None: + """Load final dataset, run process_after_generation(), rewrite dataset.""" + if not self.has_processors_for(ProcessorStage.AFTER_GENERATION): return - logger.info("âŗ Running postprocess on final dataset...") + logger.info("âŗ Running process_after_generation on final dataset...") df = self._artifact_storage.load_dataset() - df = self._run_stage(df, ProcessorStage.POSTPROCESS) + df = self._run_stage(df, ProcessorStage.AFTER_GENERATION) if self._artifact_storage.final_dataset_path.exists(): shutil.rmtree(self._artifact_storage.final_dataset_path) @@ -125,4 +91,4 @@ def run_postprocess(self) -> None: dataframe=df, batch_stage=BatchStage.FINAL_RESULT, ) - logger.info(f"✅ Postprocess complete. Final dataset has {len(df)} rows.") + logger.info(f"✅ process_after_generation complete. Final dataset has {len(df)} rows.") diff --git a/packages/data-designer-engine/src/data_designer/engine/processing/processors/base.py b/packages/data-designer-engine/src/data_designer/engine/processing/processors/base.py index b3fe7a1f..d6a32e91 100644 --- a/packages/data-designer-engine/src/data_designer/engine/processing/processors/base.py +++ b/packages/data-designer-engine/src/data_designer/engine/processing/processors/base.py @@ -19,24 +19,10 @@ def implements(self, method_name: str) -> bool: """Check if subclass overrides a callback method.""" return getattr(type(self), method_name) is not getattr(Processor, method_name) - def preprocess(self, data: DataT) -> DataT: - """Called at PRE_GENERATION stage on seed data before batching. - - Override to filter or transform seed data before generation begins. - - Args: - data: The full seed dataset. - - Returns: - Transformed seed dataset. - """ - return data - def process_before_batch(self, data: DataT) -> DataT: """Called at PRE_BATCH stage before each batch is generated. - Override to transform batch data before generation. Unlike preprocess, - this operates on in-memory batch data without disk I/O. + Override to transform batch data before generation begins. Args: data: The batch data before generation. 
@@ -60,8 +46,8 @@ def process_after_batch(self, data: DataT, *, current_batch_number: int | None) """ return data - def postprocess(self, data: DataT) -> DataT: - """Called at POST_GENERATION stage on the final combined dataset. + def process_after_generation(self, data: DataT) -> DataT: + """Called at AFTER_GENERATION stage on the final combined dataset. Override to transform the complete generated dataset. diff --git a/packages/data-designer-engine/src/data_designer/engine/resources/resource_provider.py b/packages/data-designer-engine/src/data_designer/engine/resources/resource_provider.py index 3d756a04..1fcb8e02 100644 --- a/packages/data-designer-engine/src/data_designer/engine/resources/resource_provider.py +++ b/packages/data-designer-engine/src/data_designer/engine/resources/resource_provider.py @@ -37,7 +37,6 @@ class ResourceProvider(ConfigBase): mcp_registry: MCPRegistry | None = None run_config: RunConfig = RunConfig() seed_reader: SeedReader | None = None - preprocessed_seed_uri: str | None = None def get_dataset_metadata(self) -> DatasetMetadata: """Get metadata about the dataset being generated. diff --git a/packages/data-designer-engine/tests/engine/conftest.py b/packages/data-designer-engine/tests/engine/conftest.py index 33d74a3f..b04580b9 100644 --- a/packages/data-designer-engine/tests/engine/conftest.py +++ b/packages/data-designer-engine/tests/engine/conftest.py @@ -45,7 +45,6 @@ def stub_resource_provider(tmp_path, stub_model_facade): mock_provider.seed_reader = Mock() mock_provider.seed_reader.get_column_names.return_value = [] mock_provider.run_config = RunConfig() - mock_provider.preprocessed_seed_uri = None return mock_provider diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py index cae82614..184d16c5 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py @@ -3,7 +3,6 @@ from __future__ import annotations -from pathlib import Path from typing import TYPE_CHECKING from unittest.mock import Mock, patch @@ -117,10 +116,9 @@ def create_mock_processor(name: str, stages: list[str]) -> Mock: mock_processor = Mock(spec=Processor) mock_processor.name = name mock_processor.implements.side_effect = lambda m: m in stages - mock_processor.preprocess.side_effect = lambda df: df mock_processor.process_before_batch.side_effect = lambda df: df mock_processor.process_after_batch.side_effect = lambda df, **kw: df - mock_processor.postprocess.side_effect = lambda df: df + mock_processor.process_after_generation.side_effect = lambda df: df return mock_processor @@ -427,33 +425,14 @@ def test_fan_out_with_threads_uses_early_shutdown_settings_from_resource_provide assert call_kwargs["disable_early_shutdown"] == disable_early_shutdown -def test_run_pre_generation_processors_filters_seed_data(stub_resource_provider, builder_with_seed, seed_data_setup): - """Test that PRE_GENERATION processors are applied to seed data before generation.""" - mock_processor = create_mock_processor("filter_processor", ["preprocess"]) - mock_processor.preprocess.side_effect = lambda df: df[df["seed_id"] > 2].reset_index(drop=True) - - builder_with_seed.set_processor_runner([mock_processor]) - builder_with_seed._processor_runner.run_preprocess() - - mock_processor.preprocess.assert_called_once() - - assert 
stub_resource_provider.preprocessed_seed_uri is not None - preprocessed_path = Path(stub_resource_provider.preprocessed_seed_uri) - assert preprocessed_path.exists() - - preprocessed_df = pd.read_parquet(preprocessed_path) - assert len(preprocessed_df) == 3 - assert list(preprocessed_df["seed_id"]) == [3, 4, 5] - - def test_run_post_generation_processors_modifies_final_dataset(stub_resource_provider, stub_model_configs): - """Test that postprocess callbacks are applied to the final dataset.""" + """Test that process_after_generation callbacks are applied to the final dataset.""" final_df = pd.DataFrame({"id": [1, 2, 3, 4, 5], "value": ["a", "b", "c", "d", "e"]}) stub_resource_provider.artifact_storage.mkdir_if_needed(stub_resource_provider.artifact_storage.final_dataset_path) final_df.to_parquet(stub_resource_provider.artifact_storage.final_dataset_path / "batch_00000.parquet", index=False) - mock_processor = create_mock_processor("dedup_processor", ["postprocess"]) - mock_processor.postprocess.side_effect = lambda df: df[df["id"] > 2] + mock_processor = create_mock_processor("dedup_processor", ["process_after_generation"]) + mock_processor.process_after_generation.side_effect = lambda df: df[df["id"] > 2] config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) config_builder.add_column(SamplerColumnConfig(name="id", sampler_type="uuid", params=UUIDSamplerParams())) @@ -464,45 +443,27 @@ def test_run_post_generation_processors_modifies_final_dataset(stub_resource_pro ) builder.set_processor_runner([mock_processor]) - builder._processor_runner.run_postprocess() + builder._processor_runner.run_after_generation() - mock_processor.postprocess.assert_called_once() + mock_processor.process_after_generation.assert_called_once() result_df = stub_resource_provider.artifact_storage.load_dataset() assert len(result_df) == 3 -def test_run_pre_generation_processors_skips_when_no_seed_reader(stub_resource_provider, stub_model_configs): - """Test that preprocess is skipped when no seed reader is configured.""" - stub_resource_provider.seed_reader = None - - mock_processor = create_mock_processor("filter_processor", ["preprocess"]) - - config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) - config_builder.add_column(SamplerColumnConfig(name="id", sampler_type="uuid", params=UUIDSamplerParams())) - - builder = ColumnWiseDatasetBuilder( - data_designer_config=config_builder.build(), - resource_provider=stub_resource_provider, - ) - builder.set_processor_runner([mock_processor]) - - builder._processor_runner.run_preprocess() - - mock_processor.preprocess.assert_not_called() - - @pytest.mark.parametrize("mode", ["preview", "build"]) def test_all_processor_stages_run_in_order(builder_with_seed, mode): - """Test that all 4 processor stages run in correct order for both preview and build modes.""" + """Test that all 3 processor stages run in correct order for both preview and build modes.""" call_order = [] - all_stages = ["preprocess", "process_before_batch", "process_after_batch", "postprocess"] + all_stages = ["process_before_batch", "process_after_batch", "process_after_generation"] mock_processor = create_mock_processor("all_stages_processor", all_stages) - mock_processor.preprocess.side_effect = lambda df: (call_order.append("preprocess"), df)[1] mock_processor.process_before_batch.side_effect = lambda df: (call_order.append("process_before_batch"), df)[1] mock_processor.process_after_batch.side_effect = lambda df, **kw: (call_order.append("process_after_batch"), 
df)[1] - mock_processor.postprocess.side_effect = lambda df: (call_order.append("postprocess"), df)[1] + mock_processor.process_after_generation.side_effect = lambda df: ( + call_order.append("process_after_generation"), + df, + )[1] builder_with_seed.set_processor_runner([mock_processor]) @@ -512,10 +473,9 @@ def test_all_processor_stages_run_in_order(builder_with_seed, mode): else: builder_with_seed.build(num_records=3) - mock_processor.preprocess.assert_called_once() mock_processor.process_before_batch.assert_called_once() mock_processor.process_after_batch.assert_called_once() - mock_processor.postprocess.assert_called_once() + mock_processor.process_after_generation.assert_called_once() assert call_order == all_stages @@ -523,17 +483,6 @@ def test_all_processor_stages_run_in_order(builder_with_seed, mode): # --- Edge Case Tests --- -def test_processor_exception_in_preprocess_raises_error(builder_with_seed): - """Test that processor exceptions during preprocess are properly wrapped.""" - mock_processor = create_mock_processor("failing_processor", ["preprocess"]) - mock_processor.preprocess.side_effect = ValueError("Preprocessing failed") - - builder_with_seed.set_processor_runner([mock_processor]) - - with pytest.raises(DatasetProcessingError, match="Failed in preprocess"): - builder_with_seed._processor_runner.run_preprocess() - - def test_processor_exception_in_process_after_batch_raises_error(stub_resource_provider, stub_model_configs): """Test that processor exceptions during process_after_batch are properly wrapped.""" mock_processor = create_mock_processor("failing_processor", ["process_after_batch"]) @@ -562,34 +511,33 @@ def test_processor_with_no_implemented_stages_is_skipped(builder_with_seed): result = builder_with_seed.build_preview(num_records=3) assert len(result) == 3 - mock_processor.preprocess.assert_not_called() mock_processor.process_before_batch.assert_not_called() mock_processor.process_after_batch.assert_not_called() - mock_processor.postprocess.assert_not_called() + mock_processor.process_after_generation.assert_not_called() def test_multiple_processors_run_in_definition_order(builder_with_seed): """Test that multiple processors run in the order they were defined.""" call_order = [] - processor_a = create_mock_processor("processor_a", ["preprocess"]) - processor_a.preprocess.side_effect = lambda df: (call_order.append("a"), df)[1] + processor_a = create_mock_processor("processor_a", ["process_before_batch"]) + processor_a.process_before_batch.side_effect = lambda df: (call_order.append("a"), df)[1] - processor_b = create_mock_processor("processor_b", ["preprocess"]) - processor_b.preprocess.side_effect = lambda df: (call_order.append("b"), df)[1] + processor_b = create_mock_processor("processor_b", ["process_before_batch"]) + processor_b.process_before_batch.side_effect = lambda df: (call_order.append("b"), df)[1] - processor_c = create_mock_processor("processor_c", ["preprocess"]) - processor_c.preprocess.side_effect = lambda df: (call_order.append("c"), df)[1] + processor_c = create_mock_processor("processor_c", ["process_before_batch"]) + processor_c.process_before_batch.side_effect = lambda df: (call_order.append("c"), df)[1] builder_with_seed.set_processor_runner([processor_a, processor_b, processor_c]) - builder_with_seed._processor_runner.run_preprocess() + builder_with_seed.build(num_records=3) assert call_order == ["a", "b", "c"] def test_process_preview_with_empty_dataframe(stub_resource_provider, stub_model_configs): """Test that process_preview 
handles empty DataFrames gracefully.""" - mock_processor = create_mock_processor("test_processor", ["process_after_batch", "postprocess"]) + mock_processor = create_mock_processor("test_processor", ["process_after_batch", "process_after_generation"]) config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) config_builder.add_column(SamplerColumnConfig(name="id", sampler_type="uuid", params=UUIDSamplerParams())) @@ -605,4 +553,4 @@ def test_process_preview_with_empty_dataframe(stub_resource_provider, stub_model assert len(result) == 0 mock_processor.process_after_batch.assert_called_once() - mock_processor.postprocess.assert_called_once() + mock_processor.process_after_generation.assert_called_once() From cd7031e28e704cad3e02c5fb20e226b206561742 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Wed, 11 Feb 2026 10:13:06 -0300 Subject: [PATCH 11/14] address PR review feedback - Remove dead BuildStage enum and fix wrong return type on get_processor_configs() - Log row count delta instead of absolute values in PRE_BATCH - Preserve batch partitioning in run_after_generation() - Consolidate and simplify processor tests --- .../src/data_designer/config/__init__.py | 3 - .../data_designer/config/config_builder.py | 3 +- .../data_designer/config/dataset_builders.py | 12 -- .../utils/processor_runner.py | 28 +++-- .../test_column_wise_builder.py | 111 +++++++++--------- 5 files changed, 74 insertions(+), 83 deletions(-) delete mode 100644 packages/data-designer-config/src/data_designer/config/dataset_builders.py diff --git a/packages/data-designer-config/src/data_designer/config/__init__.py b/packages/data-designer-config/src/data_designer/config/__init__.py index 306192f0..2ea641e7 100644 --- a/packages/data-designer-config/src/data_designer/config/__init__.py +++ b/packages/data-designer-config/src/data_designer/config/__init__.py @@ -30,7 +30,6 @@ from data_designer.config.config_builder import DataDesignerConfigBuilder # noqa: F401 from data_designer.config.custom_column import custom_column_generator # noqa: F401 from data_designer.config.data_designer_config import DataDesignerConfig # noqa: F401 - from data_designer.config.dataset_builders import BuildStage # noqa: F401 from data_designer.config.mcp import ( # noqa: F401 LocalStdioMCPProvider, MCPProvider, @@ -141,8 +140,6 @@ "custom_column_generator": (f"{_MOD_BASE}.custom_column", "custom_column_generator"), # data_designer_config "DataDesignerConfig": (f"{_MOD_BASE}.data_designer_config", "DataDesignerConfig"), - # dataset_builders - "BuildStage": (f"{_MOD_BASE}.dataset_builders", "BuildStage"), # mcp "LocalStdioMCPProvider": (_MOD_MCP, "LocalStdioMCPProvider"), "MCPProvider": (_MOD_MCP, "MCPProvider"), diff --git a/packages/data-designer-config/src/data_designer/config/config_builder.py b/packages/data-designer-config/src/data_designer/config/config_builder.py index 9114c46f..a59d8f70 100644 --- a/packages/data-designer-config/src/data_designer/config/config_builder.py +++ b/packages/data-designer-config/src/data_designer/config/config_builder.py @@ -22,7 +22,6 @@ get_column_display_order, ) from data_designer.config.data_designer_config import DataDesignerConfig -from data_designer.config.dataset_builders import BuildStage from data_designer.config.default_model_settings import get_default_model_configs from data_designer.config.errors import BuilderConfigurationError, BuilderSerializationError, InvalidColumnTypeError from data_designer.config.exportable_config import ExportableConfigBase @@ -562,7 +561,7 @@ def 
get_columns_excluding_type(self, column_type: DataDesignerColumnType) -> lis column_type = resolve_string_enum(column_type, DataDesignerColumnType) return [c for c in self._column_configs.values() if c.column_type != column_type] - def get_processor_configs(self) -> dict[BuildStage, list[ProcessorConfigT]]: + def get_processor_configs(self) -> list[ProcessorConfigT]: """Get processor configuration objects. Returns: diff --git a/packages/data-designer-config/src/data_designer/config/dataset_builders.py b/packages/data-designer-config/src/data_designer/config/dataset_builders.py deleted file mode 100644 index 67607388..00000000 --- a/packages/data-designer-config/src/data_designer/config/dataset_builders.py +++ /dev/null @@ -1,12 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -from enum import Enum - - -class BuildStage(str, Enum): - PRE_BATCH = "pre_batch" - POST_BATCH = "post_batch" - AFTER_GENERATION = "after_generation" diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py index fb7e0ad5..0ced9c5a 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py @@ -64,7 +64,8 @@ def run_pre_batch(self, batch_manager: DatasetBatchManager) -> None: original_len = len(df) df = self._run_stage(df, ProcessorStage.PRE_BATCH) if len(df) != original_len: - logger.info(f"â„šī¸ PRE_BATCH processors changed row count from {original_len} to {len(df)}.") + delta = len(df) - original_len + logger.info(f"â„šī¸ PRE_BATCH processors changed the record count by {delta:+d} records.") batch_manager.replace_records(df.to_dict(orient="records")) def run_post_batch(self, df: pd.DataFrame, current_batch_number: int | None) -> pd.DataFrame: @@ -76,19 +77,28 @@ def run_after_generation_on_df(self, df: pd.DataFrame) -> pd.DataFrame: return self._run_stage(df, ProcessorStage.AFTER_GENERATION) def run_after_generation(self) -> None: - """Load final dataset, run process_after_generation(), rewrite dataset.""" + """Load final dataset, run process_after_generation(), rewrite dataset. + + Preserves the original batch partitioning: the processed dataset is split + back into the same number of parquet files that existed before processing. 
+ """ if not self.has_processors_for(ProcessorStage.AFTER_GENERATION): return logger.info("âŗ Running process_after_generation on final dataset...") + num_batch_files = len(list(self._artifact_storage.final_dataset_path.glob("*.parquet"))) df = self._artifact_storage.load_dataset() df = self._run_stage(df, ProcessorStage.AFTER_GENERATION) - if self._artifact_storage.final_dataset_path.exists(): - shutil.rmtree(self._artifact_storage.final_dataset_path) - self._artifact_storage.write_batch_to_parquet_file( - batch_number=0, - dataframe=df, - batch_stage=BatchStage.FINAL_RESULT, - ) + shutil.rmtree(self._artifact_storage.final_dataset_path) + num_batches = max(num_batch_files, 1) + rows_per_batch = len(df) // num_batches + for i in range(num_batches): + start = i * rows_per_batch + end = start + rows_per_batch if i < num_batches - 1 else len(df) + self._artifact_storage.write_batch_to_parquet_file( + batch_number=i, + dataframe=df.iloc[start:end], + batch_stage=BatchStage.FINAL_RESULT, + ) logger.info(f"✅ process_after_generation complete. Final dataset has {len(df)} rows.") diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py index 184d16c5..60090b69 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py @@ -269,13 +269,6 @@ def test_column_wise_dataset_builder_validate_column_configs( ) -def test_column_wise_dataset_builder_initialize_processors(stub_column_wise_builder): - processors = stub_column_wise_builder.processors - assert isinstance(processors, tuple) - assert len(processors) == 1 - assert processors[0].config.column_names == ["column_to_drop"] - - def test_run_config_default_non_inference_max_parallel_workers() -> None: run_config = RunConfig() assert run_config.non_inference_max_parallel_workers == 4 @@ -425,30 +418,58 @@ def test_fan_out_with_threads_uses_early_shutdown_settings_from_resource_provide assert call_kwargs["disable_early_shutdown"] == disable_early_shutdown -def test_run_post_generation_processors_modifies_final_dataset(stub_resource_provider, stub_model_configs): - """Test that process_after_generation callbacks are applied to the final dataset.""" - final_df = pd.DataFrame({"id": [1, 2, 3, 4, 5], "value": ["a", "b", "c", "d", "e"]}) - stub_resource_provider.artifact_storage.mkdir_if_needed(stub_resource_provider.artifact_storage.final_dataset_path) - final_df.to_parquet(stub_resource_provider.artifact_storage.final_dataset_path / "batch_00000.parquet", index=False) +# Processor tests - mock_processor = create_mock_processor("dedup_processor", ["process_after_generation"]) - mock_processor.process_after_generation.side_effect = lambda df: df[df["id"] > 2] +@pytest.fixture +def simple_builder(stub_resource_provider, stub_model_configs): + """Minimal builder with a single UUID column and no batch files on disk.""" config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) config_builder.add_column(SamplerColumnConfig(name="id", sampler_type="uuid", params=UUIDSamplerParams())) - - builder = ColumnWiseDatasetBuilder( + return ColumnWiseDatasetBuilder( data_designer_config=config_builder.build(), resource_provider=stub_resource_provider, ) - builder.set_processor_runner([mock_processor]) - builder._processor_runner.run_after_generation() - 
mock_processor.process_after_generation.assert_called_once() +def test_initialize_processors(stub_column_wise_builder): + processors = stub_column_wise_builder.processors + assert isinstance(processors, tuple) + assert len(processors) == 1 + assert processors[0].config.column_names == ["column_to_drop"] - result_df = stub_resource_provider.artifact_storage.load_dataset() - assert len(result_df) == 3 + +@pytest.mark.parametrize( + "processor_fn,num_batches,expected_rows", + [ + pytest.param(lambda df: df[df["id"] > 3], 1, 6, id="single_batch_filter"), + pytest.param(lambda df: df, 3, 9, id="multi_batch_noop"), + pytest.param(lambda df: df[df["id"] != 3].reset_index(drop=True), 3, 8, id="multi_batch_uneven"), + ], +) +def test_run_after_generation(stub_resource_provider, simple_builder, processor_fn, num_batches, expected_rows): + """Test that process_after_generation applies callbacks and preserves batch partitioning.""" + storage = stub_resource_provider.artifact_storage + storage.mkdir_if_needed(storage.final_dataset_path) + all_ids = list(range(1, 10)) # 9 rows total + chunk_size = len(all_ids) // num_batches + for i in range(num_batches): + start = i * chunk_size + end = len(all_ids) if i == num_batches - 1 else start + chunk_size + pd.DataFrame({"id": all_ids[start:end]}).to_parquet( + storage.final_dataset_path / f"batch_{i:05d}.parquet", index=False + ) + + mock_processor = create_mock_processor("proc", ["process_after_generation"]) + mock_processor.process_after_generation.side_effect = processor_fn + + simple_builder.set_processor_runner([mock_processor]) + simple_builder._processor_runner.run_after_generation() + + mock_processor.process_after_generation.assert_called_once() + batch_files = sorted(storage.final_dataset_path.glob("*.parquet")) + assert len(batch_files) == num_batches + assert sum(len(pd.read_parquet(f)) for f in batch_files) == expected_rows @pytest.mark.parametrize("mode", ["preview", "build"]) @@ -480,34 +501,22 @@ def test_all_processor_stages_run_in_order(builder_with_seed, mode): assert call_order == all_stages -# --- Edge Case Tests --- - - -def test_processor_exception_in_process_after_batch_raises_error(stub_resource_provider, stub_model_configs): +def test_processor_exception_in_process_after_batch_raises_error(simple_builder): """Test that processor exceptions during process_after_batch are properly wrapped.""" mock_processor = create_mock_processor("failing_processor", ["process_after_batch"]) mock_processor.process_after_batch.side_effect = ValueError("Post-batch processing failed") - config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) - config_builder.add_column(SamplerColumnConfig(name="id", sampler_type="uuid", params=UUIDSamplerParams())) - - builder = ColumnWiseDatasetBuilder( - data_designer_config=config_builder.build(), - resource_provider=stub_resource_provider, - ) - builder.set_processor_runner([mock_processor]) + simple_builder.set_processor_runner([mock_processor]) with pytest.raises(DatasetProcessingError, match="Failed in process_after_batch"): - builder._processor_runner.run_post_batch(pd.DataFrame({"id": [1, 2, 3]}), current_batch_number=0) + simple_builder._processor_runner.run_post_batch(pd.DataFrame({"id": [1, 2, 3]}), current_batch_number=0) def test_processor_with_no_implemented_stages_is_skipped(builder_with_seed): """Test that a processor implementing no stages doesn't cause errors.""" mock_processor = create_mock_processor("noop_processor", []) - builder_with_seed.set_processor_runner([mock_processor]) - 
# Should complete without errors result = builder_with_seed.build_preview(num_records=3) assert len(result) == 3 @@ -520,36 +529,24 @@ def test_multiple_processors_run_in_definition_order(builder_with_seed): """Test that multiple processors run in the order they were defined.""" call_order = [] - processor_a = create_mock_processor("processor_a", ["process_before_batch"]) - processor_a.process_before_batch.side_effect = lambda df: (call_order.append("a"), df)[1] - - processor_b = create_mock_processor("processor_b", ["process_before_batch"]) - processor_b.process_before_batch.side_effect = lambda df: (call_order.append("b"), df)[1] - - processor_c = create_mock_processor("processor_c", ["process_before_batch"]) - processor_c.process_before_batch.side_effect = lambda df: (call_order.append("c"), df)[1] + processors = [] + for label in ["a", "b", "c"]: + p = create_mock_processor(f"processor_{label}", ["process_before_batch"]) + p.process_before_batch.side_effect = lambda df, lbl=label: (call_order.append(lbl), df)[1] + processors.append(p) - builder_with_seed.set_processor_runner([processor_a, processor_b, processor_c]) + builder_with_seed.set_processor_runner(processors) builder_with_seed.build(num_records=3) assert call_order == ["a", "b", "c"] -def test_process_preview_with_empty_dataframe(stub_resource_provider, stub_model_configs): +def test_process_preview_with_empty_dataframe(simple_builder): """Test that process_preview handles empty DataFrames gracefully.""" mock_processor = create_mock_processor("test_processor", ["process_after_batch", "process_after_generation"]) + simple_builder.set_processor_runner([mock_processor]) - config_builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) - config_builder.add_column(SamplerColumnConfig(name="id", sampler_type="uuid", params=UUIDSamplerParams())) - - builder = ColumnWiseDatasetBuilder( - data_designer_config=config_builder.build(), - resource_provider=stub_resource_provider, - ) - builder.set_processor_runner([mock_processor]) - - empty_df = pd.DataFrame() - result = builder.process_preview(empty_df) + result = simple_builder.process_preview(pd.DataFrame()) assert len(result) == 0 mock_processor.process_after_batch.assert_called_once() From b6e4155a88d074f08c749aa17737a13ca85056a5 Mon Sep 17 00:00:00 2001 From: Andre Manoel Date: Wed, 11 Feb 2026 11:11:17 -0300 Subject: [PATCH 12/14] chunk after-generation output by batch_size Avoids empty parquet files when processors reduce row count below the original file count. Re-chunks by buffer_size instead of matching the number of input files. 
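As a standalone sketch of the same chunking arithmetic: with a buffer_size of 3, a dataset that shrinks to 8 rows after processing is rewritten as evenly sized files plus a short tail, rather than padding out the original file count with empty files.

    import pandas as pd

    df = pd.DataFrame({"id": range(1, 9)})  # 8 rows left after an after-generation filter
    batch_size = 3
    chunks = [df.iloc[i : i + batch_size] for i in range(0, max(len(df), 1), batch_size)]
    print([len(c) for c in chunks])  # [3, 3, 2] -> three parquet files, none empty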
---
 .../dataset_builders/column_wise_builder.py |  2 +-
 .../utils/processor_runner.py               | 20 ++++++-------
 .../test_column_wise_builder.py             | 28 ++++++++-----------
 3 files changed, 21 insertions(+), 29 deletions(-)

diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py
index 121d89bb..3dd866c6 100644
--- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py
+++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py
@@ -127,7 +127,7 @@ def build(
             self._write_processed_batch(df_batch)
             self.batch_manager.finish_batch(on_batch_complete)
         self.batch_manager.finish()
-        self._processor_runner.run_after_generation()
+        self._processor_runner.run_after_generation(buffer_size)
 
         self._resource_provider.model_registry.log_model_usage(time.perf_counter() - start_time)
 
diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py
index 0ced9c5a..e71518d1 100644
--- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py
+++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py
@@ -76,29 +76,25 @@ def run_after_generation_on_df(self, df: pd.DataFrame) -> pd.DataFrame:
         """Run process_after_generation() on a DataFrame (for preview mode)."""
         return self._run_stage(df, ProcessorStage.AFTER_GENERATION)
 
-    def run_after_generation(self) -> None:
-        """Load final dataset, run process_after_generation(), rewrite dataset.
+    def run_after_generation(self, batch_size: int) -> None:
+        """Load final dataset, run process_after_generation(), rewrite in chunks.
 
-        Preserves the original batch partitioning: the processed dataset is split
-        back into the same number of parquet files that existed before processing.
+        Re-chunks the processed dataset using the given batch_size so that output
+        files stay consistently sized regardless of how many rows the processor
+        adds or removes.
         """
         if not self.has_processors_for(ProcessorStage.AFTER_GENERATION):
             return
         logger.info("⏳ Running process_after_generation on final dataset...")
-        num_batch_files = len(list(self._artifact_storage.final_dataset_path.glob("*.parquet")))
         df = self._artifact_storage.load_dataset()
         df = self._run_stage(df, ProcessorStage.AFTER_GENERATION)
         shutil.rmtree(self._artifact_storage.final_dataset_path)
-        num_batches = max(num_batch_files, 1)
-        rows_per_batch = len(df) // num_batches
-        for i in range(num_batches):
-            start = i * rows_per_batch
-            end = start + rows_per_batch if i < num_batches - 1 else len(df)
+        for i in range(0, max(len(df), 1), batch_size):
             self._artifact_storage.write_batch_to_parquet_file(
-                batch_number=i,
-                dataframe=df.iloc[start:end],
+                batch_number=i // batch_size,
+                dataframe=df.iloc[i : i + batch_size],
                 batch_stage=BatchStage.FINAL_RESULT,
             )
         logger.info(f"✅ process_after_generation complete. Final dataset has {len(df)} rows.")
diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py
index 60090b69..b973174f 100644
--- a/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py
+++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_column_wise_builder.py
@@ -440,35 +440,31 @@ def test_initialize_processors(stub_column_wise_builder):
 
 
 @pytest.mark.parametrize(
-    "processor_fn,num_batches,expected_rows",
+    "processor_fn,batch_size,expected_rows,expected_files",
     [
-        pytest.param(lambda df: df[df["id"] > 3], 1, 6, id="single_batch_filter"),
-        pytest.param(lambda df: df, 3, 9, id="multi_batch_noop"),
-        pytest.param(lambda df: df[df["id"] != 3].reset_index(drop=True), 3, 8, id="multi_batch_uneven"),
+        pytest.param(lambda df: df, 3, 9, 3, id="noop_even"),
+        pytest.param(lambda df: df[df["id"] > 3], 3, 6, 2, id="filter_even"),
+        pytest.param(lambda df: df[df["id"] != 3].reset_index(drop=True), 3, 8, 3, id="filter_uneven"),
+        pytest.param(lambda df: df[df["id"] > 8], 3, 1, 1, id="filter_fewer_than_batch_size"),
     ],
 )
-def test_run_after_generation(stub_resource_provider, simple_builder, processor_fn, num_batches, expected_rows):
-    """Test that process_after_generation applies callbacks and preserves batch partitioning."""
+def test_run_after_generation(
+    stub_resource_provider, simple_builder, processor_fn, batch_size, expected_rows, expected_files
+):
+    """Test that process_after_generation re-chunks output by batch_size."""
     storage = stub_resource_provider.artifact_storage
     storage.mkdir_if_needed(storage.final_dataset_path)
-    all_ids = list(range(1, 10))  # 9 rows total
-    chunk_size = len(all_ids) // num_batches
-    for i in range(num_batches):
-        start = i * chunk_size
-        end = len(all_ids) if i == num_batches - 1 else start + chunk_size
-        pd.DataFrame({"id": all_ids[start:end]}).to_parquet(
-            storage.final_dataset_path / f"batch_{i:05d}.parquet", index=False
-        )
+    pd.DataFrame({"id": list(range(1, 10))}).to_parquet(storage.final_dataset_path / "batch_00000.parquet", index=False)
 
     mock_processor = create_mock_processor("proc", ["process_after_generation"])
     mock_processor.process_after_generation.side_effect = processor_fn
     simple_builder.set_processor_runner([mock_processor])
 
-    simple_builder._processor_runner.run_after_generation()
+    simple_builder._processor_runner.run_after_generation(batch_size)
 
     mock_processor.process_after_generation.assert_called_once()
     batch_files = sorted(storage.final_dataset_path.glob("*.parquet"))
-    assert len(batch_files) == num_batches
+    assert len(batch_files) == expected_files
     assert sum(len(pd.read_parquet(f)) for f in batch_files) == expected_rows

From 014ae3055f7b56f8a5d01c40cc145d5cd470b8fe Mon Sep 17 00:00:00 2001
From: Andre Manoel
Date: Wed, 11 Feb 2026 17:02:42 -0300
Subject: [PATCH 13/14] address nabinchha review feedback

- Rename replace_records to replace_buffer
- Move row-count-change logging into _run_stage for all stages
- Expose processors as public property on ProcessorRunner
---
 .../engine/dataset_builders/column_wise_builder.py |  2 +-
 .../utils/dataset_batch_manager.py                 |  2 +-
 .../dataset_builders/utils/processor_runner.py     | 14 +++++++-----
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py
index 3dd866c6..ac274b82 100644
--- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py
+++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py
@@ -80,7 +80,7 @@ def artifact_storage(self) -> ArtifactStorage:
 
     @property
     def processors(self) -> tuple[Processor, ...]:
-        return tuple(self._processor_runner._processors)
+        return self._processor_runner.processors
 
     def set_processor_runner(self, processors: list[Processor]) -> None:
         """Replace the processor runner with a new one using the given processors."""
diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py
index 663afc77..a60d52b9 100644
--- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py
+++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/dataset_batch_manager.py
@@ -199,7 +199,7 @@ def update_records(self, records: list[dict]) -> None:
         )
         self._buffer = records
 
-    def replace_records(self, records: list[dict]) -> None:
+    def replace_buffer(self, records: list[dict]) -> None:
         """Replace the buffer contents, updating the current batch size."""
         self._buffer = records
         if self._num_records_list is not None:
diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py
index e71518d1..284d45f5 100644
--- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py
+++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/utils/processor_runner.py
@@ -40,12 +40,17 @@ def __init__(
         self._processors = processors
         self._artifact_storage = artifact_storage
 
+    @property
+    def processors(self) -> tuple[Processor, ...]:
+        return tuple(self._processors)
+
     def has_processors_for(self, stage: ProcessorStage) -> bool:
         """Check if any processor implements the given stage."""
         return any(p.implements(stage.value) for p in self._processors)
 
     def _run_stage(self, df: pd.DataFrame, stage: ProcessorStage, **kwargs) -> pd.DataFrame:
         """Run a processor callback on all processors that implement it."""
+        original_len = len(df)
         for processor in self._processors:
             if not processor.implements(stage.value):
                 continue
@@ -53,6 +58,9 @@ def _run_stage(self, df: pd.DataFrame, stage: ProcessorStage, **kwargs) -> pd.Da
                 df = getattr(processor, stage.value)(df, **kwargs)
             except Exception as e:
                 raise DatasetProcessingError(f"🛑 Failed in {stage.value} for {processor.name}: {e}") from e
+        if len(df) != original_len:
+            delta = len(df) - original_len
+            logger.info(f"ℹ️ {stage.name} processors changed the record count by {delta:+d} records.")
         return df
 
     def run_pre_batch(self, batch_manager: DatasetBatchManager) -> None:
@@ -61,12 +69,8 @@ def run_pre_batch(self, batch_manager: DatasetBatchManager) -> None:
             return
 
         df = batch_manager.get_current_batch(as_dataframe=True)
-        original_len = len(df)
         df = self._run_stage(df, ProcessorStage.PRE_BATCH)
-        if len(df) != original_len:
-            delta = len(df) - original_len
-            logger.info(f"ℹ️ PRE_BATCH processors changed the record count by {delta:+d} records.")
-        batch_manager.replace_records(df.to_dict(orient="records"))
+        batch_manager.replace_buffer(df.to_dict(orient="records"))
 
     def run_post_batch(self, df: pd.DataFrame, current_batch_number: int | None) -> pd.DataFrame:
         """Run process_after_batch() on processors that implement it."""

From a8715dc9ade9cd135f9b358a33258e04502c79d1 Mon Sep 17 00:00:00 2001
From: Andre Manoel
Date: Wed, 11 Feb 2026 17:18:57 -0300
Subject: [PATCH 14/14] move post-batch processing into _run_batch

Both pre-batch and post-batch now run inside _run_batch, keeping the full
batch lifecycle (generate, process, write, finish) in one place. Preview mode
skips post-batch via current_batch_number=None guard.
---
 .../dataset_builders/column_wise_builder.py | 28 ++++++++++++++-----
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py
index ac274b82..e5404d49 100644
--- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py
+++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py
@@ -120,12 +120,13 @@ def build(
         self.batch_manager.start(num_records=num_records, buffer_size=buffer_size)
         for batch_idx in range(self.batch_manager.num_batches):
             logger.info(f"⏳ Processing batch {batch_idx + 1} of {self.batch_manager.num_batches}")
-            # Note: pre-batch processing runs inside _run_batch, after seed columns are populated
-            self._run_batch(generators, batch_mode="batch", group_id=group_id)
-            df_batch = self.batch_manager.get_current_batch(as_dataframe=True)
-            df_batch = self._processor_runner.run_post_batch(df_batch, current_batch_number=batch_idx)
-            self._write_processed_batch(df_batch)
-            self.batch_manager.finish_batch(on_batch_complete)
+            self._run_batch(
+                generators,
+                batch_mode="batch",
+                group_id=group_id,
+                current_batch_number=batch_idx,
+                on_batch_complete=on_batch_complete,
+            )
         self.batch_manager.finish()
         self._processor_runner.run_after_generation(buffer_size)
 
@@ -168,7 +169,14 @@ def _write_builder_config(self) -> None:
         )
 
     def _run_batch(
-        self, generators: list[ColumnGenerator], *, batch_mode: str, save_partial_results: bool = True, group_id: str
+        self,
+        generators: list[ColumnGenerator],
+        *,
+        batch_mode: str,
+        save_partial_results: bool = True,
+        group_id: str,
+        current_batch_number: int | None = None,
+        on_batch_complete: Callable[[Path], None] | None = None,
     ) -> None:
         pre_batch_snapshot = self._resource_provider.model_registry.get_model_usage_snapshot()
         ran_pre_batch = False
@@ -205,6 +213,12 @@ def _run_batch(
             except Exception:
                 pass
 
+        if current_batch_number is not None:
+            df_batch = self.batch_manager.get_current_batch(as_dataframe=True)
+            df_batch = self._processor_runner.run_post_batch(df_batch, current_batch_number=current_batch_number)
+            self._write_processed_batch(df_batch)
+            self.batch_manager.finish_batch(on_batch_complete)
+
     def _run_from_scratch_column_generator(self, generator: ColumnGenerator) -> None:
         df = generator.generate_from_scratch(self.batch_manager.num_records_batch)
         self.batch_manager.add_records(df.to_dict(orient="records"))
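
Note for reviewers (not part of the patch): a minimal sketch of a processor that only implements the post-generation callback, to show how the run_after_generation path above is exercised. The import path and the exact base-class constructor are assumptions based on this series; only the single-DataFrame callback signature is taken from ProcessorRunner._run_stage.

# Illustrative sketch only: base-class import path and construction details are
# assumed from this patch series, not confirmed against the repository.
import pandas as pd

from data_designer.engine.processing.processors.base import Processor  # assumed path


class DeduplicateRowsProcessor(Processor):
    """Drops duplicate rows once the combined dataset has been assembled."""

    def process_after_generation(self, df: pd.DataFrame) -> pd.DataFrame:
        # ProcessorRunner.run_after_generation() loads the full dataset, calls this
        # callback once, then re-chunks the result into parquet files of batch_size
        # rows each. Because only this method is overridden, implements() reports
        # just the AFTER_GENERATION stage for this processor.
        return df.drop_duplicates().reset_index(drop=True)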