From dc041f78a95d40b835a9ec34356f10248885daf9 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 25 Nov 2025 12:16:17 -0700 Subject: [PATCH 01/69] Add generation type to ModelConfig --- src/data_designer/config/models.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/data_designer/config/models.py b/src/data_designer/config/models.py index 6bff8efd..3e06a8fc 100644 --- a/src/data_designer/config/models.py +++ b/src/data_designer/config/models.py @@ -205,11 +205,18 @@ def _is_value_in_range(self, value: float, min_value: float, max_value: float) - return min_value <= value <= max_value +class GenerationType(str, Enum): + CHAT_COMPLETION = "chat-completion" + TEXT_EMBEDDING = "text-embedding" + IMAGE_GENERATION = "image-generation" + + class ModelConfig(ConfigBase): alias: str model: str inference_parameters: InferenceParameters = Field(default_factory=InferenceParameters) provider: Optional[str] = None + generation_type: GenerationType = GenerationType.CHAT_COMPLETION class ModelProvider(ConfigBase): From 0d6b830f6439b6bece0b642c921281fd817de5d3 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 25 Nov 2025 12:21:28 -0700 Subject: [PATCH 02/69] pass tests --- src/data_designer/config/default_model_settings.py | 5 +++-- src/data_designer/config/models.py | 2 +- tests/cli/repositories/test_model_repository.py | 4 +++- tests/config/test_config_builder.py | 2 +- tests/config/test_models.py | 2 +- 5 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/data_designer/config/default_model_settings.py b/src/data_designer/config/default_model_settings.py index 32c1d42b..33d6dad4 100644 --- a/src/data_designer/config/default_model_settings.py +++ b/src/data_designer/config/default_model_settings.py @@ -103,7 +103,8 @@ def resolve_seed_default_model_settings() -> None: f"🍾 Default model configs were not found, so writing the following to {str(MODEL_CONFIGS_FILE_PATH)!r}" ) save_config_file( - MODEL_CONFIGS_FILE_PATH, {"model_configs": 
[mc.model_dump() for mc in get_builtin_model_configs()]} + MODEL_CONFIGS_FILE_PATH, + {"model_configs": [mc.model_dump(mode="json") for mc in get_builtin_model_configs()]}, ) if not MODEL_PROVIDERS_FILE_PATH.exists(): @@ -111,7 +112,7 @@ def resolve_seed_default_model_settings() -> None: f"🪄 Default model providers were not found, so writing the following to {str(MODEL_PROVIDERS_FILE_PATH)!r}" ) save_config_file( - MODEL_PROVIDERS_FILE_PATH, {"providers": [p.model_dump() for p in get_builtin_model_providers()]} + MODEL_PROVIDERS_FILE_PATH, {"providers": [p.model_dump(mode="json") for p in get_builtin_model_providers()]} ) if not MANAGED_ASSETS_PATH.exists(): diff --git a/src/data_designer/config/models.py b/src/data_designer/config/models.py index 3e06a8fc..17698346 100644 --- a/src/data_designer/config/models.py +++ b/src/data_designer/config/models.py @@ -215,8 +215,8 @@ class ModelConfig(ConfigBase): alias: str model: str inference_parameters: InferenceParameters = Field(default_factory=InferenceParameters) - provider: Optional[str] = None generation_type: GenerationType = GenerationType.CHAT_COMPLETION + provider: Optional[str] = None class ModelProvider(ConfigBase): diff --git a/tests/cli/repositories/test_model_repository.py b/tests/cli/repositories/test_model_repository.py index 01884b5c..624cd360 100644 --- a/tests/cli/repositories/test_model_repository.py +++ b/tests/cli/repositories/test_model_repository.py @@ -21,7 +21,9 @@ def test_load_does_not_exist(): def test_load_exists(tmp_path: Path, stub_model_configs: list[ModelConfig]): model_configs_file_path = tmp_path / MODEL_CONFIGS_FILE_NAME - save_config_file(model_configs_file_path, {"model_configs": [mc.model_dump() for mc in stub_model_configs]}) + save_config_file( + model_configs_file_path, {"model_configs": [mc.model_dump(mode="json") for mc in stub_model_configs]} + ) repository = ModelRepository(tmp_path) assert repository.load() is not None assert repository.load().model_configs == 
stub_model_configs diff --git a/tests/config/test_config_builder.py b/tests/config/test_config_builder.py index 337d934e..aab8112a 100644 --- a/tests/config/test_config_builder.py +++ b/tests/config/test_config_builder.py @@ -54,7 +54,7 @@ def stub_data_designer_builder(stub_data_designer_builder_config_str): def test_loading_model_configs_in_constructor(stub_model_configs): - stub_model_configs_dict = [mc.model_dump() for mc in stub_model_configs] + stub_model_configs_dict = [mc.model_dump(mode="json") for mc in stub_model_configs] # test loading model configs from a list builder = DataDesignerConfigBuilder(model_configs=stub_model_configs) assert builder.model_configs == stub_model_configs diff --git a/tests/config/test_models.py b/tests/config/test_models.py index 9ccda6d5..6a3d7b25 100644 --- a/tests/config/test_models.py +++ b/tests/config/test_models.py @@ -212,7 +212,7 @@ def test_load_model_configs(): ModelConfig(alias="test", model="test"), ModelConfig(alias="test2", model="test2"), ] - stub_model_configs_dict_list = [mc.model_dump() for mc in stub_model_configs] + stub_model_configs_dict_list = [mc.model_dump(mode="json") for mc in stub_model_configs] assert load_model_configs([]) == [] assert load_model_configs(stub_model_configs) == stub_model_configs From 254fd8a71e261a7bb3ac71ad14d8aa10772529ea Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 25 Nov 2025 14:36:02 -0700 Subject: [PATCH 03/69] added generate_text_embeddings --- .../generators/llm_generators.py | 1 - src/data_designer/engine/models/facade.py | 51 ++++++++++++++++++- .../generators/test_llm_generators.py | 17 ------- tests/engine/models/test_facade.py | 48 +++++++++++------ 4 files changed, 82 insertions(+), 35 deletions(-) diff --git a/src/data_designer/engine/column_generators/generators/llm_generators.py b/src/data_designer/engine/column_generators/generators/llm_generators.py index ee0ab58a..8f4cfc90 100644 --- 
a/src/data_designer/engine/column_generators/generators/llm_generators.py +++ b/src/data_designer/engine/column_generators/generators/llm_generators.py @@ -96,7 +96,6 @@ def generate(self, data: dict) -> dict: max_correction_steps=self.max_conversation_correction_steps, max_conversation_restarts=self.max_conversation_restarts, purpose=f"running generation for column '{self.config.name}'", - **self.inference_parameters.generate_kwargs, ) data[self.config.name] = deserialize_json_values(self.response_recipe.serialize_output(response)) diff --git a/src/data_designer/engine/models/facade.py b/src/data_designer/engine/models/facade.py index 93ca0fd7..b0ad3472 100644 --- a/src/data_designer/engine/models/facade.py +++ b/src/data_designer/engine/models/facade.py @@ -9,7 +9,7 @@ from typing import Any from litellm.types.router import DeploymentTypedDict, LiteLLM_Params -from litellm.types.utils import ModelResponse +from litellm.types.utils import EmbeddingResponse, ModelResponse from data_designer.config.models import ModelConfig, ModelProvider from data_designer.engine.model_provider import ModelProviderRegistry @@ -67,6 +67,7 @@ def completion(self, messages: list[dict[str, str]], skip_usage_tracking: bool = extra={"model": self.model_name, "messages": messages, "sensitive": True}, ) response = None + kwargs = {**self._model_config.inference_parameters.generate_kwargs, **kwargs} if self.model_provider.extra_body: kwargs["extra_body"] = {**kwargs.get("extra_body", {}), **self.model_provider.extra_body} try: @@ -87,6 +88,41 @@ def completion(self, messages: list[dict[str, str]], skip_usage_tracking: bool = if not skip_usage_tracking: self._track_usage(response) + @catch_llm_exceptions + def generate_text_embeddings( + self, input_texts: list[str], skip_usage_tracking: bool = False, **kwargs + ) -> list[list[float]]: + logger.debug( + f"Generating embeddings with model {self.model_name!r}...", + extra={ + "model": self.model_name, + "input_count": len(input_texts), + 
"sensitive": True, + }, + ) + kwargs |= self._model_config.inference_parameters.generate_kwargs + if self.model_provider.extra_body: + kwargs["extra_body"] = {**kwargs.get("extra_body", {}), **self.model_provider.extra_body} + try: + response = self._router.embedding(model=self.model_name, input=input_texts, **kwargs) + logger.debug( + f"Received embeddings from model {self.model_name!r}", + extra={ + "model": self.model_name, + "embedding_count": len(response.data) if response.data else 0, + "usage": self._usage_stats.model_dump(), + }, + ) + if response.data and len(response.data) == len(input_texts): + return [data["embedding"] for data in response.data] + else: + raise ValueError(f"Expected {len(input_texts)} embeddings, but received {len(response.data)}") + except Exception as e: + raise e + finally: + if not skip_usage_tracking: + self._track_usage_from_embedding(response) + @catch_llm_exceptions def generate( self, @@ -223,3 +259,16 @@ def _track_usage(self, response: ModelResponse | None) -> None: ), request_usage=RequestUsageStats(successful_requests=1, failed_requests=0), ) + + def _track_usage_from_embedding(self, response: EmbeddingResponse | None) -> None: + if response is None: + self._usage_stats.extend(request_usage=RequestUsageStats(successful_requests=0, failed_requests=1)) + return + if response.usage is not None and response.usage.prompt_tokens is not None: + self._usage_stats.extend( + token_usage=TokenUsageStats( + prompt_tokens=response.usage.prompt_tokens, + completion_tokens=0, + ), + request_usage=RequestUsageStats(successful_requests=1, failed_requests=0), + ) diff --git a/tests/engine/column_generators/generators/test_llm_generators.py b/tests/engine/column_generators/generators/test_llm_generators.py index 259f3a08..acaa2c6f 100644 --- a/tests/engine/column_generators/generators/test_llm_generators.py +++ b/tests/engine/column_generators/generators/test_llm_generators.py @@ -259,20 +259,3 @@ def 
test_generate_with_json_deserialization(): result = generator.generate(data) assert result["test_column"] == {"result": "json_output"} - - -def test_generate_with_inference_parameters(): - generator, _, mock_model, _, mock_inference_params, mock_prompt_renderer, mock_response_recipe = ( - _create_generator_with_mocks() - ) - - mock_inference_params.generate_kwargs = {"temperature": 0.7, "max_tokens": 100} - _setup_generate_mocks(mock_prompt_renderer, mock_response_recipe, mock_model) - - data = {"input": "test_input"} - generator.generate(data) - - call_args = mock_model.generate.call_args - assert call_args[1]["temperature"] == 0.7 - assert call_args[1]["max_tokens"] == 100 - assert call_args[1]["purpose"] == "running generation for column 'test_column'" diff --git a/tests/engine/models/test_facade.py b/tests/engine/models/test_facade.py index 4fa73d9a..d240eeaa 100644 --- a/tests/engine/models/test_facade.py +++ b/tests/engine/models/test_facade.py @@ -133,7 +133,9 @@ def raise_exception(*args, **kwargs): stub_model_facade.completion(messages) -def test_completion_with_kwargs(stub_model_facade, stub_expected_response): +def test_completion_kwargs_overrides_model_config_generate_kwargs( + stub_model_configs, stub_model_facade, stub_expected_response +): captured_kwargs = {} def mock_completion(model_name, messages, **kwargs): @@ -147,28 +149,42 @@ def mock_completion(model_name, messages, **kwargs): result = stub_model_facade.completion(messages, **kwargs) assert result == stub_expected_response - assert captured_kwargs == kwargs + # completion kwargs overrides model config generate kwargs + assert captured_kwargs == {**stub_model_configs[0].inference_parameters.generate_kwargs, **kwargs} @patch("data_designer.engine.models.facade.CustomRouter.completion", autospec=True) -def test_completion_with_extra_body(mock_router_completion, stub_model_facade): +def test_provider_extra_body_overrides_completion_kwargs(mock_router_completion, stub_model_configs, 
stub_model_facade): messages = [{"role": "user", "content": "test"}] + stub_provider_extra_body = {"foo": "bar"} - # completion call has no extra body argument and provider has no extra body + # model config has generate kwargs, completion call has no kwargs, and provider has no extra body _ = stub_model_facade.completion(messages) assert len(mock_router_completion.call_args) == 2 assert mock_router_completion.call_args[0][1] == "stub-model-text" assert mock_router_completion.call_args[0][2] == messages + assert mock_router_completion.call_args[1] == stub_model_configs[0].inference_parameters.generate_kwargs - # completion call has no extra body argument and provider has extra body. - # Should pull extra body from model provider - custom_extra_body = {"some_custom_key": "some_custom_value"} - stub_model_facade.model_provider.extra_body = custom_extra_body - _ = stub_model_facade.completion(messages) - assert mock_router_completion.call_args[1] == {"extra_body": custom_extra_body} - - # completion call has extra body argument and provider has extra body. 
- # Should merge the two with provider extra body taking precedence - completion_extra_body = {"some_completion_key": "some_completion_value", "some_custom_key": "some_different_value"} - _ = stub_model_facade.completion(messages, extra_body=completion_extra_body) - assert mock_router_completion.call_args[1] == {"extra_body": {**completion_extra_body, **custom_extra_body}} + # model config has generate kwargs, completion call has kwargs, and provider has no extra body + # completion kwargs overrides model config generate kwargs + _ = stub_model_facade.completion(messages, temperature=0.1) + assert len(mock_router_completion.call_args) == 2 + assert mock_router_completion.call_args[0][1] == "stub-model-text" + assert mock_router_completion.call_args[0][2] == messages + assert mock_router_completion.call_args[1] == { + **stub_model_configs[0].inference_parameters.generate_kwargs, + "temperature": 0.1, + } + + # model config has generate kwargs, completion call has kwargs, and provider has extra body + # provider extra body overrides completion kwargs + stub_model_facade.model_provider.extra_body = stub_provider_extra_body + _ = stub_model_facade.completion(messages, temperature=0.15, extra_body={"foo": "bat"}) + assert len(mock_router_completion.call_args) == 2 + assert mock_router_completion.call_args[0][1] == "stub-model-text" + assert mock_router_completion.call_args[0][2] == messages + assert mock_router_completion.call_args[1] == { + **stub_model_configs[0].inference_parameters.generate_kwargs, + "temperature": 0.15, + "extra_body": stub_provider_extra_body, + } From 1126ea1bdfdf842ed8073aaf0cea6e405a77c0ce Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 25 Nov 2025 15:18:59 -0700 Subject: [PATCH 04/69] tests --- src/data_designer/engine/models/facade.py | 21 +-- tests/engine/models/test_facade.py | 152 +++++++++++++--------- 2 files changed, 105 insertions(+), 68 deletions(-) diff --git a/src/data_designer/engine/models/facade.py 
b/src/data_designer/engine/models/facade.py index b0ad3472..4e3f36ef 100644 --- a/src/data_designer/engine/models/facade.py +++ b/src/data_designer/engine/models/facade.py @@ -67,11 +67,9 @@ def completion(self, messages: list[dict[str, str]], skip_usage_tracking: bool = extra={"model": self.model_name, "messages": messages, "sensitive": True}, ) response = None - kwargs = {**self._model_config.inference_parameters.generate_kwargs, **kwargs} - if self.model_provider.extra_body: - kwargs["extra_body"] = {**kwargs.get("extra_body", {}), **self.model_provider.extra_body} + kwargs = self.consolidate_kwargs(**kwargs) try: - response = self._router.completion(self.model_name, messages, **kwargs) + response = self._router.completion(model=self.model_name, messages=messages, **kwargs) logger.debug( f"Received completion from model {self.model_name!r}", extra={ @@ -85,9 +83,15 @@ def completion(self, messages: list[dict[str, str]], skip_usage_tracking: bool = except Exception as e: raise e finally: - if not skip_usage_tracking: + if not skip_usage_tracking and response is not None: self._track_usage(response) + def consolidate_kwargs(self, **kwargs) -> dict[str, Any]: + kwargs = {**self._model_config.inference_parameters.generate_kwargs, **kwargs} + if self.model_provider.extra_body: + kwargs["extra_body"] = {**kwargs.get("extra_body", {}), **self.model_provider.extra_body} + return kwargs + @catch_llm_exceptions def generate_text_embeddings( self, input_texts: list[str], skip_usage_tracking: bool = False, **kwargs @@ -100,9 +104,8 @@ def generate_text_embeddings( "sensitive": True, }, ) - kwargs |= self._model_config.inference_parameters.generate_kwargs - if self.model_provider.extra_body: - kwargs["extra_body"] = {**kwargs.get("extra_body", {}), **self.model_provider.extra_body} + kwargs = self.consolidate_kwargs(**kwargs) + response = None try: response = self._router.embedding(model=self.model_name, input=input_texts, **kwargs) logger.debug( @@ -120,7 +123,7 @@ def 
generate_text_embeddings( except Exception as e: raise e finally: - if not skip_usage_tracking: + if not skip_usage_tracking and response is not None: self._track_usage_from_embedding(response) @catch_llm_exceptions diff --git a/tests/engine/models/test_facade.py b/tests/engine/models/test_facade.py index d240eeaa..afe27730 100644 --- a/tests/engine/models/test_facade.py +++ b/tests/engine/models/test_facade.py @@ -4,7 +4,7 @@ from collections import namedtuple from unittest.mock import patch -from litellm.types.utils import Choices, Message, ModelResponse +from litellm.types.utils import Choices, EmbeddingResponse, Message, ModelResponse import pytest from data_designer.engine.models.errors import ModelGenerationValidationFailureError @@ -30,10 +30,20 @@ def stub_model_facade(stub_model_configs, stub_secrets_resolver, stub_model_prov @pytest.fixture -def stub_expected_response(): +def stub_completion_messages(): + return [{"role": "user", "content": "test"}] + + +@pytest.fixture +def stub_expected_completion_response(): return ModelResponse(choices=Choices(message=Message(content="Test response"))) +@pytest.fixture +def stub_expected_embedding_response(): + return EmbeddingResponse(data=[{"embedding": [0.1, 0.2, 0.3]}] * 2) + + @pytest.mark.parametrize( "max_correction_steps,max_conversation_restarts,total_calls", [ @@ -105,6 +115,24 @@ def test_usage_stats_property(stub_model_facade): assert hasattr(stub_model_facade.usage_stats, "model_dump") +def test_consolidate_kwargs(stub_model_configs, stub_model_facade): + # Model config generate kwargs are used as base + result = stub_model_facade.consolidate_kwargs() + assert result == stub_model_configs[0].inference_parameters.generate_kwargs + + # kwargs overrides model config generate kwargs + result = stub_model_facade.consolidate_kwargs(temperature=0.01) + assert result == {**stub_model_configs[0].inference_parameters.generate_kwargs, "temperature": 0.01} + + # Provider extra_body overrides all other kwargs + 
stub_model_facade.model_provider.extra_body = {"foo_provider": "bar_provider"} + result = stub_model_facade.consolidate_kwargs(extra_body={"foo": "bar"}) + assert result == { + **stub_model_configs[0].inference_parameters.generate_kwargs, + "extra_body": {"foo_provider": "bar_provider", "foo": "bar"}, + } + + @pytest.mark.parametrize( "skip_usage_tracking", [ @@ -112,79 +140,85 @@ def test_usage_stats_property(stub_model_facade): True, ], ) -def test_completion_success(stub_model_facade, stub_expected_response, skip_usage_tracking): - stub_model_facade._router.completion = lambda model_name, messages, **kwargs: stub_expected_response - - messages = [{"role": "user", "content": "test"}] - result = stub_model_facade.completion(messages, skip_usage_tracking=skip_usage_tracking) - - assert result == stub_expected_response - - -def test_completion_with_exception(stub_model_facade): - def raise_exception(*args, **kwargs): - raise Exception("Router error") +@patch("data_designer.engine.models.facade.CustomRouter.completion", autospec=True) +def test_completion_success( + mock_router_completion, + stub_completion_messages, + stub_model_configs, + stub_model_facade, + stub_expected_completion_response, + skip_usage_tracking, +): + mock_router_completion.side_effect = lambda self, model, messages, **kwargs: stub_expected_completion_response + result = stub_model_facade.completion(stub_completion_messages, skip_usage_tracking=skip_usage_tracking) + assert result == stub_expected_completion_response + assert mock_router_completion.call_count == 1 + assert mock_router_completion.call_args[1] == { + "model": "stub-model-text", + "messages": stub_completion_messages, + **stub_model_configs[0].inference_parameters.generate_kwargs, + } - stub_model_facade._router.completion = raise_exception - messages = [{"role": "user", "content": "test"}] +@patch("data_designer.engine.models.facade.CustomRouter.completion", autospec=True) +def 
test_completion_with_exception(mock_router_completion, stub_completion_messages, stub_model_facade): + mock_router_completion.side_effect = Exception("Router error") with pytest.raises(Exception, match="Router error"): - stub_model_facade.completion(messages) + stub_model_facade.completion(stub_completion_messages) -def test_completion_kwargs_overrides_model_config_generate_kwargs( - stub_model_configs, stub_model_facade, stub_expected_response +@patch("data_designer.engine.models.facade.CustomRouter.completion", autospec=True) +def test_completion_with_kwargs( + mock_router_completion, + stub_completion_messages, + stub_model_configs, + stub_model_facade, + stub_expected_completion_response, ): captured_kwargs = {} - def mock_completion(model_name, messages, **kwargs): + def mock_completion(self, model, messages, **kwargs): captured_kwargs.update(kwargs) - return stub_expected_response + return stub_expected_completion_response - stub_model_facade._router.completion = mock_completion + mock_router_completion.side_effect = mock_completion - messages = [{"role": "user", "content": "test"}] kwargs = {"temperature": 0.7, "max_tokens": 100} - result = stub_model_facade.completion(messages, **kwargs) + result = stub_model_facade.completion(stub_completion_messages, **kwargs) - assert result == stub_expected_response + assert result == stub_expected_completion_response # completion kwargs overrides model config generate kwargs assert captured_kwargs == {**stub_model_configs[0].inference_parameters.generate_kwargs, **kwargs} -@patch("data_designer.engine.models.facade.CustomRouter.completion", autospec=True) -def test_provider_extra_body_overrides_completion_kwargs(mock_router_completion, stub_model_configs, stub_model_facade): - messages = [{"role": "user", "content": "test"}] - stub_provider_extra_body = {"foo": "bar"} - - # model config has generate kwargs, completion call has no kwargs, and provider has no extra body - _ = stub_model_facade.completion(messages) - 
assert len(mock_router_completion.call_args) == 2 - assert mock_router_completion.call_args[0][1] == "stub-model-text" - assert mock_router_completion.call_args[0][2] == messages - assert mock_router_completion.call_args[1] == stub_model_configs[0].inference_parameters.generate_kwargs - - # model config has generate kwargs, completion call has kwargs, and provider has no extra body - # completion kwargs overrides model config generate kwargs - _ = stub_model_facade.completion(messages, temperature=0.1) - assert len(mock_router_completion.call_args) == 2 - assert mock_router_completion.call_args[0][1] == "stub-model-text" - assert mock_router_completion.call_args[0][2] == messages - assert mock_router_completion.call_args[1] == { - **stub_model_configs[0].inference_parameters.generate_kwargs, - "temperature": 0.1, - } +@patch("data_designer.engine.models.facade.CustomRouter.embedding", autospec=True) +def test_generate_text_embeddings_success(mock_router_embedding, stub_model_facade, stub_expected_embedding_response): + mock_router_embedding.side_effect = lambda self, model, input, **kwargs: stub_expected_embedding_response + input_texts = ["test1", "test2"] + result = stub_model_facade.generate_text_embeddings(input_texts) + assert result == [data["embedding"] for data in stub_expected_embedding_response.data] - # model config has generate kwargs, completion call has kwargs, and provider has extra body - # provider extra body overrides completion kwargs - stub_model_facade.model_provider.extra_body = stub_provider_extra_body - _ = stub_model_facade.completion(messages, temperature=0.15, extra_body={"foo": "bat"}) - assert len(mock_router_completion.call_args) == 2 - assert mock_router_completion.call_args[0][1] == "stub-model-text" - assert mock_router_completion.call_args[0][2] == messages - assert mock_router_completion.call_args[1] == { - **stub_model_configs[0].inference_parameters.generate_kwargs, - "temperature": 0.15, - "extra_body": 
stub_provider_extra_body, - } + +@patch("data_designer.engine.models.facade.CustomRouter.embedding", autospec=True) +def test_generate_text_embeddings_with_exception(mock_router_embedding, stub_model_facade): + mock_router_embedding.side_effect = Exception("Router error") + + with pytest.raises(Exception, match="Router error"): + stub_model_facade.generate_text_embeddings(["test1", "test2"]) + + +@patch("data_designer.engine.models.facade.CustomRouter.embedding", autospec=True) +def test_generate_text_embeddings_with_kwargs( + mock_router_embedding, stub_model_configs, stub_model_facade, stub_expected_embedding_response +): + captured_kwargs = {} + + def mock_embedding(self, model, input, **kwargs): + captured_kwargs.update(kwargs) + return stub_expected_embedding_response + + mock_router_embedding.side_effect = mock_embedding + kwargs = {"temperature": 0.7, "max_tokens": 100, "input_type": "query"} + _ = stub_model_facade.generate_text_embeddings(["test1", "test2"], **kwargs) + assert captured_kwargs == {**stub_model_configs[0].inference_parameters.generate_kwargs, **kwargs} From 744bc8fd4c9d5662966ba09282b69daf326be9b8 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 25 Nov 2025 16:46:42 -0700 Subject: [PATCH 05/69] remove sensitive=True old artifact no longer needed --- src/data_designer/engine/models/facade.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/data_designer/engine/models/facade.py b/src/data_designer/engine/models/facade.py index 4e3f36ef..ea72d4c3 100644 --- a/src/data_designer/engine/models/facade.py +++ b/src/data_designer/engine/models/facade.py @@ -64,7 +64,7 @@ def usage_stats(self) -> ModelUsageStats: def completion(self, messages: list[dict[str, str]], skip_usage_tracking: bool = False, **kwargs) -> ModelResponse: logger.debug( f"Prompting model {self.model_name!r}...", - extra={"model": self.model_name, "messages": messages, "sensitive": True}, + extra={"model": self.model_name, "messages": messages}, ) 
response = None kwargs = self.consolidate_kwargs(**kwargs) @@ -101,7 +101,6 @@ def generate_text_embeddings( extra={ "model": self.model_name, "input_count": len(input_texts), - "sensitive": True, }, ) kwargs = self.consolidate_kwargs(**kwargs) From b913f8d6dfc0d3717badc30a3ae4176287cbf9b8 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 25 Nov 2025 17:11:23 -0700 Subject: [PATCH 06/69] Slight refactor --- .../utils/column_statistics_calculations.py | 2 +- ...llm_generators.py => generation_mixins.py} | 94 ++++--------------- .../generators/llm_completion_generators.py | 71 ++++++++++++++ .../engine/column_generators/registry.py | 2 +- .../dataset_builders/column_wise_builder.py | 6 +- ...s.py => test_llm_completion_generators.py} | 6 +- .../engine/column_generators/test_registry.py | 2 +- 7 files changed, 97 insertions(+), 86 deletions(-) rename src/data_designer/engine/column_generators/generators/{llm_generators.py => generation_mixins.py} (64%) create mode 100644 src/data_designer/engine/column_generators/generators/llm_completion_generators.py rename tests/engine/column_generators/generators/{test_llm_generators.py => test_llm_completion_generators.py} (97%) diff --git a/src/data_designer/engine/analysis/utils/column_statistics_calculations.py b/src/data_designer/engine/analysis/utils/column_statistics_calculations.py index 120caef4..1b23c0ea 100644 --- a/src/data_designer/engine/analysis/utils/column_statistics_calculations.py +++ b/src/data_designer/engine/analysis/utils/column_statistics_calculations.py @@ -23,7 +23,7 @@ SingleColumnConfig, ValidationColumnConfig, ) -from data_designer.engine.column_generators.generators.llm_generators import ( +from data_designer.engine.column_generators.utils.prompt_renderer import ( PromptType, RecordBasedPromptRenderer, create_response_recipe, diff --git a/src/data_designer/engine/column_generators/generators/llm_generators.py b/src/data_designer/engine/column_generators/generators/generation_mixins.py 
similarity index 64% rename from src/data_designer/engine/column_generators/generators/llm_generators.py rename to src/data_designer/engine/column_generators/generators/generation_mixins.py index 8f4cfc90..4e29a37a 100644 --- a/src/data_designer/engine/column_generators/generators/llm_generators.py +++ b/src/data_designer/engine/column_generators/generators/generation_mixins.py @@ -4,20 +4,9 @@ import functools import logging -from data_designer.config.column_configs import ( - LLMCodeColumnConfig, - LLMJudgeColumnConfig, - LLMStructuredColumnConfig, - LLMTextColumnConfig, -) from data_designer.config.column_types import COLUMN_TYPE_EMOJI_MAP from data_designer.config.models import InferenceParameters, ModelConfig from data_designer.config.utils.constants import REASONING_TRACE_COLUMN_POSTFIX -from data_designer.engine.column_generators.generators.base import ( - ColumnGenerator, - GenerationStrategy, - GeneratorMetadata, -) from data_designer.engine.column_generators.utils.prompt_renderer import ( PromptType, RecordBasedPromptRenderer, @@ -26,7 +15,6 @@ from data_designer.engine.models.facade import ModelFacade from data_designer.engine.models.recipes.base import ResponseRecipe from data_designer.engine.processing.utils import deserialize_json_values -from data_designer.engine.resources.resource_provider import ResourceType DEFAULT_MAX_CONVERSATION_RESTARTS = 5 DEFAULT_MAX_CONVERSATION_CORRECTION_STEPS = 0 @@ -35,7 +23,7 @@ logger = logging.getLogger(__name__) -class WithLLMGeneration: +class WithModelGeneration: @functools.cached_property def model(self) -> ModelFacade: return self.resource_provider.model_registry.get_model(model_alias=self.config.model_alias) @@ -59,6 +47,21 @@ def prompt_renderer(self) -> RecordBasedPromptRenderer: }, ) + def log_pre_generation(self) -> None: + emoji = COLUMN_TYPE_EMOJI_MAP[self.config.column_type] + logger.info(f"{emoji} Preparing {self.config.column_type} column generation") + logger.info(f" |-- column name: 
{self.config.name!r}") + logger.info(f" |-- model config:\n{self.model_config.model_dump_json(indent=4)}") + if self.model_config.provider is None: + logger.info(f" |-- default model provider: {self._get_provider_name()!r}") + + def _get_provider_name(self) -> str: + model_alias = self.model_config.alias + provider = self.resource_provider.model_registry.get_model_provider(model_alias=model_alias) + return provider.name + + +class WithCompletionGeneration(WithModelGeneration): @functools.cached_property def response_recipe(self) -> ResponseRecipe: return create_response_recipe(self.config, self.model_config) @@ -104,68 +107,3 @@ def generate(self, data: dict) -> dict: data[self.config.name + REASONING_TRACE_COLUMN_POSTFIX] = reasoning_trace return data - - def log_pre_generation(self) -> None: - emoji = COLUMN_TYPE_EMOJI_MAP[self.config.column_type] - logger.info(f"{emoji} Preparing {self.config.column_type} column generation") - logger.info(f" |-- column name: {self.config.name!r}") - logger.info(f" |-- model config:\n{self.model_config.model_dump_json(indent=4)}") - if self.model_config.provider is None: - logger.info(f" |-- default model provider: {self._get_provider_name()!r}") - - def _get_provider_name(self) -> str: - model_alias = self.model_config.alias - provider = self.resource_provider.model_registry.get_model_provider(model_alias=model_alias) - return provider.name - - -class LLMTextCellGenerator(WithLLMGeneration, ColumnGenerator[LLMTextColumnConfig]): - @staticmethod - def metadata() -> GeneratorMetadata: - return GeneratorMetadata( - name="llm_text_generator", - description="Generate a new dataset cell from a prompt template", - generation_strategy=GenerationStrategy.CELL_BY_CELL, - required_resources=[ResourceType.MODEL_REGISTRY], - ) - - -class LLMCodeCellGenerator(WithLLMGeneration, ColumnGenerator[LLMCodeColumnConfig]): - @staticmethod - def metadata() -> GeneratorMetadata: - return GeneratorMetadata( - name="llm_code_generator", - 
description="Generate a new dataset cell from a prompt template", - generation_strategy=GenerationStrategy.CELL_BY_CELL, - required_resources=[ResourceType.MODEL_REGISTRY], - ) - - -class LLMStructuredCellGenerator(WithLLMGeneration, ColumnGenerator[LLMStructuredColumnConfig]): - @staticmethod - def metadata() -> GeneratorMetadata: - return GeneratorMetadata( - name="llm_structured_generator", - description="Generate a new dataset cell from a prompt template", - generation_strategy=GenerationStrategy.CELL_BY_CELL, - required_resources=[ResourceType.MODEL_REGISTRY], - ) - - -class LLMJudgeCellGenerator(WithLLMGeneration, ColumnGenerator[LLMJudgeColumnConfig]): - @staticmethod - def metadata() -> GeneratorMetadata: - return GeneratorMetadata( - name="llm_judge_generator", - description="Judge a new dataset cell based on a set of rubrics", - generation_strategy=GenerationStrategy.CELL_BY_CELL, - required_resources=[ResourceType.MODEL_REGISTRY], - ) - - @property - def max_conversation_correction_steps(self) -> int: - return DEFAULT_MAX_CONVERSATION_CORRECTION_STEPS - - @property - def max_conversation_restarts(self) -> int: - return 2 * DEFAULT_MAX_CONVERSATION_RESTARTS diff --git a/src/data_designer/engine/column_generators/generators/llm_completion_generators.py b/src/data_designer/engine/column_generators/generators/llm_completion_generators.py new file mode 100644 index 00000000..cc61c619 --- /dev/null +++ b/src/data_designer/engine/column_generators/generators/llm_completion_generators.py @@ -0,0 +1,71 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import logging + +from data_designer.config.column_configs import ( + LLMCodeColumnConfig, + LLMJudgeColumnConfig, + LLMStructuredColumnConfig, + LLMTextColumnConfig, +) +from data_designer.engine.column_generators.generators.base import ( + ColumnGenerator, + GenerationStrategy, + GeneratorMetadata, +) +from data_designer.engine.column_generators.generators.generation_mixins import ( + DEFAULT_MAX_CONVERSATION_RESTARTS, + WithCompletionGeneration, +) +from data_designer.engine.resources.resource_provider import ResourceType + +logger = logging.getLogger(__name__) + + +class LLMTextCellGenerator(WithCompletionGeneration, ColumnGenerator[LLMTextColumnConfig]): + @staticmethod + def metadata() -> GeneratorMetadata: + return GeneratorMetadata( + name="llm_text_generator", + description="Generate a new dataset cell from a prompt template", + generation_strategy=GenerationStrategy.CELL_BY_CELL, + required_resources=[ResourceType.MODEL_REGISTRY], + ) + + +class LLMCodeCellGenerator(WithCompletionGeneration, ColumnGenerator[LLMCodeColumnConfig]): + @staticmethod + def metadata() -> GeneratorMetadata: + return GeneratorMetadata( + name="llm_code_generator", + description="Generate a new dataset cell from a prompt template", + generation_strategy=GenerationStrategy.CELL_BY_CELL, + required_resources=[ResourceType.MODEL_REGISTRY], + ) + + +class LLMStructuredCellGenerator(WithCompletionGeneration, ColumnGenerator[LLMStructuredColumnConfig]): + @staticmethod + def metadata() -> GeneratorMetadata: + return GeneratorMetadata( + name="llm_structured_generator", + description="Generate a new dataset cell from a prompt template", + generation_strategy=GenerationStrategy.CELL_BY_CELL, + required_resources=[ResourceType.MODEL_REGISTRY], + ) + + +class LLMJudgeCellGenerator(WithCompletionGeneration, ColumnGenerator[LLMJudgeColumnConfig]): + @staticmethod + def metadata() -> GeneratorMetadata: + return GeneratorMetadata( + 
name="llm_judge_generator", + description="Judge a new dataset cell based on a set of rubrics", + generation_strategy=GenerationStrategy.CELL_BY_CELL, + required_resources=[ResourceType.MODEL_REGISTRY], + ) + + @property + def max_conversation_restarts(self) -> int: + return DEFAULT_MAX_CONVERSATION_RESTARTS * 2 diff --git a/src/data_designer/engine/column_generators/registry.py b/src/data_designer/engine/column_generators/registry.py index 61b43753..56a176ae 100644 --- a/src/data_designer/engine/column_generators/registry.py +++ b/src/data_designer/engine/column_generators/registry.py @@ -13,7 +13,7 @@ from data_designer.config.column_types import DataDesignerColumnType from data_designer.engine.column_generators.generators.base import ColumnGenerator from data_designer.engine.column_generators.generators.expression import ExpressionColumnGenerator -from data_designer.engine.column_generators.generators.llm_generators import ( +from data_designer.engine.column_generators.generators.llm_completion_generators import ( LLMCodeCellGenerator, LLMJudgeCellGenerator, LLMStructuredCellGenerator, diff --git a/src/data_designer/engine/dataset_builders/column_wise_builder.py b/src/data_designer/engine/dataset_builders/column_wise_builder.py index e7060f82..ae6c54cc 100644 --- a/src/data_designer/engine/dataset_builders/column_wise_builder.py +++ b/src/data_designer/engine/dataset_builders/column_wise_builder.py @@ -18,7 +18,7 @@ ProcessorType, ) from data_designer.engine.column_generators.generators.base import ColumnGenerator, GenerationStrategy -from data_designer.engine.column_generators.generators.llm_generators import WithLLMGeneration +from data_designer.engine.column_generators.generators.generation_mixins import WithCompletionGeneration from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage from data_designer.engine.dataset_builders.errors import DatasetGenerationError, DatasetProcessingError from 
data_designer.engine.dataset_builders.multi_column_configs import ( @@ -169,7 +169,7 @@ def _run_from_scratch_column_generator(self, generator: ColumnGenerator) -> None def _run_cell_by_cell_generator(self, generator: ColumnGenerator) -> None: max_workers = MAX_CONCURRENCY_PER_NON_LLM_GENERATOR - if isinstance(generator, WithLLMGeneration): + if isinstance(generator, WithCompletionGeneration): max_workers = generator.inference_parameters.max_parallel_requests self._fan_out_with_threads(generator, max_workers=max_workers) @@ -183,7 +183,7 @@ def _run_model_health_check_if_needed(self) -> bool: set(config.model_alias for config in self.llm_generated_column_configs) ) - def _fan_out_with_threads(self, generator: WithLLMGeneration, max_workers: int) -> None: + def _fan_out_with_threads(self, generator: WithCompletionGeneration, max_workers: int) -> None: if generator.generation_strategy != GenerationStrategy.CELL_BY_CELL: raise DatasetGenerationError( f"Generator {generator.metadata().name} is not a {GenerationStrategy.CELL_BY_CELL} " diff --git a/tests/engine/column_generators/generators/test_llm_generators.py b/tests/engine/column_generators/generators/test_llm_completion_generators.py similarity index 97% rename from tests/engine/column_generators/generators/test_llm_generators.py rename to tests/engine/column_generators/generators/test_llm_completion_generators.py index acaa2c6f..ab398aed 100644 --- a/tests/engine/column_generators/generators/test_llm_generators.py +++ b/tests/engine/column_generators/generators/test_llm_completion_generators.py @@ -11,10 +11,12 @@ LLMStructuredColumnConfig, LLMTextColumnConfig, ) -from data_designer.engine.column_generators.generators.llm_generators import ( +from data_designer.engine.column_generators.generators.generation_mixins import ( DEFAULT_MAX_CONVERSATION_CORRECTION_STEPS, DEFAULT_MAX_CONVERSATION_RESTARTS, REASONING_TRACE_COLUMN_POSTFIX, +) +from data_designer.engine.column_generators.generators.llm_completion_generators 
import ( LLMCodeCellGenerator, LLMJudgeCellGenerator, LLMStructuredCellGenerator, @@ -94,7 +96,7 @@ def test_generate_method(): assert call_args[1]["multi_modal_context"] is None -@patch("data_designer.engine.column_generators.generators.llm_generators.logger", autospec=True) +@patch("data_designer.engine.column_generators.generators.generation_mixins.logger", autospec=True) def test_log_pre_generation(mock_logger): generator, mock_resource_provider, _, mock_model_config, _, _, _ = _create_generator_with_mocks() mock_model_config.model_dump_json.return_value = '{"test": "config"}' diff --git a/tests/engine/column_generators/test_registry.py b/tests/engine/column_generators/test_registry.py index f70b0d90..57457b94 100644 --- a/tests/engine/column_generators/test_registry.py +++ b/tests/engine/column_generators/test_registry.py @@ -3,7 +3,7 @@ from data_designer.config.column_types import DataDesignerColumnType from data_designer.engine.column_generators.generators.expression import ExpressionColumnGenerator -from data_designer.engine.column_generators.generators.llm_generators import ( +from data_designer.engine.column_generators.generators.llm_completion_generators import ( LLMCodeCellGenerator, LLMJudgeCellGenerator, LLMStructuredCellGenerator, From 052db7a41f142c8aa2b28f1701fb0fd3bfaa652f Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 25 Nov 2025 17:21:20 -0700 Subject: [PATCH 07/69] slight refactor --- .../column_generators/generators/base.py | 48 ++++++++ .../generators/generation_mixins.py | 109 ------------------ .../generators/llm_completion_generators.py | 63 +++++++++- .../dataset_builders/column_wise_builder.py | 2 +- .../test_llm_completion_generators.py | 6 +- 5 files changed, 111 insertions(+), 117 deletions(-) delete mode 100644 src/data_designer/engine/column_generators/generators/generation_mixins.py diff --git a/src/data_designer/engine/column_generators/generators/base.py b/src/data_designer/engine/column_generators/generators/base.py 
index f4ddb60c..8977a63b 100644 --- a/src/data_designer/engine/column_generators/generators/base.py +++ b/src/data_designer/engine/column_generators/generators/base.py @@ -2,12 +2,22 @@ # SPDX-License-Identifier: Apache-2.0 from abc import ABC, abstractmethod +import functools +import logging from typing import overload import pandas as pd +from data_designer.config.column_types import COLUMN_TYPE_EMOJI_MAP +from data_designer.config.models import InferenceParameters, ModelConfig from data_designer.config.utils.type_helpers import StrEnum +from data_designer.engine.column_generators.utils.prompt_renderer import ( + RecordBasedPromptRenderer, +) from data_designer.engine.configurable_task import ConfigurableTask, ConfigurableTaskMetadata, DataT, TaskConfigT +from data_designer.engine.models.facade import ModelFacade + +logger = logging.getLogger(__name__) class GenerationStrategy(StrEnum): @@ -59,3 +69,41 @@ def can_generate_from_scratch(self) -> bool: @abstractmethod def generate_from_scratch(self, num_records: int) -> pd.DataFrame: ... 
+ + +class WithModelGeneration: + @functools.cached_property + def model(self) -> ModelFacade: + return self.resource_provider.model_registry.get_model(model_alias=self.config.model_alias) + + @functools.cached_property + def model_config(self) -> ModelConfig: + return self.resource_provider.model_registry.get_model_config(model_alias=self.config.model_alias) + + @functools.cached_property + def inference_parameters(self) -> InferenceParameters: + return self.model_config.inference_parameters + + @functools.cached_property + def prompt_renderer(self) -> RecordBasedPromptRenderer: + return RecordBasedPromptRenderer( + response_recipe=self.response_recipe, + error_message_context={ + "column_name": self.config.name, + "column_type": self.config.column_type, + "model_alias": self.config.model_alias, + }, + ) + + def log_pre_generation(self) -> None: + emoji = COLUMN_TYPE_EMOJI_MAP[self.config.column_type] + logger.info(f"{emoji} Preparing {self.config.column_type} column generation") + logger.info(f" |-- column name: {self.config.name!r}") + logger.info(f" |-- model config:\n{self.model_config.model_dump_json(indent=4)}") + if self.model_config.provider is None: + logger.info(f" |-- default model provider: {self._get_provider_name()!r}") + + def _get_provider_name(self) -> str: + model_alias = self.model_config.alias + provider = self.resource_provider.model_registry.get_model_provider(model_alias=model_alias) + return provider.name diff --git a/src/data_designer/engine/column_generators/generators/generation_mixins.py b/src/data_designer/engine/column_generators/generators/generation_mixins.py deleted file mode 100644 index 4e29a37a..00000000 --- a/src/data_designer/engine/column_generators/generators/generation_mixins.py +++ /dev/null @@ -1,109 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 - -import functools -import logging - -from data_designer.config.column_types import COLUMN_TYPE_EMOJI_MAP -from data_designer.config.models import InferenceParameters, ModelConfig -from data_designer.config.utils.constants import REASONING_TRACE_COLUMN_POSTFIX -from data_designer.engine.column_generators.utils.prompt_renderer import ( - PromptType, - RecordBasedPromptRenderer, - create_response_recipe, -) -from data_designer.engine.models.facade import ModelFacade -from data_designer.engine.models.recipes.base import ResponseRecipe -from data_designer.engine.processing.utils import deserialize_json_values - -DEFAULT_MAX_CONVERSATION_RESTARTS = 5 -DEFAULT_MAX_CONVERSATION_CORRECTION_STEPS = 0 - - -logger = logging.getLogger(__name__) - - -class WithModelGeneration: - @functools.cached_property - def model(self) -> ModelFacade: - return self.resource_provider.model_registry.get_model(model_alias=self.config.model_alias) - - @functools.cached_property - def model_config(self) -> ModelConfig: - return self.resource_provider.model_registry.get_model_config(model_alias=self.config.model_alias) - - @functools.cached_property - def inference_parameters(self) -> InferenceParameters: - return self.model_config.inference_parameters - - @functools.cached_property - def prompt_renderer(self) -> RecordBasedPromptRenderer: - return RecordBasedPromptRenderer( - response_recipe=self.response_recipe, - error_message_context={ - "column_name": self.config.name, - "column_type": self.config.column_type, - "model_alias": self.config.model_alias, - }, - ) - - def log_pre_generation(self) -> None: - emoji = COLUMN_TYPE_EMOJI_MAP[self.config.column_type] - logger.info(f"{emoji} Preparing {self.config.column_type} column generation") - logger.info(f" |-- column name: {self.config.name!r}") - logger.info(f" |-- model config:\n{self.model_config.model_dump_json(indent=4)}") - if self.model_config.provider is None: - logger.info(f" |-- default model 
provider: {self._get_provider_name()!r}") - - def _get_provider_name(self) -> str: - model_alias = self.model_config.alias - provider = self.resource_provider.model_registry.get_model_provider(model_alias=model_alias) - return provider.name - - -class WithCompletionGeneration(WithModelGeneration): - @functools.cached_property - def response_recipe(self) -> ResponseRecipe: - return create_response_recipe(self.config, self.model_config) - - @property - def max_conversation_correction_steps(self) -> int: - return DEFAULT_MAX_CONVERSATION_CORRECTION_STEPS - - @property - def max_conversation_restarts(self) -> int: - return DEFAULT_MAX_CONVERSATION_RESTARTS - - def generate(self, data: dict) -> dict: - deserialized_record = deserialize_json_values(data) - - multi_modal_context = None - if self.config.multi_modal_context is not None and len(self.config.multi_modal_context) > 0: - multi_modal_context = [ - context.get_context(deserialized_record) for context in self.config.multi_modal_context - ] - - response, reasoning_trace = self.model.generate( - prompt=self.prompt_renderer.render( - record=deserialized_record, - prompt_template=self.config.prompt, - prompt_type=PromptType.USER_PROMPT, - ), - system_prompt=self.prompt_renderer.render( - record=deserialized_record, - prompt_template=self.config.system_prompt, - prompt_type=PromptType.SYSTEM_PROMPT, - ), - parser=self.response_recipe.parse, - multi_modal_context=multi_modal_context, - max_correction_steps=self.max_conversation_correction_steps, - max_conversation_restarts=self.max_conversation_restarts, - purpose=f"running generation for column '{self.config.name}'", - ) - - data[self.config.name] = deserialize_json_values(self.response_recipe.serialize_output(response)) - - if reasoning_trace: - data[self.config.name + REASONING_TRACE_COLUMN_POSTFIX] = reasoning_trace - - return data diff --git a/src/data_designer/engine/column_generators/generators/llm_completion_generators.py 
b/src/data_designer/engine/column_generators/generators/llm_completion_generators.py index cc61c619..5665ba85 100644 --- a/src/data_designer/engine/column_generators/generators/llm_completion_generators.py +++ b/src/data_designer/engine/column_generators/generators/llm_completion_generators.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +import functools import logging from data_designer.config.column_configs import ( @@ -9,20 +10,76 @@ LLMStructuredColumnConfig, LLMTextColumnConfig, ) +from data_designer.config.utils.constants import REASONING_TRACE_COLUMN_POSTFIX from data_designer.engine.column_generators.generators.base import ( ColumnGenerator, GenerationStrategy, GeneratorMetadata, + WithModelGeneration, ) -from data_designer.engine.column_generators.generators.generation_mixins import ( - DEFAULT_MAX_CONVERSATION_RESTARTS, - WithCompletionGeneration, +from data_designer.engine.column_generators.utils.prompt_renderer import ( + PromptType, + create_response_recipe, ) +from data_designer.engine.models.recipes.base import ResponseRecipe +from data_designer.engine.processing.utils import deserialize_json_values from data_designer.engine.resources.resource_provider import ResourceType logger = logging.getLogger(__name__) +DEFAULT_MAX_CONVERSATION_RESTARTS = 5 +DEFAULT_MAX_CONVERSATION_CORRECTION_STEPS = 0 + + +class WithCompletionGeneration(WithModelGeneration): + @functools.cached_property + def response_recipe(self) -> ResponseRecipe: + return create_response_recipe(self.config, self.model_config) + + @property + def max_conversation_correction_steps(self) -> int: + return DEFAULT_MAX_CONVERSATION_CORRECTION_STEPS + + @property + def max_conversation_restarts(self) -> int: + return DEFAULT_MAX_CONVERSATION_RESTARTS + + def generate(self, data: dict) -> dict: + deserialized_record = deserialize_json_values(data) + + multi_modal_context = None + if 
self.config.multi_modal_context is not None and len(self.config.multi_modal_context) > 0: + multi_modal_context = [ + context.get_context(deserialized_record) for context in self.config.multi_modal_context + ] + + response, reasoning_trace = self.model.generate( + prompt=self.prompt_renderer.render( + record=deserialized_record, + prompt_template=self.config.prompt, + prompt_type=PromptType.USER_PROMPT, + ), + system_prompt=self.prompt_renderer.render( + record=deserialized_record, + prompt_template=self.config.system_prompt, + prompt_type=PromptType.SYSTEM_PROMPT, + ), + parser=self.response_recipe.parse, + multi_modal_context=multi_modal_context, + max_correction_steps=self.max_conversation_correction_steps, + max_conversation_restarts=self.max_conversation_restarts, + purpose=f"running generation for column '{self.config.name}'", + ) + + data[self.config.name] = deserialize_json_values(self.response_recipe.serialize_output(response)) + + if reasoning_trace: + data[self.config.name + REASONING_TRACE_COLUMN_POSTFIX] = reasoning_trace + + return data + + class LLMTextCellGenerator(WithCompletionGeneration, ColumnGenerator[LLMTextColumnConfig]): @staticmethod def metadata() -> GeneratorMetadata: diff --git a/src/data_designer/engine/dataset_builders/column_wise_builder.py b/src/data_designer/engine/dataset_builders/column_wise_builder.py index ae6c54cc..78a5e9fa 100644 --- a/src/data_designer/engine/dataset_builders/column_wise_builder.py +++ b/src/data_designer/engine/dataset_builders/column_wise_builder.py @@ -18,7 +18,7 @@ ProcessorType, ) from data_designer.engine.column_generators.generators.base import ColumnGenerator, GenerationStrategy -from data_designer.engine.column_generators.generators.generation_mixins import WithCompletionGeneration +from data_designer.engine.column_generators.generators.llm_completion_generators import WithCompletionGeneration from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage from 
data_designer.engine.dataset_builders.errors import DatasetGenerationError, DatasetProcessingError from data_designer.engine.dataset_builders.multi_column_configs import ( diff --git a/tests/engine/column_generators/generators/test_llm_completion_generators.py b/tests/engine/column_generators/generators/test_llm_completion_generators.py index ab398aed..3a411fc9 100644 --- a/tests/engine/column_generators/generators/test_llm_completion_generators.py +++ b/tests/engine/column_generators/generators/test_llm_completion_generators.py @@ -11,12 +11,10 @@ LLMStructuredColumnConfig, LLMTextColumnConfig, ) -from data_designer.engine.column_generators.generators.generation_mixins import ( +from data_designer.engine.column_generators.generators.llm_completion_generators import ( DEFAULT_MAX_CONVERSATION_CORRECTION_STEPS, DEFAULT_MAX_CONVERSATION_RESTARTS, REASONING_TRACE_COLUMN_POSTFIX, -) -from data_designer.engine.column_generators.generators.llm_completion_generators import ( LLMCodeCellGenerator, LLMJudgeCellGenerator, LLMStructuredCellGenerator, @@ -96,7 +94,7 @@ def test_generate_method(): assert call_args[1]["multi_modal_context"] is None -@patch("data_designer.engine.column_generators.generators.generation_mixins.logger", autospec=True) +@patch("data_designer.engine.column_generators.generators.base.logger", autospec=True) def test_log_pre_generation(mock_logger): generator, mock_resource_provider, _, mock_model_config, _, _, _ = _create_generator_with_mocks() mock_model_config.model_dump_json.return_value = '{"test": "config"}' From 5504c8dd1b4745e27c6590e2424cc4cb26a7944d Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 25 Nov 2025 18:17:07 -0700 Subject: [PATCH 08/69] Added embedding generator --- src/data_designer/config/column_configs.py | 23 +++++++++++++ src/data_designer/config/column_types.py | 8 +++++ src/data_designer/config/models.py | 2 +- .../config/utils/visualization.py | 15 +++++++- .../generators/embedding_generators.py | 34 
+++++++++++++++++++ .../engine/column_generators/registry.py | 3 ++ src/data_designer/engine/models/facade.py | 6 +++- src/data_designer/engine/models/registry.py | 29 ++++++++++------ src/data_designer/essentials/__init__.py | 4 +++ 9 files changed, 111 insertions(+), 13 deletions(-) create mode 100644 src/data_designer/engine/column_generators/generators/embedding_generators.py diff --git a/src/data_designer/config/column_configs.py b/src/data_designer/config/column_configs.py index d19b6a9e..c5468f19 100644 --- a/src/data_designer/config/column_configs.py +++ b/src/data_designer/config/column_configs.py @@ -377,3 +377,26 @@ class SeedDatasetColumnConfig(SingleColumnConfig): """ column_type: Literal["seed-dataset"] = "seed-dataset" + + +class EmbeddingColumnConfig(SingleColumnConfig): + """Configuration for embedding generation columns. + + Embedding columns generate embeddings for text input using a specified model. + + Attributes: + column_type: Discriminator field, always "embedding" for this configuration type. + target_column: The column to generate embeddings for. + model_alias: The model to use for embedding generation. + chunk_separator: Optional separator to split the text in the target column into chunks. For example, if chunk_separator + is '\n', the text will be split into chunks of text separated by newlines and embeddings generated for each chunk. 
+ """ + + column_type: Literal["embedding"] = "embedding" + target_column: str + model_alias: str + chunk_separator: Optional[str] = None + + @property + def required_columns(self) -> list[str]: + return [self.target_column] diff --git a/src/data_designer/config/column_types.py b/src/data_designer/config/column_types.py index 50ba498d..aab55c4d 100644 --- a/src/data_designer/config/column_types.py +++ b/src/data_designer/config/column_types.py @@ -7,6 +7,7 @@ from ..plugin_manager import PluginManager from .column_configs import ( + EmbeddingColumnConfig, ExpressionColumnConfig, LLMCodeColumnConfig, LLMJudgeColumnConfig, @@ -31,6 +32,7 @@ SamplerColumnConfig, SeedDatasetColumnConfig, ValidationColumnConfig, + EmbeddingColumnConfig, ] ColumnConfigT = plugin_manager.inject_into_column_config_type_union(ColumnConfigT) @@ -50,6 +52,7 @@ DataDesignerColumnType.SEED_DATASET: "🌱", DataDesignerColumnType.SAMPLER: "🎲", DataDesignerColumnType.VALIDATION: "🔍", + DataDesignerColumnType.EMBEDDING: "🧬", } COLUMN_TYPE_EMOJI_MAP.update( {DataDesignerColumnType(p.name): p.emoji for p in plugin_manager.get_column_generator_plugins()} @@ -66,6 +69,7 @@ def column_type_used_in_execution_dag(column_type: Union[str, DataDesignerColumn DataDesignerColumnType.LLM_STRUCTURED, DataDesignerColumnType.LLM_TEXT, DataDesignerColumnType.VALIDATION, + DataDesignerColumnType.EMBEDDING, } dag_column_types.update(plugin_manager.get_plugin_column_types(DataDesignerColumnType)) return column_type in dag_column_types @@ -79,6 +83,7 @@ def column_type_is_llm_generated(column_type: Union[str, DataDesignerColumnType] DataDesignerColumnType.LLM_CODE, DataDesignerColumnType.LLM_STRUCTURED, DataDesignerColumnType.LLM_JUDGE, + DataDesignerColumnType.EMBEDDING, } llm_generated_column_types.update( plugin_manager.get_plugin_column_types( @@ -117,6 +122,8 @@ def get_column_config_from_kwargs(name: str, column_type: DataDesignerColumnType return SamplerColumnConfig(name=name, **_resolve_sampler_kwargs(name,
kwargs)) if column_type == DataDesignerColumnType.SEED_DATASET: return SeedDatasetColumnConfig(name=name, **kwargs) + if column_type == DataDesignerColumnType.EMBEDDING: + return EmbeddingColumnConfig(name=name, **kwargs) if plugin := plugin_manager.get_column_generator_plugin_if_exists(column_type.value): return plugin.config_cls(name=name, **kwargs) raise InvalidColumnTypeError(f"🛑 {column_type} is not a valid column type.") # pragma: no cover @@ -131,6 +138,7 @@ def get_column_display_order() -> list[DataDesignerColumnType]: DataDesignerColumnType.LLM_CODE, DataDesignerColumnType.LLM_STRUCTURED, DataDesignerColumnType.LLM_JUDGE, + DataDesignerColumnType.EMBEDDING, DataDesignerColumnType.VALIDATION, DataDesignerColumnType.EXPRESSION, ] diff --git a/src/data_designer/config/models.py b/src/data_designer/config/models.py index 17698346..481633ac 100644 --- a/src/data_designer/config/models.py +++ b/src/data_designer/config/models.py @@ -207,7 +207,7 @@ def _is_value_in_range(self, value: float, min_value: float, max_value: float) - class GenerationType(str, Enum): CHAT_COMPLETION = "chat-completion" - TEXT_EMBEDDING = "text-embedding" + EMBEDDING = "embedding" IMAGE_GENERATION = "image-generation" diff --git a/src/data_designer/config/utils/visualization.py b/src/data_designer/config/utils/visualization.py index 26ab4ad3..0972daf7 100644 --- a/src/data_designer/config/utils/visualization.py +++ b/src/data_designer/config/utils/visualization.py @@ -8,7 +8,7 @@ from functools import cached_property import json import os -from typing import TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union import numpy as np import pandas as pd @@ -171,6 +171,7 @@ def display_sample_record( + config_builder.get_columns_of_type(DataDesignerColumnType.EXPRESSION) + config_builder.get_columns_of_type(DataDesignerColumnType.LLM_TEXT) + config_builder.get_columns_of_type(DataDesignerColumnType.LLM_STRUCTURED) + +
config_builder.get_columns_of_type(DataDesignerColumnType.EMBEDDING) ) if len(non_code_columns) > 0: table = Table(title="Generated Columns", **table_kws) @@ -178,6 +179,10 @@ def display_sample_record( table.add_column("Value") for col in non_code_columns: if not col.drop: + if col.column_type == DataDesignerColumnType.EMBEDDING: + record[col.name]["embeddings"] = [ + get_truncated_list_as_string(embd) for embd in record[col.name].get("embeddings") + ] table.add_row(col.name, convert_to_row_element(record[col.name])) render_list.append(pad_console_element(table)) @@ -237,6 +242,14 @@ def display_sample_record( console.print(Group(*render_list), markup=False) +def get_truncated_list_as_string(long_list: list[Any], max_items: int = 2) -> str: + if len(long_list) > max_items: + truncated_part = long_list[:max_items] + return f"[{', '.join(str(x) for x in truncated_part)} ...]" + else: + return str(long_list) + + def display_sampler_table( sampler_params: dict[SamplerType, ConfigBase], title: Optional[str] = None, diff --git a/src/data_designer/engine/column_generators/generators/embedding_generators.py b/src/data_designer/engine/column_generators/generators/embedding_generators.py new file mode 100644 index 00000000..ec827805 --- /dev/null +++ b/src/data_designer/engine/column_generators/generators/embedding_generators.py @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from data_designer.config.column_configs import EmbeddingColumnConfig +from data_designer.engine.column_generators.generators.base import ( + ColumnGenerator, + GenerationStrategy, + GeneratorMetadata, + WithModelGeneration, +) +from data_designer.engine.processing.utils import deserialize_json_values + + +class EmbeddingCellGenerator(WithModelGeneration, ColumnGenerator[EmbeddingColumnConfig]): + @staticmethod + def metadata() -> GeneratorMetadata: + return GeneratorMetadata( + name="embedding_cell_generator", + description="Generate embeddings for a text column.", + generation_strategy=GenerationStrategy.CELL_BY_CELL, + required_resources=None, + ) + + def generate(self, data: dict) -> dict: + deserialized_record = deserialize_json_values(data) + input_text = deserialized_record[self.config.target_column] + input_chunks = input_text.split(self.config.chunk_separator) if self.config.chunk_separator else [input_text] + embeddings = self.model.generate_text_embeddings(input_texts=input_chunks) + data[self.config.name] = { + "embeddings": embeddings, + "num_embeddings": len(embeddings), + "dimension": len(embeddings[0]) if len(embeddings) > 0 else 0, + } + return data diff --git a/src/data_designer/engine/column_generators/registry.py b/src/data_designer/engine/column_generators/registry.py index 56a176ae..961eac1a 100644 --- a/src/data_designer/engine/column_generators/registry.py +++ b/src/data_designer/engine/column_generators/registry.py @@ -3,6 +3,7 @@ from data_designer.config.base import ConfigBase from data_designer.config.column_configs import ( + EmbeddingColumnConfig, ExpressionColumnConfig, LLMCodeColumnConfig, LLMJudgeColumnConfig, @@ -12,6 +13,7 @@ ) from data_designer.config.column_types import DataDesignerColumnType from data_designer.engine.column_generators.generators.base import ColumnGenerator +from data_designer.engine.column_generators.generators.embedding_generators import EmbeddingCellGenerator from 
data_designer.engine.column_generators.generators.expression import ExpressionColumnGenerator from data_designer.engine.column_generators.generators.llm_completion_generators import ( LLMCodeCellGenerator, @@ -40,6 +42,7 @@ def create_default_column_generator_registry(with_plugins: bool = True) -> Colum registry.register(DataDesignerColumnType.LLM_CODE, LLMCodeCellGenerator, LLMCodeColumnConfig) registry.register(DataDesignerColumnType.LLM_JUDGE, LLMJudgeCellGenerator, LLMJudgeColumnConfig) registry.register(DataDesignerColumnType.EXPRESSION, ExpressionColumnGenerator, ExpressionColumnConfig) + registry.register(DataDesignerColumnType.EMBEDDING, EmbeddingCellGenerator, EmbeddingColumnConfig) registry.register(DataDesignerColumnType.SAMPLER, SamplerColumnGenerator, SamplerMultiColumnConfig) registry.register(DataDesignerColumnType.SEED_DATASET, SeedDatasetColumnGenerator, SeedDatasetMultiColumnConfig) registry.register(DataDesignerColumnType.VALIDATION, ValidationColumnGenerator, ValidationColumnConfig) diff --git a/src/data_designer/engine/models/facade.py b/src/data_designer/engine/models/facade.py index ea72d4c3..c205a4ca 100644 --- a/src/data_designer/engine/models/facade.py +++ b/src/data_designer/engine/models/facade.py @@ -11,7 +11,7 @@ from litellm.types.router import DeploymentTypedDict, LiteLLM_Params from litellm.types.utils import EmbeddingResponse, ModelResponse -from data_designer.config.models import ModelConfig, ModelProvider +from data_designer.config.models import GenerationType, ModelConfig, ModelProvider from data_designer.engine.model_provider import ModelProviderRegistry from data_designer.engine.models.errors import ( GenerationValidationFailureError, @@ -49,6 +49,10 @@ def model_name(self) -> str: def model_provider(self) -> ModelProvider: return self._model_provider_registry.get_provider(self._model_config.provider) + @property + def model_generation_type(self) -> GenerationType: + return self._model_config.generation_type + @property def 
model_provider_name(self) -> str: return self.model_provider.name diff --git a/src/data_designer/engine/models/registry.py b/src/data_designer/engine/models/registry.py index aafd8c80..4330ea18 100644 --- a/src/data_designer/engine/models/registry.py +++ b/src/data_designer/engine/models/registry.py @@ -5,7 +5,7 @@ import logging -from data_designer.config.models import ModelConfig +from data_designer.config.models import GenerationType, ModelConfig from data_designer.engine.model_provider import ModelProvider, ModelProviderRegistry from data_designer.engine.models.facade import ModelFacade from data_designer.engine.models.litellm_overrides import apply_litellm_patches @@ -81,15 +81,24 @@ def run_health_check(self, model_aliases: set[str]) -> None: f" |-- πŸ‘€ Checking {model.model_name!r} in provider named {model.model_provider_name!r} for model alias {model.model_alias!r}..." ) try: - model.generate( - prompt="Hello!", - parser=lambda x: x, - system_prompt="You are a helpful assistant.", - max_correction_steps=0, - max_conversation_restarts=0, - skip_usage_tracking=True, - purpose="running health checks", - ) + if model.model_generation_type == GenerationType.EMBEDDING: + model.generate_text_embeddings( + input_texts=["Hello!"], + skip_usage_tracking=True, + purpose="running health checks", + ) + elif model.model_generation_type == GenerationType.CHAT_COMPLETION: + model.generate( + prompt="Hello!", + parser=lambda x: x, + system_prompt="You are a helpful assistant.", + max_correction_steps=0, + max_conversation_restarts=0, + skip_usage_tracking=True, + purpose="running health checks", + ) + else: + raise ValueError(f"Unsupported generation type: {model.model_generation_type}") logger.info(" |-- βœ… Passed!") except Exception as e: logger.error(" |-- ❌ Failed!") diff --git a/src/data_designer/essentials/__init__.py b/src/data_designer/essentials/__init__.py index 8cd8eb92..ee43519c 100644 --- a/src/data_designer/essentials/__init__.py +++ 
b/src/data_designer/essentials/__init__.py @@ -6,6 +6,7 @@ from ..config.analysis.column_profilers import JudgeScoreProfilerConfig from ..config.column_configs import ( + EmbeddingColumnConfig, ExpressionColumnConfig, LLMCodeColumnConfig, LLMJudgeColumnConfig, @@ -22,6 +23,7 @@ from ..config.dataset_builders import BuildStage from ..config.datastore import DatastoreSettings from ..config.models import ( + GenerationType, ImageContext, ImageFormat, InferenceParameters, @@ -91,8 +93,10 @@ "DatastoreSettings", "DatetimeSamplerParams", "DropColumnsProcessorConfig", + "EmbeddingColumnConfig", "ExpressionColumnConfig", "GaussianSamplerParams", + "GenerationType", "IndexRange", "InfoType", "ImageContext", From 4b6f877875fa93f718d31211323f9a34207630b7 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 25 Nov 2025 18:20:57 -0700 Subject: [PATCH 09/69] chunk_separator -> chunk_pattern --- src/data_designer/config/column_configs.py | 7 ++++--- .../column_generators/generators/embedding_generators.py | 4 +++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/data_designer/config/column_configs.py b/src/data_designer/config/column_configs.py index c5468f19..339be35d 100644 --- a/src/data_designer/config/column_configs.py +++ b/src/data_designer/config/column_configs.py @@ -388,14 +388,15 @@ class EmbeddingColumnConfig(SingleColumnConfig): column_type: Discriminator field, always "embedding" for this configuration type. target_column: The column to generate embeddings for. model_alias: The model to use for embedding generation. - chunk_separator: Optional separator to split the text in the target column into chunks. For example, if chunk_separator - is '\n', the text will be split into chunks of text separated by newlines and embeddings generated for each chunk. + chunk_pattern: Optional regex pattern to split the text in the target column into chunks. 
For example, if chunk_pattern + is r'\n+', the text will be split into chunks using one or more newlines as separators and embeddings generated for each chunk. + If not provided, the entire text will be embedded as a single chunk. """ column_type: Literal["embedding"] = "embedding" target_column: str model_alias: str - chunk_separator: Optional[str] = None + chunk_pattern: Optional[str] = None @property def required_columns(self) -> list[str]: diff --git a/src/data_designer/engine/column_generators/generators/embedding_generators.py b/src/data_designer/engine/column_generators/generators/embedding_generators.py index ec827805..ac791d4f 100644 --- a/src/data_designer/engine/column_generators/generators/embedding_generators.py +++ b/src/data_designer/engine/column_generators/generators/embedding_generators.py @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +import re + from data_designer.config.column_configs import EmbeddingColumnConfig from data_designer.engine.column_generators.generators.base import ( ColumnGenerator, @@ -24,7 +26,7 @@ def metadata() -> GeneratorMetadata: def generate(self, data: dict) -> dict: deserialized_record = deserialize_json_values(data) input_text = deserialized_record[self.config.target_column] - input_chunks = input_text.split(self.config.chunk_separator) if self.config.chunk_separator else [input_text] + input_chunks = re.split(self.config.chunk_pattern, input_text) if self.config.chunk_pattern else [input_text] embeddings = self.model.generate_text_embeddings(input_texts=input_chunks) data[self.config.name] = { "embeddings": embeddings, From 04fc0f3645062f15b392b70cc64feea2e1d11cab Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 25 Nov 2025 18:22:49 -0700 Subject: [PATCH 10/69] update tests --- tests/config/test_columns.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/config/test_columns.py 
b/tests/config/test_columns.py index f0f5c51a..f7763b07 100644 --- a/tests/config/test_columns.py +++ b/tests/config/test_columns.py @@ -49,6 +49,7 @@ def test_data_designer_column_type_get_display_order(): DataDesignerColumnType.LLM_CODE, DataDesignerColumnType.LLM_STRUCTURED, DataDesignerColumnType.LLM_JUDGE, + DataDesignerColumnType.EMBEDDING, DataDesignerColumnType.VALIDATION, DataDesignerColumnType.EXPRESSION, ] @@ -59,6 +60,7 @@ def test_data_designer_column_type_is_llm_generated(): assert column_type_is_llm_generated(DataDesignerColumnType.LLM_CODE) assert column_type_is_llm_generated(DataDesignerColumnType.LLM_STRUCTURED) assert column_type_is_llm_generated(DataDesignerColumnType.LLM_JUDGE) + assert column_type_is_llm_generated(DataDesignerColumnType.EMBEDDING) assert not column_type_is_llm_generated(DataDesignerColumnType.SAMPLER) assert not column_type_is_llm_generated(DataDesignerColumnType.VALIDATION) assert not column_type_is_llm_generated(DataDesignerColumnType.EXPRESSION) @@ -72,6 +74,7 @@ def test_data_designer_column_type_is_in_dag(): assert column_type_used_in_execution_dag(DataDesignerColumnType.LLM_STRUCTURED) assert column_type_used_in_execution_dag(DataDesignerColumnType.LLM_TEXT) assert column_type_used_in_execution_dag(DataDesignerColumnType.VALIDATION) + assert column_type_used_in_execution_dag(DataDesignerColumnType.EMBEDDING) assert not column_type_used_in_execution_dag(DataDesignerColumnType.SAMPLER) assert not column_type_used_in_execution_dag(DataDesignerColumnType.SEED_DATASET) From 26d6da1917326fbb57a6e88cf3392145a4f69362 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 26 Nov 2025 09:44:05 -0700 Subject: [PATCH 11/69] rename for consistency --- .../generators/{embedding_generators.py => embedding.py} | 3 ++- .../{llm_completion_generators.py => llm_completion.py} | 0 src/data_designer/engine/column_generators/registry.py | 4 ++-- .../engine/dataset_builders/column_wise_builder.py | 2 +- 
.../generators/test_llm_completion_generators.py | 2 +- tests/engine/column_generators/test_registry.py | 2 +- 6 files changed, 7 insertions(+), 6 deletions(-) rename src/data_designer/engine/column_generators/generators/{embedding_generators.py => embedding.py} (91%) rename src/data_designer/engine/column_generators/generators/{llm_completion_generators.py => llm_completion.py} (100%) diff --git a/src/data_designer/engine/column_generators/generators/embedding_generators.py b/src/data_designer/engine/column_generators/generators/embedding.py similarity index 91% rename from src/data_designer/engine/column_generators/generators/embedding_generators.py rename to src/data_designer/engine/column_generators/generators/embedding.py index ac791d4f..d9981ccd 100644 --- a/src/data_designer/engine/column_generators/generators/embedding_generators.py +++ b/src/data_designer/engine/column_generators/generators/embedding.py @@ -11,6 +11,7 @@ WithModelGeneration, ) from data_designer.engine.processing.utils import deserialize_json_values +from data_designer.engine.resources.resource_provider import ResourceType class EmbeddingCellGenerator(WithModelGeneration, ColumnGenerator[EmbeddingColumnConfig]): @@ -20,7 +21,7 @@ def metadata() -> GeneratorMetadata: name="embedding_cell_generator", description="Generate embeddings for a text column.", generation_strategy=GenerationStrategy.CELL_BY_CELL, - required_resources=None, + required_resources=[ResourceType.MODEL_REGISTRY], ) def generate(self, data: dict) -> dict: diff --git a/src/data_designer/engine/column_generators/generators/llm_completion_generators.py b/src/data_designer/engine/column_generators/generators/llm_completion.py similarity index 100% rename from src/data_designer/engine/column_generators/generators/llm_completion_generators.py rename to src/data_designer/engine/column_generators/generators/llm_completion.py diff --git a/src/data_designer/engine/column_generators/registry.py 
b/src/data_designer/engine/column_generators/registry.py index 961eac1a..7171e561 100644 --- a/src/data_designer/engine/column_generators/registry.py +++ b/src/data_designer/engine/column_generators/registry.py @@ -13,9 +13,9 @@ ) from data_designer.config.column_types import DataDesignerColumnType from data_designer.engine.column_generators.generators.base import ColumnGenerator -from data_designer.engine.column_generators.generators.embedding_generators import EmbeddingCellGenerator +from data_designer.engine.column_generators.generators.embedding import EmbeddingCellGenerator from data_designer.engine.column_generators.generators.expression import ExpressionColumnGenerator -from data_designer.engine.column_generators.generators.llm_completion_generators import ( +from data_designer.engine.column_generators.generators.llm_completion import ( LLMCodeCellGenerator, LLMJudgeCellGenerator, LLMStructuredCellGenerator, diff --git a/src/data_designer/engine/dataset_builders/column_wise_builder.py b/src/data_designer/engine/dataset_builders/column_wise_builder.py index 78a5e9fa..ff9289ee 100644 --- a/src/data_designer/engine/dataset_builders/column_wise_builder.py +++ b/src/data_designer/engine/dataset_builders/column_wise_builder.py @@ -18,7 +18,7 @@ ProcessorType, ) from data_designer.engine.column_generators.generators.base import ColumnGenerator, GenerationStrategy -from data_designer.engine.column_generators.generators.llm_completion_generators import WithCompletionGeneration +from data_designer.engine.column_generators.generators.llm_completion import WithCompletionGeneration from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage from data_designer.engine.dataset_builders.errors import DatasetGenerationError, DatasetProcessingError from data_designer.engine.dataset_builders.multi_column_configs import ( diff --git a/tests/engine/column_generators/generators/test_llm_completion_generators.py 
b/tests/engine/column_generators/generators/test_llm_completion_generators.py index 3a411fc9..0b787b7e 100644 --- a/tests/engine/column_generators/generators/test_llm_completion_generators.py +++ b/tests/engine/column_generators/generators/test_llm_completion_generators.py @@ -11,7 +11,7 @@ LLMStructuredColumnConfig, LLMTextColumnConfig, ) -from data_designer.engine.column_generators.generators.llm_completion_generators import ( +from data_designer.engine.column_generators.generators.llm_completion import ( DEFAULT_MAX_CONVERSATION_CORRECTION_STEPS, DEFAULT_MAX_CONVERSATION_RESTARTS, REASONING_TRACE_COLUMN_POSTFIX, diff --git a/tests/engine/column_generators/test_registry.py b/tests/engine/column_generators/test_registry.py index 57457b94..0d325937 100644 --- a/tests/engine/column_generators/test_registry.py +++ b/tests/engine/column_generators/test_registry.py @@ -3,7 +3,7 @@ from data_designer.config.column_types import DataDesignerColumnType from data_designer.engine.column_generators.generators.expression import ExpressionColumnGenerator -from data_designer.engine.column_generators.generators.llm_completion_generators import ( +from data_designer.engine.column_generators.generators.llm_completion import ( LLMCodeCellGenerator, LLMJudgeCellGenerator, LLMStructuredCellGenerator, From 6facbd2c8a710052fc76c3c33c3c451dca04c697 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 26 Nov 2025 11:04:00 -0700 Subject: [PATCH 12/69] Restructure InferenceParameters -> CompletionInferenceParameters, BaseInferenceParameters, EmbeddingInferenceParameters --- .../config/default_model_settings.py | 18 +++-- src/data_designer/config/models.py | 79 ++++++++++++++++--- src/data_designer/essentials/__init__.py | 6 +- tests/cli/conftest.py | 10 +-- .../cli/controllers/test_model_controller.py | 4 +- tests/cli/services/test_model_service.py | 10 ++- tests/config/test_config_builder.py | 12 +-- tests/config/test_default_model_settings.py | 8 +- tests/config/test_models.py | 64 
++++++++------- tests/conftest.py | 4 +- tests/engine/models/conftest.py | 6 +- tests/engine/models/test_model_registry.py | 6 +- tests/essentials/test_init.py | 12 +++ 13 files changed, 162 insertions(+), 77 deletions(-) diff --git a/src/data_designer/config/default_model_settings.py b/src/data_designer/config/default_model_settings.py index 33d6dad4..cb565178 100644 --- a/src/data_designer/config/default_model_settings.py +++ b/src/data_designer/config/default_model_settings.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Any, Literal, Optional -from .models import InferenceParameters, ModelConfig, ModelProvider +from .models import CompletionInferenceParameters, ModelConfig, ModelProvider from .utils.constants import ( MANAGED_ASSETS_PATH, MODEL_CONFIGS_FILE_PATH, @@ -21,28 +21,30 @@ logger = logging.getLogger(__name__) -def get_default_text_alias_inference_parameters() -> InferenceParameters: - return InferenceParameters( +def get_default_text_alias_inference_parameters() -> CompletionInferenceParameters: + return CompletionInferenceParameters( temperature=0.85, top_p=0.95, ) -def get_default_reasoning_alias_inference_parameters() -> InferenceParameters: - return InferenceParameters( +def get_default_reasoning_alias_inference_parameters() -> CompletionInferenceParameters: + return CompletionInferenceParameters( temperature=0.35, top_p=0.95, ) -def get_default_vision_alias_inference_parameters() -> InferenceParameters: - return InferenceParameters( +def get_default_vision_alias_inference_parameters() -> CompletionInferenceParameters: + return CompletionInferenceParameters( temperature=0.85, top_p=0.95, ) -def get_default_inference_parameters(model_alias: Literal["text", "reasoning", "vision"]) -> InferenceParameters: +def get_default_inference_parameters( + model_alias: Literal["text", "reasoning", "vision"], +) -> CompletionInferenceParameters: if model_alias == "reasoning": return get_default_reasoning_alias_inference_parameters() elif 
model_alias == "vision": diff --git a/src/data_designer/config/models.py b/src/data_designer/config/models.py index 481633ac..1df7055e 100644 --- a/src/data_designer/config/models.py +++ b/src/data_designer/config/models.py @@ -5,7 +5,7 @@ from enum import Enum import logging from pathlib import Path -from typing import Any, Generic, List, Optional, TypeVar, Union +from typing import Any, Generic, List, Literal, Optional, TypeVar, Union import numpy as np from pydantic import BaseModel, Field, model_validator @@ -136,10 +136,7 @@ def sample(self) -> float: DistributionT: TypeAlias = Union[UniformDistribution, ManualDistribution] -class InferenceParameters(ConfigBase): - temperature: Optional[Union[float, DistributionT]] = None - top_p: Optional[Union[float, DistributionT]] = None - max_tokens: Optional[int] = Field(default=None, ge=1) +class BaseInferenceParameters(ConfigBase, ABC): max_parallel_requests: int = Field(default=4, ge=1) timeout: Optional[int] = Field(default=None, ge=1) extra_body: Optional[dict[str, Any]] = None @@ -147,6 +144,21 @@ class InferenceParameters(ConfigBase): @property def generate_kwargs(self) -> dict[str, Union[float, int]]: result = {} + if self.timeout is not None: + result["timeout"] = self.timeout + if self.extra_body is not None and self.extra_body != {}: + result["extra_body"] = self.extra_body + return result + + +class CompletionInferenceParameters(BaseInferenceParameters): + temperature: Optional[Union[float, DistributionT]] = None + top_p: Optional[Union[float, DistributionT]] = None + max_tokens: Optional[int] = Field(default=None, ge=1) + + @property + def generate_kwargs(self) -> dict[str, Union[float, int]]: + result = super().generate_kwargs if self.temperature is not None: result["temperature"] = ( self.temperature.sample() if hasattr(self.temperature, "sample") else self.temperature @@ -155,10 +167,6 @@ def generate_kwargs(self) -> dict[str, Union[float, int]]: result["top_p"] = self.top_p.sample() if 
hasattr(self.top_p, "sample") else self.top_p if self.max_tokens is not None: result["max_tokens"] = self.max_tokens - if self.timeout is not None: - result["timeout"] = self.timeout - if self.extra_body is not None and self.extra_body != {}: - result["extra_body"] = self.extra_body return result @model_validator(mode="after") @@ -205,6 +213,40 @@ def _is_value_in_range(self, value: float, min_value: float, max_value: float) - return min_value <= value <= max_value +# Maintain backwards compatibility with a deprecation warning +class InferenceParameters(CompletionInferenceParameters): + """ + Deprecated: Use CompletionInferenceParameters instead. + This alias will be removed in a future version. + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + logger.warning( + "InferenceParameters is deprecated and will be removed in a future version. " + "Use CompletionInferenceParameters instead." + ) + super().__init__(*args, **kwargs) + + +class EmbeddingInferenceParameters(BaseInferenceParameters): + encoding_format: Optional[Literal["float", "base64"]] = "float" + dimensions: Optional[int] = None + + @property + def generate_kwargs(self) -> dict[str, Union[float, int]]: + result = super().generate_kwargs + if self.encoding_format is not None: + result["encoding_format"] = self.encoding_format + if self.dimensions is not None: + result["dimensions"] = self.dimensions + return result + + +InferenceParametersT: TypeAlias = Union[ + InferenceParameters, CompletionInferenceParameters, EmbeddingInferenceParameters +] + + class GenerationType(str, Enum): CHAT_COMPLETION = "chat-completion" EMBEDDING = "embedding" @@ -214,10 +256,25 @@ class GenerationType(str, Enum): class ModelConfig(ConfigBase): alias: str model: str - inference_parameters: InferenceParameters = Field(default_factory=InferenceParameters) - generation_type: GenerationType = GenerationType.CHAT_COMPLETION + inference_parameters: InferenceParametersT = 
Field(default_factory=CompletionInferenceParameters) provider: Optional[str] = None + @model_validator(mode="after") + def _normalize_deprecated_inference_parameters(self) -> Self: + """Normalize deprecated InferenceParameters to CompletionInferenceParameters.""" + if isinstance(self.inference_parameters, InferenceParameters): + self.inference_parameters = CompletionInferenceParameters(**self.inference_parameters.model_dump()) + return self + + @property + def generation_type(self) -> GenerationType: + if isinstance(self.inference_parameters, CompletionInferenceParameters): + return GenerationType.CHAT_COMPLETION + elif isinstance(self.inference_parameters, EmbeddingInferenceParameters): + return GenerationType.EMBEDDING + else: + raise ValueError(f"Unsupported inference parameters type: {type(self.inference_parameters)}") + class ModelProvider(ConfigBase): name: str diff --git a/src/data_designer/essentials/__init__.py b/src/data_designer/essentials/__init__.py index ee43519c..cd1dd6ba 100644 --- a/src/data_designer/essentials/__init__.py +++ b/src/data_designer/essentials/__init__.py @@ -23,6 +23,8 @@ from ..config.dataset_builders import BuildStage from ..config.datastore import DatastoreSettings from ..config.models import ( + CompletionInferenceParameters, + EmbeddingInferenceParameters, GenerationType, ImageContext, ImageFormat, @@ -80,20 +82,22 @@ "BernoulliMixtureSamplerParams", "BernoulliSamplerParams", "BinomialSamplerParams", + "BuildStage", "CategorySamplerParams", "CodeLang", "CodeValidatorParams", "ColumnInequalityConstraint", + "CompletionInferenceParameters", "configure_logging", "DataDesignerColumnType", "DataDesignerConfig", "DataDesignerConfigBuilder", - "BuildStage", "DatastoreSeedDatasetReference", "DatastoreSettings", "DatetimeSamplerParams", "DropColumnsProcessorConfig", "EmbeddingColumnConfig", + "EmbeddingInferenceParameters", "ExpressionColumnConfig", "GaussianSamplerParams", "GenerationType", diff --git a/tests/cli/conftest.py 
b/tests/cli/conftest.py index 758e837e..66a06347 100644 --- a/tests/cli/conftest.py +++ b/tests/cli/conftest.py @@ -9,16 +9,16 @@ from data_designer.cli.repositories.provider_repository import ModelProviderRegistry, ProviderRepository from data_designer.cli.services.model_service import ModelService from data_designer.cli.services.provider_service import ProviderService -from data_designer.config.models import InferenceParameters, ModelConfig, ModelProvider +from data_designer.config.models import CompletionInferenceParameters, ModelConfig, ModelProvider @pytest.fixture -def stub_inference_parameters() -> InferenceParameters: - return InferenceParameters(temperature=0.7, top_p=0.9, max_tokens=2048, max_parallel_requests=4) +def stub_inference_parameters() -> CompletionInferenceParameters: + return CompletionInferenceParameters(temperature=0.7, top_p=0.9, max_tokens=2048, max_parallel_requests=4) @pytest.fixture -def stub_model_configs(stub_inference_parameters: InferenceParameters) -> list[ModelConfig]: +def stub_model_configs(stub_inference_parameters: CompletionInferenceParameters) -> list[ModelConfig]: return [ ModelConfig( alias="test-alias-1", @@ -41,7 +41,7 @@ def stub_new_model_config() -> ModelConfig: alias="test-alias-3", model="test-model-3", provider="test-provider-1", - inference_parameters=InferenceParameters( + inference_parameters=CompletionInferenceParameters( temperature=0.7, top_p=0.9, max_tokens=2048, diff --git a/tests/cli/controllers/test_model_controller.py b/tests/cli/controllers/test_model_controller.py index b630b04a..4f718ca4 100644 --- a/tests/cli/controllers/test_model_controller.py +++ b/tests/cli/controllers/test_model_controller.py @@ -9,7 +9,7 @@ from data_designer.cli.controllers.model_controller import ModelController from data_designer.cli.repositories.model_repository import ModelConfigRegistry from data_designer.cli.repositories.provider_repository import ModelProviderRegistry, ProviderRepository -from 
data_designer.config.models import InferenceParameters, ModelConfig +from data_designer.config.models import CompletionInferenceParameters, ModelConfig @pytest.fixture @@ -141,7 +141,7 @@ def test_run_updates_model( alias="test-alias-1-updated", model="test-model-1-updated", provider="test-provider-1", - inference_parameters=InferenceParameters(temperature=0.8, top_p=0.95, max_tokens=1024), + inference_parameters=CompletionInferenceParameters(temperature=0.8, top_p=0.95, max_tokens=1024), ) mock_builder = MagicMock() diff --git a/tests/cli/services/test_model_service.py b/tests/cli/services/test_model_service.py index 1d9bf5aa..4287eee8 100644 --- a/tests/cli/services/test_model_service.py +++ b/tests/cli/services/test_model_service.py @@ -7,7 +7,7 @@ from data_designer.cli.repositories.model_repository import ModelRepository from data_designer.cli.services.model_service import ModelService -from data_designer.config.models import InferenceParameters, ModelConfig +from data_designer.config.models import CompletionInferenceParameters, ModelConfig def test_list_all(stub_model_service: ModelService, stub_model_configs: list[ModelConfig]): @@ -30,7 +30,9 @@ def test_add( assert stub_model_service.list_all() == stub_model_configs + [stub_new_model_config] -def test_add_duplicate_alias(stub_model_service: ModelService, stub_inference_parameters: InferenceParameters): +def test_add_duplicate_alias( + stub_model_service: ModelService, stub_inference_parameters: CompletionInferenceParameters +): """Test adding a model with an alias that already exists.""" duplicate_model = ModelConfig( alias="test-alias-1", @@ -61,7 +63,9 @@ def test_update_nonexistent_model(stub_model_service: ModelService, stub_new_mod stub_model_service.update("nonexistent", stub_new_model_config) -def test_update_to_existing_alias(stub_model_service: ModelService, stub_inference_parameters: InferenceParameters): +def test_update_to_existing_alias( + stub_model_service: ModelService, 
stub_inference_parameters: CompletionInferenceParameters +): """Test updating a model to an alias that already exists.""" updated_model = ModelConfig( alias="test-alias-2", # Already exists diff --git a/tests/config/test_config_builder.py b/tests/config/test_config_builder.py index aab8112a..57741e59 100644 --- a/tests/config/test_config_builder.py +++ b/tests/config/test_config_builder.py @@ -26,7 +26,7 @@ from data_designer.config.data_designer_config import DataDesignerConfig from data_designer.config.datastore import DatastoreSettings from data_designer.config.errors import BuilderConfigurationError, InvalidColumnTypeError, InvalidConfigError -from data_designer.config.models import InferenceParameters, ModelConfig +from data_designer.config.models import CompletionInferenceParameters, ModelConfig from data_designer.config.sampler_constraints import ColumnInequalityConstraint, ScalarInequalityConstraint from data_designer.config.sampler_params import SamplerType, UUIDSamplerParams from data_designer.config.seed import DatastoreSeedDatasetReference, SamplingStrategy @@ -670,7 +670,7 @@ def test_add_model_config(stub_empty_builder): new_model_config = ModelConfig( alias="new-model", model="openai/gpt-4", - inference_parameters=InferenceParameters( + inference_parameters=CompletionInferenceParameters( temperature=0.7, top_p=0.95, max_tokens=1024, @@ -691,7 +691,7 @@ def test_add_model_config(stub_empty_builder): alias="provider-model", model="anthropic/claude-3", provider="anthropic", - inference_parameters=InferenceParameters(temperature=0.8), + inference_parameters=CompletionInferenceParameters(temperature=0.8), ) stub_empty_builder.add_model_config(provider_model_config) @@ -717,7 +717,7 @@ def test_add_model_config_duplicate_alias(stub_empty_builder): duplicate_model_config = ModelConfig( alias="stub-model", model="different/model", - inference_parameters=InferenceParameters(temperature=0.5), + 
inference_parameters=CompletionInferenceParameters(temperature=0.5), ) with pytest.raises( @@ -733,12 +733,12 @@ def test_delete_model_config(stub_empty_builder): model_config_1 = ModelConfig( alias="model-to-delete", model="model/delete", - inference_parameters=InferenceParameters(temperature=0.5), + inference_parameters=CompletionInferenceParameters(temperature=0.5), ) model_config_2 = ModelConfig( alias="model-to-keep", model="model/keep", - inference_parameters=InferenceParameters(temperature=0.6), + inference_parameters=CompletionInferenceParameters(temperature=0.6), ) stub_empty_builder.add_model_config(model_config_1) stub_empty_builder.add_model_config(model_config_2) diff --git a/tests/config/test_default_model_settings.py b/tests/config/test_default_model_settings.py index 222bb410..8f389a69 100644 --- a/tests/config/test_default_model_settings.py +++ b/tests/config/test_default_model_settings.py @@ -18,20 +18,20 @@ get_default_providers, resolve_seed_default_model_settings, ) -from data_designer.config.models import InferenceParameters +from data_designer.config.models import CompletionInferenceParameters from data_designer.config.utils.visualization import get_nvidia_api_key, get_openai_api_key def test_get_default_inference_parameters(): - assert get_default_inference_parameters("text") == InferenceParameters( + assert get_default_inference_parameters("text") == CompletionInferenceParameters( temperature=0.85, top_p=0.95, ) - assert get_default_inference_parameters("reasoning") == InferenceParameters( + assert get_default_inference_parameters("reasoning") == CompletionInferenceParameters( temperature=0.35, top_p=0.95, ) - assert get_default_inference_parameters("vision") == InferenceParameters( + assert get_default_inference_parameters("vision") == CompletionInferenceParameters( temperature=0.85, top_p=0.95, ) diff --git a/tests/config/test_models.py b/tests/config/test_models.py index 6a3d7b25..f1f65401 100644 --- a/tests/config/test_models.py +++ 
b/tests/config/test_models.py @@ -11,9 +11,9 @@ from data_designer.config.errors import InvalidConfigError from data_designer.config.models import ( + CompletionInferenceParameters, ImageContext, ImageFormat, - InferenceParameters, ManualDistribution, ManualDistributionParams, ModalityDataType, @@ -46,13 +46,13 @@ def test_image_context_validate_image_format(): def test_inference_parameters_default_construction(): - empty_inference_parameters = InferenceParameters() + empty_inference_parameters = CompletionInferenceParameters() assert empty_inference_parameters.generate_kwargs == {} assert empty_inference_parameters.max_parallel_requests == 4 def test_inference_parameters_generate_kwargs(): - assert InferenceParameters( + assert CompletionInferenceParameters( temperature=0.95, top_p=0.95, max_tokens=100, @@ -67,9 +67,9 @@ def test_inference_parameters_generate_kwargs(): "extra_body": {"reasoning_effort": "high"}, } - assert InferenceParameters().generate_kwargs == {} + assert CompletionInferenceParameters().generate_kwargs == {} - inference_parameters_kwargs = InferenceParameters( + inference_parameters_kwargs = CompletionInferenceParameters( temperature=UniformDistribution(params=UniformDistributionParams(low=0.0, high=1.0)), top_p=ManualDistribution(params=ManualDistributionParams(values=[0.0, 1.0], weights=[0.5, 0.5])), ).generate_kwargs @@ -131,32 +131,38 @@ def test_inference_parameters_temperature_validation(): # All temp values provide in a manual destribution should be valid with pytest.raises(ValidationError, match=expected_error_msg): - InferenceParameters( + CompletionInferenceParameters( temperature=ManualDistribution(params=ManualDistributionParams(values=[0.5, 2.5], weights=[0.5, 0.5])) ) # High and low values of uniform distribution should be valid with pytest.raises(ValidationError, match=expected_error_msg): - InferenceParameters(temperature=UniformDistribution(params=UniformDistributionParams(low=0.5, high=2.5))) + CompletionInferenceParameters( + 
temperature=UniformDistribution(params=UniformDistributionParams(low=0.5, high=2.5)) + ) with pytest.raises(ValidationError, match=expected_error_msg): - InferenceParameters(temperature=UniformDistribution(params=UniformDistributionParams(low=-0.5, high=2.0))) + CompletionInferenceParameters( + temperature=UniformDistribution(params=UniformDistributionParams(low=-0.5, high=2.0)) + ) # Static values should be valid with pytest.raises(ValidationError, match=expected_error_msg): - InferenceParameters(temperature=3.0) + CompletionInferenceParameters(temperature=3.0) with pytest.raises(ValidationError, match=expected_error_msg): - InferenceParameters(temperature=-1.0) + CompletionInferenceParameters(temperature=-1.0) # Valid temperature values shouldn't raise validation errors try: - InferenceParameters(temperature=0.1) - InferenceParameters(temperature=UniformDistribution(params=UniformDistributionParams(low=0.5, high=2.0))) - InferenceParameters( + CompletionInferenceParameters(temperature=0.1) + CompletionInferenceParameters( + temperature=UniformDistribution(params=UniformDistributionParams(low=0.5, high=2.0)) + ) + CompletionInferenceParameters( temperature=ManualDistribution(params=ManualDistributionParams(values=[0.5, 2.0], weights=[0.5, 0.5])) ) except Exception: - pytest.fail("Unexpected exception raised during InferenceParameters temperature validation") + pytest.fail("Unexpected exception raised during CompletionInferenceParameters temperature validation") def test_generation_parameters_top_p_validation(): @@ -164,31 +170,31 @@ def test_generation_parameters_top_p_validation(): # All top_p values provide in a manual destribution should be valid with pytest.raises(ValidationError, match=expected_error_msg): - InferenceParameters( + CompletionInferenceParameters( top_p=ManualDistribution(params=ManualDistributionParams(values=[0.5, 1.5], weights=[0.5, 0.5])) ) # High and low values of uniform distribution should be valid with pytest.raises(ValidationError, 
match=expected_error_msg): - InferenceParameters(top_p=UniformDistribution(params=UniformDistributionParams(low=0.5, high=1.5))) + CompletionInferenceParameters(top_p=UniformDistribution(params=UniformDistributionParams(low=0.5, high=1.5))) with pytest.raises(ValidationError, match=expected_error_msg): - InferenceParameters(top_p=UniformDistribution(params=UniformDistributionParams(low=-0.5, high=1.0))) + CompletionInferenceParameters(top_p=UniformDistribution(params=UniformDistributionParams(low=-0.5, high=1.0))) # Static values should be valid with pytest.raises(ValidationError, match=expected_error_msg): - InferenceParameters(top_p=1.5) + CompletionInferenceParameters(top_p=1.5) with pytest.raises(ValidationError, match=expected_error_msg): - InferenceParameters(top_p=-0.1) + CompletionInferenceParameters(top_p=-0.1) # Valid top_p values shouldn't raise validation errors try: - InferenceParameters(top_p=0.1) - InferenceParameters(top_p=UniformDistribution(params=UniformDistributionParams(low=0.5, high=1.0))) - InferenceParameters( + CompletionInferenceParameters(top_p=0.1) + CompletionInferenceParameters(top_p=UniformDistribution(params=UniformDistributionParams(low=0.5, high=1.0))) + CompletionInferenceParameters( top_p=ManualDistribution(params=ManualDistributionParams(values=[0.5, 1.0], weights=[0.5, 0.5])) ) except Exception: - pytest.fail("Unexpected exception raised during InferenceParameters top_p validation") + pytest.fail("Unexpected exception raised during CompletionInferenceParameters top_p validation") def test_generation_parameters_max_tokens_validation(): @@ -196,15 +202,15 @@ def test_generation_parameters_max_tokens_validation(): ValidationError, match="Input should be greater than or equal to 1", ): - InferenceParameters(max_tokens=0) + CompletionInferenceParameters(max_tokens=0) # Valid max_tokens values shouldn't raise validation errors try: - InferenceParameters(max_tokens=128_000) - InferenceParameters(max_tokens=4096) - 
InferenceParameters(max_tokens=1) + CompletionInferenceParameters(max_tokens=128_000) + CompletionInferenceParameters(max_tokens=4096) + CompletionInferenceParameters(max_tokens=1) except Exception: - pytest.fail("Unexpected exception raised during InferenceParameters max_tokens validation") + pytest.fail("Unexpected exception raised during CompletionInferenceParameters max_tokens validation") def test_load_model_configs(): @@ -250,4 +256,4 @@ def test_load_model_configs(): def test_model_config_default_construction(): model_config = ModelConfig(alias="test", model="test") - assert model_config.inference_parameters == InferenceParameters() + assert model_config.inference_parameters == CompletionInferenceParameters() diff --git a/tests/conftest.py b/tests/conftest.py index 31dc0057..46b5d318 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,7 +17,7 @@ from data_designer.config.config_builder import DataDesignerConfigBuilder from data_designer.config.data_designer_config import DataDesignerConfig from data_designer.config.datastore import DatastoreSettings -from data_designer.config.models import InferenceParameters, ModelConfig, ModelProvider +from data_designer.config.models import CompletionInferenceParameters, ModelConfig, ModelProvider @pytest.fixture @@ -135,7 +135,7 @@ def stub_model_configs() -> list[ModelConfig]: ModelConfig( alias="stub-model", model="stub-model", - inference_parameters=InferenceParameters( + inference_parameters=CompletionInferenceParameters( temperature=0.9, top_p=0.9, max_tokens=2048, diff --git a/tests/engine/models/conftest.py b/tests/engine/models/conftest.py index 95e6941f..7edcd073 100644 --- a/tests/engine/models/conftest.py +++ b/tests/engine/models/conftest.py @@ -5,7 +5,7 @@ import pytest -from data_designer.config.models import InferenceParameters, ModelConfig +from data_designer.config.models import CompletionInferenceParameters, ModelConfig from data_designer.engine.model_provider import ModelProvider, 
ModelProviderRegistry from data_designer.engine.models.registry import ModelRegistry, create_model_registry from data_designer.engine.secret_resolver import SecretsFileResolver @@ -38,7 +38,7 @@ def stub_model_configs() -> list[ModelConfig]: alias="stub-text", model="stub-model-text", provider="stub-model-provider", - inference_parameters=InferenceParameters( + inference_parameters=CompletionInferenceParameters( temperature=0.80, top_p=0.95, max_tokens=100, max_parallel_requests=10, timeout=100 ), ), @@ -46,7 +46,7 @@ def stub_model_configs() -> list[ModelConfig]: alias="stub-reasoning", model="stub-model-reasoning", provider="stub-model-provider", - inference_parameters=InferenceParameters( + inference_parameters=CompletionInferenceParameters( temperature=0.80, top_p=0.95, max_tokens=100, max_parallel_requests=10, timeout=100 ), ), diff --git a/tests/engine/models/test_model_registry.py b/tests/engine/models/test_model_registry.py index 571b9605..83e3b650 100644 --- a/tests/engine/models/test_model_registry.py +++ b/tests/engine/models/test_model_registry.py @@ -6,7 +6,7 @@ from litellm import AuthenticationError import pytest -from data_designer.config.models import InferenceParameters, ModelConfig +from data_designer.config.models import CompletionInferenceParameters, ModelConfig from data_designer.engine.models.errors import ModelAuthenticationError from data_designer.engine.models.facade import ModelFacade from data_designer.engine.models.registry import ModelRegistry, create_model_registry @@ -24,7 +24,7 @@ def stub_new_model_config(): alias="stub-vision", model="stub-model-vision", provider="stub-model-provider", - inference_parameters=InferenceParameters( + inference_parameters=CompletionInferenceParameters( temperature=0.80, top_p=0.95, max_tokens=100, max_parallel_requests=10, timeout=100 ), ) @@ -36,7 +36,7 @@ def stub_no_usage_config(): alias="no-usage", model="no-usage-model", provider="stub-model-provider", - 
inference_parameters=InferenceParameters(), + inference_parameters=CompletionInferenceParameters(), ) diff --git a/tests/essentials/test_init.py b/tests/essentials/test_init.py index 89f8388a..d810bba3 100644 --- a/tests/essentials/test_init.py +++ b/tests/essentials/test_init.py @@ -17,14 +17,17 @@ CodeLang, CodeValidatorParams, ColumnInequalityConstraint, + CompletionInferenceParameters, DataDesignerColumnType, DataDesignerConfig, DataDesignerConfigBuilder, DatastoreSeedDatasetReference, DatastoreSettings, DatetimeSamplerParams, + EmbeddingInferenceParameters, ExpressionColumnConfig, GaussianSamplerParams, + GenerationType, ImageContext, ImageFormat, InferenceParameters, @@ -109,6 +112,9 @@ def test_model_config_imports(): assert ImageContext is not None assert ImageFormat is not None assert InferenceParameters is not None + assert CompletionInferenceParameters is not None + assert EmbeddingInferenceParameters is not None + assert GenerationType is not None assert ManualDistribution is not None assert ManualDistributionParams is not None assert Modality is not None @@ -232,6 +238,7 @@ def test_all_contains_column_configs(): assert "Score" in __all__ assert "SeedDatasetColumnConfig" in __all__ assert "ValidationColumnConfig" in __all__ + assert "EmbeddingColumnConfig" in __all__ def test_all_contains_sampler_params(): @@ -250,6 +257,8 @@ def test_all_contains_sampler_params(): assert "TimeDeltaSamplerParams" in __all__ assert "UniformSamplerParams" in __all__ assert "UUIDSamplerParams" in __all__ + assert "PersonFromFakerSamplerParams" in __all__ + assert "ProcessorType" in __all__ def test_all_contains_constraints(): @@ -263,6 +272,9 @@ def test_all_contains_model_configs(): assert "ImageContext" in __all__ assert "ImageFormat" in __all__ assert "InferenceParameters" in __all__ + assert "CompletionInferenceParameters" in __all__ + assert "EmbeddingInferenceParameters" in __all__ + assert "GenerationType" in __all__ assert "ManualDistribution" in __all__ assert 
"ManualDistributionParams" in __all__ assert "Modality" in __all__ From 2c1b2676fe0234016a7e13fe57171da2295eaf7c Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 26 Nov 2025 13:04:18 -0700 Subject: [PATCH 13/69] Remove purpose from consolidated kwargs --- src/data_designer/config/models.py | 2 +- .../engine/column_generators/generators/embedding.py | 1 + src/data_designer/engine/models/facade.py | 2 ++ tests/engine/models/test_facade.py | 8 ++++---- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/data_designer/config/models.py b/src/data_designer/config/models.py index 1df7055e..7b129556 100644 --- a/src/data_designer/config/models.py +++ b/src/data_designer/config/models.py @@ -229,7 +229,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: class EmbeddingInferenceParameters(BaseInferenceParameters): - encoding_format: Optional[Literal["float", "base64"]] = "float" + encoding_format: Optional[Literal["float", "base64"]] = None dimensions: Optional[int] = None @property diff --git a/src/data_designer/engine/column_generators/generators/embedding.py b/src/data_designer/engine/column_generators/generators/embedding.py index d9981ccd..48fc309f 100644 --- a/src/data_designer/engine/column_generators/generators/embedding.py +++ b/src/data_designer/engine/column_generators/generators/embedding.py @@ -28,6 +28,7 @@ def generate(self, data: dict) -> dict: deserialized_record = deserialize_json_values(data) input_text = deserialized_record[self.config.target_column] input_chunks = re.split(self.config.chunk_pattern, input_text) if self.config.chunk_pattern else [input_text] + input_chunks = [chunk.strip() for chunk in input_chunks if chunk.strip()] embeddings = self.model.generate_text_embeddings(input_texts=input_chunks) data[self.config.name] = { "embeddings": embeddings, diff --git a/src/data_designer/engine/models/facade.py b/src/data_designer/engine/models/facade.py index c205a4ca..6b98c0a7 100644 --- 
a/src/data_designer/engine/models/facade.py +++ b/src/data_designer/engine/models/facade.py @@ -91,6 +91,8 @@ def completion(self, messages: list[dict[str, str]], skip_usage_tracking: bool = self._track_usage(response) def consolidate_kwargs(self, **kwargs) -> dict[str, Any]: + # Remove purpose from kwargs to avoid passing it to the model + kwargs.pop("purpose", None) kwargs = {**self._model_config.inference_parameters.generate_kwargs, **kwargs} if self.model_provider.extra_body: kwargs["extra_body"] = {**kwargs.get("extra_body", {}), **self.model_provider.extra_body} diff --git a/tests/engine/models/test_facade.py b/tests/engine/models/test_facade.py index afe27730..8765d0ab 100644 --- a/tests/engine/models/test_facade.py +++ b/tests/engine/models/test_facade.py @@ -116,17 +116,17 @@ def test_usage_stats_property(stub_model_facade): def test_consolidate_kwargs(stub_model_configs, stub_model_facade): - # Model config generate kwargs are used as base - result = stub_model_facade.consolidate_kwargs() + # Model config generate kwargs are used as base, and purpose is removed + result = stub_model_facade.consolidate_kwargs(purpose="test") assert result == stub_model_configs[0].inference_parameters.generate_kwargs # kwargs overrides model config generate kwargs - result = stub_model_facade.consolidate_kwargs(temperature=0.01) + result = stub_model_facade.consolidate_kwargs(temperature=0.01, purpose="test") assert result == {**stub_model_configs[0].inference_parameters.generate_kwargs, "temperature": 0.01} # Provider extra_body overrides all other kwargs stub_model_facade.model_provider.extra_body = {"foo_provider": "bar_provider"} - result = stub_model_facade.consolidate_kwargs(extra_body={"foo": "bar"}) + result = stub_model_facade.consolidate_kwargs(extra_body={"foo": "bar"}, purpose="test") assert result == { **stub_model_configs[0].inference_parameters.generate_kwargs, "extra_body": {"foo_provider": "bar_provider", "foo": "bar"}, From 
4b1492baf805adc0719d73857b1f19a219f49375 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 2 Dec 2025 11:38:33 -0700 Subject: [PATCH 14/69] WithModelConfiguration.inference_parameters should should be typed with BaseInferenceParameters --- src/data_designer/engine/column_generators/generators/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data_designer/engine/column_generators/generators/base.py b/src/data_designer/engine/column_generators/generators/base.py index 8977a63b..580c96a6 100644 --- a/src/data_designer/engine/column_generators/generators/base.py +++ b/src/data_designer/engine/column_generators/generators/base.py @@ -9,7 +9,7 @@ import pandas as pd from data_designer.config.column_types import COLUMN_TYPE_EMOJI_MAP -from data_designer.config.models import InferenceParameters, ModelConfig +from data_designer.config.models import BaseInferenceParameters, ModelConfig from data_designer.config.utils.type_helpers import StrEnum from data_designer.engine.column_generators.utils.prompt_renderer import ( RecordBasedPromptRenderer, @@ -81,7 +81,7 @@ def model_config(self) -> ModelConfig: return self.resource_provider.model_registry.get_model_config(model_alias=self.config.model_alias) @functools.cached_property - def inference_parameters(self) -> InferenceParameters: + def inference_parameters(self) -> BaseInferenceParameters: return self.model_config.inference_parameters @functools.cached_property From c445caf53f213a54b80b3df71a0c00334ccf519b Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 2 Dec 2025 14:37:07 -0700 Subject: [PATCH 15/69] Type as WithModelGeneration --- .../engine/dataset_builders/column_wise_builder.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/data_designer/engine/dataset_builders/column_wise_builder.py b/src/data_designer/engine/dataset_builders/column_wise_builder.py index ff9289ee..2e30407c 100644 --- 
a/src/data_designer/engine/dataset_builders/column_wise_builder.py +++ b/src/data_designer/engine/dataset_builders/column_wise_builder.py @@ -17,8 +17,11 @@ ProcessorConfig, ProcessorType, ) -from data_designer.engine.column_generators.generators.base import ColumnGenerator, GenerationStrategy -from data_designer.engine.column_generators.generators.llm_completion import WithCompletionGeneration +from data_designer.engine.column_generators.generators.base import ( + ColumnGenerator, + GenerationStrategy, + WithModelGeneration, +) from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage from data_designer.engine.dataset_builders.errors import DatasetGenerationError, DatasetProcessingError from data_designer.engine.dataset_builders.multi_column_configs import ( @@ -169,7 +172,7 @@ def _run_from_scratch_column_generator(self, generator: ColumnGenerator) -> None def _run_cell_by_cell_generator(self, generator: ColumnGenerator) -> None: max_workers = MAX_CONCURRENCY_PER_NON_LLM_GENERATOR - if isinstance(generator, WithCompletionGeneration): + if isinstance(generator, WithModelGeneration): max_workers = generator.inference_parameters.max_parallel_requests self._fan_out_with_threads(generator, max_workers=max_workers) @@ -183,7 +186,7 @@ def _run_model_health_check_if_needed(self) -> bool: set(config.model_alias for config in self.llm_generated_column_configs) ) - def _fan_out_with_threads(self, generator: WithCompletionGeneration, max_workers: int) -> None: + def _fan_out_with_threads(self, generator: WithModelGeneration, max_workers: int) -> None: if generator.generation_strategy != GenerationStrategy.CELL_BY_CELL: raise DatasetGenerationError( f"Generator {generator.metadata().name} is not a {GenerationStrategy.CELL_BY_CELL} " From 4b8aa2bf9258c1a1fc3be10ff1d817ae797ed2d7 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 2 Dec 2025 16:06:52 -0700 Subject: [PATCH 16/69] Add image generation modality --- 
src/data_designer/config/column_configs.py | 40 +++++++++++++++ src/data_designer/config/column_types.py | 8 +++ src/data_designer/config/models.py | 20 +++++++- .../column_generators/generators/base.py | 14 ------ .../column_generators/generators/image.py | 49 +++++++++++++++++++ .../generators/llm_completion.py | 12 +++++ .../engine/column_generators/registry.py | 4 +- src/data_designer/engine/models/facade.py | 36 +++++++++++++- src/data_designer/engine/models/registry.py | 6 +++ src/data_designer/essentials/__init__.py | 4 ++ 10 files changed, 176 insertions(+), 17 deletions(-) create mode 100644 src/data_designer/engine/column_generators/generators/image.py diff --git a/src/data_designer/config/column_configs.py b/src/data_designer/config/column_configs.py index 339be35d..eb93f9f0 100644 --- a/src/data_designer/config/column_configs.py +++ b/src/data_designer/config/column_configs.py @@ -401,3 +401,43 @@ class EmbeddingColumnConfig(SingleColumnConfig): @property def required_columns(self) -> list[str]: return [self.target_column] + + +class ImageGenerationColumnConfig(SingleColumnConfig): + """Configuration for image generation columns. + + Image columns generate images using a specified model. + + Attributes: + column_type: Discriminator field, always "image-generation" for this configuration type. + prompt: Prompt template for image generation. Supports Jinja2 templating to + reference other columns (e.g., "Generate an image of a {{ character_name }}"). + Must be a valid Jinja2 template. + model_alias: The model to use for image generation. + """ + + column_type: Literal["image-generation"] = "image-generation" + prompt: str + model_alias: str + + @property + def required_columns(self) -> list[str]: + """Get columns referenced in the prompt template. + + Returns: + List of unique column names referenced in Jinja2 templates. 
+ """ + return list(get_prompt_template_keywords(self.prompt)) + + @model_validator(mode="after") + def assert_prompt_valid_jinja(self) -> Self: + """Validate that prompt is a valid Jinja2 template. + + Returns: + The validated instance. + + Raises: + InvalidConfigError: If prompt contains invalid Jinja2 syntax. + """ + assert_valid_jinja2_template(self.prompt) + return self diff --git a/src/data_designer/config/column_types.py b/src/data_designer/config/column_types.py index aab55c4d..efdeb094 100644 --- a/src/data_designer/config/column_types.py +++ b/src/data_designer/config/column_types.py @@ -9,6 +9,7 @@ from .column_configs import ( EmbeddingColumnConfig, ExpressionColumnConfig, + ImageGenerationColumnConfig, LLMCodeColumnConfig, LLMJudgeColumnConfig, LLMStructuredColumnConfig, @@ -33,6 +34,7 @@ SeedDatasetColumnConfig, ValidationColumnConfig, EmbeddingColumnConfig, + ImageGenerationColumnConfig, ] ColumnConfigT = plugin_manager.inject_into_column_config_type_union(ColumnConfigT) @@ -53,6 +55,7 @@ DataDesignerColumnType.SAMPLER: "🎲", DataDesignerColumnType.VALIDATION: "πŸ”", DataDesignerColumnType.EMBEDDING: "🧬", + DataDesignerColumnType.IMAGE_GENERATION: "πŸ–ΌοΈ", } COLUMN_TYPE_EMOJI_MAP.update( {DataDesignerColumnType(p.name): p.emoji for p in plugin_manager.get_column_generator_plugins()} @@ -70,6 +73,7 @@ def column_type_used_in_execution_dag(column_type: Union[str, DataDesignerColumn DataDesignerColumnType.LLM_TEXT, DataDesignerColumnType.VALIDATION, DataDesignerColumnType.EMBEDDING, + DataDesignerColumnType.IMAGE_GENERATION, } dag_column_types.update(plugin_manager.get_plugin_column_types(DataDesignerColumnType)) return column_type in dag_column_types @@ -84,6 +88,7 @@ def column_type_is_llm_generated(column_type: Union[str, DataDesignerColumnType] DataDesignerColumnType.LLM_STRUCTURED, DataDesignerColumnType.LLM_JUDGE, DataDesignerColumnType.EMBEDDING, + DataDesignerColumnType.IMAGE_GENERATION, } llm_generated_column_types.update( 
plugin_manager.get_plugin_column_types( @@ -124,6 +129,8 @@ def get_column_config_from_kwargs(name: str, column_type: DataDesignerColumnType return SeedDatasetColumnConfig(name=name, **kwargs) if column_type == DataDesignerColumnType.EMBEDDING: return EmbeddingColumnConfig(name=name, **kwargs) + if column_type == DataDesignerColumnType.IMAGE_GENERATION: + return ImageGenerationColumnConfig(name=name, **kwargs) if plugin := plugin_manager.get_column_generator_plugin_if_exists(column_type.value): return plugin.config_cls(name=name, **kwargs) raise InvalidColumnTypeError(f"πŸ›‘ {column_type} is not a valid column type.") # pragma: no cover @@ -139,6 +146,7 @@ def get_column_display_order() -> list[DataDesignerColumnType]: DataDesignerColumnType.LLM_STRUCTURED, DataDesignerColumnType.LLM_JUDGE, DataDesignerColumnType.EMBEDDING, + DataDesignerColumnType.IMAGE_GENERATION, DataDesignerColumnType.VALIDATION, DataDesignerColumnType.EXPRESSION, ] diff --git a/src/data_designer/config/models.py b/src/data_designer/config/models.py index 7b129556..6e535038 100644 --- a/src/data_designer/config/models.py +++ b/src/data_designer/config/models.py @@ -242,8 +242,24 @@ def generate_kwargs(self) -> dict[str, Union[float, int]]: return result +class ImageGenerationInferenceParameters(BaseInferenceParameters): + quality: str + size: str + output_format: Optional[ModalityDataType] = ModalityDataType.BASE64 + + @property + def generate_kwargs(self) -> dict[str, Union[float, int]]: + result = super().generate_kwargs + result["size"] = self.size + result["quality"] = self.quality + result["response_format"] = ( + self.output_format.value if self.output_format == ModalityDataType.URL else "b64_json" + ) + return result + + InferenceParametersT: TypeAlias = Union[ - InferenceParameters, CompletionInferenceParameters, EmbeddingInferenceParameters + InferenceParameters, CompletionInferenceParameters, EmbeddingInferenceParameters, ImageGenerationInferenceParameters ] @@ -272,6 +288,8 @@ def 
generation_type(self) -> GenerationType: return GenerationType.CHAT_COMPLETION elif isinstance(self.inference_parameters, EmbeddingInferenceParameters): return GenerationType.EMBEDDING + elif isinstance(self.inference_parameters, ImageGenerationInferenceParameters): + return GenerationType.IMAGE_GENERATION else: raise ValueError(f"Unsupported inference parameters type: {type(self.inference_parameters)}") diff --git a/src/data_designer/engine/column_generators/generators/base.py b/src/data_designer/engine/column_generators/generators/base.py index 580c96a6..a98038b3 100644 --- a/src/data_designer/engine/column_generators/generators/base.py +++ b/src/data_designer/engine/column_generators/generators/base.py @@ -11,9 +11,6 @@ from data_designer.config.column_types import COLUMN_TYPE_EMOJI_MAP from data_designer.config.models import BaseInferenceParameters, ModelConfig from data_designer.config.utils.type_helpers import StrEnum -from data_designer.engine.column_generators.utils.prompt_renderer import ( - RecordBasedPromptRenderer, -) from data_designer.engine.configurable_task import ConfigurableTask, ConfigurableTaskMetadata, DataT, TaskConfigT from data_designer.engine.models.facade import ModelFacade @@ -84,17 +81,6 @@ def model_config(self) -> ModelConfig: def inference_parameters(self) -> BaseInferenceParameters: return self.model_config.inference_parameters - @functools.cached_property - def prompt_renderer(self) -> RecordBasedPromptRenderer: - return RecordBasedPromptRenderer( - response_recipe=self.response_recipe, - error_message_context={ - "column_name": self.config.name, - "column_type": self.config.column_type, - "model_alias": self.config.model_alias, - }, - ) - def log_pre_generation(self) -> None: emoji = COLUMN_TYPE_EMOJI_MAP[self.config.column_type] logger.info(f"{emoji} Preparing {self.config.column_type} column generation") diff --git a/src/data_designer/engine/column_generators/generators/image.py 
b/src/data_designer/engine/column_generators/generators/image.py new file mode 100644 index 00000000..f7cfba89 --- /dev/null +++ b/src/data_designer/engine/column_generators/generators/image.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + + +from litellm.types.utils import ImageResponse + +from data_designer.config.column_configs import ImageGenerationColumnConfig +from data_designer.config.models import ModalityDataType +from data_designer.engine.column_generators.generators.base import ( + ColumnGenerator, + GenerationStrategy, + GeneratorMetadata, + WithModelGeneration, +) +from data_designer.engine.processing.ginja.environment import WithJinja2UserTemplateRendering +from data_designer.engine.processing.utils import deserialize_json_values +from data_designer.engine.resources.resource_provider import ResourceType + + +class ImageCellGenerator( + WithModelGeneration, WithJinja2UserTemplateRendering, ColumnGenerator[ImageGenerationColumnConfig] +): + @staticmethod + def metadata() -> GeneratorMetadata: + return GeneratorMetadata( + name="image_cell_generator", + description="Generate images using a specified model.", + generation_strategy=GenerationStrategy.CELL_BY_CELL, + required_resources=[ResourceType.MODEL_REGISTRY], + ) + + def generate(self, data: dict) -> dict: + deserialized_record = deserialize_json_values(data) + missing_columns = list(set(self.config.required_columns) - set(data.keys())) + if len(missing_columns) > 0: + error_msg = ( + f"There was an error preparing the Jinja2 expression template. " + f"The following columns {missing_columns} are missing!" 
+ ) + raise ValueError(error_msg) + + self.prepare_jinja2_template_renderer(self.config.prompt, list(deserialized_record.keys())) + prompt = self.render_template(deserialized_record) + image_response: ImageResponse = self.model.generate_image(prompt=prompt) + if self.model_config.inference_parameters.output_format == ModalityDataType.URL: + data[self.config.name] = image_response.data[0].url + else: + data[self.config.name] = image_response.data[0].b64_json + return data diff --git a/src/data_designer/engine/column_generators/generators/llm_completion.py b/src/data_designer/engine/column_generators/generators/llm_completion.py index 5665ba85..8fae174b 100644 --- a/src/data_designer/engine/column_generators/generators/llm_completion.py +++ b/src/data_designer/engine/column_generators/generators/llm_completion.py @@ -19,6 +19,7 @@ ) from data_designer.engine.column_generators.utils.prompt_renderer import ( PromptType, + RecordBasedPromptRenderer, create_response_recipe, ) from data_designer.engine.models.recipes.base import ResponseRecipe @@ -45,6 +46,17 @@ def max_conversation_correction_steps(self) -> int: def max_conversation_restarts(self) -> int: return DEFAULT_MAX_CONVERSATION_RESTARTS + @functools.cached_property + def prompt_renderer(self) -> RecordBasedPromptRenderer: + return RecordBasedPromptRenderer( + response_recipe=self.response_recipe, + error_message_context={ + "column_name": self.config.name, + "column_type": self.config.column_type, + "model_alias": self.config.model_alias, + }, + ) + def generate(self, data: dict) -> dict: deserialized_record = deserialize_json_values(data) diff --git a/src/data_designer/engine/column_generators/registry.py b/src/data_designer/engine/column_generators/registry.py index 7171e561..3d000729 100644 --- a/src/data_designer/engine/column_generators/registry.py +++ b/src/data_designer/engine/column_generators/registry.py @@ -5,6 +5,7 @@ from data_designer.config.column_configs import ( EmbeddingColumnConfig, 
ExpressionColumnConfig, + ImageGenerationColumnConfig, LLMCodeColumnConfig, LLMJudgeColumnConfig, LLMStructuredColumnConfig, @@ -15,6 +16,7 @@ from data_designer.engine.column_generators.generators.base import ColumnGenerator from data_designer.engine.column_generators.generators.embedding import EmbeddingCellGenerator from data_designer.engine.column_generators.generators.expression import ExpressionColumnGenerator +from data_designer.engine.column_generators.generators.image import ImageCellGenerator from data_designer.engine.column_generators.generators.llm_completion import ( LLMCodeCellGenerator, LLMJudgeCellGenerator, @@ -47,7 +49,7 @@ def create_default_column_generator_registry(with_plugins: bool = True) -> Colum registry.register(DataDesignerColumnType.SEED_DATASET, SeedDatasetColumnGenerator, SeedDatasetMultiColumnConfig) registry.register(DataDesignerColumnType.VALIDATION, ValidationColumnGenerator, ValidationColumnConfig) registry.register(DataDesignerColumnType.LLM_STRUCTURED, LLMStructuredCellGenerator, LLMStructuredColumnConfig) - + registry.register(DataDesignerColumnType.IMAGE_GENERATION, ImageCellGenerator, ImageGenerationColumnConfig) if with_plugins: for plugin in PluginRegistry().get_plugins(PluginType.COLUMN_GENERATOR): registry.register( diff --git a/src/data_designer/engine/models/facade.py b/src/data_designer/engine/models/facade.py index 6b98c0a7..33c79797 100644 --- a/src/data_designer/engine/models/facade.py +++ b/src/data_designer/engine/models/facade.py @@ -9,7 +9,7 @@ from typing import Any from litellm.types.router import DeploymentTypedDict, LiteLLM_Params -from litellm.types.utils import EmbeddingResponse, ModelResponse +from litellm.types.utils import EmbeddingResponse, ImageResponse, ImageUsage, ModelResponse from data_designer.config.models import GenerationType, ModelConfig, ModelProvider from data_designer.engine.model_provider import ModelProviderRegistry @@ -131,6 +131,27 @@ def generate_text_embeddings( if not 
skip_usage_tracking and response is not None: self._track_usage_from_embedding(response) + @catch_llm_exceptions + def generate_image(self, prompt: str, skip_usage_tracking: bool = False, **kwargs) -> ImageResponse: + logger.debug( + f"Generating image with model {self.model_name!r}...", + extra={"model": self.model_name, "prompt": prompt}, + ) + kwargs = self.consolidate_kwargs(**kwargs) + response = None + try: + response = self._router.image_generation(prompt=prompt, model=self.model_name, **kwargs) + logger.debug( + f"Received image from model {self.model_name!r}", + extra={"model": self.model_name, "response": response}, + ) + return response + except Exception as e: + raise e + finally: + if not skip_usage_tracking and response is not None: + self._track_usage_from_image(response) + @catch_llm_exceptions def generate( self, @@ -280,3 +301,16 @@ def _track_usage_from_embedding(self, response: EmbeddingResponse | None) -> Non ), request_usage=RequestUsageStats(successful_requests=1, failed_requests=0), ) + + def _track_usage_from_image(self, response: ImageResponse | None) -> None: + if response is None: + self._usage_stats.extend(request_usage=RequestUsageStats(successful_requests=0, failed_requests=1)) + return + if response.usage is not None and isinstance(response.usage, ImageUsage): + self._usage_stats.extend( + token_usage=TokenUsageStats( + prompt_tokens=response.usage.input_tokens, + completion_tokens=response.usage.output_tokens, + ), + request_usage=RequestUsageStats(successful_requests=1, failed_requests=0), + ) diff --git a/src/data_designer/engine/models/registry.py b/src/data_designer/engine/models/registry.py index 4330ea18..91025684 100644 --- a/src/data_designer/engine/models/registry.py +++ b/src/data_designer/engine/models/registry.py @@ -97,6 +97,12 @@ def run_health_check(self, model_aliases: set[str]) -> None: skip_usage_tracking=True, purpose="running health checks", ) + elif model.model_generation_type == GenerationType.IMAGE_GENERATION: 
+ model.generate_image( + prompt="Generate a simple pixel", + skip_usage_tracking=True, + purpose="running health checks", + ) else: raise ValueError(f"Unsupported generation type: {model.model_generation_type}") logger.info(" |-- βœ… Passed!") diff --git a/src/data_designer/essentials/__init__.py b/src/data_designer/essentials/__init__.py index cd1dd6ba..e8c6091c 100644 --- a/src/data_designer/essentials/__init__.py +++ b/src/data_designer/essentials/__init__.py @@ -8,6 +8,7 @@ from ..config.column_configs import ( EmbeddingColumnConfig, ExpressionColumnConfig, + ImageGenerationColumnConfig, LLMCodeColumnConfig, LLMJudgeColumnConfig, LLMStructuredColumnConfig, @@ -28,6 +29,7 @@ GenerationType, ImageContext, ImageFormat, + ImageGenerationInferenceParameters, InferenceParameters, ManualDistribution, ManualDistributionParams, @@ -105,6 +107,8 @@ "InfoType", "ImageContext", "ImageFormat", + "ImageGenerationColumnConfig", + "ImageGenerationInferenceParameters", "InferenceParameters", "JudgeScoreProfilerConfig", "LLMCodeColumnConfig", From 2c5933f789b8e1dc47d40be56f3ff76741850d10 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 2 Dec 2025 17:49:32 -0700 Subject: [PATCH 17/69] update return type for generate_kwargs --- src/data_designer/config/models.py | 10 ++++------ tests/config/test_columns.py | 3 +++ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/data_designer/config/models.py b/src/data_designer/config/models.py index 6e535038..9d0ee6a6 100644 --- a/src/data_designer/config/models.py +++ b/src/data_designer/config/models.py @@ -142,7 +142,7 @@ class BaseInferenceParameters(ConfigBase, ABC): extra_body: Optional[dict[str, Any]] = None @property - def generate_kwargs(self) -> dict[str, Union[float, int]]: + def generate_kwargs(self) -> dict[str, Any]: result = {} if self.timeout is not None: result["timeout"] = self.timeout @@ -157,7 +157,7 @@ class CompletionInferenceParameters(BaseInferenceParameters): max_tokens: Optional[int] = 
Field(default=None, ge=1) @property - def generate_kwargs(self) -> dict[str, Union[float, int]]: + def generate_kwargs(self) -> dict[str, Any]: result = super().generate_kwargs if self.temperature is not None: result["temperature"] = ( @@ -248,13 +248,11 @@ class ImageGenerationInferenceParameters(BaseInferenceParameters): output_format: Optional[ModalityDataType] = ModalityDataType.BASE64 @property - def generate_kwargs(self) -> dict[str, Union[float, int]]: + def generate_kwargs(self) -> dict[str, Any]: result = super().generate_kwargs result["size"] = self.size result["quality"] = self.quality - result["response_format"] = ( - self.output_format.value if self.output_format == ModalityDataType.URL else "b64_json" - ) + result["response_format"] = "b64_json" if self.output_format == ModalityDataType.BASE64 else self.output_format return result diff --git a/tests/config/test_columns.py b/tests/config/test_columns.py index f7763b07..2e74695f 100644 --- a/tests/config/test_columns.py +++ b/tests/config/test_columns.py @@ -50,6 +50,7 @@ def test_data_designer_column_type_get_display_order(): DataDesignerColumnType.LLM_STRUCTURED, DataDesignerColumnType.LLM_JUDGE, DataDesignerColumnType.EMBEDDING, + DataDesignerColumnType.IMAGE_GENERATION, DataDesignerColumnType.VALIDATION, DataDesignerColumnType.EXPRESSION, ] @@ -61,6 +62,7 @@ def test_data_designer_column_type_is_llm_generated(): assert column_type_is_llm_generated(DataDesignerColumnType.LLM_STRUCTURED) assert column_type_is_llm_generated(DataDesignerColumnType.LLM_JUDGE) assert column_type_is_llm_generated(DataDesignerColumnType.EMBEDDING) + assert column_type_is_llm_generated(DataDesignerColumnType.IMAGE_GENERATION) assert not column_type_is_llm_generated(DataDesignerColumnType.SAMPLER) assert not column_type_is_llm_generated(DataDesignerColumnType.VALIDATION) assert not column_type_is_llm_generated(DataDesignerColumnType.EXPRESSION) @@ -75,6 +77,7 @@ def test_data_designer_column_type_is_in_dag(): assert 
column_type_used_in_execution_dag(DataDesignerColumnType.LLM_TEXT) assert column_type_used_in_execution_dag(DataDesignerColumnType.VALIDATION) assert column_type_used_in_execution_dag(DataDesignerColumnType.EMBEDDING) + assert column_type_used_in_execution_dag(DataDesignerColumnType.IMAGE_GENERATION) assert not column_type_used_in_execution_dag(DataDesignerColumnType.SAMPLER) assert not column_type_used_in_execution_dag(DataDesignerColumnType.SEED_DATASET) From c6c29d4fdca3a292d06abdbfaee11c2f66269cfb Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 3 Dec 2025 10:25:17 -0700 Subject: [PATCH 18/69] make generation_type a field of ModelConfig as opposed to a prop resolved based on the type of InferenceParameters --- src/data_designer/config/models.py | 25 +++++++++------- tests/config/test_models.py | 47 +++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 11 deletions(-) diff --git a/src/data_designer/config/models.py b/src/data_designer/config/models.py index 9d0ee6a6..b10deca0 100644 --- a/src/data_designer/config/models.py +++ b/src/data_designer/config/models.py @@ -271,6 +271,7 @@ class ModelConfig(ConfigBase): alias: str model: str inference_parameters: InferenceParametersT = Field(default_factory=CompletionInferenceParameters) + generation_type: GenerationType = Field(default=GenerationType.CHAT_COMPLETION) provider: Optional[str] = None @model_validator(mode="after") @@ -280,16 +281,20 @@ def _normalize_deprecated_inference_parameters(self) -> Self: self.inference_parameters = CompletionInferenceParameters(**self.inference_parameters.model_dump()) return self - @property - def generation_type(self) -> GenerationType: - if isinstance(self.inference_parameters, CompletionInferenceParameters): - return GenerationType.CHAT_COMPLETION - elif isinstance(self.inference_parameters, EmbeddingInferenceParameters): - return GenerationType.EMBEDDING - elif isinstance(self.inference_parameters, ImageGenerationInferenceParameters): - return 
GenerationType.IMAGE_GENERATION - else: - raise ValueError(f"Unsupported inference parameters type: {type(self.inference_parameters)}") + @model_validator(mode="after") + def _validate_generation_type(self) -> Self: + generation_type_instance_map = { + GenerationType.CHAT_COMPLETION: CompletionInferenceParameters, + GenerationType.EMBEDDING: EmbeddingInferenceParameters, + GenerationType.IMAGE_GENERATION: ImageGenerationInferenceParameters, + } + if self.generation_type not in generation_type_instance_map: + raise ValueError(f"Invalid generation type: {self.generation_type}") + if not isinstance(self.inference_parameters, generation_type_instance_map[self.generation_type]): + raise ValueError( + f"Inference parameters must be an instance of {generation_type_instance_map[self.generation_type].__name__!r} when generation_type is {self.generation_type!r}" + ) + return self class ModelProvider(ConfigBase): diff --git a/tests/config/test_models.py b/tests/config/test_models.py index f1f65401..40f6afe9 100644 --- a/tests/config/test_models.py +++ b/tests/config/test_models.py @@ -12,8 +12,11 @@ from data_designer.config.errors import InvalidConfigError from data_designer.config.models import ( CompletionInferenceParameters, + EmbeddingInferenceParameters, + GenerationType, ImageContext, ImageFormat, + ImageGenerationInferenceParameters, ManualDistribution, ManualDistributionParams, ModalityDataType, @@ -254,6 +257,48 @@ def test_load_model_configs(): load_model_configs(tmp_file.name) -def test_model_config_default_construction(): +def test_model_config_construction(): + # test default construction model_config = ModelConfig(alias="test", model="test") assert model_config.inference_parameters == CompletionInferenceParameters() + assert model_config.generation_type == GenerationType.CHAT_COMPLETION + + # test construction with completion inference parameters + completion_params = CompletionInferenceParameters(temperature=0.5, top_p=0.5, max_tokens=100) + model_config = 
ModelConfig(alias="test", model="test", inference_parameters=completion_params) + assert model_config.inference_parameters == completion_params + assert model_config.generation_type == GenerationType.CHAT_COMPLETION + + # test construction with embedding inference parameters + embedding_params = EmbeddingInferenceParameters(dimensions=100) + model_config = ModelConfig( + alias="test", model="test", generation_type=GenerationType.EMBEDDING, inference_parameters=embedding_params + ) + assert model_config.inference_parameters == embedding_params + assert model_config.generation_type == GenerationType.EMBEDDING + + # test construction with image generation inference parameters + image_generation_params = ImageGenerationInferenceParameters(size="1024x1024", quality="standard") + model_config = ModelConfig( + alias="test", + model="test", + generation_type=GenerationType.IMAGE_GENERATION, + inference_parameters=image_generation_params, + ) + assert model_config.inference_parameters == image_generation_params + assert model_config.generation_type == GenerationType.IMAGE_GENERATION + + +def test_model_config_invalid_generation_type(): + with pytest.raises(ValidationError, match="Input should be"): + ModelConfig(alias="test", model="test", generation_type="invalid_generation_type") + with pytest.raises( + ValidationError, + match="Inference parameters must be an instance of 'EmbeddingInferenceParameters' when generation_type is 'embedding'", + ): + ModelConfig( + alias="test", + model="test", + generation_type=GenerationType.EMBEDDING, + inference_parameters=CompletionInferenceParameters(), + ) From 06a724b4090df4f150d348dd7c5e9b67b562daa4 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 3 Dec 2025 11:07:57 -0700 Subject: [PATCH 19/69] remove regex based chunking from embedding generator --- src/data_designer/config/column_configs.py | 7 +--- src/data_designer/config/models.py | 2 +- .../column_generators/generators/embedding.py | 10 ++--- 
src/data_designer/engine/processing/utils.py | 38 +++++++++++++++++++ tests/engine/processing/test_utils.py | 17 +++++++++ 5 files changed, 62 insertions(+), 12 deletions(-) diff --git a/src/data_designer/config/column_configs.py b/src/data_designer/config/column_configs.py index eb93f9f0..a3bef936 100644 --- a/src/data_designer/config/column_configs.py +++ b/src/data_designer/config/column_configs.py @@ -386,17 +386,14 @@ class EmbeddingColumnConfig(SingleColumnConfig): Attributes: column_type: Discriminator field, always "embedding" for this configuration type. - target_column: The column to generate embeddings for. + target_column: The column to generate embeddings for. The column could be a single text string or a list of text strings in stringified JSON format. + If it is a list of text strings in stringified JSON format, the embeddings will be generated for each text string. model_alias: The model to use for embedding generation. - chunk_pattern: Optional regex pattern to split the text in the target column into chunks. For example, if chunk_pattern - is r'\n+', the text will be split into chunks using one or more newlines as separators and embeddings generated for each chunk. - If not provided, the entire text will be embedded as a single chunk. 
""" column_type: Literal["embedding"] = "embedding" target_column: str model_alias: str - chunk_pattern: Optional[str] = None @property def required_columns(self) -> list[str]: diff --git a/src/data_designer/config/models.py b/src/data_designer/config/models.py index b10deca0..4b3ae12c 100644 --- a/src/data_designer/config/models.py +++ b/src/data_designer/config/models.py @@ -271,7 +271,7 @@ class ModelConfig(ConfigBase): alias: str model: str inference_parameters: InferenceParametersT = Field(default_factory=CompletionInferenceParameters) - generation_type: GenerationType = Field(default=GenerationType.CHAT_COMPLETION) + generation_type: Optional[GenerationType] = Field(default=GenerationType.CHAT_COMPLETION) provider: Optional[str] = None @model_validator(mode="after") diff --git a/src/data_designer/engine/column_generators/generators/embedding.py b/src/data_designer/engine/column_generators/generators/embedding.py index 48fc309f..ed738e8f 100644 --- a/src/data_designer/engine/column_generators/generators/embedding.py +++ b/src/data_designer/engine/column_generators/generators/embedding.py @@ -1,7 +1,6 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -import re from data_designer.config.column_configs import EmbeddingColumnConfig from data_designer.engine.column_generators.generators.base import ( @@ -10,7 +9,7 @@ GeneratorMetadata, WithModelGeneration, ) -from data_designer.engine.processing.utils import deserialize_json_values +from data_designer.engine.processing.utils import deserialize_json_values, parse_list_string from data_designer.engine.resources.resource_provider import ResourceType @@ -26,10 +25,9 @@ def metadata() -> GeneratorMetadata: def generate(self, data: dict) -> dict: deserialized_record = deserialize_json_values(data) - input_text = deserialized_record[self.config.target_column] - input_chunks = re.split(self.config.chunk_pattern, input_text) if self.config.chunk_pattern else [input_text] - input_chunks = [chunk.strip() for chunk in input_chunks if chunk.strip()] - embeddings = self.model.generate_text_embeddings(input_texts=input_chunks) + input_texts = parse_list_string(deserialized_record[self.config.target_column]) + embeddings = self.model.generate_text_embeddings(input_texts=input_texts) + data[self.config.name] = { "embeddings": embeddings, "num_embeddings": len(embeddings), diff --git a/src/data_designer/engine/processing/utils.py b/src/data_designer/engine/processing/utils.py index 3579b3bd..5d42c40e 100644 --- a/src/data_designer/engine/processing/utils.py +++ b/src/data_designer/engine/processing/utils.py @@ -1,8 +1,10 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 +import ast import json import logging +import re from typing import Any, TypeVar, Union, overload import pandas as pd @@ -100,6 +102,42 @@ def deserialize_json_values(data): return data +def parse_list_string(text: str) -> list[str]: + """Parse a list from a string, handling JSON arrays, Python lists, and trailing commas.""" + text = text.strip() + + # Try JSON first + try: + list_obj = json.loads(text) + if isinstance(list_obj, list): + return _clean_whitespace(list_obj) + except json.JSONDecodeError: + pass + + # Remove trailing commas before closing brackets (common in JSON-like strings) + text_cleaned = re.sub(r",\s*]", "]", text) + text_cleaned = re.sub(r",\s*}", "}", text_cleaned) + + # Try JSON again with cleaned text + try: + return _clean_whitespace(json.loads(text_cleaned)) + except json.JSONDecodeError: + pass + + # Try Python literal eval (handles single quotes) + try: + return _clean_whitespace(ast.literal_eval(text_cleaned)) + except (ValueError, SyntaxError): + pass + + # If all else fails, return the original text + return [text.strip()] + + +def _clean_whitespace(texts: list[str]) -> list[str]: + return [text.strip() for text in texts] + + def _verify_columns_are_unique(datasets: list[pd.DataFrame]) -> None: joined_columns = set() for df in datasets: diff --git a/tests/engine/processing/test_utils.py b/tests/engine/processing/test_utils.py index a41e0ec2..dec0fe6a 100644 --- a/tests/engine/processing/test_utils.py +++ b/tests/engine/processing/test_utils.py @@ -9,6 +9,7 @@ from data_designer.engine.processing.utils import ( concat_datasets, deserialize_json_values, + parse_list_string, ) @@ -116,3 +117,19 @@ def test_concat_datasets_logging(mock_logger, stub_sample_dataframes): def test_deserialize_json_values_scenarios(test_case, input_data, expected_result): result = deserialize_json_values(input_data) assert result == expected_result + + +@pytest.mark.parametrize( + "input_string,expected_result", + [ + 
('["a", "b", "c"]', ["a", "b", "c"]), # valid stringified json array + ('[" a ", " b", "c "]', ["a", "b", "c"]), # valid stringified json array with whitespace + ('["a", "b", "c",]', ["a", "b", "c"]), # valid stringified json array with trailing comma + ("['a', 'b', 'c']", ["a", "b", "c"]), # valid python-style list with single quotes + ("['a', 'b', 'c', ]", ["a", "b", "c"]), # valid python-style list with trailing comma + ("simple string ", ["simple string"]), # simple string with whitespace + ], +) +def test_parse_list_string_scenarios(input_string, expected_result): + result = parse_list_string(input_string) + assert result == expected_result From f291033e6e1e0debdf31f10f732931c724370afe Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 4 Feb 2026 09:59:52 -0700 Subject: [PATCH 20/69] save progress --- .../src/data_designer/config/__init__.py | 6 + .../data_designer/config/column_configs.py | 11 +- .../src/data_designer/config/models.py | 65 ++- .../config/utils/visualization.py | 128 ++++++ .../column_generators/generators/image.py | 20 +- .../src/data_designer/engine/models/facade.py | 116 ++++- .../integrations/huggingface/client.py | 419 ++++++++++++++++++ pyproject.toml | 3 + uv.lock | 412 ++++++++++++++++- 9 files changed, 1148 insertions(+), 32 deletions(-) create mode 100644 packages/data-designer/src/data_designer/integrations/huggingface/client.py diff --git a/packages/data-designer-config/src/data_designer/config/__init__.py b/packages/data-designer-config/src/data_designer/config/__init__.py index 0ebf06be..46122609 100644 --- a/packages/data-designer-config/src/data_designer/config/__init__.py +++ b/packages/data-designer-config/src/data_designer/config/__init__.py @@ -15,6 +15,7 @@ from data_designer.config.column_configs import ( # noqa: F401 EmbeddingColumnConfig, ExpressionColumnConfig, + ImageGenerationColumnConfig, LLMCodeColumnConfig, LLMJudgeColumnConfig, LLMStructuredColumnConfig, @@ -34,7 +35,9 @@ ToolConfig, ) from 
data_designer.config.models import ( # noqa: F401 + ChatCompletionImageInferenceParams, ChatCompletionInferenceParams, + DiffusionImageInferenceParams, EmbeddingInferenceParams, GenerationType, ImageContext, @@ -117,6 +120,7 @@ # column_configs "EmbeddingColumnConfig": (_MOD_COLUMN_CONFIGS, "EmbeddingColumnConfig"), "ExpressionColumnConfig": (_MOD_COLUMN_CONFIGS, "ExpressionColumnConfig"), + "ImageGenerationColumnConfig": (_MOD_COLUMN_CONFIGS, "ImageGenerationColumnConfig"), "LLMCodeColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMCodeColumnConfig"), "LLMJudgeColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMJudgeColumnConfig"), "LLMStructuredColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMStructuredColumnConfig"), @@ -138,7 +142,9 @@ "MCPProvider": (_MOD_MCP, "MCPProvider"), "ToolConfig": (_MOD_MCP, "ToolConfig"), # models + "ChatCompletionImageInferenceParams": (_MOD_MODELS, "ChatCompletionImageInferenceParams"), "ChatCompletionInferenceParams": (_MOD_MODELS, "ChatCompletionInferenceParams"), + "DiffusionImageInferenceParams": (_MOD_MODELS, "DiffusionImageInferenceParams"), "EmbeddingInferenceParams": (_MOD_MODELS, "EmbeddingInferenceParams"), "GenerationType": (_MOD_MODELS, "GenerationType"), "ImageContext": (_MOD_MODELS, "ImageContext"), diff --git a/packages/data-designer-config/src/data_designer/config/column_configs.py b/packages/data-designer-config/src/data_designer/config/column_configs.py index ee5efa80..9e1f5737 100644 --- a/packages/data-designer-config/src/data_designer/config/column_configs.py +++ b/packages/data-designer-config/src/data_designer/config/column_configs.py @@ -480,7 +480,14 @@ def side_effect_columns(self) -> list[str]: class ImageGenerationColumnConfig(SingleColumnConfig): """Configuration for image generation columns. - Image columns generate images using a specified model. + Image columns generate images using either autoregressive or diffusion models. 
+ The API used is automatically determined by the model's inference parameters: + + - **Autoregressive models** (ChatCompletionImageInferenceParams): + GPT-5, gpt-image-*, Gemini image generation models via chat completions API + + - **Diffusion models** (DiffusionImageInferenceParams): + DALL-E, Imagen, Stable Diffusion via image_generation API Attributes: column_type: Discriminator field, always "image-generation" for this configuration type. @@ -505,7 +512,7 @@ def required_columns(self) -> list[str]: Returns: List of unique column names referenced in Jinja2 templates. """ - return list(extract_keywords_from_jinja2_template(self.expr)) + return list(extract_keywords_from_jinja2_template(self.prompt)) @model_validator(mode="after") def assert_prompt_valid_jinja(self) -> Self: diff --git a/packages/data-designer-config/src/data_designer/config/models.py b/packages/data-designer-config/src/data_designer/config/models.py index 5e9b3518..203ddbdb 100644 --- a/packages/data-designer-config/src/data_designer/config/models.py +++ b/packages/data-designer-config/src/data_designer/config/models.py @@ -242,7 +242,8 @@ def sample(self) -> float: class GenerationType(str, Enum): CHAT_COMPLETION = "chat-completion" EMBEDDING = "embedding" - IMAGE_GENERATION = "image-generation" + CHAT_COMPLETION_IMAGE = "chat-completion-image" + DIFFUSION_IMAGE = "diffusion-image" class BaseInferenceParams(ConfigBase, ABC): @@ -415,23 +416,64 @@ def generate_kwargs(self) -> dict[str, float | int]: return result -class ImageGenerationInferenceParams(BaseInferenceParams): - generation_type: Literal[GenerationType.IMAGE_GENERATION] = GenerationType.IMAGE_GENERATION +class ChatCompletionImageInferenceParams(BaseInferenceParams): + """Configuration for image generation using autoregressive models via chat completions API. + + Uses the standard chat completions API for autoregressive multimodal models + that can generate images (GPT-5, gpt-image-*, Gemini image generation, etc.). 
+ + Attributes: + generation_type: Type of generation, always "chat-completion-image" for this class. + quality: Optional quality setting for image generation (e.g., "standard", "hd"). + size: Optional size specification for generated images (e.g., "1024x1024", "1792x1024"). + """ + + generation_type: Literal[GenerationType.CHAT_COMPLETION_IMAGE] = GenerationType.CHAT_COMPLETION_IMAGE + quality: str | None = None + size: str | None = None + + @property + def generate_kwargs(self) -> dict[str, Any]: + result = super().generate_kwargs + if self.quality is not None: + result["quality"] = self.quality + if self.size is not None: + result["size"] = self.size + return result + + +class DiffusionImageInferenceParams(BaseInferenceParams): + """Configuration for image generation using diffusion models via image_generation API. + + Uses the legacy image_generation API for diffusion models like DALL-E, Imagen, + and Stable Diffusion. + + Attributes: + generation_type: Type of generation, always "diffusion-image" for this class. + quality: Quality setting for image generation (e.g., "standard", "hd"). + size: Size specification for generated images (e.g., "1024x1024", "1792x1024"). + output_format: Format of the output ("url" or "base64"). Default: "base64". 
+ """ + + generation_type: Literal[GenerationType.DIFFUSION_IMAGE] = GenerationType.DIFFUSION_IMAGE quality: str size: str - output_format: ModalityDataType | None = ModalityDataType.BASE64 + output_format: ModalityDataType = ModalityDataType.BASE64 @property def generate_kwargs(self) -> dict[str, Any]: result = super().generate_kwargs result["size"] = self.size result["quality"] = self.quality - result["response_format"] = "b64_json" if self.output_format == ModalityDataType.BASE64 else self.output_format + result["response_format"] = "b64_json" if self.output_format == ModalityDataType.BASE64 else "url" return result InferenceParamsT: TypeAlias = Annotated[ - ChatCompletionInferenceParams | EmbeddingInferenceParams | ImageGenerationInferenceParams, + ChatCompletionInferenceParams + | EmbeddingInferenceParams + | ChatCompletionImageInferenceParams + | DiffusionImageInferenceParams, Field(discriminator="generation_type"), ] @@ -464,8 +506,15 @@ def generation_type(self) -> GenerationType: def _convert_inference_parameters(cls, value: Any) -> Any: """Convert raw dict to appropriate inference parameters type based on field presence.""" if isinstance(value, dict): - # Infer type from presence of embedding-specific fields - if "encoding_format" in value or "dimensions" in value: + # Check for explicit generation_type first + gen_type = value.get("generation_type") + + # Infer type from generation_type or field presence + if gen_type == "chat-completion-image": + return ChatCompletionImageInferenceParams(**value) + elif gen_type == "diffusion-image": + return DiffusionImageInferenceParams(**value) + elif gen_type == "embedding" or "encoding_format" in value or "dimensions" in value: return EmbeddingInferenceParams(**value) else: return ChatCompletionInferenceParams(**value) diff --git a/packages/data-designer-config/src/data_designer/config/utils/visualization.py b/packages/data-designer-config/src/data_designer/config/utils/visualization.py index 7e5c79a9..38189068 
100644 --- a/packages/data-designer-config/src/data_designer/config/utils/visualization.py +++ b/packages/data-designer-config/src/data_designer/config/utils/visualization.py @@ -3,6 +3,8 @@ from __future__ import annotations +import base64 +import io import json import os from collections import OrderedDict @@ -39,6 +41,93 @@ console = Console() +def _is_base64_image(value: str) -> bool: + """Check if a string is base64-encoded image data.""" + if not isinstance(value, str): + return False + # Check if it starts with data URI scheme + if value.startswith("...") or plain base64 + + Returns: + Base64 string without data URI prefix + + Raises: + ModelAPIError: If data URI format is invalid + """ + if data.startswith("data:image/"): + # Extract base64 portion after comma + if "," in data: + return data.split(",", 1)[1] + else: + raise ModelAPIError("Invalid data URI format: missing comma separator") + + # Already plain base64 + return data + + def _download_url_to_base64(self, url: str) -> str: + """Download image from URL and convert to base64. + + Args: + url: Image URL + + Returns: + Base64-encoded image string + + Raises: + ModelAPIError: If download fails + """ + import base64 + + from data_designer.lazy_heavy_imports import httpx + + try: + with httpx.Client(timeout=30.0) as client: + response = client.get(url) + response.raise_for_status() + image_bytes = response.content + return base64.b64encode(image_bytes).decode("utf-8") + except Exception as e: + raise ModelAPIError(f"Failed to download image from URL {url}: {e}") from e diff --git a/packages/data-designer-engine/src/data_designer/engine/storage/__init__.py b/packages/data-designer-engine/src/data_designer/engine/storage/__init__.py new file mode 100644 index 00000000..ad7ef0d5 --- /dev/null +++ b/packages/data-designer-engine/src/data_designer/engine/storage/__init__.py @@ -0,0 +1,6 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from data_designer.engine.storage.image_storage import ImageFormat, ImageStorageManager + +__all__ = ["ImageFormat", "ImageStorageManager"] diff --git a/packages/data-designer-engine/src/data_designer/engine/storage/image_storage.py b/packages/data-designer-engine/src/data_designer/engine/storage/image_storage.py new file mode 100644 index 00000000..d632bbc1 --- /dev/null +++ b/packages/data-designer-engine/src/data_designer/engine/storage/image_storage.py @@ -0,0 +1,166 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import base64 +import uuid +from enum import Enum +from pathlib import Path + + +class ImageFormat(str, Enum): + """Supported image formats.""" + + PNG = "png" + JPEG = "jpeg" + JPG = "jpg" + WEBP = "webp" + + +class ImageStorageManager: + """Manages disk storage of generated images. + + Handles: + - Creating images directory + - Decoding base64 to bytes + - Detecting image format + - Saving with UUID filenames + - Returning relative paths + """ + + def __init__(self, base_path: Path, images_subdir: str = "images", validate_images: bool = True) -> None: + """Initialize image storage manager. + + Args: + base_path: Base directory for dataset + images_subdir: Subdirectory name for images (default: "images") + validate_images: Whether to validate images after saving (default: True) + """ + self.base_path = Path(base_path) + self.images_dir = self.base_path / images_subdir + self.images_subdir = images_subdir + self.validate_images = validate_images + self._ensure_images_directory() + + def _ensure_images_directory(self) -> None: + """Create images directory if it doesn't exist.""" + self.images_dir.mkdir(parents=True, exist_ok=True) + + def save_base64_image(self, base64_data: str) -> str: + """Save base64 image to disk and return relative path. 
+ + Args: + base64_data: Base64 encoded image string (with or without data URI prefix) + + Returns: + Relative path to saved image (e.g., "images/f47ac10b-58cc.png") + + Raises: + ValueError: If base64 data is invalid + OSError: If disk write fails + """ + # Decode base64 to bytes + image_bytes = self._decode_base64(base64_data) + + # Detect format + image_format = self._detect_format(image_bytes) + + # Generate unique filename + image_id = uuid.uuid4() + filename = f"{image_id}.{image_format.value}" + full_path = self.images_dir / filename + relative_path = f"{self.images_subdir}/{filename}" + + # Write to disk + with open(full_path, "wb") as f: + f.write(image_bytes) + + # Optional validation + if self.validate_images: + self._validate_image(full_path) + + return relative_path + + def _decode_base64(self, base64_data: str) -> bytes: + """Decode base64 string to bytes. + + Args: + base64_data: Base64 string (with or without data URI prefix) + + Returns: + Decoded bytes + + Raises: + ValueError: If base64 data is invalid + """ + # Remove data URI prefix if present (e.g., "data:image/png;base64,") + if base64_data.startswith("data:"): + if "," in base64_data: + base64_data = base64_data.split(",", 1)[1] + else: + raise ValueError("Invalid data URI format: missing comma separator") + + try: + return base64.b64decode(base64_data, validate=True) + except Exception as e: + raise ValueError(f"Invalid base64 data: {e}") from e + + def _detect_format(self, image_bytes: bytes) -> ImageFormat: + """Detect image format from bytes. 
+ + Args: + image_bytes: Image data as bytes + + Returns: + Detected format (defaults to PNG if unknown) + """ + # Check magic bytes first (fast) + if image_bytes.startswith(b"\x89PNG\r\n\x1a\n"): + return ImageFormat.PNG + elif image_bytes.startswith(b"\xff\xd8\xff"): + return ImageFormat.JPG + elif image_bytes.startswith(b"RIFF") and b"WEBP" in image_bytes[:12]: + return ImageFormat.WEBP + + # Fallback to PIL for robust detection + try: + import io + + from PIL import Image + + img = Image.open(io.BytesIO(image_bytes)) + format_str = img.format.lower() if img.format else None + if format_str in ["png", "jpeg", "jpg", "webp"]: + return ImageFormat(format_str if format_str != "jpeg" else "jpg") + except Exception: + pass + + # Default to PNG + return ImageFormat.PNG + + def _validate_image(self, image_path: Path) -> None: + """Validate that saved image is readable. + + Args: + image_path: Path to image file + + Raises: + ValueError: If image is corrupted or unreadable + """ + try: + from PIL import Image + + with Image.open(image_path) as img: + img.verify() + except Exception as e: + # Clean up invalid file + image_path.unlink(missing_ok=True) + raise ValueError(f"Saved image is invalid or corrupted: {e}") from e + + def cleanup(self) -> None: + """Clean up image directory (for preview mode).""" + import shutil + + if self.images_dir.exists(): + shutil.rmtree(self.images_dir) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index c047d73b..2e84ee3c 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -66,6 +66,7 @@ def upload_dataset( Uploads the complete dataset including: - Main parquet batch files from parquet-files/ → data/ + - Images from images/ → images/ (if present) - Processor output batch files from
processors-files/{name}/ → {name}/ - Existing builder_config.json and metadata.json files - Auto-generated README.md (dataset card) @@ -102,6 +103,7 @@ def upload_dataset( raise HuggingFaceHubClientUploadError(f"Failed to upload dataset card: {e}") from e self._upload_main_dataset_files(repo_id=repo_id, parquet_folder=base_dataset_path / FINAL_DATASET_FOLDER_NAME) + self._upload_images_folder(repo_id=repo_id, images_folder=base_dataset_path / "images") self._upload_processor_files( repo_id=repo_id, processors_folder=base_dataset_path / PROCESSORS_OUTPUTS_FOLDER_NAME ) @@ -178,6 +180,36 @@ def _upload_main_dataset_files(self, repo_id: str, parquet_folder: Path) -> None except Exception as e: raise HuggingFaceHubClientUploadError(f"Failed to upload parquet files: {e}") from e + def _upload_images_folder(self, repo_id: str, images_folder: Path) -> None: + """Upload images folder to Hugging Face Hub. + + Args: + repo_id: Hugging Face dataset repo ID + images_folder: Path to images folder + + Raises: + HuggingFaceUploadError: If upload fails + """ + if not images_folder.exists(): + return + + image_files = list(images_folder.glob("*")) + if not image_files: + return + + logger.info(f" |-- {RandomEmoji.loading()} Uploading {len(image_files)} images...") + + try: + self._api.upload_folder( + repo_id=repo_id, + folder_path=str(images_folder), + path_in_repo="images", + repo_type="dataset", + commit_message="Upload images", + ) + except Exception as e: + raise HuggingFaceHubClientUploadError(f"Failed to upload images: {e}") from e + def _upload_processor_files(self, repo_id: str, processors_folder: Path) -> None: """Upload processor output files.
From ed9787bf297a5a57c90f5b58ebd049a2fbe07cae Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 6 Feb 2026 10:17:21 -0700 Subject: [PATCH 22/69] support generation of multiple images --- .../src/data_designer/config/models.py | 14 +++- .../config/utils/visualization.py | 24 ++++++- .../column_generators/generators/image.py | 20 +++--- .../src/data_designer/engine/models/facade.py | 72 ++++++++++--------- 4 files changed, 84 insertions(+), 46 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/models.py b/packages/data-designer-config/src/data_designer/config/models.py index dc3533e3..3dab2d8d 100644 --- a/packages/data-designer-config/src/data_designer/config/models.py +++ b/packages/data-designer-config/src/data_designer/config/models.py @@ -435,8 +435,11 @@ class ImageInferenceParams(BaseInferenceParams): - Preview mode: Images stored as base64 directly in dataframe Common parameters like quality and size are provided as optional fields. - For model-specific parameters, use the `extra_body` field inherited from - BaseInferenceParams. + For model-specific parameters (including n for number of images), use the `extra_body` + field inherited from BaseInferenceParams. + + If the API returns multiple images (either from prompt or API parameters), all images + will be stored as a list in the dataframe. Attributes: generation_type: Type of generation, always "image" for this class. 
@@ -451,6 +454,13 @@ class ImageInferenceParams(BaseInferenceParams): size="1024x1024" ) + # Generate multiple images using extra_body + dd.ImageInferenceParams( + quality="hd", + size="1024x1024", + extra_body={"n": 3} # Request 3 images from API + ) + # With model-specific params via extra_body dd.ImageInferenceParams( quality="hd", diff --git a/packages/data-designer-config/src/data_designer/config/utils/visualization.py b/packages/data-designer-config/src/data_designer/config/utils/visualization.py index 56d28fd3..62b57f5e 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/visualization.py +++ b/packages/data-designer-config/src/data_designer/config/utils/visualization.py @@ -394,7 +394,28 @@ def display_sample_record( if col.drop: continue image_data = record[col.name] - if _is_base64_image(image_data): + + # Handle list of images + if isinstance(image_data, list): + previews = [] + for idx, img in enumerate(image_data): + if _is_base64_image(img): + previews.append(f"[{idx}] ") + if in_notebook: + images_to_display_later.append((f"{col.name}[{idx}]", img)) + elif _is_image_url(img): + previews.append(f"[{idx}] ") + if in_notebook: + images_to_display_later.append((f"{col.name}[{idx}]", img)) + elif _is_image_path(img): + previews.append(f"[{idx}] ") + if in_notebook: + images_to_display_later.append((f"{col.name}[{idx}]", img)) + else: + previews.append(f"[{idx}] {str(img)[:30]}") + preview = "\n".join(previews) if previews else "" + # Handle single image (backwards compatibility) + elif _is_base64_image(image_data): preview = f"" if in_notebook: images_to_display_later.append((col.name, image_data)) @@ -408,6 +429,7 @@ def display_sample_record( images_to_display_later.append((col.name, image_data)) else: preview = str(image_data)[:100] + "..." 
if len(str(image_data)) > 100 else str(image_data) + table.add_row(col.name, preview) render_list.append(pad_console_element(table)) diff --git a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py index 2d24fc2d..db3c9c9e 100644 --- a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py +++ b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py @@ -36,13 +36,13 @@ def get_generation_strategy() -> GenerationStrategy: return GenerationStrategy.CELL_BY_CELL def generate(self, data: dict) -> dict: - """Generate image and optionally save to disk. + """Generate image(s) and optionally save to disk. Args: data: Record data Returns: - Record with image path (create mode) or base64 data (preview mode) added + Record with image path(s) (create mode) or base64 data (preview mode) added """ deserialized_record = deserialize_json_values(data) @@ -63,16 +63,18 @@ def generate(self, data: dict) -> dict: if not prompt or not prompt.strip(): raise ValueError(f"Rendered prompt for column {self.config.name!r} is empty") - # Generate image (returns base64 string) - base64_image = self.model.generate_image(prompt=prompt) + # Generate images (returns list of base64 strings) + base64_images = self.model.generate_image(prompt=prompt) # Store in dataframe based on mode if self.image_storage_manager: - # Create mode: save to disk and store relative path - relative_path = self.image_storage_manager.save_base64_image(base64_image) - data[self.config.name] = relative_path + # Create mode: save each image to disk and store list of relative paths + relative_paths = [ + self.image_storage_manager.save_base64_image(base64_image) for base64_image in base64_images + ] + data[self.config.name] = relative_paths else: - # Preview mode: store base64 directly - data[self.config.name] = 
base64_image + # Preview mode: store list of base64 strings directly + data[self.config.name] = base64_images return data diff --git a/packages/data-designer-engine/src/data_designer/engine/models/facade.py b/packages/data-designer-engine/src/data_designer/engine/models/facade.py index 1abd235b..b78d2e1e 100644 --- a/packages/data-designer-engine/src/data_designer/engine/models/facade.py +++ b/packages/data-designer-engine/src/data_designer/engine/models/facade.py @@ -163,22 +163,23 @@ def generate_text_embeddings( self._track_usage_from_embedding(response) @catch_llm_exceptions - def generate_image(self, prompt: str, skip_usage_tracking: bool = False, **kwargs) -> str: - """Generate image and return base64-encoded data. + def generate_image(self, prompt: str, skip_usage_tracking: bool = False, **kwargs) -> list[str]: + """Generate image(s) and return base64-encoded data. Automatically detects the appropriate API based on model name: - Diffusion models (DALL-E, Stable Diffusion, Imagen, etc.) → image_generation API - All other models → chat/completions API (default) - Both paths return base64-encoded image data. + Both paths return base64-encoded image data. If the API returns multiple images, + all are returned in the list. 
Args: prompt: The prompt for image generation skip_usage_tracking: Whether to skip usage tracking - **kwargs: Additional arguments to pass to the model + **kwargs: Additional arguments to pass to the model (including n=number of images) Returns: - Base64-encoded image string (without data URI prefix) + List of base64-encoded image strings (without data URI prefix) Raises: ModelAPIError: If image generation fails or returns invalid data @@ -214,11 +215,11 @@ def _is_diffusion_model(self) -> bool: ] return any(pattern in model_lower for pattern in diffusion_patterns) - def _generate_image_chat_completion(self, prompt: str, skip_usage_tracking: bool = False, **kwargs) -> str: - """Generate image using autoregressive model via chat completions API. + def _generate_image_chat_completion(self, prompt: str, skip_usage_tracking: bool = False, **kwargs) -> list[str]: + """Generate image(s) using autoregressive model via chat completions API. Returns: - Base64-encoded image string + List of base64-encoded image strings """ kwargs = self.consolidate_kwargs(**kwargs) messages = [ChatMessage.as_user(content=prompt)] @@ -232,7 +233,7 @@ def _generate_image_chat_completion(self, prompt: str, skip_usage_tracking: bool ) logger.debug( - f"Received image from autoregressive model {self.model_name!r}", + f"Received image(s) from autoregressive model {self.model_name!r}", extra={"model": self.model_name, "response": response}, ) @@ -241,42 +242,45 @@ def _generate_image_chat_completion(self, prompt: str, skip_usage_tracking: bool raise ModelAPIError("Response missing choices") message = response.choices[0].message + images = [] # Extract base64 from images attribute (primary path) if hasattr(message, "images") and message.images: - first_image = message.images[0] - - # Handle different response formats - if isinstance(first_image, dict) and "image_url" in first_image: - image_url = first_image["image_url"] - - if isinstance(image_url, dict) and "url" in image_url: - url = 
image_url["url"] - return self._extract_base64_from_data_uri(url) - elif isinstance(image_url, str): - return self._extract_base64_from_data_uri(image_url) - - # Fallback: treat as base64 string - if isinstance(first_image, str): - return self._extract_base64_from_data_uri(first_image) + for image in message.images: + # Handle different response formats + if isinstance(image, dict) and "image_url" in image: + image_url = image["image_url"] + + if isinstance(image_url, dict) and "url" in image_url: + url = image_url["url"] + images.append(self._extract_base64_from_data_uri(url)) + elif isinstance(image_url, str): + images.append(self._extract_base64_from_data_uri(image_url)) + # Fallback: treat as base64 string + elif isinstance(image, str): + images.append(self._extract_base64_from_data_uri(image)) # Fallback: check content field - content = message.content or "" - if content: - return self._extract_base64_from_data_uri(content) + if not images: + content = message.content or "" + if content: + images.append(self._extract_base64_from_data_uri(content)) + + if not images: + raise ModelAPIError("No image data found in response") - raise ModelAPIError("No image data found in response") + return images except Exception: raise - def _generate_image_diffusion(self, prompt: str, skip_usage_tracking: bool = False, **kwargs) -> str: - """Generate image using diffusion model via image_generation API. + def _generate_image_diffusion(self, prompt: str, skip_usage_tracking: bool = False, **kwargs) -> list[str]: + """Generate image(s) using diffusion model via image_generation API. Always returns base64. The API is configured to return base64 format. 
Returns: - Base64-encoded image string + List of base64-encoded image strings """ kwargs = self.consolidate_kwargs(**kwargs) @@ -289,7 +293,7 @@ def _generate_image_diffusion(self, prompt: str, skip_usage_tracking: bool = Fal response = self._router.image_generation(prompt=prompt, model=self.model_name, **kwargs) logger.debug( - f"Received image from diffusion model {self.model_name!r}", + f"Received {len(response.data)} image(s) from diffusion model {self.model_name!r}", extra={"model": self.model_name, "response": response}, ) @@ -297,8 +301,8 @@ def _generate_image_diffusion(self, prompt: str, skip_usage_tracking: bool = Fal if not response.data or len(response.data) == 0: raise ModelAPIError("Image generation returned no data") - # Return base64 data - return response.data[0].b64_json + # Return all images as list + return [img.b64_json for img in response.data] except Exception: raise From 7dea87a0e75e242951751cf3af14e94aac77eb46 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 6 Feb 2026 10:28:40 -0700 Subject: [PATCH 23/69] clean up visualization --- .../config/utils/image_helpers.py | 110 ++++++++++++++++++ .../config/utils/visualization.py | 94 +++------------ 2 files changed, 124 insertions(+), 80 deletions(-) create mode 100644 packages/data-designer-config/src/data_designer/config/utils/image_helpers.py diff --git a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py new file mode 100644 index 00000000..a32714d3 --- /dev/null +++ b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py @@ -0,0 +1,110 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Helper utilities for working with images.""" + +from __future__ import annotations + +import base64 +from pathlib import Path + +from data_designer.config.models import ImageFormat + + +def is_image_path(value: str) -> bool: + """Check if a string is an image file path. + + Args: + value: String to check + + Returns: + True if the string looks like an image file path, False otherwise + """ + if not isinstance(value, str): + return False + return any(value.lower().endswith(ext) for ext in get_supported_image_extensions()) + + +def is_base64_image(value: str) -> bool: + """Check if a string is base64-encoded image data. + + Args: + value: String to check + + Returns: + True if the string looks like base64-encoded image data, False otherwise + """ + if not isinstance(value, str): + return False + # Check if it starts with data URI scheme + if value.startswith("..." and returns + just the base64 portion. + + Args: + data: Data URI (e.g., "") or plain base64 + + Returns: + Base64 string without data URI prefix + + Raises: + ValueError: If data URI format is invalid + """ + if data.startswith("data:"): + if "," in data: + return data.split(",", 1)[1] + raise ValueError("Invalid data URI format: missing comma separator") + return data + + +def decode_base64_image(base64_data: str) -> bytes: + """Decode base64 string to image bytes. + + Automatically handles data URIs by extracting the base64 portion first. + + Args: + base64_data: Base64 string (with or without data URI prefix) + + Returns: + Decoded image bytes + + Raises: + ValueError: If base64 data is invalid + """ + # Remove data URI prefix if present + base64_data = extract_base64_from_data_uri(base64_data) + + try: + return base64.b64decode(base64_data, validate=True) + except Exception as e: + raise ValueError(f"Invalid base64 data: {e}") from e + + +def detect_image_format(image_bytes: bytes) -> ImageFormat: + """Detect image format from bytes. 
+ + Uses magic bytes for fast detection, falls back to PIL for robust detection. + + Args: + image_bytes: Image data as bytes + + Returns: + Detected format (defaults to PNG if unknown) + """ + # Check magic bytes first (fast) + if image_bytes.startswith(IMAGE_FORMAT_MAGIC_BYTES[ImageFormat.PNG]): + return ImageFormat.PNG + elif image_bytes.startswith(IMAGE_FORMAT_MAGIC_BYTES[ImageFormat.JPG]): + return ImageFormat.JPG + elif image_bytes.startswith(b"RIFF") and b"WEBP" in image_bytes[:12]: + return ImageFormat.WEBP + + # Fallback to PIL for robust detection + try: + img = PIL.Image.open(io.BytesIO(image_bytes)) + format_str = img.format.lower() if img.format else None + if format_str in ["png", "jpeg", "jpg", "webp"]: + return ImageFormat(format_str if format_str != "jpeg" else "jpg") + except Exception: + pass + + # Default to PNG + return ImageFormat.PNG def is_image_path(value: str) -> bool: diff --git a/packages/data-designer-config/src/data_designer/config/utils/visualization.py b/packages/data-designer-config/src/data_designer/config/utils/visualization.py index dd819aa3..c349ec86 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/visualization.py +++ b/packages/data-designer-config/src/data_designer/config/utils/visualization.py @@ -29,16 +29,18 @@ from data_designer.config.utils.constants import NVIDIA_API_KEY_ENV_VAR_NAME, OPENAI_API_KEY_ENV_VAR_NAME from data_designer.config.utils.errors import DatasetSampleDisplayError from data_designer.config.utils.image_helpers import ( + extract_base64_from_data_uri, is_base64_image, is_image_path, is_image_url, load_image_path_to_base64, ) -from data_designer.lazy_heavy_imports import np, pd +from data_designer.lazy_heavy_imports import PIL, np, pd if TYPE_CHECKING: import numpy as np import pandas as pd + import PIL from data_designer.config.config_builder import DataDesignerConfigBuilder from data_designer.config.dataset_metadata import DatasetMetadata @@ -64,7 +66,6 @@ def 
_display_image_if_in_notebook( try: # Check if we're in a Jupyter environment from IPython.display import HTML, display - from PIL import Image as PILImage get_ipython() # This will raise NameError if not in IPython/Jupyter @@ -77,23 +78,21 @@ def _display_image_if_in_notebook( ) return False base64_data = loaded_base64 - # Decode the image - elif image_data.startswith("" + result = extract_base64_from_data_uri(data_uri) + assert result == "iVBORw0KGgoAAAANS" + + +def test_extract_base64_plain_base64_without_prefix(): + plain_base64 = "iVBORw0KGgoAAAANS" + result = extract_base64_from_data_uri(plain_base64) + assert result == plain_base64 + + +def test_extract_base64_invalid_data_uri_raises_error(): + with pytest.raises(ValueError, match="Invalid data URI format: missing comma separator"): + extract_base64_from_data_uri("data:image/png;base64") + + +# Tests for decode_base64_image + + +def test_decode_base64_image_valid(): + png_bytes = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01" + base64_data = base64.b64encode(png_bytes).decode() + result = decode_base64_image(base64_data) + assert result == png_bytes + + +def test_decode_base64_image_with_data_uri(): + png_bytes = b"\x89PNG\r\n\x1a\n" + base64_data = base64.b64encode(png_bytes).decode() + data_uri = f"data:image/png;base64,{base64_data}" + result = decode_base64_image(data_uri) + assert result == png_bytes + + +def test_decode_base64_image_invalid_raises_error(): + with pytest.raises(ValueError, match="Invalid base64 data"): + decode_base64_image("not-valid-base64!!!") + + +# Tests for detect_image_format + + +def test_detect_image_format_png(): + png_magic = b"\x89PNG\r\n\x1a\n" + b"\x00" * 10 + assert detect_image_format(png_magic) == ImageFormat.PNG + + +def test_detect_image_format_jpg(): + jpg_magic = b"\xff\xd8\xff" + b"\x00" * 10 + assert detect_image_format(jpg_magic) == ImageFormat.JPG + + +def test_detect_image_format_webp(): + webp_magic = b"RIFF" + b"\x00" * 4 + b"WEBP" + assert 
detect_image_format(webp_magic) == ImageFormat.WEBP + + +def test_detect_image_format_unknown_defaults_to_png(): + unknown_bytes = b"\x00\x00\x00\x00" + b"\x00" * 10 + assert detect_image_format(unknown_bytes) == ImageFormat.PNG + + +# Tests for is_image_path + + +def test_is_image_path_various_extensions(): + assert is_image_path("/path/to/image.png") is True + assert is_image_path("image.PNG") is True + assert is_image_path("image.jpg") is True + assert is_image_path("image.jpeg") is True + + +def test_is_image_path_non_image(): + assert is_image_path("/path/to/file.txt") is False + assert is_image_path("document.pdf") is False + + +def test_is_image_path_extension_in_directory(): + assert is_image_path("/some.png/file.txt") is False + + +# Tests for is_base64_image + + +def test_is_base64_image_data_uri(): + assert is_base64_image("") is True + + +def test_is_base64_image_long_valid_base64(): + long_base64 = base64.b64encode(b"x" * 100).decode() + assert is_base64_image(long_base64) is True + + +def test_is_base64_image_short_string(): + assert is_base64_image("short") is False + + +# Tests for is_image_url + + +def test_is_image_url_http_and_https(): + assert is_image_url("http://example.com/image.png") is True + assert is_image_url("https://example.com/photo.jpg") is True + + +def test_is_image_url_with_query_params(): + assert is_image_url("https://example.com/image.png?size=large") is True + + +def test_is_image_url_without_image_extension(): + assert is_image_url("https://example.com/page.html") is False + + +def test_is_image_url_non_http(): + assert is_image_url("ftp://example.com/image.png") is False + + +# Tests for get_supported_image_extensions + + +def test_get_supported_image_extensions_matches_enum(): + result = get_supported_image_extensions() + enum_values = [f".{fmt.value}" for fmt in ImageFormat] + assert set(result) == set(enum_values) diff --git a/packages/data-designer-engine/src/data_designer/engine/models/facade.py 
b/packages/data-designer-engine/src/data_designer/engine/models/facade.py index b78d2e1e..d13273f4 100644 --- a/packages/data-designer-engine/src/data_designer/engine/models/facade.py +++ b/packages/data-designer-engine/src/data_designer/engine/models/facade.py @@ -9,6 +9,7 @@ from typing import TYPE_CHECKING, Any from data_designer.config.models import GenerationType, ModelConfig, ModelProvider +from data_designer.config.utils.image_helpers import extract_base64_from_data_uri from data_designer.engine.mcp.errors import MCPConfigurationError from data_designer.engine.model_provider import ModelProviderRegistry from data_designer.engine.models.errors import ( @@ -38,6 +39,16 @@ def _identity(x: Any) -> Any: logger = logging.getLogger(__name__) +# Patterns for detecting diffusion-based image generation models +DIFFUSION_MODEL_PATTERNS = [ + "dall-e", + "dalle", + "stable-diffusion", + "sd-", + "sd_", + "imagen", +] + class ModelFacade: def __init__( @@ -205,15 +216,7 @@ def _is_diffusion_model(self) -> bool: True if model is detected as diffusion-based, False otherwise """ model_lower = self.model_name.lower() - diffusion_patterns = [ - "dall-e", - "dalle", - "stable-diffusion", - "sd-", - "sd_", - "imagen", - ] - return any(pattern in model_lower for pattern in diffusion_patterns) + return any(pattern in model_lower for pattern in DIFFUSION_MODEL_PATTERNS) def _generate_image_chat_completion(self, prompt: str, skip_usage_tracking: bool = False, **kwargs) -> list[str]: """Generate image(s) using autoregressive model via chat completions API. 
@@ -253,18 +256,18 @@ def _generate_image_chat_completion(self, prompt: str, skip_usage_tracking: bool if isinstance(image_url, dict) and "url" in image_url: url = image_url["url"] - images.append(self._extract_base64_from_data_uri(url)) + images.append(extract_base64_from_data_uri(url)) elif isinstance(image_url, str): - images.append(self._extract_base64_from_data_uri(image_url)) + images.append(extract_base64_from_data_uri(image_url)) # Fallback: treat as base64 string elif isinstance(image, str): - images.append(self._extract_base64_from_data_uri(image)) + images.append(extract_base64_from_data_uri(image)) # Fallback: check content field if not images: content = message.content or "" if content: - images.append(self._extract_base64_from_data_uri(content)) + images.append(extract_base64_from_data_uri(content)) if not images: raise ModelAPIError("No image data found in response") @@ -535,28 +538,6 @@ def _track_usage_from_image_diffusion(self, response: litellm.types.utils.ImageR request_usage=RequestUsageStats(successful_requests=1, failed_requests=0), ) - def _extract_base64_from_data_uri(self, data: str) -> str: - """Extract base64 data from data URI or return as-is. - - Args: - data: Data URI (e.g., "...") or plain base64 - - Returns: - Base64 string without data URI prefix - - Raises: - ModelAPIError: If data URI format is invalid - """ - if data.startswith("data:image/"): - # Extract base64 portion after comma - if "," in data: - return data.split(",", 1)[1] - else: - raise ModelAPIError("Invalid data URI format: missing comma separator") - - # Already plain base64 - return data - def _download_url_to_base64(self, url: str) -> str: """Download image from URL and convert to base64. 
diff --git a/packages/data-designer-engine/src/data_designer/engine/storage/image_storage.py b/packages/data-designer-engine/src/data_designer/engine/storage/image_storage.py index d632bbc1..22d4bf84 100644 --- a/packages/data-designer-engine/src/data_designer/engine/storage/image_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/storage/image_storage.py @@ -3,19 +3,15 @@ from __future__ import annotations -import base64 import uuid -from enum import Enum from pathlib import Path +from typing import TYPE_CHECKING +from data_designer.config.utils.image_helpers import decode_base64_image, detect_image_format +from data_designer.lazy_heavy_imports import PIL -class ImageFormat(str, Enum): - """Supported image formats.""" - - PNG = "png" - JPEG = "jpeg" - JPG = "jpg" - WEBP = "webp" +if TYPE_CHECKING: + import PIL class ImageStorageManager: @@ -61,10 +57,10 @@ def save_base64_image(self, base64_data: str) -> str: OSError: If disk write fails """ # Decode base64 to bytes - image_bytes = self._decode_base64(base64_data) + image_bytes = decode_base64_image(base64_data) # Detect format - image_format = self._detect_format(image_bytes) + image_format = detect_image_format(image_bytes) # Generate unique filename image_id = uuid.uuid4() @@ -82,63 +78,6 @@ def save_base64_image(self, base64_data: str) -> str: return relative_path - def _decode_base64(self, base64_data: str) -> bytes: - """Decode base64 string to bytes. 
- - Args: - base64_data: Base64 string (with or without data URI prefix) - - Returns: - Decoded bytes - - Raises: - ValueError: If base64 data is invalid - """ - # Remove data URI prefix if present (e.g., "data:image/png;base64,") - if base64_data.startswith("data:"): - if "," in base64_data: - base64_data = base64_data.split(",", 1)[1] - else: - raise ValueError("Invalid data URI format: missing comma separator") - - try: - return base64.b64decode(base64_data, validate=True) - except Exception as e: - raise ValueError(f"Invalid base64 data: {e}") from e - - def _detect_format(self, image_bytes: bytes) -> ImageFormat: - """Detect image format from bytes. - - Args: - image_bytes: Image data as bytes - - Returns: - Detected format (defaults to PNG if unknown) - """ - # Check magic bytes first (fast) - if image_bytes.startswith(b"\x89PNG\r\n\x1a\n"): - return ImageFormat.PNG - elif image_bytes.startswith(b"\xff\xd8\xff"): - return ImageFormat.JPG - elif image_bytes.startswith(b"RIFF") and b"WEBP" in image_bytes[:12]: - return ImageFormat.WEBP - - # Fallback to PIL for robust detection - try: - import io - - from PIL import Image - - img = Image.open(io.BytesIO(image_bytes)) - format_str = img.format.lower() if img.format else None - if format_str in ["png", "jpeg", "jpg", "webp"]: - return ImageFormat(format_str if format_str != "jpeg" else "jpg") - except Exception: - pass - - # Default to PNG - return ImageFormat.PNG - def _validate_image(self, image_path: Path) -> None: """Validate that saved image is readable. 
@@ -149,9 +88,7 @@ def _validate_image(self, image_path: Path) -> None: ValueError: If image is corrupted or unreadable """ try: - from PIL import Image - - with Image.open(image_path) as img: + with PIL.Image.open(image_path) as img: img.verify() except Exception as e: # Clean up invalid file From 0f07f7b9501aaee80e68b994b301ccd464391f05 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 6 Feb 2026 13:11:54 -0700 Subject: [PATCH 25/69] Streamline integration for image generation --- .../config/utils/image_helpers.py | 20 +- .../tests/config/utils/test_image_helpers.py | 184 +++++++++++++++++- .../column_generators/generators/image.py | 20 +- .../dataset_builders/artifact_storage.py | 22 ++- .../dataset_builders/column_wise_builder.py | 45 +++-- .../data_designer/engine/storage/__init__.py | 4 +- ...image_storage.py => multimedia_storage.py} | 33 ++-- .../generators/test_image.py | 121 ++++++++++++ .../tests/engine/storage/__init__.py | 2 + .../engine/storage/test_multimedia_storage.py | 182 +++++++++++++++++ 10 files changed, 583 insertions(+), 50 deletions(-) rename packages/data-designer-engine/src/data_designer/engine/storage/{image_storage.py => multimedia_storage.py} (80%) create mode 100644 packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py create mode 100644 packages/data-designer-engine/tests/engine/storage/__init__.py create mode 100644 packages/data-designer-engine/tests/engine/storage/test_multimedia_storage.py diff --git a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py index 48dacbae..1f5ec332 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py +++ b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py @@ -92,8 +92,8 @@ def detect_image_format(image_bytes: bytes) -> ImageFormat: try: img = PIL.Image.open(io.BytesIO(image_bytes)) format_str 
= img.format.lower() if img.format else None - if format_str in ["png", "jpeg", "jpg", "webp"]: - return ImageFormat(format_str if format_str != "jpeg" else "jpg") + if format_str in [ImageFormat.PNG, ImageFormat.JPG, ImageFormat.JPEG, ImageFormat.WEBP]: + return ImageFormat(format_str if format_str != ImageFormat.JPEG else ImageFormat.JPG) except Exception: pass @@ -191,6 +191,22 @@ def load_image_path_to_base64(image_path: str, base_path: str | None = None) -> return None +def validate_image(image_path: Path) -> None: + """Validate that an image file is readable and not corrupted. + + Args: + image_path: Path to image file + + Raises: + ValueError: If image is corrupted or unreadable + """ + try: + with PIL.Image.open(image_path) as img: + img.verify() + except Exception as e: + raise ValueError(f"Image validation failed: {e}") from e + + def get_supported_image_extensions() -> list[str]: """Get list of supported image extensions from ImageFormat enum. diff --git a/packages/data-designer-config/tests/config/utils/test_image_helpers.py b/packages/data-designer-config/tests/config/utils/test_image_helpers.py index 3d6683e4..9c7ccd7f 100644 --- a/packages/data-designer-config/tests/config/utils/test_image_helpers.py +++ b/packages/data-designer-config/tests/config/utils/test_image_helpers.py @@ -4,7 +4,14 @@ from __future__ import annotations import base64 - +import io +from typing import TYPE_CHECKING +from unittest.mock import Mock, patch + +# Explicitly import PIL.Image submodule to make it accessible as PIL.Image +# Python doesn't automatically import submodules when you import a package, +# so `import PIL` alone doesn't give you access to PIL.Image +import PIL.Image # noqa: E402 import pytest from data_designer.config.models import ImageFormat @@ -16,7 +23,13 @@ is_base64_image, is_image_path, is_image_url, + load_image_path_to_base64, + validate_image, ) +from data_designer.lazy_heavy_imports import PIL + +if TYPE_CHECKING: + import PIL # Tests for 
extract_base64_from_data_uri @@ -139,6 +152,39 @@ def test_is_image_url_non_http(): assert is_image_url("ftp://example.com/image.png") is False +# Tests for validate_image + + +def test_validate_image_valid_png(tmp_path): + # Create a valid 1x1 PNG using PIL + img = PIL.Image.new("RGB", (1, 1), color="red") + buf = io.BytesIO() + img.save(buf, format="PNG") + png_bytes = buf.getvalue() + + image_path = tmp_path / "test.png" + image_path.write_bytes(png_bytes) + + # Should not raise + validate_image(image_path) + + +def test_validate_image_corrupted_raises_error(tmp_path): + # Create an invalid image file + image_path = tmp_path / "corrupted.png" + image_path.write_bytes(b"not a valid image") + + with pytest.raises(ValueError, match="Image validation failed"): + validate_image(image_path) + + +def test_validate_image_nonexistent_raises_error(tmp_path): + image_path = tmp_path / "nonexistent.png" + + with pytest.raises(ValueError, match="Image validation failed"): + validate_image(image_path) + + # Tests for get_supported_image_extensions @@ -146,3 +192,139 @@ def test_get_supported_image_extensions_matches_enum(): result = get_supported_image_extensions() enum_values = [f".{fmt.value}" for fmt in ImageFormat] assert set(result) == set(enum_values) + + +# Additional tests for uncovered lines + + +def test_detect_image_format_with_pil_fallback_unsupported_format(tmp_path): + # Create a real GIF image that will trigger PIL fallback + # (GIF has different magic bytes not in our fast-path detection) + img = PIL.Image.new("RGB", (1, 1), color="red") + gif_path = tmp_path / "test.gif" + img.save(gif_path, format="GIF") + + gif_bytes = gif_path.read_bytes() + # Should use PIL fallback and default to PNG (GIF not in ImageFormat enum) + result = detect_image_format(gif_bytes) + assert result == ImageFormat.PNG + + +def test_detect_image_format_with_pil_fallback_jpeg(): + # Test PIL fallback path that converts "jpeg" format string to JPG enum + # Use mock since we can't easily 
create valid JPEG bytes without magic bytes + mock_img = Mock() + mock_img.format = "JPEG" + + # Use bytes that don't match our magic bytes to trigger PIL fallback + test_bytes = b"\x00\x00\x00\x00" + + with patch.object(PIL.Image, "open", return_value=mock_img): + result = detect_image_format(test_bytes) + # Should convert JPEG -> JPG via line 96 + assert result == ImageFormat.JPG + + +def test_is_image_path_non_string_input(): + assert is_image_path(123) is False + assert is_image_path(None) is False + assert is_image_path([]) is False + + +def test_is_base64_image_non_string_input(): + assert is_base64_image(123) is False + assert is_base64_image(None) is False + assert is_base64_image([]) is False + + +def test_is_base64_image_invalid_base64_decode(): + # String with valid base64 characters but incorrect padding that causes decode to fail + # Single '=' in middle of string is invalid base64 (padding only allowed at end) + invalid_base64 = "A" * 50 + "=" + "A" * 49 + "more text" + assert is_base64_image(invalid_base64) is False + + +def test_is_image_url_non_string_input(): + assert is_image_url(123) is False + assert is_image_url(None) is False + assert is_image_url([]) is False + + +# Tests for load_image_path_to_base64 + + +def test_load_image_path_to_base64_absolute_path(tmp_path): + # Create a test image file + img = PIL.Image.new("RGB", (1, 1), color="blue") + image_path = tmp_path / "test.png" + img.save(image_path) + + # Load with absolute path + result = load_image_path_to_base64(str(image_path)) + assert result is not None + assert len(result) > 0 + # Verify it's valid base64 + decoded = base64.b64decode(result) + assert len(decoded) > 0 + + +def test_load_image_path_to_base64_relative_with_base_path(tmp_path): + # Create a test image file + img = PIL.Image.new("RGB", (1, 1), color="green") + image_path = tmp_path / "subdir" / "test.png" + image_path.parent.mkdir(exist_ok=True) + img.save(image_path) + + # Load with relative path and base_path + result 
= load_image_path_to_base64("subdir/test.png", base_path=str(tmp_path)) + assert result is not None + assert len(result) > 0 + + +def test_load_image_path_to_base64_nonexistent_file(): + result = load_image_path_to_base64("/nonexistent/path/to/image.png") + assert result is None + + +def test_load_image_path_to_base64_relative_with_cwd_fallback(tmp_path, monkeypatch): + # Create test image in current working directory + + # Change to tmp_path as cwd + monkeypatch.chdir(tmp_path) + + img = PIL.Image.new("RGB", (1, 1), color="yellow") + image_path = tmp_path / "test_cwd.png" + img.save(image_path) + + # Use relative path without base_path - should fall back to cwd + result = load_image_path_to_base64("test_cwd.png") + assert result is not None + assert len(result) > 0 + + +def test_load_image_path_to_base64_base_path_fallback_to_cwd(tmp_path, monkeypatch): + # Test the case where base_path is provided but file isn't there, falls back to cwd + monkeypatch.chdir(tmp_path) + + # Create image in cwd + img = PIL.Image.new("RGB", (1, 1), color="red") + image_path = tmp_path / "test.png" + img.save(image_path) + + # Create a different base_path that doesn't have the image + wrong_base = tmp_path / "wrong" + wrong_base.mkdir() + + # Use relative path with wrong base_path - should fall back to cwd + result = load_image_path_to_base64("test.png", base_path=str(wrong_base)) + assert result is not None + assert len(result) > 0 + + +def test_load_image_path_to_base64_exception_handling(tmp_path): + # Create a directory (not a file) to trigger exception + dir_path = tmp_path / "directory" + dir_path.mkdir() + + result = load_image_path_to_base64(str(dir_path)) + assert result is None diff --git a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py index db3c9c9e..7ad7a18c 100644 --- 
a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py +++ b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py @@ -11,25 +11,27 @@ from data_designer.engine.processing.utils import deserialize_json_values if TYPE_CHECKING: - from data_designer.engine.storage.image_storage import ImageStorageManager + from data_designer.engine.storage.multimedia_storage import MultimediaStorage class ImageCellGenerator(WithJinja2UserTemplateRendering, ColumnGeneratorWithModel[ImageGenerationColumnConfig]): """Generator for image columns with optional disk persistence. - Behavior depends on whether image_storage_manager is set: - - If set (create mode): Saves images to disk and stores relative paths in dataframe + Behavior depends on whether multimedia storage is available via ResourceProvider: + - If available (create mode): Saves images to disk and stores relative paths in dataframe - If None (preview mode): Stores base64 directly in dataframe API is automatically detected based on the model name: - Diffusion models (DALL-E, Stable Diffusion, Imagen, etc.) 
→ image_generation API - All other models → chat/completions API (default) - Attributes: - image_storage_manager: Optional image storage manager instance (set by dataset builder) + Storage is accessed via ResourceProvider.artifact_storage.multimedia_storage """ - image_storage_manager: ImageStorageManager | None = None + @property + def multimedia_storage(self) -> MultimediaStorage | None: + """Get multimedia storage from resource provider if available.""" + return self._resource_provider.artifact_storage.multimedia_storage @staticmethod def get_generation_strategy() -> GenerationStrategy: @@ -67,11 +69,9 @@ def generate(self, data: dict) -> dict: base64_images = self.model.generate_image(prompt=prompt) # Store in dataframe based on mode - if self.image_storage_manager: + if self.multimedia_storage: # Create mode: save each image to disk and store list of relative paths - relative_paths = [ - self.image_storage_manager.save_base64_image(base64_image) for base64_image in base64_images - ] + relative_paths = [self.multimedia_storage.save_base64_image(base64_image) for base64_image in base64_images] data[self.config.name] = relative_paths else: # Preview mode: store list of base64 strings directly diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py index 35e7d4f8..b5ffaae7 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py @@ -11,11 +11,12 @@ from pathlib import Path from typing import TYPE_CHECKING -from pydantic import BaseModel, field_validator, model_validator +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from data_designer.config.utils.io_helpers import read_parquet_dataset from data_designer.config.utils.type_helpers import 
StrEnum, resolve_string_enum from data_designer.engine.dataset_builders.errors import ArtifactStorageError +from data_designer.engine.storage.multimedia_storage import MultimediaStorage from data_designer.lazy_heavy_imports import pd if TYPE_CHECKING: @@ -38,12 +39,15 @@ class BatchStage(StrEnum): class ArtifactStorage(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + artifact_path: Path | str dataset_name: str = "dataset" final_dataset_folder_name: str = FINAL_DATASET_FOLDER_NAME partial_results_folder_name: str = "tmp-partial-parquet-files" dropped_columns_folder_name: str = "dropped-columns-parquet-files" processors_outputs_folder_name: str = PROCESSORS_OUTPUTS_FOLDER_NAME + multimedia_storage: MultimediaStorage | None = Field(default=None, exclude=True) @property def artifact_path_exists(self) -> bool: @@ -116,6 +120,22 @@ def validate_folder_names(self): return self + def ensure_multimedia_storage(self) -> MultimediaStorage: + """Lazily create multimedia storage if not already present. 
+ + Returns: + MultimediaStorage instance + + Note: + Creates storage with default settings (images_subdir="images", validate_images=True) + """ + if self.multimedia_storage is None: + self.multimedia_storage = MultimediaStorage( + base_path=self.base_dataset_path, + validate_images=True, + ) + return self.multimedia_storage + @staticmethod def mkdir_if_needed(path: Path | str) -> Path: """Create the directory if it does not exist.""" diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py index 7a2962eb..ac4469eb 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py @@ -26,7 +26,6 @@ ColumnGeneratorWithModel, GenerationStrategy, ) -from data_designer.engine.column_generators.generators.image import ImageCellGenerator from data_designer.engine.column_generators.utils.generator_classification import column_type_is_model_generated from data_designer.engine.compiler import compile_data_designer_config from data_designer.engine.dataset_builders.artifact_storage import SDG_CONFIG_FILENAME, ArtifactStorage @@ -41,7 +40,6 @@ from data_designer.engine.processing.processors.drop_columns import DropColumnsProcessor from data_designer.engine.registry.data_designer_registry import DataDesignerRegistry from data_designer.engine.resources.resource_provider import ResourceProvider -from data_designer.engine.storage.image_storage import ImageStorageManager from data_designer.lazy_heavy_imports import pd if TYPE_CHECKING: @@ -66,7 +64,6 @@ def __init__( self._resource_provider = resource_provider self._records_to_drop: set[int] = set() self._registry = registry or DataDesignerRegistry() - self._image_storage_manager: ImageStorageManager | None = None self._data_designer_config = 
compile_data_designer_config(data_designer_config, resource_provider) self._column_configs = compile_dataset_builder_column_configs(self._data_designer_config) @@ -98,11 +95,31 @@ def build( *, num_records: int, on_batch_complete: Callable[[Path], None] | None = None, + save_multimedia_to_disk: bool = True, ) -> Path: + """Build the dataset. + + Args: + num_records: Number of records to generate. + on_batch_complete: Optional callback function called when each batch completes. + save_multimedia_to_disk: Whether to save generated multimedia (images, audio, video) to disk. + If False, multimedia is stored directly in the DataFrame (e.g., images as base64). + Default is True. + + Returns: + Path to the generated dataset directory. + """ self._run_model_health_check_if_needed() self._run_mcp_tool_check_if_needed() self._write_builder_config() - self._initialize_image_storage_if_needed() + + # Ensure multimedia storage exists if needed + if save_multimedia_to_disk and self._has_image_columns(): + self.artifact_storage.ensure_multimedia_storage() + else: + # Disable storage for preview or when explicitly disabled + self.artifact_storage.multimedia_storage = None + generators = self._initialize_generators() start_time = time.perf_counter() group_id = uuid.uuid4().hex @@ -128,7 +145,7 @@ def build( def build_preview(self, *, num_records: int) -> pd.DataFrame: self._run_model_health_check_if_needed() self._run_mcp_tool_check_if_needed() - # Skip image storage initialization for preview - base64 will be stored directly in DataFrame + # Skip multimedia storage initialization for preview - base64 will be stored directly in DataFrame generators = self._initialize_generators() group_id = uuid.uuid4().hex @@ -155,26 +172,16 @@ def _has_image_columns(self) -> bool: return any(col.column_type == DataDesignerColumnType.IMAGE_GENERATION for col in self.single_column_configs) - def _initialize_image_storage_if_needed(self) -> None: - """Initialize image storage manager if dataset has 
image columns.""" - if self._has_image_columns(): - self._image_storage_manager = ImageStorageManager( - base_path=self.artifact_storage.base_dataset_path, images_subdir="images", validate_images=True - ) - def _initialize_generators(self) -> list[ColumnGenerator]: + """Initialize column generators. + + Generators access multimedia storage via ResourceProvider.artifact_storage.multimedia_storage + """ generators = [] for config in self._column_configs: generator_cls = self._registry.column_generators.get_for_config_type(type(config)) generator = generator_cls(config=config, resource_provider=self._resource_provider) - - # Inject image storage manager for image generators (if available) - # For preview mode, storage manager is None and base64 is stored directly - if isinstance(generator, ImageCellGenerator): - generator.image_storage_manager = self._image_storage_manager - generators.append(generator) - return generators def _write_builder_config(self) -> None: diff --git a/packages/data-designer-engine/src/data_designer/engine/storage/__init__.py b/packages/data-designer-engine/src/data_designer/engine/storage/__init__.py index ad7ef0d5..820d512a 100644 --- a/packages/data-designer-engine/src/data_designer/engine/storage/__init__.py +++ b/packages/data-designer-engine/src/data_designer/engine/storage/__init__.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from data_designer.engine.storage.image_storage import ImageFormat, ImageStorageManager +from data_designer.engine.storage.multimedia_storage import MultimediaStorage -__all__ = ["ImageFormat", "ImageStorageManager"] +__all__ = ["MultimediaStorage"] diff --git a/packages/data-designer-engine/src/data_designer/engine/storage/image_storage.py b/packages/data-designer-engine/src/data_designer/engine/storage/multimedia_storage.py similarity index 80% rename from packages/data-designer-engine/src/data_designer/engine/storage/image_storage.py rename to packages/data-designer-engine/src/data_designer/engine/storage/multimedia_storage.py index 22d4bf84..e40c0032 100644 --- a/packages/data-designer-engine/src/data_designer/engine/storage/image_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/storage/multimedia_storage.py @@ -5,28 +5,32 @@ import uuid from pathlib import Path -from typing import TYPE_CHECKING -from data_designer.config.utils.image_helpers import decode_base64_image, detect_image_format -from data_designer.lazy_heavy_imports import PIL +from data_designer.config.utils.image_helpers import decode_base64_image, detect_image_format, validate_image -if TYPE_CHECKING: - import PIL +IMAGES_SUBDIR = "images" -class ImageStorageManager: - """Manages disk storage of generated images. +class MultimediaStorage: + """Manages disk storage of generated multimedia content. + + Currently supports: + - Images (PNG, JPG, WEBP) + + Future support planned for: + - Audio + - Video Handles: - - Creating images directory + - Creating storage directories - Decoding base64 to bytes - - Detecting image format + - Detecting media format - Saving with UUID filenames - Returning relative paths """ - def __init__(self, base_path: Path, images_subdir: str = "images", validate_images: bool = True) -> None: - """Initialize image storage manager. 
+ def __init__(self, base_path: Path, images_subdir: str = IMAGES_SUBDIR, validate_images: bool = True) -> None: + """Initialize multimedia storage manager. Args: base_path: Base directory for dataset @@ -88,12 +92,11 @@ def _validate_image(self, image_path: Path) -> None: ValueError: If image is corrupted or unreadable """ try: - with PIL.Image.open(image_path) as img: - img.verify() - except Exception as e: + validate_image(image_path) + except ValueError: # Clean up invalid file image_path.unlink(missing_ok=True) - raise ValueError(f"Saved image is invalid or corrupted: {e}") from e + raise def cleanup(self) -> None: """Clean up image directory (for preview mode).""" diff --git a/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py b/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py new file mode 100644 index 00000000..7173ed2d --- /dev/null +++ b/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py @@ -0,0 +1,121 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import Mock, patch + +import pytest + +from data_designer.config.column_configs import ImageGenerationColumnConfig +from data_designer.engine.column_generators.generators.base import GenerationStrategy +from data_designer.engine.column_generators.generators.image import ImageCellGenerator +from data_designer.engine.processing.ginja.exceptions import UserTemplateError + + +@pytest.fixture +def stub_image_column_config(): + return ImageGenerationColumnConfig( + name="test_image", prompt="A {{ style }} image of {{ subject }}", model_alias="test_model" + ) + + +@pytest.fixture +def stub_base64_images() -> list[str]: + return ["base64_image_1", "base64_image_2"] + + +def test_image_cell_generator_generation_strategy( + stub_image_column_config: ImageGenerationColumnConfig, stub_resource_provider: None +) -> None: + generator = ImageCellGenerator(config=stub_image_column_config, resource_provider=stub_resource_provider) + assert generator.get_generation_strategy() == GenerationStrategy.CELL_BY_CELL + + +def test_image_cell_generator_multimedia_storage_property( + stub_image_column_config: ImageGenerationColumnConfig, stub_resource_provider: None +) -> None: + generator = ImageCellGenerator(config=stub_image_column_config, resource_provider=stub_resource_provider) + # Should return multimedia_storage from artifact_storage (None by default in stub) + assert generator.multimedia_storage is None + + +def test_image_cell_generator_generate_with_storage( + stub_image_column_config, stub_resource_provider, stub_base64_images +): + """Test generate with multimedia storage (create mode) - saves to disk.""" + # Setup mock multimedia storage + mock_storage = Mock() + mock_storage.save_base64_image.side_effect = ["images/uuid1.png", "images/uuid2.png"] + stub_resource_provider.artifact_storage.multimedia_storage = mock_storage + + with patch.object( + stub_resource_provider.model_registry.get_model.return_value, + 
"generate_image", + return_value=stub_base64_images, + ) as mock_generate: + generator = ImageCellGenerator(config=stub_image_column_config, resource_provider=stub_resource_provider) + data = generator.generate(data={"style": "photorealistic", "subject": "cat"}) + + # Check that column was added with relative paths + assert stub_image_column_config.name in data + assert data[stub_image_column_config.name] == ["images/uuid1.png", "images/uuid2.png"] + + # Verify model was called with rendered prompt + mock_generate.assert_called_once_with(prompt="A photorealistic image of cat") + + # Verify storage was called for each image + assert mock_storage.save_base64_image.call_count == 2 + mock_storage.save_base64_image.assert_any_call("base64_image_1") + mock_storage.save_base64_image.assert_any_call("base64_image_2") + + +def test_image_cell_generator_generate_without_storage( + stub_image_column_config, stub_resource_provider, stub_base64_images +): + """Test generate without multimedia storage (preview mode) - stores base64 directly.""" + # Ensure multimedia_storage is None (preview mode) + stub_resource_provider.artifact_storage.multimedia_storage = None + + with patch.object( + stub_resource_provider.model_registry.get_model.return_value, + "generate_image", + return_value=stub_base64_images, + ) as mock_generate: + generator = ImageCellGenerator(config=stub_image_column_config, resource_provider=stub_resource_provider) + data = generator.generate(data={"style": "watercolor", "subject": "dog"}) + + # Check that column was added with base64 data + assert stub_image_column_config.name in data + assert data[stub_image_column_config.name] == stub_base64_images + + # Verify model was called with rendered prompt + mock_generate.assert_called_once_with(prompt="A watercolor image of dog") + + +def test_image_cell_generator_missing_columns_error(stub_image_column_config, stub_resource_provider): + """Test that missing required columns raises ValueError.""" + generator = 
ImageCellGenerator(config=stub_image_column_config, resource_provider=stub_resource_provider) + + with pytest.raises(ValueError, match="columns.*missing"): + # Missing 'subject' column + generator.generate(data={"style": "photorealistic"}) + + +def test_image_cell_generator_empty_prompt_error(stub_resource_provider): + """Test that empty rendered prompt raises UserTemplateError.""" + # Create config with template that renders to empty string + config = ImageGenerationColumnConfig(name="test_image", prompt="{{ empty }}", model_alias="test_model") + + generator = ImageCellGenerator(config=config, resource_provider=stub_resource_provider) + + with pytest.raises(UserTemplateError): + generator.generate(data={"empty": ""}) + + +def test_image_cell_generator_whitespace_only_prompt_error(stub_resource_provider): + """Test that whitespace-only rendered prompt raises ValueError.""" + config = ImageGenerationColumnConfig(name="test_image", prompt="{{ spaces }}", model_alias="test_model") + + generator = ImageCellGenerator(config=config, resource_provider=stub_resource_provider) + + with pytest.raises(ValueError, match="empty"): + generator.generate(data={"spaces": " "}) diff --git a/packages/data-designer-engine/tests/engine/storage/__init__.py b/packages/data-designer-engine/tests/engine/storage/__init__.py new file mode 100644 index 00000000..e5725ea5 --- /dev/null +++ b/packages/data-designer-engine/tests/engine/storage/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 diff --git a/packages/data-designer-engine/tests/engine/storage/test_multimedia_storage.py b/packages/data-designer-engine/tests/engine/storage/test_multimedia_storage.py new file mode 100644 index 00000000..ade76b5a --- /dev/null +++ b/packages/data-designer-engine/tests/engine/storage/test_multimedia_storage.py @@ -0,0 +1,182 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import base64 +import io + +# Explicitly import PIL.Image submodule to make it accessible as PIL.Image +# Python doesn't automatically import submodules when you import a package, +# so `import PIL` alone doesn't give you access to PIL.Image +import PIL.Image # noqa: E402 +import pytest + +from data_designer.engine.storage.multimedia_storage import IMAGES_SUBDIR, MultimediaStorage +from data_designer.lazy_heavy_imports import PIL + + +@pytest.fixture +def multimedia_storage(tmp_path): + """Create a MultimediaStorage instance with a temporary directory.""" + return MultimediaStorage(base_path=tmp_path) + + +@pytest.fixture +def sample_base64_png() -> str: + """Create a valid 1x1 PNG as base64.""" + img = PIL.Image.new("RGB", (1, 1), color="red") + buf = io.BytesIO() + img.save(buf, format="PNG") + png_bytes = buf.getvalue() + return base64.b64encode(png_bytes).decode() + + +@pytest.fixture +def sample_base64_jpg() -> str: + """Create a valid 1x1 JPEG as base64.""" + img = PIL.Image.new("RGB", (1, 1), color="blue") + buf = io.BytesIO() + img.save(buf, format="JPEG") + jpg_bytes = buf.getvalue() + return base64.b64encode(jpg_bytes).decode() + + +def test_multimedia_storage_init(tmp_path): + """Test MultimediaStorage initialization.""" + storage = MultimediaStorage(base_path=tmp_path) + assert storage.base_path == tmp_path + assert storage.images_dir == tmp_path / IMAGES_SUBDIR + assert storage.images_subdir == 
IMAGES_SUBDIR + assert storage.validate_images is True + # Should create images directory on init + assert storage.images_dir.exists() + + +def test_multimedia_storage_init_custom_subdir(tmp_path): + """Test MultimediaStorage initialization with custom subdirectory.""" + custom_subdir = "custom_images" + storage = MultimediaStorage(base_path=tmp_path, images_subdir=custom_subdir, validate_images=False) + assert storage.images_subdir == custom_subdir + assert storage.images_dir == tmp_path / custom_subdir + assert storage.validate_images is False + assert storage.images_dir.exists() + + +def test_save_base64_image_png(multimedia_storage, sample_base64_png): + """Test saving a PNG image from base64.""" + relative_path = multimedia_storage.save_base64_image(sample_base64_png) + + # Check return value format + assert relative_path.startswith(f"{IMAGES_SUBDIR}/") + assert relative_path.endswith(".png") + + # Check file exists on disk + full_path = multimedia_storage.base_path / relative_path + assert full_path.exists() + + # Verify file content + saved_bytes = full_path.read_bytes() + expected_bytes = base64.b64decode(sample_base64_png) + assert saved_bytes == expected_bytes + + +def test_save_base64_image_jpg(multimedia_storage, sample_base64_jpg): + """Test saving a JPEG image from base64.""" + relative_path = multimedia_storage.save_base64_image(sample_base64_jpg) + + # Check return value format + assert relative_path.startswith(f"{IMAGES_SUBDIR}/") + assert relative_path.endswith(".jpg") + + # Check file exists on disk + full_path = multimedia_storage.base_path / relative_path + assert full_path.exists() + + +def test_save_base64_image_with_data_uri(multimedia_storage, sample_base64_png): + """Test saving image from data URI format.""" + data_uri = f"data:image/png;base64,{sample_base64_png}" + relative_path = multimedia_storage.save_base64_image(data_uri) + + # Should successfully extract base64 and save + assert relative_path.startswith(f"{IMAGES_SUBDIR}/") + 
assert relative_path.endswith(".png") + + # Verify file exists and content is correct + full_path = multimedia_storage.base_path / relative_path + assert full_path.exists() + saved_bytes = full_path.read_bytes() + expected_bytes = base64.b64decode(sample_base64_png) + assert saved_bytes == expected_bytes + + +def test_save_base64_image_invalid_base64_raises_error(multimedia_storage): + """Test that invalid base64 data raises ValueError.""" + with pytest.raises(ValueError, match="Invalid base64"): + multimedia_storage.save_base64_image("not-valid-base64!!!") + + +def test_save_base64_image_multiple_images_unique_filenames(multimedia_storage, sample_base64_png): + """Test that multiple images get unique filenames.""" + path1 = multimedia_storage.save_base64_image(sample_base64_png) + path2 = multimedia_storage.save_base64_image(sample_base64_png) + + # Paths should be different (different UUIDs) + assert path1 != path2 + + # Both files should exist + assert (multimedia_storage.base_path / path1).exists() + assert (multimedia_storage.base_path / path2).exists() + + +def test_save_base64_image_validation_enabled(tmp_path, sample_base64_png): + """Test that validation is performed when enabled.""" + storage = MultimediaStorage(base_path=tmp_path, validate_images=True) + # Should succeed with valid image + relative_path = storage.save_base64_image(sample_base64_png) + assert relative_path.startswith(f"{IMAGES_SUBDIR}/") + + +def test_save_base64_image_validation_corrupted_image_raises_error(tmp_path): + """Test that corrupted image fails validation and is cleaned up.""" + storage = MultimediaStorage(base_path=tmp_path, validate_images=True) + + # Create base64 of invalid image data + corrupted_bytes = b"not a valid image" + corrupted_base64 = base64.b64encode(corrupted_bytes).decode() + + with pytest.raises(ValueError, match="Image validation failed"): + storage.save_base64_image(corrupted_base64) + + # Check that no files were left behind + assert 
len(list(storage.images_dir.iterdir())) == 0 + + +def test_save_base64_image_validation_disabled(tmp_path): + """Test that validation can be disabled.""" + storage = MultimediaStorage(base_path=tmp_path, validate_images=False) + + # Create base64 of invalid image data + corrupted_bytes = b"not a valid image" + corrupted_base64 = base64.b64encode(corrupted_bytes).decode() + + # Should succeed without validation + relative_path = storage.save_base64_image(corrupted_base64) + assert relative_path.startswith(f"{IMAGES_SUBDIR}/") + + # File should exist even though it's invalid + full_path = storage.base_path / relative_path + assert full_path.exists() + + +def test_cleanup(multimedia_storage, sample_base64_png): + """Test cleanup removes images directory.""" + # Save an image first + multimedia_storage.save_base64_image(sample_base64_png) + assert multimedia_storage.images_dir.exists() + assert len(list(multimedia_storage.images_dir.iterdir())) > 0 + + # Cleanup should remove directory + multimedia_storage.cleanup() + assert not multimedia_storage.images_dir.exists() From 2aae6ccd6f09064feddc6b14d1faa15dd5c5e417 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 6 Feb 2026 17:30:10 -0700 Subject: [PATCH 26/69] streamline generation --- .../config/utils/image_helpers.py | 8 +- .../config/utils/visualization.py | 28 +-- .../src/data_designer/lazy_heavy_imports.py | 1 + .../tests/config/utils/test_image_helpers.py | 24 +-- .../column_generators/generators/image.py | 31 ++- .../dataset_builders/artifact_storage.py | 28 ++- .../dataset_builders/column_wise_builder.py | 18 +- .../data_designer/engine/storage/__init__.py | 4 +- ...multimedia_storage.py => media_storage.py} | 63 ++++-- .../generators/test_image.py | 24 +-- .../dataset_builders/test_artifact_storage.py | 7 +- .../engine/storage/test_media_storage.py | 174 +++++++++++++++++ .../engine/storage/test_multimedia_storage.py | 182 ------------------ .../tests/engine/test_configurable_task.py | 33 +--- 14 files 
changed, 301 insertions(+), 324 deletions(-) rename packages/data-designer-engine/src/data_designer/engine/storage/{multimedia_storage.py => media_storage.py} (56%) create mode 100644 packages/data-designer-engine/tests/engine/storage/test_media_storage.py delete mode 100644 packages/data-designer-engine/tests/engine/storage/test_multimedia_storage.py diff --git a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py index 1f5ec332..67803aff 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py +++ b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py @@ -11,10 +11,10 @@ from typing import TYPE_CHECKING from data_designer.config.models import ImageFormat -from data_designer.lazy_heavy_imports import PIL +from data_designer.lazy_heavy_imports import Image if TYPE_CHECKING: - import PIL + from PIL import Image # Magic bytes for image format detection IMAGE_FORMAT_MAGIC_BYTES = { @@ -90,7 +90,7 @@ def detect_image_format(image_bytes: bytes) -> ImageFormat: # Fallback to PIL for robust detection try: - img = PIL.Image.open(io.BytesIO(image_bytes)) + img = Image.open(io.BytesIO(image_bytes)) format_str = img.format.lower() if img.format else None if format_str in [ImageFormat.PNG, ImageFormat.JPG, ImageFormat.JPEG, ImageFormat.WEBP]: return ImageFormat(format_str if format_str != ImageFormat.JPEG else ImageFormat.JPG) @@ -201,7 +201,7 @@ def validate_image(image_path: Path) -> None: ValueError: If image is corrupted or unreadable """ try: - with PIL.Image.open(image_path) as img: + with Image.open(image_path) as img: img.verify() except Exception as e: raise ValueError(f"Image validation failed: {e}") from e diff --git a/packages/data-designer-config/src/data_designer/config/utils/visualization.py b/packages/data-designer-config/src/data_designer/config/utils/visualization.py index c349ec86..6a9e8ee5 
100644 --- a/packages/data-designer-config/src/data_designer/config/utils/visualization.py +++ b/packages/data-designer-config/src/data_designer/config/utils/visualization.py @@ -3,8 +3,6 @@ from __future__ import annotations -import base64 -import io import json import os from collections import OrderedDict @@ -35,12 +33,11 @@ is_image_url, load_image_path_to_base64, ) -from data_designer.lazy_heavy_imports import PIL, np, pd +from data_designer.lazy_heavy_imports import np, pd if TYPE_CHECKING: import numpy as np import pandas as pd - import PIL from data_designer.config.config_builder import DataDesignerConfigBuilder from data_designer.config.dataset_metadata import DatasetMetadata @@ -49,15 +46,12 @@ console = Console() -def _display_image_if_in_notebook( - image_data: str, col_name: str, max_width: int = 512, base_path: str | None = None -) -> bool: +def _display_image_if_in_notebook(image_data: str, col_name: str, base_path: str | None = None) -> bool: """Display image with caption in Jupyter notebook if available. Args: image_data: Base64-encoded image data, data URI, or file path. col_name: Name of the column (used for caption). - max_width: Maximum width for the displayed image in pixels. base_path: Optional base path to resolve relative image paths. 
Returns: @@ -83,27 +77,15 @@ def _display_image_if_in_notebook( # Extract base64 from data URI if present base64_data = extract_base64_from_data_uri(base64_data) - image_bytes = base64.b64decode(base64_data) - # Open image with PIL and resize if needed - img = PIL.Image.open(io.BytesIO(image_bytes)) - - # Resize if image is too large - if img.width > max_width: - ratio = max_width / img.width - new_height = int(img.height * ratio) - img = img.resize((max_width, new_height), PIL.Image.Resampling.LANCZOS) - - # Convert back to base64 for HTML display - buffered = io.BytesIO() - img.save(buffered, format=img.format or "PNG") - img_base64 = base64.b64encode(buffered.getvalue()).decode() + # Use the base64 data directly without resizing + img_base64 = base64_data # Create HTML with caption and image in left-aligned container html = f"""
πŸ–ΌοΈ {col_name}
- +
""" display(HTML(html)) diff --git a/packages/data-designer-config/src/data_designer/lazy_heavy_imports.py b/packages/data-designer-config/src/data_designer/lazy_heavy_imports.py index f7901a7c..0e95f248 100644 --- a/packages/data-designer-config/src/data_designer/lazy_heavy_imports.py +++ b/packages/data-designer-config/src/data_designer/lazy_heavy_imports.py @@ -36,6 +36,7 @@ "scipy": "scipy", "jsonschema": "jsonschema", "PIL": "PIL", + "Image": "PIL.Image", } diff --git a/packages/data-designer-config/tests/config/utils/test_image_helpers.py b/packages/data-designer-config/tests/config/utils/test_image_helpers.py index 9c7ccd7f..e0eb0370 100644 --- a/packages/data-designer-config/tests/config/utils/test_image_helpers.py +++ b/packages/data-designer-config/tests/config/utils/test_image_helpers.py @@ -5,13 +5,8 @@ import base64 import io -from typing import TYPE_CHECKING from unittest.mock import Mock, patch -# Explicitly import PIL.Image submodule to make it accessible as PIL.Image -# Python doesn't automatically import submodules when you import a package, -# so `import PIL` alone doesn't give you access to PIL.Image -import PIL.Image # noqa: E402 import pytest from data_designer.config.models import ImageFormat @@ -26,10 +21,7 @@ load_image_path_to_base64, validate_image, ) -from data_designer.lazy_heavy_imports import PIL - -if TYPE_CHECKING: - import PIL +from data_designer.lazy_heavy_imports import Image # Tests for extract_base64_from_data_uri @@ -157,7 +149,7 @@ def test_is_image_url_non_http(): def test_validate_image_valid_png(tmp_path): # Create a valid 1x1 PNG using PIL - img = PIL.Image.new("RGB", (1, 1), color="red") + img = Image.new("RGB", (1, 1), color="red") buf = io.BytesIO() img.save(buf, format="PNG") png_bytes = buf.getvalue() @@ -200,7 +192,7 @@ def test_get_supported_image_extensions_matches_enum(): def test_detect_image_format_with_pil_fallback_unsupported_format(tmp_path): # Create a real GIF image that will trigger PIL fallback # (GIF 
has different magic bytes not in our fast-path detection) - img = PIL.Image.new("RGB", (1, 1), color="red") + img = Image.new("RGB", (1, 1), color="red") gif_path = tmp_path / "test.gif" img.save(gif_path, format="GIF") @@ -219,7 +211,7 @@ def test_detect_image_format_with_pil_fallback_jpeg(): # Use bytes that don't match our magic bytes to trigger PIL fallback test_bytes = b"\x00\x00\x00\x00" - with patch.object(PIL.Image, "open", return_value=mock_img): + with patch.object(Image, "open", return_value=mock_img): result = detect_image_format(test_bytes) # Should convert JPEG -> JPG via line 96 assert result == ImageFormat.JPG @@ -255,7 +247,7 @@ def test_is_image_url_non_string_input(): def test_load_image_path_to_base64_absolute_path(tmp_path): # Create a test image file - img = PIL.Image.new("RGB", (1, 1), color="blue") + img = Image.new("RGB", (1, 1), color="blue") image_path = tmp_path / "test.png" img.save(image_path) @@ -270,7 +262,7 @@ def test_load_image_path_to_base64_absolute_path(tmp_path): def test_load_image_path_to_base64_relative_with_base_path(tmp_path): # Create a test image file - img = PIL.Image.new("RGB", (1, 1), color="green") + img = Image.new("RGB", (1, 1), color="green") image_path = tmp_path / "subdir" / "test.png" image_path.parent.mkdir(exist_ok=True) img.save(image_path) @@ -292,7 +284,7 @@ def test_load_image_path_to_base64_relative_with_cwd_fallback(tmp_path, monkeypa # Change to tmp_path as cwd monkeypatch.chdir(tmp_path) - img = PIL.Image.new("RGB", (1, 1), color="yellow") + img = Image.new("RGB", (1, 1), color="yellow") image_path = tmp_path / "test_cwd.png" img.save(image_path) @@ -307,7 +299,7 @@ def test_load_image_path_to_base64_base_path_fallback_to_cwd(tmp_path, monkeypat monkeypatch.chdir(tmp_path) # Create image in cwd - img = PIL.Image.new("RGB", (1, 1), color="red") + img = Image.new("RGB", (1, 1), color="red") image_path = tmp_path / "test.png" img.save(image_path) diff --git 
a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py index 7ad7a18c..41586e4b 100644 --- a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py +++ b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py @@ -11,27 +11,27 @@ from data_designer.engine.processing.utils import deserialize_json_values if TYPE_CHECKING: - from data_designer.engine.storage.multimedia_storage import MultimediaStorage + from data_designer.engine.storage.media_storage import MediaStorage class ImageCellGenerator(WithJinja2UserTemplateRendering, ColumnGeneratorWithModel[ImageGenerationColumnConfig]): - """Generator for image columns with optional disk persistence. + """Generator for image columns with disk or dataframe persistence. - Behavior depends on whether multimedia storage is available via ResourceProvider: - - If available (create mode): Saves images to disk and stores relative paths in dataframe - - If None (preview mode): Stores base64 directly in dataframe + Media storage always exists and determines behavior via its mode: + - DISK mode (create): Saves images to disk and stores relative paths in dataframe + - DATAFRAME mode (preview): Stores base64 directly in dataframe API is automatically detected based on the model name: - Diffusion models (DALL-E, Stable Diffusion, Imagen, etc.) 
→ image_generation API - All other models → chat/completions API (default) - Storage is accessed via ResourceProvider.artifact_storage.multimedia_storage + Storage is accessed via ResourceProvider.artifact_storage.media_storage """ @property - def multimedia_storage(self) -> MultimediaStorage | None: - """Get multimedia storage from resource provider if available.""" - return self._resource_provider.artifact_storage.multimedia_storage + def media_storage(self) -> MediaStorage: + """Get media storage from resource provider.""" + return self._resource_provider.artifact_storage.media_storage @staticmethod def get_generation_strategy() -> GenerationStrategy: @@ -68,13 +68,10 @@ def generate(self, data: dict) -> dict: # Generate images (returns list of base64 strings) base64_images = self.model.generate_image(prompt=prompt) - # Store in dataframe based on mode - if self.multimedia_storage: - # Create mode: save each image to disk and store list of relative paths - relative_paths = [self.multimedia_storage.save_base64_image(base64_image) for base64_image in base64_images] - data[self.config.name] = relative_paths - else: - # Preview mode: store list of base64 strings directly - data[self.config.name] = base64_images + # Store via media storage (mode determines disk vs dataframe storage) + # TODO: MediaStorage will check its mode (DISK/DATAFRAME) and act accordingly + # For now, always saves to disk - need to implement mode system + results = [self.media_storage.save_base64_image(base64_image) for base64_image in base64_images] + data[self.config.name] = results return data diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py index b5ffaae7..a7316be3 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py +++ 
b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py @@ -16,7 +16,7 @@ from data_designer.config.utils.io_helpers import read_parquet_dataset from data_designer.config.utils.type_helpers import StrEnum, resolve_string_enum from data_designer.engine.dataset_builders.errors import ArtifactStorageError -from data_designer.engine.storage.multimedia_storage import MultimediaStorage +from data_designer.engine.storage.media_storage import MediaStorage, StorageMode from data_designer.lazy_heavy_imports import pd if TYPE_CHECKING: @@ -47,7 +47,7 @@ class ArtifactStorage(BaseModel): partial_results_folder_name: str = "tmp-partial-parquet-files" dropped_columns_folder_name: str = "dropped-columns-parquet-files" processors_outputs_folder_name: str = PROCESSORS_OUTPUTS_FOLDER_NAME - multimedia_storage: MultimediaStorage | None = Field(default=None, exclude=True) + media_storage: MediaStorage = Field(default=None, exclude=True) @property def artifact_path_exists(self) -> bool: @@ -118,23 +118,21 @@ def validate_folder_names(self): if any(char in invalid_chars for char in name): raise ArtifactStorageError(f"🛑 Directory name '{name}' contains invalid characters.") - return self + # Initialize media storage with DISK mode by default + self.media_storage = MediaStorage( + base_path=self.base_dataset_path, + mode=StorageMode.DISK, + ) - def ensure_multimedia_storage(self) -> MultimediaStorage: - """Lazily create multimedia storage if not already present. + return self - Returns: - MultimediaStorage instance + def set_media_storage_mode(self, mode: StorageMode) -> None: + """Set media storage mode. 
- Note: - Creates storage with default settings (images_subdir="images", validate_images=True) + Args: + mode: StorageMode.DISK (save to disk) or StorageMode.DATAFRAME (store in memory) """ - if self.multimedia_storage is None: - self.multimedia_storage = MultimediaStorage( - base_path=self.base_dataset_path, - validate_images=True, - ) - return self.multimedia_storage + self.media_storage.mode = mode @staticmethod def mkdir_if_needed(path: Path | str) -> Path: diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py index ac4469eb..6802f805 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py @@ -40,6 +40,7 @@ from data_designer.engine.processing.processors.drop_columns import DropColumnsProcessor from data_designer.engine.registry.data_designer_registry import DataDesignerRegistry from data_designer.engine.resources.resource_provider import ResourceProvider +from data_designer.engine.storage.media_storage import StorageMode from data_designer.lazy_heavy_imports import pd if TYPE_CHECKING: @@ -113,12 +114,10 @@ def build( self._run_mcp_tool_check_if_needed() self._write_builder_config() - # Ensure multimedia storage exists if needed - if save_multimedia_to_disk and self._has_image_columns(): - self.artifact_storage.ensure_multimedia_storage() - else: - # Disable storage for preview or when explicitly disabled - self.artifact_storage.multimedia_storage = None + # Set media storage mode based on parameters + if self._has_image_columns(): + mode = StorageMode.DISK if save_multimedia_to_disk else StorageMode.DATAFRAME + self.artifact_storage.set_media_storage_mode(mode) generators = self._initialize_generators() start_time = time.perf_counter() @@ -145,7 +144,10 @@ def 
build( def build_preview(self, *, num_records: int) -> pd.DataFrame: self._run_model_health_check_if_needed() self._run_mcp_tool_check_if_needed() - # Skip multimedia storage initialization for preview - base64 will be stored directly in DataFrame + + # Set media storage to DATAFRAME mode for preview - base64 stored directly in DataFrame + if self._has_image_columns(): + self.artifact_storage.set_media_storage_mode(StorageMode.DATAFRAME) generators = self._initialize_generators() group_id = uuid.uuid4().hex @@ -175,7 +177,7 @@ def _has_image_columns(self) -> bool: def _initialize_generators(self) -> list[ColumnGenerator]: """Initialize column generators. - Generators access multimedia storage via ResourceProvider.artifact_storage.multimedia_storage + Generators access media storage via ResourceProvider.artifact_storage.media_storage """ generators = [] for config in self._column_configs: diff --git a/packages/data-designer-engine/src/data_designer/engine/storage/__init__.py b/packages/data-designer-engine/src/data_designer/engine/storage/__init__.py index 820d512a..34c776d5 100644 --- a/packages/data-designer-engine/src/data_designer/engine/storage/__init__.py +++ b/packages/data-designer-engine/src/data_designer/engine/storage/__init__.py @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from data_designer.engine.storage.multimedia_storage import MultimediaStorage +from data_designer.engine.storage.media_storage import MediaStorage, StorageMode -__all__ = ["MultimediaStorage"] +__all__ = ["MediaStorage", "StorageMode"] diff --git a/packages/data-designer-engine/src/data_designer/engine/storage/multimedia_storage.py b/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py similarity index 56% rename from packages/data-designer-engine/src/data_designer/engine/storage/multimedia_storage.py rename to packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py index e40c0032..ddac3459 100644 --- a/packages/data-designer-engine/src/data_designer/engine/storage/multimedia_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py @@ -3,16 +3,29 @@ from __future__ import annotations +import shutil import uuid from pathlib import Path from data_designer.config.utils.image_helpers import decode_base64_image, detect_image_format, validate_image +from data_designer.config.utils.type_helpers import StrEnum IMAGES_SUBDIR = "images" -class MultimediaStorage: - """Manages disk storage of generated multimedia content. +class StorageMode(StrEnum): + """Storage mode for generated media content. + + - DISK: Save media to disk and store relative paths in dataframe (for dataset creation) + - DATAFRAME: Store base64 data directly in dataframe (for preview mode) + """ + + DISK = "disk" + DATAFRAME = "dataframe" + + +class MediaStorage: + """Manages storage of generated media content. 
Currently supports: - Images (PNG, JPG, WEBP) @@ -21,45 +34,60 @@ class MultimediaStorage: - Audio - Video + Storage modes: + - DISK: Save media to disk and return relative paths (for dataset creation) + - DATAFRAME: Return base64 data directly (for preview mode) + Handles: - Creating storage directories - Decoding base64 to bytes - Detecting media format - - Saving with UUID filenames - - Returning relative paths + - Saving with UUID filenames (DISK mode) + - Returning relative paths or base64 data based on mode + - Always validates images to ensure data quality """ - def __init__(self, base_path: Path, images_subdir: str = IMAGES_SUBDIR, validate_images: bool = True) -> None: - """Initialize multimedia storage manager. + def __init__( + self, base_path: Path, images_subdir: str = IMAGES_SUBDIR, mode: StorageMode = StorageMode.DISK + ) -> None: + """Initialize media storage manager. Args: base_path: Base directory for dataset images_subdir: Subdirectory name for images (default: "images") - validate_images: Whether to validate images after saving (default: True) + mode: Storage mode - DISK (save to disk) or DATAFRAME (return base64) """ self.base_path = Path(base_path) self.images_dir = self.base_path / images_subdir self.images_subdir = images_subdir - self.validate_images = validate_images - self._ensure_images_directory() + self.mode = mode def _ensure_images_directory(self) -> None: - """Create images directory if it doesn't exist.""" + """Create images directory if it doesn't exist (lazy initialization).""" self.images_dir.mkdir(parents=True, exist_ok=True) def save_base64_image(self, base64_data: str) -> str: - """Save base64 image to disk and return relative path. + """Save or return base64 image based on storage mode. 
Args: base64_data: Base64 encoded image string (with or without data URI prefix) Returns: - Relative path to saved image (e.g., "images/f47ac10b-58cc.png") + DISK mode: Relative path to saved image (e.g., "images/f47ac10b-58cc.png") + DATAFRAME mode: Original base64 data string Raises: - ValueError: If base64 data is invalid - OSError: If disk write fails + ValueError: If base64 data is invalid (DISK mode only) + OSError: If disk write fails (DISK mode only) """ + # DATAFRAME mode: return base64 directly without disk operations + if self.mode == StorageMode.DATAFRAME: + return base64_data + + # DISK mode: save to disk, validate, and return relative path + # Ensure images directory exists (lazy initialization) + self._ensure_images_directory() + # Decode base64 to bytes image_bytes = decode_base64_image(base64_data) @@ -76,9 +104,8 @@ def save_base64_image(self, base64_data: str) -> str: with open(full_path, "wb") as f: f.write(image_bytes) - # Optional validation - if self.validate_images: - self._validate_image(full_path) + # Always validate in DISK mode to ensure data quality + self._validate_image(full_path) return relative_path @@ -100,7 +127,5 @@ def _validate_image(self, image_path: Path) -> None: def cleanup(self) -> None: """Clean up image directory (for preview mode).""" - import shutil - if self.images_dir.exists(): shutil.rmtree(self.images_dir) diff --git a/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py b/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py index 7173ed2d..e7055d67 100644 --- a/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py +++ b/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py @@ -30,22 +30,22 @@ def test_image_cell_generator_generation_strategy( assert generator.get_generation_strategy() == GenerationStrategy.CELL_BY_CELL -def test_image_cell_generator_multimedia_storage_property( +def 
test_image_cell_generator_media_storage_property( stub_image_column_config: ImageGenerationColumnConfig, stub_resource_provider: None ) -> None: generator = ImageCellGenerator(config=stub_image_column_config, resource_provider=stub_resource_provider) - # Should return multimedia_storage from artifact_storage (None by default in stub) - assert generator.multimedia_storage is None + # Should return media_storage from artifact_storage (always exists) + assert generator.media_storage is not None def test_image_cell_generator_generate_with_storage( stub_image_column_config, stub_resource_provider, stub_base64_images ): - """Test generate with multimedia storage (create mode) - saves to disk.""" - # Setup mock multimedia storage + """Test generate with media storage (create mode) - saves to disk.""" + # Setup mock media storage mock_storage = Mock() mock_storage.save_base64_image.side_effect = ["images/uuid1.png", "images/uuid2.png"] - stub_resource_provider.artifact_storage.multimedia_storage = mock_storage + stub_resource_provider.artifact_storage.media_storage = mock_storage with patch.object( stub_resource_provider.model_registry.get_model.return_value, @@ -68,12 +68,14 @@ def test_image_cell_generator_generate_with_storage( mock_storage.save_base64_image.assert_any_call("base64_image_2") -def test_image_cell_generator_generate_without_storage( +def test_image_cell_generator_generate_in_dataframe_mode( stub_image_column_config, stub_resource_provider, stub_base64_images ): - """Test generate without multimedia storage (preview mode) - stores base64 directly.""" - # Ensure multimedia_storage is None (preview mode) - stub_resource_provider.artifact_storage.multimedia_storage = None + """Test generate with media storage in DATAFRAME mode - stores base64 directly.""" + # Mock save_base64_image to return base64 directly (simulating DATAFRAME mode) + mock_storage = Mock() + mock_storage.save_base64_image.side_effect = stub_base64_images + 
stub_resource_provider.artifact_storage.media_storage = mock_storage with patch.object( stub_resource_provider.model_registry.get_model.return_value, @@ -83,7 +85,7 @@ def test_image_cell_generator_generate_without_storage( generator = ImageCellGenerator(config=stub_image_column_config, resource_provider=stub_resource_provider) data = generator.generate(data={"style": "watercolor", "subject": "dog"}) - # Check that column was added with base64 data + # Check that column was added with base64 data (simulating DATAFRAME mode) assert stub_image_column_config.name in data assert data[stub_image_column_config.name] == stub_base64_images diff --git a/packages/data-designer-engine/tests/engine/dataset_builders/test_artifact_storage.py b/packages/data-designer-engine/tests/engine/dataset_builders/test_artifact_storage.py index df15b4f7..35edf892 100644 --- a/packages/data-designer-engine/tests/engine/dataset_builders/test_artifact_storage.py +++ b/packages/data-designer-engine/tests/engine/dataset_builders/test_artifact_storage.py @@ -213,10 +213,11 @@ def test_artifact_storage_resolved_dataset_name(mock_datetime, tmp_path): (af_storage.artifact_path / af_storage.dataset_name).mkdir() assert af_storage.resolved_dataset_name == "dataset" - # dataset path exists and is not empty + # dataset path exists and is not empty (create file BEFORE constructing ArtifactStorage) + dataset_dir = tmp_path / "dataset" + dataset_dir.mkdir(exist_ok=True) + (dataset_dir / "stub_file.txt").touch() af_storage = ArtifactStorage(artifact_path=tmp_path) - (af_storage.artifact_path / af_storage.dataset_name / "stub_file.txt").touch() - print(af_storage.resolved_dataset_name) assert af_storage.resolved_dataset_name == "dataset_01-01-2025_120304" diff --git a/packages/data-designer-engine/tests/engine/storage/test_media_storage.py b/packages/data-designer-engine/tests/engine/storage/test_media_storage.py new file mode 100644 index 00000000..abd17afe --- /dev/null +++ 
b/packages/data-designer-engine/tests/engine/storage/test_media_storage.py @@ -0,0 +1,174 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import base64 +import io + +import pytest + +from data_designer.engine.storage.media_storage import IMAGES_SUBDIR, MediaStorage, StorageMode +from data_designer.lazy_heavy_imports import Image + + +@pytest.fixture +def media_storage(tmp_path): + """Create a MediaStorage instance with a temporary directory.""" + return MediaStorage(base_path=tmp_path) + + +@pytest.fixture +def sample_base64_png() -> str: + """Create a valid 1x1 PNG as base64.""" + img = Image.new("RGB", (1, 1), color="red") + buf = io.BytesIO() + img.save(buf, format="PNG") + png_bytes = buf.getvalue() + return base64.b64encode(png_bytes).decode() + + +@pytest.fixture +def sample_base64_jpg() -> str: + """Create a valid 1x1 JPEG as base64.""" + img = Image.new("RGB", (1, 1), color="blue") + buf = io.BytesIO() + img.save(buf, format="JPEG") + jpg_bytes = buf.getvalue() + return base64.b64encode(jpg_bytes).decode() + + +def test_media_storage_init(tmp_path): + """Test MediaStorage initialization.""" + storage = MediaStorage(base_path=tmp_path) + assert storage.base_path == tmp_path + assert storage.images_dir == tmp_path / IMAGES_SUBDIR + assert storage.images_subdir == IMAGES_SUBDIR + assert storage.mode == StorageMode.DISK + # Directory should NOT exist until first save (lazy initialization) + assert not storage.images_dir.exists() + + +def test_media_storage_init_custom_subdir(tmp_path): + """Test MediaStorage initialization with custom subdirectory and mode.""" + custom_subdir = "custom_images" + storage = MediaStorage(base_path=tmp_path, images_subdir=custom_subdir, mode=StorageMode.DATAFRAME) + assert storage.images_subdir == custom_subdir + assert storage.images_dir == tmp_path / custom_subdir + assert storage.mode == 
StorageMode.DATAFRAME + # Directory should NOT exist until first save (lazy initialization) + assert not storage.images_dir.exists() + + +def test_save_base64_image_png(media_storage, sample_base64_png): + """Test saving a PNG image from base64.""" + relative_path = media_storage.save_base64_image(sample_base64_png) + + # Check return value format + assert relative_path.startswith(f"{IMAGES_SUBDIR}/") + assert relative_path.endswith(".png") + + # Check file exists on disk + full_path = media_storage.base_path / relative_path + assert full_path.exists() + + # Verify file content + saved_bytes = full_path.read_bytes() + expected_bytes = base64.b64decode(sample_base64_png) + assert saved_bytes == expected_bytes + + +def test_save_base64_image_jpg(media_storage, sample_base64_jpg): + """Test saving a JPEG image from base64.""" + relative_path = media_storage.save_base64_image(sample_base64_jpg) + + # Check return value format + assert relative_path.startswith(f"{IMAGES_SUBDIR}/") + assert relative_path.endswith(".jpg") + + # Check file exists on disk + full_path = media_storage.base_path / relative_path + assert full_path.exists() + + +def test_save_base64_image_with_data_uri(media_storage, sample_base64_png): + """Test saving image from data URI format.""" + data_uri = f"data:image/png;base64,{sample_base64_png}" + relative_path = media_storage.save_base64_image(data_uri) + + # Should successfully extract base64 and save + assert relative_path.startswith(f"{IMAGES_SUBDIR}/") + assert relative_path.endswith(".png") + + # Verify file exists and content is correct + full_path = media_storage.base_path / relative_path + assert full_path.exists() + saved_bytes = full_path.read_bytes() + expected_bytes = base64.b64decode(sample_base64_png) + assert saved_bytes == expected_bytes + + +def test_save_base64_image_invalid_base64_raises_error(media_storage): + """Test that invalid base64 data raises ValueError.""" + with pytest.raises(ValueError, match="Invalid base64"): + 
media_storage.save_base64_image("not-valid-base64!!!") + + +def test_save_base64_image_multiple_images_unique_filenames(media_storage, sample_base64_png): + """Test that multiple images get unique filenames.""" + path1 = media_storage.save_base64_image(sample_base64_png) + path2 = media_storage.save_base64_image(sample_base64_png) + + # Paths should be different (different UUIDs) + assert path1 != path2 + + # Both files should exist + assert (media_storage.base_path / path1).exists() + assert (media_storage.base_path / path2).exists() + + +def test_save_base64_image_disk_mode_validates(tmp_path, sample_base64_png): + """Test that DISK mode validates images.""" + storage = MediaStorage(base_path=tmp_path, mode=StorageMode.DISK) + # Should succeed with valid image + relative_path = storage.save_base64_image(sample_base64_png) + assert relative_path.startswith(f"{IMAGES_SUBDIR}/") + + +def test_save_base64_image_disk_mode_corrupted_image_raises_error(tmp_path): + """Test that DISK mode validates and rejects corrupted images.""" + storage = MediaStorage(base_path=tmp_path, mode=StorageMode.DISK) + + # Create base64 of invalid image data + corrupted_bytes = b"not a valid image" + corrupted_base64 = base64.b64encode(corrupted_bytes).decode() + + with pytest.raises(ValueError, match="Image validation failed"): + storage.save_base64_image(corrupted_base64) + + # Check that no files were left behind (cleanup on validation failure) + assert len(list(storage.images_dir.iterdir())) == 0 + + +def test_save_base64_image_dataframe_mode_returns_base64(tmp_path, sample_base64_png): + """Test that DATAFRAME mode returns base64 directly without disk operations.""" + storage = MediaStorage(base_path=tmp_path, mode=StorageMode.DATAFRAME) + + # Should return the same base64 data + result = storage.save_base64_image(sample_base64_png) + assert result == sample_base64_png + + # Directory should not be created in DATAFRAME mode (lazy initialization) + assert not storage.images_dir.exists() 
+ + +def test_cleanup(media_storage, sample_base64_png): + """Test cleanup removes images directory.""" + # Save an image first + media_storage.save_base64_image(sample_base64_png) + assert media_storage.images_dir.exists() + assert len(list(media_storage.images_dir.iterdir())) > 0 + + # Cleanup should remove directory + media_storage.cleanup() + assert not media_storage.images_dir.exists() diff --git a/packages/data-designer-engine/tests/engine/storage/test_multimedia_storage.py b/packages/data-designer-engine/tests/engine/storage/test_multimedia_storage.py deleted file mode 100644 index ade76b5a..00000000 --- a/packages/data-designer-engine/tests/engine/storage/test_multimedia_storage.py +++ /dev/null @@ -1,182 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import base64 -import io - -# Explicitly import PIL.Image submodule to make it accessible as PIL.Image -# Python doesn't automatically import submodules when you import a package, -# so `import PIL` alone doesn't give you access to PIL.Image -import PIL.Image # noqa: E402 -import pytest - -from data_designer.engine.storage.multimedia_storage import IMAGES_SUBDIR, MultimediaStorage -from data_designer.lazy_heavy_imports import PIL - - -@pytest.fixture -def multimedia_storage(tmp_path): - """Create a MultimediaStorage instance with a temporary directory.""" - return MultimediaStorage(base_path=tmp_path) - - -@pytest.fixture -def sample_base64_png() -> str: - """Create a valid 1x1 PNG as base64.""" - img = PIL.Image.new("RGB", (1, 1), color="red") - buf = io.BytesIO() - img.save(buf, format="PNG") - png_bytes = buf.getvalue() - return base64.b64encode(png_bytes).decode() - - -@pytest.fixture -def sample_base64_jpg() -> str: - """Create a valid 1x1 JPEG as base64.""" - img = PIL.Image.new("RGB", (1, 1), color="blue") - buf = io.BytesIO() - img.save(buf, format="JPEG") - 
jpg_bytes = buf.getvalue() - return base64.b64encode(jpg_bytes).decode() - - -def test_multimedia_storage_init(tmp_path): - """Test MultimediaStorage initialization.""" - storage = MultimediaStorage(base_path=tmp_path) - assert storage.base_path == tmp_path - assert storage.images_dir == tmp_path / IMAGES_SUBDIR - assert storage.images_subdir == IMAGES_SUBDIR - assert storage.validate_images is True - # Should create images directory on init - assert storage.images_dir.exists() - - -def test_multimedia_storage_init_custom_subdir(tmp_path): - """Test MultimediaStorage initialization with custom subdirectory.""" - custom_subdir = "custom_images" - storage = MultimediaStorage(base_path=tmp_path, images_subdir=custom_subdir, validate_images=False) - assert storage.images_subdir == custom_subdir - assert storage.images_dir == tmp_path / custom_subdir - assert storage.validate_images is False - assert storage.images_dir.exists() - - -def test_save_base64_image_png(multimedia_storage, sample_base64_png): - """Test saving a PNG image from base64.""" - relative_path = multimedia_storage.save_base64_image(sample_base64_png) - - # Check return value format - assert relative_path.startswith(f"{IMAGES_SUBDIR}/") - assert relative_path.endswith(".png") - - # Check file exists on disk - full_path = multimedia_storage.base_path / relative_path - assert full_path.exists() - - # Verify file content - saved_bytes = full_path.read_bytes() - expected_bytes = base64.b64decode(sample_base64_png) - assert saved_bytes == expected_bytes - - -def test_save_base64_image_jpg(multimedia_storage, sample_base64_jpg): - """Test saving a JPEG image from base64.""" - relative_path = multimedia_storage.save_base64_image(sample_base64_jpg) - - # Check return value format - assert relative_path.startswith(f"{IMAGES_SUBDIR}/") - assert relative_path.endswith(".jpg") - - # Check file exists on disk - full_path = multimedia_storage.base_path / relative_path - assert full_path.exists() - - -def 
test_save_base64_image_with_data_uri(multimedia_storage, sample_base64_png): - """Test saving image from data URI format.""" - data_uri = f"data:image/png;base64,{sample_base64_png}" - relative_path = multimedia_storage.save_base64_image(data_uri) - - # Should successfully extract base64 and save - assert relative_path.startswith(f"{IMAGES_SUBDIR}/") - assert relative_path.endswith(".png") - - # Verify file exists and content is correct - full_path = multimedia_storage.base_path / relative_path - assert full_path.exists() - saved_bytes = full_path.read_bytes() - expected_bytes = base64.b64decode(sample_base64_png) - assert saved_bytes == expected_bytes - - -def test_save_base64_image_invalid_base64_raises_error(multimedia_storage): - """Test that invalid base64 data raises ValueError.""" - with pytest.raises(ValueError, match="Invalid base64"): - multimedia_storage.save_base64_image("not-valid-base64!!!") - - -def test_save_base64_image_multiple_images_unique_filenames(multimedia_storage, sample_base64_png): - """Test that multiple images get unique filenames.""" - path1 = multimedia_storage.save_base64_image(sample_base64_png) - path2 = multimedia_storage.save_base64_image(sample_base64_png) - - # Paths should be different (different UUIDs) - assert path1 != path2 - - # Both files should exist - assert (multimedia_storage.base_path / path1).exists() - assert (multimedia_storage.base_path / path2).exists() - - -def test_save_base64_image_validation_enabled(tmp_path, sample_base64_png): - """Test that validation is performed when enabled.""" - storage = MultimediaStorage(base_path=tmp_path, validate_images=True) - # Should succeed with valid image - relative_path = storage.save_base64_image(sample_base64_png) - assert relative_path.startswith(f"{IMAGES_SUBDIR}/") - - -def test_save_base64_image_validation_corrupted_image_raises_error(tmp_path): - """Test that corrupted image fails validation and is cleaned up.""" - storage = MultimediaStorage(base_path=tmp_path, 
validate_images=True) - - # Create base64 of invalid image data - corrupted_bytes = b"not a valid image" - corrupted_base64 = base64.b64encode(corrupted_bytes).decode() - - with pytest.raises(ValueError, match="Image validation failed"): - storage.save_base64_image(corrupted_base64) - - # Check that no files were left behind - assert len(list(storage.images_dir.iterdir())) == 0 - - -def test_save_base64_image_validation_disabled(tmp_path): - """Test that validation can be disabled.""" - storage = MultimediaStorage(base_path=tmp_path, validate_images=False) - - # Create base64 of invalid image data - corrupted_bytes = b"not a valid image" - corrupted_base64 = base64.b64encode(corrupted_bytes).decode() - - # Should succeed without validation - relative_path = storage.save_base64_image(corrupted_base64) - assert relative_path.startswith(f"{IMAGES_SUBDIR}/") - - # File should exist even though it's invalid - full_path = storage.base_path / relative_path - assert full_path.exists() - - -def test_cleanup(multimedia_storage, sample_base64_png): - """Test cleanup removes images directory.""" - # Save an image first - multimedia_storage.save_base64_image(sample_base64_png) - assert multimedia_storage.images_dir.exists() - assert len(list(multimedia_storage.images_dir.iterdir())) > 0 - - # Cleanup should remove directory - multimedia_storage.cleanup() - assert not multimedia_storage.images_dir.exists() diff --git a/packages/data-designer-engine/tests/engine/test_configurable_task.py b/packages/data-designer-engine/tests/engine/test_configurable_task.py index f20936a2..6e3673de 100644 --- a/packages/data-designer-engine/tests/engine/test_configurable_task.py +++ b/packages/data-designer-engine/tests/engine/test_configurable_task.py @@ -25,7 +25,7 @@ def test_configurable_task_generic_type_variables() -> None: assert TaskConfigT.__bound__ == ConfigBase -def test_configurable_task_concrete_implementation() -> None: +def test_configurable_task_concrete_implementation(tmp_path) 
-> None: class TestConfig(ConfigBase): value: str @@ -41,13 +41,8 @@ def _initialize(self) -> None: pass config = TestConfig(value="test") - mock_artifact_storage = Mock(spec=ArtifactStorage) - mock_artifact_storage.dataset_name = "test_dataset" - mock_artifact_storage.final_dataset_folder_name = "final_dataset" - mock_artifact_storage.partial_results_folder_name = "partial_results" - mock_artifact_storage.dropped_columns_folder_name = "dropped_columns" - mock_artifact_storage.processors_outputs_folder_name = "processors_outputs" - resource_provider = ResourceProvider(artifact_storage=mock_artifact_storage) + artifact_storage = ArtifactStorage(artifact_path=tmp_path) + resource_provider = ResourceProvider(artifact_storage=artifact_storage) task = TestTask(config=config, resource_provider=resource_provider) @@ -55,7 +50,7 @@ def _initialize(self) -> None: assert task._resource_provider == resource_provider -def test_configurable_task_config_validation() -> None: +def test_configurable_task_config_validation(tmp_path) -> None: class TestConfig(ConfigBase): value: str @@ -69,13 +64,8 @@ def _validate(self) -> None: raise ValueError("Invalid config") config = TestConfig(value="test") - mock_artifact_storage = Mock(spec=ArtifactStorage) - mock_artifact_storage.dataset_name = "test_dataset" - mock_artifact_storage.final_dataset_folder_name = "final_dataset" - mock_artifact_storage.partial_results_folder_name = "partial_results" - mock_artifact_storage.dropped_columns_folder_name = "dropped_columns" - mock_artifact_storage.processors_outputs_folder_name = "processors_outputs" - resource_provider = ResourceProvider(artifact_storage=mock_artifact_storage) + artifact_storage = ArtifactStorage(artifact_path=tmp_path) + resource_provider = ResourceProvider(artifact_storage=artifact_storage) task = TestTask(config=config, resource_provider=resource_provider) assert task._config.value == "test" @@ -85,7 +75,7 @@ def _validate(self) -> None: TestTask(config=invalid_config, 
resource_provider=resource_provider) -def test_configurable_task_resource_validation() -> None: +def test_configurable_task_resource_validation(tmp_path) -> None: class TestConfig(ConfigBase): value: str @@ -102,14 +92,9 @@ def _initialize(self) -> None: config = TestConfig(value="test") - mock_artifact_storage = Mock(spec=ArtifactStorage) - mock_artifact_storage.dataset_name = "test_dataset" - mock_artifact_storage.final_dataset_folder_name = "final_dataset" - mock_artifact_storage.partial_results_folder_name = "partial_results" - mock_artifact_storage.dropped_columns_folder_name = "dropped_columns" - mock_artifact_storage.processors_outputs_folder_name = "processors_outputs" + artifact_storage = ArtifactStorage(artifact_path=tmp_path) mock_model_registry = Mock(spec=ModelRegistry) - resource_provider = ResourceProvider(artifact_storage=mock_artifact_storage, model_registry=mock_model_registry) + resource_provider = ResourceProvider(artifact_storage=artifact_storage, model_registry=mock_model_registry) task = TestTask(config=config, resource_provider=resource_provider) assert task._resource_provider == resource_provider From 1677f066e5a228c418d558633ece69969cd7d122 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 10:17:32 -0700 Subject: [PATCH 27/69] track images generated in usage --- .../config/utils/image_helpers.py | 25 ++ .../tests/config/utils/test_image_helpers.py | 27 ++ .../src/data_designer/engine/models/facade.py | 403 ++++++++---------- .../data_designer/engine/models/registry.py | 4 + .../src/data_designer/engine/models/usage.py | 23 +- .../tests/engine/models/test_facade.py | 147 +++++++ .../tests/engine/models/test_usage.py | 60 ++- 7 files changed, 457 insertions(+), 232 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py index 67803aff..2069d9bf 100644 --- 
a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py +++ b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py @@ -23,6 +23,31 @@ # WEBP uses RIFF header - handled separately } +# Patterns for detecting diffusion-based image generation models (DALL-E, Stable Diffusion, Imagen, etc.) +_IMAGE_DIFFUSION_MODEL_PATTERNS = ( + "dall-e", + "dalle", + "stable-diffusion", + "sd-", + "sd_", + "imagen", +) + + +def is_image_diffusion_model(model_name: str) -> bool: + """Return True if the model is a diffusion-based image generation model. + + Diffusion models use the image_generation API (e.g. DALL-E, Stable Diffusion, Imagen). + All other image models are assumed to use the chat/completions API. + + Args: + model_name: Model name or identifier (e.g. from provider). + + Returns: + True if the model is detected as diffusion-based, False otherwise. + """ + return any(pattern in model_name.lower() for pattern in _IMAGE_DIFFUSION_MODEL_PATTERNS) + def extract_base64_from_data_uri(data: str) -> str: """Extract base64 from data URI or return as-is. 
diff --git a/packages/data-designer-config/tests/config/utils/test_image_helpers.py b/packages/data-designer-config/tests/config/utils/test_image_helpers.py index e0eb0370..aa1ca451 100644 --- a/packages/data-designer-config/tests/config/utils/test_image_helpers.py +++ b/packages/data-designer-config/tests/config/utils/test_image_helpers.py @@ -16,6 +16,7 @@ extract_base64_from_data_uri, get_supported_image_extensions, is_base64_image, + is_image_diffusion_model, is_image_path, is_image_url, load_image_path_to_base64, @@ -144,6 +145,32 @@ def test_is_image_url_non_http(): assert is_image_url("ftp://example.com/image.png") is False +# Tests for is_image_diffusion_model + + +def test_is_image_diffusion_model_dall_e(): + assert is_image_diffusion_model("dall-e-3") is True + assert is_image_diffusion_model("DALL-E-2") is True + assert is_image_diffusion_model("openai/dalle-2") is True + + +def test_is_image_diffusion_model_stable_diffusion(): + assert is_image_diffusion_model("stable-diffusion-xl") is True + assert is_image_diffusion_model("sd-2.1") is True + assert is_image_diffusion_model("sd_1.5") is True + + +def test_is_image_diffusion_model_imagen(): + assert is_image_diffusion_model("imagen-3") is True + assert is_image_diffusion_model("google/imagen") is True + + +def test_is_image_diffusion_model_chat_completion_image_models(): + assert is_image_diffusion_model("gemini-3-pro-image-preview") is False + assert is_image_diffusion_model("gpt-5-image") is False + assert is_image_diffusion_model("flux.2-pro") is False + + # Tests for validate_image diff --git a/packages/data-designer-engine/src/data_designer/engine/models/facade.py b/packages/data-designer-engine/src/data_designer/engine/models/facade.py index d13273f4..11f6e9ec 100644 --- a/packages/data-designer-engine/src/data_designer/engine/models/facade.py +++ b/packages/data-designer-engine/src/data_designer/engine/models/facade.py @@ -9,7 +9,10 @@ from typing import TYPE_CHECKING, Any from 
data_designer.config.models import GenerationType, ModelConfig, ModelProvider -from data_designer.config.utils.image_helpers import extract_base64_from_data_uri +from data_designer.config.utils.image_helpers import ( + extract_base64_from_data_uri, + is_image_diffusion_model, +) from data_designer.engine.mcp.errors import MCPConfigurationError from data_designer.engine.model_provider import ModelProviderRegistry from data_designer.engine.models.errors import ( @@ -20,7 +23,7 @@ ) from data_designer.engine.models.litellm_overrides import CustomRouter, LiteLLMRouterDefaultKwargs from data_designer.engine.models.parsers.errors import ParserException -from data_designer.engine.models.usage import ModelUsageStats, RequestUsageStats, TokenUsageStats +from data_designer.engine.models.usage import ImageUsageStats, ModelUsageStats, RequestUsageStats, TokenUsageStats from data_designer.engine.models.utils import ChatMessage, prompt_to_messages from data_designer.engine.secret_resolver import SecretResolver from data_designer.lazy_heavy_imports import litellm @@ -39,16 +42,6 @@ def _identity(x: Any) -> Any: logger = logging.getLogger(__name__) -# Patterns for detecting diffusion-based image generation models -DIFFUSION_MODEL_PATTERNS = [ - "dall-e", - "dalle", - "stable-diffusion", - "sd-", - "sd_", - "imagen", -] - class ModelFacade: def __init__( @@ -117,7 +110,7 @@ def completion( raise e finally: if not skip_usage_tracking and response is not None: - self._track_usage(response) + self._track_token_usage_from_completion(response) def consolidate_kwargs(self, **kwargs) -> dict[str, Any]: # Remove purpose from kwargs to avoid passing it to the model @@ -129,16 +122,153 @@ def consolidate_kwargs(self, **kwargs) -> dict[str, Any]: kwargs["extra_headers"] = self.model_provider.extra_headers return kwargs - def _get_mcp_facade(self, tool_alias: str | None) -> MCPFacade | None: - if tool_alias is None: - return None - if self._mcp_registry is None: - raise 
MCPConfigurationError(f"Tool alias {tool_alias!r} specified but no MCPRegistry configured.") + @catch_llm_exceptions + def generate( + self, + prompt: str, + *, + parser: Callable[[str], Any] = _identity, + system_prompt: str | None = None, + multi_modal_context: list[dict[str, Any]] | None = None, + tool_alias: str | None = None, + max_correction_steps: int = 0, + max_conversation_restarts: int = 0, + skip_usage_tracking: bool = False, + purpose: str | None = None, + **kwargs, + ) -> tuple[Any, list[ChatMessage]]: + """Generate a parsed output with correction steps. - try: - return self._mcp_registry.get_mcp(tool_alias=tool_alias) - except ValueError as exc: - raise MCPConfigurationError(f"Tool alias {tool_alias!r} is not registered.") from exc + This generation call will attempt to generate an output which is + valid according to the specified parser, where "valid" implies + that the parser can process the LLM response without raising + an exception. + + `ParserExceptions` are routed back + to the LLM as new rounds in the conversation, where the LLM is provided its + earlier response along with the "user" role responding with the exception string + (not traceback). This will continue for the number of rounds specified by + `max_correction_steps`. + + Args: + prompt (str): Task prompt. + system_prompt (str, optional): Optional system instructions. If not specified, + no system message is provided and the model should use its default system + prompt. + parser (func(str) -> Any): A function applied to the LLM response which processes + an LLM response into some output object. Default: identity function. + tool_alias (str | None): Optional tool configuration alias. When provided, + the model may call permitted tools from the configured MCP providers. + The alias must reference a ToolConfig registered in the MCPRegistry. + max_correction_steps (int): Maximum number of correction rounds permitted + within a single conversation. 
Note, many rounds can lead to increasing + context size without necessarily improving performance -- small language + models can enter repeated cycles which will not be solved with more steps. + Default: `0` (no correction). + max_conversation_restarts (int): Maximum number of full conversation restarts permitted + if generation fails. Default: `0` (no restarts). + skip_usage_tracking (bool): Whether to skip usage tracking. Default: `False`. + purpose (str): The purpose of the model usage to show as context in the error message. + It is expected to be used by the @catch_llm_exceptions decorator. + **kwargs: Additional arguments to pass to the model. + + Returns: + A tuple containing: + - The parsed output object from the parser. + - The full trace of ChatMessage entries in the conversation, including any tool calls, + corrections, and reasoning traces. Callers can decide whether to store this. + + Raises: + GenerationValidationFailureError: If the maximum number of retries or + correction steps are met and the last response failures on + generation validation. + MCPConfigurationError: If tool_alias is specified but no MCPRegistry is configured. 
+ """ + output_obj = None + tool_schemas = None + tool_call_turns = 0 + total_tool_calls = 0 + curr_num_correction_steps = 0 + curr_num_restarts = 0 + + mcp_facade = self._get_mcp_facade(tool_alias) + + # Checkpoint for restarts - updated after tool calls so we don't repeat them + restart_checkpoint = prompt_to_messages( + user_prompt=prompt, system_prompt=system_prompt, multi_modal_context=multi_modal_context + ) + checkpoint_tool_call_turns = 0 + messages: list[ChatMessage] = deepcopy(restart_checkpoint) + + if mcp_facade is not None: + tool_schemas = mcp_facade.get_tool_schemas() + + while True: + completion_kwargs = dict(kwargs) + if tool_schemas is not None: + completion_kwargs["tools"] = tool_schemas + + completion_response = self.completion( + messages, + skip_usage_tracking=skip_usage_tracking, + **completion_kwargs, + ) + + # Process any tool calls in the response (handles parallel tool calling) + if mcp_facade is not None and mcp_facade.has_tool_calls(completion_response): + tool_call_turns += 1 + total_tool_calls += mcp_facade.tool_call_count(completion_response) + + if tool_call_turns > mcp_facade.max_tool_call_turns: + # Gracefully refuse tool calls when budget is exhausted + messages.extend(mcp_facade.refuse_completion_response(completion_response)) + else: + messages.extend(mcp_facade.process_completion_response(completion_response)) + + # Update checkpoint so restarts don't repeat tool calls + restart_checkpoint = deepcopy(messages) + checkpoint_tool_call_turns = tool_call_turns + + continue # Back to top + + # No tool calls remaining to process + response = completion_response.choices[0].message.content or "" + reasoning_trace = getattr(completion_response.choices[0].message, "reasoning_content", None) + messages.append(ChatMessage.as_assistant(content=response, reasoning_content=reasoning_trace or None)) + curr_num_correction_steps += 1 + + try: + output_obj = parser(response) # type: ignore - if not a string will cause a ParserException below + 
break + except ParserException as exc: + if max_correction_steps == 0 and max_conversation_restarts == 0: + raise GenerationValidationFailureError( + "Unsuccessful generation attempt. No retries were attempted." + ) from exc + + if curr_num_correction_steps <= max_correction_steps: + # Add user message with error for correction + messages.append(ChatMessage.as_user(content=str(get_exception_primary_cause(exc)))) + + elif curr_num_restarts < max_conversation_restarts: + curr_num_correction_steps = 0 + curr_num_restarts += 1 + messages = deepcopy(restart_checkpoint) + tool_call_turns = checkpoint_tool_call_turns + + else: + raise GenerationValidationFailureError( + f"Unsuccessful generation despite {max_correction_steps} correction steps " + f"and {max_conversation_restarts} conversation restarts." + ) from exc + + if not skip_usage_tracking and mcp_facade is not None: + self._usage_stats.tool_usage.extend( + tool_calls=total_tool_calls, + tool_call_turns=tool_call_turns, + ) + + return output_obj, messages @catch_llm_exceptions def generate_text_embeddings( @@ -171,7 +301,7 @@ def generate_text_embeddings( raise e finally: if not skip_usage_tracking and response is not None: - self._track_usage_from_embedding(response) + self._track_token_usage_from_embedding(response) @catch_llm_exceptions def generate_image(self, prompt: str, skip_usage_tracking: bool = False, **kwargs) -> list[str]: @@ -201,22 +331,27 @@ def generate_image(self, prompt: str, skip_usage_tracking: bool = False, **kwarg ) # Auto-detect API type based on model name - if self._is_diffusion_model(): - return self._generate_image_diffusion(prompt, skip_usage_tracking, **kwargs) + if is_image_diffusion_model(self.model_name): + images = self._generate_image_diffusion(prompt, skip_usage_tracking, **kwargs) else: - return self._generate_image_chat_completion(prompt, skip_usage_tracking, **kwargs) + images = self._generate_image_chat_completion(prompt, skip_usage_tracking, **kwargs) - def 
_is_diffusion_model(self) -> bool: - """Detect if model uses diffusion API based on name patterns. + # Track image usage + if not skip_usage_tracking and len(images) > 0: + self._usage_stats.extend(image_usage=ImageUsageStats(total_images=len(images))) - Diffusion models include DALL-E, Stable Diffusion, and Imagen variants. - All other image models are assumed to use chat completions API. + return images - Returns: - True if model is detected as diffusion-based, False otherwise - """ - model_lower = self.model_name.lower() - return any(pattern in model_lower for pattern in DIFFUSION_MODEL_PATTERNS) + def _get_mcp_facade(self, tool_alias: str | None) -> MCPFacade | None: + if tool_alias is None: + return None + if self._mcp_registry is None: + raise MCPConfigurationError(f"Tool alias {tool_alias!r} specified but no MCPRegistry configured.") + + try: + return self._mcp_registry.get_mcp(tool_alias=tool_alias) + except ValueError as exc: + raise MCPConfigurationError(f"Tool alias {tool_alias!r} is not registered.") from exc def _generate_image_chat_completion(self, prompt: str, skip_usage_tracking: bool = False, **kwargs) -> list[str]: """Generate image(s) using autoregressive model via chat completions API. @@ -311,155 +446,7 @@ def _generate_image_diffusion(self, prompt: str, skip_usage_tracking: bool = Fal raise finally: if not skip_usage_tracking and response is not None: - self._track_usage_from_image_diffusion(response) - - @catch_llm_exceptions - def generate( - self, - prompt: str, - *, - parser: Callable[[str], Any] = _identity, - system_prompt: str | None = None, - multi_modal_context: list[dict[str, Any]] | None = None, - tool_alias: str | None = None, - max_correction_steps: int = 0, - max_conversation_restarts: int = 0, - skip_usage_tracking: bool = False, - purpose: str | None = None, - **kwargs, - ) -> tuple[Any, list[ChatMessage]]: - """Generate a parsed output with correction steps. 
- - This generation call will attempt to generate an output which is - valid according to the specified parser, where "valid" implies - that the parser can process the LLM response without raising - an exception. - - `ParserExceptions` are routed back - to the LLM as new rounds in the conversation, where the LLM is provided its - earlier response along with the "user" role responding with the exception string - (not traceback). This will continue for the number of rounds specified by - `max_correction_steps`. - - Args: - prompt (str): Task prompt. - system_prompt (str, optional): Optional system instructions. If not specified, - no system message is provided and the model should use its default system - prompt. - parser (func(str) -> Any): A function applied to the LLM response which processes - an LLM response into some output object. Default: identity function. - tool_alias (str | None): Optional tool configuration alias. When provided, - the model may call permitted tools from the configured MCP providers. - The alias must reference a ToolConfig registered in the MCPRegistry. - max_correction_steps (int): Maximum number of correction rounds permitted - within a single conversation. Note, many rounds can lead to increasing - context size without necessarily improving performance -- small language - models can enter repeated cycles which will not be solved with more steps. - Default: `0` (no correction). - max_conversation_restarts (int): Maximum number of full conversation restarts permitted - if generation fails. Default: `0` (no restarts). - skip_usage_tracking (bool): Whether to skip usage tracking. Default: `False`. - purpose (str): The purpose of the model usage to show as context in the error message. - It is expected to be used by the @catch_llm_exceptions decorator. - **kwargs: Additional arguments to pass to the model. - - Returns: - A tuple containing: - - The parsed output object from the parser. 
- - The full trace of ChatMessage entries in the conversation, including any tool calls, - corrections, and reasoning traces. Callers can decide whether to store this. - - Raises: - GenerationValidationFailureError: If the maximum number of retries or - correction steps are met and the last response failures on - generation validation. - MCPConfigurationError: If tool_alias is specified but no MCPRegistry is configured. - """ - output_obj = None - tool_schemas = None - tool_call_turns = 0 - total_tool_calls = 0 - curr_num_correction_steps = 0 - curr_num_restarts = 0 - - mcp_facade = self._get_mcp_facade(tool_alias) - - # Checkpoint for restarts - updated after tool calls so we don't repeat them - restart_checkpoint = prompt_to_messages( - user_prompt=prompt, system_prompt=system_prompt, multi_modal_context=multi_modal_context - ) - checkpoint_tool_call_turns = 0 - messages: list[ChatMessage] = deepcopy(restart_checkpoint) - - if mcp_facade is not None: - tool_schemas = mcp_facade.get_tool_schemas() - - while True: - completion_kwargs = dict(kwargs) - if tool_schemas is not None: - completion_kwargs["tools"] = tool_schemas - - completion_response = self.completion( - messages, - skip_usage_tracking=skip_usage_tracking, - **completion_kwargs, - ) - - # Process any tool calls in the response (handles parallel tool calling) - if mcp_facade is not None and mcp_facade.has_tool_calls(completion_response): - tool_call_turns += 1 - total_tool_calls += mcp_facade.tool_call_count(completion_response) - - if tool_call_turns > mcp_facade.max_tool_call_turns: - # Gracefully refuse tool calls when budget is exhausted - messages.extend(mcp_facade.refuse_completion_response(completion_response)) - else: - messages.extend(mcp_facade.process_completion_response(completion_response)) - - # Update checkpoint so restarts don't repeat tool calls - restart_checkpoint = deepcopy(messages) - checkpoint_tool_call_turns = tool_call_turns - - continue # Back to top - - # No tool calls 
remaining to process - response = completion_response.choices[0].message.content or "" - reasoning_trace = getattr(completion_response.choices[0].message, "reasoning_content", None) - messages.append(ChatMessage.as_assistant(content=response, reasoning_content=reasoning_trace or None)) - curr_num_correction_steps += 1 - - try: - output_obj = parser(response) # type: ignore - if not a string will cause a ParserException below - break - except ParserException as exc: - if max_correction_steps == 0 and max_conversation_restarts == 0: - raise GenerationValidationFailureError( - "Unsuccessful generation attempt. No retries were attempted." - ) from exc - - if curr_num_correction_steps <= max_correction_steps: - # Add user message with error for correction - messages.append(ChatMessage.as_user(content=str(get_exception_primary_cause(exc)))) - - elif curr_num_restarts < max_conversation_restarts: - curr_num_correction_steps = 0 - curr_num_restarts += 1 - messages = deepcopy(restart_checkpoint) - tool_call_turns = checkpoint_tool_call_turns - - else: - raise GenerationValidationFailureError( - f"Unsuccessful generation despite {max_correction_steps} correction steps " - f"and {max_conversation_restarts} conversation restarts." 
- ) from exc - - if not skip_usage_tracking and mcp_facade is not None: - self._usage_stats.tool_usage.extend( - tool_calls=total_tool_calls, - tool_call_turns=tool_call_turns, - ) - - return output_obj, messages + self._track_token_usage_from_image_diffusion(response) def _get_litellm_deployment(self, model_config: ModelConfig) -> litellm.DeploymentTypedDict: provider = self._model_provider_registry.get_provider(model_config.provider) @@ -478,7 +465,7 @@ def _get_litellm_deployment(self, model_config: ModelConfig) -> litellm.Deployme "litellm_params": litellm_params.model_dump(), } - def _track_usage(self, response: litellm.types.utils.ModelResponse | None) -> None: + def _track_token_usage_from_completion(self, response: litellm.types.utils.ModelResponse | None) -> None: if response is None: self._usage_stats.extend(request_usage=RequestUsageStats(successful_requests=0, failed_requests=1)) return @@ -495,7 +482,7 @@ def _track_usage(self, response: litellm.types.utils.ModelResponse | None) -> No request_usage=RequestUsageStats(successful_requests=1, failed_requests=0), ) - def _track_usage_from_embedding(self, response: litellm.types.utils.EmbeddingResponse | None) -> None: + def _track_token_usage_from_embedding(self, response: litellm.types.utils.EmbeddingResponse | None) -> None: if response is None: self._usage_stats.extend(request_usage=RequestUsageStats(successful_requests=0, failed_requests=1)) return @@ -508,27 +495,12 @@ def _track_usage_from_embedding(self, response: litellm.types.utils.EmbeddingRes request_usage=RequestUsageStats(successful_requests=1, failed_requests=0), ) - def _track_usage_from_response(self, response: litellm.types.utils.ResponseResponse | None) -> None: - """Track usage from Responses API response.""" + def _track_token_usage_from_image_diffusion(self, response: litellm.types.utils.ImageResponse | None) -> None: + """Track token usage from image_generation API response.""" if response is None: 
self._usage_stats.extend(request_usage=RequestUsageStats(successful_requests=0, failed_requests=1)) return - if response.usage is not None: - input_tokens = getattr(response.usage, "input_tokens", 0) or 0 - output_tokens = getattr(response.usage, "output_tokens", 0) or 0 - self._usage_stats.extend( - token_usage=TokenUsageStats( - input_tokens=input_tokens, - output_tokens=output_tokens, - ), - request_usage=RequestUsageStats(successful_requests=1, failed_requests=0), - ) - def _track_usage_from_image_diffusion(self, response: litellm.types.utils.ImageResponse | None) -> None: - """Track usage from image_generation API response.""" - if response is None: - self._usage_stats.extend(request_usage=RequestUsageStats(successful_requests=0, failed_requests=1)) - return if response.usage is not None and isinstance(response.usage, litellm.types.utils.ImageUsage): self._usage_stats.extend( token_usage=TokenUsageStats( @@ -537,28 +509,3 @@ def _track_usage_from_image_diffusion(self, response: litellm.types.utils.ImageR ), request_usage=RequestUsageStats(successful_requests=1, failed_requests=0), ) - - def _download_url_to_base64(self, url: str) -> str: - """Download image from URL and convert to base64. 
- - Args: - url: Image URL - - Returns: - Base64-encoded image string - - Raises: - ModelAPIError: If download fails - """ - import base64 - - from data_designer.lazy_heavy_imports import httpx - - try: - with httpx.Client(timeout=30.0) as client: - response = client.get(url) - response.raise_for_status() - image_bytes = response.content - return base64.b64encode(image_bytes).decode("utf-8") - except Exception as e: - raise ModelAPIError(f"Failed to download image from URL {url}: {e}") from e diff --git a/packages/data-designer-engine/src/data_designer/engine/models/registry.py b/packages/data-designer-engine/src/data_designer/engine/models/registry.py index 56945941..2878f64e 100644 --- a/packages/data-designer-engine/src/data_designer/engine/models/registry.py +++ b/packages/data-designer-engine/src/data_designer/engine/models/registry.py @@ -120,6 +120,10 @@ def log_model_usage(self, total_time_elapsed: float) -> None: f"turns={tool_usage['total_tool_call_turns']}" ) + if image_usage := stats.get("image_usage"): + total_images = image_usage["total_images"] + logger.info(f"{LOG_INDENT}images: total={total_images}") + if model_index < len(sorted_model_names) - 1: logger.info(LOG_INDENT.rstrip()) diff --git a/packages/data-designer-engine/src/data_designer/engine/models/usage.py b/packages/data-designer-engine/src/data_designer/engine/models/usage.py index f44a31ae..169ef1bb 100644 --- a/packages/data-designer-engine/src/data_designer/engine/models/usage.py +++ b/packages/data-designer-engine/src/data_designer/engine/models/usage.py @@ -71,10 +71,23 @@ def merge(self, other: ToolUsageStats) -> ToolUsageStats: return self +class ImageUsageStats(BaseModel): + total_images: int = 0 + + @property + def has_usage(self) -> bool: + return self.total_images > 0 + + def extend(self, *, images: int) -> None: + """Extend stats with generated images count.""" + self.total_images += images + + class ModelUsageStats(BaseModel): token_usage: TokenUsageStats = TokenUsageStats() 
request_usage: RequestUsageStats = RequestUsageStats() tool_usage: ToolUsageStats = ToolUsageStats() + image_usage: ImageUsageStats = ImageUsageStats() @property def has_usage(self) -> bool: @@ -86,6 +99,7 @@ def extend( token_usage: TokenUsageStats | None = None, request_usage: RequestUsageStats | None = None, tool_usage: ToolUsageStats | None = None, + image_usage: ImageUsageStats | None = None, ) -> None: if token_usage is not None: self.token_usage.extend(input_tokens=token_usage.input_tokens, output_tokens=token_usage.output_tokens) @@ -95,9 +109,16 @@ def extend( ) if tool_usage is not None: self.tool_usage.merge(tool_usage) + if image_usage is not None: + self.image_usage.extend(images=image_usage.total_images) def get_usage_stats(self, *, total_time_elapsed: float) -> dict: - exclude = {"tool_usage"} if not self.tool_usage.has_usage else None + exclude = set() + if not self.tool_usage.has_usage: + exclude.add("tool_usage") + if not self.image_usage.has_usage: + exclude.add("image_usage") + exclude = exclude if exclude else None return self.model_dump(exclude=exclude) | { "tokens_per_second": int(self.token_usage.total_tokens / total_time_elapsed) if total_time_elapsed > 0 diff --git a/packages/data-designer-engine/tests/engine/models/test_facade.py b/packages/data-designer-engine/tests/engine/models/test_facade.py index c0ab9cd3..78473d63 100644 --- a/packages/data-designer-engine/tests/engine/models/test_facade.py +++ b/packages/data-designer-engine/tests/engine/models/test_facade.py @@ -989,3 +989,150 @@ def _completion(self: Any, messages: list[ChatMessage], **kwargs: Any) -> StubRe with patch.object(ModelFacade, "completion", new=_completion): with pytest.raises(MCPToolError, match="Invalid tool arguments"): model.generate(prompt="question", parser=lambda x: x, tool_alias="tools") + + +# ============================================================================= +# Image generation tests +# 
============================================================================= + + +@patch("data_designer.engine.models.facade.CustomRouter.image_generation", autospec=True) +def test_generate_image_diffusion_tracks_image_usage( + mock_image_generation: Any, + stub_model_facade: ModelFacade, +) -> None: + """Test that generate_image tracks image usage for diffusion models.""" + from litellm.types.utils import ImageObject, ImageResponse + + # Mock response with 3 images + mock_response = ImageResponse( + data=[ + ImageObject(b64_json="image1_base64"), + ImageObject(b64_json="image2_base64"), + ImageObject(b64_json="image3_base64"), + ] + ) + mock_image_generation.return_value = mock_response + + # Verify initial state + assert stub_model_facade.usage_stats.image_usage.total_images == 0 + + # Generate images + with patch("data_designer.engine.models.facade.is_image_diffusion_model", return_value=True): + images = stub_model_facade.generate_image(prompt="test prompt", n=3) + + # Verify results + assert len(images) == 3 + assert images == ["image1_base64", "image2_base64", "image3_base64"] + + # Verify image usage was tracked + assert stub_model_facade.usage_stats.image_usage.total_images == 3 + assert stub_model_facade.usage_stats.image_usage.has_usage is True + + +@patch("data_designer.engine.models.facade.ModelFacade.completion", autospec=True) +def test_generate_image_chat_completion_tracks_image_usage( + mock_completion: Any, + stub_model_facade: ModelFacade, +) -> None: + """Test that generate_image tracks image usage for chat completion models.""" + from litellm.types.utils import Choices, ImageURLListItem, Message, ModelResponse + + # Mock response with images attribute (Message requires type and index per ImageURLListItem) + mock_message = Message( + role="assistant", + content="", + images=[ + ImageURLListItem(type="image_url", image_url={"url": ""}, index=0), + ImageURLListItem(type="image_url", image_url={"url": ""}, index=1), + ], + ) + mock_response = 
ModelResponse(choices=[Choices(message=mock_message)]) + mock_completion.return_value = mock_response + + # Verify initial state + assert stub_model_facade.usage_stats.image_usage.total_images == 0 + + # Generate images + with patch("data_designer.engine.models.facade.is_image_diffusion_model", return_value=False): + images = stub_model_facade.generate_image(prompt="test prompt") + + # Verify results + assert len(images) == 2 + assert images == ["image1", "image2"] + + # Verify image usage was tracked + assert stub_model_facade.usage_stats.image_usage.total_images == 2 + assert stub_model_facade.usage_stats.image_usage.has_usage is True + + +@patch("data_designer.engine.models.facade.CustomRouter.image_generation", autospec=True) +def test_generate_image_skip_usage_tracking( + mock_image_generation: Any, + stub_model_facade: ModelFacade, +) -> None: + """Test that generate_image respects skip_usage_tracking flag.""" + from litellm.types.utils import ImageObject, ImageResponse + + mock_response = ImageResponse( + data=[ + ImageObject(b64_json="image1_base64"), + ImageObject(b64_json="image2_base64"), + ] + ) + mock_image_generation.return_value = mock_response + + # Verify initial state + assert stub_model_facade.usage_stats.image_usage.total_images == 0 + + # Generate images with skip_usage_tracking=True + with patch("data_designer.engine.models.facade.is_image_diffusion_model", return_value=True): + images = stub_model_facade.generate_image(prompt="test prompt", skip_usage_tracking=True) + + # Verify results + assert len(images) == 2 + + # Verify image usage was NOT tracked + assert stub_model_facade.usage_stats.image_usage.total_images == 0 + assert stub_model_facade.usage_stats.image_usage.has_usage is False + + +@patch("data_designer.engine.models.facade.CustomRouter.image_generation", autospec=True) +def test_generate_image_accumulates_usage( + mock_image_generation: Any, + stub_model_facade: ModelFacade, +) -> None: + """Test that generate_image accumulates 
image usage across multiple calls.""" + from litellm.types.utils import ImageObject, ImageResponse + + # First call - 2 images + mock_response1 = ImageResponse( + data=[ + ImageObject(b64_json="image1"), + ImageObject(b64_json="image2"), + ] + ) + # Second call - 3 images + mock_response2 = ImageResponse( + data=[ + ImageObject(b64_json="image3"), + ImageObject(b64_json="image4"), + ImageObject(b64_json="image5"), + ] + ) + mock_image_generation.side_effect = [mock_response1, mock_response2] + + # Verify initial state + assert stub_model_facade.usage_stats.image_usage.total_images == 0 + + # First generation + with patch("data_designer.engine.models.facade.is_image_diffusion_model", return_value=True): + images1 = stub_model_facade.generate_image(prompt="test1") + assert len(images1) == 2 + assert stub_model_facade.usage_stats.image_usage.total_images == 2 + + # Second generation + images2 = stub_model_facade.generate_image(prompt="test2") + assert len(images2) == 3 + # Usage should accumulate + assert stub_model_facade.usage_stats.image_usage.total_images == 5 diff --git a/packages/data-designer-engine/tests/engine/models/test_usage.py b/packages/data-designer-engine/tests/engine/models/test_usage.py index 8e7adb04..2c4f783f 100644 --- a/packages/data-designer-engine/tests/engine/models/test_usage.py +++ b/packages/data-designer-engine/tests/engine/models/test_usage.py @@ -1,7 +1,13 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from data_designer.engine.models.usage import ModelUsageStats, RequestUsageStats, TokenUsageStats, ToolUsageStats +from data_designer.engine.models.usage import ( + ImageUsageStats, + ModelUsageStats, + RequestUsageStats, + TokenUsageStats, + ToolUsageStats, +) def test_token_usage_stats() -> None: @@ -32,6 +38,20 @@ def test_request_usage_stats() -> None: assert request_usage_stats.has_usage is True +def test_image_usage_stats() -> None: + image_usage_stats = ImageUsageStats() + assert image_usage_stats.total_images == 0 + assert image_usage_stats.has_usage is False + + image_usage_stats.extend(images=5) + assert image_usage_stats.total_images == 5 + assert image_usage_stats.has_usage is True + + image_usage_stats.extend(images=3) + assert image_usage_stats.total_images == 8 + assert image_usage_stats.has_usage is True + + def test_tool_usage_stats_empty_state() -> None: """Test ToolUsageStats initialization with empty state.""" tool_usage = ToolUsageStats() @@ -132,9 +152,10 @@ def test_model_usage_stats() -> None: assert model_usage_stats.token_usage.output_tokens == 0 assert model_usage_stats.request_usage.successful_requests == 0 assert model_usage_stats.request_usage.failed_requests == 0 + assert model_usage_stats.image_usage.total_images == 0 assert model_usage_stats.has_usage is False - # tool_usage is excluded when has_usage is False + # tool_usage and image_usage are excluded when has_usage is False assert model_usage_stats.get_usage_stats(total_time_elapsed=10) == { "token_usage": {"input_tokens": 0, "output_tokens": 0, "total_tokens": 0}, "request_usage": {"successful_requests": 0, "failed_requests": 0, "total_requests": 0}, @@ -152,7 +173,7 @@ def test_model_usage_stats() -> None: assert model_usage_stats.request_usage.failed_requests == 1 assert model_usage_stats.has_usage is True - # tool_usage is excluded when has_usage is False + # tool_usage and image_usage are excluded when has_usage is False assert 
model_usage_stats.get_usage_stats(total_time_elapsed=2) == { "token_usage": {"input_tokens": 10, "output_tokens": 20, "total_tokens": 30}, "request_usage": {"successful_requests": 2, "failed_requests": 1, "total_requests": 3}, @@ -177,3 +198,36 @@ def test_model_usage_stats_extend_with_tool_usage() -> None: assert stats1.tool_usage.total_tool_call_turns == 6 assert stats1.tool_usage.total_generations == 4 assert stats1.tool_usage.generations_with_tools == 3 + + +def test_model_usage_stats_with_image_usage() -> None: + """Test that ModelUsageStats includes image_usage when it has usage.""" + model_usage_stats = ModelUsageStats() + model_usage_stats.extend( + token_usage=TokenUsageStats(input_tokens=10, output_tokens=20), + request_usage=RequestUsageStats(successful_requests=1, failed_requests=0), + image_usage=ImageUsageStats(total_images=5), + ) + + assert model_usage_stats.image_usage.total_images == 5 + assert model_usage_stats.image_usage.has_usage is True + + # image_usage should be included in output + usage_stats = model_usage_stats.get_usage_stats(total_time_elapsed=2) + assert "image_usage" in usage_stats + assert usage_stats["image_usage"] == {"total_images": 5} + + +def test_model_usage_stats_exclude_unused_stats() -> None: + """Test that ModelUsageStats excludes tool_usage and image_usage when they have no usage.""" + model_usage_stats = ModelUsageStats() + model_usage_stats.extend( + token_usage=TokenUsageStats(input_tokens=10, output_tokens=20), + request_usage=RequestUsageStats(successful_requests=1, failed_requests=0), + ) + + usage_stats = model_usage_stats.get_usage_stats(total_time_elapsed=2) + assert "tool_usage" not in usage_stats + assert "image_usage" not in usage_stats + assert "token_usage" in usage_stats + assert "request_usage" in usage_stats From 3b4acf19202db778f6905f0c1bd27ca984cdaffb Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 10:55:49 -0700 Subject: [PATCH 28/69] fix image usage tracking --- 
.../config/utils/image_helpers.py | 9 ++++---- .../src/data_designer/engine/models/usage.py | 2 +- .../tests/engine/models/test_usage.py | 22 +++++++++++++++++++ 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py index 2069d9bf..9fc4e2b0 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py +++ b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py @@ -23,8 +23,8 @@ # WEBP uses RIFF header - handled separately } -# Patterns for detecting diffusion-based image generation models (DALL-E, Stable Diffusion, Imagen, etc.) -_IMAGE_DIFFUSION_MODEL_PATTERNS = ( +# Patterns for diffusion-based image models only (use image_generation API). +IMAGE_DIFFUSION_MODEL_PATTERNS = ( "dall-e", "dalle", "stable-diffusion", @@ -37,8 +37,7 @@ def is_image_diffusion_model(model_name: str) -> bool: """Return True if the model is a diffusion-based image generation model. - Diffusion models use the image_generation API (e.g. DALL-E, Stable Diffusion, Imagen). - All other image models are assumed to use the chat/completions API. + Args: model_name: Model name or identifier (e.g. from provider). @@ -46,7 +45,7 @@ def is_image_diffusion_model(model_name: str) -> bool: Returns: True if the model is detected as diffusion-based, False otherwise. 
""" - return any(pattern in model_name.lower() for pattern in _IMAGE_DIFFUSION_MODEL_PATTERNS) + return any(pattern in model_name.lower() for pattern in IMAGE_DIFFUSION_MODEL_PATTERNS) def extract_base64_from_data_uri(data: str) -> str: diff --git a/packages/data-designer-engine/src/data_designer/engine/models/usage.py b/packages/data-designer-engine/src/data_designer/engine/models/usage.py index 169ef1bb..64e82b47 100644 --- a/packages/data-designer-engine/src/data_designer/engine/models/usage.py +++ b/packages/data-designer-engine/src/data_designer/engine/models/usage.py @@ -91,7 +91,7 @@ class ModelUsageStats(BaseModel): @property def has_usage(self) -> bool: - return self.token_usage.has_usage and self.request_usage.has_usage + return self.token_usage.has_usage or self.request_usage.has_usage or self.image_usage.has_usage def extend( self, diff --git a/packages/data-designer-engine/tests/engine/models/test_usage.py b/packages/data-designer-engine/tests/engine/models/test_usage.py index 2c4f783f..2bfea4b4 100644 --- a/packages/data-designer-engine/tests/engine/models/test_usage.py +++ b/packages/data-designer-engine/tests/engine/models/test_usage.py @@ -218,6 +218,28 @@ def test_model_usage_stats_with_image_usage() -> None: assert usage_stats["image_usage"] == {"total_images": 5} +def test_model_usage_stats_has_usage_any_of() -> None: + """Test that has_usage is True when any of token, request, or image usage is present.""" + # Only token usage + stats = ModelUsageStats() + stats.extend(token_usage=TokenUsageStats(input_tokens=1, output_tokens=0)) + assert stats.has_usage is True + + # Only request usage (e.g. 
diffusion API without token counts) + stats = ModelUsageStats() + stats.extend(request_usage=RequestUsageStats(successful_requests=1, failed_requests=0)) + assert stats.has_usage is True + + # Only image usage + stats = ModelUsageStats() + stats.extend(image_usage=ImageUsageStats(total_images=2)) + assert stats.has_usage is True + + # None of the three + stats = ModelUsageStats() + assert stats.has_usage is False + + def test_model_usage_stats_exclude_unused_stats() -> None: """Test that ModelUsageStats excludes tool_usage and image_usage when they have no usage.""" model_usage_stats = ModelUsageStats() From 33b4211490519ac6f2e3bd38cd5273d945f58718 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 11:52:15 -0700 Subject: [PATCH 29/69] test clean up --- .../src/data_designer/config/models.py | 28 +------ .../config/utils/image_helpers.py | 2 - .../tests/config/test_models.py | 30 ++++++++ .../data_designer/engine/models/registry.py | 2 +- .../tests/engine/models/test_facade.py | 77 ++++++++++--------- 5 files changed, 76 insertions(+), 63 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/models.py b/packages/data-designer-config/src/data_designer/config/models.py index 3dab2d8d..8b16b4bc 100644 --- a/packages/data-designer-config/src/data_designer/config/models.py +++ b/packages/data-designer-config/src/data_designer/config/models.py @@ -425,21 +425,7 @@ def generate_kwargs(self) -> dict[str, float | int]: class ImageInferenceParams(BaseInferenceParams): """Configuration for image generation models. - Works for all image generation models. The API type is automatically detected - based on the model name: - - Diffusion models (DALL-E, Stable Diffusion, Imagen, etc.) 
use image_generation API - - All other models use chat/completions API (default) - - Image storage behavior: - - Create mode: Images saved to disk with UUID filenames, paths stored in dataframe - - Preview mode: Images stored as base64 directly in dataframe - - Common parameters like quality and size are provided as optional fields. - For model-specific parameters (including n for number of images), use the `extra_body` - field inherited from BaseInferenceParams. - - If the API returns multiple images (either from prompt or API parameters), all images - will be stored as a list in the dataframe. + Works for both diffusion and autoregressive image generation models. Use extra_body for model-specific parameters. Attributes: generation_type: Type of generation, always "image" for this class. @@ -454,22 +440,14 @@ class ImageInferenceParams(BaseInferenceParams): size="1024x1024" ) - # Generate multiple images using extra_body - dd.ImageInferenceParams( - quality="hd", - size="1024x1024", - extra_body={"n": 3} # Request 3 images from API - ) - # With model-specific params via extra_body dd.ImageInferenceParams( - quality="hd", - size="1024x1024", + quality="auto", extra_body={ "generationConfig": { "imageConfig": { "aspectRatio": "1:1", - "negativePrompt": "blurry, low quality" + "imageSize": "1024" } } } diff --git a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py index 9fc4e2b0..678d3b80 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py +++ b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py @@ -37,8 +37,6 @@ def is_image_diffusion_model(model_name: str) -> bool: """Return True if the model is a diffusion-based image generation model. - - Args: model_name: Model name or identifier (e.g. from provider). 
diff --git a/packages/data-designer-config/tests/config/test_models.py b/packages/data-designer-config/tests/config/test_models.py index 38b8079e..4891c78d 100644 --- a/packages/data-designer-config/tests/config/test_models.py +++ b/packages/data-designer-config/tests/config/test_models.py @@ -17,6 +17,7 @@ GenerationType, ImageContext, ImageFormat, + ImageInferenceParams, ManualDistribution, ManualDistributionParams, ModalityDataType, @@ -412,6 +413,12 @@ def test_model_config_construction(): assert model_config.inference_parameters == embedding_params assert model_config.generation_type == GenerationType.EMBEDDING + # test construction with image inference parameters + image_params = ImageInferenceParams(quality="hd", size="1024x1024") + model_config = ModelConfig(alias="test", model="test", inference_parameters=image_params) + assert model_config.inference_parameters == image_params + assert model_config.generation_type == GenerationType.IMAGE + def test_model_config_generation_type_from_dict(): # Test that generation_type in dict is used to create the right inference params type @@ -435,6 +442,29 @@ def test_model_config_generation_type_from_dict(): assert isinstance(model_config.inference_parameters, ChatCompletionInferenceParams) assert model_config.generation_type == GenerationType.CHAT_COMPLETION + model_config = ModelConfig.model_validate( + { + "alias": "test", + "model": "image-model", + "inference_parameters": {"generation_type": "image", "quality": "hd", "size": "1024x1024"}, + } + ) + assert isinstance(model_config.inference_parameters, ImageInferenceParams) + assert model_config.inference_parameters.quality == "hd" + assert model_config.inference_parameters.size == "1024x1024" + assert model_config.generation_type == GenerationType.IMAGE + + +def test_image_inference_params_generate_kwargs() -> None: + """ImageInferenceParams.generate_kwargs includes quality and size when set.""" + params = ImageInferenceParams() + assert 
params.generate_kwargs.get("quality") is None + assert params.generate_kwargs.get("size") is None + + params = ImageInferenceParams(quality="hd", size="1024x1024") + assert params.generate_kwargs["quality"] == "hd" + assert params.generate_kwargs["size"] == "1024x1024" + def test_chat_completion_params_format_for_display_all_params(): """Test formatting chat completion model with all parameters.""" diff --git a/packages/data-designer-engine/src/data_designer/engine/models/registry.py b/packages/data-designer-engine/src/data_designer/engine/models/registry.py index 2878f64e..c6f2b7c7 100644 --- a/packages/data-designer-engine/src/data_designer/engine/models/registry.py +++ b/packages/data-designer-engine/src/data_designer/engine/models/registry.py @@ -187,7 +187,7 @@ def run_health_check(self, model_aliases: list[str]) -> None: skip_usage_tracking=True, purpose="running health checks", ) - elif model.model_generation_type == GenerationType.IMAGE_GENERATION: + elif model.model_generation_type == GenerationType.IMAGE: model.generate_image( prompt="Generate a simple pixel", skip_usage_tracking=True, diff --git a/packages/data-designer-engine/tests/engine/models/test_facade.py b/packages/data-designer-engine/tests/engine/models/test_facade.py index 78473d63..0323ce98 100644 --- a/packages/data-designer-engine/tests/engine/models/test_facade.py +++ b/packages/data-designer-engine/tests/engine/models/test_facade.py @@ -1,11 +1,12 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from typing import Any +from __future__ import annotations + +from typing import TYPE_CHECKING, Any from unittest.mock import patch import pytest -from litellm.types.utils import Choices, EmbeddingResponse, Message, ModelResponse from data_designer.engine.mcp.errors import MCPConfigurationError, MCPToolError from data_designer.engine.models.errors import ModelGenerationValidationFailureError @@ -13,6 +14,10 @@ from data_designer.engine.models.parsers.errors import ParserException from data_designer.engine.models.utils import ChatMessage from data_designer.engine.testing import StubMCPFacade, StubMCPRegistry, StubMessage, StubResponse +from data_designer.lazy_heavy_imports import litellm + +if TYPE_CHECKING: + import litellm def mock_oai_response_object(response_text: str) -> StubResponse: @@ -35,12 +40,14 @@ def stub_completion_messages() -> list[ChatMessage]: @pytest.fixture def stub_expected_completion_response(): - return ModelResponse(choices=Choices(message=Message(content="Test response"))) + return litellm.types.utils.ModelResponse( + choices=litellm.types.utils.Choices(message=litellm.types.utils.Message(content="Test response")) + ) @pytest.fixture def stub_expected_embedding_response(): - return EmbeddingResponse(data=[{"embedding": [0.1, 0.2, 0.3]}] * 2) + return litellm.types.utils.EmbeddingResponse(data=[{"embedding": [0.1, 0.2, 0.3]}] * 2) @pytest.mark.parametrize( @@ -106,9 +113,11 @@ def test_generate_with_system_prompt( # Capture messages at call time since they get mutated after the call captured_messages = [] - def capture_and_return(*args: Any, **kwargs: Any) -> ModelResponse: + def capture_and_return(*args: Any, **kwargs: Any) -> litellm.types.utils.ModelResponse: captured_messages.append(list(args[1])) # Copy the messages list - return ModelResponse(choices=Choices(message=Message(content="Hello!"))) + return litellm.types.utils.ModelResponse( + 
choices=litellm.types.utils.Choices(message=litellm.types.utils.Message(content="Hello!")) + ) mock_completion.side_effect = capture_and_return @@ -166,7 +175,7 @@ def test_completion_success( stub_completion_messages: list[ChatMessage], stub_model_configs: Any, stub_model_facade: ModelFacade, - stub_expected_completion_response: ModelResponse, + stub_expected_completion_response: litellm.types.utils.ModelResponse, skip_usage_tracking: bool, ) -> None: mock_router_completion.side_effect = lambda self, model, messages, **kwargs: stub_expected_completion_response @@ -199,11 +208,13 @@ def test_completion_with_kwargs( stub_completion_messages: list[ChatMessage], stub_model_configs: Any, stub_model_facade: ModelFacade, - stub_expected_completion_response: ModelResponse, + stub_expected_completion_response: litellm.types.utils.ModelResponse, ) -> None: captured_kwargs = {} - def mock_completion(self: Any, model: str, messages: list[dict[str, Any]], **kwargs: Any) -> ModelResponse: + def mock_completion( + self: Any, model: str, messages: list[dict[str, Any]], **kwargs: Any + ) -> litellm.types.utils.ModelResponse: captured_kwargs.update(kwargs) return stub_expected_completion_response @@ -1002,14 +1013,12 @@ def test_generate_image_diffusion_tracks_image_usage( stub_model_facade: ModelFacade, ) -> None: """Test that generate_image tracks image usage for diffusion models.""" - from litellm.types.utils import ImageObject, ImageResponse - # Mock response with 3 images - mock_response = ImageResponse( + mock_response = litellm.types.utils.ImageResponse( data=[ - ImageObject(b64_json="image1_base64"), - ImageObject(b64_json="image2_base64"), - ImageObject(b64_json="image3_base64"), + litellm.types.utils.ImageObject(b64_json="image1_base64"), + litellm.types.utils.ImageObject(b64_json="image2_base64"), + litellm.types.utils.ImageObject(b64_json="image3_base64"), ] ) mock_image_generation.return_value = mock_response @@ -1036,18 +1045,20 @@ def 
test_generate_image_chat_completion_tracks_image_usage( stub_model_facade: ModelFacade, ) -> None: """Test that generate_image tracks image usage for chat completion models.""" - from litellm.types.utils import Choices, ImageURLListItem, Message, ModelResponse - # Mock response with images attribute (Message requires type and index per ImageURLListItem) - mock_message = Message( + mock_message = litellm.types.utils.Message( role="assistant", content="", images=[ - ImageURLListItem(type="image_url", image_url={"url": ""}, index=0), - ImageURLListItem(type="image_url", image_url={"url": ""}, index=1), + litellm.types.utils.ImageURLListItem( + type="image_url", image_url={"url": ""}, index=0 + ), + litellm.types.utils.ImageURLListItem( + type="image_url", image_url={"url": ""}, index=1 + ), ], ) - mock_response = ModelResponse(choices=[Choices(message=mock_message)]) + mock_response = litellm.types.utils.ModelResponse(choices=[litellm.types.utils.Choices(message=mock_message)]) mock_completion.return_value = mock_response # Verify initial state @@ -1072,12 +1083,10 @@ def test_generate_image_skip_usage_tracking( stub_model_facade: ModelFacade, ) -> None: """Test that generate_image respects skip_usage_tracking flag.""" - from litellm.types.utils import ImageObject, ImageResponse - - mock_response = ImageResponse( + mock_response = litellm.types.utils.ImageResponse( data=[ - ImageObject(b64_json="image1_base64"), - ImageObject(b64_json="image2_base64"), + litellm.types.utils.ImageObject(b64_json="image1_base64"), + litellm.types.utils.ImageObject(b64_json="image2_base64"), ] ) mock_image_generation.return_value = mock_response @@ -1103,21 +1112,19 @@ def test_generate_image_accumulates_usage( stub_model_facade: ModelFacade, ) -> None: """Test that generate_image accumulates image usage across multiple calls.""" - from litellm.types.utils import ImageObject, ImageResponse - # First call - 2 images - mock_response1 = ImageResponse( + mock_response1 = 
litellm.types.utils.ImageResponse( data=[ - ImageObject(b64_json="image1"), - ImageObject(b64_json="image2"), + litellm.types.utils.ImageObject(b64_json="image1"), + litellm.types.utils.ImageObject(b64_json="image2"), ] ) # Second call - 3 images - mock_response2 = ImageResponse( + mock_response2 = litellm.types.utils.ImageResponse( data=[ - ImageObject(b64_json="image3"), - ImageObject(b64_json="image4"), - ImageObject(b64_json="image5"), + litellm.types.utils.ImageObject(b64_json="image3"), + litellm.types.utils.ImageObject(b64_json="image4"), + litellm.types.utils.ImageObject(b64_json="image5"), ] ) mock_image_generation.side_effect = [mock_response1, mock_response2] From fad791ee9c9073e590dd74b4febb7f8cc72b2064 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 12:16:31 -0700 Subject: [PATCH 30/69] Small refactor for simplicity --- .../src/data_designer/config/__init__.py | 4 ++-- .../src/data_designer/config/column_configs.py | 14 +++----------- .../src/data_designer/config/column_types.py | 8 ++++---- .../data_designer/config/utils/visualization.py | 2 +- .../tests/config/test_columns.py | 2 +- .../engine/column_generators/generators/image.py | 16 ++++------------ .../engine/column_generators/registry.py | 4 ++-- .../utils/generator_classification.py | 2 ++ .../dataset_builders/column_wise_builder.py | 2 +- .../column_generators/generators/test_image.py | 14 ++++++-------- .../utils/test_generator_classification.py | 2 ++ 11 files changed, 28 insertions(+), 42 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/__init__.py b/packages/data-designer-config/src/data_designer/config/__init__.py index 5686b506..42afae81 100644 --- a/packages/data-designer-config/src/data_designer/config/__init__.py +++ b/packages/data-designer-config/src/data_designer/config/__init__.py @@ -17,7 +17,7 @@ EmbeddingColumnConfig, ExpressionColumnConfig, GenerationStrategy, - ImageGenerationColumnConfig, + ImageColumnConfig, 
LLMCodeColumnConfig, LLMJudgeColumnConfig, LLMStructuredColumnConfig, @@ -123,7 +123,7 @@ "CustomColumnConfig": (_MOD_COLUMN_CONFIGS, "CustomColumnConfig"), "EmbeddingColumnConfig": (_MOD_COLUMN_CONFIGS, "EmbeddingColumnConfig"), "ExpressionColumnConfig": (_MOD_COLUMN_CONFIGS, "ExpressionColumnConfig"), - "ImageGenerationColumnConfig": (_MOD_COLUMN_CONFIGS, "ImageGenerationColumnConfig"), + "ImageColumnConfig": (_MOD_COLUMN_CONFIGS, "ImageColumnConfig"), "GenerationStrategy": (_MOD_COLUMN_CONFIGS, "GenerationStrategy"), "LLMCodeColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMCodeColumnConfig"), "LLMJudgeColumnConfig": (_MOD_COLUMN_CONFIGS, "LLMJudgeColumnConfig"), diff --git a/packages/data-designer-config/src/data_designer/config/column_configs.py b/packages/data-designer-config/src/data_designer/config/column_configs.py index facbc4cf..e9d89f4e 100644 --- a/packages/data-designer-config/src/data_designer/config/column_configs.py +++ b/packages/data-designer-config/src/data_designer/config/column_configs.py @@ -485,22 +485,14 @@ def side_effect_columns(self) -> list[str]: return [] -class ImageGenerationColumnConfig(SingleColumnConfig): +class ImageColumnConfig(SingleColumnConfig): """Configuration for image generation columns. Image columns generate images using either autoregressive or diffusion models. The API used is automatically determined based on the model name: - - **Diffusion models** (DALL-E, Stable Diffusion, Imagen, etc.) β†’ image_generation API - - **All other models** β†’ chat/completions API (default) - - Image storage behavior: - - **Create mode**: Images saved to disk with UUID filenames in `images/` folder, - dataframe stores relative paths (e.g., "images/abc123.png") - - **Preview mode**: Images stored as base64 directly in dataframe - Attributes: - column_type: Discriminator field, always "image-generation" for this configuration type. + column_type: Discriminator field, always "image" for this configuration type. 
prompt: Prompt template for image generation. Supports Jinja2 templating to reference other columns (e.g., "Generate an image of a {{ character_name }}"). Must be a valid Jinja2 template. @@ -509,7 +501,7 @@ class ImageGenerationColumnConfig(SingleColumnConfig): prompt: str model_alias: str - column_type: Literal["image-generation"] = "image-generation" + column_type: Literal["image"] = "image" @staticmethod def get_column_emoji() -> str: diff --git a/packages/data-designer-config/src/data_designer/config/column_types.py b/packages/data-designer-config/src/data_designer/config/column_types.py index 9b01e7d7..baba25dd 100644 --- a/packages/data-designer-config/src/data_designer/config/column_types.py +++ b/packages/data-designer-config/src/data_designer/config/column_types.py @@ -9,7 +9,7 @@ CustomColumnConfig, EmbeddingColumnConfig, ExpressionColumnConfig, - ImageGenerationColumnConfig, + ImageColumnConfig, LLMCodeColumnConfig, LLMJudgeColumnConfig, LLMStructuredColumnConfig, @@ -40,7 +40,7 @@ | SeedDatasetColumnConfig | ValidationColumnConfig | EmbeddingColumnConfig - | ImageGenerationColumnConfig + | ImageColumnConfig ) ColumnConfigT = plugin_manager.inject_into_column_config_type_union(ColumnConfigT) @@ -89,7 +89,7 @@ def get_column_display_order() -> list[DataDesignerColumnType]: DataDesignerColumnType.LLM_STRUCTURED, DataDesignerColumnType.LLM_JUDGE, DataDesignerColumnType.EMBEDDING, - DataDesignerColumnType.IMAGE_GENERATION, + DataDesignerColumnType.IMAGE, DataDesignerColumnType.VALIDATION, DataDesignerColumnType.EXPRESSION, DataDesignerColumnType.CUSTOM, @@ -145,5 +145,5 @@ def _resolve_sampler_kwargs(name: str, kwargs: dict) -> dict: DataDesignerColumnType.SAMPLER: SamplerColumnConfig, DataDesignerColumnType.SEED_DATASET: SeedDatasetColumnConfig, DataDesignerColumnType.EMBEDDING: EmbeddingColumnConfig, - DataDesignerColumnType.IMAGE_GENERATION: ImageGenerationColumnConfig, + DataDesignerColumnType.IMAGE: ImageColumnConfig, } diff --git 
a/packages/data-designer-config/src/data_designer/config/utils/visualization.py b/packages/data-designer-config/src/data_designer/config/utils/visualization.py index 6a9e8ee5..910bc467 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/visualization.py +++ b/packages/data-designer-config/src/data_designer/config/utils/visualization.py @@ -290,7 +290,7 @@ def display_sample_record( render_list.append(pad_console_element(table)) # Collect image generation columns (will be displayed at the end) - image_columns = config_builder.get_columns_of_type(DataDesignerColumnType.IMAGE_GENERATION) + image_columns = config_builder.get_columns_of_type(DataDesignerColumnType.IMAGE) images_to_display_later = [] if len(image_columns) > 0: # Check if we're in a notebook to decide display style diff --git a/packages/data-designer-config/tests/config/test_columns.py b/packages/data-designer-config/tests/config/test_columns.py index 56bb912d..e633518d 100644 --- a/packages/data-designer-config/tests/config/test_columns.py +++ b/packages/data-designer-config/tests/config/test_columns.py @@ -53,7 +53,7 @@ def test_data_designer_column_type_get_display_order(): DataDesignerColumnType.LLM_STRUCTURED, DataDesignerColumnType.LLM_JUDGE, DataDesignerColumnType.EMBEDDING, - DataDesignerColumnType.IMAGE_GENERATION, + DataDesignerColumnType.IMAGE, DataDesignerColumnType.VALIDATION, DataDesignerColumnType.EXPRESSION, DataDesignerColumnType.CUSTOM, diff --git a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py index 41586e4b..c8396e24 100644 --- a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py +++ b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py @@ -5,7 +5,7 @@ from typing import TYPE_CHECKING -from data_designer.config.column_configs import 
ImageGenerationColumnConfig +from data_designer.config.column_configs import ImageColumnConfig from data_designer.engine.column_generators.generators.base import ColumnGeneratorWithModel, GenerationStrategy from data_designer.engine.processing.ginja.environment import WithJinja2UserTemplateRendering from data_designer.engine.processing.utils import deserialize_json_values @@ -14,18 +14,12 @@ from data_designer.engine.storage.media_storage import MediaStorage -class ImageCellGenerator(WithJinja2UserTemplateRendering, ColumnGeneratorWithModel[ImageGenerationColumnConfig]): +class ImageCellGenerator(WithJinja2UserTemplateRendering, ColumnGeneratorWithModel[ImageColumnConfig]): """Generator for image columns with disk or dataframe persistence. Media storage always exists and determines behavior via its mode: - - DISK mode (create): Saves images to disk and stores relative paths in dataframe - - DATAFRAME mode (preview): Stores base64 directly in dataframe - - API is automatically detected based on the model name: - - Diffusion models (DALL-E, Stable Diffusion, Imagen, etc.) 
→ image_generation API - - All other models → chat/completions API (default) - - Storage is accessed via ResourceProvider.artifact_storage.media_storage + - DISK mode: Saves images to disk and stores relative paths in dataframe + - DATAFRAME mode: Stores base64 directly in dataframe """ @property @@ -69,8 +63,6 @@ def generate(self, data: dict) -> dict: base64_images = self.model.generate_image(prompt=prompt) # Store via media storage (mode determines disk vs dataframe storage) - # TODO: MediaStorage will check its mode (DISK/DATAFRAME) and act accordingly - # For now, always saves to disk - need to implement mode system results = [self.media_storage.save_base64_image(base64_image) for base64_image in base64_images] data[self.config.name] = results diff --git a/packages/data-designer-engine/src/data_designer/engine/column_generators/registry.py b/packages/data-designer-engine/src/data_designer/engine/column_generators/registry.py index a4538ad6..f4fc27b9 100644 --- a/packages/data-designer-engine/src/data_designer/engine/column_generators/registry.py +++ b/packages/data-designer-engine/src/data_designer/engine/column_generators/registry.py @@ -8,7 +8,7 @@ CustomColumnConfig, EmbeddingColumnConfig, ExpressionColumnConfig, - ImageGenerationColumnConfig, + ImageColumnConfig, LLMCodeColumnConfig, LLMJudgeColumnConfig, LLMStructuredColumnConfig, @@ -54,7 +54,7 @@ def create_default_column_generator_registry(with_plugins: bool = True) -> Colum registry.register(DataDesignerColumnType.SEED_DATASET, SeedDatasetColumnGenerator, SeedDatasetMultiColumnConfig) registry.register(DataDesignerColumnType.VALIDATION, ValidationColumnGenerator, ValidationColumnConfig) registry.register(DataDesignerColumnType.LLM_STRUCTURED, LLMStructuredCellGenerator, LLMStructuredColumnConfig) - registry.register(DataDesignerColumnType.IMAGE_GENERATION, ImageCellGenerator, ImageGenerationColumnConfig) + registry.register(DataDesignerColumnType.IMAGE, ImageCellGenerator, ImageColumnConfig) if
with_plugins: for plugin in PluginRegistry().get_plugins(PluginType.COLUMN_GENERATOR): registry.register( diff --git a/packages/data-designer-engine/src/data_designer/engine/column_generators/utils/generator_classification.py b/packages/data-designer-engine/src/data_designer/engine/column_generators/utils/generator_classification.py index 2e082779..7a45fc71 100644 --- a/packages/data-designer-engine/src/data_designer/engine/column_generators/utils/generator_classification.py +++ b/packages/data-designer-engine/src/data_designer/engine/column_generators/utils/generator_classification.py @@ -22,6 +22,7 @@ def column_type_used_in_execution_dag(column_type: str | DataDesignerColumnType) DataDesignerColumnType.LLM_TEXT, DataDesignerColumnType.VALIDATION, DataDesignerColumnType.EMBEDDING, + DataDesignerColumnType.IMAGE, } dag_column_types.update(plugin_manager.get_plugin_column_types(DataDesignerColumnType)) return column_type in dag_column_types @@ -36,6 +37,7 @@ def column_type_is_model_generated(column_type: str | DataDesignerColumnType) -> DataDesignerColumnType.LLM_STRUCTURED, DataDesignerColumnType.LLM_JUDGE, DataDesignerColumnType.EMBEDDING, + DataDesignerColumnType.IMAGE, } for plugin in plugin_manager.get_column_generator_plugins(): if issubclass(plugin.impl_cls, ColumnGeneratorWithModelRegistry): diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py index 6802f805..ad9e265b 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py @@ -172,7 +172,7 @@ def _has_image_columns(self) -> bool: """Check if config has any image generation columns.""" from data_designer.config.column_types import DataDesignerColumnType - return any(col.column_type == 
DataDesignerColumnType.IMAGE_GENERATION for col in self.single_column_configs) + return any(col.column_type == DataDesignerColumnType.IMAGE for col in self.single_column_configs) def _initialize_generators(self) -> list[ColumnGenerator]: """Initialize column generators. diff --git a/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py b/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py index e7055d67..80523ff5 100644 --- a/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py +++ b/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py @@ -5,7 +5,7 @@ import pytest -from data_designer.config.column_configs import ImageGenerationColumnConfig +from data_designer.config.column_configs import ImageColumnConfig from data_designer.engine.column_generators.generators.base import GenerationStrategy from data_designer.engine.column_generators.generators.image import ImageCellGenerator from data_designer.engine.processing.ginja.exceptions import UserTemplateError @@ -13,9 +13,7 @@ @pytest.fixture def stub_image_column_config(): - return ImageGenerationColumnConfig( - name="test_image", prompt="A {{ style }} image of {{ subject }}", model_alias="test_model" - ) + return ImageColumnConfig(name="test_image", prompt="A {{ style }} image of {{ subject }}", model_alias="test_model") @pytest.fixture @@ -24,14 +22,14 @@ def stub_base64_images() -> list[str]: def test_image_cell_generator_generation_strategy( - stub_image_column_config: ImageGenerationColumnConfig, stub_resource_provider: None + stub_image_column_config: ImageColumnConfig, stub_resource_provider: None ) -> None: generator = ImageCellGenerator(config=stub_image_column_config, resource_provider=stub_resource_provider) assert generator.get_generation_strategy() == GenerationStrategy.CELL_BY_CELL def test_image_cell_generator_media_storage_property( - stub_image_column_config: 
ImageGenerationColumnConfig, stub_resource_provider: None + stub_image_column_config: ImageColumnConfig, stub_resource_provider: None ) -> None: generator = ImageCellGenerator(config=stub_image_column_config, resource_provider=stub_resource_provider) # Should return media_storage from artifact_storage (always exists) @@ -105,7 +103,7 @@ def test_image_cell_generator_missing_columns_error(stub_image_column_config, st def test_image_cell_generator_empty_prompt_error(stub_resource_provider): """Test that empty rendered prompt raises UserTemplateError.""" # Create config with template that renders to empty string - config = ImageGenerationColumnConfig(name="test_image", prompt="{{ empty }}", model_alias="test_model") + config = ImageColumnConfig(name="test_image", prompt="{{ empty }}", model_alias="test_model") generator = ImageCellGenerator(config=config, resource_provider=stub_resource_provider) @@ -115,7 +113,7 @@ def test_image_cell_generator_empty_prompt_error(stub_resource_provider): def test_image_cell_generator_whitespace_only_prompt_error(stub_resource_provider): """Test that whitespace-only rendered prompt raises ValueError.""" - config = ImageGenerationColumnConfig(name="test_image", prompt="{{ spaces }}", model_alias="test_model") + config = ImageColumnConfig(name="test_image", prompt="{{ spaces }}", model_alias="test_model") generator = ImageCellGenerator(config=config, resource_provider=stub_resource_provider) diff --git a/packages/data-designer-engine/tests/engine/column_generators/utils/test_generator_classification.py b/packages/data-designer-engine/tests/engine/column_generators/utils/test_generator_classification.py index bdf15e5d..0be26b11 100644 --- a/packages/data-designer-engine/tests/engine/column_generators/utils/test_generator_classification.py +++ b/packages/data-designer-engine/tests/engine/column_generators/utils/test_generator_classification.py @@ -14,6 +14,7 @@ def test_column_type_is_model_generated() -> None: assert 
column_type_is_model_generated(DataDesignerColumnType.LLM_STRUCTURED) assert column_type_is_model_generated(DataDesignerColumnType.LLM_JUDGE) assert column_type_is_model_generated(DataDesignerColumnType.EMBEDDING) + assert column_type_is_model_generated(DataDesignerColumnType.IMAGE) assert not column_type_is_model_generated(DataDesignerColumnType.SAMPLER) assert not column_type_is_model_generated(DataDesignerColumnType.VALIDATION) assert not column_type_is_model_generated(DataDesignerColumnType.EXPRESSION) @@ -28,5 +29,6 @@ def test_column_type_used_in_execution_dag() -> None: assert column_type_used_in_execution_dag(DataDesignerColumnType.LLM_TEXT) assert column_type_used_in_execution_dag(DataDesignerColumnType.VALIDATION) assert column_type_used_in_execution_dag(DataDesignerColumnType.EMBEDDING) + assert column_type_used_in_execution_dag(DataDesignerColumnType.IMAGE) assert not column_type_used_in_execution_dag(DataDesignerColumnType.SAMPLER) assert not column_type_used_in_execution_dag(DataDesignerColumnType.SEED_DATASET) From 54ebcc80cb2a9b62fdc98804631193205faa2414 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 12:59:43 -0700 Subject: [PATCH 31/69] update ImageInferenceParams --- .../src/data_designer/config/models.py | 21 +++++-------------- .../tests/config/test_models.py | 21 ++++++++++--------- 2 files changed, 16 insertions(+), 26 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/models.py b/packages/data-designer-config/src/data_designer/config/models.py index 8b16b4bc..0542a8b8 100644 --- a/packages/data-designer-config/src/data_designer/config/models.py +++ b/packages/data-designer-config/src/data_designer/config/models.py @@ -425,24 +425,20 @@ def generate_kwargs(self) -> dict[str, float | int]: class ImageInferenceParams(BaseInferenceParams): """Configuration for image generation models. - Works for both diffusion and autoregressive image generation models. 
Use extra_body for model-specific parameters. + Works for both diffusion and autoregressive image generation models. Pass all model-specific image options via `extra_body`. Attributes: generation_type: Type of generation, always "image" for this class. - quality: Image quality setting (e.g., "standard", "hd"). Optional and model-specific. - size: Image size specification (e.g., "1024x1024", "1792x1024"). Optional and model-specific. Example: ```python - # Standard usage with common params + # OpenAI-style (DALL·E): quality and size in extra_body or as top-level kwargs dd.ImageInferenceParams( - quality="hd", - size="1024x1024" + extra_body={"size": "1024x1024", "quality": "hd"} ) - # With model-specific params via extra_body + # Gemini-style: generationConfig.imageConfig dd.ImageInferenceParams( - quality="auto", extra_body={ "generationConfig": { "imageConfig": { @@ -456,17 +452,10 @@ class ImageInferenceParams(BaseInferenceParams): """ generation_type: Literal[GenerationType.IMAGE] = GenerationType.IMAGE - quality: str | None = None - size: str | None = None @property def generate_kwargs(self) -> dict[str, Any]: - result = super().generate_kwargs - if self.quality is not None: - result["quality"] = self.quality - if self.size is not None: - result["size"] = self.size - return result + return super().generate_kwargs InferenceParamsT: TypeAlias = Annotated[ diff --git a/packages/data-designer-config/tests/config/test_models.py b/packages/data-designer-config/tests/config/test_models.py index 4891c78d..564b235c 100644 --- a/packages/data-designer-config/tests/config/test_models.py +++ b/packages/data-designer-config/tests/config/test_models.py @@ -414,7 +414,7 @@ def test_model_config_construction(): assert model_config.generation_type == GenerationType.EMBEDDING # test construction with image inference parameters - image_params = ImageInferenceParams(quality="hd", size="1024x1024") + image_params = ImageInferenceParams(extra_body={"size": "1024x1024", "quality":
"hd"}) model_config = ModelConfig(alias="test", model="test", inference_parameters=image_params) assert model_config.inference_parameters == image_params assert model_config.generation_type == GenerationType.IMAGE @@ -446,24 +446,25 @@ def test_model_config_generation_type_from_dict(): { "alias": "test", "model": "image-model", - "inference_parameters": {"generation_type": "image", "quality": "hd", "size": "1024x1024"}, + "inference_parameters": { + "generation_type": "image", + "extra_body": {"size": "1024x1024", "quality": "hd"}, + }, } ) assert isinstance(model_config.inference_parameters, ImageInferenceParams) - assert model_config.inference_parameters.quality == "hd" - assert model_config.inference_parameters.size == "1024x1024" + assert model_config.inference_parameters.extra_body == {"size": "1024x1024", "quality": "hd"} assert model_config.generation_type == GenerationType.IMAGE def test_image_inference_params_generate_kwargs() -> None: - """ImageInferenceParams.generate_kwargs includes quality and size when set.""" + """ImageInferenceParams.generate_kwargs delegates to base; image params go via extra_body.""" params = ImageInferenceParams() - assert params.generate_kwargs.get("quality") is None - assert params.generate_kwargs.get("size") is None + assert "quality" not in params.generate_kwargs + assert "size" not in params.generate_kwargs - params = ImageInferenceParams(quality="hd", size="1024x1024") - assert params.generate_kwargs["quality"] == "hd" - assert params.generate_kwargs["size"] == "1024x1024" + params = ImageInferenceParams(extra_body={"size": "1024x1024", "quality": "hd"}) + assert params.generate_kwargs.get("extra_body") == {"size": "1024x1024", "quality": "hd"} def test_chat_completion_params_format_for_display_all_params(): From 3aad6081dca8582d88f06b5e464c4b0f0ac79bf2 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 13:01:38 -0700 Subject: [PATCH 32/69] add example tutorial for image generation --- 
docs/notebook_source/1-the-basics.py | 2 + ...tructured-outputs-and-jinja-expressions.py | 2 + .../3-seeding-with-a-dataset.py | 2 + .../4-providing-images-as-context.py | 2 + docs/notebook_source/5-generating-images.py | 212 ++++++++++++++++++ docs/notebook_source/_README.md | 9 + 6 files changed, 229 insertions(+) create mode 100644 docs/notebook_source/5-generating-images.py diff --git a/docs/notebook_source/1-the-basics.py b/docs/notebook_source/1-the-basics.py index 392efb34..8735d582 100644 --- a/docs/notebook_source/1-the-basics.py +++ b/docs/notebook_source/1-the-basics.py @@ -330,3 +330,5 @@ # # - [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/) # +# - [Generating images](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/5-generating-images/) +# diff --git a/docs/notebook_source/2-structured-outputs-and-jinja-expressions.py b/docs/notebook_source/2-structured-outputs-and-jinja-expressions.py index 66b3773f..df581612 100644 --- a/docs/notebook_source/2-structured-outputs-and-jinja-expressions.py +++ b/docs/notebook_source/2-structured-outputs-and-jinja-expressions.py @@ -372,3 +372,5 @@ class ProductReview(BaseModel): # # - [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/) # +# - [Generating images](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/5-generating-images/) +# diff --git a/docs/notebook_source/3-seeding-with-a-dataset.py b/docs/notebook_source/3-seeding-with-a-dataset.py index c9d694a8..e4f9218e 100644 --- a/docs/notebook_source/3-seeding-with-a-dataset.py +++ b/docs/notebook_source/3-seeding-with-a-dataset.py @@ -274,3 +274,5 @@ # # - [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/) # +# - [Generating images](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/5-generating-images/) +# diff --git 
a/docs/notebook_source/4-providing-images-as-context.py b/docs/notebook_source/4-providing-images-as-context.py index a11880ba..1fd68dac 100644 --- a/docs/notebook_source/4-providing-images-as-context.py +++ b/docs/notebook_source/4-providing-images-as-context.py @@ -299,3 +299,5 @@ def convert_image_to_chat_format(record, height: int) -> dict: # - Combine vision-based summaries with other column types for multi-modal workflows # - Apply this pattern to other vision tasks like image captioning, OCR validation, or visual question answering # +# - [Generating images](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/5-generating-images/) with Data Designer +# diff --git a/docs/notebook_source/5-generating-images.py b/docs/notebook_source/5-generating-images.py new file mode 100644 index 00000000..aee5a0c1 --- /dev/null +++ b/docs/notebook_source/5-generating-images.py @@ -0,0 +1,212 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.18.1 +# kernelspec: +# display_name: .venv +# language: python +# name: python3 +# --- + +# %% [markdown] +# # 🎨 Data Designer Tutorial: Generating Images +# +# #### 📚 What you'll learn +# +# This notebook shows how to generate synthetic image data with Data Designer using image-generation models. +# +# - 🖼️ **Image generation columns**: Add columns that produce images from text prompts +# - 📝 **Jinja2 prompts**: Drive diversity by referencing other columns in your prompt template +# - 💾 **Preview vs create**: Preview stores base64 in the dataframe; create saves images to disk and stores paths +# +# Data Designer supports both **diffusion** (e.g. DALL·E, Stable Diffusion, Imagen) and **autoregressive** (e.g. Gemini image, GPT image) models; the API is chosen automatically from the model name.
+# +# If this is your first time using Data Designer, we recommend starting with the [first notebook](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/) in this tutorial series. +# + +# %% [markdown] +# ### 📦 Import Data Designer +# +# - `data_designer.config` provides the configuration API. +# - `DataDesigner` is the main interface for generation. +# + +# %% +from IPython.display import Image as IPImage +from IPython.display import display + +import data_designer.config as dd +from data_designer.interface import DataDesigner + +# %% [markdown] +# ### ⚙️ Initialize the Data Designer interface +# +# When initialized without arguments, [default model providers](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) are used. This tutorial uses [OpenRouter](https://openrouter.ai) with the Flux 2 Pro image model; set `OPENROUTER_API_KEY` in your environment. +# + +# %% +data_designer = DataDesigner() + +# %% [markdown] +# ### 🎛️ Define an image-generation model +# +# - Use `ImageInferenceParams` so Data Designer treats this model as an image generator. +# - Image options (size, quality, aspect ratio, etc.) are model-specific; pass them via `extra_body`. +# + +# %% +MODEL_PROVIDER = "openrouter" +MODEL_ID = "black-forest-labs/flux.2-pro" +MODEL_ALIAS = "image-model" + +model_configs = [ + dd.ModelConfig( + alias=MODEL_ALIAS, + model=MODEL_ID, + provider=MODEL_PROVIDER, + inference_parameters=dd.ImageInferenceParams( + extra_body={"size": "1024x1024"}, + ), + ) +] + +# %% [markdown] +# ### 🏗️ Build the config: samplers + image column +# +# We'll generate diverse **dog portrait** images: sampler columns drive subject (breed), age, style, look direction, and emotion. The image-generation column uses a Jinja2 prompt that references all of them.
+# + +# %% +config_builder = dd.DataDesignerConfigBuilder(model_configs=model_configs) + +config_builder.add_column( + dd.SamplerColumnConfig( + name="subject", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=[ + "a Golden Retriever", + "a German Shepherd", + "a Labrador Retriever", + "a Bulldog", + "a Beagle", + "a Poodle", + "a Corgi", + "a Siberian Husky", + "a Dalmatian", + ], + ), + ) +) + +config_builder.add_column( + dd.SamplerColumnConfig( + name="age", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=["1-3", "3-6", "6-9", "9-12", "12-15"], + ), + ) +) + +config_builder.add_column( + dd.SamplerColumnConfig( + name="style", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=[ + "photorealistic", + "oil painting", + "watercolor", + "digital art", + "sketch", + "anime", + ], + ), + ) +) + +config_builder.add_column( + dd.SamplerColumnConfig( + name="look_direction", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=["left", "right", "front", "up", "down"], + ), + ) +) + +config_builder.add_column( + dd.SamplerColumnConfig( + name="emotion", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=["happy", "curious", "serious", "sleepy", "excited"], + ), + ) +) + +config_builder.add_column( + dd.ImageColumnConfig( + name="generated_image", + prompt=( + "A {{ style }} portrait of {{ subject }} {{ age }} years old looking {{ look_direction }} " + "towards a crowd of the same kind with an {{ emotion }} expression." + ), + model_alias=MODEL_ALIAS, + ) +) + +data_designer.validate(config_builder) + +# %% [markdown] +# ### 🔍 Preview: images as base64 +# +# In **preview** mode, generated images are stored as base64 strings in the dataframe. Run the next cell to step through each record (images are shown in the sample record display, but only in a notebook environment).
+# + +# %% +preview = data_designer.preview(config_builder, num_records=2) + +# %% +for i in range(len(preview.dataset)): + preview.display_sample_record() + +# %% +preview.dataset + +# %% [markdown] +# ### 🆙 Create: images saved to disk +# +# In **create** mode, images are written to an `images/` folder with UUID filenames; the dataframe stores relative paths (e.g. `images/1d16b6e2-562f-4f51-91e5-baaa999ea916.png`). +# + +# %% +results = data_designer.create(config_builder, num_records=5, dataset_name="tutorial-5-images") + +# %% +dataset = results.load_dataset() +dataset.head() + +# %% +# Display all images from the created dataset. Paths are relative to the artifact output directory. +for index, row in dataset.iterrows(): + path_or_list = row.get("generated_image") + if path_or_list is not None: + for path in path_or_list: + base = results.artifact_storage.base_dataset_path + full_path = base / path + display(IPImage(data=full_path)) + +# %% [markdown] +# ## ⏭️ Next steps +# +# - [The basics](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/): samplers and LLM text columns +# - [Structured outputs and Jinja](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/2-structured-outputs-and-jinja-expressions/) +# - [Seeding with a dataset](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/3-seeding-with-a-dataset/) +# - [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/) +# diff --git a/docs/notebook_source/_README.md b/docs/notebook_source/_README.md index 09053c22..7bcd77d1 100644 --- a/docs/notebook_source/_README.md +++ b/docs/notebook_source/_README.md @@ -97,6 +97,15 @@ Learn how to use vision-language models to generate text descriptions from image - Generating detailed summaries from document images - Inspecting and validating vision-based generation results +### [5.
Generating Images](5-generating-images.ipynb) + +Generate synthetic image data with Data Designer: + +- Configuring image-generation models with `ImageInferenceParams` +- Adding image columns with Jinja2 prompts and sampler-driven diversity +- Preview (base64 in dataframe) vs create (images saved to disk, paths in dataframe) +- Displaying generated images in the notebook + ## 📖 Important Documentation Sections Before diving into the tutorials, familiarize yourself with these key documentation sections: From f252c376917bcb791113318a84ee4e84d005d793 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 18:15:14 -0700 Subject: [PATCH 33/69] support multi-modal context in ImageColumnConfig --- .../data_designer/config/column_configs.py | 4 + .../column_generators/generators/image.py | 9 +- .../src/data_designer/engine/models/facade.py | 28 +++++- .../data_designer/engine/models/registry.py | 2 +- .../generators/test_image.py | 90 ++++++++++++++++++- 5 files changed, 125 insertions(+), 8 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/column_configs.py b/packages/data-designer-config/src/data_designer/config/column_configs.py index e9d89f4e..e3ea013d 100644 --- a/packages/data-designer-config/src/data_designer/config/column_configs.py +++ b/packages/data-designer-config/src/data_designer/config/column_configs.py @@ -497,10 +497,14 @@ class ImageColumnConfig(SingleColumnConfig): reference other columns (e.g., "Generate an image of a {{ character_name }}"). Must be a valid Jinja2 template. model_alias: The model to use for image generation. + multi_modal_context: Optional list of image contexts for multi-modal generation. + Enables autoregressive multi-modal models to generate images based on image inputs. + Only works with autoregressive models that support image-to-image generation.
""" prompt: str model_alias: str + multi_modal_context: list[ImageContext] | None = None column_type: Literal["image"] = "image" @staticmethod diff --git a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py index c8396e24..11bc732c 100644 --- a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py +++ b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py @@ -59,8 +59,15 @@ def generate(self, data: dict) -> dict: if not prompt or not prompt.strip(): raise ValueError(f"Rendered prompt for column {self.config.name!r} is empty") + # Process multi-modal context if provided + multi_modal_context = None + if self.config.multi_modal_context is not None and len(self.config.multi_modal_context) > 0: + multi_modal_context = [] + for context in self.config.multi_modal_context: + multi_modal_context.extend(context.get_contexts(deserialized_record)) + # Generate images (returns list of base64 strings) - base64_images = self.model.generate_image(prompt=prompt) + base64_images = self.model.generate_image(prompt=prompt, multi_modal_context=multi_modal_context) # Store via media storage (mode determines disk vs dataframe storage) results = [self.media_storage.save_base64_image(base64_image) for base64_image in base64_images] diff --git a/packages/data-designer-engine/src/data_designer/engine/models/facade.py b/packages/data-designer-engine/src/data_designer/engine/models/facade.py index 11f6e9ec..51940e99 100644 --- a/packages/data-designer-engine/src/data_designer/engine/models/facade.py +++ b/packages/data-designer-engine/src/data_designer/engine/models/facade.py @@ -304,7 +304,13 @@ def generate_text_embeddings( self._track_token_usage_from_embedding(response) @catch_llm_exceptions - def generate_image(self, prompt: str, skip_usage_tracking: bool = False, **kwargs) 
-> list[str]: + def generate_image( + self, + prompt: str, + multi_modal_context: list[dict[str, Any]] | None = None, + skip_usage_tracking: bool = False, + **kwargs, + ) -> list[str]: """Generate image(s) and return base64-encoded data. Automatically detects the appropriate API based on model name: @@ -316,6 +322,8 @@ def generate_image(self, prompt: str, skip_usage_tracking: bool = False, **kwarg Args: prompt: The prompt for image generation + multi_modal_context: Optional list of image contexts for multi-modal generation. + Only used with autoregressive models via chat completions API. skip_usage_tracking: Whether to skip usage tracking **kwargs: Additional arguments to pass to the model (including n=number of images) @@ -334,7 +342,7 @@ def generate_image(self, prompt: str, skip_usage_tracking: bool = False, **kwarg if is_image_diffusion_model(self.model_name): images = self._generate_image_diffusion(prompt, skip_usage_tracking, **kwargs) else: - images = self._generate_image_chat_completion(prompt, skip_usage_tracking, **kwargs) + images = self._generate_image_chat_completion(prompt, multi_modal_context, skip_usage_tracking, **kwargs) # Track image usage if not skip_usage_tracking and len(images) > 0: @@ -353,14 +361,26 @@ def _get_mcp_facade(self, tool_alias: str | None) -> MCPFacade | None: except ValueError as exc: raise MCPConfigurationError(f"Tool alias {tool_alias!r} is not registered.") from exc - def _generate_image_chat_completion(self, prompt: str, skip_usage_tracking: bool = False, **kwargs) -> list[str]: + def _generate_image_chat_completion( + self, + prompt: str, + multi_modal_context: list[dict[str, Any]] | None = None, + skip_usage_tracking: bool = False, + **kwargs, + ) -> list[str]: """Generate image(s) using autoregressive model via chat completions API. 
+ Args: + prompt: The prompt for image generation + multi_modal_context: Optional list of image contexts for multi-modal generation + skip_usage_tracking: Whether to skip usage tracking + **kwargs: Additional arguments to pass to the model + Returns: List of base64-encoded image strings """ kwargs = self.consolidate_kwargs(**kwargs) - messages = [ChatMessage.as_user(content=prompt)] + messages = prompt_to_messages(user_prompt=prompt, multi_modal_context=multi_modal_context) response = None try: diff --git a/packages/data-designer-engine/src/data_designer/engine/models/registry.py b/packages/data-designer-engine/src/data_designer/engine/models/registry.py index c6f2b7c7..0b103e76 100644 --- a/packages/data-designer-engine/src/data_designer/engine/models/registry.py +++ b/packages/data-designer-engine/src/data_designer/engine/models/registry.py @@ -189,7 +189,7 @@ def run_health_check(self, model_aliases: list[str]) -> None: ) elif model.model_generation_type == GenerationType.IMAGE: model.generate_image( - prompt="Generate a simple pixel", + prompt="Generate a simple illustration of a thumbs up sign.", skip_usage_tracking=True, purpose="running health checks", ) diff --git a/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py b/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py index 80523ff5..b433bc55 100644 --- a/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py +++ b/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py @@ -6,6 +6,7 @@ import pytest from data_designer.config.column_configs import ImageColumnConfig +from data_designer.config.models import ImageContext, ImageFormat, ModalityDataType from data_designer.engine.column_generators.generators.base import GenerationStrategy from data_designer.engine.column_generators.generators.image import ImageCellGenerator from data_designer.engine.processing.ginja.exceptions import 
UserTemplateError @@ -58,7 +59,7 @@ def test_image_cell_generator_generate_with_storage( assert data[stub_image_column_config.name] == ["images/uuid1.png", "images/uuid2.png"] # Verify model was called with rendered prompt - mock_generate.assert_called_once_with(prompt="A photorealistic image of cat") + mock_generate.assert_called_once_with(prompt="A photorealistic image of cat", multi_modal_context=None) # Verify storage was called for each image assert mock_storage.save_base64_image.call_count == 2 @@ -88,7 +89,7 @@ def test_image_cell_generator_generate_in_dataframe_mode( assert data[stub_image_column_config.name] == stub_base64_images # Verify model was called with rendered prompt - mock_generate.assert_called_once_with(prompt="A watercolor image of dog") + mock_generate.assert_called_once_with(prompt="A watercolor image of dog", multi_modal_context=None) def test_image_cell_generator_missing_columns_error(stub_image_column_config, stub_resource_provider): @@ -119,3 +120,88 @@ def test_image_cell_generator_whitespace_only_prompt_error(stub_resource_provide with pytest.raises(ValueError, match="empty"): generator.generate(data={"spaces": " "}) + + +def test_image_cell_generator_with_multi_modal_context(stub_resource_provider): + """Test generate with multi-modal context for autoregressive models.""" + # Create image context that references a column with URL + image_context = ImageContext(column_name="reference_image", data_type=ModalityDataType.URL) + + config = ImageColumnConfig( + name="test_image", + prompt="Generate a similar image to the reference", + model_alias="test_model", + multi_modal_context=[image_context], + ) + + # Setup mock media storage + mock_storage = Mock() + mock_storage.save_base64_image.return_value = "images/generated.png" + stub_resource_provider.artifact_storage.media_storage = mock_storage + + stub_base64_images = ["base64_generated_image"] + + with patch.object( + stub_resource_provider.model_registry.get_model.return_value, + 
"generate_image", + return_value=stub_base64_images, + ) as mock_generate: + generator = ImageCellGenerator(config=config, resource_provider=stub_resource_provider) + data = generator.generate(data={"reference_image": "https://example.com/image.png"}) + + # Check that column was added + assert config.name in data + assert data[config.name] == ["images/generated.png"] + + # Verify model was called with prompt and multi_modal_context + mock_generate.assert_called_once() + call_args = mock_generate.call_args + assert call_args.kwargs["prompt"] == "Generate a similar image to the reference" + assert call_args.kwargs["multi_modal_context"] is not None + assert len(call_args.kwargs["multi_modal_context"]) == 1 + assert call_args.kwargs["multi_modal_context"][0]["type"] == "image_url" + assert call_args.kwargs["multi_modal_context"][0]["image_url"] == "https://example.com/image.png" + + +def test_image_cell_generator_with_base64_multi_modal_context(stub_resource_provider): + """Test generate with base64 multi-modal context.""" + # Create image context that references a column with base64 data + image_context = ImageContext( + column_name="reference_image", data_type=ModalityDataType.BASE64, image_format=ImageFormat.PNG + ) + + config = ImageColumnConfig( + name="test_image", + prompt="Generate a variation of this image", + model_alias="test_model", + multi_modal_context=[image_context], + ) + + # Setup mock media storage + mock_storage = Mock() + mock_storage.save_base64_image.return_value = "images/generated.png" + stub_resource_provider.artifact_storage.media_storage = mock_storage + + stub_base64_images = ["base64_generated_image"] + + with patch.object( + stub_resource_provider.model_registry.get_model.return_value, + "generate_image", + return_value=stub_base64_images, + ) as mock_generate: + generator = ImageCellGenerator(config=config, resource_provider=stub_resource_provider) + data = generator.generate(data={"reference_image": "iVBORw0KGgoAAAANS"}) + + # Check 
that column was added + assert config.name in data + assert data[config.name] == ["images/generated.png"] + + # Verify model was called with prompt and multi_modal_context + mock_generate.assert_called_once() + call_args = mock_generate.call_args + assert call_args.kwargs["prompt"] == "Generate a variation of this image" + assert call_args.kwargs["multi_modal_context"] is not None + assert len(call_args.kwargs["multi_modal_context"]) == 1 + assert call_args.kwargs["multi_modal_context"][0]["type"] == "image_url" + # Should be formatted as data URI + assert "data:image/png;base64," in call_args.kwargs["multi_modal_context"][0]["image_url"]["url"] From d6a0f2fcb8b0acb664e6c101a14ae1910095fb62 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 18:39:53 -0700 Subject: [PATCH 34/69] updated tutorial notebook --- docs/notebook_source/5-generating-images.py | 116 +++++++++++++++++--- 1 file changed, 100 insertions(+), 16 deletions(-) diff --git a/docs/notebook_source/5-generating-images.py b/docs/notebook_source/5-generating-images.py index aee5a0c1..28638ff9 100644 --- a/docs/notebook_source/5-generating-images.py +++ b/docs/notebook_source/5-generating-images.py @@ -69,7 +69,7 @@ model=MODEL_ID, provider=MODEL_PROVIDER, inference_parameters=dd.ImageInferenceParams( - extra_body={"size": "1024x1024"}, + extra_body={"height": 512, "width": 512}, ), ) ] @@ -85,7 +85,24 @@ config_builder.add_column( dd.SamplerColumnConfig( - name="subject", + name="style", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=[ + "photorealistic", + "oil painting", + "watercolor", + "digital art", + "sketch", + "anime", + ], + ), + ) +) + +config_builder.add_column( + dd.SamplerColumnConfig( + name="dog_breed", sampler_type=dd.SamplerType.CATEGORY, params=dd.CategorySamplerParams( values=[ @@ -98,6 +115,58 @@ "a Corgi", "a Siberian Husky", "a Dalmatian", + "a Yorkshire Terrier", + "a Boxer", + "a Dachshund", + "a Doberman Pinscher", + "a Shih 
Tzu", + "a Chihuahua", + "a Border Collie", + "an Australian Shepherd", + "a Cocker Spaniel", + "a Maltese", + "a Pomeranian", + "a Saint Bernard", + "a Great Dane", + "an Akita", + "a Samoyed", + "a Boston Terrier", + ], + ), + ) +) + +config_builder.add_column( + dd.SamplerColumnConfig( + name="cat_breed", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=[ + "a Persian", + "a Maine Coon", + "a Siamese", + "a Ragdoll", + "a Bengal", + "an Abyssinian", + "a British Shorthair", + "a Sphynx", + "a Scottish Fold", + "a Russian Blue", + "a Birman", + "an Oriental Shorthair", + "a Norwegian Forest Cat", + "a Devon Rex", + "a Burmese", + "an Egyptian Mau", + "a Tonkinese", + "a Himalayan", + "a Savannah", + "a Chartreux", + "a Somali", + "a Manx", + "a Turkish Angora", + "a Balinese", + "an American Shorthair", ], ), ) @@ -105,7 +174,7 @@ config_builder.add_column( dd.SamplerColumnConfig( - name="age", + name="dog_age", sampler_type=dd.SamplerType.CATEGORY, params=dd.CategorySamplerParams( values=["1-3", "3-6", "6-9", "9-12", "12-15"], @@ -115,24 +184,27 @@ config_builder.add_column( dd.SamplerColumnConfig( - name="style", + name="cat_age", sampler_type=dd.SamplerType.CATEGORY, params=dd.CategorySamplerParams( - values=[ - "photorealistic", - "oil painting", - "watercolor", - "digital art", - "sketch", - "anime", - ], + values=["1-3", "3-6", "6-9", "9-12", "12-18"], + ), + ) +) + +config_builder.add_column( + dd.SamplerColumnConfig( + name="dog_look_direction", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=["left", "right", "front", "up", "down"], ), ) ) config_builder.add_column( dd.SamplerColumnConfig( - name="look_direction", + name="cat_look_direction", sampler_type=dd.SamplerType.CATEGORY, params=dd.CategorySamplerParams( values=["left", "right", "front", "up", "down"], @@ -142,7 +214,7 @@ config_builder.add_column( dd.SamplerColumnConfig( - name="emotion", + name="dog_emotion", 
sampler_type=dd.SamplerType.CATEGORY, params=dd.CategorySamplerParams( values=["happy", "curious", "serious", "sleepy", "excited"], @@ -150,12 +222,24 @@ ) ) +config_builder.add_column( + dd.SamplerColumnConfig( + name="cat_emotion", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=["aloof", "curious", "content", "sleepy", "playful"], + ), + ) +) + config_builder.add_column( dd.ImageColumnConfig( name="generated_image", prompt=( - "A {{ style }} portrait of {{ subject }} {{ age }} years old looking {{ look_direction }} " - "towards a crowd of the same kind with an {{ emotion }} expression." + """ +A {{ style }} family pet portrait of a {{ dog_breed }} dog of {{ dog_age }} years old looking {{dog_look_direction}} with an {{ dog_emotion }} expression and +{{ cat_breed }} cat of {{ cat_age }} years old looking {{ cat_look_direction }} with an {{ cat_emotion }} expression in the background. Both subjects should be in focus. + """ ), model_alias=MODEL_ALIAS, ) From f5c6cf9418bd432d4b121d6ce82e7f30586e43df Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 18:57:09 -0700 Subject: [PATCH 35/69] organize image artifacts by column name --- .../column_generators/generators/image.py | 6 +- .../engine/storage/media_storage.py | 18 ++-- .../generators/test_image.py | 23 +++-- .../engine/storage/test_media_storage.py | 92 +++++++++++++++---- 4 files changed, 107 insertions(+), 32 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py index 11bc732c..55721916 100644 --- a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py +++ b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py @@ -70,7 +70,11 @@ def generate(self, data: dict) -> dict: base64_images = 
self.model.generate_image(prompt=prompt, multi_modal_context=multi_modal_context) # Store via media storage (mode determines disk vs dataframe storage) - results = [self.media_storage.save_base64_image(base64_image) for base64_image in base64_images] + # Use column name as subfolder to organize images + results = [ + self.media_storage.save_base64_image(base64_image, subfolder_name=self.config.name) + for base64_image in base64_images + ] data[self.config.name] = results return data diff --git a/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py b/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py index ddac3459..df83e331 100644 --- a/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py @@ -66,14 +66,15 @@ def _ensure_images_directory(self) -> None: """Create images directory if it doesn't exist (lazy initialization).""" self.images_dir.mkdir(parents=True, exist_ok=True) - def save_base64_image(self, base64_data: str) -> str: + def save_base64_image(self, base64_data: str, subfolder_name: str) -> str: """Save or return base64 image based on storage mode. 
Args: base64_data: Base64 encoded image string (with or without data URI prefix) + subfolder_name: Subfolder name to organize images (e.g., "images//") Returns: - DISK mode: Relative path to saved image (e.g., "images/f47ac10b-58cc.png") + DISK mode: Relative path to saved image (e.g., "images/subfolder_name/f47ac10b-58cc.png") DATAFRAME mode: Original base64 data string Raises: @@ -85,8 +86,11 @@ def save_base64_image(self, base64_data: str) -> str: return base64_data # DISK mode: save to disk, validate, and return relative path - # Ensure images directory exists (lazy initialization) - self._ensure_images_directory() + # Determine the target directory (organized by subfolder) + target_dir = self.images_dir / subfolder_name + + # Ensure target directory exists (lazy initialization) + target_dir.mkdir(parents=True, exist_ok=True) # Decode base64 to bytes image_bytes = decode_base64_image(base64_data) @@ -97,8 +101,10 @@ def save_base64_image(self, base64_data: str) -> str: # Generate unique filename image_id = uuid.uuid4() filename = f"{image_id}.{image_format.value}" - full_path = self.images_dir / filename - relative_path = f"{self.images_subdir}/{filename}" + full_path = target_dir / filename + + # Build relative path + relative_path = f"{self.images_subdir}/{subfolder_name}/{filename}" # Write to disk with open(full_path, "wb") as f: diff --git a/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py b/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py index b433bc55..ca5cbfae 100644 --- a/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py +++ b/packages/data-designer-engine/tests/engine/column_generators/generators/test_image.py @@ -43,7 +43,10 @@ def test_image_cell_generator_generate_with_storage( """Test generate with media storage (create mode) - saves to disk.""" # Setup mock media storage mock_storage = Mock() - mock_storage.save_base64_image.side_effect 
= ["images/uuid1.png", "images/uuid2.png"] + mock_storage.save_base64_image.side_effect = [ + "images/test_image/uuid1.png", + "images/test_image/uuid2.png", + ] stub_resource_provider.artifact_storage.media_storage = mock_storage with patch.object( @@ -54,17 +57,20 @@ def test_image_cell_generator_generate_with_storage( generator = ImageCellGenerator(config=stub_image_column_config, resource_provider=stub_resource_provider) data = generator.generate(data={"style": "photorealistic", "subject": "cat"}) - # Check that column was added with relative paths + # Check that column was added with relative paths (organized in subfolder) assert stub_image_column_config.name in data - assert data[stub_image_column_config.name] == ["images/uuid1.png", "images/uuid2.png"] + assert data[stub_image_column_config.name] == [ + "images/test_image/uuid1.png", + "images/test_image/uuid2.png", + ] # Verify model was called with rendered prompt mock_generate.assert_called_once_with(prompt="A photorealistic image of cat", multi_modal_context=None) - # Verify storage was called for each image + # Verify storage was called for each image with subfolder name assert mock_storage.save_base64_image.call_count == 2 - mock_storage.save_base64_image.assert_any_call("base64_image_1") - mock_storage.save_base64_image.assert_any_call("base64_image_2") + mock_storage.save_base64_image.assert_any_call("base64_image_1", subfolder_name="test_image") + mock_storage.save_base64_image.assert_any_call("base64_image_2", subfolder_name="test_image") def test_image_cell_generator_generate_in_dataframe_mode( @@ -91,6 +97,11 @@ def test_image_cell_generator_generate_in_dataframe_mode( # Verify model was called with rendered prompt mock_generate.assert_called_once_with(prompt="A watercolor image of dog", multi_modal_context=None) + # Verify storage was called for each image with subfolder name (even in DATAFRAME mode) + assert mock_storage.save_base64_image.call_count == 2 + 
mock_storage.save_base64_image.assert_any_call("base64_image_1", subfolder_name="test_image") + mock_storage.save_base64_image.assert_any_call("base64_image_2", subfolder_name="test_image") + def test_image_cell_generator_missing_columns_error(stub_image_column_config, stub_resource_provider): """Test that missing required columns raises ValueError.""" diff --git a/packages/data-designer-engine/tests/engine/storage/test_media_storage.py b/packages/data-designer-engine/tests/engine/storage/test_media_storage.py index abd17afe..105348d2 100644 --- a/packages/data-designer-engine/tests/engine/storage/test_media_storage.py +++ b/packages/data-designer-engine/tests/engine/storage/test_media_storage.py @@ -62,10 +62,10 @@ def test_media_storage_init_custom_subdir(tmp_path): def test_save_base64_image_png(media_storage, sample_base64_png): """Test saving a PNG image from base64.""" - relative_path = media_storage.save_base64_image(sample_base64_png) + relative_path = media_storage.save_base64_image(sample_base64_png, subfolder_name="test_column") - # Check return value format - assert relative_path.startswith(f"{IMAGES_SUBDIR}/") + # Check return value format (organized by column name) + assert relative_path.startswith(f"{IMAGES_SUBDIR}/test_column/") assert relative_path.endswith(".png") # Check file exists on disk @@ -80,10 +80,10 @@ def test_save_base64_image_png(media_storage, sample_base64_png): def test_save_base64_image_jpg(media_storage, sample_base64_jpg): """Test saving a JPEG image from base64.""" - relative_path = media_storage.save_base64_image(sample_base64_jpg) + relative_path = media_storage.save_base64_image(sample_base64_jpg, subfolder_name="test_column") - # Check return value format - assert relative_path.startswith(f"{IMAGES_SUBDIR}/") + # Check return value format (organized by column name) + assert relative_path.startswith(f"{IMAGES_SUBDIR}/test_column/") assert relative_path.endswith(".jpg") # Check file exists on disk @@ -94,10 +94,10 @@ def 
test_save_base64_image_jpg(media_storage, sample_base64_jpg): def test_save_base64_image_with_data_uri(media_storage, sample_base64_png): """Test saving image from data URI format.""" data_uri = f"data:image/png;base64,{sample_base64_png}" - relative_path = media_storage.save_base64_image(data_uri) + relative_path = media_storage.save_base64_image(data_uri, subfolder_name="test_column") - # Should successfully extract base64 and save - assert relative_path.startswith(f"{IMAGES_SUBDIR}/") + # Should successfully extract base64 and save (organized by column name) + assert relative_path.startswith(f"{IMAGES_SUBDIR}/test_column/") assert relative_path.endswith(".png") # Verify file exists and content is correct @@ -111,13 +111,13 @@ def test_save_base64_image_with_data_uri(media_storage, sample_base64_png): def test_save_base64_image_invalid_base64_raises_error(media_storage): """Test that invalid base64 data raises ValueError.""" with pytest.raises(ValueError, match="Invalid base64"): - media_storage.save_base64_image("not-valid-base64!!!") + media_storage.save_base64_image("not-valid-base64!!!", subfolder_name="test_column") def test_save_base64_image_multiple_images_unique_filenames(media_storage, sample_base64_png): """Test that multiple images get unique filenames.""" - path1 = media_storage.save_base64_image(sample_base64_png) - path2 = media_storage.save_base64_image(sample_base64_png) + path1 = media_storage.save_base64_image(sample_base64_png, subfolder_name="test_column") + path2 = media_storage.save_base64_image(sample_base64_png, subfolder_name="test_column") # Paths should be different (different UUIDs) assert path1 != path2 @@ -131,8 +131,8 @@ def test_save_base64_image_disk_mode_validates(tmp_path, sample_base64_png): """Test that DISK mode validates images.""" storage = MediaStorage(base_path=tmp_path, mode=StorageMode.DISK) # Should succeed with valid image - relative_path = storage.save_base64_image(sample_base64_png) - assert 
relative_path.startswith(f"{IMAGES_SUBDIR}/") + relative_path = storage.save_base64_image(sample_base64_png, subfolder_name="test_column") + assert relative_path.startswith(f"{IMAGES_SUBDIR}/test_column/") def test_save_base64_image_disk_mode_corrupted_image_raises_error(tmp_path): @@ -144,18 +144,20 @@ def test_save_base64_image_disk_mode_corrupted_image_raises_error(tmp_path): corrupted_base64 = base64.b64encode(corrupted_bytes).decode() with pytest.raises(ValueError, match="Image validation failed"): - storage.save_base64_image(corrupted_base64) + storage.save_base64_image(corrupted_base64, subfolder_name="test_column") # Check that no files were left behind (cleanup on validation failure) - assert len(list(storage.images_dir.iterdir())) == 0 + column_dir = storage.images_dir / "test_column" + if column_dir.exists(): + assert len(list(column_dir.iterdir())) == 0 def test_save_base64_image_dataframe_mode_returns_base64(tmp_path, sample_base64_png): """Test that DATAFRAME mode returns base64 directly without disk operations.""" storage = MediaStorage(base_path=tmp_path, mode=StorageMode.DATAFRAME) - # Should return the same base64 data - result = storage.save_base64_image(sample_base64_png) + # Should return the same base64 data (column_name is ignored in DATAFRAME mode) + result = storage.save_base64_image(sample_base64_png, subfolder_name="test_column") assert result == sample_base64_png # Directory should not be created in DATAFRAME mode (lazy initialization) @@ -165,10 +167,62 @@ def test_save_base64_image_dataframe_mode_returns_base64(tmp_path, sample_base64 def test_cleanup(media_storage, sample_base64_png): """Test cleanup removes images directory.""" # Save an image first - media_storage.save_base64_image(sample_base64_png) + media_storage.save_base64_image(sample_base64_png, subfolder_name="test_column") assert media_storage.images_dir.exists() assert len(list(media_storage.images_dir.iterdir())) > 0 # Cleanup should remove directory 
media_storage.cleanup() assert not media_storage.images_dir.exists() + + +def test_save_base64_image_with_subfolder_name(media_storage, sample_base64_png): + """Test saving image with subfolder name organizes into subdirectory.""" + subfolder = "test_subfolder" + relative_path = media_storage.save_base64_image(sample_base64_png, subfolder_name=subfolder) + + # Check return value format includes subfolder + assert relative_path.startswith(f"{IMAGES_SUBDIR}/{subfolder}/") + assert relative_path.endswith(".png") + + # Check file exists in correct subdirectory + full_path = media_storage.base_path / relative_path + assert full_path.exists() + assert full_path.parent.name == subfolder + + # Verify file content + saved_bytes = full_path.read_bytes() + expected_bytes = base64.b64decode(sample_base64_png) + assert saved_bytes == expected_bytes + + +def test_save_base64_image_with_different_subfolder_names(media_storage, sample_base64_png, sample_base64_jpg): + """Test that images with different subfolder names are stored in separate subdirectories.""" + path1 = media_storage.save_base64_image(sample_base64_png, subfolder_name="subfolder_a") + path2 = media_storage.save_base64_image(sample_base64_jpg, subfolder_name="subfolder_b") + + # Check paths are in different subdirectories + assert "subfolder_a" in path1 + assert "subfolder_b" in path2 + + # Check both directories exist + subfolder_a_dir = media_storage.images_dir / "subfolder_a" + subfolder_b_dir = media_storage.images_dir / "subfolder_b" + assert subfolder_a_dir.exists() + assert subfolder_b_dir.exists() + + # Check files exist in their respective directories + assert (media_storage.base_path / path1).exists() + assert (media_storage.base_path / path2).exists() + + +def test_save_base64_image_dataframe_mode_with_subfolder_name(tmp_path, sample_base64_png): + """Test that DATAFRAME mode returns base64 directly even with subfolder name.""" + storage = MediaStorage(base_path=tmp_path, mode=StorageMode.DATAFRAME) + + # 
Should return the same base64 data regardless of subfolder name + result = storage.save_base64_image(sample_base64_png, subfolder_name="test_subfolder") + assert result == sample_base64_png + + # Directory should not be created in DATAFRAME mode + assert not storage.images_dir.exists() From 71e2bac46a4e952980498da69845d515cc024635 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 19:33:56 -0700 Subject: [PATCH 36/69] address pr comments --- .../config/utils/visualization.py | 10 ++++--- .../src/data_designer/engine/models/facade.py | 26 ++++++++++++++----- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/utils/visualization.py b/packages/data-designer-config/src/data_designer/config/utils/visualization.py index 910bc467..9d65cca5 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/visualization.py +++ b/packages/data-designer-config/src/data_designer/config/utils/visualization.py @@ -3,6 +3,7 @@ from __future__ import annotations +import html import json import os from collections import OrderedDict @@ -81,14 +82,17 @@ def _display_image_if_in_notebook(image_data: str, col_name: str, base_path: str # Use the base64 data directly without resizing img_base64 = base64_data + # Escape column name to prevent HTML injection + escaped_col_name = html.escape(col_name) + # Create HTML with caption and image in left-aligned container - html = f""" + html_content = f"""
-
πŸ–ΌοΈ {col_name}
+
πŸ–ΌοΈ {escaped_col_name}
""" - display(HTML(html)) + display(HTML(html_content)) return True except (ImportError, NameError): # Not in a notebook environment diff --git a/packages/data-designer-engine/src/data_designer/engine/models/facade.py b/packages/data-designer-engine/src/data_designer/engine/models/facade.py index 51940e99..a14231ab 100644 --- a/packages/data-designer-engine/src/data_designer/engine/models/facade.py +++ b/packages/data-designer-engine/src/data_designer/engine/models/facade.py @@ -11,6 +11,7 @@ from data_designer.config.models import GenerationType, ModelConfig, ModelProvider from data_designer.config.utils.image_helpers import ( extract_base64_from_data_uri, + is_base64_image, is_image_diffusion_model, ) from data_designer.engine.mcp.errors import MCPConfigurationError @@ -40,6 +41,14 @@ def _identity(x: Any) -> Any: return x +def _try_extract_base64(data: str) -> str | None: + """Try to extract base64 image data from a data URI, returning None on failure.""" + try: + return extract_base64_from_data_uri(data) + except ValueError: + return None + + logger = logging.getLogger(__name__) @@ -410,19 +419,22 @@ def _generate_image_chat_completion( image_url = image["image_url"] if isinstance(image_url, dict) and "url" in image_url: - url = image_url["url"] - images.append(extract_base64_from_data_uri(url)) + if (b64 := _try_extract_base64(image_url["url"])) is not None: + images.append(b64) elif isinstance(image_url, str): - images.append(extract_base64_from_data_uri(image_url)) + if (b64 := _try_extract_base64(image_url)) is not None: + images.append(b64) # Fallback: treat as base64 string elif isinstance(image, str): - images.append(extract_base64_from_data_uri(image)) + if (b64 := _try_extract_base64(image)) is not None: + images.append(b64) - # Fallback: check content field + # Fallback: check content field if it looks like image data if not images: content = message.content or "" - if content: - images.append(extract_base64_from_data_uri(content)) + if content and 
(content.startswith("data:image/") or is_base64_image(content)): + if (b64 := _try_extract_base64(content)) is not None: + images.append(b64) if not images: raise ModelAPIError("No image data found in response") From 46138d81c3917a6abb967ae321b2216770fdc6fa Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 19:34:30 -0700 Subject: [PATCH 37/69] fix license headers --- .../data_designer/engine/column_generators/generators/image.py | 2 +- .../src/data_designer/engine/storage/__init__.py | 2 +- .../src/data_designer/engine/storage/media_storage.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py index 55721916..730e73bb 100644 --- a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py +++ b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/image.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/packages/data-designer-engine/src/data_designer/engine/storage/__init__.py b/packages/data-designer-engine/src/data_designer/engine/storage/__init__.py index 34c776d5..9d416c65 100644 --- a/packages/data-designer-engine/src/data_designer/engine/storage/__init__.py +++ b/packages/data-designer-engine/src/data_designer/engine/storage/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 from data_designer.engine.storage.media_storage import MediaStorage, StorageMode diff --git a/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py b/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py index df83e331..9adefc89 100644 --- a/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations From deb5fc2bdd7f05373ca5559a558441644581c3b3 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 19:37:19 -0700 Subject: [PATCH 38/69] generate collab notebooks --- docs/colab_notebooks/1-the-basics.ipynb | 66 +-- ...ctured-outputs-and-jinja-expressions.ipynb | 62 +-- .../3-seeding-with-a-dataset.ipynb | 58 +-- .../4-providing-images-as-context.ipynb | 70 +-- .../colab_notebooks/5-generating-images.ipynb | 437 ++++++++++++++++++ 5 files changed, 569 insertions(+), 124 deletions(-) create mode 100644 docs/colab_notebooks/5-generating-images.ipynb diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index ec2c5a99..ed8942df 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c79eea7a", + "id": "945eebf8", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "2476f160", + "id": "8e8f2e22", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -26,7 +26,7 @@ }, { "cell_type": "markdown", - "id": "3646f62e", + "id": "92d91bf1", "metadata": 
{}, "source": [ "### ⚑ Colab Setup\n", @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3348e5c8", + "id": "0b9b4427", "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "19cd9249", + "id": "8878d172", "metadata": {}, "outputs": [], "source": [ @@ -66,7 +66,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5a6d13a9", + "id": "4c92bfb3", "metadata": {}, "outputs": [], "source": [ @@ -76,7 +76,7 @@ }, { "cell_type": "markdown", - "id": "d445af5b", + "id": "4e39eed1", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -89,7 +89,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4df0031d", + "id": "70c96cfb", "metadata": {}, "outputs": [], "source": [ @@ -98,7 +98,7 @@ }, { "cell_type": "markdown", - "id": "0f69b576", + "id": "99d975c9", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -115,7 +115,7 @@ { "cell_type": "code", "execution_count": null, - "id": "65d9be99", + "id": "851228c8", "metadata": {}, "outputs": [], "source": [ @@ -145,7 +145,7 @@ }, { "cell_type": "markdown", - "id": "72582d09", + "id": "fefb639d", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -160,7 +160,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8d7992b4", + "id": "0ba52672", "metadata": {}, "outputs": [], "source": [ @@ -169,7 +169,7 @@ }, { "cell_type": "markdown", - "id": "741a15a0", + "id": "7cc2aefc", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -186,7 +186,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c3879c70", + "id": "a5a34b1a", "metadata": {}, "outputs": [], "source": [ @@ -195,7 +195,7 @@ }, { "cell_type": "markdown", - "id": "1575ef81", + "id": "ee4d1b6a", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -204,7 
+204,7 @@ { "cell_type": "code", "execution_count": null, - "id": "87a88d7b", + "id": "7782d790", "metadata": {}, "outputs": [], "source": [ @@ -285,7 +285,7 @@ }, { "cell_type": "markdown", - "id": "8c74b738", + "id": "f88e8b18", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -294,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4eb1da1f", + "id": "19174a73", "metadata": {}, "outputs": [], "source": [ @@ -331,7 +331,7 @@ }, { "cell_type": "markdown", - "id": "4324d869", + "id": "01438115", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1302a503", + "id": "9c8f1275", "metadata": {}, "outputs": [], "source": [ @@ -382,7 +382,7 @@ }, { "cell_type": "markdown", - "id": "7cf8241b", + "id": "f61e3771", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -399,7 +399,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6fc6cf39", + "id": "7f8dc56e", "metadata": {}, "outputs": [], "source": [ @@ -409,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c929e068", + "id": "5b66172a", "metadata": {}, "outputs": [], "source": [ @@ -420,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dfb04e2a", + "id": "b0eaa931", "metadata": {}, "outputs": [], "source": [ @@ -430,7 +430,7 @@ }, { "cell_type": "markdown", - "id": "adb879da", + "id": "122d099d", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -443,7 +443,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ff58dd9f", + "id": "f40f7ba0", "metadata": {}, "outputs": [], "source": [ @@ -453,7 +453,7 @@ }, { "cell_type": "markdown", - "id": "57c7355d", + "id": "597c41ec", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -466,7 +466,7 @@ { "cell_type": "code", "execution_count": null, - "id": "df49db99", + "id": "acf8caa3", "metadata": {}, 
"outputs": [], "source": [ @@ -476,7 +476,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2bbc48dd", + "id": "697e9090", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dc0673fa", + "id": "18f34e66", "metadata": {}, "outputs": [], "source": [ @@ -501,7 +501,7 @@ }, { "cell_type": "markdown", - "id": "7688217b", + "id": "4c498f62", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", @@ -512,7 +512,9 @@ "\n", "- [Seeding synthetic data generation with an external dataset](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/3-seeding-with-a-dataset/)\n", "\n", - "- [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/)\n" + "- [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/)\n", + "\n", + "- [Generating images](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/5-generating-images/)\n" ] } ], diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index c813ea50..49be6edb 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "258752cd", + "id": "bd333de9", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "fc4217c3", + "id": "28fb2ee3", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "2b831130", + "id": "fbeb3b2d", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fa1eda43", + "id": "6ef3d2ae", "metadata": {}, "outputs": 
[], "source": [ @@ -50,7 +50,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5f014571", + "id": "07546806", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f7409282", + "id": "81b00725", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "8234dd4b", + "id": "a5cf694f", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": null, - "id": "21633aed", + "id": "8320e2b0", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "9b215265", + "id": "348e2c5a", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "76260638", + "id": "21019fc5", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "e6bfd93d", + "id": "7bf9d9af", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a0fbd497", + "id": "88abb685", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "7faae40e", + "id": "d8e790c6", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -198,7 +198,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f2f94909", + "id": "64465ab1", "metadata": {}, "outputs": [], "source": [ @@ -226,7 +226,7 @@ }, { "cell_type": "markdown", - "id": "696f19f4", + "id": "cfbad124", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" @@ -235,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "312b50cd", + "id": "aa93a4c9", "metadata": {}, "outputs": [], "source": [ @@ -344,7 +344,7 @@ }, { 
"cell_type": "markdown", - "id": "ecd971ca", + "id": "74aa72fc", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -361,7 +361,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bda01ffc", + "id": "9ae978cc", "metadata": {}, "outputs": [], "source": [ @@ -414,7 +414,7 @@ }, { "cell_type": "markdown", - "id": "059613e1", + "id": "ec850f14", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -431,7 +431,7 @@ { "cell_type": "code", "execution_count": null, - "id": "23c9b839", + "id": "cb18575e", "metadata": {}, "outputs": [], "source": [ @@ -441,7 +441,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e5adcdbd", + "id": "eee46dc6", "metadata": {}, "outputs": [], "source": [ @@ -452,7 +452,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1cc39cae", + "id": "082d0fc4", "metadata": {}, "outputs": [], "source": [ @@ -462,7 +462,7 @@ }, { "cell_type": "markdown", - "id": "bcca3f06", + "id": "e8d80b94", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -475,7 +475,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6e1957ca", + "id": "4b0a7299", "metadata": {}, "outputs": [], "source": [ @@ -485,7 +485,7 @@ }, { "cell_type": "markdown", - "id": "9db283d3", + "id": "d7e0c925", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -498,7 +498,7 @@ { "cell_type": "code", "execution_count": null, - "id": "30826883", + "id": "b599d759", "metadata": {}, "outputs": [], "source": [ @@ -508,7 +508,7 @@ { "cell_type": "code", "execution_count": null, - "id": "88d4d3bd", + "id": "07a7c0da", "metadata": {}, "outputs": [], "source": [ @@ -521,7 +521,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8762a2bb", + "id": "7760dffa", "metadata": {}, "outputs": [], "source": [ @@ -533,7 +533,7 @@ }, { "cell_type": "markdown", - "id": "0375fcd2", + "id": "6d19000a", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", @@ 
-542,7 +542,9 @@ "\n", "- [Seeding synthetic data generation with an external dataset](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/3-seeding-with-a-dataset/)\n", "\n", - "- [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/)\n" + "- [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/)\n", + "\n", + "- [Generating images](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/5-generating-images/)\n" ] } ], diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index c5d427d0..468aa795 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "b2a3e544", + "id": "573c3e7b", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "d57c4f0a", + "id": "63f6c36d", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "f7da8723", + "id": "02cc81c7", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "90a12556", + "id": "18d51631", "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8fcdfde5", + "id": "67c55f6b", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5899e85c", + "id": "cfe2ff62", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "6c093c90", + "id": "bdbc5b03", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +91,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "6a2066fe", + "id": "55d9caf1", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "f5e81142", + "id": "aa1623bc", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "880012ea", + "id": "9d1310cf", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "4b77a92c", + "id": "e64ce3b7", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f4ab6628", + "id": "dafd6155", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "26fb0a63", + "id": "7c01f11c", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -196,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "84908e88", + "id": "7941073f", "metadata": {}, "outputs": [], "source": [ @@ -214,7 +214,7 @@ }, { "cell_type": "markdown", - "id": "1947e70a", + "id": "a68c7d55", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -227,7 +227,7 @@ { "cell_type": "code", "execution_count": null, - "id": "be2fbad1", + "id": "f1b3d4d4", "metadata": {}, "outputs": [], "source": [ @@ -308,7 +308,7 @@ }, { "cell_type": "markdown", - "id": "8fcce5dc", + "id": "eff1bf9f", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -325,7 +325,7 @@ { "cell_type": "code", "execution_count": null, - "id": "82dc02f8", + "id": "b5955230", "metadata": {}, "outputs": [], "source": [ @@ -335,7 +335,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1f2d1583", + "id": "062a7294", "metadata": {}, "outputs": [], "source": [ @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "62a9173b", + "id": "6378e1be", "metadata": {}, "outputs": [], 
"source": [ @@ -356,7 +356,7 @@ }, { "cell_type": "markdown", - "id": "5263e705", + "id": "51e5175e", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -369,7 +369,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5295320f", + "id": "891b6860", "metadata": {}, "outputs": [], "source": [ @@ -379,7 +379,7 @@ }, { "cell_type": "markdown", - "id": "3ecc195f", + "id": "0f52668f", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -392,7 +392,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3865fb59", + "id": "ed083bd8", "metadata": {}, "outputs": [], "source": [ @@ -402,7 +402,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a7acf2b0", + "id": "039c42e4", "metadata": {}, "outputs": [], "source": [ @@ -415,7 +415,7 @@ { "cell_type": "code", "execution_count": null, - "id": "81a6e999", + "id": "623ca205", "metadata": {}, "outputs": [], "source": [ @@ -427,14 +427,16 @@ }, { "cell_type": "markdown", - "id": "4503b1cf", + "id": "0a7e7d42", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", "\n", "Check out the following notebook to learn more about:\n", "\n", - "- [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/)\n" + "- [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/)\n", + "\n", + "- [Generating images](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/5-generating-images/)\n" ] } ], diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index cd175537..62ac63e8 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "90dda708", + "id": "731384ed", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based 
Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "52ccb1e5", + "id": "bc66dd23", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "9627c4eb", + "id": "4539a931", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -37,7 +37,7 @@ }, { "cell_type": "markdown", - "id": "1817171a", + "id": "f88809bf", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1f15a669", + "id": "3628d4c4", "metadata": {}, "outputs": [], "source": [ @@ -59,7 +59,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1201c93b", + "id": "7fcf0f75", "metadata": {}, "outputs": [], "source": [ @@ -77,7 +77,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f814b76c", + "id": "6654714a", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "ac423d57", + "id": "22488cb7", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3c655c2d", + "id": "39913ca0", "metadata": {}, "outputs": [], "source": [ @@ -122,7 +122,7 @@ }, { "cell_type": "markdown", - "id": "7d41e922", + "id": "fba112ab", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -139,7 +139,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a8b5f4bf", + "id": "70fd86dd", "metadata": {}, "outputs": [], "source": [ @@ -162,7 +162,7 @@ }, { "cell_type": "markdown", - "id": "6455fc58", + "id": "810c7457", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -177,7 +177,7 @@ { "cell_type": "code", "execution_count": null, - "id": "462c2e01", + "id": "9b2204d0", "metadata": {}, "outputs": [], "source": [ @@ -186,7 +186,7 @@ }, { "cell_type": "markdown", - "id": "31369d10", + "id": "29e3dae5", "metadata": {}, "source": [ "### 🌱 
Seed Dataset Creation\n", @@ -203,7 +203,7 @@ { "cell_type": "code", "execution_count": null, - "id": "55d9432a", + "id": "e2cc3506", "metadata": {}, "outputs": [], "source": [ @@ -218,7 +218,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8614c4e9", + "id": "7a821067", "metadata": {}, "outputs": [], "source": [ @@ -266,7 +266,7 @@ { "cell_type": "code", "execution_count": null, - "id": "80550e46", + "id": "359d144b", "metadata": {}, "outputs": [], "source": [ @@ -284,7 +284,7 @@ { "cell_type": "code", "execution_count": null, - "id": "65ced9bb", + "id": "985cd308", "metadata": {}, "outputs": [], "source": [ @@ -294,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "34b210e8", + "id": "6a8cb414", "metadata": {}, "outputs": [], "source": [ @@ -306,7 +306,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d506903d", + "id": "a57e1b73", "metadata": {}, "outputs": [], "source": [ @@ -335,7 +335,7 @@ }, { "cell_type": "markdown", - "id": "b91032a2", + "id": "7518100a", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -352,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4bd947de", + "id": "4c1fe540", "metadata": {}, "outputs": [], "source": [ @@ -362,7 +362,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d0ff4c07", + "id": "bceafe91", "metadata": {}, "outputs": [], "source": [ @@ -373,7 +373,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e97e4dfe", + "id": "20f4ace5", "metadata": {}, "outputs": [], "source": [ @@ -383,7 +383,7 @@ }, { "cell_type": "markdown", - "id": "0a284c12", + "id": "16a86d56", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -396,7 +396,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2570e7fd", + "id": "c1bbae97", "metadata": {}, "outputs": [], "source": [ @@ -406,7 +406,7 @@ }, { "cell_type": "markdown", - "id": "28b8eb5a", + "id": "d8d7604f", "metadata": {}, "source": [ "### πŸ”Ž 
Visual Inspection\n", @@ -417,7 +417,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5d0d9336", + "id": "27c0636c", "metadata": { "lines_to_next_cell": 2 }, @@ -441,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "1c257a81", + "id": "f6b99539", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -454,7 +454,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e6d840e9", + "id": "e5d53787", "metadata": {}, "outputs": [], "source": [ @@ -464,7 +464,7 @@ { "cell_type": "code", "execution_count": null, - "id": "909e6f3f", + "id": "1f859e49", "metadata": {}, "outputs": [], "source": [ @@ -477,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "adbb4cae", + "id": "6688e3c5", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ }, { "cell_type": "markdown", - "id": "d085584c", + "id": "28635b09", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", @@ -499,7 +499,9 @@ "- Experiment with different vision models for specific document types\n", "- Try different prompt variations to generate specialized descriptions (e.g., technical details, key findings)\n", "- Combine vision-based summaries with other column types for multi-modal workflows\n", - "- Apply this pattern to other vision tasks like image captioning, OCR validation, or visual question answering\n" + "- Apply this pattern to other vision tasks like image captioning, OCR validation, or visual question answering\n", + "\n", + "- [Generating images](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/5-generating-images/) with Data Designer\n" ] } ], diff --git a/docs/colab_notebooks/5-generating-images.ipynb b/docs/colab_notebooks/5-generating-images.ipynb new file mode 100644 index 00000000..485fe258 --- /dev/null +++ b/docs/colab_notebooks/5-generating-images.ipynb @@ -0,0 +1,437 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0ee289e6", + "metadata": {}, + "source": [ + "# 🎨 Data Designer Tutorial: Generating Images\n", + "\n", + "#### πŸ“š 
What you'll learn\n", + "\n", + "This notebook shows how to generate synthetic image data with Data Designer using image-generation models.\n", + "\n", + "- πŸ–ΌοΈ **Image generation columns**: Add columns that produce images from text prompts\n", + "- πŸ“ **Jinja2 prompts**: Drive diversity by referencing other columns in your prompt template\n", + "- πŸ’Ύ **Preview vs create**: Preview stores base64 in the dataframe; create saves images to disk and stores paths\n", + "\n", + "Data Designer supports both **diffusion** (e.g. DALLΒ·E, Stable Diffusion, Imagen) and **autoregressive** (e.g. Gemini image, GPT image) models; the API is chosen automatically from the model name.\n", + "\n", + "If this is your first time using Data Designer, we recommend starting with the [first notebook](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/) in this tutorial series.\n" + ] + }, + { + "cell_type": "markdown", + "id": "86f748c1", + "metadata": {}, + "source": [ + "### πŸ“¦ Import Data Designer\n", + "\n", + "- `data_designer.config` provides the configuration API.\n", + "- `DataDesigner` is the main interface for generation.\n" + ] + }, + { + "cell_type": "markdown", + "id": "c610ee22", + "metadata": {}, + "source": [ + "### ⚑ Colab Setup\n", + "\n", + "Run the cells below to install the dependencies and set up the API key. 
If you don't have an API key, you can generate one from [build.nvidia.com](https://build.nvidia.com).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "818ca495", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install -U data-designer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f165bb15", + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "from google.colab import userdata\n", + "\n", + "try:\n", + " os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n", + "except userdata.SecretNotFoundError:\n", + " os.environ[\"NVIDIA_API_KEY\"] = getpass.getpass(\"Enter your NVIDIA API key: \")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5decfc83", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import Image as IPImage\n", + "from IPython.display import display\n", + "\n", + "import data_designer.config as dd\n", + "from data_designer.interface import DataDesigner" + ] + }, + { + "cell_type": "markdown", + "id": "929f35d6", + "metadata": {}, + "source": [ + "### βš™οΈ Initialize the Data Designer interface\n", + "\n", + "When initialized without arguments, [default model providers](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) are used. This tutorial uses [OpenRouter](https://openrouter.ai) with the Flux 2 Pro image model; set `OPENROUTER_API_KEY` in your environment.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4c8b7d7", + "metadata": {}, + "outputs": [], + "source": [ + "data_designer = DataDesigner()" + ] + }, + { + "cell_type": "markdown", + "id": "8ed7b0b6", + "metadata": {}, + "source": [ + "### πŸŽ›οΈ Define an image-generation model\n", + "\n", + "- Use `ImageInferenceParams` so Data Designer treats this model as an image generator.\n", + "- Image options (size, quality, aspect ratio, etc.) 
are model-specific; pass them via `extra_body`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6b1ca66", + "metadata": {}, + "outputs": [], + "source": [ + "MODEL_PROVIDER = \"openrouter\"\n", + "MODEL_ID = \"black-forest-labs/flux.2-pro\"\n", + "MODEL_ALIAS = \"image-model\"\n", + "\n", + "model_configs = [\n", + " dd.ModelConfig(\n", + " alias=MODEL_ALIAS,\n", + " model=MODEL_ID,\n", + " provider=MODEL_PROVIDER,\n", + " inference_parameters=dd.ImageInferenceParams(\n", + " extra_body={\"height\": 512, \"width\": 512},\n", + " ),\n", + " )\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "498cfecf", + "metadata": {}, + "source": [ + "### πŸ—οΈ Build the config: samplers + image column\n", + "\n", + "We'll generate diverse **dog portrait** images: sampler columns drive subject (breed), age, style, look direction, and emotion. The image-generation column uses a Jinja2 prompt that references all of them.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e74fc7ab", + "metadata": {}, + "outputs": [], + "source": [ + "config_builder = dd.DataDesignerConfigBuilder(model_configs=model_configs)\n", + "\n", + "config_builder.add_column(\n", + " dd.SamplerColumnConfig(\n", + " name=\"style\",\n", + " sampler_type=dd.SamplerType.CATEGORY,\n", + " params=dd.CategorySamplerParams(\n", + " values=[\n", + " \"photorealistic\",\n", + " \"oil painting\",\n", + " \"watercolor\",\n", + " \"digital art\",\n", + " \"sketch\",\n", + " \"anime\",\n", + " ],\n", + " ),\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " dd.SamplerColumnConfig(\n", + " name=\"dog_breed\",\n", + " sampler_type=dd.SamplerType.CATEGORY,\n", + " params=dd.CategorySamplerParams(\n", + " values=[\n", + " \"a Golden Retriever\",\n", + " \"a German Shepherd\",\n", + " \"a Labrador Retriever\",\n", + " \"a Bulldog\",\n", + " \"a Beagle\",\n", + " \"a Poodle\",\n", + " \"a Corgi\",\n", + " \"a Siberian Husky\",\n", + " \"a 
Dalmatian\",\n", + " \"a Yorkshire Terrier\",\n", + " \"a Boxer\",\n", + " \"a Dachshund\",\n", + " \"a Doberman Pinscher\",\n", + " \"a Shih Tzu\",\n", + " \"a Chihuahua\",\n", + " \"a Border Collie\",\n", + " \"an Australian Shepherd\",\n", + " \"a Cocker Spaniel\",\n", + " \"a Maltese\",\n", + " \"a Pomeranian\",\n", + " \"a Saint Bernard\",\n", + " \"a Great Dane\",\n", + " \"an Akita\",\n", + " \"a Samoyed\",\n", + " \"a Boston Terrier\",\n", + " ],\n", + " ),\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " dd.SamplerColumnConfig(\n", + " name=\"cat_breed\",\n", + " sampler_type=dd.SamplerType.CATEGORY,\n", + " params=dd.CategorySamplerParams(\n", + " values=[\n", + " \"a Persian\",\n", + " \"a Maine Coon\",\n", + " \"a Siamese\",\n", + " \"a Ragdoll\",\n", + " \"a Bengal\",\n", + " \"an Abyssinian\",\n", + " \"a British Shorthair\",\n", + " \"a Sphynx\",\n", + " \"a Scottish Fold\",\n", + " \"a Russian Blue\",\n", + " \"a Birman\",\n", + " \"an Oriental Shorthair\",\n", + " \"a Norwegian Forest Cat\",\n", + " \"a Devon Rex\",\n", + " \"a Burmese\",\n", + " \"an Egyptian Mau\",\n", + " \"a Tonkinese\",\n", + " \"a Himalayan\",\n", + " \"a Savannah\",\n", + " \"a Chartreux\",\n", + " \"a Somali\",\n", + " \"a Manx\",\n", + " \"a Turkish Angora\",\n", + " \"a Balinese\",\n", + " \"an American Shorthair\",\n", + " ],\n", + " ),\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " dd.SamplerColumnConfig(\n", + " name=\"dog_age\",\n", + " sampler_type=dd.SamplerType.CATEGORY,\n", + " params=dd.CategorySamplerParams(\n", + " values=[\"1-3\", \"3-6\", \"6-9\", \"9-12\", \"12-15\"],\n", + " ),\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " dd.SamplerColumnConfig(\n", + " name=\"cat_age\",\n", + " sampler_type=dd.SamplerType.CATEGORY,\n", + " params=dd.CategorySamplerParams(\n", + " values=[\"1-3\", \"3-6\", \"6-9\", \"9-12\", \"12-18\"],\n", + " ),\n", + " )\n", + ")\n", + "\n", + 
"config_builder.add_column(\n", + " dd.SamplerColumnConfig(\n", + " name=\"dog_look_direction\",\n", + " sampler_type=dd.SamplerType.CATEGORY,\n", + " params=dd.CategorySamplerParams(\n", + " values=[\"left\", \"right\", \"front\", \"up\", \"down\"],\n", + " ),\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " dd.SamplerColumnConfig(\n", + " name=\"cat_look_direction\",\n", + " sampler_type=dd.SamplerType.CATEGORY,\n", + " params=dd.CategorySamplerParams(\n", + " values=[\"left\", \"right\", \"front\", \"up\", \"down\"],\n", + " ),\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " dd.SamplerColumnConfig(\n", + " name=\"dog_emotion\",\n", + " sampler_type=dd.SamplerType.CATEGORY,\n", + " params=dd.CategorySamplerParams(\n", + " values=[\"happy\", \"curious\", \"serious\", \"sleepy\", \"excited\"],\n", + " ),\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " dd.SamplerColumnConfig(\n", + " name=\"cat_emotion\",\n", + " sampler_type=dd.SamplerType.CATEGORY,\n", + " params=dd.CategorySamplerParams(\n", + " values=[\"aloof\", \"curious\", \"content\", \"sleepy\", \"playful\"],\n", + " ),\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " dd.ImageColumnConfig(\n", + " name=\"generated_image\",\n", + " prompt=(\n", + " \"\"\"\n", + "A {{ style }} family pet portrait of a {{ dog_breed }} dog of {{ dog_age }} years old looking {{dog_look_direction}} with an {{ dog_emotion }} expression and\n", + "{{ cat_breed }} cat of {{ cat_age }} years old looking {{ cat_look_direction }} with an {{ cat_emotion }} expression in the background. 
Both subjects should be in focus.\n", + " \"\"\"\n", + " ),\n", + " model_alias=MODEL_ALIAS,\n", + " )\n", + ")\n", + "\n", + "data_designer.validate(config_builder)" + ] + }, + { + "cell_type": "markdown", + "id": "c592c820", + "metadata": {}, + "source": [ + "### πŸ” Preview: images as base64\n", + "\n", + "In **preview** mode, generated images are stored as base64 strings in the dataframe. Run the next cell to step through each record (images are shown in the sample record display, but only in a notebook environment).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eee17bb1", + "metadata": {}, + "outputs": [], + "source": [ + "preview = data_designer.preview(config_builder, num_records=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3cd320cc", + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(len(preview.dataset)):\n", + " preview.display_sample_record()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffb5e188", + "metadata": {}, + "outputs": [], + "source": [ + "preview.dataset" + ] + }, + { + "cell_type": "markdown", + "id": "87b83328", + "metadata": {}, + "source": [ + "### πŸ†™ Create: images saved to disk\n", + "\n", + "In **create** mode, images are written to an `images/` folder with UUID filenames; the dataframe stores relative paths (e.g. 
`images/1d16b6e2-562f-4f51-91e5-baaa999ea916.png`).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8f9cc41", + "metadata": {}, + "outputs": [], + "source": [ + "results = data_designer.create(config_builder, num_records=5, dataset_name=\"tutorial-5-images\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d4453e5", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = results.load_dataset()\n", + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "198301ab", + "metadata": {}, + "outputs": [], + "source": [ + "# Display all image from the created dataset. Paths are relative to the artifact output directory.\n", + "for index, row in dataset.iterrows():\n", + " path_or_list = row.get(\"generated_image\")\n", + " if path_or_list is not None:\n", + " for path in path_or_list:\n", + " base = results.artifact_storage.base_dataset_path\n", + " full_path = base / path\n", + " display(IPImage(data=full_path))" + ] + }, + { + "cell_type": "markdown", + "id": "2bdcef2b", + "metadata": {}, + "source": [ + "## ⏭️ Next steps\n", + "\n", + "- [The basics](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/): samplers and LLM text columns\n", + "- [Structured outputs and Jinja](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/2-structured-outputs-and-jinja-expressions/)\n", + "- [Seeding with a dataset](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/3-seeding-with-a-dataset/)\n", + "- [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From d11d049aad3474476a03a099c701f596c24c8ca2 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 19:39:10 -0700 Subject: [PATCH 39/69] move pillow to 
lib dep from notebook --- packages/data-designer-config/pyproject.toml | 1 + uv.lock | 2 ++ 2 files changed, 3 insertions(+) diff --git a/packages/data-designer-config/pyproject.toml b/packages/data-designer-config/pyproject.toml index 04af4adc..569c8fe0 100644 --- a/packages/data-designer-config/pyproject.toml +++ b/packages/data-designer-config/pyproject.toml @@ -22,6 +22,7 @@ dependencies = [ "jinja2>=3.1.6,<4", "numpy>=1.23.5,<3", "pandas>=2.3.3,<3", + "pillow>=12.0.0,<13", "pyarrow>=19.0.1,<20", # Required for parquet I/O operations "pydantic[email]>=2.9.2,<3", "pygments>=2.19.2,<3", diff --git a/uv.lock b/uv.lock index b26a9385..6a4a8432 100644 --- a/uv.lock +++ b/uv.lock @@ -965,6 +965,7 @@ dependencies = [ { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "pandas" }, + { name = "pillow" }, { name = "pyarrow" }, { name = "pydantic", extra = ["email"] }, { name = "pygments" }, @@ -978,6 +979,7 @@ requires-dist = [ { name = "jinja2", specifier = ">=3.1.6,<4" }, { name = "numpy", specifier = ">=1.23.5,<3" }, { name = "pandas", specifier = ">=2.3.3,<3" }, + { name = "pillow", specifier = ">=12.0.0,<13" }, { name = "pyarrow", specifier = ">=19.0.1,<20" }, { name = "pydantic", extras = ["email"], specifier = ">=2.9.2,<3" }, { name = "pygments", specifier = ">=2.19.2,<3" }, From 511e1f26a3180579227d01598bdf7e90b2e07b26 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 19:41:49 -0700 Subject: [PATCH 40/69] update uv lock" --- pyproject.toml | 7 +++---- uv.lock | 37 ++++++++++++------------------------- 2 files changed, 15 insertions(+), 29 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f7b71536..988e6f04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,11 +42,11 @@ dev-dependencies = [ [dependency-groups] 
dev = [ "jsonpath-ng>=1.5.3,<2", - "pytest>=9.0.2,<10", - "pytest-asyncio>=1.3.0,<2", + "pytest>=8.3.3,<9", + "pytest-asyncio>=0.24.0,<1", "pytest-cov>=7.0.0,<8", "pytest-env>=1.2.0,<2", - "pytest-httpx>=0.36.0,<1", + "pytest-httpx>=0.35.0,<1", "pre-commit>=4.0.0,<5", ] docs = [ @@ -63,7 +63,6 @@ notebooks = [ "datasets>=4.0.0,<5", "ipykernel>=6.29.0,<7", "jupyter>=1.0.0,<2", - "pillow>=12.0.0,<13", ] recipes = [ "bm25s>=0.2.0,<1", diff --git a/uv.lock b/uv.lock index 6a4a8432..9a111de5 100644 --- a/uv.lock +++ b/uv.lock @@ -308,15 +308,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" }, ] -[[package]] -name = "backports-asyncio-runner" -version = "1.2.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/8e/ff/70dca7d7cb1cbc0edb2c6cc0c38b65cba36cccc491eca64cabd5fe7f8670/backports_asyncio_runner-1.2.0.tar.gz", hash = "sha256:a5aa7b2b7d8f8bfcaa2b57313f70792df84e32a2a746f585213373f900b42162", size = 69893, upload-time = "2025-07-02T02:27:15.685Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a0/59/76ab57e3fe74484f48a53f8e337171b4a2349e506eabe136d7e01d059086/backports_asyncio_runner-1.2.0-py3-none-any.whl", hash = "sha256:0da0a936a8aeb554eccb426dc55af3ba63bcdc69fa1a600b5bb305413a4477b5", size = 12313, upload-time = "2025-07-02T02:27:14.263Z" }, -] - [[package]] name = "backrefs" version = "6.1" @@ -1071,7 +1062,6 @@ notebooks = [ { name = "datasets" }, { name = "ipykernel" }, { name = "jupyter" }, - { name = "pillow" }, ] recipes = [ { name = "bm25s" }, @@ -1085,11 +1075,11 @@ requires-dist = [{ name = "matplotlib", specifier = ">=3.10.8" }] dev = [ { name = "jsonpath-ng", specifier = ">=1.5.3,<2" }, { name = "pre-commit", specifier = 
">=4.0.0,<5" }, - { name = "pytest", specifier = ">=9.0.2,<10" }, - { name = "pytest-asyncio", specifier = ">=1.3.0,<2" }, + { name = "pytest", specifier = ">=8.3.3,<9" }, + { name = "pytest-asyncio", specifier = ">=0.24.0,<1" }, { name = "pytest-cov", specifier = ">=7.0.0,<8" }, { name = "pytest-env", specifier = ">=1.2.0,<2" }, - { name = "pytest-httpx", specifier = ">=0.36.0,<1" }, + { name = "pytest-httpx", specifier = ">=0.35.0,<1" }, { name = "ruff", specifier = ">=0.14.10,<1" }, ] docs = [ @@ -1106,7 +1096,6 @@ notebooks = [ { name = "datasets", specifier = ">=4.0.0,<5" }, { name = "ipykernel", specifier = ">=6.29.0,<7" }, { name = "jupyter", specifier = ">=1.0.0,<2" }, - { name = "pillow", specifier = ">=12.0.0,<13" }, ] recipes = [ { name = "bm25s", specifier = ">=0.2.0,<1" }, @@ -4407,7 +4396,7 @@ wheels = [ [[package]] name = "pytest" -version = "9.0.2" +version = "8.4.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, @@ -4418,23 +4407,21 @@ dependencies = [ { name = "pygments" }, { name = "tomli", marker = "python_full_version < '3.11'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = 
"2025-12-06T21:30:49.154Z" }, + { url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" }, ] [[package]] name = "pytest-asyncio" -version = "1.3.0" +version = "0.26.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "backports-asyncio-runner", marker = "python_full_version < '3.11'" }, { name = "pytest" }, - { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/90/2c/8af215c0f776415f3590cac4f9086ccefd6fd463befeae41cd4d3f193e5a/pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5", size = 50087, upload-time = "2025-11-10T16:07:47.256Z" } +sdist = { url = "https://files.pythonhosted.org/packages/8e/c4/453c52c659521066969523e87d85d54139bbd17b78f09532fb8eb8cdb58e/pytest_asyncio-0.26.0.tar.gz", hash = "sha256:c4df2a697648241ff39e7f0e4a73050b03f123f760673956cf0d72a4990e312f", size = 54156, upload-time = "2025-03-25T06:22:28.883Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e5/35/f8b19922b6a25bc0880171a2f1a003eaeb93657475193ab516fd87cac9da/pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5", size = 15075, upload-time = "2025-11-10T16:07:45.537Z" }, + { url = "https://files.pythonhosted.org/packages/20/7f/338843f449ace853647ace35870874f69a764d251872ed1b4de9f234822c/pytest_asyncio-0.26.0-py3-none-any.whl", hash = "sha256:7b51ed894f4fbea1340262bdae5135797ebbe21d8638978e35d31c6d19f72fb0", size = 19694, upload-time = "2025-03-25T06:22:27.807Z" }, ] [[package]] @@ -4466,15 +4453,15 @@ wheels = [ [[package]] name = "pytest-httpx" -version = "0.36.0" +version = "0.35.0" source = { registry = 
"https://pypi.org/simple" } dependencies = [ { name = "httpx" }, { name = "pytest" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/5574834da9499066fa1a5ea9c336f94dba2eae02298d36dab192fcf95c86/pytest_httpx-0.36.0.tar.gz", hash = "sha256:9edb66a5fd4388ce3c343189bc67e7e1cb50b07c2e3fc83b97d511975e8a831b", size = 56793, upload-time = "2025-12-02T16:34:57.414Z" } +sdist = { url = "https://files.pythonhosted.org/packages/1f/89/5b12b7b29e3d0af3a4b9c071ee92fa25a9017453731a38f08ba01c280f4c/pytest_httpx-0.35.0.tar.gz", hash = "sha256:d619ad5d2e67734abfbb224c3d9025d64795d4b8711116b1a13f72a251ae511f", size = 54146, upload-time = "2024-11-28T19:16:54.237Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e2/d2/1eb1ea9c84f0d2033eb0b49675afdc71aa4ea801b74615f00f3c33b725e3/pytest_httpx-0.36.0-py3-none-any.whl", hash = "sha256:bd4c120bb80e142df856e825ec9f17981effb84d159f9fa29ed97e2357c3a9c8", size = 20229, upload-time = "2025-12-02T16:34:56.45Z" }, + { url = "https://files.pythonhosted.org/packages/b0/ed/026d467c1853dd83102411a78126b4842618e86c895f93528b0528c7a620/pytest_httpx-0.35.0-py3-none-any.whl", hash = "sha256:ee11a00ffcea94a5cbff47af2114d34c5b231c326902458deed73f9c459fd744", size = 19442, upload-time = "2024-11-28T19:16:52.787Z" }, ] [[package]] From 2b22df8517fd225a462608b19b568a7561968e92 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 20:00:24 -0700 Subject: [PATCH 41/69] remove legacy flag from display_sample_record --- .../data_designer/config/utils/visualization.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/utils/visualization.py b/packages/data-designer-config/src/data_designer/config/utils/visualization.py index 9d65cca5..2132b83b 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/visualization.py +++ b/packages/data-designer-config/src/data_designer/config/utils/visualization.py @@ -47,13 +47,12 
@@ console = Console() -def _display_image_if_in_notebook(image_data: str, col_name: str, base_path: str | None = None) -> bool: +def _display_image_if_in_notebook(image_data: str, col_name: str) -> bool: """Display image with caption in Jupyter notebook if available. Args: image_data: Base64-encoded image data, data URI, or file path. col_name: Name of the column (used for caption). - base_path: Optional base path to resolve relative image paths. Returns: True if image was displayed, False otherwise. @@ -66,7 +65,7 @@ def _display_image_if_in_notebook(image_data: str, col_name: str, base_path: str # Check if it's a file path and load it if is_image_path(image_data) and not image_data.startswith("data:image/"): - loaded_base64 = load_image_path_to_base64(image_data, base_path) + loaded_base64 = load_image_path_to_base64(image_data) if loaded_base64 is None: console.print( f"[yellow]⚠️ Could not load image from path '{image_data}' for column '{col_name}'[/yellow]" @@ -191,11 +190,6 @@ def display_sample_record( None if hide_seed_columns or self.dataset_metadata is None else self.dataset_metadata.seed_column_names ) - # Try to get base path from artifact storage if available - base_path = None - if hasattr(self, "artifact_storage") and self.artifact_storage is not None: - base_path = str(self.artifact_storage.base_dataset_path) - display_sample_record( record=record, processor_data_to_display=processor_data_to_display, @@ -204,7 +198,6 @@ def display_sample_record( syntax_highlighting_theme=syntax_highlighting_theme, record_index=i, seed_column_names=seed_column_names, - base_path=base_path, ) if index is None: self._display_cycle_index = (self._display_cycle_index + 1) % num_records @@ -238,7 +231,6 @@ def display_sample_record( syntax_highlighting_theme: str = "dracula", record_index: int | None = None, seed_column_names: list[str] | None = None, - base_path: str | None = None, ): if isinstance(record, (dict, pd.Series)): record = pd.DataFrame([record]).iloc[0] @@ 
-420,7 +412,7 @@ def display_sample_record( # Display images at the bottom with captions (only in notebook) if len(images_to_display_later) > 0: for col_name, image_data in images_to_display_later: - _display_image_if_in_notebook(image_data, col_name, base_path=base_path) + _display_image_if_in_notebook(image_data, col_name) def get_truncated_list_as_string(long_list: list[Any], max_items: int = 2) -> str: From 92395447905b869370caa23c9c7732ac2823c6f2 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 20:02:40 -0700 Subject: [PATCH 42/69] remove unnecessary override of generate kwargs --- .../data-designer-config/src/data_designer/config/models.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/models.py b/packages/data-designer-config/src/data_designer/config/models.py index 0542a8b8..51954a74 100644 --- a/packages/data-designer-config/src/data_designer/config/models.py +++ b/packages/data-designer-config/src/data_designer/config/models.py @@ -453,10 +453,6 @@ class ImageInferenceParams(BaseInferenceParams): generation_type: Literal[GenerationType.IMAGE] = GenerationType.IMAGE - @property - def generate_kwargs(self) -> dict[str, Any]: - return super().generate_kwargs - InferenceParamsT: TypeAlias = Annotated[ ChatCompletionInferenceParams | EmbeddingInferenceParams | ImageInferenceParams, From 3a779aaabc8ae4cc75fba1c4db310c6ce8c60042 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 20:10:23 -0700 Subject: [PATCH 43/69] Restore some changes not needed --- .../dataset_builders/column_wise_builder.py | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py index 7bd578d0..9077e807 100644 --- 
a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/column_wise_builder.py @@ -11,7 +11,7 @@ from typing import TYPE_CHECKING, Callable from data_designer.config.column_configs import CustomColumnConfig -from data_designer.config.column_types import ColumnConfigT +from data_designer.config.column_types import ColumnConfigT, DataDesignerColumnType from data_designer.config.config_builder import BuilderConfig from data_designer.config.data_designer_config import DataDesignerConfig from data_designer.config.dataset_builders import BuildStage @@ -170,21 +170,15 @@ def process_preview(self, dataset: pd.DataFrame) -> pd.DataFrame: def _has_image_columns(self) -> bool: """Check if config has any image generation columns.""" - from data_designer.config.column_types import DataDesignerColumnType - return any(col.column_type == DataDesignerColumnType.IMAGE for col in self.single_column_configs) def _initialize_generators(self) -> list[ColumnGenerator]: - """Initialize column generators. 
- - Generators access media storage via ResourceProvider.artifact_storage.media_storage - """ - generators = [] - for config in self._column_configs: - generator_cls = self._registry.column_generators.get_for_config_type(type(config)) - generator = generator_cls(config=config, resource_provider=self._resource_provider) - generators.append(generator) - return generators + return [ + self._registry.column_generators.get_for_config_type(type(config))( + config=config, resource_provider=self._resource_provider + ) + for config in self._column_configs + ] def _write_builder_config(self) -> None: self.artifact_storage.mkdir_if_needed(self.artifact_storage.base_dataset_path) From 33b6cd9bf942ec3b6b9fcf365a4297b3f5ea1138 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 20:16:02 -0700 Subject: [PATCH 44/69] use a specific image generation exception instead of generic ModelAPIError --- .../src/data_designer/engine/models/errors.py | 3 ++ .../src/data_designer/engine/models/facade.py | 10 ++--- .../tests/engine/models/test_facade.py | 45 ++++++++++++++++++- 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/models/errors.py b/packages/data-designer-engine/src/data_designer/engine/models/errors.py index 3e1ddf01..8ca1ebfd 100644 --- a/packages/data-designer-engine/src/data_designer/engine/models/errors.py +++ b/packages/data-designer-engine/src/data_designer/engine/models/errors.py @@ -83,6 +83,9 @@ class ModelStructuredOutputError(DataDesignerError): ... class ModelGenerationValidationFailureError(DataDesignerError): ... +class ImageGenerationError(DataDesignerError): ... 
+ + class FormattedLLMErrorMessage(BaseModel): cause: str solution: str diff --git a/packages/data-designer-engine/src/data_designer/engine/models/facade.py b/packages/data-designer-engine/src/data_designer/engine/models/facade.py index a14231ab..447ad87b 100644 --- a/packages/data-designer-engine/src/data_designer/engine/models/facade.py +++ b/packages/data-designer-engine/src/data_designer/engine/models/facade.py @@ -18,7 +18,7 @@ from data_designer.engine.model_provider import ModelProviderRegistry from data_designer.engine.models.errors import ( GenerationValidationFailureError, - ModelAPIError, + ImageGenerationError, catch_llm_exceptions, get_exception_primary_cause, ) @@ -340,7 +340,7 @@ def generate_image( List of base64-encoded image strings (without data URI prefix) Raises: - ModelAPIError: If image generation fails or returns invalid data + ImageGenerationError: If image generation fails or returns invalid data """ logger.debug( f"Generating image with model {self.model_name!r}...", @@ -406,7 +406,7 @@ def _generate_image_chat_completion( # Validate response structure if not response.choices or len(response.choices) == 0: - raise ModelAPIError("Response missing choices") + raise ImageGenerationError("Image generation response missing choices") message = response.choices[0].message images = [] @@ -437,7 +437,7 @@ def _generate_image_chat_completion( images.append(b64) if not images: - raise ModelAPIError("No image data found in response") + raise ImageGenerationError("No image data found in response") return images @@ -469,7 +469,7 @@ def _generate_image_diffusion(self, prompt: str, skip_usage_tracking: bool = Fal # Validate response if not response.data or len(response.data) == 0: - raise ModelAPIError("Image generation returned no data") + raise ImageGenerationError("Image generation returned no data") # Return all images as list return [img.b64_json for img in response.data] diff --git a/packages/data-designer-engine/tests/engine/models/test_facade.py 
b/packages/data-designer-engine/tests/engine/models/test_facade.py index 0323ce98..b80a22e7 100644 --- a/packages/data-designer-engine/tests/engine/models/test_facade.py +++ b/packages/data-designer-engine/tests/engine/models/test_facade.py @@ -9,7 +9,7 @@ import pytest from data_designer.engine.mcp.errors import MCPConfigurationError, MCPToolError -from data_designer.engine.models.errors import ModelGenerationValidationFailureError +from data_designer.engine.models.errors import ImageGenerationError, ModelGenerationValidationFailureError from data_designer.engine.models.facade import ModelFacade from data_designer.engine.models.parsers.errors import ParserException from data_designer.engine.models.utils import ChatMessage @@ -1106,6 +1106,49 @@ def test_generate_image_skip_usage_tracking( assert stub_model_facade.usage_stats.image_usage.has_usage is False +@patch("data_designer.engine.models.facade.ModelFacade.completion", autospec=True) +def test_generate_image_chat_completion_no_choices( + mock_completion: Any, + stub_model_facade: ModelFacade, +) -> None: + """Test that generate_image raises ImageGenerationError when response has no choices.""" + mock_response = litellm.types.utils.ModelResponse(choices=[]) + mock_completion.return_value = mock_response + + with patch("data_designer.engine.models.facade.is_image_diffusion_model", return_value=False): + with pytest.raises(ImageGenerationError, match="Image generation response missing choices"): + stub_model_facade.generate_image(prompt="test prompt") + + +@patch("data_designer.engine.models.facade.ModelFacade.completion", autospec=True) +def test_generate_image_chat_completion_no_image_data( + mock_completion: Any, + stub_model_facade: ModelFacade, +) -> None: + """Test that generate_image raises ImageGenerationError when no image data in response.""" + mock_message = litellm.types.utils.Message(role="assistant", content="just text, no image") + mock_response = 
litellm.types.utils.ModelResponse(choices=[litellm.types.utils.Choices(message=mock_message)]) + mock_completion.return_value = mock_response + + with patch("data_designer.engine.models.facade.is_image_diffusion_model", return_value=False): + with pytest.raises(ImageGenerationError, match="No image data found in response"): + stub_model_facade.generate_image(prompt="test prompt") + + +@patch("data_designer.engine.models.facade.CustomRouter.image_generation", autospec=True) +def test_generate_image_diffusion_no_data( + mock_image_generation: Any, + stub_model_facade: ModelFacade, +) -> None: + """Test that generate_image raises ImageGenerationError when diffusion API returns no data.""" + mock_response = litellm.types.utils.ImageResponse(data=[]) + mock_image_generation.return_value = mock_response + + with patch("data_designer.engine.models.facade.is_image_diffusion_model", return_value=True): + with pytest.raises(ImageGenerationError, match="Image generation returned no data"): + stub_model_facade.generate_image(prompt="test prompt") + + @patch("data_designer.engine.models.facade.CustomRouter.image_generation", autospec=True) def test_generate_image_accumulates_usage( mock_image_generation: Any, From 3a98cafc0e9e08b1dd310d167054b062f20a7802 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 20:28:17 -0700 Subject: [PATCH 45/69] more cleanup --- .../src/data_designer/engine/models/facade.py | 2 +- .../engine/storage/media_storage.py | 10 - .../tests/engine/models/test_facade.py | 2 +- .../engine/storage/test_media_storage.py | 12 - .../integrations/huggingface/client.py | 4 +- pyproject.toml | 3 - uv.lock | 410 ------------------ 7 files changed, 4 insertions(+), 439 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/models/facade.py b/packages/data-designer-engine/src/data_designer/engine/models/facade.py index 447ad87b..e637d9f4 100644 --- a/packages/data-designer-engine/src/data_designer/engine/models/facade.py +++ 
b/packages/data-designer-engine/src/data_designer/engine/models/facade.py @@ -437,7 +437,7 @@ def _generate_image_chat_completion( images.append(b64) if not images: - raise ImageGenerationError("No image data found in response") + raise ImageGenerationError("No image data found in image generation response") return images diff --git a/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py b/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py index 9adefc89..3726b7f7 100644 --- a/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py @@ -3,7 +3,6 @@ from __future__ import annotations -import shutil import uuid from pathlib import Path @@ -30,10 +29,6 @@ class MediaStorage: Currently supports: - Images (PNG, JPG, WEBP) - Future support planned for: - - Audio - - Video - Storage modes: - DISK: Save media to disk and return relative paths (for dataset creation) - DATAFRAME: Return base64 data directly (for preview mode) @@ -130,8 +125,3 @@ def _validate_image(self, image_path: Path) -> None: # Clean up invalid file image_path.unlink(missing_ok=True) raise - - def cleanup(self) -> None: - """Clean up image directory (for preview mode).""" - if self.images_dir.exists(): - shutil.rmtree(self.images_dir) diff --git a/packages/data-designer-engine/tests/engine/models/test_facade.py b/packages/data-designer-engine/tests/engine/models/test_facade.py index b80a22e7..65c66896 100644 --- a/packages/data-designer-engine/tests/engine/models/test_facade.py +++ b/packages/data-designer-engine/tests/engine/models/test_facade.py @@ -1131,7 +1131,7 @@ def test_generate_image_chat_completion_no_image_data( mock_completion.return_value = mock_response with patch("data_designer.engine.models.facade.is_image_diffusion_model", return_value=False): - with pytest.raises(ImageGenerationError, match="No image data found in 
response"): + with pytest.raises(ImageGenerationError, match="No image data found in image generation response"): stub_model_facade.generate_image(prompt="test prompt") diff --git a/packages/data-designer-engine/tests/engine/storage/test_media_storage.py b/packages/data-designer-engine/tests/engine/storage/test_media_storage.py index 105348d2..3648486d 100644 --- a/packages/data-designer-engine/tests/engine/storage/test_media_storage.py +++ b/packages/data-designer-engine/tests/engine/storage/test_media_storage.py @@ -164,18 +164,6 @@ def test_save_base64_image_dataframe_mode_returns_base64(tmp_path, sample_base64 assert not storage.images_dir.exists() -def test_cleanup(media_storage, sample_base64_png): - """Test cleanup removes images directory.""" - # Save an image first - media_storage.save_base64_image(sample_base64_png, subfolder_name="test_column") - assert media_storage.images_dir.exists() - assert len(list(media_storage.images_dir.iterdir())) > 0 - - # Cleanup should remove directory - media_storage.cleanup() - assert not media_storage.images_dir.exists() - - def test_save_base64_image_with_subfolder_name(media_storage, sample_base64_png): """Test saving image with subfolder name organizes into subdirectory.""" subfolder = "test_subfolder" diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 2e84ee3c..1d0a0f0e 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -193,11 +193,11 @@ def _upload_images_folder(self, repo_id: str, images_folder: Path) -> None: if not images_folder.exists(): return - image_files = list(images_folder.glob("*")) + image_files = list(images_folder.rglob("*.*")) if not image_files: return - logger.info(f" |-- {RandomEmoji.loading()} Uploading {len(image_files)} images...") + logger.info(f" |-- 
{RandomEmoji.loading()} Uploading {len(image_files)} image files...") try: self._api.upload_folder( diff --git a/pyproject.toml b/pyproject.toml index 988e6f04..35566648 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,9 +14,6 @@ name = "data-designer-workspace" version = "0.0.0" # Placeholder, never used since package = false description = "DataDesigner monorepo workspace" requires-python = ">=3.10" -dependencies = [ - "matplotlib>=3.10.8", -] [build-system] requires = ["hatchling"] diff --git a/uv.lock b/uv.lock index 9a111de5..17306f0e 100644 --- a/uv.lock +++ b/uv.lock @@ -595,163 +595,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl", hash = "sha256:c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417", size = 7294, upload-time = "2025-07-25T14:02:02.896Z" }, ] -[[package]] -name = "contourpy" -version = "1.3.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.11'", -] -dependencies = [ - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/66/54/eb9bfc647b19f2009dd5c7f5ec51c4e6ca831725f1aea7a993034f483147/contourpy-1.3.2.tar.gz", hash = "sha256:b6945942715a034c671b7fc54f9588126b0b8bf23db2696e3ca8328f3ff0ab54", size = 13466130, upload-time = "2025-04-15T17:47:53.79Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/12/a3/da4153ec8fe25d263aa48c1a4cbde7f49b59af86f0b6f7862788c60da737/contourpy-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba38e3f9f330af820c4b27ceb4b9c7feee5fe0493ea53a8720f4792667465934", size = 268551, upload-time = "2025-04-15T17:34:46.581Z" }, - { url = 
"https://files.pythonhosted.org/packages/2f/6c/330de89ae1087eb622bfca0177d32a7ece50c3ef07b28002de4757d9d875/contourpy-1.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc41ba0714aa2968d1f8674ec97504a8f7e334f48eeacebcaa6256213acb0989", size = 253399, upload-time = "2025-04-15T17:34:51.427Z" }, - { url = "https://files.pythonhosted.org/packages/c1/bd/20c6726b1b7f81a8bee5271bed5c165f0a8e1f572578a9d27e2ccb763cb2/contourpy-1.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9be002b31c558d1ddf1b9b415b162c603405414bacd6932d031c5b5a8b757f0d", size = 312061, upload-time = "2025-04-15T17:34:55.961Z" }, - { url = "https://files.pythonhosted.org/packages/22/fc/a9665c88f8a2473f823cf1ec601de9e5375050f1958cbb356cdf06ef1ab6/contourpy-1.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8d2e74acbcba3bfdb6d9d8384cdc4f9260cae86ed9beee8bd5f54fee49a430b9", size = 351956, upload-time = "2025-04-15T17:35:00.992Z" }, - { url = "https://files.pythonhosted.org/packages/25/eb/9f0a0238f305ad8fb7ef42481020d6e20cf15e46be99a1fcf939546a177e/contourpy-1.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e259bced5549ac64410162adc973c5e2fb77f04df4a439d00b478e57a0e65512", size = 320872, upload-time = "2025-04-15T17:35:06.177Z" }, - { url = "https://files.pythonhosted.org/packages/32/5c/1ee32d1c7956923202f00cf8d2a14a62ed7517bdc0ee1e55301227fc273c/contourpy-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad687a04bc802cbe8b9c399c07162a3c35e227e2daccf1668eb1f278cb698631", size = 325027, upload-time = "2025-04-15T17:35:11.244Z" }, - { url = "https://files.pythonhosted.org/packages/83/bf/9baed89785ba743ef329c2b07fd0611d12bfecbedbdd3eeecf929d8d3b52/contourpy-1.3.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cdd22595308f53ef2f891040ab2b93d79192513ffccbd7fe19be7aa773a5e09f", size = 1306641, upload-time = "2025-04-15T17:35:26.701Z" }, - { url = 
"https://files.pythonhosted.org/packages/d4/cc/74e5e83d1e35de2d28bd97033426b450bc4fd96e092a1f7a63dc7369b55d/contourpy-1.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b4f54d6a2defe9f257327b0f243612dd051cc43825587520b1bf74a31e2f6ef2", size = 1374075, upload-time = "2025-04-15T17:35:43.204Z" }, - { url = "https://files.pythonhosted.org/packages/0c/42/17f3b798fd5e033b46a16f8d9fcb39f1aba051307f5ebf441bad1ecf78f8/contourpy-1.3.2-cp310-cp310-win32.whl", hash = "sha256:f939a054192ddc596e031e50bb13b657ce318cf13d264f095ce9db7dc6ae81c0", size = 177534, upload-time = "2025-04-15T17:35:46.554Z" }, - { url = "https://files.pythonhosted.org/packages/54/ec/5162b8582f2c994721018d0c9ece9dc6ff769d298a8ac6b6a652c307e7df/contourpy-1.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:c440093bbc8fc21c637c03bafcbef95ccd963bc6e0514ad887932c18ca2a759a", size = 221188, upload-time = "2025-04-15T17:35:50.064Z" }, - { url = "https://files.pythonhosted.org/packages/b3/b9/ede788a0b56fc5b071639d06c33cb893f68b1178938f3425debebe2dab78/contourpy-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6a37a2fb93d4df3fc4c0e363ea4d16f83195fc09c891bc8ce072b9d084853445", size = 269636, upload-time = "2025-04-15T17:35:54.473Z" }, - { url = "https://files.pythonhosted.org/packages/e6/75/3469f011d64b8bbfa04f709bfc23e1dd71be54d05b1b083be9f5b22750d1/contourpy-1.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b7cd50c38f500bbcc9b6a46643a40e0913673f869315d8e70de0438817cb7773", size = 254636, upload-time = "2025-04-15T17:35:58.283Z" }, - { url = "https://files.pythonhosted.org/packages/8d/2f/95adb8dae08ce0ebca4fd8e7ad653159565d9739128b2d5977806656fcd2/contourpy-1.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6658ccc7251a4433eebd89ed2672c2ed96fba367fd25ca9512aa92a4b46c4f1", size = 313053, upload-time = "2025-04-15T17:36:03.235Z" }, - { url = 
"https://files.pythonhosted.org/packages/c3/a6/8ccf97a50f31adfa36917707fe39c9a0cbc24b3bbb58185577f119736cc9/contourpy-1.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:70771a461aaeb335df14deb6c97439973d253ae70660ca085eec25241137ef43", size = 352985, upload-time = "2025-04-15T17:36:08.275Z" }, - { url = "https://files.pythonhosted.org/packages/1d/b6/7925ab9b77386143f39d9c3243fdd101621b4532eb126743201160ffa7e6/contourpy-1.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65a887a6e8c4cd0897507d814b14c54a8c2e2aa4ac9f7686292f9769fcf9a6ab", size = 323750, upload-time = "2025-04-15T17:36:13.29Z" }, - { url = "https://files.pythonhosted.org/packages/c2/f3/20c5d1ef4f4748e52d60771b8560cf00b69d5c6368b5c2e9311bcfa2a08b/contourpy-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3859783aefa2b8355697f16642695a5b9792e7a46ab86da1118a4a23a51a33d7", size = 326246, upload-time = "2025-04-15T17:36:18.329Z" }, - { url = "https://files.pythonhosted.org/packages/8c/e5/9dae809e7e0b2d9d70c52b3d24cba134dd3dad979eb3e5e71f5df22ed1f5/contourpy-1.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:eab0f6db315fa4d70f1d8ab514e527f0366ec021ff853d7ed6a2d33605cf4b83", size = 1308728, upload-time = "2025-04-15T17:36:33.878Z" }, - { url = "https://files.pythonhosted.org/packages/e2/4a/0058ba34aeea35c0b442ae61a4f4d4ca84d6df8f91309bc2d43bb8dd248f/contourpy-1.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d91a3ccc7fea94ca0acab82ceb77f396d50a1f67412efe4c526f5d20264e6ecd", size = 1375762, upload-time = "2025-04-15T17:36:51.295Z" }, - { url = "https://files.pythonhosted.org/packages/09/33/7174bdfc8b7767ef2c08ed81244762d93d5c579336fc0b51ca57b33d1b80/contourpy-1.3.2-cp311-cp311-win32.whl", hash = "sha256:1c48188778d4d2f3d48e4643fb15d8608b1d01e4b4d6b0548d9b336c28fc9b6f", size = 178196, upload-time = "2025-04-15T17:36:55.002Z" }, - { url = 
"https://files.pythonhosted.org/packages/5e/fe/4029038b4e1c4485cef18e480b0e2cd2d755448bb071eb9977caac80b77b/contourpy-1.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:5ebac872ba09cb8f2131c46b8739a7ff71de28a24c869bcad554477eb089a878", size = 222017, upload-time = "2025-04-15T17:36:58.576Z" }, - { url = "https://files.pythonhosted.org/packages/34/f7/44785876384eff370c251d58fd65f6ad7f39adce4a093c934d4a67a7c6b6/contourpy-1.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4caf2bcd2969402bf77edc4cb6034c7dd7c0803213b3523f111eb7460a51b8d2", size = 271580, upload-time = "2025-04-15T17:37:03.105Z" }, - { url = "https://files.pythonhosted.org/packages/93/3b/0004767622a9826ea3d95f0e9d98cd8729015768075d61f9fea8eeca42a8/contourpy-1.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:82199cb78276249796419fe36b7386bd8d2cc3f28b3bc19fe2454fe2e26c4c15", size = 255530, upload-time = "2025-04-15T17:37:07.026Z" }, - { url = "https://files.pythonhosted.org/packages/e7/bb/7bd49e1f4fa805772d9fd130e0d375554ebc771ed7172f48dfcd4ca61549/contourpy-1.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:106fab697af11456fcba3e352ad50effe493a90f893fca6c2ca5c033820cea92", size = 307688, upload-time = "2025-04-15T17:37:11.481Z" }, - { url = "https://files.pythonhosted.org/packages/fc/97/e1d5dbbfa170725ef78357a9a0edc996b09ae4af170927ba8ce977e60a5f/contourpy-1.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d14f12932a8d620e307f715857107b1d1845cc44fdb5da2bc8e850f5ceba9f87", size = 347331, upload-time = "2025-04-15T17:37:18.212Z" }, - { url = "https://files.pythonhosted.org/packages/6f/66/e69e6e904f5ecf6901be3dd16e7e54d41b6ec6ae3405a535286d4418ffb4/contourpy-1.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:532fd26e715560721bb0d5fc7610fce279b3699b018600ab999d1be895b09415", size = 318963, upload-time = "2025-04-15T17:37:22.76Z" }, - { url = 
"https://files.pythonhosted.org/packages/a8/32/b8a1c8965e4f72482ff2d1ac2cd670ce0b542f203c8e1d34e7c3e6925da7/contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b383144cf2d2c29f01a1e8170f50dacf0eac02d64139dcd709a8ac4eb3cfe", size = 323681, upload-time = "2025-04-15T17:37:33.001Z" }, - { url = "https://files.pythonhosted.org/packages/30/c6/12a7e6811d08757c7162a541ca4c5c6a34c0f4e98ef2b338791093518e40/contourpy-1.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c49f73e61f1f774650a55d221803b101d966ca0c5a2d6d5e4320ec3997489441", size = 1308674, upload-time = "2025-04-15T17:37:48.64Z" }, - { url = "https://files.pythonhosted.org/packages/2a/8a/bebe5a3f68b484d3a2b8ffaf84704b3e343ef1addea528132ef148e22b3b/contourpy-1.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3d80b2c0300583228ac98d0a927a1ba6a2ba6b8a742463c564f1d419ee5b211e", size = 1380480, upload-time = "2025-04-15T17:38:06.7Z" }, - { url = "https://files.pythonhosted.org/packages/34/db/fcd325f19b5978fb509a7d55e06d99f5f856294c1991097534360b307cf1/contourpy-1.3.2-cp312-cp312-win32.whl", hash = "sha256:90df94c89a91b7362e1142cbee7568f86514412ab8a2c0d0fca72d7e91b62912", size = 178489, upload-time = "2025-04-15T17:38:10.338Z" }, - { url = "https://files.pythonhosted.org/packages/01/c8/fadd0b92ffa7b5eb5949bf340a63a4a496a6930a6c37a7ba0f12acb076d6/contourpy-1.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:8c942a01d9163e2e5cfb05cb66110121b8d07ad438a17f9e766317bcb62abf73", size = 223042, upload-time = "2025-04-15T17:38:14.239Z" }, - { url = "https://files.pythonhosted.org/packages/2e/61/5673f7e364b31e4e7ef6f61a4b5121c5f170f941895912f773d95270f3a2/contourpy-1.3.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:de39db2604ae755316cb5967728f4bea92685884b1e767b7c24e983ef5f771cb", size = 271630, upload-time = "2025-04-15T17:38:19.142Z" }, - { url = 
"https://files.pythonhosted.org/packages/ff/66/a40badddd1223822c95798c55292844b7e871e50f6bfd9f158cb25e0bd39/contourpy-1.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3f9e896f447c5c8618f1edb2bafa9a4030f22a575ec418ad70611450720b5b08", size = 255670, upload-time = "2025-04-15T17:38:23.688Z" }, - { url = "https://files.pythonhosted.org/packages/1e/c7/cf9fdee8200805c9bc3b148f49cb9482a4e3ea2719e772602a425c9b09f8/contourpy-1.3.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71e2bd4a1c4188f5c2b8d274da78faab884b59df20df63c34f74aa1813c4427c", size = 306694, upload-time = "2025-04-15T17:38:28.238Z" }, - { url = "https://files.pythonhosted.org/packages/dd/e7/ccb9bec80e1ba121efbffad7f38021021cda5be87532ec16fd96533bb2e0/contourpy-1.3.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de425af81b6cea33101ae95ece1f696af39446db9682a0b56daaa48cfc29f38f", size = 345986, upload-time = "2025-04-15T17:38:33.502Z" }, - { url = "https://files.pythonhosted.org/packages/dc/49/ca13bb2da90391fa4219fdb23b078d6065ada886658ac7818e5441448b78/contourpy-1.3.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:977e98a0e0480d3fe292246417239d2d45435904afd6d7332d8455981c408b85", size = 318060, upload-time = "2025-04-15T17:38:38.672Z" }, - { url = "https://files.pythonhosted.org/packages/c8/65/5245ce8c548a8422236c13ffcdcdada6a2a812c361e9e0c70548bb40b661/contourpy-1.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:434f0adf84911c924519d2b08fc10491dd282b20bdd3fa8f60fd816ea0b48841", size = 322747, upload-time = "2025-04-15T17:38:43.712Z" }, - { url = "https://files.pythonhosted.org/packages/72/30/669b8eb48e0a01c660ead3752a25b44fdb2e5ebc13a55782f639170772f9/contourpy-1.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c66c4906cdbc50e9cba65978823e6e00b45682eb09adbb78c9775b74eb222422", size = 1308895, upload-time = "2025-04-15T17:39:00.224Z" }, - { url = 
"https://files.pythonhosted.org/packages/05/5a/b569f4250decee6e8d54498be7bdf29021a4c256e77fe8138c8319ef8eb3/contourpy-1.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8b7fc0cd78ba2f4695fd0a6ad81a19e7e3ab825c31b577f384aa9d7817dc3bef", size = 1379098, upload-time = "2025-04-15T17:43:29.649Z" }, - { url = "https://files.pythonhosted.org/packages/19/ba/b227c3886d120e60e41b28740ac3617b2f2b971b9f601c835661194579f1/contourpy-1.3.2-cp313-cp313-win32.whl", hash = "sha256:15ce6ab60957ca74cff444fe66d9045c1fd3e92c8936894ebd1f3eef2fff075f", size = 178535, upload-time = "2025-04-15T17:44:44.532Z" }, - { url = "https://files.pythonhosted.org/packages/12/6e/2fed56cd47ca739b43e892707ae9a13790a486a3173be063681ca67d2262/contourpy-1.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:e1578f7eafce927b168752ed7e22646dad6cd9bca673c60bff55889fa236ebf9", size = 223096, upload-time = "2025-04-15T17:44:48.194Z" }, - { url = "https://files.pythonhosted.org/packages/54/4c/e76fe2a03014a7c767d79ea35c86a747e9325537a8b7627e0e5b3ba266b4/contourpy-1.3.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0475b1f6604896bc7c53bb070e355e9321e1bc0d381735421a2d2068ec56531f", size = 285090, upload-time = "2025-04-15T17:43:34.084Z" }, - { url = "https://files.pythonhosted.org/packages/7b/e2/5aba47debd55d668e00baf9651b721e7733975dc9fc27264a62b0dd26eb8/contourpy-1.3.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:c85bb486e9be652314bb5b9e2e3b0d1b2e643d5eec4992c0fbe8ac71775da739", size = 268643, upload-time = "2025-04-15T17:43:38.626Z" }, - { url = "https://files.pythonhosted.org/packages/a1/37/cd45f1f051fe6230f751cc5cdd2728bb3a203f5619510ef11e732109593c/contourpy-1.3.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:745b57db7758f3ffc05a10254edd3182a2a83402a89c00957a8e8a22f5582823", size = 310443, upload-time = "2025-04-15T17:43:44.522Z" }, - { url = 
"https://files.pythonhosted.org/packages/8b/a2/36ea6140c306c9ff6dd38e3bcec80b3b018474ef4d17eb68ceecd26675f4/contourpy-1.3.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:970e9173dbd7eba9b4e01aab19215a48ee5dd3f43cef736eebde064a171f89a5", size = 349865, upload-time = "2025-04-15T17:43:49.545Z" }, - { url = "https://files.pythonhosted.org/packages/95/b7/2fc76bc539693180488f7b6cc518da7acbbb9e3b931fd9280504128bf956/contourpy-1.3.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6c4639a9c22230276b7bffb6a850dfc8258a2521305e1faefe804d006b2e532", size = 321162, upload-time = "2025-04-15T17:43:54.203Z" }, - { url = "https://files.pythonhosted.org/packages/f4/10/76d4f778458b0aa83f96e59d65ece72a060bacb20cfbee46cf6cd5ceba41/contourpy-1.3.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc829960f34ba36aad4302e78eabf3ef16a3a100863f0d4eeddf30e8a485a03b", size = 327355, upload-time = "2025-04-15T17:44:01.025Z" }, - { url = "https://files.pythonhosted.org/packages/43/a3/10cf483ea683f9f8ab096c24bad3cce20e0d1dd9a4baa0e2093c1c962d9d/contourpy-1.3.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d32530b534e986374fc19eaa77fcb87e8a99e5431499949b828312bdcd20ac52", size = 1307935, upload-time = "2025-04-15T17:44:17.322Z" }, - { url = "https://files.pythonhosted.org/packages/78/73/69dd9a024444489e22d86108e7b913f3528f56cfc312b5c5727a44188471/contourpy-1.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e298e7e70cf4eb179cc1077be1c725b5fd131ebc81181bf0c03525c8abc297fd", size = 1372168, upload-time = "2025-04-15T17:44:33.43Z" }, - { url = "https://files.pythonhosted.org/packages/0f/1b/96d586ccf1b1a9d2004dd519b25fbf104a11589abfd05484ff12199cca21/contourpy-1.3.2-cp313-cp313t-win32.whl", hash = "sha256:d0e589ae0d55204991450bb5c23f571c64fe43adaa53f93fc902a84c96f52fe1", size = 189550, upload-time = "2025-04-15T17:44:37.092Z" }, - { url = 
"https://files.pythonhosted.org/packages/b0/e6/6000d0094e8a5e32ad62591c8609e269febb6e4db83a1c75ff8868b42731/contourpy-1.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:78e9253c3de756b3f6a5174d024c4835acd59eb3f8e2ca13e775dbffe1558f69", size = 238214, upload-time = "2025-04-15T17:44:40.827Z" }, - { url = "https://files.pythonhosted.org/packages/33/05/b26e3c6ecc05f349ee0013f0bb850a761016d89cec528a98193a48c34033/contourpy-1.3.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fd93cc7f3139b6dd7aab2f26a90dde0aa9fc264dbf70f6740d498a70b860b82c", size = 265681, upload-time = "2025-04-15T17:44:59.314Z" }, - { url = "https://files.pythonhosted.org/packages/2b/25/ac07d6ad12affa7d1ffed11b77417d0a6308170f44ff20fa1d5aa6333f03/contourpy-1.3.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:107ba8a6a7eec58bb475329e6d3b95deba9440667c4d62b9b6063942b61d7f16", size = 315101, upload-time = "2025-04-15T17:45:04.165Z" }, - { url = "https://files.pythonhosted.org/packages/8f/4d/5bb3192bbe9d3f27e3061a6a8e7733c9120e203cb8515767d30973f71030/contourpy-1.3.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ded1706ed0c1049224531b81128efbd5084598f18d8a2d9efae833edbd2b40ad", size = 220599, upload-time = "2025-04-15T17:45:08.456Z" }, - { url = "https://files.pythonhosted.org/packages/ff/c0/91f1215d0d9f9f343e4773ba6c9b89e8c0cc7a64a6263f21139da639d848/contourpy-1.3.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5f5964cdad279256c084b69c3f412b7801e15356b16efa9d78aa974041903da0", size = 266807, upload-time = "2025-04-15T17:45:15.535Z" }, - { url = "https://files.pythonhosted.org/packages/d4/79/6be7e90c955c0487e7712660d6cead01fa17bff98e0ea275737cc2bc8e71/contourpy-1.3.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49b65a95d642d4efa8f64ba12558fcb83407e58a2dfba9d796d77b63ccfcaff5", size = 318729, upload-time = "2025-04-15T17:45:20.166Z" }, - { url = 
"https://files.pythonhosted.org/packages/87/68/7f46fb537958e87427d98a4074bcde4b67a70b04900cfc5ce29bc2f556c1/contourpy-1.3.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8c5acb8dddb0752bf252e01a3035b21443158910ac16a3b0d20e7fed7d534ce5", size = 221791, upload-time = "2025-04-15T17:45:24.794Z" }, -] - -[[package]] -name = "contourpy" -version = "1.3.3" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.14'", - "python_full_version >= '3.12' and python_full_version < '3.14'", - "python_full_version == '3.11.*'", -] -dependencies = [ - { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/58/01/1253e6698a07380cd31a736d248a3f2a50a7c88779a1813da27503cadc2a/contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880", size = 13466174, upload-time = "2025-07-26T12:03:12.549Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/91/2e/c4390a31919d8a78b90e8ecf87cd4b4c4f05a5b48d05ec17db8e5404c6f4/contourpy-1.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:709a48ef9a690e1343202916450bc48b9e51c049b089c7f79a267b46cffcdaa1", size = 288773, upload-time = "2025-07-26T12:01:02.277Z" }, - { url = "https://files.pythonhosted.org/packages/0d/44/c4b0b6095fef4dc9c420e041799591e3b63e9619e3044f7f4f6c21c0ab24/contourpy-1.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:23416f38bfd74d5d28ab8429cc4d63fa67d5068bd711a85edb1c3fb0c3e2f381", size = 270149, upload-time = "2025-07-26T12:01:04.072Z" }, - { url = "https://files.pythonhosted.org/packages/30/2e/dd4ced42fefac8470661d7cb7e264808425e6c5d56d175291e93890cce09/contourpy-1.3.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:929ddf8c4c7f348e4c0a5a3a714b5c8542ffaa8c22954862a46ca1813b667ee7", size = 329222, upload-time = "2025-07-26T12:01:05.688Z" }, - { 
url = "https://files.pythonhosted.org/packages/f2/74/cc6ec2548e3d276c71389ea4802a774b7aa3558223b7bade3f25787fafc2/contourpy-1.3.3-cp311-cp311-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9e999574eddae35f1312c2b4b717b7885d4edd6cb46700e04f7f02db454e67c1", size = 377234, upload-time = "2025-07-26T12:01:07.054Z" }, - { url = "https://files.pythonhosted.org/packages/03/b3/64ef723029f917410f75c09da54254c5f9ea90ef89b143ccadb09df14c15/contourpy-1.3.3-cp311-cp311-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0bf67e0e3f482cb69779dd3061b534eb35ac9b17f163d851e2a547d56dba0a3a", size = 380555, upload-time = "2025-07-26T12:01:08.801Z" }, - { url = "https://files.pythonhosted.org/packages/5f/4b/6157f24ca425b89fe2eb7e7be642375711ab671135be21e6faa100f7448c/contourpy-1.3.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:51e79c1f7470158e838808d4a996fa9bac72c498e93d8ebe5119bc1e6becb0db", size = 355238, upload-time = "2025-07-26T12:01:10.319Z" }, - { url = "https://files.pythonhosted.org/packages/98/56/f914f0dd678480708a04cfd2206e7c382533249bc5001eb9f58aa693e200/contourpy-1.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:598c3aaece21c503615fd59c92a3598b428b2f01bfb4b8ca9c4edeecc2438620", size = 1326218, upload-time = "2025-07-26T12:01:12.659Z" }, - { url = "https://files.pythonhosted.org/packages/fb/d7/4a972334a0c971acd5172389671113ae82aa7527073980c38d5868ff1161/contourpy-1.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:322ab1c99b008dad206d406bb61d014cf0174df491ae9d9d0fac6a6fda4f977f", size = 1392867, upload-time = "2025-07-26T12:01:15.533Z" }, - { url = "https://files.pythonhosted.org/packages/75/3e/f2cc6cd56dc8cff46b1a56232eabc6feea52720083ea71ab15523daab796/contourpy-1.3.3-cp311-cp311-win32.whl", hash = "sha256:fd907ae12cd483cd83e414b12941c632a969171bf90fc937d0c9f268a31cafff", size = 183677, upload-time = "2025-07-26T12:01:17.088Z" }, - { url = 
"https://files.pythonhosted.org/packages/98/4b/9bd370b004b5c9d8045c6c33cf65bae018b27aca550a3f657cdc99acdbd8/contourpy-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:3519428f6be58431c56581f1694ba8e50626f2dd550af225f82fb5f5814d2a42", size = 225234, upload-time = "2025-07-26T12:01:18.256Z" }, - { url = "https://files.pythonhosted.org/packages/d9/b6/71771e02c2e004450c12b1120a5f488cad2e4d5b590b1af8bad060360fe4/contourpy-1.3.3-cp311-cp311-win_arm64.whl", hash = "sha256:15ff10bfada4bf92ec8b31c62bf7c1834c244019b4a33095a68000d7075df470", size = 193123, upload-time = "2025-07-26T12:01:19.848Z" }, - { url = "https://files.pythonhosted.org/packages/be/45/adfee365d9ea3d853550b2e735f9d66366701c65db7855cd07621732ccfc/contourpy-1.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b08a32ea2f8e42cf1d4be3169a98dd4be32bafe4f22b6c4cb4ba810fa9e5d2cb", size = 293419, upload-time = "2025-07-26T12:01:21.16Z" }, - { url = "https://files.pythonhosted.org/packages/53/3e/405b59cfa13021a56bba395a6b3aca8cec012b45bf177b0eaf7a202cde2c/contourpy-1.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:556dba8fb6f5d8742f2923fe9457dbdd51e1049c4a43fd3986a0b14a1d815fc6", size = 273979, upload-time = "2025-07-26T12:01:22.448Z" }, - { url = "https://files.pythonhosted.org/packages/d4/1c/a12359b9b2ca3a845e8f7f9ac08bdf776114eb931392fcad91743e2ea17b/contourpy-1.3.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92d9abc807cf7d0e047b95ca5d957cf4792fcd04e920ca70d48add15c1a90ea7", size = 332653, upload-time = "2025-07-26T12:01:24.155Z" }, - { url = "https://files.pythonhosted.org/packages/63/12/897aeebfb475b7748ea67b61e045accdfcf0d971f8a588b67108ed7f5512/contourpy-1.3.3-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b2e8faa0ed68cb29af51edd8e24798bb661eac3bd9f65420c1887b6ca89987c8", size = 379536, upload-time = "2025-07-26T12:01:25.91Z" }, - { url = 
"https://files.pythonhosted.org/packages/43/8a/a8c584b82deb248930ce069e71576fc09bd7174bbd35183b7943fb1064fd/contourpy-1.3.3-cp312-cp312-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:626d60935cf668e70a5ce6ff184fd713e9683fb458898e4249b63be9e28286ea", size = 384397, upload-time = "2025-07-26T12:01:27.152Z" }, - { url = "https://files.pythonhosted.org/packages/cc/8f/ec6289987824b29529d0dfda0d74a07cec60e54b9c92f3c9da4c0ac732de/contourpy-1.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d00e655fcef08aba35ec9610536bfe90267d7ab5ba944f7032549c55a146da1", size = 362601, upload-time = "2025-07-26T12:01:28.808Z" }, - { url = "https://files.pythonhosted.org/packages/05/0a/a3fe3be3ee2dceb3e615ebb4df97ae6f3828aa915d3e10549ce016302bd1/contourpy-1.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:451e71b5a7d597379ef572de31eeb909a87246974d960049a9848c3bc6c41bf7", size = 1331288, upload-time = "2025-07-26T12:01:31.198Z" }, - { url = "https://files.pythonhosted.org/packages/33/1d/acad9bd4e97f13f3e2b18a3977fe1b4a37ecf3d38d815333980c6c72e963/contourpy-1.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:459c1f020cd59fcfe6650180678a9993932d80d44ccde1fa1868977438f0b411", size = 1403386, upload-time = "2025-07-26T12:01:33.947Z" }, - { url = "https://files.pythonhosted.org/packages/cf/8f/5847f44a7fddf859704217a99a23a4f6417b10e5ab1256a179264561540e/contourpy-1.3.3-cp312-cp312-win32.whl", hash = "sha256:023b44101dfe49d7d53932be418477dba359649246075c996866106da069af69", size = 185018, upload-time = "2025-07-26T12:01:35.64Z" }, - { url = "https://files.pythonhosted.org/packages/19/e8/6026ed58a64563186a9ee3f29f41261fd1828f527dd93d33b60feca63352/contourpy-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:8153b8bfc11e1e4d75bcb0bff1db232f9e10b274e0929de9d608027e0d34ff8b", size = 226567, upload-time = "2025-07-26T12:01:36.804Z" }, - { url = 
"https://files.pythonhosted.org/packages/d1/e2/f05240d2c39a1ed228d8328a78b6f44cd695f7ef47beb3e684cf93604f86/contourpy-1.3.3-cp312-cp312-win_arm64.whl", hash = "sha256:07ce5ed73ecdc4a03ffe3e1b3e3c1166db35ae7584be76f65dbbe28a7791b0cc", size = 193655, upload-time = "2025-07-26T12:01:37.999Z" }, - { url = "https://files.pythonhosted.org/packages/68/35/0167aad910bbdb9599272bd96d01a9ec6852f36b9455cf2ca67bd4cc2d23/contourpy-1.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:177fb367556747a686509d6fef71d221a4b198a3905fe824430e5ea0fda54eb5", size = 293257, upload-time = "2025-07-26T12:01:39.367Z" }, - { url = "https://files.pythonhosted.org/packages/96/e4/7adcd9c8362745b2210728f209bfbcf7d91ba868a2c5f40d8b58f54c509b/contourpy-1.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d002b6f00d73d69333dac9d0b8d5e84d9724ff9ef044fd63c5986e62b7c9e1b1", size = 274034, upload-time = "2025-07-26T12:01:40.645Z" }, - { url = "https://files.pythonhosted.org/packages/73/23/90e31ceeed1de63058a02cb04b12f2de4b40e3bef5e082a7c18d9c8ae281/contourpy-1.3.3-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:348ac1f5d4f1d66d3322420f01d42e43122f43616e0f194fc1c9f5d830c5b286", size = 334672, upload-time = "2025-07-26T12:01:41.942Z" }, - { url = "https://files.pythonhosted.org/packages/ed/93/b43d8acbe67392e659e1d984700e79eb67e2acb2bd7f62012b583a7f1b55/contourpy-1.3.3-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:655456777ff65c2c548b7c454af9c6f33f16c8884f11083244b5819cc214f1b5", size = 381234, upload-time = "2025-07-26T12:01:43.499Z" }, - { url = "https://files.pythonhosted.org/packages/46/3b/bec82a3ea06f66711520f75a40c8fc0b113b2a75edb36aa633eb11c4f50f/contourpy-1.3.3-cp313-cp313-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:644a6853d15b2512d67881586bd03f462c7ab755db95f16f14d7e238f2852c67", size = 385169, upload-time = "2025-07-26T12:01:45.219Z" }, - { url = 
"https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4debd64f124ca62069f313a9cb86656ff087786016d76927ae2cf37846b006c9", size = 362859, upload-time = "2025-07-26T12:01:46.519Z" }, - { url = "https://files.pythonhosted.org/packages/33/71/e2a7945b7de4e58af42d708a219f3b2f4cff7386e6b6ab0a0fa0033c49a9/contourpy-1.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a15459b0f4615b00bbd1e91f1b9e19b7e63aea7483d03d804186f278c0af2659", size = 1332062, upload-time = "2025-07-26T12:01:48.964Z" }, - { url = "https://files.pythonhosted.org/packages/12/fc/4e87ac754220ccc0e807284f88e943d6d43b43843614f0a8afa469801db0/contourpy-1.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca0fdcd73925568ca027e0b17ab07aad764be4706d0a925b89227e447d9737b7", size = 1403932, upload-time = "2025-07-26T12:01:51.979Z" }, - { url = "https://files.pythonhosted.org/packages/a6/2e/adc197a37443f934594112222ac1aa7dc9a98faf9c3842884df9a9d8751d/contourpy-1.3.3-cp313-cp313-win32.whl", hash = "sha256:b20c7c9a3bf701366556e1b1984ed2d0cedf999903c51311417cf5f591d8c78d", size = 185024, upload-time = "2025-07-26T12:01:53.245Z" }, - { url = "https://files.pythonhosted.org/packages/18/0b/0098c214843213759692cc638fce7de5c289200a830e5035d1791d7a2338/contourpy-1.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:1cadd8b8969f060ba45ed7c1b714fe69185812ab43bd6b86a9123fe8f99c3263", size = 226578, upload-time = "2025-07-26T12:01:54.422Z" }, - { url = "https://files.pythonhosted.org/packages/8a/9a/2f6024a0c5995243cd63afdeb3651c984f0d2bc727fd98066d40e141ad73/contourpy-1.3.3-cp313-cp313-win_arm64.whl", hash = "sha256:fd914713266421b7536de2bfa8181aa8c699432b6763a0ea64195ebe28bff6a9", size = 193524, upload-time = "2025-07-26T12:01:55.73Z" }, - { url = 
"https://files.pythonhosted.org/packages/c0/b3/f8a1a86bd3298513f500e5b1f5fd92b69896449f6cab6a146a5d52715479/contourpy-1.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:88df9880d507169449d434c293467418b9f6cbe82edd19284aa0409e7fdb933d", size = 306730, upload-time = "2025-07-26T12:01:57.051Z" }, - { url = "https://files.pythonhosted.org/packages/3f/11/4780db94ae62fc0c2053909b65dc3246bd7cecfc4f8a20d957ad43aa4ad8/contourpy-1.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d06bb1f751ba5d417047db62bca3c8fde202b8c11fb50742ab3ab962c81e8216", size = 287897, upload-time = "2025-07-26T12:01:58.663Z" }, - { url = "https://files.pythonhosted.org/packages/ae/15/e59f5f3ffdd6f3d4daa3e47114c53daabcb18574a26c21f03dc9e4e42ff0/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e4e6b05a45525357e382909a4c1600444e2a45b4795163d3b22669285591c1ae", size = 326751, upload-time = "2025-07-26T12:02:00.343Z" }, - { url = "https://files.pythonhosted.org/packages/0f/81/03b45cfad088e4770b1dcf72ea78d3802d04200009fb364d18a493857210/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ab3074b48c4e2cf1a960e6bbeb7f04566bf36b1861d5c9d4d8ac04b82e38ba20", size = 375486, upload-time = "2025-07-26T12:02:02.128Z" }, - { url = "https://files.pythonhosted.org/packages/0c/ba/49923366492ffbdd4486e970d421b289a670ae8cf539c1ea9a09822b371a/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c3d53c796f8647d6deb1abe867daeb66dcc8a97e8455efa729516b997b8ed99", size = 388106, upload-time = "2025-07-26T12:02:03.615Z" }, - { url = "https://files.pythonhosted.org/packages/9f/52/5b00ea89525f8f143651f9f03a0df371d3cbd2fccd21ca9b768c7a6500c2/contourpy-1.3.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50ed930df7289ff2a8d7afeb9603f8289e5704755c7e5c3bbd929c90c817164b", size = 352548, upload-time = "2025-07-26T12:02:05.165Z" }, - { url = 
"https://files.pythonhosted.org/packages/32/1d/a209ec1a3a3452d490f6b14dd92e72280c99ae3d1e73da74f8277d4ee08f/contourpy-1.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4feffb6537d64b84877da813a5c30f1422ea5739566abf0bd18065ac040e120a", size = 1322297, upload-time = "2025-07-26T12:02:07.379Z" }, - { url = "https://files.pythonhosted.org/packages/bc/9e/46f0e8ebdd884ca0e8877e46a3f4e633f6c9c8c4f3f6e72be3fe075994aa/contourpy-1.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2b7e9480ffe2b0cd2e787e4df64270e3a0440d9db8dc823312e2c940c167df7e", size = 1391023, upload-time = "2025-07-26T12:02:10.171Z" }, - { url = "https://files.pythonhosted.org/packages/b9/70/f308384a3ae9cd2209e0849f33c913f658d3326900d0ff5d378d6a1422d2/contourpy-1.3.3-cp313-cp313t-win32.whl", hash = "sha256:283edd842a01e3dcd435b1c5116798d661378d83d36d337b8dde1d16a5fc9ba3", size = 196157, upload-time = "2025-07-26T12:02:11.488Z" }, - { url = "https://files.pythonhosted.org/packages/b2/dd/880f890a6663b84d9e34a6f88cded89d78f0091e0045a284427cb6b18521/contourpy-1.3.3-cp313-cp313t-win_amd64.whl", hash = "sha256:87acf5963fc2b34825e5b6b048f40e3635dd547f590b04d2ab317c2619ef7ae8", size = 240570, upload-time = "2025-07-26T12:02:12.754Z" }, - { url = "https://files.pythonhosted.org/packages/80/99/2adc7d8ffead633234817ef8e9a87115c8a11927a94478f6bb3d3f4d4f7d/contourpy-1.3.3-cp313-cp313t-win_arm64.whl", hash = "sha256:3c30273eb2a55024ff31ba7d052dde990d7d8e5450f4bbb6e913558b3d6c2301", size = 199713, upload-time = "2025-07-26T12:02:14.4Z" }, - { url = "https://files.pythonhosted.org/packages/72/8b/4546f3ab60f78c514ffb7d01a0bd743f90de36f0019d1be84d0a708a580a/contourpy-1.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fde6c716d51c04b1c25d0b90364d0be954624a0ee9d60e23e850e8d48353d07a", size = 292189, upload-time = "2025-07-26T12:02:16.095Z" }, - { url = 
"https://files.pythonhosted.org/packages/fd/e1/3542a9cb596cadd76fcef413f19c79216e002623158befe6daa03dbfa88c/contourpy-1.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:cbedb772ed74ff5be440fa8eee9bd49f64f6e3fc09436d9c7d8f1c287b121d77", size = 273251, upload-time = "2025-07-26T12:02:17.524Z" }, - { url = "https://files.pythonhosted.org/packages/b1/71/f93e1e9471d189f79d0ce2497007731c1e6bf9ef6d1d61b911430c3db4e5/contourpy-1.3.3-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22e9b1bd7a9b1d652cd77388465dc358dafcd2e217d35552424aa4f996f524f5", size = 335810, upload-time = "2025-07-26T12:02:18.9Z" }, - { url = "https://files.pythonhosted.org/packages/91/f9/e35f4c1c93f9275d4e38681a80506b5510e9327350c51f8d4a5a724d178c/contourpy-1.3.3-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a22738912262aa3e254e4f3cb079a95a67132fc5a063890e224393596902f5a4", size = 382871, upload-time = "2025-07-26T12:02:20.418Z" }, - { url = "https://files.pythonhosted.org/packages/b5/71/47b512f936f66a0a900d81c396a7e60d73419868fba959c61efed7a8ab46/contourpy-1.3.3-cp314-cp314-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:afe5a512f31ee6bd7d0dda52ec9864c984ca3d66664444f2d72e0dc4eb832e36", size = 386264, upload-time = "2025-07-26T12:02:21.916Z" }, - { url = "https://files.pythonhosted.org/packages/04/5f/9ff93450ba96b09c7c2b3f81c94de31c89f92292f1380261bd7195bea4ea/contourpy-1.3.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f64836de09927cba6f79dcd00fdd7d5329f3fccc633468507079c829ca4db4e3", size = 363819, upload-time = "2025-07-26T12:02:23.759Z" }, - { url = "https://files.pythonhosted.org/packages/3e/a6/0b185d4cc480ee494945cde102cb0149ae830b5fa17bf855b95f2e70ad13/contourpy-1.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:1fd43c3be4c8e5fd6e4f2baeae35ae18176cf2e5cced681cca908addf1cdd53b", size = 1333650, upload-time = "2025-07-26T12:02:26.181Z" }, - { url = 
"https://files.pythonhosted.org/packages/43/d7/afdc95580ca56f30fbcd3060250f66cedbde69b4547028863abd8aa3b47e/contourpy-1.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6afc576f7b33cf00996e5c1102dc2a8f7cc89e39c0b55df93a0b78c1bd992b36", size = 1404833, upload-time = "2025-07-26T12:02:28.782Z" }, - { url = "https://files.pythonhosted.org/packages/e2/e2/366af18a6d386f41132a48f033cbd2102e9b0cf6345d35ff0826cd984566/contourpy-1.3.3-cp314-cp314-win32.whl", hash = "sha256:66c8a43a4f7b8df8b71ee1840e4211a3c8d93b214b213f590e18a1beca458f7d", size = 189692, upload-time = "2025-07-26T12:02:30.128Z" }, - { url = "https://files.pythonhosted.org/packages/7d/c2/57f54b03d0f22d4044b8afb9ca0e184f8b1afd57b4f735c2fa70883dc601/contourpy-1.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:cf9022ef053f2694e31d630feaacb21ea24224be1c3ad0520b13d844274614fd", size = 232424, upload-time = "2025-07-26T12:02:31.395Z" }, - { url = "https://files.pythonhosted.org/packages/18/79/a9416650df9b525737ab521aa181ccc42d56016d2123ddcb7b58e926a42c/contourpy-1.3.3-cp314-cp314-win_arm64.whl", hash = "sha256:95b181891b4c71de4bb404c6621e7e2390745f887f2a026b2d99e92c17892339", size = 198300, upload-time = "2025-07-26T12:02:32.956Z" }, - { url = "https://files.pythonhosted.org/packages/1f/42/38c159a7d0f2b7b9c04c64ab317042bb6952b713ba875c1681529a2932fe/contourpy-1.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:33c82d0138c0a062380332c861387650c82e4cf1747aaa6938b9b6516762e772", size = 306769, upload-time = "2025-07-26T12:02:34.2Z" }, - { url = "https://files.pythonhosted.org/packages/c3/6c/26a8205f24bca10974e77460de68d3d7c63e282e23782f1239f226fcae6f/contourpy-1.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ea37e7b45949df430fe649e5de8351c423430046a2af20b1c1961cae3afcda77", size = 287892, upload-time = "2025-07-26T12:02:35.807Z" }, - { url = 
"https://files.pythonhosted.org/packages/66/06/8a475c8ab718ebfd7925661747dbb3c3ee9c82ac834ccb3570be49d129f4/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d304906ecc71672e9c89e87c4675dc5c2645e1f4269a5063b99b0bb29f232d13", size = 326748, upload-time = "2025-07-26T12:02:37.193Z" }, - { url = "https://files.pythonhosted.org/packages/b4/a3/c5ca9f010a44c223f098fccd8b158bb1cb287378a31ac141f04730dc49be/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ca658cd1a680a5c9ea96dc61cdbae1e85c8f25849843aa799dfd3cb370ad4fbe", size = 375554, upload-time = "2025-07-26T12:02:38.894Z" }, - { url = "https://files.pythonhosted.org/packages/80/5b/68bd33ae63fac658a4145088c1e894405e07584a316738710b636c6d0333/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ab2fd90904c503739a75b7c8c5c01160130ba67944a7b77bbf36ef8054576e7f", size = 388118, upload-time = "2025-07-26T12:02:40.642Z" }, - { url = "https://files.pythonhosted.org/packages/40/52/4c285a6435940ae25d7410a6c36bda5145839bc3f0beb20c707cda18b9d2/contourpy-1.3.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7301b89040075c30e5768810bc96a8e8d78085b47d8be6e4c3f5a0b4ed478a0", size = 352555, upload-time = "2025-07-26T12:02:42.25Z" }, - { url = "https://files.pythonhosted.org/packages/24/ee/3e81e1dd174f5c7fefe50e85d0892de05ca4e26ef1c9a59c2a57e43b865a/contourpy-1.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2a2a8b627d5cc6b7c41a4beff6c5ad5eb848c88255fda4a8745f7e901b32d8e4", size = 1322295, upload-time = "2025-07-26T12:02:44.668Z" }, - { url = "https://files.pythonhosted.org/packages/3c/b2/6d913d4d04e14379de429057cd169e5e00f6c2af3bb13e1710bcbdb5da12/contourpy-1.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:fd6ec6be509c787f1caf6b247f0b1ca598bef13f4ddeaa126b7658215529ba0f", size = 1391027, upload-time = "2025-07-26T12:02:47.09Z" }, - { url = 
"https://files.pythonhosted.org/packages/93/8a/68a4ec5c55a2971213d29a9374913f7e9f18581945a7a31d1a39b5d2dfe5/contourpy-1.3.3-cp314-cp314t-win32.whl", hash = "sha256:e74a9a0f5e3fff48fb5a7f2fd2b9b70a3fe014a67522f79b7cca4c0c7e43c9ae", size = 202428, upload-time = "2025-07-26T12:02:48.691Z" }, - { url = "https://files.pythonhosted.org/packages/fa/96/fd9f641ffedc4fa3ace923af73b9d07e869496c9cc7a459103e6e978992f/contourpy-1.3.3-cp314-cp314t-win_amd64.whl", hash = "sha256:13b68d6a62db8eafaebb8039218921399baf6e47bf85006fd8529f2a08ef33fc", size = 250331, upload-time = "2025-07-26T12:02:50.137Z" }, - { url = "https://files.pythonhosted.org/packages/ae/8c/469afb6465b853afff216f9528ffda78a915ff880ed58813ba4faf4ba0b6/contourpy-1.3.3-cp314-cp314t-win_arm64.whl", hash = "sha256:b7448cb5a725bb1e35ce88771b86fba35ef418952474492cf7c764059933ff8b", size = 203831, upload-time = "2025-07-26T12:02:51.449Z" }, - { url = "https://files.pythonhosted.org/packages/a5/29/8dcfe16f0107943fa92388c23f6e05cff0ba58058c4c95b00280d4c75a14/contourpy-1.3.3-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:cd5dfcaeb10f7b7f9dc8941717c6c2ade08f587be2226222c12b25f0483ed497", size = 278809, upload-time = "2025-07-26T12:02:52.74Z" }, - { url = "https://files.pythonhosted.org/packages/85/a9/8b37ef4f7dafeb335daee3c8254645ef5725be4d9c6aa70b50ec46ef2f7e/contourpy-1.3.3-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:0c1fc238306b35f246d61a1d416a627348b5cf0648648a031e14bb8705fcdfe8", size = 261593, upload-time = "2025-07-26T12:02:54.037Z" }, - { url = "https://files.pythonhosted.org/packages/0a/59/ebfb8c677c75605cc27f7122c90313fd2f375ff3c8d19a1694bda74aaa63/contourpy-1.3.3-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:70f9aad7de812d6541d29d2bbf8feb22ff7e1c299523db288004e3157ff4674e", size = 302202, upload-time = "2025-07-26T12:02:55.947Z" }, - { url = 
"https://files.pythonhosted.org/packages/3c/37/21972a15834d90bfbfb009b9d004779bd5a07a0ec0234e5ba8f64d5736f4/contourpy-1.3.3-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ed3657edf08512fc3fe81b510e35c2012fbd3081d2e26160f27ca28affec989", size = 329207, upload-time = "2025-07-26T12:02:57.468Z" }, - { url = "https://files.pythonhosted.org/packages/0c/58/bd257695f39d05594ca4ad60df5bcb7e32247f9951fd09a9b8edb82d1daa/contourpy-1.3.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:3d1a3799d62d45c18bafd41c5fa05120b96a28079f2393af559b843d1a966a77", size = 225315, upload-time = "2025-07-26T12:02:58.801Z" }, -] - [[package]] name = "coverage" version = "7.13.2" @@ -921,15 +764,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0d/c3/e90f4a4feae6410f914f8ebac129b9ae7a8c92eb60a638012dde42030a9d/cryptography-46.0.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:6b5063083824e5509fdba180721d55909ffacccc8adbec85268b48439423d78c", size = 3438528, upload-time = "2025-10-15T23:18:26.227Z" }, ] -[[package]] -name = "cycler" -version = "0.12.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c", size = 7615, upload-time = "2023-10-07T05:32:18.335Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" }, -] - [[package]] name = "data-designer" source = { editable = "packages/data-designer" } @@ -1033,9 +867,6 @@ requires-dist = [ name = "data-designer-workspace" version = "0.0.0" source = { virtual = "." 
} -dependencies = [ - { name = "matplotlib" }, -] [package.dev-dependencies] dev = [ @@ -1069,7 +900,6 @@ recipes = [ ] [package.metadata] -requires-dist = [{ name = "matplotlib", specifier = ">=3.10.8" }] [package.metadata.requires-dev] dev = [ @@ -1394,63 +1224,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b5/36/7fb70f04bf00bc646cd5bb45aa9eddb15e19437a28b8fb2b4a5249fac770/filelock-3.20.3-py3-none-any.whl", hash = "sha256:4b0dda527ee31078689fc205ec4f1c1bf7d56cf88b6dc9426c4f230e46c2dce1", size = 16701, upload-time = "2026-01-09T17:55:04.334Z" }, ] -[[package]] -name = "fonttools" -version = "4.61.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ec/ca/cf17b88a8df95691275a3d77dc0a5ad9907f328ae53acbe6795da1b2f5ed/fonttools-4.61.1.tar.gz", hash = "sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69", size = 3565756, upload-time = "2025-12-12T17:31:24.246Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5b/94/8a28707adb00bed1bf22dac16ccafe60faf2ade353dcb32c3617ee917307/fonttools-4.61.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c7db70d57e5e1089a274cbb2b1fd635c9a24de809a231b154965d415d6c6d24", size = 2854799, upload-time = "2025-12-12T17:29:27.5Z" }, - { url = "https://files.pythonhosted.org/packages/94/93/c2e682faaa5ee92034818d8f8a8145ae73eb83619600495dcf8503fa7771/fonttools-4.61.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5fe9fd43882620017add5eabb781ebfbc6998ee49b35bd7f8f79af1f9f99a958", size = 2403032, upload-time = "2025-12-12T17:29:30.115Z" }, - { url = "https://files.pythonhosted.org/packages/f1/62/1748f7e7e1ee41aa52279fd2e3a6d0733dc42a673b16932bad8e5d0c8b28/fonttools-4.61.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8db08051fc9e7d8bc622f2112511b8107d8f27cd89e2f64ec45e9825e8288da", size = 4897863, upload-time = "2025-12-12T17:29:32.535Z" }, - { url = 
"https://files.pythonhosted.org/packages/69/69/4ca02ee367d2c98edcaeb83fc278d20972502ee071214ad9d8ca85e06080/fonttools-4.61.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a76d4cb80f41ba94a6691264be76435e5f72f2cb3cab0b092a6212855f71c2f6", size = 4859076, upload-time = "2025-12-12T17:29:34.907Z" }, - { url = "https://files.pythonhosted.org/packages/8c/f5/660f9e3cefa078861a7f099107c6d203b568a6227eef163dd173bfc56bdc/fonttools-4.61.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a13fc8aeb24bad755eea8f7f9d409438eb94e82cf86b08fe77a03fbc8f6a96b1", size = 4875623, upload-time = "2025-12-12T17:29:37.33Z" }, - { url = "https://files.pythonhosted.org/packages/63/d1/9d7c5091d2276ed47795c131c1bf9316c3c1ab2789c22e2f59e0572ccd38/fonttools-4.61.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b846a1fcf8beadeb9ea4f44ec5bdde393e2f1569e17d700bfc49cd69bde75881", size = 4993327, upload-time = "2025-12-12T17:29:39.781Z" }, - { url = "https://files.pythonhosted.org/packages/6f/2d/28def73837885ae32260d07660a052b99f0aa00454867d33745dfe49dbf0/fonttools-4.61.1-cp310-cp310-win32.whl", hash = "sha256:78a7d3ab09dc47ac1a363a493e6112d8cabed7ba7caad5f54dbe2f08676d1b47", size = 1502180, upload-time = "2025-12-12T17:29:42.217Z" }, - { url = "https://files.pythonhosted.org/packages/63/fa/bfdc98abb4dd2bd491033e85e3ba69a2313c850e759a6daa014bc9433b0f/fonttools-4.61.1-cp310-cp310-win_amd64.whl", hash = "sha256:eff1ac3cc66c2ac7cda1e64b4e2f3ffef474b7335f92fc3833fc632d595fcee6", size = 1550654, upload-time = "2025-12-12T17:29:44.564Z" }, - { url = "https://files.pythonhosted.org/packages/69/12/bf9f4eaa2fad039356cc627587e30ed008c03f1cebd3034376b5ee8d1d44/fonttools-4.61.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c6604b735bb12fef8e0efd5578c9fb5d3d8532d5001ea13a19cddf295673ee09", size = 2852213, upload-time = "2025-12-12T17:29:46.675Z" }, - { url = 
"https://files.pythonhosted.org/packages/ac/49/4138d1acb6261499bedde1c07f8c2605d1d8f9d77a151e5507fd3ef084b6/fonttools-4.61.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ce02f38a754f207f2f06557523cd39a06438ba3aafc0639c477ac409fc64e37", size = 2401689, upload-time = "2025-12-12T17:29:48.769Z" }, - { url = "https://files.pythonhosted.org/packages/e5/fe/e6ce0fe20a40e03aef906af60aa87668696f9e4802fa283627d0b5ed777f/fonttools-4.61.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77efb033d8d7ff233385f30c62c7c79271c8885d5c9657d967ede124671bbdfb", size = 5058809, upload-time = "2025-12-12T17:29:51.701Z" }, - { url = "https://files.pythonhosted.org/packages/79/61/1ca198af22f7dd22c17ab86e9024ed3c06299cfdb08170640e9996d501a0/fonttools-4.61.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:75c1a6dfac6abd407634420c93864a1e274ebc1c7531346d9254c0d8f6ca00f9", size = 5036039, upload-time = "2025-12-12T17:29:53.659Z" }, - { url = "https://files.pythonhosted.org/packages/99/cc/fa1801e408586b5fce4da9f5455af8d770f4fc57391cd5da7256bb364d38/fonttools-4.61.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0de30bfe7745c0d1ffa2b0b7048fb7123ad0d71107e10ee090fa0b16b9452e87", size = 5034714, upload-time = "2025-12-12T17:29:55.592Z" }, - { url = "https://files.pythonhosted.org/packages/bf/aa/b7aeafe65adb1b0a925f8f25725e09f078c635bc22754f3fecb7456955b0/fonttools-4.61.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:58b0ee0ab5b1fc9921eccfe11d1435added19d6494dde14e323f25ad2bc30c56", size = 5158648, upload-time = "2025-12-12T17:29:57.861Z" }, - { url = "https://files.pythonhosted.org/packages/99/f9/08ea7a38663328881384c6e7777bbefc46fd7d282adfd87a7d2b84ec9d50/fonttools-4.61.1-cp311-cp311-win32.whl", hash = "sha256:f79b168428351d11e10c5aeb61a74e1851ec221081299f4cf56036a95431c43a", size = 2280681, upload-time = "2025-12-12T17:29:59.943Z" }, - { url = 
"https://files.pythonhosted.org/packages/07/ad/37dd1ae5fa6e01612a1fbb954f0927681f282925a86e86198ccd7b15d515/fonttools-4.61.1-cp311-cp311-win_amd64.whl", hash = "sha256:fe2efccb324948a11dd09d22136fe2ac8a97d6c1347cf0b58a911dcd529f66b7", size = 2331951, upload-time = "2025-12-12T17:30:02.254Z" }, - { url = "https://files.pythonhosted.org/packages/6f/16/7decaa24a1bd3a70c607b2e29f0adc6159f36a7e40eaba59846414765fd4/fonttools-4.61.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f3cb4a569029b9f291f88aafc927dd53683757e640081ca8c412781ea144565e", size = 2851593, upload-time = "2025-12-12T17:30:04.225Z" }, - { url = "https://files.pythonhosted.org/packages/94/98/3c4cb97c64713a8cf499b3245c3bf9a2b8fd16a3e375feff2aed78f96259/fonttools-4.61.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41a7170d042e8c0024703ed13b71893519a1a6d6e18e933e3ec7507a2c26a4b2", size = 2400231, upload-time = "2025-12-12T17:30:06.47Z" }, - { url = "https://files.pythonhosted.org/packages/b7/37/82dbef0f6342eb01f54bca073ac1498433d6ce71e50c3c3282b655733b31/fonttools-4.61.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10d88e55330e092940584774ee5e8a6971b01fc2f4d3466a1d6c158230880796", size = 4954103, upload-time = "2025-12-12T17:30:08.432Z" }, - { url = "https://files.pythonhosted.org/packages/6c/44/f3aeac0fa98e7ad527f479e161aca6c3a1e47bb6996b053d45226fe37bf2/fonttools-4.61.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:15acc09befd16a0fb8a8f62bc147e1a82817542d72184acca9ce6e0aeda9fa6d", size = 5004295, upload-time = "2025-12-12T17:30:10.56Z" }, - { url = "https://files.pythonhosted.org/packages/14/e8/7424ced75473983b964d09f6747fa09f054a6d656f60e9ac9324cf40c743/fonttools-4.61.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e6bcdf33aec38d16508ce61fd81838f24c83c90a1d1b8c68982857038673d6b8", size = 4944109, upload-time = "2025-12-12T17:30:12.874Z" }, - { url = 
"https://files.pythonhosted.org/packages/c8/8b/6391b257fa3d0b553d73e778f953a2f0154292a7a7a085e2374b111e5410/fonttools-4.61.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5fade934607a523614726119164ff621e8c30e8fa1ffffbbd358662056ba69f0", size = 5093598, upload-time = "2025-12-12T17:30:15.79Z" }, - { url = "https://files.pythonhosted.org/packages/d9/71/fd2ea96cdc512d92da5678a1c98c267ddd4d8c5130b76d0f7a80f9a9fde8/fonttools-4.61.1-cp312-cp312-win32.whl", hash = "sha256:75da8f28eff26defba42c52986de97b22106cb8f26515b7c22443ebc9c2d3261", size = 2269060, upload-time = "2025-12-12T17:30:18.058Z" }, - { url = "https://files.pythonhosted.org/packages/80/3b/a3e81b71aed5a688e89dfe0e2694b26b78c7d7f39a5ffd8a7d75f54a12a8/fonttools-4.61.1-cp312-cp312-win_amd64.whl", hash = "sha256:497c31ce314219888c0e2fce5ad9178ca83fe5230b01a5006726cdf3ac9f24d9", size = 2319078, upload-time = "2025-12-12T17:30:22.862Z" }, - { url = "https://files.pythonhosted.org/packages/4b/cf/00ba28b0990982530addb8dc3e9e6f2fa9cb5c20df2abdda7baa755e8fe1/fonttools-4.61.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c56c488ab471628ff3bfa80964372fc13504ece601e0d97a78ee74126b2045c", size = 2846454, upload-time = "2025-12-12T17:30:24.938Z" }, - { url = "https://files.pythonhosted.org/packages/5a/ca/468c9a8446a2103ae645d14fee3f610567b7042aba85031c1c65e3ef7471/fonttools-4.61.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dc492779501fa723b04d0ab1f5be046797fee17d27700476edc7ee9ae535a61e", size = 2398191, upload-time = "2025-12-12T17:30:27.343Z" }, - { url = "https://files.pythonhosted.org/packages/a3/4b/d67eedaed19def5967fade3297fed8161b25ba94699efc124b14fb68cdbc/fonttools-4.61.1-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:64102ca87e84261419c3747a0d20f396eb024bdbeb04c2bfb37e2891f5fadcb5", size = 4928410, upload-time = "2025-12-12T17:30:29.771Z" }, - { url = 
"https://files.pythonhosted.org/packages/b0/8d/6fb3494dfe61a46258cd93d979cf4725ded4eb46c2a4ca35e4490d84daea/fonttools-4.61.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c1b526c8d3f615a7b1867f38a9410849c8f4aef078535742198e942fba0e9bd", size = 4984460, upload-time = "2025-12-12T17:30:32.073Z" }, - { url = "https://files.pythonhosted.org/packages/f7/f1/a47f1d30b3dc00d75e7af762652d4cbc3dff5c2697a0dbd5203c81afd9c3/fonttools-4.61.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:41ed4b5ec103bd306bb68f81dc166e77409e5209443e5773cb4ed837bcc9b0d3", size = 4925800, upload-time = "2025-12-12T17:30:34.339Z" }, - { url = "https://files.pythonhosted.org/packages/a7/01/e6ae64a0981076e8a66906fab01539799546181e32a37a0257b77e4aa88b/fonttools-4.61.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b501c862d4901792adaec7c25b1ecc749e2662543f68bb194c42ba18d6eec98d", size = 5067859, upload-time = "2025-12-12T17:30:36.593Z" }, - { url = "https://files.pythonhosted.org/packages/73/aa/28e40b8d6809a9b5075350a86779163f074d2b617c15d22343fce81918db/fonttools-4.61.1-cp313-cp313-win32.whl", hash = "sha256:4d7092bb38c53bbc78e9255a59158b150bcdc115a1e3b3ce0b5f267dc35dd63c", size = 2267821, upload-time = "2025-12-12T17:30:38.478Z" }, - { url = "https://files.pythonhosted.org/packages/1a/59/453c06d1d83dc0951b69ef692d6b9f1846680342927df54e9a1ca91c6f90/fonttools-4.61.1-cp313-cp313-win_amd64.whl", hash = "sha256:21e7c8d76f62ab13c9472ccf74515ca5b9a761d1bde3265152a6dc58700d895b", size = 2318169, upload-time = "2025-12-12T17:30:40.951Z" }, - { url = "https://files.pythonhosted.org/packages/32/8f/4e7bf82c0cbb738d3c2206c920ca34ca74ef9dabde779030145d28665104/fonttools-4.61.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:fff4f534200a04b4a36e7ae3cb74493afe807b517a09e99cb4faa89a34ed6ecd", size = 2846094, upload-time = "2025-12-12T17:30:43.511Z" }, - { url = 
"https://files.pythonhosted.org/packages/71/09/d44e45d0a4f3a651f23a1e9d42de43bc643cce2971b19e784cc67d823676/fonttools-4.61.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:d9203500f7c63545b4ce3799319fe4d9feb1a1b89b28d3cb5abd11b9dd64147e", size = 2396589, upload-time = "2025-12-12T17:30:45.681Z" }, - { url = "https://files.pythonhosted.org/packages/89/18/58c64cafcf8eb677a99ef593121f719e6dcbdb7d1c594ae5a10d4997ca8a/fonttools-4.61.1-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fa646ecec9528bef693415c79a86e733c70a4965dd938e9a226b0fc64c9d2e6c", size = 4877892, upload-time = "2025-12-12T17:30:47.709Z" }, - { url = "https://files.pythonhosted.org/packages/8a/ec/9e6b38c7ba1e09eb51db849d5450f4c05b7e78481f662c3b79dbde6f3d04/fonttools-4.61.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11f35ad7805edba3aac1a3710d104592df59f4b957e30108ae0ba6c10b11dd75", size = 4972884, upload-time = "2025-12-12T17:30:49.656Z" }, - { url = "https://files.pythonhosted.org/packages/5e/87/b5339da8e0256734ba0dbbf5b6cdebb1dd79b01dc8c270989b7bcd465541/fonttools-4.61.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b931ae8f62db78861b0ff1ac017851764602288575d65b8e8ff1963fed419063", size = 4924405, upload-time = "2025-12-12T17:30:51.735Z" }, - { url = "https://files.pythonhosted.org/packages/0b/47/e3409f1e1e69c073a3a6fd8cb886eb18c0bae0ee13db2c8d5e7f8495e8b7/fonttools-4.61.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b148b56f5de675ee16d45e769e69f87623a4944f7443850bf9a9376e628a89d2", size = 5035553, upload-time = "2025-12-12T17:30:54.823Z" }, - { url = "https://files.pythonhosted.org/packages/bf/b6/1f6600161b1073a984294c6c031e1a56ebf95b6164249eecf30012bb2e38/fonttools-4.61.1-cp314-cp314-win32.whl", hash = "sha256:9b666a475a65f4e839d3d10473fad6d47e0a9db14a2f4a224029c5bfde58ad2c", size = 2271915, upload-time = "2025-12-12T17:30:57.913Z" }, - { url = 
"https://files.pythonhosted.org/packages/52/7b/91e7b01e37cc8eb0e1f770d08305b3655e4f002fc160fb82b3390eabacf5/fonttools-4.61.1-cp314-cp314-win_amd64.whl", hash = "sha256:4f5686e1fe5fce75d82d93c47a438a25bf0d1319d2843a926f741140b2b16e0c", size = 2323487, upload-time = "2025-12-12T17:30:59.804Z" }, - { url = "https://files.pythonhosted.org/packages/39/5c/908ad78e46c61c3e3ed70c3b58ff82ab48437faf84ec84f109592cabbd9f/fonttools-4.61.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:e76ce097e3c57c4bcb67c5aa24a0ecdbd9f74ea9219997a707a4061fbe2707aa", size = 2929571, upload-time = "2025-12-12T17:31:02.574Z" }, - { url = "https://files.pythonhosted.org/packages/bd/41/975804132c6dea64cdbfbaa59f3518a21c137a10cccf962805b301ac6ab2/fonttools-4.61.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9cfef3ab326780c04d6646f68d4b4742aae222e8b8ea1d627c74e38afcbc9d91", size = 2435317, upload-time = "2025-12-12T17:31:04.974Z" }, - { url = "https://files.pythonhosted.org/packages/b0/5a/aef2a0a8daf1ebaae4cfd83f84186d4a72ee08fd6a8451289fcd03ffa8a4/fonttools-4.61.1-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a75c301f96db737e1c5ed5fd7d77d9c34466de16095a266509e13da09751bd19", size = 4882124, upload-time = "2025-12-12T17:31:07.456Z" }, - { url = "https://files.pythonhosted.org/packages/80/33/d6db3485b645b81cea538c9d1c9219d5805f0877fda18777add4671c5240/fonttools-4.61.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:91669ccac46bbc1d09e9273546181919064e8df73488ea087dcac3e2968df9ba", size = 5100391, upload-time = "2025-12-12T17:31:09.732Z" }, - { url = "https://files.pythonhosted.org/packages/6c/d6/675ba631454043c75fcf76f0ca5463eac8eb0666ea1d7badae5fea001155/fonttools-4.61.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c33ab3ca9d3ccd581d58e989d67554e42d8d4ded94ab3ade3508455fe70e65f7", size = 4978800, upload-time = "2025-12-12T17:31:11.681Z" }, - { url 
= "https://files.pythonhosted.org/packages/7f/33/d3ec753d547a8d2bdaedd390d4a814e8d5b45a093d558f025c6b990b554c/fonttools-4.61.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:664c5a68ec406f6b1547946683008576ef8b38275608e1cee6c061828171c118", size = 5006426, upload-time = "2025-12-12T17:31:13.764Z" }, - { url = "https://files.pythonhosted.org/packages/b4/40/cc11f378b561a67bea850ab50063366a0d1dd3f6d0a30ce0f874b0ad5664/fonttools-4.61.1-cp314-cp314t-win32.whl", hash = "sha256:aed04cabe26f30c1647ef0e8fbb207516fd40fe9472e9439695f5c6998e60ac5", size = 2335377, upload-time = "2025-12-12T17:31:16.49Z" }, - { url = "https://files.pythonhosted.org/packages/e4/ff/c9a2b66b39f8628531ea58b320d66d951267c98c6a38684daa8f50fb02f8/fonttools-4.61.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2180f14c141d2f0f3da43f3a81bc8aa4684860f6b0e6f9e165a4831f24e6a23b", size = 2400613, upload-time = "2025-12-12T17:31:18.769Z" }, - { url = "https://files.pythonhosted.org/packages/c7/4e/ce75a57ff3aebf6fc1f4e9d508b8e5810618a33d900ad6c19eb30b290b97/fonttools-4.61.1-py3-none-any.whl", hash = "sha256:17d2bf5d541add43822bcf0c43d7d847b160c9bb01d15d5007d84e2217aaa371", size = 1148996, upload-time = "2025-12-12T17:31:21.03Z" }, -] - [[package]] name = "fqdn" version = "1.5.1" @@ -2532,114 +2305,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/16/5a/736dd2f4535dbf3bf26523f9158c011389ef88dd06ec2eef67fd744f1c7b/jupytext-1.19.1-py3-none-any.whl", hash = "sha256:d8975035155d034bdfde5c0c37891425314b7ea8d3a6c4b5d18c294348714cd9", size = 170478, upload-time = "2026-01-25T21:35:11.17Z" }, ] -[[package]] -name = "kiwisolver" -version = "1.4.9" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/5c/3c/85844f1b0feb11ee581ac23fe5fce65cd049a200c1446708cc1b7f922875/kiwisolver-1.4.9.tar.gz", hash = "sha256:c3b22c26c6fd6811b0ae8363b95ca8ce4ea3c202d3d0975b2914310ceb1bcc4d", size = 97564, upload-time = "2025-08-10T21:27:49.279Z" } -wheels = [ - { 
url = "https://files.pythonhosted.org/packages/c6/5d/8ce64e36d4e3aac5ca96996457dcf33e34e6051492399a3f1fec5657f30b/kiwisolver-1.4.9-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b4b4d74bda2b8ebf4da5bd42af11d02d04428b2c32846e4c2c93219df8a7987b", size = 124159, upload-time = "2025-08-10T21:25:35.472Z" }, - { url = "https://files.pythonhosted.org/packages/96/1e/22f63ec454874378175a5f435d6ea1363dd33fb2af832c6643e4ccea0dc8/kiwisolver-1.4.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:fb3b8132019ea572f4611d770991000d7f58127560c4889729248eb5852a102f", size = 66578, upload-time = "2025-08-10T21:25:36.73Z" }, - { url = "https://files.pythonhosted.org/packages/41/4c/1925dcfff47a02d465121967b95151c82d11027d5ec5242771e580e731bd/kiwisolver-1.4.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:84fd60810829c27ae375114cd379da1fa65e6918e1da405f356a775d49a62bcf", size = 65312, upload-time = "2025-08-10T21:25:37.658Z" }, - { url = "https://files.pythonhosted.org/packages/d4/42/0f333164e6307a0687d1eb9ad256215aae2f4bd5d28f4653d6cd319a3ba3/kiwisolver-1.4.9-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b78efa4c6e804ecdf727e580dbb9cba85624d2e1c6b5cb059c66290063bd99a9", size = 1628458, upload-time = "2025-08-10T21:25:39.067Z" }, - { url = "https://files.pythonhosted.org/packages/86/b6/2dccb977d651943995a90bfe3495c2ab2ba5cd77093d9f2318a20c9a6f59/kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4efec7bcf21671db6a3294ff301d2fc861c31faa3c8740d1a94689234d1b415", size = 1225640, upload-time = "2025-08-10T21:25:40.489Z" }, - { url = "https://files.pythonhosted.org/packages/50/2b/362ebd3eec46c850ccf2bfe3e30f2fc4c008750011f38a850f088c56a1c6/kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:90f47e70293fc3688b71271100a1a5453aa9944a81d27ff779c108372cf5567b", size = 1244074, upload-time = "2025-08-10T21:25:42.221Z" }, - { url = 
"https://files.pythonhosted.org/packages/6f/bb/f09a1e66dab8984773d13184a10a29fe67125337649d26bdef547024ed6b/kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8fdca1def57a2e88ef339de1737a1449d6dbf5fab184c54a1fca01d541317154", size = 1293036, upload-time = "2025-08-10T21:25:43.801Z" }, - { url = "https://files.pythonhosted.org/packages/ea/01/11ecf892f201cafda0f68fa59212edaea93e96c37884b747c181303fccd1/kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:9cf554f21be770f5111a1690d42313e140355e687e05cf82cb23d0a721a64a48", size = 2175310, upload-time = "2025-08-10T21:25:45.045Z" }, - { url = "https://files.pythonhosted.org/packages/7f/5f/bfe11d5b934f500cc004314819ea92427e6e5462706a498c1d4fc052e08f/kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fc1795ac5cd0510207482c3d1d3ed781143383b8cfd36f5c645f3897ce066220", size = 2270943, upload-time = "2025-08-10T21:25:46.393Z" }, - { url = "https://files.pythonhosted.org/packages/3d/de/259f786bf71f1e03e73d87e2db1a9a3bcab64d7b4fd780167123161630ad/kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ccd09f20ccdbbd341b21a67ab50a119b64a403b09288c27481575105283c1586", size = 2440488, upload-time = "2025-08-10T21:25:48.074Z" }, - { url = "https://files.pythonhosted.org/packages/1b/76/c989c278faf037c4d3421ec07a5c452cd3e09545d6dae7f87c15f54e4edf/kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:540c7c72324d864406a009d72f5d6856f49693db95d1fbb46cf86febef873634", size = 2246787, upload-time = "2025-08-10T21:25:49.442Z" }, - { url = "https://files.pythonhosted.org/packages/a2/55/c2898d84ca440852e560ca9f2a0d28e6e931ac0849b896d77231929900e7/kiwisolver-1.4.9-cp310-cp310-win_amd64.whl", hash = "sha256:ede8c6d533bc6601a47ad4046080d36b8fc99f81e6f1c17b0ac3c2dc91ac7611", size = 73730, upload-time = "2025-08-10T21:25:51.102Z" }, - { url = 
"https://files.pythonhosted.org/packages/e8/09/486d6ac523dd33b80b368247f238125d027964cfacb45c654841e88fb2ae/kiwisolver-1.4.9-cp310-cp310-win_arm64.whl", hash = "sha256:7b4da0d01ac866a57dd61ac258c5607b4cd677f63abaec7b148354d2b2cdd536", size = 65036, upload-time = "2025-08-10T21:25:52.063Z" }, - { url = "https://files.pythonhosted.org/packages/6f/ab/c80b0d5a9d8a1a65f4f815f2afff9798b12c3b9f31f1d304dd233dd920e2/kiwisolver-1.4.9-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:eb14a5da6dc7642b0f3a18f13654847cd8b7a2550e2645a5bda677862b03ba16", size = 124167, upload-time = "2025-08-10T21:25:53.403Z" }, - { url = "https://files.pythonhosted.org/packages/a0/c0/27fe1a68a39cf62472a300e2879ffc13c0538546c359b86f149cc19f6ac3/kiwisolver-1.4.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:39a219e1c81ae3b103643d2aedb90f1ef22650deb266ff12a19e7773f3e5f089", size = 66579, upload-time = "2025-08-10T21:25:54.79Z" }, - { url = "https://files.pythonhosted.org/packages/31/a2/a12a503ac1fd4943c50f9822678e8015a790a13b5490354c68afb8489814/kiwisolver-1.4.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2405a7d98604b87f3fc28b1716783534b1b4b8510d8142adca34ee0bc3c87543", size = 65309, upload-time = "2025-08-10T21:25:55.76Z" }, - { url = "https://files.pythonhosted.org/packages/66/e1/e533435c0be77c3f64040d68d7a657771194a63c279f55573188161e81ca/kiwisolver-1.4.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:dc1ae486f9abcef254b5618dfb4113dd49f94c68e3e027d03cf0143f3f772b61", size = 1435596, upload-time = "2025-08-10T21:25:56.861Z" }, - { url = "https://files.pythonhosted.org/packages/67/1e/51b73c7347f9aabdc7215aa79e8b15299097dc2f8e67dee2b095faca9cb0/kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a1f570ce4d62d718dce3f179ee78dac3b545ac16c0c04bb363b7607a949c0d1", size = 1246548, upload-time = "2025-08-10T21:25:58.246Z" }, - { url = 
"https://files.pythonhosted.org/packages/21/aa/72a1c5d1e430294f2d32adb9542719cfb441b5da368d09d268c7757af46c/kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb27e7b78d716c591e88e0a09a2139c6577865d7f2e152488c2cc6257f460872", size = 1263618, upload-time = "2025-08-10T21:25:59.857Z" }, - { url = "https://files.pythonhosted.org/packages/a3/af/db1509a9e79dbf4c260ce0cfa3903ea8945f6240e9e59d1e4deb731b1a40/kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:15163165efc2f627eb9687ea5f3a28137217d217ac4024893d753f46bce9de26", size = 1317437, upload-time = "2025-08-10T21:26:01.105Z" }, - { url = "https://files.pythonhosted.org/packages/e0/f2/3ea5ee5d52abacdd12013a94130436e19969fa183faa1e7c7fbc89e9a42f/kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bdee92c56a71d2b24c33a7d4c2856bd6419d017e08caa7802d2963870e315028", size = 2195742, upload-time = "2025-08-10T21:26:02.675Z" }, - { url = "https://files.pythonhosted.org/packages/6f/9b/1efdd3013c2d9a2566aa6a337e9923a00590c516add9a1e89a768a3eb2fc/kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:412f287c55a6f54b0650bd9b6dce5aceddb95864a1a90c87af16979d37c89771", size = 2290810, upload-time = "2025-08-10T21:26:04.009Z" }, - { url = "https://files.pythonhosted.org/packages/fb/e5/cfdc36109ae4e67361f9bc5b41323648cb24a01b9ade18784657e022e65f/kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2c93f00dcba2eea70af2be5f11a830a742fe6b579a1d4e00f47760ef13be247a", size = 2461579, upload-time = "2025-08-10T21:26:05.317Z" }, - { url = "https://files.pythonhosted.org/packages/62/86/b589e5e86c7610842213994cdea5add00960076bef4ae290c5fa68589cac/kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f117e1a089d9411663a3207ba874f31be9ac8eaa5b533787024dc07aeb74f464", size = 2268071, upload-time = "2025-08-10T21:26:06.686Z" }, - { url = 
"https://files.pythonhosted.org/packages/3b/c6/f8df8509fd1eee6c622febe54384a96cfaf4d43bf2ccec7a0cc17e4715c9/kiwisolver-1.4.9-cp311-cp311-win_amd64.whl", hash = "sha256:be6a04e6c79819c9a8c2373317d19a96048e5a3f90bec587787e86a1153883c2", size = 73840, upload-time = "2025-08-10T21:26:07.94Z" }, - { url = "https://files.pythonhosted.org/packages/e2/2d/16e0581daafd147bc11ac53f032a2b45eabac897f42a338d0a13c1e5c436/kiwisolver-1.4.9-cp311-cp311-win_arm64.whl", hash = "sha256:0ae37737256ba2de764ddc12aed4956460277f00c4996d51a197e72f62f5eec7", size = 65159, upload-time = "2025-08-10T21:26:09.048Z" }, - { url = "https://files.pythonhosted.org/packages/86/c9/13573a747838aeb1c76e3267620daa054f4152444d1f3d1a2324b78255b5/kiwisolver-1.4.9-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ac5a486ac389dddcc5bef4f365b6ae3ffff2c433324fb38dd35e3fab7c957999", size = 123686, upload-time = "2025-08-10T21:26:10.034Z" }, - { url = "https://files.pythonhosted.org/packages/51/ea/2ecf727927f103ffd1739271ca19c424d0e65ea473fbaeea1c014aea93f6/kiwisolver-1.4.9-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f2ba92255faa7309d06fe44c3a4a97efe1c8d640c2a79a5ef728b685762a6fd2", size = 66460, upload-time = "2025-08-10T21:26:11.083Z" }, - { url = "https://files.pythonhosted.org/packages/5b/5a/51f5464373ce2aeb5194508298a508b6f21d3867f499556263c64c621914/kiwisolver-1.4.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a2899935e724dd1074cb568ce7ac0dce28b2cd6ab539c8e001a8578eb106d14", size = 64952, upload-time = "2025-08-10T21:26:12.058Z" }, - { url = "https://files.pythonhosted.org/packages/70/90/6d240beb0f24b74371762873e9b7f499f1e02166a2d9c5801f4dbf8fa12e/kiwisolver-1.4.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f6008a4919fdbc0b0097089f67a1eb55d950ed7e90ce2cc3e640abadd2757a04", size = 1474756, upload-time = "2025-08-10T21:26:13.096Z" }, - { url = 
"https://files.pythonhosted.org/packages/12/42/f36816eaf465220f683fb711efdd1bbf7a7005a2473d0e4ed421389bd26c/kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67bb8b474b4181770f926f7b7d2f8c0248cbcb78b660fdd41a47054b28d2a752", size = 1276404, upload-time = "2025-08-10T21:26:14.457Z" }, - { url = "https://files.pythonhosted.org/packages/2e/64/bc2de94800adc830c476dce44e9b40fd0809cddeef1fde9fcf0f73da301f/kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2327a4a30d3ee07d2fbe2e7933e8a37c591663b96ce42a00bc67461a87d7df77", size = 1294410, upload-time = "2025-08-10T21:26:15.73Z" }, - { url = "https://files.pythonhosted.org/packages/5f/42/2dc82330a70aa8e55b6d395b11018045e58d0bb00834502bf11509f79091/kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7a08b491ec91b1d5053ac177afe5290adacf1f0f6307d771ccac5de30592d198", size = 1343631, upload-time = "2025-08-10T21:26:17.045Z" }, - { url = "https://files.pythonhosted.org/packages/22/fd/f4c67a6ed1aab149ec5a8a401c323cee7a1cbe364381bb6c9c0d564e0e20/kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d8fc5c867c22b828001b6a38d2eaeb88160bf5783c6cb4a5e440efc981ce286d", size = 2224963, upload-time = "2025-08-10T21:26:18.737Z" }, - { url = "https://files.pythonhosted.org/packages/45/aa/76720bd4cb3713314677d9ec94dcc21ced3f1baf4830adde5bb9b2430a5f/kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:3b3115b2581ea35bb6d1f24a4c90af37e5d9b49dcff267eeed14c3893c5b86ab", size = 2321295, upload-time = "2025-08-10T21:26:20.11Z" }, - { url = "https://files.pythonhosted.org/packages/80/19/d3ec0d9ab711242f56ae0dc2fc5d70e298bb4a1f9dfab44c027668c673a1/kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:858e4c22fb075920b96a291928cb7dea5644e94c0ee4fcd5af7e865655e4ccf2", size = 2487987, upload-time = "2025-08-10T21:26:21.49Z" }, - { url = 
"https://files.pythonhosted.org/packages/39/e9/61e4813b2c97e86b6fdbd4dd824bf72d28bcd8d4849b8084a357bc0dd64d/kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ed0fecd28cc62c54b262e3736f8bb2512d8dcfdc2bcf08be5f47f96bf405b145", size = 2291817, upload-time = "2025-08-10T21:26:22.812Z" }, - { url = "https://files.pythonhosted.org/packages/a0/41/85d82b0291db7504da3c2defe35c9a8a5c9803a730f297bd823d11d5fb77/kiwisolver-1.4.9-cp312-cp312-win_amd64.whl", hash = "sha256:f68208a520c3d86ea51acf688a3e3002615a7f0238002cccc17affecc86a8a54", size = 73895, upload-time = "2025-08-10T21:26:24.37Z" }, - { url = "https://files.pythonhosted.org/packages/e2/92/5f3068cf15ee5cb624a0c7596e67e2a0bb2adee33f71c379054a491d07da/kiwisolver-1.4.9-cp312-cp312-win_arm64.whl", hash = "sha256:2c1a4f57df73965f3f14df20b80ee29e6a7930a57d2d9e8491a25f676e197c60", size = 64992, upload-time = "2025-08-10T21:26:25.732Z" }, - { url = "https://files.pythonhosted.org/packages/31/c1/c2686cda909742ab66c7388e9a1a8521a59eb89f8bcfbee28fc980d07e24/kiwisolver-1.4.9-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5d0432ccf1c7ab14f9949eec60c5d1f924f17c037e9f8b33352fa05799359b8", size = 123681, upload-time = "2025-08-10T21:26:26.725Z" }, - { url = "https://files.pythonhosted.org/packages/ca/f0/f44f50c9f5b1a1860261092e3bc91ecdc9acda848a8b8c6abfda4a24dd5c/kiwisolver-1.4.9-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efb3a45b35622bb6c16dbfab491a8f5a391fe0e9d45ef32f4df85658232ca0e2", size = 66464, upload-time = "2025-08-10T21:26:27.733Z" }, - { url = "https://files.pythonhosted.org/packages/2d/7a/9d90a151f558e29c3936b8a47ac770235f436f2120aca41a6d5f3d62ae8d/kiwisolver-1.4.9-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1a12cf6398e8a0a001a059747a1cbf24705e18fe413bc22de7b3d15c67cffe3f", size = 64961, upload-time = "2025-08-10T21:26:28.729Z" }, - { url = 
"https://files.pythonhosted.org/packages/e9/e9/f218a2cb3a9ffbe324ca29a9e399fa2d2866d7f348ec3a88df87fc248fc5/kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b67e6efbf68e077dd71d1a6b37e43e1a99d0bff1a3d51867d45ee8908b931098", size = 1474607, upload-time = "2025-08-10T21:26:29.798Z" }, - { url = "https://files.pythonhosted.org/packages/d9/28/aac26d4c882f14de59041636292bc838db8961373825df23b8eeb807e198/kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5656aa670507437af0207645273ccdfee4f14bacd7f7c67a4306d0dcaeaf6eed", size = 1276546, upload-time = "2025-08-10T21:26:31.401Z" }, - { url = "https://files.pythonhosted.org/packages/8b/ad/8bfc1c93d4cc565e5069162f610ba2f48ff39b7de4b5b8d93f69f30c4bed/kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:bfc08add558155345129c7803b3671cf195e6a56e7a12f3dde7c57d9b417f525", size = 1294482, upload-time = "2025-08-10T21:26:32.721Z" }, - { url = "https://files.pythonhosted.org/packages/da/f1/6aca55ff798901d8ce403206d00e033191f63d82dd708a186e0ed2067e9c/kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:40092754720b174e6ccf9e845d0d8c7d8e12c3d71e7fc35f55f3813e96376f78", size = 1343720, upload-time = "2025-08-10T21:26:34.032Z" }, - { url = "https://files.pythonhosted.org/packages/d1/91/eed031876c595c81d90d0f6fc681ece250e14bf6998c3d7c419466b523b7/kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:497d05f29a1300d14e02e6441cf0f5ee81c1ff5a304b0d9fb77423974684e08b", size = 2224907, upload-time = "2025-08-10T21:26:35.824Z" }, - { url = "https://files.pythonhosted.org/packages/e9/ec/4d1925f2e49617b9cca9c34bfa11adefad49d00db038e692a559454dfb2e/kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdd1a81a1860476eb41ac4bc1e07b3f07259e6d55bbf739b79c8aaedcf512799", size = 2321334, upload-time = "2025-08-10T21:26:37.534Z" }, - { url = 
"https://files.pythonhosted.org/packages/43/cb/450cd4499356f68802750c6ddc18647b8ea01ffa28f50d20598e0befe6e9/kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e6b93f13371d341afee3be9f7c5964e3fe61d5fa30f6a30eb49856935dfe4fc3", size = 2488313, upload-time = "2025-08-10T21:26:39.191Z" }, - { url = "https://files.pythonhosted.org/packages/71/67/fc76242bd99f885651128a5d4fa6083e5524694b7c88b489b1b55fdc491d/kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d75aa530ccfaa593da12834b86a0724f58bff12706659baa9227c2ccaa06264c", size = 2291970, upload-time = "2025-08-10T21:26:40.828Z" }, - { url = "https://files.pythonhosted.org/packages/75/bd/f1a5d894000941739f2ae1b65a32892349423ad49c2e6d0771d0bad3fae4/kiwisolver-1.4.9-cp313-cp313-win_amd64.whl", hash = "sha256:dd0a578400839256df88c16abddf9ba14813ec5f21362e1fe65022e00c883d4d", size = 73894, upload-time = "2025-08-10T21:26:42.33Z" }, - { url = "https://files.pythonhosted.org/packages/95/38/dce480814d25b99a391abbddadc78f7c117c6da34be68ca8b02d5848b424/kiwisolver-1.4.9-cp313-cp313-win_arm64.whl", hash = "sha256:d4188e73af84ca82468f09cadc5ac4db578109e52acb4518d8154698d3a87ca2", size = 64995, upload-time = "2025-08-10T21:26:43.889Z" }, - { url = "https://files.pythonhosted.org/packages/e2/37/7d218ce5d92dadc5ebdd9070d903e0c7cf7edfe03f179433ac4d13ce659c/kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:5a0f2724dfd4e3b3ac5a82436a8e6fd16baa7d507117e4279b660fe8ca38a3a1", size = 126510, upload-time = "2025-08-10T21:26:44.915Z" }, - { url = "https://files.pythonhosted.org/packages/23/b0/e85a2b48233daef4b648fb657ebbb6f8367696a2d9548a00b4ee0eb67803/kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:1b11d6a633e4ed84fc0ddafd4ebfd8ea49b3f25082c04ad12b8315c11d504dc1", size = 67903, upload-time = "2025-08-10T21:26:45.934Z" }, - { url = 
"https://files.pythonhosted.org/packages/44/98/f2425bc0113ad7de24da6bb4dae1343476e95e1d738be7c04d31a5d037fd/kiwisolver-1.4.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61874cdb0a36016354853593cffc38e56fc9ca5aa97d2c05d3dcf6922cd55a11", size = 66402, upload-time = "2025-08-10T21:26:47.101Z" }, - { url = "https://files.pythonhosted.org/packages/98/d8/594657886df9f34c4177cc353cc28ca7e6e5eb562d37ccc233bff43bbe2a/kiwisolver-1.4.9-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:60c439763a969a6af93b4881db0eed8fadf93ee98e18cbc35bc8da868d0c4f0c", size = 1582135, upload-time = "2025-08-10T21:26:48.665Z" }, - { url = "https://files.pythonhosted.org/packages/5c/c6/38a115b7170f8b306fc929e166340c24958347308ea3012c2b44e7e295db/kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92a2f997387a1b79a75e7803aa7ded2cfbe2823852ccf1ba3bcf613b62ae3197", size = 1389409, upload-time = "2025-08-10T21:26:50.335Z" }, - { url = "https://files.pythonhosted.org/packages/bf/3b/e04883dace81f24a568bcee6eb3001da4ba05114afa622ec9b6fafdc1f5e/kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a31d512c812daea6d8b3be3b2bfcbeb091dbb09177706569bcfc6240dcf8b41c", size = 1401763, upload-time = "2025-08-10T21:26:51.867Z" }, - { url = "https://files.pythonhosted.org/packages/9f/80/20ace48e33408947af49d7d15c341eaee69e4e0304aab4b7660e234d6288/kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:52a15b0f35dad39862d376df10c5230155243a2c1a436e39eb55623ccbd68185", size = 1453643, upload-time = "2025-08-10T21:26:53.592Z" }, - { url = "https://files.pythonhosted.org/packages/64/31/6ce4380a4cd1f515bdda976a1e90e547ccd47b67a1546d63884463c92ca9/kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a30fd6fdef1430fd9e1ba7b3398b5ee4e2887783917a687d86ba69985fb08748", size = 2330818, upload-time = "2025-08-10T21:26:55.051Z" }, - { url = 
"https://files.pythonhosted.org/packages/fa/e9/3f3fcba3bcc7432c795b82646306e822f3fd74df0ee81f0fa067a1f95668/kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:cc9617b46837c6468197b5945e196ee9ca43057bb7d9d1ae688101e4e1dddf64", size = 2419963, upload-time = "2025-08-10T21:26:56.421Z" }, - { url = "https://files.pythonhosted.org/packages/99/43/7320c50e4133575c66e9f7dadead35ab22d7c012a3b09bb35647792b2a6d/kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:0ab74e19f6a2b027ea4f845a78827969af45ce790e6cb3e1ebab71bdf9f215ff", size = 2594639, upload-time = "2025-08-10T21:26:57.882Z" }, - { url = "https://files.pythonhosted.org/packages/65/d6/17ae4a270d4a987ef8a385b906d2bdfc9fce502d6dc0d3aea865b47f548c/kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dba5ee5d3981160c28d5490f0d1b7ed730c22470ff7f6cc26cfcfaacb9896a07", size = 2391741, upload-time = "2025-08-10T21:26:59.237Z" }, - { url = "https://files.pythonhosted.org/packages/2a/8f/8f6f491d595a9e5912971f3f863d81baddccc8a4d0c3749d6a0dd9ffc9df/kiwisolver-1.4.9-cp313-cp313t-win_arm64.whl", hash = "sha256:0749fd8f4218ad2e851e11cc4dc05c7cbc0cbc4267bdfdb31782e65aace4ee9c", size = 68646, upload-time = "2025-08-10T21:27:00.52Z" }, - { url = "https://files.pythonhosted.org/packages/6b/32/6cc0fbc9c54d06c2969faa9c1d29f5751a2e51809dd55c69055e62d9b426/kiwisolver-1.4.9-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:9928fe1eb816d11ae170885a74d074f57af3a0d65777ca47e9aeb854a1fba386", size = 123806, upload-time = "2025-08-10T21:27:01.537Z" }, - { url = "https://files.pythonhosted.org/packages/b2/dd/2bfb1d4a4823d92e8cbb420fe024b8d2167f72079b3bb941207c42570bdf/kiwisolver-1.4.9-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d0005b053977e7b43388ddec89fa567f43d4f6d5c2c0affe57de5ebf290dc552", size = 66605, upload-time = "2025-08-10T21:27:03.335Z" }, - { url = 
"https://files.pythonhosted.org/packages/f7/69/00aafdb4e4509c2ca6064646cba9cd4b37933898f426756adb2cb92ebbed/kiwisolver-1.4.9-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2635d352d67458b66fd0667c14cb1d4145e9560d503219034a18a87e971ce4f3", size = 64925, upload-time = "2025-08-10T21:27:04.339Z" }, - { url = "https://files.pythonhosted.org/packages/43/dc/51acc6791aa14e5cb6d8a2e28cefb0dc2886d8862795449d021334c0df20/kiwisolver-1.4.9-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:767c23ad1c58c9e827b649a9ab7809fd5fd9db266a9cf02b0e926ddc2c680d58", size = 1472414, upload-time = "2025-08-10T21:27:05.437Z" }, - { url = "https://files.pythonhosted.org/packages/3d/bb/93fa64a81db304ac8a246f834d5094fae4b13baf53c839d6bb6e81177129/kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72d0eb9fba308b8311685c2268cf7d0a0639a6cd027d8128659f72bdd8a024b4", size = 1281272, upload-time = "2025-08-10T21:27:07.063Z" }, - { url = "https://files.pythonhosted.org/packages/70/e6/6df102916960fb8d05069d4bd92d6d9a8202d5a3e2444494e7cd50f65b7a/kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f68e4f3eeca8fb22cc3d731f9715a13b652795ef657a13df1ad0c7dc0e9731df", size = 1298578, upload-time = "2025-08-10T21:27:08.452Z" }, - { url = "https://files.pythonhosted.org/packages/7c/47/e142aaa612f5343736b087864dbaebc53ea8831453fb47e7521fa8658f30/kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d84cd4061ae292d8ac367b2c3fa3aad11cb8625a95d135fe93f286f914f3f5a6", size = 1345607, upload-time = "2025-08-10T21:27:10.125Z" }, - { url = "https://files.pythonhosted.org/packages/54/89/d641a746194a0f4d1a3670fb900d0dbaa786fb98341056814bc3f058fa52/kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a60ea74330b91bd22a29638940d115df9dc00af5035a9a2a6ad9399ffb4ceca5", size = 2230150, upload-time = "2025-08-10T21:27:11.484Z" }, - { url = 
"https://files.pythonhosted.org/packages/aa/6b/5ee1207198febdf16ac11f78c5ae40861b809cbe0e6d2a8d5b0b3044b199/kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:ce6a3a4e106cf35c2d9c4fa17c05ce0b180db622736845d4315519397a77beaf", size = 2325979, upload-time = "2025-08-10T21:27:12.917Z" }, - { url = "https://files.pythonhosted.org/packages/fc/ff/b269eefd90f4ae14dcc74973d5a0f6d28d3b9bb1afd8c0340513afe6b39a/kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:77937e5e2a38a7b48eef0585114fe7930346993a88060d0bf886086d2aa49ef5", size = 2491456, upload-time = "2025-08-10T21:27:14.353Z" }, - { url = "https://files.pythonhosted.org/packages/fc/d4/10303190bd4d30de547534601e259a4fbf014eed94aae3e5521129215086/kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:24c175051354f4a28c5d6a31c93906dc653e2bf234e8a4bbfb964892078898ce", size = 2294621, upload-time = "2025-08-10T21:27:15.808Z" }, - { url = "https://files.pythonhosted.org/packages/28/e0/a9a90416fce5c0be25742729c2ea52105d62eda6c4be4d803c2a7be1fa50/kiwisolver-1.4.9-cp314-cp314-win_amd64.whl", hash = "sha256:0763515d4df10edf6d06a3c19734e2566368980d21ebec439f33f9eb936c07b7", size = 75417, upload-time = "2025-08-10T21:27:17.436Z" }, - { url = "https://files.pythonhosted.org/packages/1f/10/6949958215b7a9a264299a7db195564e87900f709db9245e4ebdd3c70779/kiwisolver-1.4.9-cp314-cp314-win_arm64.whl", hash = "sha256:0e4e2bf29574a6a7b7f6cb5fa69293b9f96c928949ac4a53ba3f525dffb87f9c", size = 66582, upload-time = "2025-08-10T21:27:18.436Z" }, - { url = "https://files.pythonhosted.org/packages/ec/79/60e53067903d3bc5469b369fe0dfc6b3482e2133e85dae9daa9527535991/kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:d976bbb382b202f71c67f77b0ac11244021cfa3f7dfd9e562eefcea2df711548", size = 126514, upload-time = "2025-08-10T21:27:19.465Z" }, - { url = 
"https://files.pythonhosted.org/packages/25/d1/4843d3e8d46b072c12a38c97c57fab4608d36e13fe47d47ee96b4d61ba6f/kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2489e4e5d7ef9a1c300a5e0196e43d9c739f066ef23270607d45aba368b91f2d", size = 67905, upload-time = "2025-08-10T21:27:20.51Z" }, - { url = "https://files.pythonhosted.org/packages/8c/ae/29ffcbd239aea8b93108de1278271ae764dfc0d803a5693914975f200596/kiwisolver-1.4.9-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e2ea9f7ab7fbf18fffb1b5434ce7c69a07582f7acc7717720f1d69f3e806f90c", size = 66399, upload-time = "2025-08-10T21:27:21.496Z" }, - { url = "https://files.pythonhosted.org/packages/a1/ae/d7ba902aa604152c2ceba5d352d7b62106bedbccc8e95c3934d94472bfa3/kiwisolver-1.4.9-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b34e51affded8faee0dfdb705416153819d8ea9250bbbf7ea1b249bdeb5f1122", size = 1582197, upload-time = "2025-08-10T21:27:22.604Z" }, - { url = "https://files.pythonhosted.org/packages/f2/41/27c70d427eddb8bc7e4f16420a20fefc6f480312122a59a959fdfe0445ad/kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8aacd3d4b33b772542b2e01beb50187536967b514b00003bdda7589722d2a64", size = 1390125, upload-time = "2025-08-10T21:27:24.036Z" }, - { url = "https://files.pythonhosted.org/packages/41/42/b3799a12bafc76d962ad69083f8b43b12bf4fe78b097b12e105d75c9b8f1/kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7cf974dd4e35fa315563ac99d6287a1024e4dc2077b8a7d7cd3d2fb65d283134", size = 1402612, upload-time = "2025-08-10T21:27:25.773Z" }, - { url = "https://files.pythonhosted.org/packages/d2/b5/a210ea073ea1cfaca1bb5c55a62307d8252f531beb364e18aa1e0888b5a0/kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:85bd218b5ecfbee8c8a82e121802dcb519a86044c9c3b2e4aef02fa05c6da370", size = 1453990, upload-time = "2025-08-10T21:27:27.089Z" }, - { url = 
"https://files.pythonhosted.org/packages/5f/ce/a829eb8c033e977d7ea03ed32fb3c1781b4fa0433fbadfff29e39c676f32/kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:0856e241c2d3df4efef7c04a1e46b1936b6120c9bcf36dd216e3acd84bc4fb21", size = 2331601, upload-time = "2025-08-10T21:27:29.343Z" }, - { url = "https://files.pythonhosted.org/packages/e0/4b/b5e97eb142eb9cd0072dacfcdcd31b1c66dc7352b0f7c7255d339c0edf00/kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9af39d6551f97d31a4deebeac6f45b156f9755ddc59c07b402c148f5dbb6482a", size = 2422041, upload-time = "2025-08-10T21:27:30.754Z" }, - { url = "https://files.pythonhosted.org/packages/40/be/8eb4cd53e1b85ba4edc3a9321666f12b83113a178845593307a3e7891f44/kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:bb4ae2b57fc1d8cbd1cf7b1d9913803681ffa903e7488012be5b76dedf49297f", size = 2594897, upload-time = "2025-08-10T21:27:32.803Z" }, - { url = "https://files.pythonhosted.org/packages/99/dd/841e9a66c4715477ea0abc78da039832fbb09dac5c35c58dc4c41a407b8a/kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:aedff62918805fb62d43a4aa2ecd4482c380dc76cd31bd7c8878588a61bd0369", size = 2391835, upload-time = "2025-08-10T21:27:34.23Z" }, - { url = "https://files.pythonhosted.org/packages/0c/28/4b2e5c47a0da96896fdfdb006340ade064afa1e63675d01ea5ac222b6d52/kiwisolver-1.4.9-cp314-cp314t-win_amd64.whl", hash = "sha256:1fa333e8b2ce4d9660f2cda9c0e1b6bafcfb2457a9d259faa82289e73ec24891", size = 79988, upload-time = "2025-08-10T21:27:35.587Z" }, - { url = "https://files.pythonhosted.org/packages/80/be/3578e8afd18c88cdf9cb4cffde75a96d2be38c5a903f1ed0ceec061bd09e/kiwisolver-1.4.9-cp314-cp314t-win_arm64.whl", hash = "sha256:4a48a2ce79d65d363597ef7b567ce3d14d68783d2b2263d98db3d9477805ba32", size = 70260, upload-time = "2025-08-10T21:27:36.606Z" }, - { url = 
"https://files.pythonhosted.org/packages/a2/63/fde392691690f55b38d5dd7b3710f5353bf7a8e52de93a22968801ab8978/kiwisolver-1.4.9-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4d1d9e582ad4d63062d34077a9a1e9f3c34088a2ec5135b1f7190c07cf366527", size = 60183, upload-time = "2025-08-10T21:27:37.669Z" }, - { url = "https://files.pythonhosted.org/packages/27/b1/6aad34edfdb7cced27f371866f211332bba215bfd918ad3322a58f480d8b/kiwisolver-1.4.9-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:deed0c7258ceb4c44ad5ec7d9918f9f14fd05b2be86378d86cf50e63d1e7b771", size = 58675, upload-time = "2025-08-10T21:27:39.031Z" }, - { url = "https://files.pythonhosted.org/packages/9d/1a/23d855a702bb35a76faed5ae2ba3de57d323f48b1f6b17ee2176c4849463/kiwisolver-1.4.9-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a590506f303f512dff6b7f75fd2fd18e16943efee932008fe7140e5fa91d80e", size = 80277, upload-time = "2025-08-10T21:27:40.129Z" }, - { url = "https://files.pythonhosted.org/packages/5a/5b/5239e3c2b8fb5afa1e8508f721bb77325f740ab6994d963e61b2b7abcc1e/kiwisolver-1.4.9-pp310-pypy310_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e09c2279a4d01f099f52d5c4b3d9e208e91edcbd1a175c9662a8b16e000fece9", size = 77994, upload-time = "2025-08-10T21:27:41.181Z" }, - { url = "https://files.pythonhosted.org/packages/f9/1c/5d4d468fb16f8410e596ed0eac02d2c68752aa7dc92997fe9d60a7147665/kiwisolver-1.4.9-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c9e7cdf45d594ee04d5be1b24dd9d49f3d1590959b2271fb30b5ca2b262c00fb", size = 73744, upload-time = "2025-08-10T21:27:42.254Z" }, - { url = "https://files.pythonhosted.org/packages/a3/0f/36d89194b5a32c054ce93e586d4049b6c2c22887b0eb229c61c68afd3078/kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:720e05574713db64c356e86732c0f3c5252818d05f9df320f0ad8380641acea5", size = 60104, upload-time = "2025-08-10T21:27:43.287Z" }, - { url = 
"https://files.pythonhosted.org/packages/52/ba/4ed75f59e4658fd21fe7dde1fee0ac397c678ec3befba3fe6482d987af87/kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:17680d737d5335b552994a2008fab4c851bcd7de33094a82067ef3a576ff02fa", size = 58592, upload-time = "2025-08-10T21:27:44.314Z" }, - { url = "https://files.pythonhosted.org/packages/33/01/a8ea7c5ea32a9b45ceeaee051a04c8ed4320f5add3c51bfa20879b765b70/kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:85b5352f94e490c028926ea567fc569c52ec79ce131dadb968d3853e809518c2", size = 80281, upload-time = "2025-08-10T21:27:45.369Z" }, - { url = "https://files.pythonhosted.org/packages/da/e3/dbd2ecdce306f1d07a1aaf324817ee993aab7aee9db47ceac757deabafbe/kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:464415881e4801295659462c49461a24fb107c140de781d55518c4b80cb6790f", size = 78009, upload-time = "2025-08-10T21:27:46.376Z" }, - { url = "https://files.pythonhosted.org/packages/da/e9/0d4add7873a73e462aeb45c036a2dead2562b825aa46ba326727b3f31016/kiwisolver-1.4.9-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:fb940820c63a9590d31d88b815e7a3aa5915cad3ce735ab45f0c730b39547de1", size = 73929, upload-time = "2025-08-10T21:27:48.236Z" }, -] - [[package]] name = "lark" version = "1.3.1" @@ -2913,81 +2578,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/70/bc/6f1c2f612465f5fa89b95bead1f44dcb607670fd42891d8fdcd5d039f4f4/markupsafe-3.0.3-cp314-cp314t-win_arm64.whl", hash = "sha256:32001d6a8fc98c8cb5c947787c5d08b0a50663d139f1305bac5885d98d9b40fa", size = 14146, upload-time = "2025-09-27T18:37:28.327Z" }, ] -[[package]] -name = "matplotlib" -version = "3.10.8" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "contourpy", version = "1.3.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "contourpy", version = "1.3.3", 
source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "cycler" }, - { name = "fonttools" }, - { name = "kiwisolver" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "packaging" }, - { name = "pillow" }, - { name = "pyparsing" }, - { name = "python-dateutil" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/8a/76/d3c6e3a13fe484ebe7718d14e269c9569c4eb0020a968a327acb3b9a8fe6/matplotlib-3.10.8.tar.gz", hash = "sha256:2299372c19d56bcd35cf05a2738308758d32b9eaed2371898d8f5bd33f084aa3", size = 34806269, upload-time = "2025-12-10T22:56:51.155Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/58/be/a30bd917018ad220c400169fba298f2bb7003c8ccbc0c3e24ae2aacad1e8/matplotlib-3.10.8-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:00270d217d6b20d14b584c521f810d60c5c78406dc289859776550df837dcda7", size = 8239828, upload-time = "2025-12-10T22:55:02.313Z" }, - { url = "https://files.pythonhosted.org/packages/58/27/ca01e043c4841078e82cf6e80a6993dfecd315c3d79f5f3153afbb8e1ec6/matplotlib-3.10.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:37b3c1cc42aa184b3f738cfa18c1c1d72fd496d85467a6cf7b807936d39aa656", size = 8128050, upload-time = "2025-12-10T22:55:04.997Z" }, - { url = "https://files.pythonhosted.org/packages/cb/aa/7ab67f2b729ae6a91bcf9dcac0affb95fb8c56f7fd2b2af894ae0b0cf6fa/matplotlib-3.10.8-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ee40c27c795bda6a5292e9cff9890189d32f7e3a0bf04e0e3c9430c4a00c37df", size = 8700452, upload-time = "2025-12-10T22:55:07.47Z" }, - { url = 
"https://files.pythonhosted.org/packages/73/ae/2d5817b0acee3c49b7e7ccfbf5b273f284957cc8e270adf36375db353190/matplotlib-3.10.8-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a48f2b74020919552ea25d222d5cc6af9ca3f4eb43a93e14d068457f545c2a17", size = 9534928, upload-time = "2025-12-10T22:55:10.566Z" }, - { url = "https://files.pythonhosted.org/packages/c9/5b/8e66653e9f7c39cb2e5cab25fce4810daffa2bff02cbf5f3077cea9e942c/matplotlib-3.10.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f254d118d14a7f99d616271d6c3c27922c092dac11112670b157798b89bf4933", size = 9586377, upload-time = "2025-12-10T22:55:12.362Z" }, - { url = "https://files.pythonhosted.org/packages/e2/e2/fd0bbadf837f81edb0d208ba8f8cb552874c3b16e27cb91a31977d90875d/matplotlib-3.10.8-cp310-cp310-win_amd64.whl", hash = "sha256:f9b587c9c7274c1613a30afabf65a272114cd6cdbe67b3406f818c79d7ab2e2a", size = 8128127, upload-time = "2025-12-10T22:55:14.436Z" }, - { url = "https://files.pythonhosted.org/packages/f8/86/de7e3a1cdcfc941483af70609edc06b83e7c8a0e0dc9ac325200a3f4d220/matplotlib-3.10.8-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6be43b667360fef5c754dda5d25a32e6307a03c204f3c0fc5468b78fa87b4160", size = 8251215, upload-time = "2025-12-10T22:55:16.175Z" }, - { url = "https://files.pythonhosted.org/packages/fd/14/baad3222f424b19ce6ad243c71de1ad9ec6b2e4eb1e458a48fdc6d120401/matplotlib-3.10.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2b336e2d91a3d7006864e0990c83b216fcdca64b5a6484912902cef87313d78", size = 8139625, upload-time = "2025-12-10T22:55:17.712Z" }, - { url = "https://files.pythonhosted.org/packages/8f/a0/7024215e95d456de5883e6732e708d8187d9753a21d32f8ddb3befc0c445/matplotlib-3.10.8-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efb30e3baaea72ce5928e32bab719ab4770099079d66726a62b11b1ef7273be4", size = 8712614, upload-time = "2025-12-10T22:55:20.8Z" }, - { url = 
"https://files.pythonhosted.org/packages/5a/f4/b8347351da9a5b3f41e26cf547252d861f685c6867d179a7c9d60ad50189/matplotlib-3.10.8-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d56a1efd5bfd61486c8bc968fa18734464556f0fb8e51690f4ac25d85cbbbbc2", size = 9540997, upload-time = "2025-12-10T22:55:23.258Z" }, - { url = "https://files.pythonhosted.org/packages/9e/c0/c7b914e297efe0bc36917bf216b2acb91044b91e930e878ae12981e461e5/matplotlib-3.10.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:238b7ce5717600615c895050239ec955d91f321c209dd110db988500558e70d6", size = 9596825, upload-time = "2025-12-10T22:55:25.217Z" }, - { url = "https://files.pythonhosted.org/packages/6f/d3/a4bbc01c237ab710a1f22b4da72f4ff6d77eb4c7735ea9811a94ae239067/matplotlib-3.10.8-cp311-cp311-win_amd64.whl", hash = "sha256:18821ace09c763ec93aef5eeff087ee493a24051936d7b9ebcad9662f66501f9", size = 8135090, upload-time = "2025-12-10T22:55:27.162Z" }, - { url = "https://files.pythonhosted.org/packages/89/dd/a0b6588f102beab33ca6f5218b31725216577b2a24172f327eaf6417d5c9/matplotlib-3.10.8-cp311-cp311-win_arm64.whl", hash = "sha256:bab485bcf8b1c7d2060b4fcb6fc368a9e6f4cd754c9c2fea281f4be21df394a2", size = 8012377, upload-time = "2025-12-10T22:55:29.185Z" }, - { url = "https://files.pythonhosted.org/packages/9e/67/f997cdcbb514012eb0d10cd2b4b332667997fb5ebe26b8d41d04962fa0e6/matplotlib-3.10.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:64fcc24778ca0404ce0cb7b6b77ae1f4c7231cdd60e6778f999ee05cbd581b9a", size = 8260453, upload-time = "2025-12-10T22:55:30.709Z" }, - { url = "https://files.pythonhosted.org/packages/7e/65/07d5f5c7f7c994f12c768708bd2e17a4f01a2b0f44a1c9eccad872433e2e/matplotlib-3.10.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b9a5ca4ac220a0cdd1ba6bcba3608547117d30468fefce49bb26f55c1a3d5c58", size = 8148321, upload-time = "2025-12-10T22:55:33.265Z" }, - { url = 
"https://files.pythonhosted.org/packages/3e/f3/c5195b1ae57ef85339fd7285dfb603b22c8b4e79114bae5f4f0fcf688677/matplotlib-3.10.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3ab4aabc72de4ff77b3ec33a6d78a68227bf1123465887f9905ba79184a1cc04", size = 8716944, upload-time = "2025-12-10T22:55:34.922Z" }, - { url = "https://files.pythonhosted.org/packages/00/f9/7638f5cc82ec8a7aa005de48622eecc3ed7c9854b96ba15bd76b7fd27574/matplotlib-3.10.8-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:24d50994d8c5816ddc35411e50a86ab05f575e2530c02752e02538122613371f", size = 9550099, upload-time = "2025-12-10T22:55:36.789Z" }, - { url = "https://files.pythonhosted.org/packages/57/61/78cd5920d35b29fd2a0fe894de8adf672ff52939d2e9b43cb83cd5ce1bc7/matplotlib-3.10.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:99eefd13c0dc3b3c1b4d561c1169e65fe47aab7b8158754d7c084088e2329466", size = 9613040, upload-time = "2025-12-10T22:55:38.715Z" }, - { url = "https://files.pythonhosted.org/packages/30/4e/c10f171b6e2f44d9e3a2b96efa38b1677439d79c99357600a62cc1e9594e/matplotlib-3.10.8-cp312-cp312-win_amd64.whl", hash = "sha256:dd80ecb295460a5d9d260df63c43f4afbdd832d725a531f008dad1664f458adf", size = 8142717, upload-time = "2025-12-10T22:55:41.103Z" }, - { url = "https://files.pythonhosted.org/packages/f1/76/934db220026b5fef85f45d51a738b91dea7d70207581063cd9bd8fafcf74/matplotlib-3.10.8-cp312-cp312-win_arm64.whl", hash = "sha256:3c624e43ed56313651bc18a47f838b60d7b8032ed348911c54906b130b20071b", size = 8012751, upload-time = "2025-12-10T22:55:42.684Z" }, - { url = "https://files.pythonhosted.org/packages/3d/b9/15fd5541ef4f5b9a17eefd379356cf12175fe577424e7b1d80676516031a/matplotlib-3.10.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3f2e409836d7f5ac2f1c013110a4d50b9f7edc26328c108915f9075d7d7a91b6", size = 8261076, upload-time = "2025-12-10T22:55:44.648Z" }, - { url = 
"https://files.pythonhosted.org/packages/8d/a0/2ba3473c1b66b9c74dc7107c67e9008cb1782edbe896d4c899d39ae9cf78/matplotlib-3.10.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:56271f3dac49a88d7fca5060f004d9d22b865f743a12a23b1e937a0be4818ee1", size = 8148794, upload-time = "2025-12-10T22:55:46.252Z" }, - { url = "https://files.pythonhosted.org/packages/75/97/a471f1c3eb1fd6f6c24a31a5858f443891d5127e63a7788678d14e249aea/matplotlib-3.10.8-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a0a7f52498f72f13d4a25ea70f35f4cb60642b466cbb0a9be951b5bc3f45a486", size = 8718474, upload-time = "2025-12-10T22:55:47.864Z" }, - { url = "https://files.pythonhosted.org/packages/01/be/cd478f4b66f48256f42927d0acbcd63a26a893136456cd079c0cc24fbabf/matplotlib-3.10.8-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:646d95230efb9ca614a7a594d4fcacde0ac61d25e37dd51710b36477594963ce", size = 9549637, upload-time = "2025-12-10T22:55:50.048Z" }, - { url = "https://files.pythonhosted.org/packages/5d/7c/8dc289776eae5109e268c4fb92baf870678dc048a25d4ac903683b86d5bf/matplotlib-3.10.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f89c151aab2e2e23cb3fe0acad1e8b82841fd265379c4cecd0f3fcb34c15e0f6", size = 9613678, upload-time = "2025-12-10T22:55:52.21Z" }, - { url = "https://files.pythonhosted.org/packages/64/40/37612487cc8a437d4dd261b32ca21fe2d79510fe74af74e1f42becb1bdb8/matplotlib-3.10.8-cp313-cp313-win_amd64.whl", hash = "sha256:e8ea3e2d4066083e264e75c829078f9e149fa119d27e19acd503de65e0b13149", size = 8142686, upload-time = "2025-12-10T22:55:54.253Z" }, - { url = "https://files.pythonhosted.org/packages/66/52/8d8a8730e968185514680c2a6625943f70269509c3dcfc0dcf7d75928cb8/matplotlib-3.10.8-cp313-cp313-win_arm64.whl", hash = "sha256:c108a1d6fa78a50646029cb6d49808ff0fc1330fda87fa6f6250c6b5369b6645", size = 8012917, upload-time = "2025-12-10T22:55:56.268Z" }, - { url = 
"https://files.pythonhosted.org/packages/b5/27/51fe26e1062f298af5ef66343d8ef460e090a27fea73036c76c35821df04/matplotlib-3.10.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:ad3d9833a64cf48cc4300f2b406c3d0f4f4724a91c0bd5640678a6ba7c102077", size = 8305679, upload-time = "2025-12-10T22:55:57.856Z" }, - { url = "https://files.pythonhosted.org/packages/2c/1e/4de865bc591ac8e3062e835f42dd7fe7a93168d519557837f0e37513f629/matplotlib-3.10.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:eb3823f11823deade26ce3b9f40dcb4a213da7a670013929f31d5f5ed1055b22", size = 8198336, upload-time = "2025-12-10T22:55:59.371Z" }, - { url = "https://files.pythonhosted.org/packages/c6/cb/2f7b6e75fb4dce87ef91f60cac4f6e34f4c145ab036a22318ec837971300/matplotlib-3.10.8-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d9050fee89a89ed57b4fb2c1bfac9a3d0c57a0d55aed95949eedbc42070fea39", size = 8731653, upload-time = "2025-12-10T22:56:01.032Z" }, - { url = "https://files.pythonhosted.org/packages/46/b3/bd9c57d6ba670a37ab31fb87ec3e8691b947134b201f881665b28cc039ff/matplotlib-3.10.8-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b44d07310e404ba95f8c25aa5536f154c0a8ec473303535949e52eb71d0a1565", size = 9561356, upload-time = "2025-12-10T22:56:02.95Z" }, - { url = "https://files.pythonhosted.org/packages/c0/3d/8b94a481456dfc9dfe6e39e93b5ab376e50998cddfd23f4ae3b431708f16/matplotlib-3.10.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0a33deb84c15ede243aead39f77e990469fff93ad1521163305095b77b72ce4a", size = 9614000, upload-time = "2025-12-10T22:56:05.411Z" }, - { url = "https://files.pythonhosted.org/packages/bd/cd/bc06149fe5585ba800b189a6a654a75f1f127e8aab02fd2be10df7fa500c/matplotlib-3.10.8-cp313-cp313t-win_amd64.whl", hash = "sha256:3a48a78d2786784cc2413e57397981fb45c79e968d99656706018d6e62e57958", size = 8220043, upload-time = "2025-12-10T22:56:07.551Z" }, - { url = 
"https://files.pythonhosted.org/packages/e3/de/b22cf255abec916562cc04eef457c13e58a1990048de0c0c3604d082355e/matplotlib-3.10.8-cp313-cp313t-win_arm64.whl", hash = "sha256:15d30132718972c2c074cd14638c7f4592bd98719e2308bccea40e0538bc0cb5", size = 8062075, upload-time = "2025-12-10T22:56:09.178Z" }, - { url = "https://files.pythonhosted.org/packages/3c/43/9c0ff7a2f11615e516c3b058e1e6e8f9614ddeca53faca06da267c48345d/matplotlib-3.10.8-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b53285e65d4fa4c86399979e956235deb900be5baa7fc1218ea67fbfaeaadd6f", size = 8262481, upload-time = "2025-12-10T22:56:10.885Z" }, - { url = "https://files.pythonhosted.org/packages/6f/ca/e8ae28649fcdf039fda5ef554b40a95f50592a3c47e6f7270c9561c12b07/matplotlib-3.10.8-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:32f8dce744be5569bebe789e46727946041199030db8aeb2954d26013a0eb26b", size = 8151473, upload-time = "2025-12-10T22:56:12.377Z" }, - { url = "https://files.pythonhosted.org/packages/f1/6f/009d129ae70b75e88cbe7e503a12a4c0670e08ed748a902c2568909e9eb5/matplotlib-3.10.8-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cf267add95b1c88300d96ca837833d4112756045364f5c734a2276038dae27d", size = 9553896, upload-time = "2025-12-10T22:56:14.432Z" }, - { url = "https://files.pythonhosted.org/packages/f5/26/4221a741eb97967bc1fd5e4c52b9aa5a91b2f4ec05b59f6def4d820f9df9/matplotlib-3.10.8-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2cf5bd12cecf46908f286d7838b2abc6c91cda506c0445b8223a7c19a00df008", size = 9824193, upload-time = "2025-12-10T22:56:16.29Z" }, - { url = "https://files.pythonhosted.org/packages/1f/f3/3abf75f38605772cf48a9daf5821cd4f563472f38b4b828c6fba6fa6d06e/matplotlib-3.10.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:41703cc95688f2516b480f7f339d8851a6035f18e100ee6a32bc0b8536a12a9c", size = 9615444, upload-time = "2025-12-10T22:56:18.155Z" }, - { url = 
"https://files.pythonhosted.org/packages/93/a5/de89ac80f10b8dc615807ee1133cd99ac74082581196d4d9590bea10690d/matplotlib-3.10.8-cp314-cp314-win_amd64.whl", hash = "sha256:83d282364ea9f3e52363da262ce32a09dfe241e4080dcedda3c0db059d3c1f11", size = 8272719, upload-time = "2025-12-10T22:56:20.366Z" }, - { url = "https://files.pythonhosted.org/packages/69/ce/b006495c19ccc0a137b48083168a37bd056392dee02f87dba0472f2797fe/matplotlib-3.10.8-cp314-cp314-win_arm64.whl", hash = "sha256:2c1998e92cd5999e295a731bcb2911c75f597d937341f3030cc24ef2733d78a8", size = 8144205, upload-time = "2025-12-10T22:56:22.239Z" }, - { url = "https://files.pythonhosted.org/packages/68/d9/b31116a3a855bd313c6fcdb7226926d59b041f26061c6c5b1be66a08c826/matplotlib-3.10.8-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:b5a2b97dbdc7d4f353ebf343744f1d1f1cca8aa8bfddb4262fcf4306c3761d50", size = 8305785, upload-time = "2025-12-10T22:56:24.218Z" }, - { url = "https://files.pythonhosted.org/packages/1e/90/6effe8103f0272685767ba5f094f453784057072f49b393e3ea178fe70a5/matplotlib-3.10.8-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3f5c3e4da343bba819f0234186b9004faba952cc420fbc522dc4e103c1985908", size = 8198361, upload-time = "2025-12-10T22:56:26.787Z" }, - { url = "https://files.pythonhosted.org/packages/d7/65/a73188711bea603615fc0baecca1061429ac16940e2385433cc778a9d8e7/matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f62550b9a30afde8c1c3ae450e5eb547d579dd69b25c2fc7a1c67f934c1717a", size = 9561357, upload-time = "2025-12-10T22:56:28.953Z" }, - { url = "https://files.pythonhosted.org/packages/f4/3d/b5c5d5d5be8ce63292567f0e2c43dde9953d3ed86ac2de0a72e93c8f07a1/matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:495672de149445ec1b772ff2c9ede9b769e3cb4f0d0aa7fa730d7f59e2d4e1c1", size = 9823610, upload-time = "2025-12-10T22:56:31.455Z" }, - { url = 
"https://files.pythonhosted.org/packages/4d/4b/e7beb6bbd49f6bae727a12b270a2654d13c397576d25bd6786e47033300f/matplotlib-3.10.8-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:595ba4d8fe983b88f0eec8c26a241e16d6376fe1979086232f481f8f3f67494c", size = 9614011, upload-time = "2025-12-10T22:56:33.85Z" }, - { url = "https://files.pythonhosted.org/packages/7c/e6/76f2813d31f032e65f6f797e3f2f6e4aab95b65015924b1c51370395c28a/matplotlib-3.10.8-cp314-cp314t-win_amd64.whl", hash = "sha256:25d380fe8b1dc32cf8f0b1b448470a77afb195438bafdf1d858bfb876f3edf7b", size = 8362801, upload-time = "2025-12-10T22:56:36.107Z" }, - { url = "https://files.pythonhosted.org/packages/5d/49/d651878698a0b67f23aa28e17f45a6d6dd3d3f933fa29087fa4ce5947b5a/matplotlib-3.10.8-cp314-cp314t-win_arm64.whl", hash = "sha256:113bb52413ea508ce954a02c10ffd0d565f9c3bc7f2eddc27dfe1731e71c7b5f", size = 8192560, upload-time = "2025-12-10T22:56:38.008Z" }, - { url = "https://files.pythonhosted.org/packages/f5/43/31d59500bb950b0d188e149a2e552040528c13d6e3d6e84d0cccac593dcd/matplotlib-3.10.8-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:f97aeb209c3d2511443f8797e3e5a569aebb040d4f8bc79aa3ee78a8fb9e3dd8", size = 8237252, upload-time = "2025-12-10T22:56:39.529Z" }, - { url = "https://files.pythonhosted.org/packages/0c/2c/615c09984f3c5f907f51c886538ad785cf72e0e11a3225de2c0f9442aecc/matplotlib-3.10.8-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fb061f596dad3a0f52b60dc6a5dec4a0c300dec41e058a7efe09256188d170b7", size = 8124693, upload-time = "2025-12-10T22:56:41.758Z" }, - { url = "https://files.pythonhosted.org/packages/91/e1/2757277a1c56041e1fc104b51a0f7b9a4afc8eb737865d63cababe30bc61/matplotlib-3.10.8-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:12d90df9183093fcd479f4172ac26b322b1248b15729cb57f42f71f24c7e37a3", size = 8702205, upload-time = "2025-12-10T22:56:43.415Z" }, - { url = 
"https://files.pythonhosted.org/packages/04/30/3afaa31c757f34b7725ab9d2ba8b48b5e89c2019c003e7d0ead143aabc5a/matplotlib-3.10.8-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6da7c2ce169267d0d066adcf63758f0604aa6c3eebf67458930f9d9b79ad1db1", size = 8249198, upload-time = "2025-12-10T22:56:45.584Z" }, - { url = "https://files.pythonhosted.org/packages/48/2f/6334aec331f57485a642a7c8be03cb286f29111ae71c46c38b363230063c/matplotlib-3.10.8-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9153c3292705be9f9c64498a8872118540c3f4123d1a1c840172edf262c8be4a", size = 8136817, upload-time = "2025-12-10T22:56:47.339Z" }, - { url = "https://files.pythonhosted.org/packages/73/e4/6d6f14b2a759c622f191b2d67e9075a3f56aaccb3be4bb9bb6890030d0a0/matplotlib-3.10.8-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ae029229a57cd1e8fe542485f27e7ca7b23aa9e8944ddb4985d0bc444f1eca2", size = 8713867, upload-time = "2025-12-10T22:56:48.954Z" }, -] - [[package]] name = "matplotlib-inline" version = "0.2.1" From cd39941dfc6ac4d75cd93bb68d10d7ba6da18b45 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 20:28:42 -0700 Subject: [PATCH 46/69] more tests for hf image folder upload --- .../integrations/huggingface/test_client.py | 73 +++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index 735ea3bc..ba11a485 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -462,6 +462,79 @@ def test_validate_dataset_path_invalid_builder_config_json(tmp_path: Path) -> No client.upload_dataset("test/dataset", base_path, "Test") +def test_upload_dataset_uploads_images_folder( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: + """Test that 
upload_dataset uploads images when images folder exists with subfolders.""" + # Create images directory with column subfolders (matches MediaStorage structure) + images_dir = sample_dataset_path / "images" + col_dir = images_dir / "my_image_column" + col_dir.mkdir(parents=True) + (col_dir / "uuid1.png").write_bytes(b"fake png data") + (col_dir / "uuid2.png").write_bytes(b"fake png data") + + client = HuggingFaceHubClient(token="test-token") + client.upload_dataset(repo_id="test/dataset", base_dataset_path=sample_dataset_path, description="Test dataset") + + # Check that upload_folder was called for images + image_calls = [call for call in mock_hf_api.upload_folder.call_args_list if call.kwargs["path_in_repo"] == "images"] + assert len(image_calls) == 1 + assert image_calls[0].kwargs["folder_path"] == str(images_dir) + assert image_calls[0].kwargs["repo_type"] == "dataset" + + +def test_upload_dataset_skips_images_when_folder_missing( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: + """Test that upload_dataset skips images upload when images folder doesn't exist.""" + # sample_dataset_path has no images/ directory by default + client = HuggingFaceHubClient(token="test-token") + client.upload_dataset(repo_id="test/dataset", base_dataset_path=sample_dataset_path, description="Test dataset") + + # No upload_folder call should target "images" + image_calls = [call for call in mock_hf_api.upload_folder.call_args_list if call.kwargs["path_in_repo"] == "images"] + assert len(image_calls) == 0 + + +def test_upload_dataset_skips_images_when_folder_empty( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: + """Test that upload_dataset skips images upload when images folder exists but is empty.""" + images_dir = sample_dataset_path / "images" + images_dir.mkdir() + + client = HuggingFaceHubClient(token="test-token") + client.upload_dataset(repo_id="test/dataset", 
base_dataset_path=sample_dataset_path, description="Test dataset") + + image_calls = [call for call in mock_hf_api.upload_folder.call_args_list if call.kwargs["path_in_repo"] == "images"] + assert len(image_calls) == 0 + + +def test_upload_dataset_images_upload_failure( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: + """Test that upload_dataset raises error when images upload fails.""" + # Create images directory with a file + images_dir = sample_dataset_path / "images" + col_dir = images_dir / "col" + col_dir.mkdir(parents=True) + (col_dir / "img.png").write_bytes(b"fake") + + # Make upload_folder fail only for images + original_upload_folder = mock_hf_api.upload_folder + + def failing_upload_folder(**kwargs): + if kwargs.get("path_in_repo") == "images": + raise Exception("Network error") + return original_upload_folder(**kwargs) + + mock_hf_api.upload_folder.side_effect = failing_upload_folder + + client = HuggingFaceHubClient(token="test-token") + with pytest.raises(HuggingFaceHubClientUploadError, match="Failed to upload images"): + client.upload_dataset(repo_id="test/dataset", base_dataset_path=sample_dataset_path, description="Test dataset") + + def test_upload_dataset_invalid_repo_id(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: """Test upload_dataset fails with invalid repo_id.""" client = HuggingFaceHubClient(token="test-token") From 52e023d8e40fcd2591410fa09c0d38a9fd41885c Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 20:40:00 -0700 Subject: [PATCH 47/69] Fix test --- .../tests/integrations/huggingface/test_client.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index ba11a485..924a6bfe 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ 
b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -521,12 +521,9 @@ def test_upload_dataset_images_upload_failure( (col_dir / "img.png").write_bytes(b"fake") # Make upload_folder fail only for images - original_upload_folder = mock_hf_api.upload_folder - def failing_upload_folder(**kwargs): if kwargs.get("path_in_repo") == "images": raise Exception("Network error") - return original_upload_folder(**kwargs) mock_hf_api.upload_folder.side_effect = failing_upload_folder From c53a1dce4eeeb86f29d8d9f0017d3cf6ee7081aa Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 20:45:17 -0700 Subject: [PATCH 48/69] set init=False for media_storage --- .../data_designer/engine/dataset_builders/artifact_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py index a7316be3..7d2c6e72 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py @@ -47,7 +47,7 @@ class ArtifactStorage(BaseModel): partial_results_folder_name: str = "tmp-partial-parquet-files" dropped_columns_folder_name: str = "dropped-columns-parquet-files" processors_outputs_folder_name: str = PROCESSORS_OUTPUTS_FOLDER_NAME - media_storage: MediaStorage = Field(default=None, exclude=True) + media_storage: MediaStorage = Field(default=None, init=False, exclude=True) @property def artifact_path_exists(self) -> bool: From 8f813b1318f8431a70784b35bcc60b03f417da0f Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 9 Feb 2026 20:58:15 -0700 Subject: [PATCH 49/69] handle image url in _display_image_if_in_notebook --- .../config/utils/visualization.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git 
a/packages/data-designer-config/src/data_designer/config/utils/visualization.py b/packages/data-designer-config/src/data_designer/config/utils/visualization.py index 2132b83b..bd2876d4 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/visualization.py +++ b/packages/data-designer-config/src/data_designer/config/utils/visualization.py @@ -51,7 +51,7 @@ def _display_image_if_in_notebook(image_data: str, col_name: str) -> bool: """Display image with caption in Jupyter notebook if available. Args: - image_data: Base64-encoded image data, data URI, or file path. + image_data: Base64-encoded image data, data URI, file path, or URL. col_name: Name of the column (used for caption). Returns: @@ -63,7 +63,22 @@ def _display_image_if_in_notebook(image_data: str, col_name: str) -> bool: get_ipython() # This will raise NameError if not in IPython/Jupyter - # Check if it's a file path and load it + # Escape column name to prevent HTML injection + escaped_col_name = html.escape(col_name) + + # URLs: render directly as + if is_image_url(image_data): + escaped_url = html.escape(image_data) + html_content = f""" +
+
πŸ–ΌοΈ {escaped_col_name}
+ +
+ """ + display(HTML(html_content)) + return True + + # File paths: load from disk and convert to base64 if is_image_path(image_data) and not image_data.startswith("data:image/"): loaded_base64 = load_image_path_to_base64(image_data) if loaded_base64 is None: @@ -76,13 +91,7 @@ def _display_image_if_in_notebook(image_data: str, col_name: str) -> bool: base64_data = image_data # Extract base64 from data URI if present - base64_data = extract_base64_from_data_uri(base64_data) - - # Use the base64 data directly without resizing - img_base64 = base64_data - - # Escape column name to prevent HTML injection - escaped_col_name = html.escape(col_name) + img_base64 = extract_base64_from_data_uri(base64_data) # Create HTML with caption and image in left-aligned container html_content = f""" From 782a346a0c974050764584a4649f5d6b865c14b5 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 2026 11:26:18 -0700 Subject: [PATCH 50/69] Fix path traversal vulnerability in MediaStorage subfolder handling Sanitize subfolder names before using them in filesystem paths to prevent path traversal attacks via column names containing special characters like '../', '/', or '\'. 
Co-Authored-By: Claude Sonnet 4.5 --- .../engine/storage/media_storage.py | 12 +++++++-- .../engine/storage/test_media_storage.py | 25 +++++++++++++++++++ 2 files changed, 35 insertions(+), 2 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py b/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py index 3726b7f7..81387525 100644 --- a/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/storage/media_storage.py @@ -61,6 +61,11 @@ def _ensure_images_directory(self) -> None: """Create images directory if it doesn't exist (lazy initialization).""" self.images_dir.mkdir(parents=True, exist_ok=True) + def _sanitize_subfolder_name(self, name: str) -> str: + """Sanitize subfolder name to prevent path traversal and filesystem issues.""" + # Replace path separators and parent directory references with underscores + return name.replace("/", "_").replace("\\", "_").replace("..", "_") + def save_base64_image(self, base64_data: str, subfolder_name: str) -> str: """Save or return base64 image based on storage mode. 
@@ -81,8 +86,11 @@ def save_base64_image(self, base64_data: str, subfolder_name: str) -> str: return base64_data # DISK mode: save to disk, validate, and return relative path + # Sanitize subfolder name to prevent path traversal + sanitized_subfolder = self._sanitize_subfolder_name(subfolder_name) + # Determine the target directory (organized by subfolder) - target_dir = self.images_dir / subfolder_name + target_dir = self.images_dir / sanitized_subfolder # Ensure target directory exists (lazy initialization) target_dir.mkdir(parents=True, exist_ok=True) @@ -99,7 +107,7 @@ def save_base64_image(self, base64_data: str, subfolder_name: str) -> str: full_path = target_dir / filename # Build relative path - relative_path = f"{self.images_subdir}/{subfolder_name}/{filename}" + relative_path = f"{self.images_subdir}/{sanitized_subfolder}/{filename}" # Write to disk with open(full_path, "wb") as f: diff --git a/packages/data-designer-engine/tests/engine/storage/test_media_storage.py b/packages/data-designer-engine/tests/engine/storage/test_media_storage.py index 3648486d..f908a4c2 100644 --- a/packages/data-designer-engine/tests/engine/storage/test_media_storage.py +++ b/packages/data-designer-engine/tests/engine/storage/test_media_storage.py @@ -214,3 +214,28 @@ def test_save_base64_image_dataframe_mode_with_subfolder_name(tmp_path, sample_b # Directory should not be created in DATAFRAME mode assert not storage.images_dir.exists() + + +@pytest.mark.parametrize( + "unsafe_name,expected_sanitized", + [ + ("../evil", "__evil"), # Parent directory traversal: .. -> _, / -> _ + ("foo/bar", "foo_bar"), # Path separator (forward slash) + ("foo\\bar", "foo_bar"), # Path separator (backslash) + ("test..name", "test_name"), # Double dots in middle: .. 
-> _ + ], +) +def test_save_base64_image_sanitizes_subfolder_name(media_storage, sample_base64_png, unsafe_name, expected_sanitized): + """Test that subfolder names are sanitized to prevent path traversal.""" + relative_path = media_storage.save_base64_image(sample_base64_png, subfolder_name=unsafe_name) + + # Check that path contains sanitized subfolder name + assert expected_sanitized in relative_path + assert "/" not in expected_sanitized # No path separators + assert "\\" not in expected_sanitized # No backslashes + assert ".." not in expected_sanitized # No parent references + + # Verify file is inside images directory (not escaped via path traversal) + full_path = media_storage.base_path / relative_path + assert full_path.exists() + assert media_storage.images_dir in full_path.parents From 2d7a2023530216bdabda0966eda107b6be3a1d16 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 2026 11:31:18 -0700 Subject: [PATCH 51/69] Fix PIL format detection in detect_image_format Compare against enum values instead of enum members in PIL fallback path. This fixes JPEG/WEBP/GIF detection when magic bytes don't match. 
--- .../src/data_designer/config/utils/image_helpers.py | 4 ++-- .../tests/config/utils/test_image_helpers.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py index 678d3b80..c20c81ea 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py +++ b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py @@ -114,8 +114,8 @@ def detect_image_format(image_bytes: bytes) -> ImageFormat: try: img = Image.open(io.BytesIO(image_bytes)) format_str = img.format.lower() if img.format else None - if format_str in [ImageFormat.PNG, ImageFormat.JPG, ImageFormat.JPEG, ImageFormat.WEBP]: - return ImageFormat(format_str if format_str != ImageFormat.JPEG else ImageFormat.JPG) + if format_str in [fmt.value for fmt in ImageFormat]: + return ImageFormat(format_str if format_str != ImageFormat.JPEG.value else ImageFormat.JPG.value) except Exception: pass diff --git a/packages/data-designer-config/tests/config/utils/test_image_helpers.py b/packages/data-designer-config/tests/config/utils/test_image_helpers.py index aa1ca451..f24696e4 100644 --- a/packages/data-designer-config/tests/config/utils/test_image_helpers.py +++ b/packages/data-designer-config/tests/config/utils/test_image_helpers.py @@ -224,9 +224,9 @@ def test_detect_image_format_with_pil_fallback_unsupported_format(tmp_path): img.save(gif_path, format="GIF") gif_bytes = gif_path.read_bytes() - # Should use PIL fallback and default to PNG (GIF not in ImageFormat enum) + # Should use PIL fallback and correctly detect GIF format result = detect_image_format(gif_bytes) - assert result == ImageFormat.PNG + assert result == ImageFormat.GIF def test_detect_image_format_with_pil_fallback_jpeg(): From 5fca3a664d335a24546fc321d90068e2e449262c Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 
2026 12:21:53 -0700 Subject: [PATCH 52/69] Fix Pydantic v2 compatibility in ArtifactStorage Replace Field(init=False) with PrivateAttr for media_storage attribute. Pydantic v2 does not support init kwarg in Field(). Use PrivateAttr for non-serialized attributes with property accessors for backward compatibility. --- .../dataset_builders/artifact_storage.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py index 7d2c6e72..43b817b0 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py @@ -11,7 +11,7 @@ from pathlib import Path from typing import TYPE_CHECKING -from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +from pydantic import BaseModel, ConfigDict, PrivateAttr, field_validator, model_validator from data_designer.config.utils.io_helpers import read_parquet_dataset from data_designer.config.utils.type_helpers import StrEnum, resolve_string_enum @@ -47,7 +47,17 @@ class ArtifactStorage(BaseModel): partial_results_folder_name: str = "tmp-partial-parquet-files" dropped_columns_folder_name: str = "dropped-columns-parquet-files" processors_outputs_folder_name: str = PROCESSORS_OUTPUTS_FOLDER_NAME - media_storage: MediaStorage = Field(default=None, init=False, exclude=True) + _media_storage: MediaStorage = PrivateAttr(default=None) + + @property + def media_storage(self) -> MediaStorage: + """Access media storage instance.""" + return self._media_storage + + @media_storage.setter + def media_storage(self, value: MediaStorage) -> None: + """Set media storage instance.""" + self._media_storage = value @property def artifact_path_exists(self) -> bool: @@ -119,7 +129,7 @@ def 
validate_folder_names(self): raise ArtifactStorageError(f"πŸ›‘ Directory name '{name}' contains invalid characters.") # Initialize media storage with DISK mode by default - self.media_storage = MediaStorage( + self._media_storage = MediaStorage( base_path=self.base_dataset_path, mode=StorageMode.DISK, ) @@ -132,7 +142,7 @@ def set_media_storage_mode(self, mode: StorageMode) -> None: Args: mode: StorageMode.DISK (save to disk) or StorageMode.DATAFRAME (store in memory) """ - self.media_storage.mode = mode + self._media_storage.mode = mode @staticmethod def mkdir_if_needed(path: Path | str) -> Path: From 192d5402801fa9323887fe7360c53e51fbad6bdb Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 2026 12:50:03 -0700 Subject: [PATCH 53/69] docs for image generation --- docs/code_reference/models.md | 5 +- docs/colab_notebooks/1-the-basics.ipynb | 62 ++++++++--------- ...ctured-outputs-and-jinja-expressions.ipynb | 58 ++++++++-------- .../3-seeding-with-a-dataset.ipynb | 54 +++++++-------- .../4-providing-images-as-context.ipynb | 66 +++++++++---------- .../colab_notebooks/5-generating-images.ipynb | 44 ++++++------- docs/concepts/columns.md | 20 +++++- docs/concepts/models/inference-parameters.md | 40 ++++++++++- docs/notebook_source/5-generating-images.py | 2 +- mkdocs.yml | 1 + 10 files changed, 205 insertions(+), 147 deletions(-) diff --git a/docs/code_reference/models.md b/docs/code_reference/models.md index 882e03f3..98023d51 100644 --- a/docs/code_reference/models.md +++ b/docs/code_reference/models.md @@ -1,11 +1,12 @@ # Models -The `models` module defines configuration objects for model-based generation. [ModelProvider](#data_designer.config.models.ModelProvider), specifies connection and authentication details for custom providers. [ModelConfig](#data_designer.config.models.ModelConfig) encapsulates model details including the model alias, identifier, and inference parameters. 
[Inference Parameters](../concepts/models/inference-parameters.md) controls model behavior through settings like `temperature`, `top_p`, and `max_tokens`, with support for both fixed values and distribution-based sampling. The module includes [ImageContext](#data_designer.config.models.ImageContext) for providing image inputs to multimodal models. +The `models` module defines configuration objects for model-based generation. [ModelProvider](#data_designer.config.models.ModelProvider) specifies connection and authentication details for custom providers. [ModelConfig](#data_designer.config.models.ModelConfig) encapsulates model details including the model alias, identifier, and inference parameters. [Inference Parameters](../concepts/models/inference-parameters.md) controls model behavior through settings like `temperature`, `top_p`, and `max_tokens`, with support for both fixed values and distribution-based sampling. The module includes [ImageContext](#data_designer.config.models.ImageContext) for providing image inputs to multimodal models, and [ImageInferenceParams](#data_designer.config.models.ImageInferenceParams) for configuring image generation models. 
For more information on how they are used, see below: - **[Model Providers](../concepts/models/model-providers.md)** - **[Model Configs](../concepts/models/model-configs.md)** -- **[Image Context](/notebooks/4-providing-images-as-context/)** +- **[Image Context](../notebooks/4-providing-images-as-context.ipynb)** +- **[Generating Images](../notebooks/5-generating-images.ipynb)** ::: data_designer.config.models diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index ed8942df..5983b899 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "945eebf8", + "id": "ac1377b2", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "8e8f2e22", + "id": "3dcba9e4", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -26,7 +26,7 @@ }, { "cell_type": "markdown", - "id": "92d91bf1", + "id": "a0a26660", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0b9b4427", + "id": "11ee3b8e", "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8878d172", + "id": "47301447", "metadata": {}, "outputs": [], "source": [ @@ -66,7 +66,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4c92bfb3", + "id": "3bf4d093", "metadata": {}, "outputs": [], "source": [ @@ -76,7 +76,7 @@ }, { "cell_type": "markdown", - "id": "4e39eed1", + "id": "84533566", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -89,7 +89,7 @@ { "cell_type": "code", "execution_count": null, - "id": "70c96cfb", + "id": "a1d87e57", "metadata": {}, "outputs": [], "source": [ @@ -98,7 +98,7 @@ }, { "cell_type": "markdown", - "id": "99d975c9", + "id": "dfdfe936", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model 
configurations\n", @@ -115,7 +115,7 @@ { "cell_type": "code", "execution_count": null, - "id": "851228c8", + "id": "673de1e4", "metadata": {}, "outputs": [], "source": [ @@ -145,7 +145,7 @@ }, { "cell_type": "markdown", - "id": "fefb639d", + "id": "785d5530", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -160,7 +160,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0ba52672", + "id": "00bdd7bd", "metadata": {}, "outputs": [], "source": [ @@ -169,7 +169,7 @@ }, { "cell_type": "markdown", - "id": "7cc2aefc", + "id": "38b00929", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -186,7 +186,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a5a34b1a", + "id": "1ab0eef8", "metadata": {}, "outputs": [], "source": [ @@ -195,7 +195,7 @@ }, { "cell_type": "markdown", - "id": "ee4d1b6a", + "id": "b3d51275", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -204,7 +204,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7782d790", + "id": "fb7f0947", "metadata": {}, "outputs": [], "source": [ @@ -285,7 +285,7 @@ }, { "cell_type": "markdown", - "id": "f88e8b18", + "id": "a78eaaf1", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -294,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "19174a73", + "id": "675dcca1", "metadata": {}, "outputs": [], "source": [ @@ -331,7 +331,7 @@ }, { "cell_type": "markdown", - "id": "01438115", + "id": "267e9717", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9c8f1275", + "id": "007e0de6", "metadata": {}, "outputs": [], "source": [ @@ -382,7 +382,7 @@ }, { "cell_type": "markdown", - "id": "f61e3771", + "id": "e2bb0bd9", "metadata": {}, "source": [ "### πŸ” Iteration is key – 
preview the dataset!\n", @@ -399,7 +399,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7f8dc56e", + "id": "305dbc2a", "metadata": {}, "outputs": [], "source": [ @@ -409,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5b66172a", + "id": "6eda71a0", "metadata": {}, "outputs": [], "source": [ @@ -420,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b0eaa931", + "id": "52e4fe1a", "metadata": {}, "outputs": [], "source": [ @@ -430,7 +430,7 @@ }, { "cell_type": "markdown", - "id": "122d099d", + "id": "bfd1a3e9", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -443,7 +443,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f40f7ba0", + "id": "0151ca2b", "metadata": {}, "outputs": [], "source": [ @@ -453,7 +453,7 @@ }, { "cell_type": "markdown", - "id": "597c41ec", + "id": "d212d818", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -466,7 +466,7 @@ { "cell_type": "code", "execution_count": null, - "id": "acf8caa3", + "id": "58604774", "metadata": {}, "outputs": [], "source": [ @@ -476,7 +476,7 @@ { "cell_type": "code", "execution_count": null, - "id": "697e9090", + "id": "3c09b4fe", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ { "cell_type": "code", "execution_count": null, - "id": "18f34e66", + "id": "264e59e3", "metadata": {}, "outputs": [], "source": [ @@ -501,7 +501,7 @@ }, { "cell_type": "markdown", - "id": "4c498f62", + "id": "96b7b832", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index 49be6edb..77c72bae 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "bd333de9", + "id": "afd29f78", "metadata": {}, "source": [ "# 🎨 Data 
Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "28fb2ee3", + "id": "5189a728", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "fbeb3b2d", + "id": "33c9d0c9", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6ef3d2ae", + "id": "4f5bbd85", "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ { "cell_type": "code", "execution_count": null, - "id": "07546806", + "id": "90abc86a", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "81b00725", + "id": "33da8c4e", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "a5cf694f", + "id": "ad827faf", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8320e2b0", + "id": "b072c2f4", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "348e2c5a", + "id": "786100eb", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "21019fc5", + "id": "4adf8bdd", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "7bf9d9af", + "id": "3178d635", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "88abb685", + "id": "c90eb35b", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "d8e790c6", + "id": "49b7c8b5", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -198,7 +198,7 @@ { "cell_type": "code", "execution_count": null, - "id": 
"64465ab1", + "id": "57aa10a2", "metadata": {}, "outputs": [], "source": [ @@ -226,7 +226,7 @@ }, { "cell_type": "markdown", - "id": "cfbad124", + "id": "e73bd82b", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" @@ -235,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aa93a4c9", + "id": "296cdf8d", "metadata": {}, "outputs": [], "source": [ @@ -344,7 +344,7 @@ }, { "cell_type": "markdown", - "id": "74aa72fc", + "id": "e757d87d", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -361,7 +361,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9ae978cc", + "id": "dd52fb2d", "metadata": {}, "outputs": [], "source": [ @@ -414,7 +414,7 @@ }, { "cell_type": "markdown", - "id": "ec850f14", + "id": "92b33564", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -431,7 +431,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cb18575e", + "id": "48c44912", "metadata": {}, "outputs": [], "source": [ @@ -441,7 +441,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eee46dc6", + "id": "04ebcc42", "metadata": {}, "outputs": [], "source": [ @@ -452,7 +452,7 @@ { "cell_type": "code", "execution_count": null, - "id": "082d0fc4", + "id": "d1666a5e", "metadata": {}, "outputs": [], "source": [ @@ -462,7 +462,7 @@ }, { "cell_type": "markdown", - "id": "e8d80b94", + "id": "321c6495", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -475,7 +475,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4b0a7299", + "id": "09008c4d", "metadata": {}, "outputs": [], "source": [ @@ -485,7 +485,7 @@ }, { "cell_type": "markdown", - "id": "d7e0c925", + "id": "6efb50b0", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -498,7 +498,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b599d759", + "id": "5e2756f2", "metadata": 
{}, "outputs": [], "source": [ @@ -508,7 +508,7 @@ { "cell_type": "code", "execution_count": null, - "id": "07a7c0da", + "id": "5c8c0e1c", "metadata": {}, "outputs": [], "source": [ @@ -521,7 +521,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7760dffa", + "id": "05212932", "metadata": {}, "outputs": [], "source": [ @@ -533,7 +533,7 @@ }, { "cell_type": "markdown", - "id": "6d19000a", + "id": "2ccbbc14", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index 468aa795..0b4fc81e 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "573c3e7b", + "id": "7132a5b6", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "63f6c36d", + "id": "5e88dd3a", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "02cc81c7", + "id": "9546c919", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "18d51631", + "id": "29aa565b", "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ { "cell_type": "code", "execution_count": null, - "id": "67c55f6b", + "id": "6b3bd93b", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cfe2ff62", + "id": "5a6b5cb5", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "bdbc5b03", + "id": "857bc101", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": null, - "id": "55d9caf1", + "id": "3cd82cf8", "metadata": {}, "outputs": [], 
"source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "aa1623bc", + "id": "cbec0c3c", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9d1310cf", + "id": "cb303163", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "e64ce3b7", + "id": "5904cc74", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dafd6155", + "id": "284f3f04", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "7c01f11c", + "id": "c705b2ae", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -196,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7941073f", + "id": "70f3ed2d", "metadata": {}, "outputs": [], "source": [ @@ -214,7 +214,7 @@ }, { "cell_type": "markdown", - "id": "a68c7d55", + "id": "70eb52c0", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -227,7 +227,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f1b3d4d4", + "id": "068c6580", "metadata": {}, "outputs": [], "source": [ @@ -308,7 +308,7 @@ }, { "cell_type": "markdown", - "id": "eff1bf9f", + "id": "15abcf63", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -325,7 +325,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b5955230", + "id": "a69af2e6", "metadata": {}, "outputs": [], "source": [ @@ -335,7 +335,7 @@ { "cell_type": "code", "execution_count": null, - "id": "062a7294", + "id": "f5ac9d71", "metadata": {}, "outputs": [], "source": [ @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6378e1be", + "id": "6bb47512", "metadata": {}, "outputs": [], "source": [ @@ -356,7 +356,7 @@ }, { "cell_type": "markdown", - "id": "51e5175e", + "id": 
"2114b5bc", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -369,7 +369,7 @@ { "cell_type": "code", "execution_count": null, - "id": "891b6860", + "id": "c4c11da9", "metadata": {}, "outputs": [], "source": [ @@ -379,7 +379,7 @@ }, { "cell_type": "markdown", - "id": "0f52668f", + "id": "d0512d67", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -392,7 +392,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ed083bd8", + "id": "92417411", "metadata": {}, "outputs": [], "source": [ @@ -402,7 +402,7 @@ { "cell_type": "code", "execution_count": null, - "id": "039c42e4", + "id": "b6753723", "metadata": {}, "outputs": [], "source": [ @@ -415,7 +415,7 @@ { "cell_type": "code", "execution_count": null, - "id": "623ca205", + "id": "5301d1b8", "metadata": {}, "outputs": [], "source": [ @@ -427,7 +427,7 @@ }, { "cell_type": "markdown", - "id": "0a7e7d42", + "id": "b415a021", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index 62ac63e8..08d0e46f 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "731384ed", + "id": "d28e0063", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "bc66dd23", + "id": "238d96ab", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "4539a931", + "id": "e4b2d125", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -37,7 +37,7 @@ }, { "cell_type": "markdown", - "id": "f88809bf", + "id": "82f4cd84", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3628d4c4", + "id": 
"a5ed449e", "metadata": {}, "outputs": [], "source": [ @@ -59,7 +59,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7fcf0f75", + "id": "931c7e31", "metadata": {}, "outputs": [], "source": [ @@ -77,7 +77,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6654714a", + "id": "04dfa3a8", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "22488cb7", + "id": "15475af5", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "39913ca0", + "id": "c0962dde", "metadata": {}, "outputs": [], "source": [ @@ -122,7 +122,7 @@ }, { "cell_type": "markdown", - "id": "fba112ab", + "id": "9b975535", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -139,7 +139,7 @@ { "cell_type": "code", "execution_count": null, - "id": "70fd86dd", + "id": "ed1bf1f1", "metadata": {}, "outputs": [], "source": [ @@ -162,7 +162,7 @@ }, { "cell_type": "markdown", - "id": "810c7457", + "id": "53aadb2c", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -177,7 +177,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9b2204d0", + "id": "94955e35", "metadata": {}, "outputs": [], "source": [ @@ -186,7 +186,7 @@ }, { "cell_type": "markdown", - "id": "29e3dae5", + "id": "62448131", "metadata": {}, "source": [ "### 🌱 Seed Dataset Creation\n", @@ -203,7 +203,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e2cc3506", + "id": "33087e69", "metadata": {}, "outputs": [], "source": [ @@ -218,7 +218,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7a821067", + "id": "5f4e1bac", "metadata": {}, "outputs": [], "source": [ @@ -266,7 +266,7 @@ { "cell_type": "code", "execution_count": null, - "id": "359d144b", + "id": "252f67ed", "metadata": {}, "outputs": [], "source": [ @@ -284,7 +284,7 @@ { "cell_type": "code", "execution_count": null, - 
"id": "985cd308", + "id": "7d4e8bb5", "metadata": {}, "outputs": [], "source": [ @@ -294,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6a8cb414", + "id": "13bce5fc", "metadata": {}, "outputs": [], "source": [ @@ -306,7 +306,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a57e1b73", + "id": "3e12a27e", "metadata": {}, "outputs": [], "source": [ @@ -335,7 +335,7 @@ }, { "cell_type": "markdown", - "id": "7518100a", + "id": "d47bd653", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -352,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4c1fe540", + "id": "f4926089", "metadata": {}, "outputs": [], "source": [ @@ -362,7 +362,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bceafe91", + "id": "ed240050", "metadata": {}, "outputs": [], "source": [ @@ -373,7 +373,7 @@ { "cell_type": "code", "execution_count": null, - "id": "20f4ace5", + "id": "5eda7277", "metadata": {}, "outputs": [], "source": [ @@ -383,7 +383,7 @@ }, { "cell_type": "markdown", - "id": "16a86d56", + "id": "5caed579", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -396,7 +396,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c1bbae97", + "id": "b329d3a1", "metadata": {}, "outputs": [], "source": [ @@ -406,7 +406,7 @@ }, { "cell_type": "markdown", - "id": "d8d7604f", + "id": "d69f7c67", "metadata": {}, "source": [ "### πŸ”Ž Visual Inspection\n", @@ -417,7 +417,7 @@ { "cell_type": "code", "execution_count": null, - "id": "27c0636c", + "id": "b2caa9d1", "metadata": { "lines_to_next_cell": 2 }, @@ -441,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "f6b99539", + "id": "99c0861f", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -454,7 +454,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e5d53787", + "id": "de4e1a36", "metadata": {}, "outputs": [], "source": [ @@ -464,7 +464,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1f859e49", + 
"id": "1959a492", "metadata": {}, "outputs": [], "source": [ @@ -477,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6688e3c5", + "id": "9cf8ab99", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ }, { "cell_type": "markdown", - "id": "28635b09", + "id": "12a42f12", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/5-generating-images.ipynb b/docs/colab_notebooks/5-generating-images.ipynb index 485fe258..41530d7c 100644 --- a/docs/colab_notebooks/5-generating-images.ipynb +++ b/docs/colab_notebooks/5-generating-images.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "0ee289e6", + "id": "fb577af2", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Generating Images\n", @@ -15,14 +15,14 @@ "- πŸ“ **Jinja2 prompts**: Drive diversity by referencing other columns in your prompt template\n", "- πŸ’Ύ **Preview vs create**: Preview stores base64 in the dataframe; create saves images to disk and stores paths\n", "\n", - "Data Designer supports both **diffusion** (e.g. DALLΒ·E, Stable Diffusion, Imagen) and **autoregressive** (e.g. Gemini image, GPT image) models; the API is chosen automatically from the model name.\n", + "Data Designer supports both **diffusion** (e.g. DALLΒ·E, Stable Diffusion, Imagen) and **autoregressive** (e.g. 
Gemini image, GPT image) models.\n", "\n", "If this is your first time using Data Designer, we recommend starting with the [first notebook](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/) in this tutorial series.\n" ] }, { "cell_type": "markdown", - "id": "86f748c1", + "id": "6d0c0049", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -33,7 +33,7 @@ }, { "cell_type": "markdown", - "id": "c610ee22", + "id": "47e777d7", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -44,7 +44,7 @@ { "cell_type": "code", "execution_count": null, - "id": "818ca495", + "id": "0889f8b0", "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f165bb15", + "id": "58dda77c", "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5decfc83", + "id": "9ee635ed", "metadata": {}, "outputs": [], "source": [ @@ -86,7 +86,7 @@ }, { "cell_type": "markdown", - "id": "929f35d6", + "id": "32f2a62b", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -97,7 +97,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b4c8b7d7", + "id": "8f2fe56e", "metadata": {}, "outputs": [], "source": [ @@ -106,7 +106,7 @@ }, { "cell_type": "markdown", - "id": "8ed7b0b6", + "id": "a526f294", "metadata": {}, "source": [ "### πŸŽ›οΈ Define an image-generation model\n", @@ -118,7 +118,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d6b1ca66", + "id": "95eb6aa3", "metadata": {}, "outputs": [], "source": [ @@ -140,7 +140,7 @@ }, { "cell_type": "markdown", - "id": "498cfecf", + "id": "0665df24", "metadata": {}, "source": [ "### πŸ—οΈ Build the config: samplers + image column\n", @@ -151,7 +151,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e74fc7ab", + "id": "98a5381c", "metadata": {}, "outputs": [], "source": [ @@ -324,7 +324,7 @@ }, { "cell_type": "markdown", - "id": 
"c592c820", + "id": "75dc9209", "metadata": {}, "source": [ "### πŸ” Preview: images as base64\n", @@ -335,7 +335,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eee17bb1", + "id": "b70e22f6", "metadata": {}, "outputs": [], "source": [ @@ -345,7 +345,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3cd320cc", + "id": "e0fdc58d", "metadata": {}, "outputs": [], "source": [ @@ -356,7 +356,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ffb5e188", + "id": "dbb3c655", "metadata": {}, "outputs": [], "source": [ @@ -365,7 +365,7 @@ }, { "cell_type": "markdown", - "id": "87b83328", + "id": "cd9dd316", "metadata": {}, "source": [ "### πŸ†™ Create: images saved to disk\n", @@ -376,7 +376,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a8f9cc41", + "id": "0074f4c3", "metadata": {}, "outputs": [], "source": [ @@ -386,7 +386,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0d4453e5", + "id": "6bbe1861", "metadata": {}, "outputs": [], "source": [ @@ -397,7 +397,7 @@ { "cell_type": "code", "execution_count": null, - "id": "198301ab", + "id": "43d567dd", "metadata": {}, "outputs": [], "source": [ @@ -413,7 +413,7 @@ }, { "cell_type": "markdown", - "id": "2bdcef2b", + "id": "d0eb52cb", "metadata": {}, "source": [ "## ⏭️ Next steps\n", diff --git a/docs/concepts/columns.md b/docs/concepts/columns.md index 48eed5e0..d1422277 100644 --- a/docs/concepts/columns.md +++ b/docs/concepts/columns.md @@ -7,7 +7,7 @@ Columns are the fundamental building blocks in Data Designer. Each column repres ## Column Types -Data Designer provides ten built-in column types, each optimized for different generation scenarios. +Data Designer provides eleven built-in column types, each optimized for different generation scenarios. 
### 🎲 Sampler Columns @@ -96,6 +96,24 @@ Define scoring rubrics (relevance, accuracy, fluency, helpfulness) and the judge Use judge columns for data quality filtering (e.g., keep only 4+ rated responses), A/B testing generation strategies, and quality monitoring over time. +### πŸ–ΌοΈ Image Columns + +Image columns generate images from text prompts using either **diffusion** models (DALLΒ·E, Stable Diffusion, Imagen) or **autoregressive** models (Gemini image, GPT image). + +Use **Jinja2 templating** in the prompt to reference other columns, driving diversity across generated images. For example, reference sampled attributes like style, subject, and composition to produce varied images without manually writing different prompts. + +Image columns require a model configured with `ImageInferenceParams`. Model-specific options (size, quality, aspect ratio) are passed via `extra_body` in the inference parameters. + +**Output modes:** + +- **Preview** (`data_designer.preview()`): Images are stored as base64-encoded strings directly in the DataFrame for quick iteration +- **Create** (`data_designer.create()`): Images are saved to disk in an `images//` folder with UUID filenames; the DataFrame stores relative paths + +Image columns also support `multi_modal_context` for autoregressive models that accept image inputs, enabling image-to-image generation workflows. + +!!! tip "Tutorial" + See the [Generating Images](../notebooks/5-generating-images.ipynb) tutorial for a complete walkthrough, and [Providing Images as Context](../notebooks/4-providing-images-as-context.ipynb) for using images as input to other columns. + ### 🧬 Embedding Columns Embedding columns generate vector embeddings (numerical representations) for text content using embedding models. These embeddings capture semantic meaning, enabling similarity search, clustering, and semantic analysis. 
diff --git a/docs/concepts/models/inference-parameters.md b/docs/concepts/models/inference-parameters.md index 3470611c..a1ca2865 100644 --- a/docs/concepts/models/inference-parameters.md +++ b/docs/concepts/models/inference-parameters.md @@ -1,6 +1,6 @@ # Inference Parameters -Inference parameters control how models generate responses during synthetic data generation. Data Designer provides two types of inference parameters: `ChatCompletionInferenceParams` for text/code/structured generation and `EmbeddingInferenceParams` for embedding generation. +Inference parameters control how models generate responses during synthetic data generation. Data Designer provides three types of inference parameters: `ChatCompletionInferenceParams` for text/code/structured generation, `EmbeddingInferenceParams` for embedding generation, and `ImageInferenceParams` for image generation. ## Overview @@ -136,6 +136,44 @@ The `EmbeddingInferenceParams` class controls how models generate embeddings. Th | `extra_body` | `dict[str, Any]` | No | Additional parameters to include in the API request body | +## Image Inference Parameters + +The `ImageInferenceParams` class is used for image generation models, including both diffusion models (DALL·E, Stable Diffusion, Imagen) and autoregressive models (Gemini image, GPT image). Unlike text models, image-specific options are passed entirely via `extra_body`, since they vary significantly between providers. + +### Fields + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `max_parallel_requests` | `int` | No | Maximum concurrent API requests (default: 4, ≥ 1) | +| `timeout` | `int` | No | API request timeout in seconds (≥ 1) | +| `extra_body` | `dict[str, Any]` | No | Model-specific image options (size, quality, aspect ratio, etc.) 
| + ### Examples + +```python +import data_designer.config as dd + +# Diffusion model (e.g., DALL·E, Stable Diffusion) +dd.ModelConfig( + alias="image-model", + model="black-forest-labs/flux.2-pro", + provider="openrouter", + inference_parameters=dd.ImageInferenceParams( + extra_body={"height": 512, "width": 512} + ), +) + +# OpenAI DALL·E style +dd.ModelConfig( + alias="dalle", + model="dall-e-3", + inference_parameters=dd.ImageInferenceParams( + extra_body={"size": "1024x1024", "quality": "hd"} + ), +) +``` + + ## See Also - **[Default Model Settings](default-model-settings.md)**: Pre-configured model settings included with Data Designer diff --git a/docs/notebook_source/5-generating-images.py b/docs/notebook_source/5-generating-images.py index 28638ff9..73b98cd9 100644 --- a/docs/notebook_source/5-generating-images.py +++ b/docs/notebook_source/5-generating-images.py @@ -23,7 +23,7 @@ # - 📝 **Jinja2 prompts**: Drive diversity by referencing other columns in your prompt template # - 💾 **Preview vs create**: Preview stores base64 in the dataframe; create saves images to disk and stores paths # -# Data Designer supports both **diffusion** (e.g. DALL·E, Stable Diffusion, Imagen) and **autoregressive** (e.g. Gemini image, GPT image) models; the API is chosen automatically from the model name. +# Data Designer supports both **diffusion** (e.g. DALL·E, Stable Diffusion, Imagen) and **autoregressive** (e.g. Gemini image, GPT image) models. # # If this is your first time using Data Designer, we recommend starting with the [first notebook](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/) in this tutorial series. 
# diff --git a/mkdocs.yml b/mkdocs.yml index ec88d886..c28f1038 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -36,6 +36,7 @@ nav: - Structured Outputs and Jinja Expressions: notebooks/2-structured-outputs-and-jinja-expressions.ipynb - Seeding with an External Dataset: notebooks/3-seeding-with-a-dataset.ipynb - Providing Images as Context: notebooks/4-providing-images-as-context.ipynb + - Generating Images: notebooks/5-generating-images.ipynb - Recipes: - Recipe Cards: recipes/cards.md - Code Generation: From 982b9d7d2f929d9400e1deb42e1c128cbd4eab40 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 2026 14:25:54 -0700 Subject: [PATCH 54/69] image-to-image tutorial --- docs/colab_notebooks/1-the-basics.ipynb | 62 +-- ...ctured-outputs-and-jinja-expressions.ipynb | 58 +-- .../3-seeding-with-a-dataset.ipynb | 54 +- .../4-providing-images-as-context.ipynb | 66 +-- .../colab_notebooks/5-generating-images.ipynb | 55 +- .../6-editing-images-with-image-context.ipynb | 490 ++++++++++++++++++ docs/notebook_source/5-generating-images.py | 11 +- .../6-editing-images-with-image-context.py | 314 +++++++++++ docs/notebook_source/_README.md | 9 + mkdocs.yml | 1 + 10 files changed, 968 insertions(+), 152 deletions(-) create mode 100644 docs/colab_notebooks/6-editing-images-with-image-context.ipynb create mode 100644 docs/notebook_source/6-editing-images-with-image-context.py diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index 5983b899..ec9f5f00 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "ac1377b2", + "id": "b557c504", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "3dcba9e4", + "id": "84987de5", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -26,7 +26,7 @@ }, { "cell_type": "markdown", - "id": "a0a26660", + 
"id": "e79f93a9", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11ee3b8e", + "id": "80920352", "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "47301447", + "id": "bbef8181", "metadata": {}, "outputs": [], "source": [ @@ -66,7 +66,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3bf4d093", + "id": "7bd1e8d6", "metadata": {}, "outputs": [], "source": [ @@ -76,7 +76,7 @@ }, { "cell_type": "markdown", - "id": "84533566", + "id": "08d48c5f", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -89,7 +89,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a1d87e57", + "id": "38bdc673", "metadata": {}, "outputs": [], "source": [ @@ -98,7 +98,7 @@ }, { "cell_type": "markdown", - "id": "dfdfe936", + "id": "27c75d97", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -115,7 +115,7 @@ { "cell_type": "code", "execution_count": null, - "id": "673de1e4", + "id": "17bf8a4a", "metadata": {}, "outputs": [], "source": [ @@ -145,7 +145,7 @@ }, { "cell_type": "markdown", - "id": "785d5530", + "id": "3d910ff1", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -160,7 +160,7 @@ { "cell_type": "code", "execution_count": null, - "id": "00bdd7bd", + "id": "20e7ff7f", "metadata": {}, "outputs": [], "source": [ @@ -169,7 +169,7 @@ }, { "cell_type": "markdown", - "id": "38b00929", + "id": "81e40a7c", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -186,7 +186,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1ab0eef8", + "id": "f6e9308f", "metadata": {}, "outputs": [], "source": [ @@ -195,7 +195,7 @@ }, { "cell_type": "markdown", - "id": "b3d51275", + "id": "88a4aada", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and 
subcategory columns.\n" @@ -204,7 +204,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fb7f0947", + "id": "1cbb33dd", "metadata": {}, "outputs": [], "source": [ @@ -285,7 +285,7 @@ }, { "cell_type": "markdown", - "id": "a78eaaf1", + "id": "3e027eba", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -294,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "675dcca1", + "id": "28a9a449", "metadata": {}, "outputs": [], "source": [ @@ -331,7 +331,7 @@ }, { "cell_type": "markdown", - "id": "267e9717", + "id": "3cf9ad4a", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "007e0de6", + "id": "3ce0bd1b", "metadata": {}, "outputs": [], "source": [ @@ -382,7 +382,7 @@ }, { "cell_type": "markdown", - "id": "e2bb0bd9", + "id": "83c0d6c9", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -399,7 +399,7 @@ { "cell_type": "code", "execution_count": null, - "id": "305dbc2a", + "id": "10a8544a", "metadata": {}, "outputs": [], "source": [ @@ -409,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6eda71a0", + "id": "18955c09", "metadata": {}, "outputs": [], "source": [ @@ -420,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - "id": "52e4fe1a", + "id": "ae0d842e", "metadata": {}, "outputs": [], "source": [ @@ -430,7 +430,7 @@ }, { "cell_type": "markdown", - "id": "bfd1a3e9", + "id": "7a8aae69", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -443,7 +443,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0151ca2b", + "id": "9cf63137", "metadata": {}, "outputs": [], "source": [ @@ -453,7 +453,7 @@ }, { "cell_type": "markdown", - "id": "d212d818", + "id": "1b1db3b0", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -466,7 +466,7 @@ { "cell_type": "code", "execution_count": null, - "id": "58604774", + 
"id": "371910f0", "metadata": {}, "outputs": [], "source": [ @@ -476,7 +476,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3c09b4fe", + "id": "4df85fb5", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ { "cell_type": "code", "execution_count": null, - "id": "264e59e3", + "id": "50034186", "metadata": {}, "outputs": [], "source": [ @@ -501,7 +501,7 @@ }, { "cell_type": "markdown", - "id": "96b7b832", + "id": "ee1e3476", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index 77c72bae..f98e9b92 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "afd29f78", + "id": "c8ab83b5", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "5189a728", + "id": "598d3d3a", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "33c9d0c9", + "id": "4eed9946", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4f5bbd85", + "id": "aefd74d2", "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ { "cell_type": "code", "execution_count": null, - "id": "90abc86a", + "id": "b4596021", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "33da8c4e", + "id": "db722e12", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "ad827faf", + "id": "af84adc6", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +91,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "b072c2f4", + "id": "b0a13f37", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "786100eb", + "id": "494fc106", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4adf8bdd", + "id": "52b901be", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "3178d635", + "id": "0239c09a", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c90eb35b", + "id": "a40d7f02", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "49b7c8b5", + "id": "bf75c464", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -198,7 +198,7 @@ { "cell_type": "code", "execution_count": null, - "id": "57aa10a2", + "id": "561d3650", "metadata": {}, "outputs": [], "source": [ @@ -226,7 +226,7 @@ }, { "cell_type": "markdown", - "id": "e73bd82b", + "id": "f758aedb", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" @@ -235,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "296cdf8d", + "id": "b7643909", "metadata": {}, "outputs": [], "source": [ @@ -344,7 +344,7 @@ }, { "cell_type": "markdown", - "id": "e757d87d", + "id": "1033adc4", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -361,7 +361,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dd52fb2d", + "id": "6c877291", "metadata": {}, "outputs": [], "source": [ @@ -414,7 +414,7 @@ }, { "cell_type": "markdown", - "id": "92b33564", + "id": "6ef6a1e0", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -431,7 +431,7 @@ { "cell_type": 
"code", "execution_count": null, - "id": "48c44912", + "id": "87be3640", "metadata": {}, "outputs": [], "source": [ @@ -441,7 +441,7 @@ { "cell_type": "code", "execution_count": null, - "id": "04ebcc42", + "id": "9238802e", "metadata": {}, "outputs": [], "source": [ @@ -452,7 +452,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d1666a5e", + "id": "1184e9db", "metadata": {}, "outputs": [], "source": [ @@ -462,7 +462,7 @@ }, { "cell_type": "markdown", - "id": "321c6495", + "id": "1c043980", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -475,7 +475,7 @@ { "cell_type": "code", "execution_count": null, - "id": "09008c4d", + "id": "28195ad7", "metadata": {}, "outputs": [], "source": [ @@ -485,7 +485,7 @@ }, { "cell_type": "markdown", - "id": "6efb50b0", + "id": "8885ad89", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -498,7 +498,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5e2756f2", + "id": "0b76df7c", "metadata": {}, "outputs": [], "source": [ @@ -508,7 +508,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5c8c0e1c", + "id": "b403cfe4", "metadata": {}, "outputs": [], "source": [ @@ -521,7 +521,7 @@ { "cell_type": "code", "execution_count": null, - "id": "05212932", + "id": "11e68465", "metadata": {}, "outputs": [], "source": [ @@ -533,7 +533,7 @@ }, { "cell_type": "markdown", - "id": "2ccbbc14", + "id": "fa36adf2", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index 0b4fc81e..2ffb72e0 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "7132a5b6", + "id": "f60f6b7d", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": 
"5e88dd3a", + "id": "612a5533", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "9546c919", + "id": "0b383424", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "29aa565b", + "id": "c5a3e218", "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6b3bd93b", + "id": "34434dcd", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5a6b5cb5", + "id": "11be7325", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "857bc101", + "id": "fb998ae8", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3cd82cf8", + "id": "f12828b1", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "cbec0c3c", + "id": "967da49d", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cb303163", + "id": "3c3e9dec", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "5904cc74", + "id": "e09a9729", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "284f3f04", + "id": "ad4163d5", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "c705b2ae", + "id": "879f0eb9", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -196,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "70f3ed2d", + "id": "668936c1", "metadata": {}, "outputs": [], "source": [ @@ -214,7 +214,7 @@ }, { "cell_type": "markdown", 
- "id": "70eb52c0", + "id": "5a4ca2ed", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -227,7 +227,7 @@ { "cell_type": "code", "execution_count": null, - "id": "068c6580", + "id": "f78db181", "metadata": {}, "outputs": [], "source": [ @@ -308,7 +308,7 @@ }, { "cell_type": "markdown", - "id": "15abcf63", + "id": "bc80ad01", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -325,7 +325,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a69af2e6", + "id": "6c88bb87", "metadata": {}, "outputs": [], "source": [ @@ -335,7 +335,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f5ac9d71", + "id": "35cf488e", "metadata": {}, "outputs": [], "source": [ @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6bb47512", + "id": "0a8cb4d8", "metadata": {}, "outputs": [], "source": [ @@ -356,7 +356,7 @@ }, { "cell_type": "markdown", - "id": "2114b5bc", + "id": "02fd45c5", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -369,7 +369,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c4c11da9", + "id": "49b8a512", "metadata": {}, "outputs": [], "source": [ @@ -379,7 +379,7 @@ }, { "cell_type": "markdown", - "id": "d0512d67", + "id": "5835f7c1", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -392,7 +392,7 @@ { "cell_type": "code", "execution_count": null, - "id": "92417411", + "id": "7145afcf", "metadata": {}, "outputs": [], "source": [ @@ -402,7 +402,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b6753723", + "id": "82b3fc64", "metadata": {}, "outputs": [], "source": [ @@ -415,7 +415,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5301d1b8", + "id": "075abc99", "metadata": {}, "outputs": [], "source": [ @@ -427,7 +427,7 @@ }, { "cell_type": "markdown", - "id": "b415a021", + "id": "038fdbe5", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git 
a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index 08d0e46f..5a301606 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "d28e0063", + "id": "c1e3790b", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "238d96ab", + "id": "5f81e248", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "e4b2d125", + "id": "dd80189a", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -37,7 +37,7 @@ }, { "cell_type": "markdown", - "id": "82f4cd84", + "id": "169c6b63", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a5ed449e", + "id": "66f27a6b", "metadata": {}, "outputs": [], "source": [ @@ -59,7 +59,7 @@ { "cell_type": "code", "execution_count": null, - "id": "931c7e31", + "id": "dd899aa2", "metadata": {}, "outputs": [], "source": [ @@ -77,7 +77,7 @@ { "cell_type": "code", "execution_count": null, - "id": "04dfa3a8", + "id": "bce17cec", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "15475af5", + "id": "208789c2", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c0962dde", + "id": "b3eb31a9", "metadata": {}, "outputs": [], "source": [ @@ -122,7 +122,7 @@ }, { "cell_type": "markdown", - "id": "9b975535", + "id": "118d5a0a", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -139,7 +139,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ed1bf1f1", + "id": "504f7cee", "metadata": {}, "outputs": 
[], "source": [ @@ -162,7 +162,7 @@ }, { "cell_type": "markdown", - "id": "53aadb2c", + "id": "87524dd5", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -177,7 +177,7 @@ { "cell_type": "code", "execution_count": null, - "id": "94955e35", + "id": "f7ebcc8d", "metadata": {}, "outputs": [], "source": [ @@ -186,7 +186,7 @@ }, { "cell_type": "markdown", - "id": "62448131", + "id": "746bc5cf", "metadata": {}, "source": [ "### 🌱 Seed Dataset Creation\n", @@ -203,7 +203,7 @@ { "cell_type": "code", "execution_count": null, - "id": "33087e69", + "id": "6411fbf3", "metadata": {}, "outputs": [], "source": [ @@ -218,7 +218,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5f4e1bac", + "id": "5ace4d22", "metadata": {}, "outputs": [], "source": [ @@ -266,7 +266,7 @@ { "cell_type": "code", "execution_count": null, - "id": "252f67ed", + "id": "aa4e3245", "metadata": {}, "outputs": [], "source": [ @@ -284,7 +284,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7d4e8bb5", + "id": "9c1d68d5", "metadata": {}, "outputs": [], "source": [ @@ -294,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "13bce5fc", + "id": "1c1179b9", "metadata": {}, "outputs": [], "source": [ @@ -306,7 +306,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3e12a27e", + "id": "421b09a5", "metadata": {}, "outputs": [], "source": [ @@ -335,7 +335,7 @@ }, { "cell_type": "markdown", - "id": "d47bd653", + "id": "9625f011", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -352,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f4926089", + "id": "b1d0d0cd", "metadata": {}, "outputs": [], "source": [ @@ -362,7 +362,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ed240050", + "id": "deeb8217", "metadata": {}, "outputs": [], "source": [ @@ -373,7 +373,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5eda7277", + "id": "3ae5f96b", "metadata": 
{}, "outputs": [], "source": [ @@ -383,7 +383,7 @@ }, { "cell_type": "markdown", - "id": "5caed579", + "id": "eb72585e", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -396,7 +396,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b329d3a1", + "id": "bbe9006b", "metadata": {}, "outputs": [], "source": [ @@ -406,7 +406,7 @@ }, { "cell_type": "markdown", - "id": "d69f7c67", + "id": "c83021f1", "metadata": {}, "source": [ "### πŸ”Ž Visual Inspection\n", @@ -417,7 +417,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b2caa9d1", + "id": "5645e41c", "metadata": { "lines_to_next_cell": 2 }, @@ -441,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "99c0861f", + "id": "1530b1ef", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -454,7 +454,7 @@ { "cell_type": "code", "execution_count": null, - "id": "de4e1a36", + "id": "f912e00c", "metadata": {}, "outputs": [], "source": [ @@ -464,7 +464,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1959a492", + "id": "236c17ae", "metadata": {}, "outputs": [], "source": [ @@ -477,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9cf8ab99", + "id": "da28b5af", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ }, { "cell_type": "markdown", - "id": "12a42f12", + "id": "2f056bc1", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/5-generating-images.ipynb b/docs/colab_notebooks/5-generating-images.ipynb index 41530d7c..f8955a17 100644 --- a/docs/colab_notebooks/5-generating-images.ipynb +++ b/docs/colab_notebooks/5-generating-images.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "fb577af2", + "id": "03c4dcde", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Generating Images\n", @@ -22,7 +22,7 @@ }, { "cell_type": "markdown", - "id": "6d0c0049", + "id": "4c87bc8f", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -33,7 +33,7 @@ }, { "cell_type": "markdown", - 
"id": "47e777d7", + "id": "e36ad3a6", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -44,7 +44,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0889f8b0", + "id": "a96a5056", "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ { "cell_type": "code", "execution_count": null, - "id": "58dda77c", + "id": "7b0dd67e", "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9ee635ed", + "id": "611b8e3b", "metadata": {}, "outputs": [], "source": [ @@ -86,7 +86,7 @@ }, { "cell_type": "markdown", - "id": "32f2a62b", + "id": "68604cc1", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -97,7 +97,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8f2fe56e", + "id": "cf826b56", "metadata": {}, "outputs": [], "source": [ @@ -106,7 +106,7 @@ }, { "cell_type": "markdown", - "id": "a526f294", + "id": "19c44404", "metadata": {}, "source": [ "### πŸŽ›οΈ Define an image-generation model\n", @@ -118,7 +118,7 @@ { "cell_type": "code", "execution_count": null, - "id": "95eb6aa3", + "id": "5a3177b2", "metadata": {}, "outputs": [], "source": [ @@ -140,7 +140,7 @@ }, { "cell_type": "markdown", - "id": "0665df24", + "id": "8c3b2830", "metadata": {}, "source": [ "### πŸ—οΈ Build the config: samplers + image column\n", @@ -151,7 +151,7 @@ { "cell_type": "code", "execution_count": null, - "id": "98a5381c", + "id": "dbcbe101", "metadata": {}, "outputs": [], "source": [ @@ -324,7 +324,7 @@ }, { "cell_type": "markdown", - "id": "75dc9209", + "id": "31c88669", "metadata": {}, "source": [ "### πŸ” Preview: images as base64\n", @@ -335,7 +335,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b70e22f6", + "id": "e1c9cdba", "metadata": {}, "outputs": [], "source": [ @@ -345,7 +345,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e0fdc58d", + "id": "a268f456", "metadata": {}, "outputs": [], "source": [ @@ -356,7 +356,7 @@ { "cell_type": 
"code", "execution_count": null, - "id": "dbb3c655", + "id": "41542106", "metadata": {}, "outputs": [], "source": [ @@ -365,7 +365,7 @@ }, { "cell_type": "markdown", - "id": "cd9dd316", + "id": "a96def17", "metadata": {}, "source": [ "### πŸ†™ Create: images saved to disk\n", @@ -376,7 +376,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0074f4c3", + "id": "d3667abc", "metadata": {}, "outputs": [], "source": [ @@ -386,7 +386,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6bbe1861", + "id": "614ced29", "metadata": {}, "outputs": [], "source": [ @@ -397,23 +397,23 @@ { "cell_type": "code", "execution_count": null, - "id": "43d567dd", + "id": "8fe38ca1", "metadata": {}, "outputs": [], "source": [ - "# Display all image from the created dataset. Paths are relative to the artifact output directory.\n", + "# Display all images from the created dataset. Paths are relative to the artifact output directory.\n", "for index, row in dataset.iterrows():\n", " path_or_list = row.get(\"generated_image\")\n", " if path_or_list is not None:\n", - " for path in path_or_list:\n", - " base = results.artifact_storage.base_dataset_path\n", - " full_path = base / path\n", - " display(IPImage(data=full_path))" + " paths = path_or_list if not isinstance(path_or_list, str) else [path_or_list]\n", + " for path in paths:\n", + " full_path = results.artifact_storage.base_dataset_path / path\n", + " display(IPImage(filename=str(full_path)))" ] }, { "cell_type": "markdown", - "id": "d0eb52cb", + "id": "c483fd33", "metadata": {}, "source": [ "## ⏭️ Next steps\n", @@ -421,7 +421,8 @@ "- [The basics](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/): samplers and LLM text columns\n", "- [Structured outputs and Jinja](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/2-structured-outputs-and-jinja-expressions/)\n", "- [Seeding with a dataset](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/3-seeding-with-a-dataset/)\n", - "- 
[Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/)\n" + "- [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/)\n", + "- [Image-to-image editing](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/6-editing-images-with-image-context/): edit existing images with seed datasets\n" ] } ], diff --git a/docs/colab_notebooks/6-editing-images-with-image-context.ipynb b/docs/colab_notebooks/6-editing-images-with-image-context.ipynb new file mode 100644 index 00000000..9607309e --- /dev/null +++ b/docs/colab_notebooks/6-editing-images-with-image-context.ipynb @@ -0,0 +1,490 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c6d11542", + "metadata": {}, + "source": [ + "# 🎨 Data Designer Tutorial: Image-to-Image Editing\n", + "\n", + "#### 📚 What you'll learn\n", + "\n", + "This notebook shows how to edit existing images by combining a seed dataset with image generation. You'll load animal portrait photographs from HuggingFace, feed them as context to an autoregressive model, and generate fun edited versions with accessories like sunglasses, top hats, and bow ties.\n", + "\n", + "- 🌱 **Seed datasets with images**: Load a HuggingFace image dataset and use it as a seed\n", + "- 🖼️ **Image context for editing**: Pass existing images to an image-generation model via `multi_modal_context`\n", + "- 🎲 **Sampler-driven diversity**: Combine sampled accessories and settings with seed images for varied results\n", + "- 💾 **Preview vs create**: Preview stores base64 in the dataframe; create saves images to disk\n", + "\n", + "This tutorial uses an **autoregressive** model (one that supports both image input *and* image output via the chat completions API). Diffusion models (DALL·E, Stable Diffusion, etc.)
do not support image context—see [Tutorial 5](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/5-generating-images/) for text-to-image generation with diffusion models.\n", + "\n", + "If this is your first time using Data Designer, we recommend starting with the [first notebook](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/) in this tutorial series.\n" + ] + }, + { + "cell_type": "markdown", + "id": "27e08f5f", + "metadata": {}, + "source": [ + "### 📦 Import Data Designer\n", + "\n", + "- `data_designer.config` provides the configuration API.\n", + "- `DataDesigner` is the main interface for generation.\n" + ] + }, + { + "cell_type": "markdown", + "id": "b7fe946f", + "metadata": {}, + "source": [ + "### ⚡ Colab Setup\n", + "\n", + "Run the cells below to install the dependencies and set up the API key. If you don't have an API key, you can generate one from [build.nvidia.com](https://build.nvidia.com).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfe4f30f", + "metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "!pip install -U data-designer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebd5c0de", + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "from google.colab import userdata\n", + "\n", + "try:\n", + " os.environ[\"NVIDIA_API_KEY\"] = userdata.get(\"NVIDIA_API_KEY\")\n", + "except userdata.SecretNotFoundError:\n", + " os.environ[\"NVIDIA_API_KEY\"] = getpass.getpass(\"Enter your NVIDIA API key: \")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78fa3003", + "metadata": {}, + "outputs": [], + "source": [ + "import base64\n", + "import io\n", + "import uuid\n", + "\n", + "import pandas as pd\n", + "from datasets import load_dataset\n", + "from IPython.display import Image as IPImage\n", + "from IPython.display import display\n", + "\n", + "import data_designer.config as dd\n",
+ "from data_designer.interface import DataDesigner" + ] + }, + { + "cell_type": "markdown", + "id": "473146c5", + "metadata": {}, + "source": [ + "### βš™οΈ Initialize the Data Designer interface\n", + "\n", + "When initialized without arguments, [default model providers](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) are used.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d677d021", + "metadata": {}, + "outputs": [], + "source": [ + "data_designer = DataDesigner()" + ] + }, + { + "cell_type": "markdown", + "id": "f24d1cdf", + "metadata": {}, + "source": [ + "### πŸŽ›οΈ Define an image-editing model\n", + "\n", + "We need an **autoregressive** model that supports both image input and image output via the chat completions API. This lets us pass existing images as context and receive edited images back.\n", + "\n", + "- Use `ImageInferenceParams` so Data Designer treats this model as an image generator.\n", + "- Image-specific options are model-dependent; pass them via `extra_body`.\n", + "\n", + "> **Note**: This tutorial uses the Flux 2 Pro model via [OpenRouter](https://openrouter.ai). 
Set `OPENROUTER_API_KEY` in your environment.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fe7f474", + "metadata": {}, + "outputs": [], + "source": [ + "MODEL_PROVIDER = \"openrouter\"\n", + "MODEL_ID = \"black-forest-labs/flux.2-pro\"\n", + "MODEL_ALIAS = \"image-editor\"\n", + "\n", + "model_configs = [\n", + " dd.ModelConfig(\n", + " alias=MODEL_ALIAS,\n", + " model=MODEL_ID,\n", + " provider=MODEL_PROVIDER,\n", + " inference_parameters=dd.ImageInferenceParams(\n", + " extra_body={\"height\": 512, \"width\": 512},\n", + " ),\n", + " )\n", + "]" + ] + }, + { + "cell_type": "markdown", + "id": "09428614", + "metadata": {}, + "source": [ + "### 🌱 Load animal portraits from HuggingFace\n", + "\n", + "We'll load animal face photographs from the [AFHQ](https://huggingface.co/datasets/huggan/AFHQv2) (Animal Faces-HQ) dataset, convert them to base64, and use them as a seed dataset.\n", + "\n", + "AFHQ contains high-quality 512×512 close-up portraits of cats, dogs, and wildlife—perfect subjects for adding fun accessories.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc5260d1", + "metadata": {}, + "outputs": [], + "source": [ + "SEED_COUNT = 10\n", + "BASE64_IMAGE_HEIGHT = 512\n", + "\n", + "ANIMAL_LABELS = {0: \"cat\", 1: \"dog\", 2: \"wild\"}\n", + "\n", + "\n", + "def resize_image(image, height: int):\n", + " \"\"\"Resize image maintaining aspect ratio.\"\"\"\n", + " original_width, original_height = image.size\n", + " width = int(original_width * (height / original_height))\n", + " return image.resize((width, height))\n", + "\n", + "\n", + "def prepare_record(record: dict, height: int) -> dict:\n", + " \"\"\"Convert a HuggingFace record to base64 with metadata.\"\"\"\n", + " image = resize_image(record[\"image\"], height)\n", + " img_buffer = io.BytesIO()\n", + " image.save(img_buffer, format=\"PNG\")\n", + " base64_string = base64.b64encode(img_buffer.getvalue()).decode(\"utf-8\")\n", + " return {\n", +
" \"uuid\": str(uuid.uuid4()),\n", + " \"base64_image\": base64_string,\n", + " \"animal\": ANIMAL_LABELS[record[\"label\"]],\n", + " }" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3382262", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"πŸ“₯ Streaming animal portraits from HuggingFace...\")\n", + "hf_dataset = load_dataset(\"huggan/AFHQv2\", split=\"train\", streaming=True)\n", + "\n", + "hf_iter = iter(hf_dataset)\n", + "records = [prepare_record(next(hf_iter), BASE64_IMAGE_HEIGHT) for _ in range(SEED_COUNT)]\n", + "df_seed = pd.DataFrame(records)\n", + "\n", + "print(f\"βœ… Prepared {len(df_seed)} animal portraits with columns: {list(df_seed.columns)}\")\n", + "df_seed.head()" + ] + }, + { + "cell_type": "markdown", + "id": "41788b3c", + "metadata": {}, + "source": [ + "### πŸ—οΈ Build the configuration\n", + "\n", + "We combine three ingredients:\n", + "\n", + "1. **Seed dataset** β€” original animal portraits as base64 and their species labels\n", + "2. **Sampler columns** β€” randomly sample accessories and settings for each image\n", + "3. **Image column with context** β€” generate an edited image using the original as reference\n", + "\n", + "The `multi_modal_context` parameter on `ImageColumnConfig` tells Data Designer to pass the seed image to the model alongside the text prompt. The model receives both the image and the editing instructions, and generates a new image.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7b27cebd", + "metadata": {}, + "outputs": [], + "source": [ + "config_builder = dd.DataDesignerConfigBuilder(model_configs=model_configs)\n", + "\n", + "# 1. Seed the original animal portraits\n", + "config_builder.with_seed_dataset(dd.DataFrameSeedSource(df=df_seed))\n", + "\n", + "# 2. 
Add sampler columns for accessory diversity\n", + "config_builder.add_column(\n", + " dd.SamplerColumnConfig(\n", + " name=\"accessory\",\n", + " sampler_type=dd.SamplerType.CATEGORY,\n", + " params=dd.CategorySamplerParams(\n", + " values=[\n", + " \"a tiny top hat\",\n", + " \"oversized sunglasses\",\n", + " \"a red bow tie\",\n", + " \"a knitted beanie\",\n", + " \"a flower crown\",\n", + " \"a monocle and mustache\",\n", + " \"a pirate hat and eye patch\",\n", + " \"a chef hat\",\n", + " ],\n", + " ),\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " dd.SamplerColumnConfig(\n", + " name=\"setting\",\n", + " sampler_type=dd.SamplerType.CATEGORY,\n", + " params=dd.CategorySamplerParams(\n", + " values=[\n", + " \"a cozy living room\",\n", + " \"a sunny park\",\n", + " \"a photo studio with soft lighting\",\n", + " \"a red carpet event\",\n", + " \"a holiday card backdrop with snowflakes\",\n", + " \"a tropical beach at sunset\",\n", + " ],\n", + " ),\n", + " )\n", + ")\n", + "\n", + "config_builder.add_column(\n", + " dd.SamplerColumnConfig(\n", + " name=\"art_style\",\n", + " sampler_type=dd.SamplerType.CATEGORY,\n", + " params=dd.CategorySamplerParams(\n", + " values=[\n", + " \"a photorealistic style\",\n", + " \"a Disney Pixar 3D render\",\n", + " \"a watercolor painting\",\n", + " \"a pop art poster\",\n", + " ],\n", + " ),\n", + " )\n", + ")\n", + "\n", + "# 3. Image column that reads the seed image as context and generates an edited version\n", + "config_builder.add_column(\n", + " dd.ImageColumnConfig(\n", + " name=\"edited_image\",\n", + " prompt=(\n", + " \"Edit this {{ animal }} portrait photo. \"\n", + " \"Add {{ accessory }} on the animal. \"\n", + " \"Place the {{ animal }} in {{ setting }}. \"\n", + " \"Render the result in {{ art_style }}. 
\"\n", + " \"Keep the animal's face, expression, and features faithful to the original photo.\"\n", + " ),\n", + " model_alias=MODEL_ALIAS,\n", + " multi_modal_context=[\n", + " dd.ImageContext(\n", + " column_name=\"base64_image\",\n", + " data_type=dd.ModalityDataType.BASE64,\n", + " image_format=dd.ImageFormat.PNG,\n", + " )\n", + " ],\n", + " )\n", + ")\n", + "\n", + "data_designer.validate(config_builder)" + ] + }, + { + "cell_type": "markdown", + "id": "032c2dfa", + "metadata": {}, + "source": [ + "### πŸ” Preview: quick iteration\n", + "\n", + "In **preview** mode, generated images are stored as base64 strings in the dataframe. Use this to iterate on your prompts, accessories, and sampler values before scaling up.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f1d2deb", + "metadata": {}, + "outputs": [], + "source": [ + "preview = data_designer.preview(config_builder, num_records=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5555cb05", + "metadata": {}, + "outputs": [], + "source": [ + "for i in range(len(preview.dataset)):\n", + " preview.display_sample_record()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ae5b459", + "metadata": { + "lines_to_next_cell": 1 + }, + "outputs": [], + "source": [ + "preview.dataset" + ] + }, + { + "cell_type": "markdown", + "id": "feb63fca", + "metadata": {}, + "source": [ + "### πŸ”Ž Compare original vs edited\n", + "\n", + "Let's display the original animal portraits next to their accessorized versions.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f22b2d2", + "metadata": {}, + "outputs": [], + "source": [ + "def display_before_after(row: pd.Series, index: int, base_path=None) -> None:\n", + " \"\"\"Display original vs edited image for a single record.\n", + "\n", + " When base_path is None (preview mode), edited_image is decoded from base64.\n", + " When base_path is provided (create mode), edited_image is 
loaded from disk.\n", + " \"\"\"\n", + " print(f\"\\n{'=' * 60}\")\n", + " print(f\"Record {index}: {row['animal']} wearing {row['accessory']}\")\n", + " print(f\"Setting: {row['setting']}\")\n", + " print(f\"Style: {row['art_style']}\")\n", + " print(f\"{'=' * 60}\")\n", + "\n", + " print(\"\\n📷 Original portrait:\")\n", + " display(IPImage(data=base64.b64decode(row[\"base64_image\"])))\n", + "\n", + " print(\"\\n🎨 Edited version:\")\n", + " edited = row.get(\"edited_image\")\n", + " if edited is None:\n", + " return\n", + " if base_path is None:\n", + " images = edited if isinstance(edited, list) else [edited]\n", + " for img_b64 in images:\n", + " display(IPImage(data=base64.b64decode(img_b64)))\n", + " else:\n", + " paths = edited if not isinstance(edited, str) else [edited]\n", + " for path in paths:\n", + " display(IPImage(filename=str(base_path / path)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1873c353", + "metadata": {}, + "outputs": [], + "source": [ + "for index, row in preview.dataset.iterrows():\n", + " display_before_after(row, index)" + ] + }, + { + "cell_type": "markdown", + "id": "da198732", + "metadata": {}, + "source": [ + "### 🆙 Create at scale\n", + "\n", + "In **create** mode, images are saved to disk in an `images/<column_name>/` folder with UUID filenames. 
The dataframe stores relative paths.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7618774", + "metadata": {}, + "outputs": [], + "source": [ + "results = data_designer.create(config_builder, num_records=10, dataset_name=\"tutorial-6-edited-images\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95b8004b", + "metadata": {}, + "outputs": [], + "source": [ + "dataset = results.load_dataset()\n", + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "881650a3", + "metadata": {}, + "outputs": [], + "source": [ + "for index, row in dataset.head(10).iterrows():\n", + " display_before_after(row, index, base_path=results.artifact_storage.base_dataset_path)" + ] + }, + { + "cell_type": "markdown", + "id": "24a0a4c4", + "metadata": {}, + "source": [ + "## ⏭️ Next steps\n", + "\n", + "- Experiment with different autoregressive models for image editing\n", + "- Try more creative editing prompts (style transfer, background replacement, artistic filters)\n", + "- Combine image editing with text generation (e.g., generate captions for edited images using an LLM-Text column)\n", + "\n", + "Related tutorials:\n", + "\n", + "- [The basics](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/): samplers and LLM text columns\n", + "- [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/): image-to-text with VLMs\n", + "- [Generating images](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/5-generating-images/): text-to-image generation with diffusion models\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/notebook_source/5-generating-images.py b/docs/notebook_source/5-generating-images.py index 73b98cd9..6232a131 100644 --- 
a/docs/notebook_source/5-generating-images.py +++ b/docs/notebook_source/5-generating-images.py @@ -277,14 +277,14 @@ dataset.head() # %% -# Display all image from the created dataset. Paths are relative to the artifact output directory. +# Display all images from the created dataset. Paths are relative to the artifact output directory. for index, row in dataset.iterrows(): path_or_list = row.get("generated_image") if path_or_list is not None: - for path in path_or_list: - base = results.artifact_storage.base_dataset_path - full_path = base / path - display(IPImage(data=full_path)) + paths = path_or_list if not isinstance(path_or_list, str) else [path_or_list] + for path in paths: + full_path = results.artifact_storage.base_dataset_path / path + display(IPImage(filename=str(full_path))) # %% [markdown] # ## ⏭️ Next steps @@ -293,4 +293,5 @@ # - [Structured outputs and Jinja](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/2-structured-outputs-and-jinja-expressions/) # - [Seeding with a dataset](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/3-seeding-with-a-dataset/) # - [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/) +# - [Image-to-image editing](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/6-editing-images-with-image-context/): edit existing images with seed datasets # diff --git a/docs/notebook_source/6-editing-images-with-image-context.py b/docs/notebook_source/6-editing-images-with-image-context.py new file mode 100644 index 00000000..a1bdca83 --- /dev/null +++ b/docs/notebook_source/6-editing-images-with-image-context.py @@ -0,0 +1,314 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: percent +# format_version: '1.3' +# jupytext_version: 1.18.1 +# kernelspec: +# display_name: .venv +# language: python +# name: python3 +# --- + +# %% [markdown] +# # 🎨 Data Designer Tutorial: Image-to-Image Editing +# +# 
#### 📚 What you'll learn +# +# This notebook shows how to edit existing images by combining a seed dataset with image generation. You'll load animal portrait photographs from HuggingFace, feed them as context to an autoregressive model, and generate fun edited versions with accessories like sunglasses, top hats, and bow ties. +# +# - 🌱 **Seed datasets with images**: Load a HuggingFace image dataset and use it as a seed +# - 🖼️ **Image context for editing**: Pass existing images to an image-generation model via `multi_modal_context` +# - 🎲 **Sampler-driven diversity**: Combine sampled accessories and settings with seed images for varied results +# - 💾 **Preview vs create**: Preview stores base64 in the dataframe; create saves images to disk +# +# This tutorial uses an **autoregressive** model (one that supports both image input *and* image output via the chat completions API). Diffusion models (DALL·E, Stable Diffusion, etc.) do not support image context—see [Tutorial 5](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/5-generating-images/) for text-to-image generation with diffusion models. +# +# If this is your first time using Data Designer, we recommend starting with the [first notebook](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/) in this tutorial series. +# + +# %% [markdown] +# ### 📦 Import Data Designer +# +# - `data_designer.config` provides the configuration API. +# - `DataDesigner` is the main interface for generation. 
+# + +# %% +import base64 +import io +import uuid + +import pandas as pd +from datasets import load_dataset +from IPython.display import Image as IPImage +from IPython.display import display + +import data_designer.config as dd +from data_designer.interface import DataDesigner + +# %% [markdown] +# ### ⚙️ Initialize the Data Designer interface +# +# When initialized without arguments, [default model providers](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) are used. +# + +# %% +data_designer = DataDesigner() + +# %% [markdown] +# ### 🎛️ Define an image-editing model +# +# We need an **autoregressive** model that supports both image input and image output via the chat completions API. This lets us pass existing images as context and receive edited images back. +# +# - Use `ImageInferenceParams` so Data Designer treats this model as an image generator. +# - Image-specific options are model-dependent; pass them via `extra_body`. +# +# > **Note**: This tutorial uses the Flux 2 Pro model via [OpenRouter](https://openrouter.ai). Set `OPENROUTER_API_KEY` in your environment. 
+# + +# %% +SEED_COUNT = 10 +BASE64_IMAGE_HEIGHT = 512 + +ANIMAL_LABELS = {0: "cat", 1: "dog", 2: "wild"} + + +def resize_image(image, height: int): + """Resize image maintaining aspect ratio.""" + original_width, original_height = image.size + width = int(original_width * (height / original_height)) + return image.resize((width, height)) + + +def prepare_record(record: dict, height: int) -> dict: + """Convert a HuggingFace record to base64 with metadata.""" + image = resize_image(record["image"], height) + img_buffer = io.BytesIO() + image.save(img_buffer, format="PNG") + base64_string = base64.b64encode(img_buffer.getvalue()).decode("utf-8") + return { + "uuid": str(uuid.uuid4()), + "base64_image": base64_string, + "animal": ANIMAL_LABELS[record["label"]], + } + + +# %% +print("πŸ“₯ Streaming animal portraits from HuggingFace...") +hf_dataset = load_dataset("huggan/AFHQv2", split="train", streaming=True) + +hf_iter = iter(hf_dataset) +records = [prepare_record(next(hf_iter), BASE64_IMAGE_HEIGHT) for _ in range(SEED_COUNT)] +df_seed = pd.DataFrame(records) + +print(f"βœ… Prepared {len(df_seed)} animal portraits with columns: {list(df_seed.columns)}") +df_seed.head() + +# %% [markdown] +# ### πŸ—οΈ Build the configuration +# +# We combine three ingredients: +# +# 1. **Seed dataset** β€” original animal portraits as base64 and their species labels +# 2. **Sampler columns** β€” randomly sample accessories and settings for each image +# 3. **Image column with context** β€” generate an edited image using the original as reference +# +# The `multi_modal_context` parameter on `ImageColumnConfig` tells Data Designer to pass the seed image to the model alongside the text prompt. The model receives both the image and the editing instructions, and generates a new image. +# + +# %% +config_builder = dd.DataDesignerConfigBuilder(model_configs=model_configs) + +# 1. 
Seed the original animal portraits +config_builder.with_seed_dataset(dd.DataFrameSeedSource(df=df_seed)) + +# 2. Add sampler columns for accessory diversity +config_builder.add_column( + dd.SamplerColumnConfig( + name="accessory", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=[ + "a tiny top hat", + "oversized sunglasses", + "a red bow tie", + "a knitted beanie", + "a flower crown", + "a monocle and mustache", + "a pirate hat and eye patch", + "a chef hat", + ], + ), + ) +) + +config_builder.add_column( + dd.SamplerColumnConfig( + name="setting", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=[ + "a cozy living room", + "a sunny park", + "a photo studio with soft lighting", + "a red carpet event", + "a holiday card backdrop with snowflakes", + "a tropical beach at sunset", + ], + ), + ) +) + +config_builder.add_column( + dd.SamplerColumnConfig( + name="art_style", + sampler_type=dd.SamplerType.CATEGORY, + params=dd.CategorySamplerParams( + values=[ + "a photorealistic style", + "a Disney Pixar 3D render", + "a watercolor painting", + "a pop art poster", + ], + ), + ) +) + +# 3. Image column that reads the seed image as context and generates an edited version +config_builder.add_column( + dd.ImageColumnConfig( + name="edited_image", + prompt=( + "Edit this {{ animal }} portrait photo. " + "Add {{ accessory }} on the animal. " + "Place the {{ animal }} in {{ setting }}. " + "Render the result in {{ art_style }}. " + "Keep the animal's face, expression, and features faithful to the original photo." + ), + model_alias=MODEL_ALIAS, + multi_modal_context=[ + dd.ImageContext( + column_name="base64_image", + data_type=dd.ModalityDataType.BASE64, + image_format=dd.ImageFormat.PNG, + ) + ], + ) +) + +data_designer.validate(config_builder) + +# %% [markdown] +# ### πŸ” Preview: quick iteration +# +# In **preview** mode, generated images are stored as base64 strings in the dataframe. 
Use this to iterate on your prompts, accessories, and sampler values before scaling up. +# + +# %% +preview = data_designer.preview(config_builder, num_records=2) + +# %% +for i in range(len(preview.dataset)): + preview.display_sample_record() + +# %% +preview.dataset + +# %% [markdown] +# ### 🔎 Compare original vs edited +# +# Let's display the original animal portraits next to their accessorized versions. +# + + +# %% +def display_before_after(row: pd.Series, index: int, base_path=None) -> None: + """Display original vs edited image for a single record. + + When base_path is None (preview mode), edited_image is decoded from base64. + When base_path is provided (create mode), edited_image is loaded from disk. + """ + print(f"\n{'=' * 60}") + print(f"Record {index}: {row['animal']} wearing {row['accessory']}") + print(f"Setting: {row['setting']}") + print(f"Style: {row['art_style']}") + print(f"{'=' * 60}") + + print("\n📷 Original portrait:") + display(IPImage(data=base64.b64decode(row["base64_image"]))) + + print("\n🎨 Edited version:") + edited = row.get("edited_image") + if edited is None: + return + if base_path is None: + images = edited if isinstance(edited, list) else [edited] + for img_b64 in images: + display(IPImage(data=base64.b64decode(img_b64))) + else: + paths = edited if not isinstance(edited, str) else [edited] + for path in paths: + display(IPImage(filename=str(base_path / path))) + + +# %% +for index, row in preview.dataset.iterrows(): + display_before_after(row, index) + +# %% [markdown] +# ### 🆙 Create at scale +# +# In **create** mode, images are saved to disk in an `images/<column_name>/` folder with UUID filenames. The dataframe stores relative paths. 
+# + +# %% +results = data_designer.create(config_builder, num_records=10, dataset_name="tutorial-6-edited-images") + +# %% +dataset = results.load_dataset() +dataset.head() + +# %% +for index, row in dataset.head(10).iterrows(): + display_before_after(row, index, base_path=results.artifact_storage.base_dataset_path) + +# %% [markdown] +# ## ⏭️ Next steps +# +# - Experiment with different autoregressive models for image editing +# - Try more creative editing prompts (style transfer, background replacement, artistic filters) +# - Combine image editing with text generation (e.g., generate captions for edited images using an LLM-Text column) +# +# Related tutorials: +# +# - [The basics](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/): samplers and LLM text columns +# - [Providing images as context](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/4-providing-images-as-context/): image-to-text with VLMs +# - [Generating images](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/5-generating-images/): text-to-image generation with diffusion models +# diff --git a/docs/notebook_source/_README.md b/docs/notebook_source/_README.md index 7bcd77d1..bbd29f9e 100644 --- a/docs/notebook_source/_README.md +++ b/docs/notebook_source/_README.md @@ -106,6 +106,15 @@ Generate synthetic image data with Data Designer: - Preview (base64 in dataframe) vs create (images saved to disk, paths in dataframe) - Displaying generated images in the notebook +### [6. 
Image-to-Image Editing](6-editing-images-with-image-context.ipynb) + +Edit existing images by combining seed datasets with image generation: + +- Loading a HuggingFace image dataset and using it as a seed +- Passing existing images to an image-generation model via `multi_modal_context` +- Combining sampled accessories and settings with seed images for varied results +- Comparing original vs edited images in preview and create modes + ## πŸ“– Important Documentation Sections Before diving into the tutorials, familiarize yourself with these key documentation sections: diff --git a/mkdocs.yml b/mkdocs.yml index c28f1038..8f1a3868 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -37,6 +37,7 @@ nav: - Seeding with an External Dataset: notebooks/3-seeding-with-a-dataset.ipynb - Providing Images as Context: notebooks/4-providing-images-as-context.ipynb - Generating Images: notebooks/5-generating-images.ipynb + - Image-to-Image Editing: notebooks/6-editing-images-with-image-context.ipynb - Recipes: - Recipe Cards: recipes/cards.md - Code Generation: From cf2b3648f61d0367182c02a3cf355e08be1b8c9d Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 2026 15:12:36 -0700 Subject: [PATCH 55/69] Address PR review comments - Reduce num_records to 2 for image generation in tutorial notebook - Add tests for different image response formats (dict and plain string) - Parametrize PNG/JPG media storage tests for better maintainability --- docs/notebook_source/5-generating-images.py | 2 +- .../tests/engine/models/test_facade.py | 66 ++++++++++++++++++- .../engine/storage/test_media_storage.py | 33 +++++----- 3 files changed, 81 insertions(+), 20 deletions(-) diff --git a/docs/notebook_source/5-generating-images.py b/docs/notebook_source/5-generating-images.py index 28638ff9..b445b950 100644 --- a/docs/notebook_source/5-generating-images.py +++ b/docs/notebook_source/5-generating-images.py @@ -270,7 +270,7 @@ # # %% -results = data_designer.create(config_builder, num_records=5, 
dataset_name="tutorial-5-images") +results = data_designer.create(config_builder, num_records=2, dataset_name="tutorial-5-images") # %% dataset = results.load_dataset() diff --git a/packages/data-designer-engine/tests/engine/models/test_facade.py b/packages/data-designer-engine/tests/engine/models/test_facade.py index 65c66896..1f220f4f 100644 --- a/packages/data-designer-engine/tests/engine/models/test_facade.py +++ b/packages/data-designer-engine/tests/engine/models/test_facade.py @@ -4,7 +4,7 @@ from __future__ import annotations from typing import TYPE_CHECKING, Any -from unittest.mock import patch +from unittest.mock import MagicMock, patch import pytest @@ -1077,6 +1077,70 @@ def test_generate_image_chat_completion_tracks_image_usage( assert stub_model_facade.usage_stats.image_usage.has_usage is True +@patch("data_designer.engine.models.facade.ModelFacade.completion", autospec=True) +def test_generate_image_chat_completion_with_dict_format( + mock_completion: Any, + stub_model_facade: ModelFacade, +) -> None: + """Test that generate_image handles images as dicts with image_url string.""" + # Create mock message with images as dict with string image_url + mock_message = MagicMock() + mock_message.role = "assistant" + mock_message.content = "" + mock_message.images = [ + {"image_url": "data:image/png;base64,image1"}, + {"image_url": "data:image/png;base64,image2"}, + ] + + mock_choice = MagicMock() + mock_choice.message = mock_message + + mock_response = MagicMock() + mock_response.choices = [mock_choice] + + mock_completion.return_value = mock_response + + # Generate images + with patch("data_designer.engine.models.facade.is_image_diffusion_model", return_value=False): + images = stub_model_facade.generate_image(prompt="test prompt") + + # Verify results + assert len(images) == 2 + assert images == ["image1", "image2"] + + +@patch("data_designer.engine.models.facade.ModelFacade.completion", autospec=True) +def 
test_generate_image_chat_completion_with_plain_strings( + mock_completion: Any, + stub_model_facade: 
ModelFacade, +) -> None: + """Test that generate_image handles images as plain strings.""" + # Create mock message with images as plain strings + mock_message = MagicMock() + mock_message.role = "assistant" + mock_message.content = "" + mock_message.images = [ + "data:image/png;base64,image1", + "image2", # Plain base64 without data URI prefix + ] + + mock_choice = MagicMock() + mock_choice.message = mock_message + + mock_response = MagicMock() + mock_response.choices = [mock_choice] + + mock_completion.return_value = mock_response + + # Generate images + with patch("data_designer.engine.models.facade.is_image_diffusion_model", return_value=False): + images = stub_model_facade.generate_image(prompt="test prompt") + + # Verify results + assert len(images) == 2 + assert images == ["image1", "image2"] + + @patch("data_designer.engine.models.facade.CustomRouter.image_generation", autospec=True) def test_generate_image_skip_usage_tracking( mock_image_generation: Any, diff --git a/packages/data-designer-engine/tests/engine/storage/test_media_storage.py b/packages/data-designer-engine/tests/engine/storage/test_media_storage.py index f908a4c2..9d74734a 100644 --- a/packages/data-designer-engine/tests/engine/storage/test_media_storage.py +++ b/packages/data-designer-engine/tests/engine/storage/test_media_storage.py @@ -60,13 +60,23 @@ def test_media_storage_init_custom_subdir(tmp_path): assert not storage.images_dir.exists() -def test_save_base64_image_png(media_storage, sample_base64_png): - """Test saving a PNG image from base64.""" - relative_path = media_storage.save_base64_image(sample_base64_png, subfolder_name="test_column") +@pytest.mark.parametrize( + "image_fixture,expected_extension", + [ + ("sample_base64_png", ".png"), + ("sample_base64_jpg", ".jpg"), + ], +) +def test_save_base64_image_format(media_storage, image_fixture, expected_extension, request): + """Test saving images from base64 in different formats.""" + # Get the actual fixture value using 
request.getfixturevalue + 
sample_base64 = request.getfixturevalue(image_fixture) + + relative_path = media_storage.save_base64_image(sample_base64, subfolder_name="test_column") # Check return value format (organized by column name) assert relative_path.startswith(f"{IMAGES_SUBDIR}/test_column/") - assert relative_path.endswith(".png") + assert relative_path.endswith(expected_extension) # Check file exists on disk full_path = media_storage.base_path / relative_path @@ -74,23 +84,10 @@ def test_save_base64_image_png(media_storage, sample_base64_png): # Verify file content saved_bytes = full_path.read_bytes() - expected_bytes = base64.b64decode(sample_base64_png) + expected_bytes = base64.b64decode(sample_base64) assert saved_bytes == expected_bytes -def test_save_base64_image_jpg(media_storage, sample_base64_jpg): - """Test saving a JPEG image from base64.""" - relative_path = media_storage.save_base64_image(sample_base64_jpg, subfolder_name="test_column") - - # Check return value format (organized by column name) - assert relative_path.startswith(f"{IMAGES_SUBDIR}/test_column/") - assert relative_path.endswith(".jpg") - - # Check file exists on disk - full_path = media_storage.base_path / relative_path - assert full_path.exists() - - def test_save_base64_image_with_data_uri(media_storage, sample_base64_png): """Test saving image from data URI format.""" data_uri = f"data:image/png;base64,{sample_base64_png}" From 5cef2f76fe8a6d008da25a89d5150876486b98c6 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 2026 15:19:02 -0700 Subject: [PATCH 56/69] Fix incorrect model classification in inference-parameters docs Relabel flux.2-pro/openrouter example from "Diffusion model" to "Autoregressive model" to match its usage elsewhere in the docs. Regenerate colab notebooks. 
--- docs/colab_notebooks/1-the-basics.ipynb | 62 ++++++++--------- ...ctured-outputs-and-jinja-expressions.ipynb | 58 ++++++++-------- .../3-seeding-with-a-dataset.ipynb | 54 +++++++-------- .../4-providing-images-as-context.ipynb | 66 +++++++++---------- .../colab_notebooks/5-generating-images.ipynb | 42 ++++++------ .../6-editing-images-with-image-context.ipynb | 64 +++++++++--------- docs/concepts/models/inference-parameters.md | 4 +- .../6-editing-images-with-image-context.py | 2 +- 8 files changed, 176 insertions(+), 176 deletions(-) diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index ec9f5f00..dfdfd8bc 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "b557c504", + "id": "4564128b", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "84987de5", + "id": "344a8570", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -26,7 +26,7 @@ }, { "cell_type": "markdown", - "id": "e79f93a9", + "id": "6326beb4", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "80920352", + "id": "8c634dad", "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bbef8181", + "id": "9d368dbc", "metadata": {}, "outputs": [], "source": [ @@ -66,7 +66,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7bd1e8d6", + "id": "973c8fc2", "metadata": {}, "outputs": [], "source": [ @@ -76,7 +76,7 @@ }, { "cell_type": "markdown", - "id": "08d48c5f", + "id": "e57be919", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -89,7 +89,7 @@ { "cell_type": "code", "execution_count": null, - "id": "38bdc673", + "id": "6a86e940", "metadata": {}, "outputs": [], "source": [ @@ -98,7 
+98,7 @@ }, { "cell_type": "markdown", - "id": "27c75d97", + "id": "78030009", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -115,7 +115,7 @@ { "cell_type": "code", "execution_count": null, - "id": "17bf8a4a", + "id": "cf52af7b", "metadata": {}, "outputs": [], "source": [ @@ -145,7 +145,7 @@ }, { "cell_type": "markdown", - "id": "3d910ff1", + "id": "e7a74cd6", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -160,7 +160,7 @@ { "cell_type": "code", "execution_count": null, - "id": "20e7ff7f", + "id": "6d70702f", "metadata": {}, "outputs": [], "source": [ @@ -169,7 +169,7 @@ }, { "cell_type": "markdown", - "id": "81e40a7c", + "id": "ffcd3ed6", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -186,7 +186,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f6e9308f", + "id": "65aa69d0", "metadata": {}, "outputs": [], "source": [ @@ -195,7 +195,7 @@ }, { "cell_type": "markdown", - "id": "88a4aada", + "id": "a801ba88", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -204,7 +204,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1cbb33dd", + "id": "f929263c", "metadata": {}, "outputs": [], "source": [ @@ -285,7 +285,7 @@ }, { "cell_type": "markdown", - "id": "3e027eba", + "id": "df08b8d9", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -294,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "28a9a449", + "id": "7c851e80", "metadata": {}, "outputs": [], "source": [ @@ -331,7 +331,7 @@ }, { "cell_type": "markdown", - "id": "3cf9ad4a", + "id": "0b44e1c6", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3ce0bd1b", + "id": "46c6b4ae", "metadata": {}, "outputs": [], "source": [ @@ -382,7 +382,7 
@@ }, { "cell_type": "markdown", - "id": "83c0d6c9", + "id": "250b79ee", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -399,7 +399,7 @@ { "cell_type": "code", "execution_count": null, - "id": "10a8544a", + "id": "b50c18ee", "metadata": {}, "outputs": [], "source": [ @@ -409,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "18955c09", + "id": "a2aee2db", "metadata": {}, "outputs": [], "source": [ @@ -420,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ae0d842e", + "id": "6b3d0d41", "metadata": {}, "outputs": [], "source": [ @@ -430,7 +430,7 @@ }, { "cell_type": "markdown", - "id": "7a8aae69", + "id": "6735018d", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -443,7 +443,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9cf63137", + "id": "02c3c44d", "metadata": {}, "outputs": [], "source": [ @@ -453,7 +453,7 @@ }, { "cell_type": "markdown", - "id": "1b1db3b0", + "id": "4982b32c", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -466,7 +466,7 @@ { "cell_type": "code", "execution_count": null, - "id": "371910f0", + "id": "e92044ed", "metadata": {}, "outputs": [], "source": [ @@ -476,7 +476,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4df85fb5", + "id": "e2d9dd24", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ { "cell_type": "code", "execution_count": null, - "id": "50034186", + "id": "b7c5efb2", "metadata": {}, "outputs": [], "source": [ @@ -501,7 +501,7 @@ }, { "cell_type": "markdown", - "id": "ee1e3476", + "id": "2abee6c6", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index f98e9b92..1f6e47d2 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 
+2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c8ab83b5", + "id": "1b094815", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "598d3d3a", + "id": "3f25eef5", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "4eed9946", + "id": "96eaa4d4", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aefd74d2", + "id": "4ec697f2", "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b4596021", + "id": "13fced47", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "db722e12", + "id": "68f3bb75", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "af84adc6", + "id": "6af5525d", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b0a13f37", + "id": "5c9d7d4a", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "494fc106", + "id": "609bbbeb", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "52b901be", + "id": "d9141cce", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "0239c09a", + "id": "68060d38", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a40d7f02", + "id": "6a1a2a26", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "bf75c464", + "id": "f2e0c8ac", "metadata": {}, 
"source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -198,7 +198,7 @@ { "cell_type": "code", "execution_count": null, - "id": "561d3650", + "id": "fc9db3ef", "metadata": {}, "outputs": [], "source": [ @@ -226,7 +226,7 @@ }, { "cell_type": "markdown", - "id": "f758aedb", + "id": "264fd36b", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" @@ -235,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b7643909", + "id": "1bb15cff", "metadata": {}, "outputs": [], "source": [ @@ -344,7 +344,7 @@ }, { "cell_type": "markdown", - "id": "1033adc4", + "id": "b4938e8d", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -361,7 +361,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6c877291", + "id": "7e7bccf7", "metadata": {}, "outputs": [], "source": [ @@ -414,7 +414,7 @@ }, { "cell_type": "markdown", - "id": "6ef6a1e0", + "id": "0314f017", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -431,7 +431,7 @@ { "cell_type": "code", "execution_count": null, - "id": "87be3640", + "id": "87ea580f", "metadata": {}, "outputs": [], "source": [ @@ -441,7 +441,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9238802e", + "id": "1b41df79", "metadata": {}, "outputs": [], "source": [ @@ -452,7 +452,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1184e9db", + "id": "6036e1e7", "metadata": {}, "outputs": [], "source": [ @@ -462,7 +462,7 @@ }, { "cell_type": "markdown", - "id": "1c043980", + "id": "cb565f07", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -475,7 +475,7 @@ { "cell_type": "code", "execution_count": null, - "id": "28195ad7", + "id": "5a29f960", "metadata": {}, "outputs": [], "source": [ @@ -485,7 +485,7 @@ }, { "cell_type": "markdown", - "id": "8885ad89", + "id": "6066ce7a", "metadata": {}, "source": [ "### πŸ†™ Scale 
up!\n", @@ -498,7 +498,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0b76df7c", + "id": "d942e256", "metadata": {}, "outputs": [], "source": [ @@ -508,7 +508,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b403cfe4", + "id": "c6c62786", "metadata": {}, "outputs": [], "source": [ @@ -521,7 +521,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11e68465", + "id": "18798a2d", "metadata": {}, "outputs": [], "source": [ @@ -533,7 +533,7 @@ }, { "cell_type": "markdown", - "id": "fa36adf2", + "id": "8ebe8b5a", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index 2ffb72e0..3d971631 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "f60f6b7d", + "id": "bee396d8", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "612a5533", + "id": "936d3ab6", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "0b383424", + "id": "137717e6", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c5a3e218", + "id": "49699e45", "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ { "cell_type": "code", "execution_count": null, - "id": "34434dcd", + "id": "094872a7", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11be7325", + "id": "8d0c4134", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "fb998ae8", + "id": "6dfb7395", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 
+91,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f12828b1", + "id": "ef28093c", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "967da49d", + "id": "107b97cc", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3c3e9dec", + "id": "b71f1585", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "e09a9729", + "id": "60291b08", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ad4163d5", + "id": "739bfed1", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "879f0eb9", + "id": "c71aacf7", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -196,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "668936c1", + "id": "1eddb6df", "metadata": {}, "outputs": [], "source": [ @@ -214,7 +214,7 @@ }, { "cell_type": "markdown", - "id": "5a4ca2ed", + "id": "117a857a", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -227,7 +227,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f78db181", + "id": "d78bab2f", "metadata": {}, "outputs": [], "source": [ @@ -308,7 +308,7 @@ }, { "cell_type": "markdown", - "id": "bc80ad01", + "id": "83ddf48e", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -325,7 +325,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6c88bb87", + "id": "5c1e63b7", "metadata": {}, "outputs": [], "source": [ @@ -335,7 +335,7 @@ { "cell_type": "code", "execution_count": null, - "id": "35cf488e", + "id": "b10cfe15", "metadata": {}, "outputs": [], "source": [ @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0a8cb4d8", + "id": "fb6a9c67", 
"metadata": {}, "outputs": [], "source": [ @@ -356,7 +356,7 @@ }, { "cell_type": "markdown", - "id": "02fd45c5", + "id": "d1a37ddc", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -369,7 +369,7 @@ { "cell_type": "code", "execution_count": null, - "id": "49b8a512", + "id": "78e03230", "metadata": {}, "outputs": [], "source": [ @@ -379,7 +379,7 @@ }, { "cell_type": "markdown", - "id": "5835f7c1", + "id": "09a4f8f0", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -392,7 +392,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7145afcf", + "id": "78b36c93", "metadata": {}, "outputs": [], "source": [ @@ -402,7 +402,7 @@ { "cell_type": "code", "execution_count": null, - "id": "82b3fc64", + "id": "9c139434", "metadata": {}, "outputs": [], "source": [ @@ -415,7 +415,7 @@ { "cell_type": "code", "execution_count": null, - "id": "075abc99", + "id": "0b6bb2b7", "metadata": {}, "outputs": [], "source": [ @@ -427,7 +427,7 @@ }, { "cell_type": "markdown", - "id": "038fdbe5", + "id": "6928ebd2", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index 5a301606..be601b73 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c1e3790b", + "id": "ab93e7b4", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "5f81e248", + "id": "8face940", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "dd80189a", + "id": "9bf9571a", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -37,7 +37,7 @@ }, { "cell_type": "markdown", - "id": "169c6b63", + "id": "fe337043", "metadata": {}, 
"source": [ "### ⚑ Colab Setup\n", @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "66f27a6b", + "id": "9ad14b95", "metadata": {}, "outputs": [], "source": [ @@ -59,7 +59,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dd899aa2", + "id": "a4919960", "metadata": {}, "outputs": [], "source": [ @@ -77,7 +77,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bce17cec", + "id": "6608f2d0", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "208789c2", + "id": "50c00ad0", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b3eb31a9", + "id": "11c57336", "metadata": {}, "outputs": [], "source": [ @@ -122,7 +122,7 @@ }, { "cell_type": "markdown", - "id": "118d5a0a", + "id": "8c5aec49", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -139,7 +139,7 @@ { "cell_type": "code", "execution_count": null, - "id": "504f7cee", + "id": "f6428362", "metadata": {}, "outputs": [], "source": [ @@ -162,7 +162,7 @@ }, { "cell_type": "markdown", - "id": "87524dd5", + "id": "319876a1", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -177,7 +177,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f7ebcc8d", + "id": "c2f63f57", "metadata": {}, "outputs": [], "source": [ @@ -186,7 +186,7 @@ }, { "cell_type": "markdown", - "id": "746bc5cf", + "id": "a36d211d", "metadata": {}, "source": [ "### 🌱 Seed Dataset Creation\n", @@ -203,7 +203,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6411fbf3", + "id": "c5cb8bfc", "metadata": {}, "outputs": [], "source": [ @@ -218,7 +218,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5ace4d22", + "id": "feb51ef5", "metadata": {}, "outputs": [], "source": [ @@ -266,7 +266,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aa4e3245", + "id": 
"54198281", "metadata": {}, "outputs": [], "source": [ @@ -284,7 +284,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9c1d68d5", + "id": "c011fdc6", "metadata": {}, "outputs": [], "source": [ @@ -294,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1c1179b9", + "id": "3b2a753f", "metadata": {}, "outputs": [], "source": [ @@ -306,7 +306,7 @@ { "cell_type": "code", "execution_count": null, - "id": "421b09a5", + "id": "f24ccc25", "metadata": {}, "outputs": [], "source": [ @@ -335,7 +335,7 @@ }, { "cell_type": "markdown", - "id": "9625f011", + "id": "9a20155a", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -352,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b1d0d0cd", + "id": "cf286a17", "metadata": {}, "outputs": [], "source": [ @@ -362,7 +362,7 @@ { "cell_type": "code", "execution_count": null, - "id": "deeb8217", + "id": "672869f4", "metadata": {}, "outputs": [], "source": [ @@ -373,7 +373,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3ae5f96b", + "id": "f43fcbc8", "metadata": {}, "outputs": [], "source": [ @@ -383,7 +383,7 @@ }, { "cell_type": "markdown", - "id": "eb72585e", + "id": "89aeb032", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -396,7 +396,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bbe9006b", + "id": "9a962027", "metadata": {}, "outputs": [], "source": [ @@ -406,7 +406,7 @@ }, { "cell_type": "markdown", - "id": "c83021f1", + "id": "12b94d1b", "metadata": {}, "source": [ "### πŸ”Ž Visual Inspection\n", @@ -417,7 +417,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5645e41c", + "id": "db86a531", "metadata": { "lines_to_next_cell": 2 }, @@ -441,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "1530b1ef", + "id": "d5e21f9a", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -454,7 +454,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f912e00c", + "id": "c6703d4c", 
"metadata": {}, "outputs": [], "source": [ @@ -464,7 +464,7 @@ { "cell_type": "code", "execution_count": null, - "id": "236c17ae", + "id": "3df12871", "metadata": {}, "outputs": [], "source": [ @@ -477,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "da28b5af", + "id": "5af31687", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ }, { "cell_type": "markdown", - "id": "2f056bc1", + "id": "96274bf7", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/5-generating-images.ipynb b/docs/colab_notebooks/5-generating-images.ipynb index f8955a17..f0aee607 100644 --- a/docs/colab_notebooks/5-generating-images.ipynb +++ b/docs/colab_notebooks/5-generating-images.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "03c4dcde", + "id": "aded0f63", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Generating Images\n", @@ -22,7 +22,7 @@ }, { "cell_type": "markdown", - "id": "4c87bc8f", + "id": "3e3ad8e9", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -33,7 +33,7 @@ }, { "cell_type": "markdown", - "id": "e36ad3a6", + "id": "56453c53", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -44,7 +44,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a96a5056", + "id": "fdb7feac", "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7b0dd67e", + "id": "e480f7f3", "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ { "cell_type": "code", "execution_count": null, - "id": "611b8e3b", + "id": "db0349eb", "metadata": {}, "outputs": [], "source": [ @@ -86,7 +86,7 @@ }, { "cell_type": "markdown", - "id": "68604cc1", + "id": "11c68030", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -97,7 +97,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cf826b56", + "id": "97569c84", "metadata": {}, "outputs": [], "source": [ @@ -106,7 +106,7 @@ }, { "cell_type": 
"markdown", - "id": "19c44404", + "id": "496af921", "metadata": {}, "source": [ "### πŸŽ›οΈ Define an image-generation model\n", @@ -118,7 +118,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5a3177b2", + "id": "422c1c49", "metadata": {}, "outputs": [], "source": [ @@ -140,7 +140,7 @@ }, { "cell_type": "markdown", - "id": "8c3b2830", + "id": "8dbeee61", "metadata": {}, "source": [ "### πŸ—οΈ Build the config: samplers + image column\n", @@ -151,7 +151,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dbcbe101", + "id": "a6549814", "metadata": {}, "outputs": [], "source": [ @@ -324,7 +324,7 @@ }, { "cell_type": "markdown", - "id": "31c88669", + "id": "398efd02", "metadata": {}, "source": [ "### πŸ” Preview: images as base64\n", @@ -335,7 +335,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e1c9cdba", + "id": "6fb9aa82", "metadata": {}, "outputs": [], "source": [ @@ -345,7 +345,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a268f456", + "id": "6c2dde5a", "metadata": {}, "outputs": [], "source": [ @@ -356,7 +356,7 @@ { "cell_type": "code", "execution_count": null, - "id": "41542106", + "id": "63825adc", "metadata": {}, "outputs": [], "source": [ @@ -365,7 +365,7 @@ }, { "cell_type": "markdown", - "id": "a96def17", + "id": "bff94616", "metadata": {}, "source": [ "### πŸ†™ Create: images saved to disk\n", @@ -376,7 +376,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d3667abc", + "id": "633ba7c9", "metadata": {}, "outputs": [], "source": [ @@ -386,7 +386,7 @@ { "cell_type": "code", "execution_count": null, - "id": "614ced29", + "id": "7315ad3c", "metadata": {}, "outputs": [], "source": [ @@ -397,7 +397,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8fe38ca1", + "id": "15485604", "metadata": {}, "outputs": [], "source": [ @@ -413,7 +413,7 @@ }, { "cell_type": "markdown", - "id": "c483fd33", + "id": "465e15e5", "metadata": {}, "source": [ "## ⏭️ Next steps\n", diff --git 
a/docs/colab_notebooks/6-editing-images-with-image-context.ipynb b/docs/colab_notebooks/6-editing-images-with-image-context.ipynb index 9607309e..2248e2a5 100644 --- a/docs/colab_notebooks/6-editing-images-with-image-context.ipynb +++ b/docs/colab_notebooks/6-editing-images-with-image-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c6d11542", + "id": "cbe28a17", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Image-to-Image Editing\n", @@ -23,7 +23,7 @@ }, { "cell_type": "markdown", - "id": "27e08f5f", + "id": "e81cf47f", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -34,7 +34,7 @@ }, { "cell_type": "markdown", - "id": "b7fe946f", + "id": "7f7642a8", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -45,7 +45,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bfe4f30f", + "id": "aee74ade", "metadata": {}, "outputs": [], "source": [ @@ -56,7 +56,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ebd5c0de", + "id": "055dce96", "metadata": {}, "outputs": [], "source": [ @@ -74,7 +74,7 @@ { "cell_type": "code", "execution_count": null, - "id": "78fa3003", + "id": "19bad01e", "metadata": {}, "outputs": [], "source": [ @@ -93,7 +93,7 @@ }, { "cell_type": "markdown", - "id": "473146c5", + "id": "cb2a7054", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -104,7 +104,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d677d021", + "id": "9ce997e0", "metadata": {}, "outputs": [], "source": [ @@ -113,7 +113,7 @@ }, { "cell_type": "markdown", - "id": "f24d1cdf", + "id": "1d4ba7fa", "metadata": {}, "source": [ "### πŸŽ›οΈ Define an image-editing model\n", @@ -129,7 +129,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5fe7f474", + "id": "9dd9185e", "metadata": {}, "outputs": [], "source": [ @@ -151,7 +151,7 @@ }, { "cell_type": "markdown", - "id": "09428614", + "id": "78d77137", "metadata": {}, "source": [ "### 🌱 Load animal portraits 
from HuggingFace\n", @@ -164,7 +164,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dc5260d1", + "id": "7f2bd5a9", "metadata": {}, "outputs": [], "source": [ @@ -197,7 +197,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a3382262", + "id": "6ac94f71", "metadata": {}, "outputs": [], "source": [ @@ -214,7 +214,7 @@ }, { "cell_type": "markdown", - "id": "41788b3c", + "id": "06290ac8", "metadata": {}, "source": [ "### πŸ—οΈ Build the configuration\n", @@ -231,7 +231,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7b27cebd", + "id": "f23ee931", "metadata": {}, "outputs": [], "source": [ @@ -319,7 +319,7 @@ }, { "cell_type": "markdown", - "id": "032c2dfa", + "id": "9d824059", "metadata": {}, "source": [ "### πŸ” Preview: quick iteration\n", @@ -330,7 +330,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2f1d2deb", + "id": "66e11057", "metadata": {}, "outputs": [], "source": [ @@ -340,7 +340,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5555cb05", + "id": "f0a0965c", "metadata": {}, "outputs": [], "source": [ @@ -351,10 +351,8 @@ { "cell_type": "code", "execution_count": null, - "id": "8ae5b459", - "metadata": { - "lines_to_next_cell": 1 - }, + "id": "6e0b73d6", + "metadata": {}, "outputs": [], "source": [ "preview.dataset" @@ -362,8 +360,10 @@ }, { "cell_type": "markdown", - "id": "feb63fca", - "metadata": {}, + "id": "e4fe0d30", + "metadata": { + "lines_to_next_cell": 2 + }, "source": [ "### πŸ”Ž Compare original vs edited\n", "\n", @@ -373,7 +373,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8f22b2d2", + "id": "8a611c24", "metadata": {}, "outputs": [], "source": [ @@ -409,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1873c353", + "id": "63a57b22", "metadata": {}, "outputs": [], "source": [ @@ -419,7 +419,7 @@ }, { "cell_type": "markdown", - "id": "da198732", + "id": "f0a32128", "metadata": {}, "source": [ "### πŸ†™ Create at scale\n", @@ -430,17 +430,17 @@ { 
"cell_type": "code", "execution_count": null, - "id": "b7618774", + "id": "e4bba421", "metadata": {}, "outputs": [], "source": [ - "results = data_designer.create(config_builder, num_records=10, dataset_name=\"tutorial-6-edited-images\")" + "results = data_designer.create(config_builder, num_records=5, dataset_name=\"tutorial-6-edited-images\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "95b8004b", + "id": "3741c8cc", "metadata": {}, "outputs": [], "source": [ @@ -451,7 +451,7 @@ { "cell_type": "code", "execution_count": null, - "id": "881650a3", + "id": "2078f892", "metadata": {}, "outputs": [], "source": [ @@ -461,7 +461,7 @@ }, { "cell_type": "markdown", - "id": "24a0a4c4", + "id": "8ea73b15", "metadata": {}, "source": [ "## ⏭️ Next steps\n", diff --git a/docs/concepts/models/inference-parameters.md b/docs/concepts/models/inference-parameters.md index a1ca2865..e0c3b678 100644 --- a/docs/concepts/models/inference-parameters.md +++ b/docs/concepts/models/inference-parameters.md @@ -153,7 +153,7 @@ The `ImageInferenceParams` class is used for image generation models, including ```python import data_designer.config as dd -# Diffusion model (e.g., DALLΒ·E, Stable Diffusion) +# Autoregressive model (chat completions API, supports image context) dd.ModelConfig( alias="image-model", model="black-forest-labs/flux.2-pro", @@ -163,7 +163,7 @@ dd.ModelConfig( ), ) -# OpenAI DALLΒ·E style +# Diffusion model (e.g., DALLΒ·E, Stable Diffusion) dd.ModelConfig( alias="dalle", model="dall-e-3", diff --git a/docs/notebook_source/6-editing-images-with-image-context.py b/docs/notebook_source/6-editing-images-with-image-context.py index a1bdca83..74e5d3d0 100644 --- a/docs/notebook_source/6-editing-images-with-image-context.py +++ b/docs/notebook_source/6-editing-images-with-image-context.py @@ -289,7 +289,7 @@ def display_before_after(row: pd.Series, index: int, base_path=None) -> None: # # %% -results = data_designer.create(config_builder, num_records=10, 
dataset_name="tutorial-6-edited-images") +results = data_designer.create(config_builder, num_records=5, dataset_name="tutorial-6-edited-images") # %% dataset = results.load_dataset() From b4c101b9872e474530010014dac3dfc941ca7c2f Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 2026 15:21:36 -0700 Subject: [PATCH 57/69] regen colab notebooks --- docs/colab_notebooks/1-the-basics.ipynb | 62 ++++++++--------- ...ctured-outputs-and-jinja-expressions.ipynb | 58 ++++++++-------- .../3-seeding-with-a-dataset.ipynb | 54 +++++++-------- .../4-providing-images-as-context.ipynb | 66 +++++++++---------- .../colab_notebooks/5-generating-images.ipynb | 42 ++++++------ .../6-editing-images-with-image-context.ipynb | 54 +++++++-------- 6 files changed, 168 insertions(+), 168 deletions(-) diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index dfdfd8bc..5db874b4 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "4564128b", + "id": "2a780d2f", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "344a8570", + "id": "abad0edc", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -26,7 +26,7 @@ }, { "cell_type": "markdown", - "id": "6326beb4", + "id": "20c9ce2d", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8c634dad", + "id": "94242a74", "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9d368dbc", + "id": "6c8ee9cd", "metadata": {}, "outputs": [], "source": [ @@ -66,7 +66,7 @@ { "cell_type": "code", "execution_count": null, - "id": "973c8fc2", + "id": "c0c489ab", "metadata": {}, "outputs": [], "source": [ @@ -76,7 +76,7 @@ }, { "cell_type": "markdown", - "id": "e57be919", + "id": 
"e654a809", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -89,7 +89,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6a86e940", + "id": "3eb3b635", "metadata": {}, "outputs": [], "source": [ @@ -98,7 +98,7 @@ }, { "cell_type": "markdown", - "id": "78030009", + "id": "e0efe23c", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -115,7 +115,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cf52af7b", + "id": "ae122c55", "metadata": {}, "outputs": [], "source": [ @@ -145,7 +145,7 @@ }, { "cell_type": "markdown", - "id": "e7a74cd6", + "id": "c60f5053", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -160,7 +160,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6d70702f", + "id": "15588980", "metadata": {}, "outputs": [], "source": [ @@ -169,7 +169,7 @@ }, { "cell_type": "markdown", - "id": "ffcd3ed6", + "id": "2cb13ebb", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -186,7 +186,7 @@ { "cell_type": "code", "execution_count": null, - "id": "65aa69d0", + "id": "1b35befd", "metadata": {}, "outputs": [], "source": [ @@ -195,7 +195,7 @@ }, { "cell_type": "markdown", - "id": "a801ba88", + "id": "5f0481d2", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -204,7 +204,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f929263c", + "id": "0d48fdf2", "metadata": {}, "outputs": [], "source": [ @@ -285,7 +285,7 @@ }, { "cell_type": "markdown", - "id": "df08b8d9", + "id": "4e31585b", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -294,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7c851e80", + "id": "d323959f", "metadata": {}, "outputs": [], "source": [ @@ -331,7 +331,7 @@ }, { "cell_type": "markdown", - "id": 
"0b44e1c6", + "id": "ebb70b7f", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "46c6b4ae", + "id": "bbd8772c", "metadata": {}, "outputs": [], "source": [ @@ -382,7 +382,7 @@ }, { "cell_type": "markdown", - "id": "250b79ee", + "id": "e80705c4", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -399,7 +399,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b50c18ee", + "id": "f18218cb", "metadata": {}, "outputs": [], "source": [ @@ -409,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a2aee2db", + "id": "503e7dbd", "metadata": {}, "outputs": [], "source": [ @@ -420,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6b3d0d41", + "id": "81753910", "metadata": {}, "outputs": [], "source": [ @@ -430,7 +430,7 @@ }, { "cell_type": "markdown", - "id": "6735018d", + "id": "a83b7fce", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -443,7 +443,7 @@ { "cell_type": "code", "execution_count": null, - "id": "02c3c44d", + "id": "76c8f996", "metadata": {}, "outputs": [], "source": [ @@ -453,7 +453,7 @@ }, { "cell_type": "markdown", - "id": "4982b32c", + "id": "f7d95599", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -466,7 +466,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e92044ed", + "id": "7b5affee", "metadata": {}, "outputs": [], "source": [ @@ -476,7 +476,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e2d9dd24", + "id": "91f0a46e", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b7c5efb2", + "id": "75f8df3a", "metadata": {}, "outputs": [], "source": [ @@ -501,7 +501,7 @@ }, { "cell_type": "markdown", - "id": "2abee6c6", + "id": "30f497e7", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb 
b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index 1f6e47d2..ec77a097 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "1b094815", + "id": "d5ef2760", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "3f25eef5", + "id": "5759706a", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "96eaa4d4", + "id": "84222b10", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4ec697f2", + "id": "b7ec2ab3", "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ { "cell_type": "code", "execution_count": null, - "id": "13fced47", + "id": "39e9b141", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "68f3bb75", + "id": "5c4c58b8", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "6af5525d", + "id": "d530a175", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5c9d7d4a", + "id": "60dfabbf", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "609bbbeb", + "id": "36e10380", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d9141cce", + "id": "9a45638e", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "68060d38", + "id": "45664121", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config 
Builder\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6a1a2a26", + "id": "34d3e48c", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "f2e0c8ac", + "id": "646aef9e", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -198,7 +198,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fc9db3ef", + "id": "09925e01", "metadata": {}, "outputs": [], "source": [ @@ -226,7 +226,7 @@ }, { "cell_type": "markdown", - "id": "264fd36b", + "id": "933c9a74", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" @@ -235,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1bb15cff", + "id": "08a2ae51", "metadata": {}, "outputs": [], "source": [ @@ -344,7 +344,7 @@ }, { "cell_type": "markdown", - "id": "b4938e8d", + "id": "62c579e9", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -361,7 +361,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7e7bccf7", + "id": "d06a4634", "metadata": {}, "outputs": [], "source": [ @@ -414,7 +414,7 @@ }, { "cell_type": "markdown", - "id": "0314f017", + "id": "2f550b1f", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -431,7 +431,7 @@ { "cell_type": "code", "execution_count": null, - "id": "87ea580f", + "id": "6f220a6d", "metadata": {}, "outputs": [], "source": [ @@ -441,7 +441,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1b41df79", + "id": "ee6a7ed1", "metadata": {}, "outputs": [], "source": [ @@ -452,7 +452,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6036e1e7", + "id": "b8795600", "metadata": {}, "outputs": [], "source": [ @@ -462,7 +462,7 @@ }, { "cell_type": "markdown", - "id": "cb565f07", + "id": "1e7aa322", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -475,7 +475,7 
@@ { "cell_type": "code", "execution_count": null, - "id": "5a29f960", + "id": "e0775906", "metadata": {}, "outputs": [], "source": [ @@ -485,7 +485,7 @@ }, { "cell_type": "markdown", - "id": "6066ce7a", + "id": "0ecba803", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -498,7 +498,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d942e256", + "id": "d83bff69", "metadata": {}, "outputs": [], "source": [ @@ -508,7 +508,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c6c62786", + "id": "3a2b5aa2", "metadata": {}, "outputs": [], "source": [ @@ -521,7 +521,7 @@ { "cell_type": "code", "execution_count": null, - "id": "18798a2d", + "id": "111f91d1", "metadata": {}, "outputs": [], "source": [ @@ -533,7 +533,7 @@ }, { "cell_type": "markdown", - "id": "8ebe8b5a", + "id": "192cc0ac", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index 3d971631..301698c4 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "bee396d8", + "id": "2a4ce855", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "936d3ab6", + "id": "2621a33b", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "137717e6", + "id": "d95050e8", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "49699e45", + "id": "36899290", "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ { "cell_type": "code", "execution_count": null, - "id": "094872a7", + "id": "21c1eb59", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": 
null, - "id": "8d0c4134", + "id": "a867d606", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "6dfb7395", + "id": "c283906d", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ef28093c", + "id": "03e84c09", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "107b97cc", + "id": "1c0b8dca", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b71f1585", + "id": "4ada8b35", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "60291b08", + "id": "a0db06b7", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "739bfed1", + "id": "3da725e1", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "c71aacf7", + "id": "8aac6dec", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -196,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1eddb6df", + "id": "a8b62bbf", "metadata": {}, "outputs": [], "source": [ @@ -214,7 +214,7 @@ }, { "cell_type": "markdown", - "id": "117a857a", + "id": "2282b5da", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -227,7 +227,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d78bab2f", + "id": "9b0bad97", "metadata": {}, "outputs": [], "source": [ @@ -308,7 +308,7 @@ }, { "cell_type": "markdown", - "id": "83ddf48e", + "id": "da7e91d5", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -325,7 +325,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5c1e63b7", + "id": "dbd6d41f", "metadata": {}, "outputs": [], 
"source": [ @@ -335,7 +335,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b10cfe15", + "id": "2101a979", "metadata": {}, "outputs": [], "source": [ @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fb6a9c67", + "id": "4e1ae0bd", "metadata": {}, "outputs": [], "source": [ @@ -356,7 +356,7 @@ }, { "cell_type": "markdown", - "id": "d1a37ddc", + "id": "103a73d3", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -369,7 +369,7 @@ { "cell_type": "code", "execution_count": null, - "id": "78e03230", + "id": "96deef26", "metadata": {}, "outputs": [], "source": [ @@ -379,7 +379,7 @@ }, { "cell_type": "markdown", - "id": "09a4f8f0", + "id": "b7e4b655", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -392,7 +392,7 @@ { "cell_type": "code", "execution_count": null, - "id": "78b36c93", + "id": "d5591790", "metadata": {}, "outputs": [], "source": [ @@ -402,7 +402,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9c139434", + "id": "c5586d2e", "metadata": {}, "outputs": [], "source": [ @@ -415,7 +415,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0b6bb2b7", + "id": "6089cb9b", "metadata": {}, "outputs": [], "source": [ @@ -427,7 +427,7 @@ }, { "cell_type": "markdown", - "id": "6928ebd2", + "id": "7abff8e9", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index be601b73..2a06053a 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "ab93e7b4", + "id": "a0298395", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "8face940", + "id": "d5c4d123", "metadata": {}, "source": [ "#### πŸ“š What you'll 
learn\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "9bf9571a", + "id": "eb1a1082", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -37,7 +37,7 @@ }, { "cell_type": "markdown", - "id": "fe337043", + "id": "71ef6e0a", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9ad14b95", + "id": "0d5a0078", "metadata": {}, "outputs": [], "source": [ @@ -59,7 +59,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a4919960", + "id": "ecf8e492", "metadata": {}, "outputs": [], "source": [ @@ -77,7 +77,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6608f2d0", + "id": "71efb9d0", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "50c00ad0", + "id": "e623bda8", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "11c57336", + "id": "106712e1", "metadata": {}, "outputs": [], "source": [ @@ -122,7 +122,7 @@ }, { "cell_type": "markdown", - "id": "8c5aec49", + "id": "7197bcae", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -139,7 +139,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f6428362", + "id": "ec6c1909", "metadata": {}, "outputs": [], "source": [ @@ -162,7 +162,7 @@ }, { "cell_type": "markdown", - "id": "319876a1", + "id": "06f060fd", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -177,7 +177,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c2f63f57", + "id": "b7cbb9cd", "metadata": {}, "outputs": [], "source": [ @@ -186,7 +186,7 @@ }, { "cell_type": "markdown", - "id": "a36d211d", + "id": "5c4a365b", "metadata": {}, "source": [ "### 🌱 Seed Dataset Creation\n", @@ -203,7 +203,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c5cb8bfc", + "id": "1cf5bb9f", "metadata": {}, 
"outputs": [], "source": [ @@ -218,7 +218,7 @@ { "cell_type": "code", "execution_count": null, - "id": "feb51ef5", + "id": "103b422b", "metadata": {}, "outputs": [], "source": [ @@ -266,7 +266,7 @@ { "cell_type": "code", "execution_count": null, - "id": "54198281", + "id": "5f852791", "metadata": {}, "outputs": [], "source": [ @@ -284,7 +284,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c011fdc6", + "id": "4f38e09c", "metadata": {}, "outputs": [], "source": [ @@ -294,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3b2a753f", + "id": "129965ee", "metadata": {}, "outputs": [], "source": [ @@ -306,7 +306,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f24ccc25", + "id": "976261b1", "metadata": {}, "outputs": [], "source": [ @@ -335,7 +335,7 @@ }, { "cell_type": "markdown", - "id": "9a20155a", + "id": "e3f514f3", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -352,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cf286a17", + "id": "77959e31", "metadata": {}, "outputs": [], "source": [ @@ -362,7 +362,7 @@ { "cell_type": "code", "execution_count": null, - "id": "672869f4", + "id": "63c47637", "metadata": {}, "outputs": [], "source": [ @@ -373,7 +373,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f43fcbc8", + "id": "273e62f9", "metadata": {}, "outputs": [], "source": [ @@ -383,7 +383,7 @@ }, { "cell_type": "markdown", - "id": "89aeb032", + "id": "e02290d5", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -396,7 +396,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9a962027", + "id": "57e8280d", "metadata": {}, "outputs": [], "source": [ @@ -406,7 +406,7 @@ }, { "cell_type": "markdown", - "id": "12b94d1b", + "id": "9859edde", "metadata": {}, "source": [ "### πŸ”Ž Visual Inspection\n", @@ -417,7 +417,7 @@ { "cell_type": "code", "execution_count": null, - "id": "db86a531", + "id": "b2fd573f", "metadata": { 
"lines_to_next_cell": 2 }, @@ -441,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "d5e21f9a", + "id": "4d3f92c0", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -454,7 +454,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c6703d4c", + "id": "f1df6b58", "metadata": {}, "outputs": [], "source": [ @@ -464,7 +464,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3df12871", + "id": "d7d29f83", "metadata": {}, "outputs": [], "source": [ @@ -477,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5af31687", + "id": "9f1c8f0b", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ }, { "cell_type": "markdown", - "id": "96274bf7", + "id": "c3a2fe29", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/5-generating-images.ipynb b/docs/colab_notebooks/5-generating-images.ipynb index f0aee607..98935849 100644 --- a/docs/colab_notebooks/5-generating-images.ipynb +++ b/docs/colab_notebooks/5-generating-images.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "aded0f63", + "id": "6d260a7e", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Generating Images\n", @@ -22,7 +22,7 @@ }, { "cell_type": "markdown", - "id": "3e3ad8e9", + "id": "f1bd7fd5", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -33,7 +33,7 @@ }, { "cell_type": "markdown", - "id": "56453c53", + "id": "50d5e8d1", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -44,7 +44,7 @@ { "cell_type": "code", "execution_count": null, - "id": "fdb7feac", + "id": "af19dbf4", "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e480f7f3", + "id": "f5be383f", "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ { "cell_type": "code", "execution_count": null, - "id": "db0349eb", + "id": "3fdc5e07", "metadata": {}, "outputs": [], "source": [ @@ -86,7 +86,7 @@ }, { "cell_type": "markdown", - "id": "11c68030", + "id": 
"c989cef5", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -97,7 +97,7 @@ { "cell_type": "code", "execution_count": null, - "id": "97569c84", + "id": "12e6309d", "metadata": {}, "outputs": [], "source": [ @@ -106,7 +106,7 @@ }, { "cell_type": "markdown", - "id": "496af921", + "id": "0ad4646f", "metadata": {}, "source": [ "### πŸŽ›οΈ Define an image-generation model\n", @@ -118,7 +118,7 @@ { "cell_type": "code", "execution_count": null, - "id": "422c1c49", + "id": "52780619", "metadata": {}, "outputs": [], "source": [ @@ -140,7 +140,7 @@ }, { "cell_type": "markdown", - "id": "8dbeee61", + "id": "aa0b4db5", "metadata": {}, "source": [ "### πŸ—οΈ Build the config: samplers + image column\n", @@ -151,7 +151,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a6549814", + "id": "f0ed0d94", "metadata": {}, "outputs": [], "source": [ @@ -324,7 +324,7 @@ }, { "cell_type": "markdown", - "id": "398efd02", + "id": "e4497637", "metadata": {}, "source": [ "### πŸ” Preview: images as base64\n", @@ -335,7 +335,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6fb9aa82", + "id": "9034bec7", "metadata": {}, "outputs": [], "source": [ @@ -345,7 +345,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6c2dde5a", + "id": "f4c2ba02", "metadata": {}, "outputs": [], "source": [ @@ -356,7 +356,7 @@ { "cell_type": "code", "execution_count": null, - "id": "63825adc", + "id": "81e20819", "metadata": {}, "outputs": [], "source": [ @@ -365,7 +365,7 @@ }, { "cell_type": "markdown", - "id": "bff94616", + "id": "334a62a5", "metadata": {}, "source": [ "### πŸ†™ Create: images saved to disk\n", @@ -376,7 +376,7 @@ { "cell_type": "code", "execution_count": null, - "id": "633ba7c9", + "id": "8e456be3", "metadata": {}, "outputs": [], "source": [ @@ -386,7 +386,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7315ad3c", + "id": "fb789f48", "metadata": {}, "outputs": [], "source": [ @@ -397,7 +397,7 @@ { "cell_type": 
"code", "execution_count": null, - "id": "15485604", + "id": "3f23ece3", "metadata": {}, "outputs": [], "source": [ @@ -413,7 +413,7 @@ }, { "cell_type": "markdown", - "id": "465e15e5", + "id": "f7b9a97e", "metadata": {}, "source": [ "## ⏭️ Next steps\n", diff --git a/docs/colab_notebooks/6-editing-images-with-image-context.ipynb b/docs/colab_notebooks/6-editing-images-with-image-context.ipynb index 2248e2a5..c7c74096 100644 --- a/docs/colab_notebooks/6-editing-images-with-image-context.ipynb +++ b/docs/colab_notebooks/6-editing-images-with-image-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "cbe28a17", + "id": "a7bc100e", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Image-to-Image Editing\n", @@ -23,7 +23,7 @@ }, { "cell_type": "markdown", - "id": "e81cf47f", + "id": "53d134dd", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -34,7 +34,7 @@ }, { "cell_type": "markdown", - "id": "7f7642a8", + "id": "4eec66ab", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -45,7 +45,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aee74ade", + "id": "ba921281", "metadata": {}, "outputs": [], "source": [ @@ -56,7 +56,7 @@ { "cell_type": "code", "execution_count": null, - "id": "055dce96", + "id": "0f5a3272", "metadata": {}, "outputs": [], "source": [ @@ -74,7 +74,7 @@ { "cell_type": "code", "execution_count": null, - "id": "19bad01e", + "id": "334a2d03", "metadata": {}, "outputs": [], "source": [ @@ -93,7 +93,7 @@ }, { "cell_type": "markdown", - "id": "cb2a7054", + "id": "bf64783d", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -104,7 +104,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9ce997e0", + "id": "5428071d", "metadata": {}, "outputs": [], "source": [ @@ -113,7 +113,7 @@ }, { "cell_type": "markdown", - "id": "1d4ba7fa", + "id": "e4ea842b", "metadata": {}, "source": [ "### πŸŽ›οΈ Define an image-editing model\n", @@ -129,7 +129,7 @@ { 
"cell_type": "code", "execution_count": null, - "id": "9dd9185e", + "id": "08f33076", "metadata": {}, "outputs": [], "source": [ @@ -151,7 +151,7 @@ }, { "cell_type": "markdown", - "id": "78d77137", + "id": "d4012e12", "metadata": {}, "source": [ "### 🌱 Load animal portraits from HuggingFace\n", @@ -164,7 +164,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7f2bd5a9", + "id": "6c090a1f", "metadata": {}, "outputs": [], "source": [ @@ -197,7 +197,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6ac94f71", + "id": "db2e5288", "metadata": {}, "outputs": [], "source": [ @@ -214,7 +214,7 @@ }, { "cell_type": "markdown", - "id": "06290ac8", + "id": "6b4bb935", "metadata": {}, "source": [ "### πŸ—οΈ Build the configuration\n", @@ -231,7 +231,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f23ee931", + "id": "058315e9", "metadata": {}, "outputs": [], "source": [ @@ -319,7 +319,7 @@ }, { "cell_type": "markdown", - "id": "9d824059", + "id": "07d4e2c3", "metadata": {}, "source": [ "### πŸ” Preview: quick iteration\n", @@ -330,7 +330,7 @@ { "cell_type": "code", "execution_count": null, - "id": "66e11057", + "id": "400d2468", "metadata": {}, "outputs": [], "source": [ @@ -340,7 +340,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f0a0965c", + "id": "8b1398cb", "metadata": {}, "outputs": [], "source": [ @@ -351,7 +351,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6e0b73d6", + "id": "7663308e", "metadata": {}, "outputs": [], "source": [ @@ -360,7 +360,7 @@ }, { "cell_type": "markdown", - "id": "e4fe0d30", + "id": "4aea373c", "metadata": { "lines_to_next_cell": 2 }, @@ -373,7 +373,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8a611c24", + "id": "46f36623", "metadata": {}, "outputs": [], "source": [ @@ -409,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "63a57b22", + "id": "994011a6", "metadata": {}, "outputs": [], "source": [ @@ -419,7 +419,7 @@ }, { "cell_type": 
"markdown", - "id": "f0a32128", + "id": "9760cce8", "metadata": {}, "source": [ "### πŸ†™ Create at scale\n", @@ -430,7 +430,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e4bba421", + "id": "9d151c05", "metadata": {}, "outputs": [], "source": [ @@ -440,7 +440,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3741c8cc", + "id": "3fec28b8", "metadata": {}, "outputs": [], "source": [ @@ -451,7 +451,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2078f892", + "id": "7238b03a", "metadata": {}, "outputs": [], "source": [ @@ -461,7 +461,7 @@ }, { "cell_type": "markdown", - "id": "8ea73b15", + "id": "bd12ed83", "metadata": {}, "source": [ "## ⏭️ Next steps\n", From 002d46bf85413fe0be6d3116c53ad1793d4084cc Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 2026 15:26:17 -0700 Subject: [PATCH 58/69] Use regex for base64 character validation in is_base64_image --- .../src/data_designer/config/utils/image_helpers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py index c20c81ea..69ee5310 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py +++ b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py @@ -7,6 +7,7 @@ import base64 import io +import re from pathlib import Path from typing import TYPE_CHECKING @@ -23,6 +24,8 @@ # WEBP uses RIFF header - handled separately } +_BASE64_PATTERN = re.compile(r"^[A-Za-z0-9+/=]+$") + # Patterns for diffusion-based image models only (use image_generation API). 
IMAGE_DIFFUSION_MODEL_PATTERNS = ( "dall-e", @@ -152,9 +155,7 @@ def is_base64_image(value: str) -> bool: if value.startswith("data:image/"): return True # Check if it looks like base64 (at least 100 chars, contains only base64 chars) - if len(value) > 100 and all( - c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" for c in value[:100] - ): + if len(value) > 100 and _BASE64_PATTERN.match(value[:100]): try: # Try to decode a small portion to verify it's valid base64 base64.b64decode(value[:100]) From a40f0ce27a83ddf6b720b85a81283b2b053db674 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 2026 15:38:46 -0700 Subject: [PATCH 59/69] move to a constant --- .../data_designer/config/utils/image_helpers.py | 17 ++++------------- .../tests/config/utils/test_image_helpers.py | 10 ---------- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py index 69ee5310..0fb949af 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py +++ b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py @@ -36,6 +36,8 @@ "imagen", ) +SUPPORTED_IMAGE_EXTENSIONS = [f".{fmt.value.lower()}" for fmt in ImageFormat] + def is_image_diffusion_model(model_name: str) -> bool: """Return True if the model is a diffusion-based image generation model. 
@@ -137,7 +139,7 @@ def is_image_path(value: str) -> bool: """ if not isinstance(value, str): return False - return any(value.lower().endswith(ext) for ext in get_supported_image_extensions()) + return any(value.lower().endswith(ext) for ext in SUPPORTED_IMAGE_EXTENSIONS) def is_base64_image(value: str) -> bool: @@ -176,9 +178,7 @@ def is_image_url(value: str) -> bool: """ if not isinstance(value, str): return False - return value.startswith(("http://", "https://")) and any( - ext in value.lower() for ext in get_supported_image_extensions() - ) + return value.startswith(("http://", "https://")) and any(ext in value.lower() for ext in SUPPORTED_IMAGE_EXTENSIONS) def load_image_path_to_base64(image_path: str, base_path: str | None = None) -> str | None: @@ -228,12 +228,3 @@ def validate_image(image_path: Path) -> None: img.verify() except Exception as e: raise ValueError(f"Image validation failed: {e}") from e - - -def get_supported_image_extensions() -> list[str]: - """Get list of supported image extensions from ImageFormat enum. 
- - Returns: - List of extensions with leading dot (e.g., [".png", ".jpg", ...]) - """ - return [f".{fmt.value}" for fmt in ImageFormat] diff --git a/packages/data-designer-config/tests/config/utils/test_image_helpers.py b/packages/data-designer-config/tests/config/utils/test_image_helpers.py index f24696e4..08ea3b50 100644 --- a/packages/data-designer-config/tests/config/utils/test_image_helpers.py +++ b/packages/data-designer-config/tests/config/utils/test_image_helpers.py @@ -14,7 +14,6 @@ decode_base64_image, detect_image_format, extract_base64_from_data_uri, - get_supported_image_extensions, is_base64_image, is_image_diffusion_model, is_image_path, @@ -204,15 +203,6 @@ def test_validate_image_nonexistent_raises_error(tmp_path): validate_image(image_path) -# Tests for get_supported_image_extensions - - -def test_get_supported_image_extensions_matches_enum(): - result = get_supported_image_extensions() - enum_values = [f".{fmt.value}" for fmt in ImageFormat] - assert set(result) == set(enum_values) - - # Additional tests for uncovered lines From cdcb2905ef5c5a633f0a27f3b67d6f2668adbf8e Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 2026 15:43:21 -0700 Subject: [PATCH 60/69] fix pyproject.toml --- pyproject.toml | 6 +++--- uv.lock | 35 +++++++++++++++++++++++------------ 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 35566648..99d7b78c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,11 +39,11 @@ dev-dependencies = [ [dependency-groups] dev = [ "jsonpath-ng>=1.5.3,<2", - "pytest>=8.3.3,<9", - "pytest-asyncio>=0.24.0,<1", + "pytest>=9.0.2,<10", + "pytest-asyncio>=1.3.0,<2", "pytest-cov>=7.0.0,<8", "pytest-env>=1.2.0,<2", - "pytest-httpx>=0.35.0,<1", + "pytest-httpx>=0.36.0,<1", "pre-commit>=4.0.0,<5", ] docs = [ diff --git a/uv.lock b/uv.lock index 17306f0e..200d0b12 100644 --- a/uv.lock +++ b/uv.lock @@ -308,6 +308,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" }, ] +[[package]] +name = "backports-asyncio-runner" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8e/ff/70dca7d7cb1cbc0edb2c6cc0c38b65cba36cccc491eca64cabd5fe7f8670/backports_asyncio_runner-1.2.0.tar.gz", hash = "sha256:a5aa7b2b7d8f8bfcaa2b57313f70792df84e32a2a746f585213373f900b42162", size = 69893, upload-time = "2025-07-02T02:27:15.685Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/59/76ab57e3fe74484f48a53f8e337171b4a2349e506eabe136d7e01d059086/backports_asyncio_runner-1.2.0-py3-none-any.whl", hash = "sha256:0da0a936a8aeb554eccb426dc55af3ba63bcdc69fa1a600b5bb305413a4477b5", size = 12313, upload-time = "2025-07-02T02:27:14.263Z" }, +] + [[package]] name = "backrefs" version = "6.1" @@ -905,11 +914,11 @@ recipes = [ dev = [ { name = "jsonpath-ng", specifier = ">=1.5.3,<2" }, { name = "pre-commit", specifier = ">=4.0.0,<5" }, - { name = "pytest", specifier = ">=8.3.3,<9" }, - { name = "pytest-asyncio", specifier = ">=0.24.0,<1" }, + { name = "pytest", specifier = ">=9.0.2,<10" }, + { name = "pytest-asyncio", specifier = ">=1.3.0,<2" }, { name = "pytest-cov", specifier = ">=7.0.0,<8" }, { name = "pytest-env", specifier = ">=1.2.0,<2" }, - { name = "pytest-httpx", specifier = ">=0.35.0,<1" }, + { name = "pytest-httpx", specifier = ">=0.36.0,<1" }, { name = "ruff", specifier = ">=0.14.10,<1" }, ] docs = [ @@ -3986,7 +3995,7 @@ wheels = [ [[package]] name = "pytest" -version = "8.4.2" +version = "9.0.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, @@ -3997,21 +4006,23 @@ dependencies = [ { name = "pygments" }, { 
name = "tomli", marker = "python_full_version < '3.11'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" } +sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" }, + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, ] [[package]] name = "pytest-asyncio" -version = "0.26.0" +version = "1.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "backports-asyncio-runner", marker = "python_full_version < '3.11'" }, { name = "pytest" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8e/c4/453c52c659521066969523e87d85d54139bbd17b78f09532fb8eb8cdb58e/pytest_asyncio-0.26.0.tar.gz", hash = "sha256:c4df2a697648241ff39e7f0e4a73050b03f123f760673956cf0d72a4990e312f", size = 54156, upload-time = "2025-03-25T06:22:28.883Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/90/2c/8af215c0f776415f3590cac4f9086ccefd6fd463befeae41cd4d3f193e5a/pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5", size = 50087, upload-time = "2025-11-10T16:07:47.256Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/20/7f/338843f449ace853647ace35870874f69a764d251872ed1b4de9f234822c/pytest_asyncio-0.26.0-py3-none-any.whl", hash = "sha256:7b51ed894f4fbea1340262bdae5135797ebbe21d8638978e35d31c6d19f72fb0", size = 19694, upload-time = "2025-03-25T06:22:27.807Z" }, + { url = "https://files.pythonhosted.org/packages/e5/35/f8b19922b6a25bc0880171a2f1a003eaeb93657475193ab516fd87cac9da/pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5", size = 15075, upload-time = "2025-11-10T16:07:45.537Z" }, ] [[package]] @@ -4043,15 +4054,15 @@ wheels = [ [[package]] name = "pytest-httpx" -version = "0.35.0" +version = "0.36.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "httpx" }, { name = "pytest" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/1f/89/5b12b7b29e3d0af3a4b9c071ee92fa25a9017453731a38f08ba01c280f4c/pytest_httpx-0.35.0.tar.gz", hash = "sha256:d619ad5d2e67734abfbb224c3d9025d64795d4b8711116b1a13f72a251ae511f", size = 54146, upload-time = "2024-11-28T19:16:54.237Z" } +sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/5574834da9499066fa1a5ea9c336f94dba2eae02298d36dab192fcf95c86/pytest_httpx-0.36.0.tar.gz", hash = "sha256:9edb66a5fd4388ce3c343189bc67e7e1cb50b07c2e3fc83b97d511975e8a831b", size = 56793, upload-time = "2025-12-02T16:34:57.414Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b0/ed/026d467c1853dd83102411a78126b4842618e86c895f93528b0528c7a620/pytest_httpx-0.35.0-py3-none-any.whl", hash = "sha256:ee11a00ffcea94a5cbff47af2114d34c5b231c326902458deed73f9c459fd744", size = 19442, upload-time = 
"2024-11-28T19:16:52.787Z" }, + { url = "https://files.pythonhosted.org/packages/e2/d2/1eb1ea9c84f0d2033eb0b49675afdc71aa4ea801b74615f00f3c33b725e3/pytest_httpx-0.36.0-py3-none-any.whl", hash = "sha256:bd4c120bb80e142df856e825ec9f17981effb84d159f9fa29ed97e2357c3a9c8", size = 20229, upload-time = "2025-12-02T16:34:56.45Z" }, ] [[package]] From ce81629d80aa7344be3ccf6b2ad716c9fd1d2122 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 2026 15:44:34 -0700 Subject: [PATCH 61/69] Regen colab notebooks --- docs/colab_notebooks/1-the-basics.ipynb | 62 ++++++++--------- ...ctured-outputs-and-jinja-expressions.ipynb | 58 ++++++++-------- .../3-seeding-with-a-dataset.ipynb | 54 +++++++-------- .../4-providing-images-as-context.ipynb | 66 +++++++++---------- .../colab_notebooks/5-generating-images.ipynb | 42 ++++++------ .../6-editing-images-with-image-context.ipynb | 54 +++++++-------- 6 files changed, 168 insertions(+), 168 deletions(-) diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index 5db874b4..393a78ed 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "2a780d2f", + "id": "2b30d465", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "abad0edc", + "id": "d67efcf1", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -26,7 +26,7 @@ }, { "cell_type": "markdown", - "id": "20c9ce2d", + "id": "1f162636", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "94242a74", + "id": "f10185fd", "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6c8ee9cd", + "id": "ecd07400", "metadata": {}, "outputs": [], "source": [ @@ -66,7 +66,7 @@ { "cell_type": "code", "execution_count": null, - 
"id": "c0c489ab", + "id": "bddcb5b3", "metadata": {}, "outputs": [], "source": [ @@ -76,7 +76,7 @@ }, { "cell_type": "markdown", - "id": "e654a809", + "id": "fb66bfc4", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -89,7 +89,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3eb3b635", + "id": "b7b5c6bb", "metadata": {}, "outputs": [], "source": [ @@ -98,7 +98,7 @@ }, { "cell_type": "markdown", - "id": "e0efe23c", + "id": "a6ce6733", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -115,7 +115,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ae122c55", + "id": "c5ab5438", "metadata": {}, "outputs": [], "source": [ @@ -145,7 +145,7 @@ }, { "cell_type": "markdown", - "id": "c60f5053", + "id": "e7bd6270", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -160,7 +160,7 @@ { "cell_type": "code", "execution_count": null, - "id": "15588980", + "id": "d0f5c573", "metadata": {}, "outputs": [], "source": [ @@ -169,7 +169,7 @@ }, { "cell_type": "markdown", - "id": "2cb13ebb", + "id": "17e984fc", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -186,7 +186,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1b35befd", + "id": "f3fe1027", "metadata": {}, "outputs": [], "source": [ @@ -195,7 +195,7 @@ }, { "cell_type": "markdown", - "id": "5f0481d2", + "id": "452e768a", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -204,7 +204,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0d48fdf2", + "id": "39f52373", "metadata": {}, "outputs": [], "source": [ @@ -285,7 +285,7 @@ }, { "cell_type": "markdown", - "id": "4e31585b", + "id": "fb0d9a58", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -294,7 +294,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "d323959f", + "id": "c3d3cfe2", "metadata": {}, "outputs": [], "source": [ @@ -331,7 +331,7 @@ }, { "cell_type": "markdown", - "id": "ebb70b7f", + "id": "8da18b06", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bbd8772c", + "id": "cab13413", "metadata": {}, "outputs": [], "source": [ @@ -382,7 +382,7 @@ }, { "cell_type": "markdown", - "id": "e80705c4", + "id": "3d6c310d", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -399,7 +399,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f18218cb", + "id": "ce196951", "metadata": {}, "outputs": [], "source": [ @@ -409,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "503e7dbd", + "id": "6ac6670c", "metadata": {}, "outputs": [], "source": [ @@ -420,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - "id": "81753910", + "id": "e91fce39", "metadata": {}, "outputs": [], "source": [ @@ -430,7 +430,7 @@ }, { "cell_type": "markdown", - "id": "a83b7fce", + "id": "4408b92c", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -443,7 +443,7 @@ { "cell_type": "code", "execution_count": null, - "id": "76c8f996", + "id": "c6359b29", "metadata": {}, "outputs": [], "source": [ @@ -453,7 +453,7 @@ }, { "cell_type": "markdown", - "id": "f7d95599", + "id": "a4d95d20", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -466,7 +466,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7b5affee", + "id": "9074a41d", "metadata": {}, "outputs": [], "source": [ @@ -476,7 +476,7 @@ { "cell_type": "code", "execution_count": null, - "id": "91f0a46e", + "id": "85a7e682", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ { "cell_type": "code", "execution_count": null, - "id": "75f8df3a", + "id": "6ce84541", "metadata": {}, "outputs": [], "source": [ @@ -501,7 +501,7 @@ }, { "cell_type": "markdown", - "id": 
"30f497e7", + "id": "34129353", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index ec77a097..8babf691 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "d5ef2760", + "id": "bca2d0a2", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "5759706a", + "id": "922bc2aa", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "84222b10", + "id": "ac6b7964", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b7ec2ab3", + "id": "1dc91238", "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ { "cell_type": "code", "execution_count": null, - "id": "39e9b141", + "id": "7a4f803b", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5c4c58b8", + "id": "60ff5954", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "d530a175", + "id": "12f93822", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": null, - "id": "60dfabbf", + "id": "19e3c523", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "36e10380", + "id": "518f6549", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9a45638e", + "id": "5938258a", "metadata": {}, "outputs": [], "source": [ @@ -147,7 
+147,7 @@ }, { "cell_type": "markdown", - "id": "45664121", + "id": "5daf7f4d", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "34d3e48c", + "id": "b1462e2d", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "646aef9e", + "id": "31d3f6ae", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -198,7 +198,7 @@ { "cell_type": "code", "execution_count": null, - "id": "09925e01", + "id": "905182e0", "metadata": {}, "outputs": [], "source": [ @@ -226,7 +226,7 @@ }, { "cell_type": "markdown", - "id": "933c9a74", + "id": "8c7ba890", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" @@ -235,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "08a2ae51", + "id": "b918e153", "metadata": {}, "outputs": [], "source": [ @@ -344,7 +344,7 @@ }, { "cell_type": "markdown", - "id": "62c579e9", + "id": "31e7f5bc", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -361,7 +361,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d06a4634", + "id": "1387a3e4", "metadata": {}, "outputs": [], "source": [ @@ -414,7 +414,7 @@ }, { "cell_type": "markdown", - "id": "2f550b1f", + "id": "b6a7999e", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -431,7 +431,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6f220a6d", + "id": "37913463", "metadata": {}, "outputs": [], "source": [ @@ -441,7 +441,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ee6a7ed1", + "id": "b68999df", "metadata": {}, "outputs": [], "source": [ @@ -452,7 +452,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b8795600", + "id": "cc307cba", "metadata": {}, "outputs": [], "source": [ @@ -462,7 +462,7 @@ 
}, { "cell_type": "markdown", - "id": "1e7aa322", + "id": "c6826ef2", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -475,7 +475,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e0775906", + "id": "32ac395f", "metadata": {}, "outputs": [], "source": [ @@ -485,7 +485,7 @@ }, { "cell_type": "markdown", - "id": "0ecba803", + "id": "3ead7c70", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -498,7 +498,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d83bff69", + "id": "74640a2d", "metadata": {}, "outputs": [], "source": [ @@ -508,7 +508,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3a2b5aa2", + "id": "267581cc", "metadata": {}, "outputs": [], "source": [ @@ -521,7 +521,7 @@ { "cell_type": "code", "execution_count": null, - "id": "111f91d1", + "id": "29293f47", "metadata": {}, "outputs": [], "source": [ @@ -533,7 +533,7 @@ }, { "cell_type": "markdown", - "id": "192cc0ac", + "id": "e548aec4", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index 301698c4..1ca3260b 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "2a4ce855", + "id": "0923db0d", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "2621a33b", + "id": "381e81d6", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "d95050e8", + "id": "2c654163", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "36899290", + "id": "4bb11baa", "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "21c1eb59", + "id": "5afbe143", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a867d606", + "id": "4b17708e", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "c283906d", + "id": "40746f63", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": null, - "id": "03e84c09", + "id": "be1416b5", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "1c0b8dca", + "id": "559ed0b9", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4ada8b35", + "id": "239e4a87", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "a0db06b7", + "id": "2c6f30ea", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3da725e1", + "id": "f34d41df", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "8aac6dec", + "id": "228d05c9", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -196,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a8b62bbf", + "id": "3a2510af", "metadata": {}, "outputs": [], "source": [ @@ -214,7 +214,7 @@ }, { "cell_type": "markdown", - "id": "2282b5da", + "id": "b0e75210", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -227,7 +227,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9b0bad97", + "id": "c5fecff8", "metadata": {}, "outputs": [], "source": [ @@ -308,7 +308,7 @@ }, { "cell_type": "markdown", - "id": "da7e91d5", + "id": "279822e4", "metadata": {}, "source": [ "### πŸ” Iteration is key – 
preview the dataset!\n", @@ -325,7 +325,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dbd6d41f", + "id": "03df9ac3", "metadata": {}, "outputs": [], "source": [ @@ -335,7 +335,7 @@ { "cell_type": "code", "execution_count": null, - "id": "2101a979", + "id": "1b617cc0", "metadata": {}, "outputs": [], "source": [ @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4e1ae0bd", + "id": "516b7366", "metadata": {}, "outputs": [], "source": [ @@ -356,7 +356,7 @@ }, { "cell_type": "markdown", - "id": "103a73d3", + "id": "d55e8a60", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -369,7 +369,7 @@ { "cell_type": "code", "execution_count": null, - "id": "96deef26", + "id": "3fbca1bf", "metadata": {}, "outputs": [], "source": [ @@ -379,7 +379,7 @@ }, { "cell_type": "markdown", - "id": "b7e4b655", + "id": "2305d4cd", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -392,7 +392,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d5591790", + "id": "f033e562", "metadata": {}, "outputs": [], "source": [ @@ -402,7 +402,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c5586d2e", + "id": "1f1d0be6", "metadata": {}, "outputs": [], "source": [ @@ -415,7 +415,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6089cb9b", + "id": "872c9d42", "metadata": {}, "outputs": [], "source": [ @@ -427,7 +427,7 @@ }, { "cell_type": "markdown", - "id": "7abff8e9", + "id": "f0c405cd", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index 2a06053a..60393a30 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "a0298395", + "id": "687338dc", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for 
Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": "markdown", - "id": "d5c4d123", + "id": "1dbe71dc", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "eb1a1082", + "id": "afb936fb", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -37,7 +37,7 @@ }, { "cell_type": "markdown", - "id": "71ef6e0a", + "id": "f9e3d848", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0d5a0078", + "id": "c2eed28f", "metadata": {}, "outputs": [], "source": [ @@ -59,7 +59,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ecf8e492", + "id": "e9a6811f", "metadata": {}, "outputs": [], "source": [ @@ -77,7 +77,7 @@ { "cell_type": "code", "execution_count": null, - "id": "71efb9d0", + "id": "62a7df74", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "e623bda8", + "id": "e5001d0a", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "106712e1", + "id": "49ac8032", "metadata": {}, "outputs": [], "source": [ @@ -122,7 +122,7 @@ }, { "cell_type": "markdown", - "id": "7197bcae", + "id": "11a65d27", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -139,7 +139,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ec6c1909", + "id": "5c0df695", "metadata": {}, "outputs": [], "source": [ @@ -162,7 +162,7 @@ }, { "cell_type": "markdown", - "id": "06f060fd", + "id": "0afd8032", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -177,7 +177,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b7cbb9cd", + "id": "af331a96", "metadata": {}, "outputs": [], "source": [ @@ -186,7 +186,7 @@ }, { "cell_type": "markdown", - "id": "5c4a365b", + "id": "45f30b6a", "metadata": {}, "source": 
[ "### 🌱 Seed Dataset Creation\n", @@ -203,7 +203,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1cf5bb9f", + "id": "33a7fa8c", "metadata": {}, "outputs": [], "source": [ @@ -218,7 +218,7 @@ { "cell_type": "code", "execution_count": null, - "id": "103b422b", + "id": "43af7d08", "metadata": {}, "outputs": [], "source": [ @@ -266,7 +266,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5f852791", + "id": "dabe974d", "metadata": {}, "outputs": [], "source": [ @@ -284,7 +284,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4f38e09c", + "id": "366c425e", "metadata": {}, "outputs": [], "source": [ @@ -294,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "129965ee", + "id": "efd7ce5e", "metadata": {}, "outputs": [], "source": [ @@ -306,7 +306,7 @@ { "cell_type": "code", "execution_count": null, - "id": "976261b1", + "id": "0cee47cd", "metadata": {}, "outputs": [], "source": [ @@ -335,7 +335,7 @@ }, { "cell_type": "markdown", - "id": "e3f514f3", + "id": "d98a6f30", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -352,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "77959e31", + "id": "984357eb", "metadata": {}, "outputs": [], "source": [ @@ -362,7 +362,7 @@ { "cell_type": "code", "execution_count": null, - "id": "63c47637", + "id": "dff4335d", "metadata": {}, "outputs": [], "source": [ @@ -373,7 +373,7 @@ { "cell_type": "code", "execution_count": null, - "id": "273e62f9", + "id": "69aa1319", "metadata": {}, "outputs": [], "source": [ @@ -383,7 +383,7 @@ }, { "cell_type": "markdown", - "id": "e02290d5", + "id": "2b16f8a3", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -396,7 +396,7 @@ { "cell_type": "code", "execution_count": null, - "id": "57e8280d", + "id": "71f8147c", "metadata": {}, "outputs": [], "source": [ @@ -406,7 +406,7 @@ }, { "cell_type": "markdown", - "id": "9859edde", + "id": "7c649cdc", "metadata": {}, "source": [ "### 
πŸ”Ž Visual Inspection\n", @@ -417,7 +417,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b2fd573f", + "id": "0ad9eff2", "metadata": { "lines_to_next_cell": 2 }, @@ -441,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "4d3f92c0", + "id": "2f28d65f", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -454,7 +454,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f1df6b58", + "id": "9477140d", "metadata": {}, "outputs": [], "source": [ @@ -464,7 +464,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d7d29f83", + "id": "d6482d90", "metadata": {}, "outputs": [], "source": [ @@ -477,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9f1c8f0b", + "id": "2a122796", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ }, { "cell_type": "markdown", - "id": "c3a2fe29", + "id": "319dc353", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/5-generating-images.ipynb b/docs/colab_notebooks/5-generating-images.ipynb index 98935849..14d3af1a 100644 --- a/docs/colab_notebooks/5-generating-images.ipynb +++ b/docs/colab_notebooks/5-generating-images.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "6d260a7e", + "id": "95313826", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Generating Images\n", @@ -22,7 +22,7 @@ }, { "cell_type": "markdown", - "id": "f1bd7fd5", + "id": "0f901286", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -33,7 +33,7 @@ }, { "cell_type": "markdown", - "id": "50d5e8d1", + "id": "916450a9", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -44,7 +44,7 @@ { "cell_type": "code", "execution_count": null, - "id": "af19dbf4", + "id": "f4e44c23", "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f5be383f", + "id": "ad0afbf1", "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ { "cell_type": "code", "execution_count": null, - "id": 
"3fdc5e07", + "id": "c81b3bfc", "metadata": {}, "outputs": [], "source": [ @@ -86,7 +86,7 @@ }, { "cell_type": "markdown", - "id": "c989cef5", + "id": "86b43b8c", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -97,7 +97,7 @@ { "cell_type": "code", "execution_count": null, - "id": "12e6309d", + "id": "85d2a838", "metadata": {}, "outputs": [], "source": [ @@ -106,7 +106,7 @@ }, { "cell_type": "markdown", - "id": "0ad4646f", + "id": "f5c2a4f6", "metadata": {}, "source": [ "### πŸŽ›οΈ Define an image-generation model\n", @@ -118,7 +118,7 @@ { "cell_type": "code", "execution_count": null, - "id": "52780619", + "id": "65f21c6d", "metadata": {}, "outputs": [], "source": [ @@ -140,7 +140,7 @@ }, { "cell_type": "markdown", - "id": "aa0b4db5", + "id": "a2e1eba4", "metadata": {}, "source": [ "### πŸ—οΈ Build the config: samplers + image column\n", @@ -151,7 +151,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f0ed0d94", + "id": "0a9a00ea", "metadata": {}, "outputs": [], "source": [ @@ -324,7 +324,7 @@ }, { "cell_type": "markdown", - "id": "e4497637", + "id": "6837d438", "metadata": {}, "source": [ "### πŸ” Preview: images as base64\n", @@ -335,7 +335,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9034bec7", + "id": "f0649099", "metadata": {}, "outputs": [], "source": [ @@ -345,7 +345,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f4c2ba02", + "id": "5f6651e6", "metadata": {}, "outputs": [], "source": [ @@ -356,7 +356,7 @@ { "cell_type": "code", "execution_count": null, - "id": "81e20819", + "id": "09b4776c", "metadata": {}, "outputs": [], "source": [ @@ -365,7 +365,7 @@ }, { "cell_type": "markdown", - "id": "334a62a5", + "id": "e90de058", "metadata": {}, "source": [ "### πŸ†™ Create: images saved to disk\n", @@ -376,7 +376,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8e456be3", + "id": "57342468", "metadata": {}, "outputs": [], "source": [ @@ -386,7 +386,7 @@ { "cell_type": 
"code", "execution_count": null, - "id": "fb789f48", + "id": "1935fe08", "metadata": {}, "outputs": [], "source": [ @@ -397,7 +397,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3f23ece3", + "id": "cff790dc", "metadata": {}, "outputs": [], "source": [ @@ -413,7 +413,7 @@ }, { "cell_type": "markdown", - "id": "f7b9a97e", + "id": "454656ff", "metadata": {}, "source": [ "## ⏭️ Next steps\n", diff --git a/docs/colab_notebooks/6-editing-images-with-image-context.ipynb b/docs/colab_notebooks/6-editing-images-with-image-context.ipynb index c7c74096..8123004d 100644 --- a/docs/colab_notebooks/6-editing-images-with-image-context.ipynb +++ b/docs/colab_notebooks/6-editing-images-with-image-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "a7bc100e", + "id": "8b931a1d", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Image-to-Image Editing\n", @@ -23,7 +23,7 @@ }, { "cell_type": "markdown", - "id": "53d134dd", + "id": "af5d52b2", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -34,7 +34,7 @@ }, { "cell_type": "markdown", - "id": "4eec66ab", + "id": "2136d619", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -45,7 +45,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ba921281", + "id": "3a66800a", "metadata": {}, "outputs": [], "source": [ @@ -56,7 +56,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0f5a3272", + "id": "43747c26", "metadata": {}, "outputs": [], "source": [ @@ -74,7 +74,7 @@ { "cell_type": "code", "execution_count": null, - "id": "334a2d03", + "id": "a45dd4bd", "metadata": {}, "outputs": [], "source": [ @@ -93,7 +93,7 @@ }, { "cell_type": "markdown", - "id": "bf64783d", + "id": "c6faa0a1", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -104,7 +104,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5428071d", + "id": "25067233", "metadata": {}, "outputs": [], "source": [ @@ -113,7 +113,7 @@ }, { "cell_type": 
"markdown", - "id": "e4ea842b", + "id": "f1ffc440", "metadata": {}, "source": [ "### πŸŽ›οΈ Define an image-editing model\n", @@ -129,7 +129,7 @@ { "cell_type": "code", "execution_count": null, - "id": "08f33076", + "id": "70a0a808", "metadata": {}, "outputs": [], "source": [ @@ -151,7 +151,7 @@ }, { "cell_type": "markdown", - "id": "d4012e12", + "id": "76fee54e", "metadata": {}, "source": [ "### 🌱 Load animal portraits from HuggingFace\n", @@ -164,7 +164,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6c090a1f", + "id": "d160e916", "metadata": {}, "outputs": [], "source": [ @@ -197,7 +197,7 @@ { "cell_type": "code", "execution_count": null, - "id": "db2e5288", + "id": "2a758d0f", "metadata": {}, "outputs": [], "source": [ @@ -214,7 +214,7 @@ }, { "cell_type": "markdown", - "id": "6b4bb935", + "id": "5ca6c91c", "metadata": {}, "source": [ "### πŸ—οΈ Build the configuration\n", @@ -231,7 +231,7 @@ { "cell_type": "code", "execution_count": null, - "id": "058315e9", + "id": "63718a69", "metadata": {}, "outputs": [], "source": [ @@ -319,7 +319,7 @@ }, { "cell_type": "markdown", - "id": "07d4e2c3", + "id": "14cfba63", "metadata": {}, "source": [ "### πŸ” Preview: quick iteration\n", @@ -330,7 +330,7 @@ { "cell_type": "code", "execution_count": null, - "id": "400d2468", + "id": "988167fa", "metadata": {}, "outputs": [], "source": [ @@ -340,7 +340,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8b1398cb", + "id": "a2d5d5c3", "metadata": {}, "outputs": [], "source": [ @@ -351,7 +351,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7663308e", + "id": "fd14f3e1", "metadata": {}, "outputs": [], "source": [ @@ -360,7 +360,7 @@ }, { "cell_type": "markdown", - "id": "4aea373c", + "id": "fe5a4001", "metadata": { "lines_to_next_cell": 2 }, @@ -373,7 +373,7 @@ { "cell_type": "code", "execution_count": null, - "id": "46f36623", + "id": "eb707b0c", "metadata": {}, "outputs": [], "source": [ @@ -409,7 +409,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "994011a6", + "id": "cec58c9f", "metadata": {}, "outputs": [], "source": [ @@ -419,7 +419,7 @@ }, { "cell_type": "markdown", - "id": "9760cce8", + "id": "66187e01", "metadata": {}, "source": [ "### πŸ†™ Create at scale\n", @@ -430,7 +430,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9d151c05", + "id": "9c6c36a8", "metadata": {}, "outputs": [], "source": [ @@ -440,7 +440,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3fec28b8", + "id": "a4746440", "metadata": {}, "outputs": [], "source": [ @@ -451,7 +451,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7238b03a", + "id": "61a9a0d2", "metadata": {}, "outputs": [], "source": [ @@ -461,7 +461,7 @@ }, { "cell_type": "markdown", - "id": "bd12ed83", + "id": "b0ee31b3", "metadata": {}, "source": [ "## ⏭️ Next steps\n", From 5aa7e109faab4889e0fb5b1fa6121033caefd565 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 2026 15:26:17 -0700 Subject: [PATCH 62/69] Use regex for base64 character validation in is_base64_image --- .../src/data_designer/config/utils/image_helpers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py index c20c81ea..69ee5310 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py +++ b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py @@ -7,6 +7,7 @@ import base64 import io +import re from pathlib import Path from typing import TYPE_CHECKING @@ -23,6 +24,8 @@ # WEBP uses RIFF header - handled separately } +_BASE64_PATTERN = re.compile(r"^[A-Za-z0-9+/=]+$") + # Patterns for diffusion-based image models only (use image_generation API). 
IMAGE_DIFFUSION_MODEL_PATTERNS = ( "dall-e", @@ -152,9 +155,7 @@ def is_base64_image(value: str) -> bool: if value.startswith("data:image/"): return True # Check if it looks like base64 (at least 100 chars, contains only base64 chars) - if len(value) > 100 and all( - c in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=" for c in value[:100] - ): + if len(value) > 100 and _BASE64_PATTERN.match(value[:100]): try: # Try to decode a small portion to verify it's valid base64 base64.b64decode(value[:100]) From ecaeb727b239427a63c0aaaf178145fbf321bf7c Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 2026 15:38:46 -0700 Subject: [PATCH 63/69] move to a constant --- .../data_designer/config/utils/image_helpers.py | 17 ++++------------- .../tests/config/utils/test_image_helpers.py | 10 ---------- 2 files changed, 4 insertions(+), 23 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py index 69ee5310..0fb949af 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py +++ b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py @@ -36,6 +36,8 @@ "imagen", ) +SUPPORTED_IMAGE_EXTENSIONS = [f".{fmt.value.lower()}" for fmt in ImageFormat] + def is_image_diffusion_model(model_name: str) -> bool: """Return True if the model is a diffusion-based image generation model. 
@@ -137,7 +139,7 @@ def is_image_path(value: str) -> bool: """ if not isinstance(value, str): return False - return any(value.lower().endswith(ext) for ext in get_supported_image_extensions()) + return any(value.lower().endswith(ext) for ext in SUPPORTED_IMAGE_EXTENSIONS) def is_base64_image(value: str) -> bool: @@ -176,9 +178,7 @@ def is_image_url(value: str) -> bool: """ if not isinstance(value, str): return False - return value.startswith(("http://", "https://")) and any( - ext in value.lower() for ext in get_supported_image_extensions() - ) + return value.startswith(("http://", "https://")) and any(ext in value.lower() for ext in SUPPORTED_IMAGE_EXTENSIONS) def load_image_path_to_base64(image_path: str, base_path: str | None = None) -> str | None: @@ -228,12 +228,3 @@ def validate_image(image_path: Path) -> None: img.verify() except Exception as e: raise ValueError(f"Image validation failed: {e}") from e - - -def get_supported_image_extensions() -> list[str]: - """Get list of supported image extensions from ImageFormat enum. 
- - Returns: - List of extensions with leading dot (e.g., [".png", ".jpg", ...]) - """ - return [f".{fmt.value}" for fmt in ImageFormat] diff --git a/packages/data-designer-config/tests/config/utils/test_image_helpers.py b/packages/data-designer-config/tests/config/utils/test_image_helpers.py index f24696e4..08ea3b50 100644 --- a/packages/data-designer-config/tests/config/utils/test_image_helpers.py +++ b/packages/data-designer-config/tests/config/utils/test_image_helpers.py @@ -14,7 +14,6 @@ decode_base64_image, detect_image_format, extract_base64_from_data_uri, - get_supported_image_extensions, is_base64_image, is_image_diffusion_model, is_image_path, @@ -204,15 +203,6 @@ def test_validate_image_nonexistent_raises_error(tmp_path): validate_image(image_path) -# Tests for get_supported_image_extensions - - -def test_get_supported_image_extensions_matches_enum(): - result = get_supported_image_extensions() - enum_values = [f".{fmt.value}" for fmt in ImageFormat] - assert set(result) == set(enum_values) - - # Additional tests for uncovered lines From 622b1c4f75013d184fefb46da95e2bf5a69a285c Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 2026 15:43:21 -0700 Subject: [PATCH 64/69] fix pyproject.toml --- pyproject.toml | 6 +++--- uv.lock | 35 +++++++++++++++++++++++------------ 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 35566648..99d7b78c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,11 +39,11 @@ dev-dependencies = [ [dependency-groups] dev = [ "jsonpath-ng>=1.5.3,<2", - "pytest>=8.3.3,<9", - "pytest-asyncio>=0.24.0,<1", + "pytest>=9.0.2,<10", + "pytest-asyncio>=1.3.0,<2", "pytest-cov>=7.0.0,<8", "pytest-env>=1.2.0,<2", - "pytest-httpx>=0.35.0,<1", + "pytest-httpx>=0.36.0,<1", "pre-commit>=4.0.0,<5", ] docs = [ diff --git a/uv.lock b/uv.lock index 17306f0e..200d0b12 100644 --- a/uv.lock +++ b/uv.lock @@ -308,6 +308,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/b7/b8/3fe70c75fe32afc4bb507f75563d39bc5642255d1d94f1f23604725780bf/babel-2.17.0-py3-none-any.whl", hash = "sha256:4d0b53093fdfb4b21c92b5213dba5a1b23885afa8383709427046b21c366e5f2", size = 10182537, upload-time = "2025-02-01T15:17:37.39Z" }, ] +[[package]] +name = "backports-asyncio-runner" +version = "1.2.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/8e/ff/70dca7d7cb1cbc0edb2c6cc0c38b65cba36cccc491eca64cabd5fe7f8670/backports_asyncio_runner-1.2.0.tar.gz", hash = "sha256:a5aa7b2b7d8f8bfcaa2b57313f70792df84e32a2a746f585213373f900b42162", size = 69893, upload-time = "2025-07-02T02:27:15.685Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/59/76ab57e3fe74484f48a53f8e337171b4a2349e506eabe136d7e01d059086/backports_asyncio_runner-1.2.0-py3-none-any.whl", hash = "sha256:0da0a936a8aeb554eccb426dc55af3ba63bcdc69fa1a600b5bb305413a4477b5", size = 12313, upload-time = "2025-07-02T02:27:14.263Z" }, +] + [[package]] name = "backrefs" version = "6.1" @@ -905,11 +914,11 @@ recipes = [ dev = [ { name = "jsonpath-ng", specifier = ">=1.5.3,<2" }, { name = "pre-commit", specifier = ">=4.0.0,<5" }, - { name = "pytest", specifier = ">=8.3.3,<9" }, - { name = "pytest-asyncio", specifier = ">=0.24.0,<1" }, + { name = "pytest", specifier = ">=9.0.2,<10" }, + { name = "pytest-asyncio", specifier = ">=1.3.0,<2" }, { name = "pytest-cov", specifier = ">=7.0.0,<8" }, { name = "pytest-env", specifier = ">=1.2.0,<2" }, - { name = "pytest-httpx", specifier = ">=0.35.0,<1" }, + { name = "pytest-httpx", specifier = ">=0.36.0,<1" }, { name = "ruff", specifier = ">=0.14.10,<1" }, ] docs = [ @@ -3986,7 +3995,7 @@ wheels = [ [[package]] name = "pytest" -version = "8.4.2" +version = "9.0.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, @@ -3997,21 +4006,23 @@ dependencies = [ { name = "pygments" }, { 
name = "tomli", marker = "python_full_version < '3.11'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a3/5c/00a0e072241553e1a7496d638deababa67c5058571567b92a7eaa258397c/pytest-8.4.2.tar.gz", hash = "sha256:86c0d0b93306b961d58d62a4db4879f27fe25513d4b969df351abdddb3c30e01", size = 1519618, upload-time = "2025-09-04T14:34:22.711Z" } +sdist = { url = "https://files.pythonhosted.org/packages/d1/db/7ef3487e0fb0049ddb5ce41d3a49c235bf9ad299b6a25d5780a89f19230f/pytest-9.0.2.tar.gz", hash = "sha256:75186651a92bd89611d1d9fc20f0b4345fd827c41ccd5c299a868a05d70edf11", size = 1568901, upload-time = "2025-12-06T21:30:51.014Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a8/a4/20da314d277121d6534b3a980b29035dcd51e6744bd79075a6ce8fa4eb8d/pytest-8.4.2-py3-none-any.whl", hash = "sha256:872f880de3fc3a5bdc88a11b39c9710c3497a547cfa9320bc3c5e62fbf272e79", size = 365750, upload-time = "2025-09-04T14:34:20.226Z" }, + { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, ] [[package]] name = "pytest-asyncio" -version = "0.26.0" +version = "1.3.0" source = { registry = "https://pypi.org/simple" } dependencies = [ + { name = "backports-asyncio-runner", marker = "python_full_version < '3.11'" }, { name = "pytest" }, + { name = "typing-extensions", marker = "python_full_version < '3.13'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8e/c4/453c52c659521066969523e87d85d54139bbd17b78f09532fb8eb8cdb58e/pytest_asyncio-0.26.0.tar.gz", hash = "sha256:c4df2a697648241ff39e7f0e4a73050b03f123f760673956cf0d72a4990e312f", size = 54156, upload-time = "2025-03-25T06:22:28.883Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/90/2c/8af215c0f776415f3590cac4f9086ccefd6fd463befeae41cd4d3f193e5a/pytest_asyncio-1.3.0.tar.gz", hash = "sha256:d7f52f36d231b80ee124cd216ffb19369aa168fc10095013c6b014a34d3ee9e5", size = 50087, upload-time = "2025-11-10T16:07:47.256Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/20/7f/338843f449ace853647ace35870874f69a764d251872ed1b4de9f234822c/pytest_asyncio-0.26.0-py3-none-any.whl", hash = "sha256:7b51ed894f4fbea1340262bdae5135797ebbe21d8638978e35d31c6d19f72fb0", size = 19694, upload-time = "2025-03-25T06:22:27.807Z" }, + { url = "https://files.pythonhosted.org/packages/e5/35/f8b19922b6a25bc0880171a2f1a003eaeb93657475193ab516fd87cac9da/pytest_asyncio-1.3.0-py3-none-any.whl", hash = "sha256:611e26147c7f77640e6d0a92a38ed17c3e9848063698d5c93d5aa7aa11cebff5", size = 15075, upload-time = "2025-11-10T16:07:45.537Z" }, ] [[package]] @@ -4043,15 +4054,15 @@ wheels = [ [[package]] name = "pytest-httpx" -version = "0.35.0" +version = "0.36.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "httpx" }, { name = "pytest" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/1f/89/5b12b7b29e3d0af3a4b9c071ee92fa25a9017453731a38f08ba01c280f4c/pytest_httpx-0.35.0.tar.gz", hash = "sha256:d619ad5d2e67734abfbb224c3d9025d64795d4b8711116b1a13f72a251ae511f", size = 54146, upload-time = "2024-11-28T19:16:54.237Z" } +sdist = { url = "https://files.pythonhosted.org/packages/ca/bc/5574834da9499066fa1a5ea9c336f94dba2eae02298d36dab192fcf95c86/pytest_httpx-0.36.0.tar.gz", hash = "sha256:9edb66a5fd4388ce3c343189bc67e7e1cb50b07c2e3fc83b97d511975e8a831b", size = 56793, upload-time = "2025-12-02T16:34:57.414Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/b0/ed/026d467c1853dd83102411a78126b4842618e86c895f93528b0528c7a620/pytest_httpx-0.35.0-py3-none-any.whl", hash = "sha256:ee11a00ffcea94a5cbff47af2114d34c5b231c326902458deed73f9c459fd744", size = 19442, upload-time = 
"2024-11-28T19:16:52.787Z" }, + { url = "https://files.pythonhosted.org/packages/e2/d2/1eb1ea9c84f0d2033eb0b49675afdc71aa4ea801b74615f00f3c33b725e3/pytest_httpx-0.36.0-py3-none-any.whl", hash = "sha256:bd4c120bb80e142df856e825ec9f17981effb84d159f9fa29ed97e2357c3a9c8", size = 20229, upload-time = "2025-12-02T16:34:56.45Z" }, ] [[package]] From 400e97b55a555c6fc788245c38862599a29b3189 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 2026 15:50:57 -0700 Subject: [PATCH 65/69] regen colab notebooks --- docs/colab_notebooks/1-the-basics.ipynb | 62 ++++++++--------- ...ctured-outputs-and-jinja-expressions.ipynb | 58 ++++++++-------- .../3-seeding-with-a-dataset.ipynb | 54 +++++++-------- .../4-providing-images-as-context.ipynb | 66 +++++++++---------- .../colab_notebooks/5-generating-images.ipynb | 44 ++++++------- 5 files changed, 142 insertions(+), 142 deletions(-) diff --git a/docs/colab_notebooks/1-the-basics.ipynb b/docs/colab_notebooks/1-the-basics.ipynb index ed8942df..f50209f7 100644 --- a/docs/colab_notebooks/1-the-basics.ipynb +++ b/docs/colab_notebooks/1-the-basics.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "945eebf8", + "id": "96178d08", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: The Basics\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "8e8f2e22", + "id": "1d02a1d6", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -26,7 +26,7 @@ }, { "cell_type": "markdown", - "id": "92d91bf1", + "id": "2292d817", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0b9b4427", + "id": "8af621fc", "metadata": {}, "outputs": [], "source": [ @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8878d172", + "id": "70e6a11c", "metadata": {}, "outputs": [], "source": [ @@ -66,7 +66,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4c92bfb3", + "id": "41031828", "metadata": {}, "outputs": [], 
"source": [ @@ -76,7 +76,7 @@ }, { "cell_type": "markdown", - "id": "4e39eed1", + "id": "0b480b10", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -89,7 +89,7 @@ { "cell_type": "code", "execution_count": null, - "id": "70c96cfb", + "id": "d434a8e2", "metadata": {}, "outputs": [], "source": [ @@ -98,7 +98,7 @@ }, { "cell_type": "markdown", - "id": "99d975c9", + "id": "f88f6792", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -115,7 +115,7 @@ { "cell_type": "code", "execution_count": null, - "id": "851228c8", + "id": "4261574c", "metadata": {}, "outputs": [], "source": [ @@ -145,7 +145,7 @@ }, { "cell_type": "markdown", - "id": "fefb639d", + "id": "bbbc3d58", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -160,7 +160,7 @@ { "cell_type": "code", "execution_count": null, - "id": "0ba52672", + "id": "92c0cf35", "metadata": {}, "outputs": [], "source": [ @@ -169,7 +169,7 @@ }, { "cell_type": "markdown", - "id": "7cc2aefc", + "id": "44246c7d", "metadata": {}, "source": [ "## 🎲 Getting started with sampler columns\n", @@ -186,7 +186,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a5a34b1a", + "id": "07d20f3f", "metadata": {}, "outputs": [], "source": [ @@ -195,7 +195,7 @@ }, { "cell_type": "markdown", - "id": "ee4d1b6a", + "id": "9d3c87b0", "metadata": {}, "source": [ "Let's start designing our product review dataset by adding product category and subcategory columns.\n" @@ -204,7 +204,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7782d790", + "id": "c646b021", "metadata": {}, "outputs": [], "source": [ @@ -285,7 +285,7 @@ }, { "cell_type": "markdown", - "id": "f88e8b18", + "id": "ff18b032", "metadata": {}, "source": [ "Next, let's add samplers to generate data related to the customer and their review.\n" @@ -294,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "19174a73", + "id": "78846d99", "metadata": {}, 
"outputs": [], "source": [ @@ -331,7 +331,7 @@ }, { "cell_type": "markdown", - "id": "01438115", + "id": "97059bfc", "metadata": {}, "source": [ "## 🦜 LLM-generated columns\n", @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9c8f1275", + "id": "98c66eff", "metadata": {}, "outputs": [], "source": [ @@ -382,7 +382,7 @@ }, { "cell_type": "markdown", - "id": "f61e3771", + "id": "ff2d52b9", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -399,7 +399,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7f8dc56e", + "id": "6e622478", "metadata": {}, "outputs": [], "source": [ @@ -409,7 +409,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5b66172a", + "id": "1addc7d8", "metadata": {}, "outputs": [], "source": [ @@ -420,7 +420,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b0eaa931", + "id": "7af4b9c3", "metadata": {}, "outputs": [], "source": [ @@ -430,7 +430,7 @@ }, { "cell_type": "markdown", - "id": "122d099d", + "id": "91d0ee89", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -443,7 +443,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f40f7ba0", + "id": "e1e3aed0", "metadata": {}, "outputs": [], "source": [ @@ -453,7 +453,7 @@ }, { "cell_type": "markdown", - "id": "597c41ec", + "id": "6eaa402e", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -466,7 +466,7 @@ { "cell_type": "code", "execution_count": null, - "id": "acf8caa3", + "id": "f6b148d4", "metadata": {}, "outputs": [], "source": [ @@ -476,7 +476,7 @@ { "cell_type": "code", "execution_count": null, - "id": "697e9090", + "id": "f4e62e5b", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ { "cell_type": "code", "execution_count": null, - "id": "18f34e66", + "id": "7d426ab0", "metadata": {}, "outputs": [], "source": [ @@ -501,7 +501,7 @@ }, { "cell_type": "markdown", - "id": "4c498f62", + "id": "449d003c", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", 
diff --git a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb index 49be6edb..a6e04680 100644 --- a/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb +++ b/docs/colab_notebooks/2-structured-outputs-and-jinja-expressions.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "bd333de9", + "id": "ba22504d", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Structured Outputs and Jinja Expressions\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "28fb2ee3", + "id": "c176fe63", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "fbeb3b2d", + "id": "32c80f72", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6ef3d2ae", + "id": "4ab45e3a", "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ { "cell_type": "code", "execution_count": null, - "id": "07546806", + "id": "2ae70d67", "metadata": {}, "outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "81b00725", + "id": "2cdc070b", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "a5cf694f", + "id": "a04261b9", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": null, - "id": "8320e2b0", + "id": "c8bef18a", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "348e2c5a", + "id": "ed555636", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "21019fc5", + "id": "47208094", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "7bf9d9af", + "id": "36c200d9", 
"metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "88abb685", + "id": "57c0d82f", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "d8e790c6", + "id": "01ff63ca", "metadata": {}, "source": [ "### πŸ§‘β€πŸŽ¨ Designing our data\n", @@ -198,7 +198,7 @@ { "cell_type": "code", "execution_count": null, - "id": "64465ab1", + "id": "4fb0f1ca", "metadata": {}, "outputs": [], "source": [ @@ -226,7 +226,7 @@ }, { "cell_type": "markdown", - "id": "cfbad124", + "id": "8f35bd87", "metadata": {}, "source": [ "Next, let's design our product review dataset using a few more tricks compared to the previous notebook.\n" @@ -235,7 +235,7 @@ { "cell_type": "code", "execution_count": null, - "id": "aa93a4c9", + "id": "43341f16", "metadata": {}, "outputs": [], "source": [ @@ -344,7 +344,7 @@ }, { "cell_type": "markdown", - "id": "74aa72fc", + "id": "34c3e08b", "metadata": {}, "source": [ "Next, we will use more advanced Jinja expressions to create new columns.\n", @@ -361,7 +361,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9ae978cc", + "id": "c168c089", "metadata": {}, "outputs": [], "source": [ @@ -414,7 +414,7 @@ }, { "cell_type": "markdown", - "id": "ec850f14", + "id": "7e6521a2", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -431,7 +431,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cb18575e", + "id": "03510f78", "metadata": {}, "outputs": [], "source": [ @@ -441,7 +441,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eee46dc6", + "id": "ad599c43", "metadata": {}, "outputs": [], "source": [ @@ -452,7 +452,7 @@ { "cell_type": "code", "execution_count": null, - "id": "082d0fc4", + "id": "dbd3e17c", "metadata": {}, "outputs": [], "source": [ @@ -462,7 +462,7 @@ }, { "cell_type": "markdown", - "id": "e8d80b94", + "id": "4db52c26", 
"metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -475,7 +475,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4b0a7299", + "id": "f1007ac4", "metadata": {}, "outputs": [], "source": [ @@ -485,7 +485,7 @@ }, { "cell_type": "markdown", - "id": "d7e0c925", + "id": "dcd68de4", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -498,7 +498,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b599d759", + "id": "27b6bfe8", "metadata": {}, "outputs": [], "source": [ @@ -508,7 +508,7 @@ { "cell_type": "code", "execution_count": null, - "id": "07a7c0da", + "id": "d4e9a395", "metadata": {}, "outputs": [], "source": [ @@ -521,7 +521,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7760dffa", + "id": "946b3aa8", "metadata": {}, "outputs": [], "source": [ @@ -533,7 +533,7 @@ }, { "cell_type": "markdown", - "id": "6d19000a", + "id": "f50d996e", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb index 468aa795..639e88df 100644 --- a/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb +++ b/docs/colab_notebooks/3-seeding-with-a-dataset.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "573c3e7b", + "id": "25501772", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Seeding Synthetic Data Generation with an External Dataset\n", @@ -16,7 +16,7 @@ }, { "cell_type": "markdown", - "id": "63f6c36d", + "id": "67ffc49e", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -28,7 +28,7 @@ }, { "cell_type": "markdown", - "id": "02cc81c7", + "id": "54a42504", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -39,7 +39,7 @@ { "cell_type": "code", "execution_count": null, - "id": "18d51631", + "id": "05b45354", "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ { "cell_type": "code", "execution_count": null, - "id": "67c55f6b", + "id": "039360fe", "metadata": {}, 
"outputs": [], "source": [ @@ -68,7 +68,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cfe2ff62", + "id": "028d5e8a", "metadata": {}, "outputs": [], "source": [ @@ -78,7 +78,7 @@ }, { "cell_type": "markdown", - "id": "bdbc5b03", + "id": "15a1df61", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": null, - "id": "55d9caf1", + "id": "a87b6ff6", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "aa1623bc", + "id": "b9166cfd", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -117,7 +117,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9d1310cf", + "id": "4961d3b0", "metadata": {}, "outputs": [], "source": [ @@ -147,7 +147,7 @@ }, { "cell_type": "markdown", - "id": "e64ce3b7", + "id": "b1d8588a", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -162,7 +162,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dafd6155", + "id": "cf42a4dd", "metadata": {}, "outputs": [], "source": [ @@ -171,7 +171,7 @@ }, { "cell_type": "markdown", - "id": "7c01f11c", + "id": "8d6b26aa", "metadata": {}, "source": [ "## πŸ₯ Prepare a seed dataset\n", @@ -196,7 +196,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7941073f", + "id": "fc90401d", "metadata": {}, "outputs": [], "source": [ @@ -214,7 +214,7 @@ }, { "cell_type": "markdown", - "id": "a68c7d55", + "id": "6f5ee960", "metadata": {}, "source": [ "## 🎨 Designing our synthetic patient notes dataset\n", @@ -227,7 +227,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f1b3d4d4", + "id": "e9db2ff0", "metadata": {}, "outputs": [], "source": [ @@ -308,7 +308,7 @@ }, { "cell_type": "markdown", - "id": "eff1bf9f", + "id": "00efc894", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -325,7 +325,7 @@ { "cell_type": "code", 
"execution_count": null, - "id": "b5955230", + "id": "3e3d824e", "metadata": {}, "outputs": [], "source": [ @@ -335,7 +335,7 @@ { "cell_type": "code", "execution_count": null, - "id": "062a7294", + "id": "27785af7", "metadata": {}, "outputs": [], "source": [ @@ -346,7 +346,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6378e1be", + "id": "430998d1", "metadata": {}, "outputs": [], "source": [ @@ -356,7 +356,7 @@ }, { "cell_type": "markdown", - "id": "51e5175e", + "id": "dda6458b", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -369,7 +369,7 @@ { "cell_type": "code", "execution_count": null, - "id": "891b6860", + "id": "f45bc088", "metadata": {}, "outputs": [], "source": [ @@ -379,7 +379,7 @@ }, { "cell_type": "markdown", - "id": "0f52668f", + "id": "1e913fd8", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -392,7 +392,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ed083bd8", + "id": "30b8b7f7", "metadata": {}, "outputs": [], "source": [ @@ -402,7 +402,7 @@ { "cell_type": "code", "execution_count": null, - "id": "039c42e4", + "id": "b7ff96d1", "metadata": {}, "outputs": [], "source": [ @@ -415,7 +415,7 @@ { "cell_type": "code", "execution_count": null, - "id": "623ca205", + "id": "dbfef8a8", "metadata": {}, "outputs": [], "source": [ @@ -427,7 +427,7 @@ }, { "cell_type": "markdown", - "id": "0a7e7d42", + "id": "5db3f38d", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/4-providing-images-as-context.ipynb b/docs/colab_notebooks/4-providing-images-as-context.ipynb index 62ac63e8..9797695e 100644 --- a/docs/colab_notebooks/4-providing-images-as-context.ipynb +++ b/docs/colab_notebooks/4-providing-images-as-context.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "731384ed", + "id": "19e57933", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Providing Images as Context for Vision-Based Data Generation" @@ -10,7 +10,7 @@ }, { "cell_type": 
"markdown", - "id": "bc66dd23", + "id": "25e3cc64", "metadata": {}, "source": [ "#### πŸ“š What you'll learn\n", @@ -25,7 +25,7 @@ }, { "cell_type": "markdown", - "id": "4539a931", + "id": "4aae5c82", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -37,7 +37,7 @@ }, { "cell_type": "markdown", - "id": "f88809bf", + "id": "24dfae6c", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -48,7 +48,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3628d4c4", + "id": "619b1aae", "metadata": {}, "outputs": [], "source": [ @@ -59,7 +59,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7fcf0f75", + "id": "0d49a542", "metadata": {}, "outputs": [], "source": [ @@ -77,7 +77,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6654714a", + "id": "1b28f160", "metadata": {}, "outputs": [], "source": [ @@ -100,7 +100,7 @@ }, { "cell_type": "markdown", - "id": "22488cb7", + "id": "63dc34de", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -113,7 +113,7 @@ { "cell_type": "code", "execution_count": null, - "id": "39913ca0", + "id": "672155c8", "metadata": {}, "outputs": [], "source": [ @@ -122,7 +122,7 @@ }, { "cell_type": "markdown", - "id": "fba112ab", + "id": "4b32c25e", "metadata": {}, "source": [ "### πŸŽ›οΈ Define model configurations\n", @@ -139,7 +139,7 @@ { "cell_type": "code", "execution_count": null, - "id": "70fd86dd", + "id": "72971915", "metadata": {}, "outputs": [], "source": [ @@ -162,7 +162,7 @@ }, { "cell_type": "markdown", - "id": "810c7457", + "id": "115ad20f", "metadata": {}, "source": [ "### πŸ—οΈ Initialize the Data Designer Config Builder\n", @@ -177,7 +177,7 @@ { "cell_type": "code", "execution_count": null, - "id": "9b2204d0", + "id": "11e844d2", "metadata": {}, "outputs": [], "source": [ @@ -186,7 +186,7 @@ }, { "cell_type": "markdown", - "id": "29e3dae5", + "id": "77862fce", "metadata": {}, "source": [ "### 🌱 Seed Dataset Creation\n", @@ -203,7 +203,7 @@ { 
"cell_type": "code", "execution_count": null, - "id": "e2cc3506", + "id": "e415a502", "metadata": {}, "outputs": [], "source": [ @@ -218,7 +218,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7a821067", + "id": "335f2611", "metadata": {}, "outputs": [], "source": [ @@ -266,7 +266,7 @@ { "cell_type": "code", "execution_count": null, - "id": "359d144b", + "id": "f055e88d", "metadata": {}, "outputs": [], "source": [ @@ -284,7 +284,7 @@ { "cell_type": "code", "execution_count": null, - "id": "985cd308", + "id": "47a1c586", "metadata": {}, "outputs": [], "source": [ @@ -294,7 +294,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6a8cb414", + "id": "3a77fc52", "metadata": {}, "outputs": [], "source": [ @@ -306,7 +306,7 @@ { "cell_type": "code", "execution_count": null, - "id": "a57e1b73", + "id": "c0941cc7", "metadata": {}, "outputs": [], "source": [ @@ -335,7 +335,7 @@ }, { "cell_type": "markdown", - "id": "7518100a", + "id": "578e77dc", "metadata": {}, "source": [ "### πŸ” Iteration is key – preview the dataset!\n", @@ -352,7 +352,7 @@ { "cell_type": "code", "execution_count": null, - "id": "4c1fe540", + "id": "9f0c11ce", "metadata": {}, "outputs": [], "source": [ @@ -362,7 +362,7 @@ { "cell_type": "code", "execution_count": null, - "id": "bceafe91", + "id": "b10412c1", "metadata": {}, "outputs": [], "source": [ @@ -373,7 +373,7 @@ { "cell_type": "code", "execution_count": null, - "id": "20f4ace5", + "id": "766ee2d7", "metadata": {}, "outputs": [], "source": [ @@ -383,7 +383,7 @@ }, { "cell_type": "markdown", - "id": "16a86d56", + "id": "6370bfa5", "metadata": {}, "source": [ "### πŸ“Š Analyze the generated data\n", @@ -396,7 +396,7 @@ { "cell_type": "code", "execution_count": null, - "id": "c1bbae97", + "id": "d57ded0e", "metadata": {}, "outputs": [], "source": [ @@ -406,7 +406,7 @@ }, { "cell_type": "markdown", - "id": "d8d7604f", + "id": "5afd8e8c", "metadata": {}, "source": [ "### πŸ”Ž Visual Inspection\n", @@ -417,7 +417,7 @@ { 
"cell_type": "code", "execution_count": null, - "id": "27c0636c", + "id": "aa4bfcc3", "metadata": { "lines_to_next_cell": 2 }, @@ -441,7 +441,7 @@ }, { "cell_type": "markdown", - "id": "f6b99539", + "id": "4eeaada6", "metadata": {}, "source": [ "### πŸ†™ Scale up!\n", @@ -454,7 +454,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e5d53787", + "id": "0ee5b1b9", "metadata": {}, "outputs": [], "source": [ @@ -464,7 +464,7 @@ { "cell_type": "code", "execution_count": null, - "id": "1f859e49", + "id": "e5e8b241", "metadata": {}, "outputs": [], "source": [ @@ -477,7 +477,7 @@ { "cell_type": "code", "execution_count": null, - "id": "6688e3c5", + "id": "23ebb3ca", "metadata": {}, "outputs": [], "source": [ @@ -489,7 +489,7 @@ }, { "cell_type": "markdown", - "id": "28635b09", + "id": "14a78533", "metadata": {}, "source": [ "## ⏭️ Next Steps\n", diff --git a/docs/colab_notebooks/5-generating-images.ipynb b/docs/colab_notebooks/5-generating-images.ipynb index 485fe258..c8092938 100644 --- a/docs/colab_notebooks/5-generating-images.ipynb +++ b/docs/colab_notebooks/5-generating-images.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "0ee289e6", + "id": "735e6197", "metadata": {}, "source": [ "# 🎨 Data Designer Tutorial: Generating Images\n", @@ -22,7 +22,7 @@ }, { "cell_type": "markdown", - "id": "86f748c1", + "id": "92ae4afe", "metadata": {}, "source": [ "### πŸ“¦ Import Data Designer\n", @@ -33,7 +33,7 @@ }, { "cell_type": "markdown", - "id": "c610ee22", + "id": "ccc77347", "metadata": {}, "source": [ "### ⚑ Colab Setup\n", @@ -44,7 +44,7 @@ { "cell_type": "code", "execution_count": null, - "id": "818ca495", + "id": "23627c23", "metadata": {}, "outputs": [], "source": [ @@ -55,7 +55,7 @@ { "cell_type": "code", "execution_count": null, - "id": "f165bb15", + "id": "bf958dc6", "metadata": {}, "outputs": [], "source": [ @@ -73,7 +73,7 @@ { "cell_type": "code", "execution_count": null, - "id": "5decfc83", + "id": "ab0cfff8", "metadata": {}, 
"outputs": [], "source": [ @@ -86,7 +86,7 @@ }, { "cell_type": "markdown", - "id": "929f35d6", + "id": "a18ef5ce", "metadata": {}, "source": [ "### βš™οΈ Initialize the Data Designer interface\n", @@ -97,7 +97,7 @@ { "cell_type": "code", "execution_count": null, - "id": "b4c8b7d7", + "id": "5fe11301", "metadata": {}, "outputs": [], "source": [ @@ -106,7 +106,7 @@ }, { "cell_type": "markdown", - "id": "8ed7b0b6", + "id": "b913d454", "metadata": {}, "source": [ "### πŸŽ›οΈ Define an image-generation model\n", @@ -118,7 +118,7 @@ { "cell_type": "code", "execution_count": null, - "id": "d6b1ca66", + "id": "a50d26ee", "metadata": {}, "outputs": [], "source": [ @@ -140,7 +140,7 @@ }, { "cell_type": "markdown", - "id": "498cfecf", + "id": "122374d9", "metadata": {}, "source": [ "### πŸ—οΈ Build the config: samplers + image column\n", @@ -151,7 +151,7 @@ { "cell_type": "code", "execution_count": null, - "id": "e74fc7ab", + "id": "940f2b70", "metadata": {}, "outputs": [], "source": [ @@ -324,7 +324,7 @@ }, { "cell_type": "markdown", - "id": "c592c820", + "id": "e13e0bb4", "metadata": {}, "source": [ "### πŸ” Preview: images as base64\n", @@ -335,7 +335,7 @@ { "cell_type": "code", "execution_count": null, - "id": "eee17bb1", + "id": "2a60a76f", "metadata": {}, "outputs": [], "source": [ @@ -345,7 +345,7 @@ { "cell_type": "code", "execution_count": null, - "id": "3cd320cc", + "id": "3c831ee8", "metadata": {}, "outputs": [], "source": [ @@ -356,7 +356,7 @@ { "cell_type": "code", "execution_count": null, - "id": "ffb5e188", + "id": "143e762f", "metadata": {}, "outputs": [], "source": [ @@ -365,7 +365,7 @@ }, { "cell_type": "markdown", - "id": "87b83328", + "id": "a84606b4", "metadata": {}, "source": [ "### πŸ†™ Create: images saved to disk\n", @@ -376,17 +376,17 @@ { "cell_type": "code", "execution_count": null, - "id": "a8f9cc41", + "id": "89147954", "metadata": {}, "outputs": [], "source": [ - "results = data_designer.create(config_builder, num_records=5, 
dataset_name=\"tutorial-5-images\")" + "results = data_designer.create(config_builder, num_records=2, dataset_name=\"tutorial-5-images\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "0d4453e5", + "id": "04c96063", "metadata": {}, "outputs": [], "source": [ @@ -397,7 +397,7 @@ { "cell_type": "code", "execution_count": null, - "id": "198301ab", + "id": "edb794bb", "metadata": {}, "outputs": [], "source": [ @@ -413,7 +413,7 @@ }, { "cell_type": "markdown", - "id": "2bdcef2b", + "id": "e0a72bf6", "metadata": {}, "source": [ "## ⏭️ Next steps\n", From 469a3d295fd24afbce9179e24889ff4947c95c6d Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 10 Feb 2026 16:01:01 -0700 Subject: [PATCH 66/69] raise a ValueError if we fail to detect image format --- .../config/utils/image_helpers.py | 28 +++++++++++++++---- .../tests/config/utils/test_image_helpers.py | 17 +++++------ .../engine/storage/test_media_storage.py | 2 +- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py index 0fb949af..c91974d8 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py +++ b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py @@ -21,9 +21,20 @@ IMAGE_FORMAT_MAGIC_BYTES = { ImageFormat.PNG: b"\x89PNG\r\n\x1a\n", ImageFormat.JPG: b"\xff\xd8\xff", + ImageFormat.GIF: b"GIF8", # WEBP uses RIFF header - handled separately } +# Maps PIL format name (lowercase) to our ImageFormat enum. +# PIL reports "JPEG" (not "JPG"), so we normalize it here. 
+_PIL_FORMAT_TO_IMAGE_FORMAT: dict[str, ImageFormat] = { + "png": ImageFormat.PNG, + "jpeg": ImageFormat.JPG, + "jpg": ImageFormat.JPG, + "gif": ImageFormat.GIF, + "webp": ImageFormat.WEBP, +} + _BASE64_PATTERN = re.compile(r"^[A-Za-z0-9+/=]+$") # Patterns for diffusion-based image models only (use image_generation API). @@ -105,13 +116,18 @@ def detect_image_format(image_bytes: bytes) -> ImageFormat: image_bytes: Image data as bytes Returns: - Detected format (defaults to PNG if unknown) + Detected ImageFormat + + Raises: + ValueError: If the image format cannot be determined """ # Check magic bytes first (fast) if image_bytes.startswith(IMAGE_FORMAT_MAGIC_BYTES[ImageFormat.PNG]): return ImageFormat.PNG elif image_bytes.startswith(IMAGE_FORMAT_MAGIC_BYTES[ImageFormat.JPG]): return ImageFormat.JPG + elif image_bytes.startswith(IMAGE_FORMAT_MAGIC_BYTES[ImageFormat.GIF]): + return ImageFormat.GIF elif image_bytes.startswith(b"RIFF") and b"WEBP" in image_bytes[:12]: return ImageFormat.WEBP @@ -119,13 +135,15 @@ def detect_image_format(image_bytes: bytes) -> ImageFormat: try: img = Image.open(io.BytesIO(image_bytes)) format_str = img.format.lower() if img.format else None - if format_str in [fmt.value for fmt in ImageFormat]: - return ImageFormat(format_str if format_str != ImageFormat.JPEG.value else ImageFormat.JPG.value) + if format_str in _PIL_FORMAT_TO_IMAGE_FORMAT: + return _PIL_FORMAT_TO_IMAGE_FORMAT[format_str] except Exception: pass - # Default to PNG - return ImageFormat.PNG + raise ValueError( + f"Unable to detect image format (first 8 bytes: {image_bytes[:8]!r}). " + f"Supported formats: {', '.join(SUPPORTED_IMAGE_EXTENSIONS)}." 
+ ) def is_image_path(value: str) -> bool: diff --git a/packages/data-designer-config/tests/config/utils/test_image_helpers.py b/packages/data-designer-config/tests/config/utils/test_image_helpers.py index 08ea3b50..fe2f40b7 100644 --- a/packages/data-designer-config/tests/config/utils/test_image_helpers.py +++ b/packages/data-designer-config/tests/config/utils/test_image_helpers.py @@ -84,9 +84,10 @@ def test_detect_image_format_webp(): assert detect_image_format(webp_magic) == ImageFormat.WEBP -def test_detect_image_format_unknown_defaults_to_png(): +def test_detect_image_format_unknown_raises_error(): unknown_bytes = b"\x00\x00\x00\x00" + b"\x00" * 10 - assert detect_image_format(unknown_bytes) == ImageFormat.PNG + with pytest.raises(ValueError, match="Unable to detect image format"): + detect_image_format(unknown_bytes) # Tests for is_image_path @@ -206,31 +207,27 @@ def test_validate_image_nonexistent_raises_error(tmp_path): # Additional tests for uncovered lines -def test_detect_image_format_with_pil_fallback_unsupported_format(tmp_path): - # Create a real GIF image that will trigger PIL fallback - # (GIF has different magic bytes not in our fast-path detection) +def test_detect_image_format_gif_magic_bytes(tmp_path): + # GIF files start with "GIF87a" or "GIF89a" and are now detected via magic bytes img = Image.new("RGB", (1, 1), color="red") gif_path = tmp_path / "test.gif" img.save(gif_path, format="GIF") gif_bytes = gif_path.read_bytes() - # Should use PIL fallback and correctly detect GIF format result = detect_image_format(gif_bytes) assert result == ImageFormat.GIF def test_detect_image_format_with_pil_fallback_jpeg(): - # Test PIL fallback path that converts "jpeg" format string to JPG enum - # Use mock since we can't easily create valid JPEG bytes without magic bytes + # Test PIL fallback path that normalizes "jpeg" -> JPG enum mock_img = Mock() mock_img.format = "JPEG" - # Use bytes that don't match our magic bytes to trigger PIL fallback + # Use 
bytes that don't match any magic bytes to trigger PIL fallback test_bytes = b"\x00\x00\x00\x00" with patch.object(Image, "open", return_value=mock_img): result = detect_image_format(test_bytes) - # Should convert JPEG -> JPG via line 96 assert result == ImageFormat.JPG diff --git a/packages/data-designer-engine/tests/engine/storage/test_media_storage.py b/packages/data-designer-engine/tests/engine/storage/test_media_storage.py index 9d74734a..2e690fb4 100644 --- a/packages/data-designer-engine/tests/engine/storage/test_media_storage.py +++ b/packages/data-designer-engine/tests/engine/storage/test_media_storage.py @@ -140,7 +140,7 @@ def test_save_base64_image_disk_mode_corrupted_image_raises_error(tmp_path): corrupted_bytes = b"not a valid image" corrupted_base64 = base64.b64encode(corrupted_bytes).decode() - with pytest.raises(ValueError, match="Image validation failed"): + with pytest.raises(ValueError, match="Unable to detect image format"): storage.save_base64_image(corrupted_base64, subfolder_name="test_column") # Check that no files were left behind (cleanup on validation failure) From 1e43394b142b77acfb589f0e0cec0d567587acf9 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 11 Feb 2026 10:19:48 -0700 Subject: [PATCH 67/69] Fix diffusion image gen --- .../config/utils/image_helpers.py | 23 ++++++++++- .../src/data_designer/engine/models/facade.py | 40 ++++++++++++++----- 2 files changed, 52 insertions(+), 11 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py index c91974d8..45f43622 100644 --- a/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py +++ b/packages/data-designer-config/src/data_designer/config/utils/image_helpers.py @@ -11,6 +11,8 @@ from pathlib import Path from typing import TYPE_CHECKING +import requests + from data_designer.config.models import ImageFormat from 
data_designer.lazy_heavy_imports import Image @@ -39,12 +41,13 @@ # Patterns for diffusion-based image models only (use image_generation API). IMAGE_DIFFUSION_MODEL_PATTERNS = ( - "dall-e", + "dall-e-", "dalle", "stable-diffusion", "sd-", "sd_", "imagen", + "gpt-image-", ) SUPPORTED_IMAGE_EXTENSIONS = [f".{fmt.value.lower()}" for fmt in ImageFormat] @@ -232,6 +235,24 @@ def load_image_path_to_base64(image_path: str, base_path: str | None = None) -> return None +def load_image_url_to_base64(url: str, timeout: int = 60) -> str: + """Download an image from a URL and return as base64. + + Args: + url: HTTP(S) URL pointing to an image. + timeout: Request timeout in seconds. + + Returns: + Base64-encoded image data. + + Raises: + requests.HTTPError: If the download fails with a non-2xx status. + """ + resp = requests.get(url, timeout=timeout) + resp.raise_for_status() + return base64.b64encode(resp.content).decode() + + def validate_image(image_path: Path) -> None: """Validate that an image file is readable and not corrupted. 
diff --git a/packages/data-designer-engine/src/data_designer/engine/models/facade.py b/packages/data-designer-engine/src/data_designer/engine/models/facade.py index e637d9f4..902ac80a 100644 --- a/packages/data-designer-engine/src/data_designer/engine/models/facade.py +++ b/packages/data-designer-engine/src/data_designer/engine/models/facade.py @@ -13,6 +13,7 @@ extract_base64_from_data_uri, is_base64_image, is_image_diffusion_model, + load_image_url_to_base64, ) from data_designer.engine.mcp.errors import MCPConfigurationError from data_designer.engine.model_provider import ModelProviderRegistry @@ -41,13 +42,30 @@ def _identity(x: Any) -> Any: return x -def _try_extract_base64(data: str) -> str | None: - """Try to extract base64 image data from a data URI, returning None on failure.""" +def _try_extract_base64(source: str | litellm.types.utils.ImageObject) -> str | None: + """Try to extract base64 image data from a data URI string or image response object. + + Args: + source: Either a data URI string (e.g. "data:image/png;base64,...") + or a litellm ImageObject with b64_json/url attributes. + + Returns: + Base64-encoded image string, or None if extraction fails. + """ try: - return extract_base64_from_data_uri(data) - except ValueError: + if isinstance(source, str): + return extract_base64_from_data_uri(source) + + if getattr(source, "b64_json", None): + return source.b64_json + + if getattr(source, "url", None): + return load_image_url_to_base64(source.url) + except Exception: return None + return None + logger = logging.getLogger(__name__) @@ -447,16 +465,14 @@ def _generate_image_chat_completion( def _generate_image_diffusion(self, prompt: str, skip_usage_tracking: bool = False, **kwargs) -> list[str]: """Generate image(s) using diffusion model via image_generation API. - Always returns base64. The API is configured to return base64 format. + Always returns base64. 
If the API returns URLs instead of inline base64, + the images are downloaded and converted automatically. Returns: List of base64-encoded image strings """ kwargs = self.consolidate_kwargs(**kwargs) - # Always request base64 format - kwargs["response_format"] = "b64_json" - response = None try: @@ -471,8 +487,12 @@ def _generate_image_diffusion(self, prompt: str, skip_usage_tracking: bool = Fal if not response.data or len(response.data) == 0: raise ImageGenerationError("Image generation returned no data") - # Return all images as list - return [img.b64_json for img in response.data] + images = [b64 for img in response.data if (b64 := _try_extract_base64(img)) is not None] + + if not images: + raise ImageGenerationError("No image data could be extracted from response") + + return images except Exception: raise From 8f6be9bae09623873d805207750e7d27d68b5e8f Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 11 Feb 2026 10:28:38 -0700 Subject: [PATCH 68/69] Add requests to config pyproject.toml --- packages/data-designer-config/pyproject.toml | 1 + uv.lock | 2 ++ 2 files changed, 3 insertions(+) diff --git a/packages/data-designer-config/pyproject.toml b/packages/data-designer-config/pyproject.toml index 569c8fe0..dc980798 100644 --- a/packages/data-designer-config/pyproject.toml +++ b/packages/data-designer-config/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "pygments>=2.19.2,<3", "python-json-logger>=3,<4", "pyyaml>=6.0.1,<7", + "requests>=2.32.0,<3", "rich>=13.7.1,<15", ] diff --git a/uv.lock b/uv.lock index 200d0b12..d92497dd 100644 --- a/uv.lock +++ b/uv.lock @@ -805,6 +805,7 @@ dependencies = [ { name = "pygments" }, { name = "python-json-logger" }, { name = "pyyaml" }, + { name = "requests" }, { name = "rich" }, ] @@ -819,6 +820,7 @@ requires-dist = [ { name = "pygments", specifier = ">=2.19.2,<3" }, { name = "python-json-logger", specifier = ">=3,<4" }, { name = "pyyaml", specifier = ">=6.0.1,<7" }, + { name = "requests", specifier = ">=2.32.0,<3" 
}, { name = "rich", specifier = ">=13.7.1,<15" }, ] From d6a4015ccc9deedaf9e4a3193811f5b718f2b6c9 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Thu, 12 Feb 2026 14:14:23 -0700 Subject: [PATCH 69/69] address pr feedback --- docs/concepts/columns.md | 4 ++-- docs/notebook_source/5-generating-images.py | 4 +++- docs/notebook_source/6-editing-images-with-image-context.py | 4 +++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/concepts/columns.md b/docs/concepts/columns.md index d1422277..03fda5de 100644 --- a/docs/concepts/columns.md +++ b/docs/concepts/columns.md @@ -111,8 +111,8 @@ Image columns require a model configured with `ImageInferenceParams`. Model-spec Image columns also support `multi_modal_context` for autoregressive models that accept image inputs, enabling image-to-image generation workflows. -!!! tip "Tutorial" - See the [Generating Images](../notebooks/5-generating-images.ipynb) tutorial for a complete walkthrough, and [Providing Images as Context](../notebooks/4-providing-images-as-context.ipynb) for using images as input to other columns. +!!! tip "Tutorials" + The image tutorials cover three workflows: [Providing Images as Context](../notebooks/4-providing-images-as-context.ipynb) (image → text), [Generating Images](../notebooks/5-generating-images.ipynb) (text → image), and [Editing Images with Image Context](../notebooks/6-editing-images-with-image-context.ipynb) (image → image). ### 🧬 Embedding Columns diff --git a/docs/notebook_source/5-generating-images.py b/docs/notebook_source/5-generating-images.py index ba7828c9..dfdc5782 100644 --- a/docs/notebook_source/5-generating-images.py +++ b/docs/notebook_source/5-generating-images.py @@ -25,6 +25,8 @@ # # Data Designer supports both **diffusion** (e.g. DALL·E, Stable Diffusion, Imagen) and **autoregressive** (e.g. Gemini image, GPT image) models. # +# > **Prerequisites**: This tutorial uses [OpenRouter](https://openrouter.ai) with the Flux 2 Pro image model.
Set `OPENROUTER_API_KEY` in your environment before running. +# +# If this is your first time using Data Designer, we recommend starting with the [first notebook](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/) in this tutorial series. # @@ -45,7 +47,7 @@ # %% [markdown] # ### ⚙️ Initialize the Data Designer interface # -# When initialized without arguments, [default model providers](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) are used. This tutorial uses [OpenRouter](https://openrouter.ai) with the Flux 2 Pro image model; set `OPENROUTER_API_KEY` in your environment. +# We initialize Data Designer without arguments here—the image model is configured explicitly in the next cell. No default text model is needed for this tutorial. # # %% diff --git a/docs/notebook_source/6-editing-images-with-image-context.py b/docs/notebook_source/6-editing-images-with-image-context.py index 74e5d3d0..c419ad23 100644 --- a/docs/notebook_source/6-editing-images-with-image-context.py +++ b/docs/notebook_source/6-editing-images-with-image-context.py @@ -26,6 +26,8 @@ # # This tutorial uses an **autoregressive** model (one that supports both image input *and* image output via the chat completions API). Diffusion models (DALL·E, Stable Diffusion, etc.) do not support image context—see [Tutorial 5](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/5-generating-images/) for text-to-image generation with diffusion models. # +# > **Prerequisites**: This tutorial uses [OpenRouter](https://openrouter.ai) with the Flux 2 Pro model. Set `OPENROUTER_API_KEY` in your environment before running. +# # If this is your first time using Data Designer, we recommend starting with the [first notebook](https://nvidia-nemo.github.io/DataDesigner/latest/notebooks/1-the-basics/) in this tutorial series.
# @@ -52,7 +54,7 @@ # %% [markdown] # ### ⚙️ Initialize the Data Designer interface # -# When initialized without arguments, [default model providers](https://nvidia-nemo.github.io/DataDesigner/latest/concepts/models/default-model-settings/) are used. # +# We initialize Data Designer without arguments here—the image-editing model is configured explicitly in the next cell. No default text model is needed for this tutorial. # # %%