From feadff736f017f65400b96240c1048c0ad695a92 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 10:23:08 -0700 Subject: [PATCH 01/25] feat: add push_to_hub integration for HuggingFace datasets Implement HuggingFace Hub integration to upload DataDesigner datasets: - Add HuggingFaceHubClient with upload_dataset method - Upload main parquet files to data/ subset - Upload processor outputs to data/{processor_name}/ subsets - Generate dataset card from metadata.json with column statistics - Include sdg.json and metadata.json configuration files - Comprehensive validation and error handling - Add push_to_hub() method to DatasetCreationResults --- packages/data-designer/pyproject.toml | 1 + .../integrations/huggingface/__init__.py | 7 + .../integrations/huggingface/client.py | 262 +++++++++ .../integrations/huggingface/dataset_card.py | 114 ++++ .../huggingface/dataset_card_template.md | 82 +++ .../src/data_designer/interface/results.py | 42 ++ .../integrations/huggingface/__init__.py | 2 + .../integrations/huggingface/test_client.py | 523 ++++++++++++++++++ .../huggingface/test_dataset_card.py | 125 +++++ uv.lock | 2 + 10 files changed, 1160 insertions(+) create mode 100644 packages/data-designer/src/data_designer/integrations/huggingface/__init__.py create mode 100644 packages/data-designer/src/data_designer/integrations/huggingface/client.py create mode 100644 packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py create mode 100644 packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md create mode 100644 packages/data-designer/tests/integrations/huggingface/__init__.py create mode 100644 packages/data-designer/tests/integrations/huggingface/test_client.py create mode 100644 packages/data-designer/tests/integrations/huggingface/test_dataset_card.py diff --git a/packages/data-designer/pyproject.toml b/packages/data-designer/pyproject.toml index 31076704..883e18ea 100644 --- a/packages/data-designer/pyproject.toml +++ b/packages/data-designer/pyproject.toml @@ -22,6 +22,7 @@ classifiers = [ dependencies = [ "data-designer-config", "data-designer-engine", + "huggingface-hub>=1.0.1,<2", "prompt-toolkit>=3.0.0,<4", "typer>=0.12.0,<1", ] diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py b/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py new file mode 100644 index 00000000..99b9d93e --- /dev/null +++ b/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceUploadError +from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard + +__all__ = ["HuggingFaceHubClient", "HuggingFaceUploadError", "DataDesignerDatasetCard"] diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py new file mode 100644 index 00000000..7121bff4 --- /dev/null +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -0,0 +1,262 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
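As a quick orientation before the client implementation, here is a minimal sketch of how the newly exported `HuggingFaceHubClient` is meant to be called; the repo id and local artifact path are illustrative placeholders, not values taken from this patch.

```python
from pathlib import Path

from data_designer.integrations.huggingface import HuggingFaceHubClient

# Illustrative only: point the client at a previously generated artifact directory.
client = HuggingFaceHubClient()  # token=None -> resolved from HF_TOKEN or cached login
url = client.upload_dataset(
    repo_id="my-org/my-synthetic-dataset",          # hypothetical repo id
    base_dataset_path=Path("./artifacts/dataset"),  # hypothetical local artifact path
    private=True,
)
print(url)  # e.g. https://huggingface.co/datasets/my-org/my-synthetic-dataset
```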
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +import re +from pathlib import Path + +from huggingface_hub import HfApi +from huggingface_hub.utils import HfHubHTTPError + +from data_designer.errors import DataDesignerError +from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard + + +class HuggingFaceUploadError(DataDesignerError): + """Error during HuggingFace dataset upload.""" + + +class HuggingFaceHubClient: + """Client for interacting with HuggingFace Hub to upload datasets.""" + + def __init__(self, token: str | None = None): + """Initialize HuggingFace Hub client. + + Args: + token: HuggingFace API token. If None, the token is automatically + resolved from HF_TOKEN environment variable or cached credentials + from `huggingface-cli login`. + """ + self.token = token + self._api = HfApi(token=token) + + def upload_dataset( + self, + repo_id: str, + base_dataset_path: Path, + *, + private: bool = False, + create_pr: bool = False, + ) -> str: + """Upload dataset to HuggingFace Hub. + + Uploads the complete dataset including: + - Main parquet batch files from parquet-files/ → data/ + - Processor output batch files from processors-files/{name}/ → data/{name}/ + - Existing sdg.json and metadata.json files + - Auto-generated README.md (dataset card) + + Args: + repo_id: HuggingFace repo ID (e.g., "username/dataset-name") + base_dataset_path: Path to base_dataset_path (contains parquet-files/, sdg.json, etc.) + private: Whether to create private repo + create_pr: Whether to create a PR instead of direct push + + Returns: + URL to the uploaded dataset + + Raises: + HuggingFaceUploadError: If validation fails or upload encounters errors + """ + self._validate_repo_id(repo_id) + self._validate_dataset_path(base_dataset_path) + + try: + self._api.create_repo( + repo_id=repo_id, + repo_type="dataset", + exist_ok=True, + private=private, + ) + except HfHubHTTPError as e: + if e.response.status_code == 401: + raise HuggingFaceUploadError( + "Authentication failed. Please provide a valid HuggingFace token. " + "You can set it via the token parameter or HF_TOKEN environment variable, " + "or run 'huggingface-cli login'." + ) from e + elif e.response.status_code == 403: + raise HuggingFaceUploadError( + f"Permission denied. You don't have access to create repository '{repo_id}'. " + "Check your token permissions or repository ownership." 
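A short sketch of the authentication and error-handling behavior described in the docstring above; the repo id, local path, and token handling shown here are assumptions for illustration only.

```python
import os
from pathlib import Path

from data_designer.integrations.huggingface import HuggingFaceHubClient, HuggingFaceUploadError

dataset_dir = Path("./artifacts/dataset")  # hypothetical local artifact directory

# Either pass a token explicitly or let huggingface_hub resolve it from the
# HF_TOKEN environment variable / credentials cached by `huggingface-cli login`.
client = HuggingFaceHubClient(token=os.environ.get("HF_TOKEN"))

try:
    client.upload_dataset(
        repo_id="my-org/private-dataset",  # hypothetical repo id
        base_dataset_path=dataset_dir,
        private=True,
    )
except HuggingFaceUploadError as err:
    # 401/403 responses from repo creation surface here with actionable messages,
    # as do failures from the individual upload steps, each wrapped with context.
    print(err)
```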
+ ) from e + else: + raise HuggingFaceUploadError(f"Failed to create repository '{repo_id}': {e}") from e + except Exception as e: + raise HuggingFaceUploadError(f"Unexpected error creating repository '{repo_id}': {e}") from e + + try: + self._upload_dataset_card(repo_id, base_dataset_path, create_pr=create_pr) + except Exception as e: + raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e + + parquet_folder = base_dataset_path / "parquet-files" + try: + self._api.upload_folder( + repo_id=repo_id, + folder_path=str(parquet_folder), + path_in_repo="data", + repo_type="dataset", + commit_message="Upload main dataset parquet files", + create_pr=create_pr, + ) + except Exception as e: + raise HuggingFaceUploadError(f"Failed to upload parquet files: {e}") from e + + processors_folder = base_dataset_path / "processors-files" + if processors_folder.exists(): + for processor_dir in processors_folder.iterdir(): + if processor_dir.is_dir(): + try: + self._api.upload_folder( + repo_id=repo_id, + folder_path=str(processor_dir), + path_in_repo=f"data/{processor_dir.name}", + repo_type="dataset", + commit_message=f"Upload processor outputs: {processor_dir.name}", + create_pr=create_pr, + ) + except Exception as e: + raise HuggingFaceUploadError( + f"Failed to upload processor outputs for '{processor_dir.name}': {e}" + ) from e + + for config_file in ["sdg.json", "metadata.json"]: + config_path = base_dataset_path / config_file + if config_path.exists(): + try: + self._api.upload_file( + repo_id=repo_id, + path_or_fileobj=str(config_path), + path_in_repo=config_file, + repo_type="dataset", + commit_message=f"Upload {config_file}", + create_pr=create_pr, + ) + except Exception as e: + raise HuggingFaceUploadError(f"Failed to upload {config_file}: {e}") from e + + return f"https://huggingface.co/datasets/{repo_id}" + + def _upload_dataset_card(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: + """Generate and upload dataset card from metadata.json. + + Args: + repo_id: HuggingFace repo ID + base_dataset_path: Path to dataset artifacts + create_pr: Whether to create a PR instead of direct push + + Raises: + HuggingFaceUploadError: If dataset card generation or upload fails + """ + metadata_path = base_dataset_path / "metadata.json" + try: + with open(metadata_path) as f: + metadata = json.load(f) + except json.JSONDecodeError as e: + raise HuggingFaceUploadError(f"Failed to parse metadata.json: {e}") from e + except Exception as e: + raise HuggingFaceUploadError(f"Failed to read metadata.json: {e}") from e + + sdg_path = base_dataset_path / "sdg.json" + sdg_config = None + if sdg_path.exists(): + try: + with open(sdg_path) as f: + sdg_config = json.load(f) + except json.JSONDecodeError as e: + raise HuggingFaceUploadError(f"Failed to parse sdg.json: {e}") from e + except Exception as e: + raise HuggingFaceUploadError(f"Failed to read sdg.json: {e}") from e + + try: + card = DataDesignerDatasetCard.from_metadata( + metadata=metadata, + sdg_config=sdg_config, + repo_id=repo_id, + ) + except Exception as e: + raise HuggingFaceUploadError(f"Failed to generate dataset card: {e}") from e + + try: + card.push_to_hub(repo_id, repo_type="dataset", create_pr=create_pr) + except Exception as e: + raise HuggingFaceUploadError(f"Failed to push dataset card to hub: {e}") from e + + @staticmethod + def _validate_repo_id(repo_id: str) -> None: + """Validate HuggingFace repository ID format. 
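The card helper above can also be exercised on its own; a minimal sketch of building and pushing a card straight from the artifact files, with an illustrative directory and repo id.

```python
import json
from pathlib import Path

from data_designer.integrations.huggingface import DataDesignerDatasetCard

base = Path("./artifacts/dataset")  # hypothetical artifact directory
metadata = json.loads((base / "metadata.json").read_text())
sdg_path = base / "sdg.json"
sdg_config = json.loads(sdg_path.read_text()) if sdg_path.exists() else None

card = DataDesignerDatasetCard.from_metadata(
    metadata=metadata,
    sdg_config=sdg_config,
    repo_id="my-org/my-synthetic-dataset",  # hypothetical repo id
)
card.push_to_hub("my-org/my-synthetic-dataset", repo_type="dataset")
```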
+ + Args: + repo_id: Repository ID to validate + + Raises: + HuggingFaceUploadError: If repo_id format is invalid + """ + if not repo_id or not isinstance(repo_id, str): + raise HuggingFaceUploadError("repo_id must be a non-empty string") + + pattern = r"^[a-zA-Z0-9][-a-zA-Z0-9._]*/[a-zA-Z0-9][-a-zA-Z0-9._]*$" + + if not re.match(pattern, repo_id): + raise HuggingFaceUploadError( + f"Invalid repo_id format: '{repo_id}'. " + "Expected format: 'username/dataset-name' or 'organization/dataset-name'. " + "Names can contain alphanumeric characters, dashes, underscores, and dots." + ) + + @staticmethod + def _validate_dataset_path(base_dataset_path: Path) -> None: + """Validate dataset directory structure. + + Args: + base_dataset_path: Path to dataset directory + + Raises: + HuggingFaceUploadError: If directory structure is invalid + """ + if not base_dataset_path.exists(): + raise HuggingFaceUploadError(f"Dataset path does not exist: {base_dataset_path}") + + if not base_dataset_path.is_dir(): + raise HuggingFaceUploadError(f"Dataset path is not a directory: {base_dataset_path}") + + metadata_path = base_dataset_path / "metadata.json" + if not metadata_path.exists(): + raise HuggingFaceUploadError(f"Required file not found: {metadata_path}") + + if not metadata_path.is_file(): + raise HuggingFaceUploadError(f"metadata.json is not a file: {metadata_path}") + + parquet_dir = base_dataset_path / "parquet-files" + if not parquet_dir.exists(): + raise HuggingFaceUploadError( + f"Required directory not found: {parquet_dir}. " + "Dataset must contain parquet-files directory with batch files." + ) + + if not parquet_dir.is_dir(): + raise HuggingFaceUploadError(f"parquet-files is not a directory: {parquet_dir}") + + if not any(parquet_dir.glob("*.parquet")): + raise HuggingFaceUploadError( + f"parquet-files directory is empty: {parquet_dir}. At least one .parquet file is required." + ) + + try: + with open(metadata_path) as f: + json.load(f) + except json.JSONDecodeError as e: + raise HuggingFaceUploadError(f"Invalid JSON in metadata.json: {e}") + + sdg_path = base_dataset_path / "sdg.json" + if sdg_path.exists(): + if not sdg_path.is_file(): + raise HuggingFaceUploadError(f"sdg.json is not a file: {sdg_path}") + try: + with open(sdg_path) as f: + json.load(f) + except json.JSONDecodeError as e: + raise HuggingFaceUploadError(f"Invalid JSON in sdg.json: {e}") diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py new file mode 100644 index 00000000..792e0a47 --- /dev/null +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -0,0 +1,114 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from datetime import datetime +from pathlib import Path + +from huggingface_hub import CardData, DatasetCard + +TEMPLATE_DATA_DESIGNER_DATASET_CARD_PATH = Path(__file__).parent / "dataset_card_template.md" + + +class DataDesignerDatasetCard(DatasetCard): + """Dataset card for NeMo Data Designer datasets. + + This class extends Hugging Face's DatasetCard with a custom template + specifically designed for Data Designer generated datasets. + The template is located at `data_designer/integrations/huggingface/dataset_card_template.md`. 
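For reference, a small sketch exercising the `_validate_repo_id` helper defined above on a few example ids; the ids themselves are taken from the unit tests further down in this patch.

```python
from data_designer.integrations.huggingface import HuggingFaceHubClient, HuggingFaceUploadError

# The first three ids pass validation, the last two do not (missing slash, whitespace).
for repo_id in ["username/dataset", "org/my-dataset", "user/dataset.v2", "my-dataset", "user/my dataset"]:
    try:
        HuggingFaceHubClient._validate_repo_id(repo_id)
        print(f"valid:   {repo_id}")
    except HuggingFaceUploadError as err:
        print(f"invalid: {repo_id} ({err})")
```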
+ """ + + default_template_path = TEMPLATE_DATA_DESIGNER_DATASET_CARD_PATH + + @classmethod + def from_metadata( + cls, + metadata: dict, + sdg_config: dict | None, + repo_id: str, + ) -> DataDesignerDatasetCard: + """Create dataset card from metadata.json and sdg.json. + + Args: + metadata: Contents of metadata.json + sdg_config: Contents of sdg.json (optional) + repo_id: HuggingFace repo ID + + Returns: + DataDesignerDatasetCard instance ready to upload + """ + # Extract info from metadata + target_num_records = metadata.get("target_num_records", 0) + schema = metadata.get("schema", {}) + column_stats = metadata.get("column_statistics", []) + + # Get actual num_records from column_statistics if available + if column_stats: + actual_num_records = column_stats[0].get("num_records", target_num_records) + else: + actual_num_records = target_num_records + + # Compute size category + size_categories = cls._compute_size_category(actual_num_records) + + # Extract column types from sdg.json if available + config_types: dict[str, int] = {} + num_columns_configured = 0 + if sdg_config: + columns = sdg_config.get("data_designer", {}).get("columns", []) + num_columns_configured = len(columns) + for col in columns: + col_type = col.get("column_type", "unknown") + if isinstance(col_type, dict): + col_type = col_type.get("value", "unknown") + config_types[col_type] = config_types.get(col_type, 0) + 1 + + # Prepare CardData (metadata for YAML frontmatter) + card_data = CardData( + library="datadesigner", + size_categories=size_categories, + tags=["synthetic", "nemo-data-designer"], + ) + + # Prepare template variables + template_vars = { + "repo_id": repo_id, + "num_records": actual_num_records, + "target_num_records": target_num_records, + "num_columns": len(schema), + "size_categories": size_categories, + "all_columns": schema, + "column_statistics": column_stats, + "num_columns_configured": num_columns_configured, + "config_types": config_types, + "percent_complete": 100 * actual_num_records / (target_num_records + 1e-10), + "current_year": datetime.now().year, + } + + # Create card from template + card = cls.from_template(card_data, template_path=str(cls.default_template_path), **template_vars) + return card + + @staticmethod + def _compute_size_category(num_records: int) -> str: + """Compute HuggingFace size category from record count. + + Args: + num_records: Number of records in the dataset + + Returns: + Size category string for HuggingFace Hub tags + """ + if num_records < 1000: + return "n<1K" + elif num_records < 10000: + return "1K10M" diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md new file mode 100644 index 00000000..651741eb --- /dev/null +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md @@ -0,0 +1,82 @@ +--- +library: datadesigner +size_categories: {{ size_categories }} +tags: + - synthetic + - nemo-data-designer +--- + +# {{ repo_id.split('/')[-1] | title }} + +This dataset was generated using **[NeMo Data Designer](https://github.com/NVIDIA-NeMo/DataDesigner)**, a framework for creating high-quality synthetic datasets. 
+ +## Dataset Summary + +- **Records**: {{ "{:,}".format(num_records) }} +- **Columns**: {{ num_columns }} +{% if target_num_records != num_records %} +- **Completion**: {{ "%.1f" | format(percent_complete) }}% ({{ "{:,}".format(target_num_records) }} requested) +{% endif %} + +## Quick Start + +```python +from datasets import load_dataset + +# Load the dataset +dataset = load_dataset("{{ repo_id }}") +df = dataset["train"].to_pandas() +``` + +## Schema & Statistics + +{% if column_statistics %} +{% for stat in column_statistics %} +### {{ stat.column_name }} + +- **Type**: `{{ stat.simple_dtype }}` +- **Column Type**: {{ stat.column_type }} +- **Unique Values**: {{ stat.num_unique }} ({{ "%.1f" | format((stat.num_unique / stat.num_records * 100) if stat.num_records > 0 else 0) }}%) +{% if stat.num_null > 0 %} +- **Null Values**: {{ stat.num_null }} ({{ "%.1f" | format((stat.num_null / stat.num_records * 100) if stat.num_records > 0 else 0) }}%) +{% endif %} +{% if stat.column_type in ["llm-text", "llm-code", "llm-structured", "llm-judge"] %} +- **Avg Output Tokens**: {{ "%.1f" | format(stat.output_tokens_mean) if stat.output_tokens_mean is defined else "N/A" }} +- **Avg Input Tokens**: {{ "%.1f" | format(stat.input_tokens_mean) if stat.input_tokens_mean is defined else "N/A" }} +{% endif %} +{% if stat.column_type == "sampler" and stat.sampler_type is defined %} +- **Sampler Type**: {% if stat.sampler_type is mapping %}{{ stat.sampler_type.value }}{% else %}{{ stat.sampler_type }}{% endif %} +{% endif %} + +{% endfor %} +{% else %} +| Column | Type | +|--------|------| +{% for col_name, dtype in all_columns.items() | sort -%} +| `{{ col_name }}` | {{ dtype }} | +{% endfor %} +{% endif %} + +## Generation Details + +{% if config_types %} +Generated with {{ num_columns_configured }} column configuration(s): + +{% for col_type, count in config_types.items() | sort %} +- **{{ col_type }}**: {{ count }} column(s) +{% endfor %} +{% endif %} + +Full configuration available in `sdg.json` and detailed metadata in `metadata.json`. 
+ +## Citation + +```bibtex +@misc{nemo-data-designer, + author = {The NeMo Data Designer Team, NVIDIA}, + title = {NeMo Data Designer: A framework for generating synthetic data from scratch or based on your own seed data}, + howpublished = {\url{https://github.com/NVIDIA-NeMo/DataDesigner}}, + year = {{ current_year }}, + note = {GitHub Repository}, +} +``` diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index b9467c58..b39c265e 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -12,6 +12,7 @@ from data_designer.config.utils.visualization import WithRecordSamplerMixin from data_designer.engine.dataset_builders.artifact_storage import ArtifactStorage from data_designer.engine.dataset_builders.errors import ArtifactStorageError +from data_designer.integrations.huggingface.client import HuggingFaceHubClient from data_designer.lazy_heavy_imports import pd if TYPE_CHECKING: @@ -96,3 +97,44 @@ def get_path_to_processor_artifacts(self, processor_name: str) -> Path: if not self.artifact_storage.processors_outputs_path.exists(): raise ArtifactStorageError(f"Processor {processor_name} has no artifacts.") return self.artifact_storage.processors_outputs_path / processor_name + + def push_to_hub( + self, + repo_id: str, + *, + token: str | None = None, + private: bool = False, + create_pr: bool = False, + ) -> str: + """Push dataset to HuggingFace Hub. + + Uploads all artifacts including: + - Main parquet batch files (data subset) + - Processor output batch files (data/{processor_name} subsets) + - Configuration (sdg.json) + - Metadata (metadata.json) + - Auto-generated dataset card (README.md) + + Args: + repo_id: HuggingFace repo ID (e.g., "username/my-dataset") + token: HuggingFace API token. If None, the token is automatically + resolved from HF_TOKEN environment variable or cached credentials + from `huggingface-cli login`. + private: Create private repo + create_pr: Create PR instead of direct push + + Returns: + URL to the uploaded dataset + + Example: + >>> results = data_designer.create(config, num_records=1000) + >>> results.push_to_hub("username/my-synthetic-dataset") + 'https://huggingface.co/datasets/username/my-synthetic-dataset' + """ + client = HuggingFaceHubClient(token=token) + return client.upload_dataset( + repo_id=repo_id, + base_dataset_path=self.artifact_storage.base_dataset_path, + private=private, + create_pr=create_pr, + ) diff --git a/packages/data-designer/tests/integrations/huggingface/__init__.py b/packages/data-designer/tests/integrations/huggingface/__init__.py new file mode 100644 index 00000000..1a8431c3 --- /dev/null +++ b/packages/data-designer/tests/integrations/huggingface/__init__.py @@ -0,0 +1,2 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py new file mode 100644 index 00000000..3e5bcdb1 --- /dev/null +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -0,0 +1,523 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
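Downstream of `push_to_hub`, the uploaded artifacts can be pulled back with standard consumer-side tooling; a sketch assuming the illustrative repo id from the docstring example above (the `datasets` library is a consumer-side dependency, not one added by this patch).

```python
from datasets import load_dataset
from huggingface_hub import hf_hub_download

repo_id = "username/my-synthetic-dataset"  # illustrative, matching the docstring example

# Main table: parquet batch files were uploaded under data/.
ds = load_dataset(repo_id, split="train")
df = ds.to_pandas()

# The generation config and metadata were uploaded alongside the data.
sdg_path = hf_hub_download(repo_id, "sdg.json", repo_type="dataset")
meta_path = hf_hub_download(repo_id, "metadata.json", repo_type="dataset")
```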
+# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import json +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceUploadError + + +@pytest.fixture +def mock_hf_api() -> MagicMock: + """Mock HfApi for testing.""" + with patch("data_designer.integrations.huggingface.client.HfApi") as mock: + api_instance = MagicMock() + mock.return_value = api_instance + yield api_instance + + +@pytest.fixture +def sample_dataset_path(tmp_path: Path) -> Path: + """Create a sample dataset directory structure. + + Structure mirrors actual DataDesigner output: + - parquet-files/: Main dataset batch files + - processors-files/{processor_name}/: Processor output batch files (same structure) + - metadata.json: Dataset metadata + - sdg.json: Configuration + """ + base_path = tmp_path / "dataset" + base_path.mkdir() + + # Create parquet-files directory with batch files + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("dummy parquet data") + (parquet_dir / "batch_00001.parquet").write_text("dummy parquet data") + + # Create processors-files directory with same structure as main parquet-files + processors_dir = base_path / "processors-files" + processors_dir.mkdir() + processor1_dir = processors_dir / "processor1" + processor1_dir.mkdir() + (processor1_dir / "batch_00000.parquet").write_text("dummy processor output") + (processor1_dir / "batch_00001.parquet").write_text("dummy processor output") + + processor2_dir = processors_dir / "processor2" + processor2_dir.mkdir() + (processor2_dir / "batch_00000.parquet").write_text("dummy processor output") + + # Create metadata.json with matching column statistics + metadata = { + "target_num_records": 100, + "total_num_batches": 2, + "buffer_size": 50, + "schema": {"col1": "string"}, + "file_paths": { + "parquet-files": ["parquet-files/batch_00000.parquet", "parquet-files/batch_00001.parquet"], + "processor-files": { + "processor1": ["processors-files/processor1/batch_00000.parquet"], + "processor2": ["processors-files/processor2/batch_00000.parquet"], + }, + }, + "num_completed_batches": 2, + "dataset_name": "dataset", + "column_statistics": [ + { + "column_name": "col1", + "num_records": 100, + "num_unique": 100, + "num_null": 0, + "simple_dtype": "string", + "pyarrow_dtype": "string", + "column_type": "sampler", + "sampler_type": "uuid", + } + ], + } + (base_path / "metadata.json").write_text(json.dumps(metadata)) + + # Create sdg.json with realistic BuilderConfig structure + sdg_config = { + "data_designer": { + "columns": [ + { + "name": "col1", + "column_type": "sampler", + "sampler_type": "uuid", + "params": {}, + } + ], + "model_configs": [], + "constraints": None, + "seed_config": None, + "profilers": None, + } + } + (base_path / "sdg.json").write_text(json.dumps(sdg_config)) + + return base_path + + +def test_client_initialization() -> None: + """Test HuggingFaceHubClient initialization.""" + with patch("data_designer.integrations.huggingface.client.HfApi"): + client = HuggingFaceHubClient(token="test-token") + assert client.token == "test-token" + + +def test_client_initialization_no_token() -> None: + """Test HuggingFaceHubClient initialization without token.""" + with patch("data_designer.integrations.huggingface.client.HfApi"): + client = HuggingFaceHubClient() + assert client.token is None + + +def 
test_upload_dataset_creates_repo(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test that upload_dataset creates a repository.""" + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + ) + + mock_hf_api.create_repo.assert_called_once_with( + repo_id="test/dataset", + repo_type="dataset", + exist_ok=True, + private=False, + ) + + +def test_upload_dataset_uploads_parquet_files(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test that upload_dataset uploads parquet files.""" + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + ) + + # Check that upload_folder was called for parquet files + calls = [call for call in mock_hf_api.upload_folder.call_args_list if "parquet-files" in str(call)] + assert len(calls) == 1 + assert calls[0].kwargs["path_in_repo"] == "data" + + +def test_upload_dataset_uploads_processor_outputs(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test that upload_dataset uploads processor outputs.""" + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + ) + + # Check that upload_folder was called for processor outputs + calls = [call for call in mock_hf_api.upload_folder.call_args_list if "processor1" in str(call)] + assert len(calls) == 1 + assert calls[0].kwargs["path_in_repo"] == "data/processor1" + + +def test_upload_dataset_uploads_config_files(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test that upload_dataset uploads sdg.json and metadata.json.""" + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + ) + + # Check that upload_file was called for config files + upload_file_calls = mock_hf_api.upload_file.call_args_list + assert len(upload_file_calls) == 2 + + uploaded_files = [call.kwargs["path_in_repo"] for call in upload_file_calls] + assert "sdg.json" in uploaded_files + assert "metadata.json" in uploaded_files + + +def test_upload_dataset_returns_url(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test that upload_dataset returns the correct URL.""" + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + url = client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + ) + + assert url == "https://huggingface.co/datasets/test/dataset" + + +def test_upload_dataset_with_private_repo(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test upload_dataset with private repository.""" + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + private=True, + ) + + mock_hf_api.create_repo.assert_called_once_with( + repo_id="test/dataset", + repo_type="dataset", + exist_ok=True, + private=True, + ) + + +def test_upload_dataset_with_create_pr(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test upload_dataset with create_pr option.""" + client = 
HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + create_pr=True, + ) + + # Verify create_pr is passed to upload operations + for call in mock_hf_api.upload_folder.call_args_list: + assert call.kwargs["create_pr"] is True + + +def test_upload_dataset_card_missing_metadata(tmp_path: Path) -> None: + """Test _upload_dataset_card raises error when metadata.json is missing.""" + client = HuggingFaceHubClient(token="test-token") + + # Create directory without metadata.json + base_path = tmp_path / "dataset" + base_path.mkdir() + + with pytest.raises(HuggingFaceUploadError, match="Failed to read metadata.json"): + client._upload_dataset_card("test/dataset", base_path) + + +def test_upload_dataset_card_calls_push_to_hub(sample_dataset_path: Path) -> None: + """Test _upload_dataset_card generates card and pushes to hub.""" + client = HuggingFaceHubClient(token="test-token") + + with patch("data_designer.integrations.huggingface.client.DataDesignerDatasetCard") as mock_card_class: + mock_card = MagicMock() + mock_card_class.from_metadata.return_value = mock_card + + client._upload_dataset_card("test/dataset", sample_dataset_path) + + # Verify card was created from metadata + mock_card_class.from_metadata.assert_called_once() + call_kwargs = mock_card_class.from_metadata.call_args.kwargs + assert call_kwargs["repo_id"] == "test/dataset" + assert "metadata" in call_kwargs + assert "sdg_config" in call_kwargs + + # Verify card was pushed to hub + mock_card.push_to_hub.assert_called_once_with( + "test/dataset", + repo_type="dataset", + create_pr=False, + ) + + +def test_upload_dataset_without_processors(mock_hf_api: MagicMock, tmp_path: Path) -> None: + """Test upload_dataset when no processor outputs exist.""" + # Create dataset path without processors directory + base_path = tmp_path / "dataset" + base_path.mkdir() + + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("dummy data") + + metadata = {"target_num_records": 10, "schema": {"col1": "string"}, "column_statistics": []} + (base_path / "metadata.json").write_text(json.dumps(metadata)) + + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=base_path, + ) + + # Should only upload parquet files, not processors + folder_calls = mock_hf_api.upload_folder.call_args_list + assert len(folder_calls) == 1 # Only main parquet files + assert folder_calls[0].kwargs["path_in_repo"] == "data" + + +def test_upload_dataset_without_sdg_config(mock_hf_api: MagicMock, tmp_path: Path) -> None: + """Test upload_dataset when sdg.json doesn't exist.""" + base_path = tmp_path / "dataset" + base_path.mkdir() + + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("dummy data") + + metadata = {"target_num_records": 10, "schema": {"col1": "string"}, "column_statistics": []} + (base_path / "metadata.json").write_text(json.dumps(metadata)) + + # No sdg.json file + + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=base_path, + ) + + # Should only upload metadata.json, not sdg.json + file_calls = mock_hf_api.upload_file.call_args_list + assert len(file_calls) == 1 
+ assert file_calls[0].kwargs["path_in_repo"] == "metadata.json" + + +def test_upload_dataset_multiple_processors(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test that multiple processor outputs are uploaded correctly.""" + client = HuggingFaceHubClient(token="test-token") + + with patch.object(client, "_upload_dataset_card"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + ) + + # Check that both processors were uploaded + folder_calls = mock_hf_api.upload_folder.call_args_list + processor_calls = [call for call in folder_calls if "processor" in call.kwargs["path_in_repo"]] + + assert len(processor_calls) == 2 + processor_paths = [call.kwargs["path_in_repo"] for call in processor_calls] + assert "data/processor1" in processor_paths + assert "data/processor2" in processor_paths + + +# Error handling and validation tests + + +def test_validate_repo_id_invalid_format() -> None: + """Test repo_id validation with invalid formats.""" + client = HuggingFaceHubClient(token="test-token") + + # Missing slash + with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): + client._validate_repo_id("my-dataset") + + # Too many slashes (caught by regex) + with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): + client._validate_repo_id("user/org/dataset") + + # Invalid characters (space) + with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): + client._validate_repo_id("user/my dataset") + + # Empty string + with pytest.raises(HuggingFaceUploadError, match="must be a non-empty string"): + client._validate_repo_id("") + + +def test_validate_repo_id_valid_formats() -> None: + """Test repo_id validation with valid formats.""" + client = HuggingFaceHubClient(token="test-token") + + # Valid formats should not raise + client._validate_repo_id("username/dataset") + client._validate_repo_id("org/my-dataset") + client._validate_repo_id("user/dataset_name") + client._validate_repo_id("user123/dataset-123") + client._validate_repo_id("user/dataset.v2") + + +def test_validate_dataset_path_not_exists(tmp_path: Path) -> None: + """Test validation fails when dataset path doesn't exist.""" + client = HuggingFaceHubClient(token="test-token") + non_existent = tmp_path / "does-not-exist" + + with pytest.raises(HuggingFaceUploadError, match="does not exist"): + client._validate_dataset_path(non_existent) + + +def test_validate_dataset_path_is_file(tmp_path: Path) -> None: + """Test validation fails when dataset path is a file.""" + client = HuggingFaceHubClient(token="test-token") + file_path = tmp_path / "file.txt" + file_path.write_text("not a directory") + + with pytest.raises(HuggingFaceUploadError, match="not a directory"): + client._validate_dataset_path(file_path) + + +def test_validate_dataset_path_missing_metadata(tmp_path: Path) -> None: + """Test validation fails when metadata.json is missing.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + + with pytest.raises(HuggingFaceUploadError, match="Required file not found.*metadata.json"): + client._validate_dataset_path(base_path) + + +def test_validate_dataset_path_missing_parquet_folder(tmp_path: Path) -> None: + """Test validation fails when parquet-files directory is missing.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + (base_path / "metadata.json").write_text('{"target_num_records": 10}') + + with 
pytest.raises(HuggingFaceUploadError, match="Required directory not found.*parquet-files"): + client._validate_dataset_path(base_path) + + +def test_validate_dataset_path_empty_parquet_folder(tmp_path: Path) -> None: + """Test validation fails when parquet-files directory is empty.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + (base_path / "metadata.json").write_text('{"target_num_records": 10}') + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + + with pytest.raises(HuggingFaceUploadError, match="parquet-files directory is empty"): + client._validate_dataset_path(base_path) + + +def test_validate_dataset_path_invalid_metadata_json(tmp_path: Path) -> None: + """Test validation fails when metadata.json contains invalid JSON.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + (base_path / "metadata.json").write_text("invalid json {{{") + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("data") + + with pytest.raises(HuggingFaceUploadError, match="Invalid JSON in metadata.json"): + client._validate_dataset_path(base_path) + + +def test_validate_dataset_path_invalid_sdg_json(tmp_path: Path) -> None: + """Test validation fails when sdg.json contains invalid JSON.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + (base_path / "metadata.json").write_text('{"target_num_records": 10}') + (base_path / "sdg.json").write_text("invalid json {{{") + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("data") + + with pytest.raises(HuggingFaceUploadError, match="Invalid JSON in sdg.json"): + client._validate_dataset_path(base_path) + + +def test_upload_dataset_invalid_repo_id(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test upload_dataset fails with invalid repo_id.""" + client = HuggingFaceHubClient(token="test-token") + + with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): + client.upload_dataset( + repo_id="invalid-repo-id", # Missing slash + base_dataset_path=sample_dataset_path, + ) + + +def test_upload_dataset_authentication_error(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test upload_dataset handles authentication errors.""" + from huggingface_hub.utils import HfHubHTTPError + + client = HuggingFaceHubClient(token="invalid-token") + + # Mock 401 authentication error + error_response = MagicMock() + error_response.status_code = 401 + mock_hf_api.create_repo.side_effect = HfHubHTTPError("Unauthorized", response=error_response) + + with pytest.raises(HuggingFaceUploadError, match="Authentication failed"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + ) + + +def test_upload_dataset_permission_error(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test upload_dataset handles permission errors.""" + from huggingface_hub.utils import HfHubHTTPError + + client = HuggingFaceHubClient(token="test-token") + + # Mock 403 permission error + error_response = MagicMock() + error_response.status_code = 403 + mock_hf_api.create_repo.side_effect = HfHubHTTPError("Forbidden", response=error_response) + + with pytest.raises(HuggingFaceUploadError, match="Permission denied"): + client.upload_dataset( + repo_id="test/dataset", + 
base_dataset_path=sample_dataset_path, + ) + + +def test_upload_dataset_card_invalid_json(tmp_path: Path) -> None: + """Test _upload_dataset_card handles corrupted metadata.json.""" + client = HuggingFaceHubClient(token="test-token") + base_path = tmp_path / "dataset" + base_path.mkdir() + (base_path / "metadata.json").write_text("invalid json") + + with pytest.raises(HuggingFaceUploadError, match="Failed to parse metadata.json"): + client._upload_dataset_card("test/dataset", base_path) diff --git a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py new file mode 100644 index 00000000..e4b3b8eb --- /dev/null +++ b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py @@ -0,0 +1,125 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard + + +def test_compute_size_category() -> None: + """Test size category computation for various dataset sizes.""" + assert DataDesignerDatasetCard._compute_size_category(500) == "n<1K" + assert DataDesignerDatasetCard._compute_size_category(5000) == "1K10M" + + +def test_from_metadata_minimal() -> None: + """Test creating dataset card from minimal metadata.""" + metadata = { + "target_num_records": 100, + "schema": {"col1": "string", "col2": "int64"}, + "column_statistics": [ + { + "column_name": "col1", + "num_records": 100, + "num_unique": 100, + "num_null": 0, + "simple_dtype": "string", + "column_type": "sampler", + } + ], + } + + card = DataDesignerDatasetCard.from_metadata( + metadata=metadata, + sdg_config=None, + repo_id="test/dataset", + ) + + # Verify card was created + assert card is not None + assert "test/dataset" in str(card) + assert "100" in str(card) + assert "col1" in str(card) + assert "2" in str(card) # Number of columns + + +def test_from_metadata_with_sdg_config() -> None: + """Test creating dataset card with sdg config.""" + metadata = { + "target_num_records": 50, + "schema": {"name": "string", "age": "int64"}, + "column_statistics": [ + { + "column_name": "name", + "num_records": 50, + "num_unique": 50, + "num_null": 0, + "simple_dtype": "string", + "column_type": "sampler", + "sampler_type": "person", + }, + { + "column_name": "age", + "num_records": 50, + "num_unique": 30, + "num_null": 0, + "simple_dtype": "int64", + "column_type": "sampler", + "sampler_type": "uniform", + }, + ], + } + + sdg_config = { + "data_designer": { + "columns": [ + {"name": "name", "column_type": "sampler"}, + {"name": "age", "column_type": "sampler"}, + ] + } + } + + card = DataDesignerDatasetCard.from_metadata( + metadata=metadata, + sdg_config=sdg_config, + repo_id="test/dataset-with-config", + ) + + # Verify card includes config info + assert card is not None + assert "sampler" in str(card) + assert "2 column" in str(card) + + +def test_from_metadata_with_llm_columns() -> None: + """Test creating dataset card with LLM column statistics.""" + metadata = { + "target_num_records": 10, + "schema": {"prompt": "string", "response": "string"}, + "column_statistics": [ + { + "column_name": "response", + "num_records": 10, + "num_unique": 10, + "num_null": 0, + "simple_dtype": "string", + "column_type": "llm-text", + "output_tokens_mean": 50.5, + "input_tokens_mean": 20.3, + } + ], + } + + card = 
DataDesignerDatasetCard.from_metadata( + metadata=metadata, + sdg_config=None, + repo_id="test/llm-dataset", + ) + + # Verify LLM statistics are included + assert card is not None + assert "50.5" in str(card) or "Avg Output Tokens" in str(card) diff --git a/uv.lock b/uv.lock index 279f21de..46716729 100644 --- a/uv.lock +++ b/uv.lock @@ -690,6 +690,7 @@ source = { editable = "packages/data-designer" } dependencies = [ { name = "data-designer-config" }, { name = "data-designer-engine" }, + { name = "huggingface-hub" }, { name = "prompt-toolkit" }, { name = "typer" }, ] @@ -698,6 +699,7 @@ dependencies = [ requires-dist = [ { name = "data-designer-config", editable = "packages/data-designer-config" }, { name = "data-designer-engine", editable = "packages/data-designer-engine" }, + { name = "huggingface-hub", specifier = ">=1.0.1,<2" }, { name = "prompt-toolkit", specifier = ">=3.0.0,<4" }, { name = "typer", specifier = ">=0.12.0,<1" }, ] From 3ff3aba25b53d2c42a3317452a4e9a1a9aa48044 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 11:44:56 -0700 Subject: [PATCH 02/25] feat: improve push_to_hub with logging, path mapping, and config definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add progress logging with emojis following codebase style - Add repository exists check before creation - Update metadata.json paths for HuggingFace structure (parquet-files/ → data/, processors-files/{name}/ → {name}/) - Enhance dataset card with detailed intro, tabular schema/statistics, and clickable config links - Add explicit configs in YAML frontmatter to fix schema mismatch between main dataset and processor outputs - Set data config as default configuration --- .../integrations/huggingface/client.py | 135 ++++++++++++++---- .../integrations/huggingface/dataset_card.py | 26 +++- .../huggingface/dataset_card_template.md | 63 ++++---- .../src/data_designer/interface/results.py | 2 +- .../integrations/huggingface/test_client.py | 63 ++++++-- .../huggingface/test_dataset_card.py | 41 +++++- 6 files changed, 258 insertions(+), 72 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 7121bff4..8b182b25 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -4,7 +4,9 @@ from __future__ import annotations import json +import logging import re +import tempfile from pathlib import Path from huggingface_hub import HfApi @@ -12,6 +14,9 @@ from data_designer.errors import DataDesignerError from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard +from data_designer.logging import RandomEmoji + +logger = logging.getLogger(__name__) class HuggingFaceUploadError(DataDesignerError): @@ -44,7 +49,7 @@ def upload_dataset( Uploads the complete dataset including: - Main parquet batch files from parquet-files/ → data/ - - Processor output batch files from processors-files/{name}/ → data/{name}/ + - Processor output batch files from processors-files/{name}/ → {name}/ - Existing sdg.json and metadata.json files - Auto-generated README.md (dataset card) @@ -60,10 +65,19 @@ def upload_dataset( Raises: HuggingFaceUploadError: If validation fails or upload encounters errors """ + logger.info(f"🤗 Uploading dataset to HuggingFace Hub: {repo_id}") + self._validate_repo_id(repo_id) 
self._validate_dataset_path(base_dataset_path) + logger.info(f"|-- {RandomEmoji.working()} Checking if repository exists...") try: + repo_exists = self._api.repo_exists(repo_id=repo_id, repo_type="dataset") + if repo_exists: + logger.info(f"|-- {RandomEmoji.success()} Repository already exists, updating content...") + else: + logger.info(f"|-- {RandomEmoji.working()} Creating new repository...") + self._api.create_repo( repo_id=repo_id, repo_type="dataset", @@ -87,11 +101,13 @@ def upload_dataset( except Exception as e: raise HuggingFaceUploadError(f"Unexpected error creating repository '{repo_id}': {e}") from e + logger.info(f"|-- {RandomEmoji.data()} Uploading dataset card...") try: self._upload_dataset_card(repo_id, base_dataset_path, create_pr=create_pr) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e + logger.info(f"|-- {RandomEmoji.loading()} Uploading main dataset files...") parquet_folder = base_dataset_path / "parquet-files" try: self._api.upload_folder( @@ -99,7 +115,7 @@ def upload_dataset( folder_path=str(parquet_folder), path_in_repo="data", repo_type="dataset", - commit_message="Upload main dataset parquet files", + commit_message="Upload main dataset files", create_pr=create_pr, ) except Exception as e: @@ -107,38 +123,67 @@ def upload_dataset( processors_folder = base_dataset_path / "processors-files" if processors_folder.exists(): - for processor_dir in processors_folder.iterdir(): - if processor_dir.is_dir(): - try: - self._api.upload_folder( - repo_id=repo_id, - folder_path=str(processor_dir), - path_in_repo=f"data/{processor_dir.name}", - repo_type="dataset", - commit_message=f"Upload processor outputs: {processor_dir.name}", - create_pr=create_pr, - ) - except Exception as e: - raise HuggingFaceUploadError( - f"Failed to upload processor outputs for '{processor_dir.name}': {e}" - ) from e - - for config_file in ["sdg.json", "metadata.json"]: - config_path = base_dataset_path / config_file - if config_path.exists(): + processor_dirs = [d for d in processors_folder.iterdir() if d.is_dir()] + if processor_dirs: + logger.info( + f"|-- {RandomEmoji.loading()} Uploading processor outputs ({len(processor_dirs)} processors)..." 
+ ) + for processor_dir in processor_dirs: try: - self._api.upload_file( + self._api.upload_folder( repo_id=repo_id, - path_or_fileobj=str(config_path), - path_in_repo=config_file, + folder_path=str(processor_dir), + path_in_repo=processor_dir.name, repo_type="dataset", - commit_message=f"Upload {config_file}", + commit_message=f"Upload {processor_dir.name} processor outputs", create_pr=create_pr, ) except Exception as e: - raise HuggingFaceUploadError(f"Failed to upload {config_file}: {e}") from e + raise HuggingFaceUploadError( + f"Failed to upload processor outputs for '{processor_dir.name}': {e}" + ) from e + + logger.info(f"|-- {RandomEmoji.loading()} Uploading configuration files...") + + sdg_path = base_dataset_path / "sdg.json" + if sdg_path.exists(): + try: + self._api.upload_file( + repo_id=repo_id, + path_or_fileobj=str(sdg_path), + path_in_repo="sdg.json", + repo_type="dataset", + commit_message="Upload sdg.json", + create_pr=create_pr, + ) + except Exception as e: + raise HuggingFaceUploadError(f"Failed to upload sdg.json: {e}") from e - return f"https://huggingface.co/datasets/{repo_id}" + metadata_path = base_dataset_path / "metadata.json" + if metadata_path.exists(): + try: + updated_metadata = self._update_metadata_paths(metadata_path) + with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp_file: + json.dump(updated_metadata, tmp_file, indent=2) + tmp_path = tmp_file.name + + try: + self._api.upload_file( + repo_id=repo_id, + path_or_fileobj=tmp_path, + path_in_repo="metadata.json", + repo_type="dataset", + commit_message="Upload metadata.json", + create_pr=create_pr, + ) + finally: + Path(tmp_path).unlink() + except Exception as e: + raise HuggingFaceUploadError(f"Failed to upload metadata.json: {e}") from e + + url = f"https://huggingface.co/datasets/{repo_id}" + logger.info(f"|-- {RandomEmoji.success()} Dataset uploaded successfully! View at: {url}") + return url def _upload_dataset_card(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: """Generate and upload dataset card from metadata.json. @@ -207,6 +252,42 @@ def _validate_repo_id(repo_id: str) -> None: "Names can contain alphanumeric characters, dashes, underscores, and dots." ) + @staticmethod + def _update_metadata_paths(metadata_path: Path) -> dict: + """Update file paths in metadata.json to match HuggingFace Hub structure. + + Local paths: + - parquet-files/batch_00000.parquet → data/batch_00000.parquet + - processors-files/processor1/batch_00000.parquet → processor1/batch_00000.parquet + + Args: + metadata_path: Path to metadata.json file + + Returns: + Updated metadata dictionary with corrected paths + """ + with open(metadata_path) as f: + metadata = json.load(f) + + if "file_paths" in metadata: + updated_file_paths = {} + + if "parquet-files" in metadata["file_paths"]: + updated_file_paths["data"] = [ + path.replace("parquet-files/", "data/") for path in metadata["file_paths"]["parquet-files"] + ] + + if "processor-files" in metadata["file_paths"]: + updated_file_paths["processor-files"] = {} + for processor_name, paths in metadata["file_paths"]["processor-files"].items(): + updated_file_paths["processor-files"][processor_name] = [ + path.replace(f"processors-files/{processor_name}/", f"{processor_name}/") for path in paths + ] + + metadata["file_paths"] = updated_file_paths + + return metadata + @staticmethod def _validate_dataset_path(base_dataset_path: Path) -> None: """Validate dataset directory structure. 
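A small sketch that exercises `_update_metadata_paths` on a fixture-shaped metadata file, written to a throwaway location purely for illustration; the expected output mirrors the unit test added below.

```python
import json
from pathlib import Path

from data_designer.integrations.huggingface.client import HuggingFaceHubClient

metadata_path = Path("metadata.json")  # throwaway file for illustration
metadata_path.write_text(json.dumps({
    "file_paths": {
        "parquet-files": ["parquet-files/batch_00000.parquet"],
        "processor-files": {
            "processor1": ["processors-files/processor1/batch_00000.parquet"],
        },
    }
}))

updated = HuggingFaceHubClient._update_metadata_paths(metadata_path)
print(updated["file_paths"]["data"])
# ['data/batch_00000.parquet']
print(updated["file_paths"]["processor-files"]["processor1"])
# ['processor1/batch_00000.parquet']
```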
diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index 792e0a47..7ec3b2de 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -64,11 +64,30 @@ def from_metadata( col_type = col_type.get("value", "unknown") config_types[col_type] = config_types.get(col_type, 0) + 1 + # Extract processor names from file_paths + processor_names = [] + if "file_paths" in metadata and "processor-files" in metadata["file_paths"]: + processor_names = list(metadata["file_paths"]["processor-files"].keys()) + + # Determine modalities based on column types + modalities = set() + has_text = False + for stat in column_stats: + col_type = stat.get("column_type", "") + if col_type in ["llm-text", "llm-code", "llm-structured", "llm-judge"]: + has_text = True + + if has_text: + modalities.add("text") + modalities.add("tabular") + + # Prepare tags + tags = ["synthetic", "datadesigner"] + list(modalities) + # Prepare CardData (metadata for YAML frontmatter) card_data = CardData( - library="datadesigner", size_categories=size_categories, - tags=["synthetic", "nemo-data-designer"], + tags=tags, ) # Prepare template variables @@ -84,6 +103,9 @@ def from_metadata( "config_types": config_types, "percent_complete": 100 * actual_num_records / (target_num_records + 1e-10), "current_year": datetime.now().year, + "has_processors": len(processor_names) > 0, + "processor_names": processor_names, + "tags": tags, } # Create card from template diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md index 651741eb..46de1474 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md @@ -1,14 +1,33 @@ --- -library: datadesigner size_categories: {{ size_categories }} tags: - - synthetic - - nemo-data-designer +{% for tag in tags %} + - {{ tag }} +{% endfor %} +configs: +- config_name: data + data_files: "data/*.parquet" + default: true +{% if has_processors %}{% for processor_name in processor_names %}- config_name: {{ processor_name }} + data_files: "{{ processor_name }}/*.parquet" +{% endfor %}{% endif %} --- # {{ repo_id.split('/')[-1] | title }} -This dataset was generated using **[NeMo Data Designer](https://github.com/NVIDIA-NeMo/DataDesigner)**, a framework for creating high-quality synthetic datasets. +This dataset was generated using **[NeMo Data Designer](https://github.com/NVIDIA-NeMo/DataDesigner)**, a comprehensive framework for creating high-quality synthetic datasets from scratch or using seed data. + +## About NeMo Data Designer + +NeMo Data Designer is a general framework for generating high-quality synthetic data that goes beyond simple LLM prompting. 
It provides: + +- **Diverse data generation** using statistical samplers, LLMs, or existing seed datasets +- **Relationship control** between fields with dependency-aware generation +- **Quality validation** with built-in Python, SQL, and custom local and remote validators +- **LLM-as-a-judge** scoring for quality assessment +- **Fast iteration** with preview mode before full-scale generation + +For more information, visit: [https://github.com/NVIDIA-NeMo/DataDesigner](https://github.com/NVIDIA-NeMo/DataDesigner) (`pip install data-designer`) ## Dataset Summary @@ -23,32 +42,24 @@ This dataset was generated using **[NeMo Data Designer](https://github.com/NVIDI ```python from datasets import load_dataset -# Load the dataset -dataset = load_dataset("{{ repo_id }}") -df = dataset["train"].to_pandas() +# Load the main dataset +dataset = load_dataset("{{ repo_id }}", "data", split="train") +df = dataset.to_pandas() +{% if has_processors %} +# Load processor outputs (if available){% for processor_name in processor_names %} +processor_{{ processor_name }} = load_dataset("{{ repo_id }}", "{{ processor_name }}", split="train") +df_{{ processor_name }} = processor_{{ processor_name }}.to_pandas() +{% endfor %}{% endif %} ``` ## Schema & Statistics {% if column_statistics %} -{% for stat in column_statistics %} -### {{ stat.column_name }} - -- **Type**: `{{ stat.simple_dtype }}` -- **Column Type**: {{ stat.column_type }} -- **Unique Values**: {{ stat.num_unique }} ({{ "%.1f" | format((stat.num_unique / stat.num_records * 100) if stat.num_records > 0 else 0) }}%) -{% if stat.num_null > 0 %} -- **Null Values**: {{ stat.num_null }} ({{ "%.1f" | format((stat.num_null / stat.num_records * 100) if stat.num_records > 0 else 0) }}%) -{% endif %} -{% if stat.column_type in ["llm-text", "llm-code", "llm-structured", "llm-judge"] %} -- **Avg Output Tokens**: {{ "%.1f" | format(stat.output_tokens_mean) if stat.output_tokens_mean is defined else "N/A" }} -- **Avg Input Tokens**: {{ "%.1f" | format(stat.input_tokens_mean) if stat.input_tokens_mean is defined else "N/A" }} -{% endif %} -{% if stat.column_type == "sampler" and stat.sampler_type is defined %} -- **Sampler Type**: {% if stat.sampler_type is mapping %}{{ stat.sampler_type.value }}{% else %}{{ stat.sampler_type }}{% endif %} -{% endif %} - -{% endfor %} +| Column | Type | Column Type | Unique (%) | Null (%) | Details | +|--------|------|-------------|------------|----------|---------| +{% for stat in column_statistics -%} +| `{{ stat.column_name }}` | `{{ stat.simple_dtype }}` | {{ stat.column_type }} | {{ stat.num_unique }} ({{ "%.1f" | format((stat.num_unique / stat.num_records * 100) if stat.num_records > 0 else 0) }}%) | {{ stat.num_null if stat.num_null > 0 else 0 }} ({{ "%.1f" | format((stat.num_null / stat.num_records * 100) if stat.num_records > 0 else 0) }}%) | {% if stat.column_type in ["llm-text", "llm-code", "llm-structured", "llm-judge"] %}Tokens: {{ "%.0f" | format(stat.output_tokens_mean) if stat.output_tokens_mean is defined else "N/A" }} out / {{ "%.0f" | format(stat.input_tokens_mean) if stat.input_tokens_mean is defined else "N/A" }} in{% elif stat.column_type == "sampler" and stat.sampler_type is defined %}{% if stat.sampler_type is mapping %}{{ stat.sampler_type.value }}{% else %}{{ stat.sampler_type }}{% endif %}{% else %}-{% endif %} | +{% endfor -%} {% else %} | Column | Type | |--------|------| @@ -67,7 +78,7 @@ Generated with {{ num_columns_configured }} column configuration(s): {% endfor %} {% endif %} -Full configuration 
available in `sdg.json` and detailed metadata in `metadata.json`. +Full configuration available in [`sdg.json`](sdg.json) and detailed metadata in [`metadata.json`](metadata.json). ## Citation diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index b39c265e..4281e6f0 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -110,7 +110,7 @@ def push_to_hub( Uploads all artifacts including: - Main parquet batch files (data subset) - - Processor output batch files (data/{processor_name} subsets) + - Processor output batch files ({processor_name} subsets) - Configuration (sdg.json) - Metadata (metadata.json) - Auto-generated dataset card (README.md) diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index 3e5bcdb1..efc64ffb 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -147,9 +147,8 @@ def test_upload_dataset_uploads_parquet_files(mock_hf_api: MagicMock, sample_dat ) # Check that upload_folder was called for parquet files - calls = [call for call in mock_hf_api.upload_folder.call_args_list if "parquet-files" in str(call)] - assert len(calls) == 1 - assert calls[0].kwargs["path_in_repo"] == "data" + calls = [call for call in mock_hf_api.upload_folder.call_args_list if call.kwargs["path_in_repo"] == "data"] + assert len(calls) >= 1 def test_upload_dataset_uploads_processor_outputs(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: @@ -163,9 +162,8 @@ def test_upload_dataset_uploads_processor_outputs(mock_hf_api: MagicMock, sample ) # Check that upload_folder was called for processor outputs - calls = [call for call in mock_hf_api.upload_folder.call_args_list if "processor1" in str(call)] - assert len(calls) == 1 - assert calls[0].kwargs["path_in_repo"] == "data/processor1" + calls = [call for call in mock_hf_api.upload_folder.call_args_list if "processor1" in call.kwargs["path_in_repo"]] + assert len(calls) >= 1 def test_upload_dataset_uploads_config_files(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: @@ -180,8 +178,6 @@ def test_upload_dataset_uploads_config_files(mock_hf_api: MagicMock, sample_data # Check that upload_file was called for config files upload_file_calls = mock_hf_api.upload_file.call_args_list - assert len(upload_file_calls) == 2 - uploaded_files = [call.kwargs["path_in_repo"] for call in upload_file_calls] assert "sdg.json" in uploaded_files assert "metadata.json" in uploaded_files @@ -295,8 +291,11 @@ def test_upload_dataset_without_processors(mock_hf_api: MagicMock, tmp_path: Pat # Should only upload parquet files, not processors folder_calls = mock_hf_api.upload_folder.call_args_list - assert len(folder_calls) == 1 # Only main parquet files - assert folder_calls[0].kwargs["path_in_repo"] == "data" + data_calls = [call for call in folder_calls if call.kwargs["path_in_repo"] == "data"] + processor_calls = [call for call in folder_calls if "processor" in call.kwargs["path_in_repo"]] + + assert len(data_calls) == 1 # Main parquet files uploaded + assert len(processor_calls) == 0 # No processor files def test_upload_dataset_without_sdg_config(mock_hf_api: MagicMock, tmp_path: Path) -> None: @@ -323,8 +322,11 @@ def test_upload_dataset_without_sdg_config(mock_hf_api: 
MagicMock, tmp_path: Pat # Should only upload metadata.json, not sdg.json file_calls = mock_hf_api.upload_file.call_args_list - assert len(file_calls) == 1 - assert file_calls[0].kwargs["path_in_repo"] == "metadata.json" + uploaded_files = [call.kwargs["path_in_repo"] for call in file_calls] + + assert len(uploaded_files) == 1 # Only metadata.json + assert "metadata.json" in uploaded_files + assert "sdg.json" not in uploaded_files def test_upload_dataset_multiple_processors(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: @@ -341,10 +343,10 @@ def test_upload_dataset_multiple_processors(mock_hf_api: MagicMock, sample_datas folder_calls = mock_hf_api.upload_folder.call_args_list processor_calls = [call for call in folder_calls if "processor" in call.kwargs["path_in_repo"]] - assert len(processor_calls) == 2 + assert len(processor_calls) >= 2 processor_paths = [call.kwargs["path_in_repo"] for call in processor_calls] - assert "data/processor1" in processor_paths - assert "data/processor2" in processor_paths + assert any("processor1" in path for path in processor_paths) + assert any("processor2" in path for path in processor_paths) # Error handling and validation tests @@ -521,3 +523,34 @@ def test_upload_dataset_card_invalid_json(tmp_path: Path) -> None: with pytest.raises(HuggingFaceUploadError, match="Failed to parse metadata.json"): client._upload_dataset_card("test/dataset", base_path) + + +def test_update_metadata_paths(tmp_path: Path) -> None: + """Test that _update_metadata_paths correctly updates file paths for HuggingFace Hub.""" + metadata = { + "target_num_records": 100, + "file_paths": { + "parquet-files": [ + "parquet-files/batch_00000.parquet", + "parquet-files/batch_00001.parquet", + ], + "processor-files": { + "processor1": ["processors-files/processor1/batch_00000.parquet"], + "processor2": ["processors-files/processor2/batch_00000.parquet"], + }, + }, + } + + metadata_path = tmp_path / "metadata.json" + with open(metadata_path, "w") as f: + json.dump(metadata, f) + + updated = HuggingFaceHubClient._update_metadata_paths(metadata_path) + + assert updated["file_paths"]["data"] == [ + "data/batch_00000.parquet", + "data/batch_00001.parquet", + ] + assert updated["file_paths"]["processor-files"]["processor1"] == ["processor1/batch_00000.parquet"] + assert updated["file_paths"]["processor-files"]["processor2"] == ["processor2/batch_00000.parquet"] + assert "parquet-files" not in updated["file_paths"] diff --git a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py index e4b3b8eb..b2821c6c 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py +++ b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py @@ -122,4 +122,43 @@ def test_from_metadata_with_llm_columns() -> None: # Verify LLM statistics are included assert card is not None - assert "50.5" in str(card) or "Avg Output Tokens" in str(card) + assert "Tokens:" in str(card) and "out" in str(card) and "in" in str(card) + + +def test_from_metadata_with_processors() -> None: + """Test creating dataset card with processor outputs includes loading examples.""" + metadata = { + "target_num_records": 100, + "schema": {"col1": "string"}, + "file_paths": { + "parquet-files": ["parquet-files/batch_00000.parquet"], + "processor-files": { + "processor1": ["processors-files/processor1/batch_00000.parquet"], + "processor2": ["processors-files/processor2/batch_00000.parquet"], + 
}, + }, + "column_statistics": [ + { + "column_name": "col1", + "num_records": 100, + "num_unique": 100, + "num_null": 0, + "simple_dtype": "string", + "column_type": "sampler", + } + ], + } + + card = DataDesignerDatasetCard.from_metadata( + metadata=metadata, + sdg_config=None, + repo_id="test/dataset-with-processors", + ) + + card_str = str(card) + assert card is not None + assert "processor1" in card_str + assert "processor2" in card_str + assert '"processor1"' in card_str + assert '"processor2"' in card_str + assert "Load processor outputs" in card_str From 0cd2dd13a365ac499dc5620c3cb9d6c136178cc5 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 11:55:34 -0700 Subject: [PATCH 03/25] feat: add optional description parameter to push_to_hub - Add description parameter to push_to_hub() for custom dataset card content - Description appears after NeMo Data Designer intro section - Update dataset card template to conditionally render custom description - Add tests for with/without custom description scenarios --- .../integrations/huggingface/client.py | 10 +++- .../integrations/huggingface/dataset_card.py | 3 + .../huggingface/dataset_card_template.md | 4 ++ .../src/data_designer/interface/results.py | 8 +++ .../huggingface/test_dataset_card.py | 59 +++++++++++++++++++ 5 files changed, 82 insertions(+), 2 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 8b182b25..a5ada120 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -44,6 +44,7 @@ def upload_dataset( *, private: bool = False, create_pr: bool = False, + description: str | None = None, ) -> str: """Upload dataset to HuggingFace Hub. @@ -58,6 +59,7 @@ def upload_dataset( base_dataset_path: Path to base_dataset_path (contains parquet-files/, sdg.json, etc.) private: Whether to create private repo create_pr: Whether to create a PR instead of direct push + description: Optional custom description text for dataset card Returns: URL to the uploaded dataset @@ -103,7 +105,7 @@ def upload_dataset( logger.info(f"|-- {RandomEmoji.data()} Uploading dataset card...") try: - self._upload_dataset_card(repo_id, base_dataset_path, create_pr=create_pr) + self._upload_dataset_card(repo_id, base_dataset_path, create_pr=create_pr, description=description) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e @@ -185,13 +187,16 @@ def upload_dataset( logger.info(f"|-- {RandomEmoji.success()} Dataset uploaded successfully! View at: {url}") return url - def _upload_dataset_card(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: + def _upload_dataset_card( + self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False, description: str | None = None + ) -> None: """Generate and upload dataset card from metadata.json. 
Args: repo_id: HuggingFace repo ID base_dataset_path: Path to dataset artifacts create_pr: Whether to create a PR instead of direct push + description: Optional custom description text for dataset card Raises: HuggingFaceUploadError: If dataset card generation or upload fails @@ -221,6 +226,7 @@ def _upload_dataset_card(self, repo_id: str, base_dataset_path: Path, *, create_ metadata=metadata, sdg_config=sdg_config, repo_id=repo_id, + description=description, ) except Exception as e: raise HuggingFaceUploadError(f"Failed to generate dataset card: {e}") from e diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index 7ec3b2de..02960a17 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -27,6 +27,7 @@ def from_metadata( metadata: dict, sdg_config: dict | None, repo_id: str, + description: str | None = None, ) -> DataDesignerDatasetCard: """Create dataset card from metadata.json and sdg.json. @@ -34,6 +35,7 @@ def from_metadata( metadata: Contents of metadata.json sdg_config: Contents of sdg.json (optional) repo_id: HuggingFace repo ID + description: Optional custom description text Returns: DataDesignerDatasetCard instance ready to upload @@ -106,6 +108,7 @@ def from_metadata( "has_processors": len(processor_names) > 0, "processor_names": processor_names, "tags": tags, + "custom_description": description, } # Create card from template diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md index 46de1474..1e63be49 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md @@ -16,6 +16,10 @@ configs: # {{ repo_id.split('/')[-1] | title }} This dataset was generated using **[NeMo Data Designer](https://github.com/NVIDIA-NeMo/DataDesigner)**, a comprehensive framework for creating high-quality synthetic datasets from scratch or using seed data. +{% if custom_description %} + +{{ custom_description }} +{% endif %} ## About NeMo Data Designer diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index 4281e6f0..3f1072d7 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -105,6 +105,7 @@ def push_to_hub( token: str | None = None, private: bool = False, create_pr: bool = False, + description: str | None = None, ) -> str: """Push dataset to HuggingFace Hub. @@ -122,6 +123,8 @@ def push_to_hub( from `huggingface-cli login`. private: Create private repo create_pr: Create PR instead of direct push + description: Optional custom description text for the dataset card. + Appears after the NeMo Data Designer intro. Returns: URL to the uploaded dataset @@ -130,6 +133,10 @@ def push_to_hub( >>> results = data_designer.create(config, num_records=1000) >>> results.push_to_hub("username/my-synthetic-dataset") 'https://huggingface.co/datasets/username/my-synthetic-dataset' + + >>> # With custom description + >>> description = "This dataset contains synthetic conversations for training chatbots." 
+ >>> results.push_to_hub("username/my-dataset", description=description) """ client = HuggingFaceHubClient(token=token) return client.upload_dataset( @@ -137,4 +144,5 @@ def push_to_hub( base_dataset_path=self.artifact_storage.base_dataset_path, private=private, create_pr=create_pr, + description=description, ) diff --git a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py index b2821c6c..40b8ecaa 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py +++ b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py @@ -162,3 +162,62 @@ def test_from_metadata_with_processors() -> None: assert '"processor1"' in card_str assert '"processor2"' in card_str assert "Load processor outputs" in card_str + + +def test_from_metadata_with_custom_description() -> None: + """Test creating dataset card with custom description.""" + metadata = { + "target_num_records": 100, + "schema": {"col1": "string", "col2": "int64"}, + "column_statistics": [ + { + "column_name": "col1", + "num_records": 100, + "num_unique": 100, + "num_null": 0, + "simple_dtype": "string", + "column_type": "sampler", + } + ], + } + + description = "This dataset contains synthetic data for testing chatbot responses." + + card = DataDesignerDatasetCard.from_metadata( + metadata=metadata, + sdg_config=None, + repo_id="test/dataset-with-description", + description=description, + ) + + card_str = str(card) + assert card is not None + assert "This dataset contains synthetic data for testing chatbot responses." in card_str + + +def test_from_metadata_without_custom_description() -> None: + """Test creating dataset card without custom description.""" + metadata = { + "target_num_records": 50, + "schema": {"col1": "string"}, + "column_statistics": [ + { + "column_name": "col1", + "num_records": 50, + "num_unique": 50, + "num_null": 0, + "simple_dtype": "string", + "column_type": "sampler", + } + ], + } + + card = DataDesignerDatasetCard.from_metadata( + metadata=metadata, + sdg_config=None, + repo_id="test/dataset-no-description", + ) + + card_str = str(card) + assert card is not None + assert "About NeMo Data Designer" in card_str From da2acc881264cbaf677fba65f2165d85e5b1ae36 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 14:38:29 -0700 Subject: [PATCH 04/25] feat: make description required and enhance dataset card design MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Make description parameter required in push_to_hub() - Improve dataset card layout with flexbox header (title + right-aligned tagline) - Add horizontal dividers between sections for visual separation - Add emoji icons to section headers for better readability - Move About NeMo Data Designer section after Citation - Update section order: Description → Quick Start → Dataset Summary → Schema & Statistics → Generation Details → Citation → About - Update all tests to provide required description parameter --- .../integrations/huggingface/client.py | 10 +-- .../integrations/huggingface/dataset_card.py | 4 +- .../huggingface/dataset_card_template.md | 70 +++++++++++-------- .../src/data_designer/interface/results.py | 13 ++-- .../integrations/huggingface/test_client.py | 19 ++++- .../huggingface/test_dataset_card.py | 16 ++++- 6 files changed, 83 insertions(+), 49 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py 
b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index a5ada120..b655c0dd 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -41,10 +41,10 @@ def upload_dataset( self, repo_id: str, base_dataset_path: Path, + description: str, *, private: bool = False, create_pr: bool = False, - description: str | None = None, ) -> str: """Upload dataset to HuggingFace Hub. @@ -57,9 +57,9 @@ def upload_dataset( Args: repo_id: HuggingFace repo ID (e.g., "username/dataset-name") base_dataset_path: Path to base_dataset_path (contains parquet-files/, sdg.json, etc.) + description: Custom description text for dataset card private: Whether to create private repo create_pr: Whether to create a PR instead of direct push - description: Optional custom description text for dataset card Returns: URL to the uploaded dataset @@ -105,7 +105,7 @@ def upload_dataset( logger.info(f"|-- {RandomEmoji.data()} Uploading dataset card...") try: - self._upload_dataset_card(repo_id, base_dataset_path, create_pr=create_pr, description=description) + self._upload_dataset_card(repo_id, base_dataset_path, description, create_pr=create_pr) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e @@ -188,15 +188,15 @@ def upload_dataset( return url def _upload_dataset_card( - self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False, description: str | None = None + self, repo_id: str, base_dataset_path: Path, description: str, *, create_pr: bool = False ) -> None: """Generate and upload dataset card from metadata.json. Args: repo_id: HuggingFace repo ID base_dataset_path: Path to dataset artifacts + description: Custom description text for dataset card create_pr: Whether to create a PR instead of direct push - description: Optional custom description text for dataset card Raises: HuggingFaceUploadError: If dataset card generation or upload fails diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index 02960a17..43ccb48d 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -27,7 +27,7 @@ def from_metadata( metadata: dict, sdg_config: dict | None, repo_id: str, - description: str | None = None, + description: str, ) -> DataDesignerDatasetCard: """Create dataset card from metadata.json and sdg.json. @@ -35,7 +35,7 @@ def from_metadata( metadata: Contents of metadata.json sdg_config: Contents of sdg.json (optional) repo_id: HuggingFace repo ID - description: Optional custom description text + description: Custom description text Returns: DataDesignerDatasetCard instance ready to upload diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md index 1e63be49..89c87e67 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md @@ -13,35 +13,18 @@ configs: {% endfor %}{% endif %} --- -# {{ repo_id.split('/')[-1] | title }} +
<div style="display: flex; justify-content: space-between; align-items: center;">
+<h1 style="margin: 0;">{{ repo_id.split('/')[-1] | title }}</h1>
+<p style="margin: 0;">Made with ❤️ using 🎨 NeMo Data Designer</p>
+</div>
-This dataset was generated using **[NeMo Data Designer](https://github.com/NVIDIA-NeMo/DataDesigner)**, a comprehensive framework for creating high-quality synthetic datasets from scratch or using seed data. -{% if custom_description %} +--- {{ custom_description }} -{% endif %} - -## About NeMo Data Designer - -NeMo Data Designer is a general framework for generating high-quality synthetic data that goes beyond simple LLM prompting. It provides: - -- **Diverse data generation** using statistical samplers, LLMs, or existing seed datasets -- **Relationship control** between fields with dependency-aware generation -- **Quality validation** with built-in Python, SQL, and custom local and remote validators -- **LLM-as-a-judge** scoring for quality assessment -- **Fast iteration** with preview mode before full-scale generation -For more information, visit: [https://github.com/NVIDIA-NeMo/DataDesigner](https://github.com/NVIDIA-NeMo/DataDesigner) (`pip install data-designer`) - -## Dataset Summary - -- **Records**: {{ "{:,}".format(num_records) }} -- **Columns**: {{ num_columns }} -{% if target_num_records != num_records %} -- **Completion**: {{ "%.1f" | format(percent_complete) }}% ({{ "{:,}".format(target_num_records) }} requested) -{% endif %} +--- -## Quick Start +## 🚀 Quick Start ```python from datasets import load_dataset @@ -50,13 +33,26 @@ from datasets import load_dataset dataset = load_dataset("{{ repo_id }}", "data", split="train") df = dataset.to_pandas() {% if has_processors %} + # Load processor outputs (if available){% for processor_name in processor_names %} processor_{{ processor_name }} = load_dataset("{{ repo_id }}", "{{ processor_name }}", split="train") df_{{ processor_name }} = processor_{{ processor_name }}.to_pandas() {% endfor %}{% endif %} ``` -## Schema & Statistics +--- + +## 📊 Dataset Summary + +- **📈 Records**: {{ "{:,}".format(num_records) }} +- **📋 Columns**: {{ num_columns }} +{% if target_num_records != num_records %} +- **✅ Completion**: {{ "%.1f" | format(percent_complete) }}% ({{ "{:,}".format(target_num_records) }} requested) +{% endif %} + +--- + +## 📋 Schema & Statistics {% if column_statistics %} | Column | Type | Column Type | Unique (%) | Null (%) | Details | @@ -72,7 +68,9 @@ df_{{ processor_name }} = processor_{{ processor_name }}.to_pandas() {% endfor %} {% endif %} -## Generation Details +--- + +## ⚙️ Generation Details {% if config_types %} Generated with {{ num_columns_configured }} column configuration(s): @@ -80,11 +78,13 @@ Generated with {{ num_columns_configured }} column configuration(s): {% for col_type, count in config_types.items() | sort %} - **{{ col_type }}**: {{ count }} column(s) {% endfor %} + {% endif %} +📄 Full configuration available in [`sdg.json`](sdg.json) and detailed metadata in [`metadata.json`](metadata.json). -Full configuration available in [`sdg.json`](sdg.json) and detailed metadata in [`metadata.json`](metadata.json). +--- -## Citation +## 📚 Citation ```bibtex @misc{nemo-data-designer, @@ -95,3 +95,17 @@ Full configuration available in [`sdg.json`](sdg.json) and detailed metadata in note = {GitHub Repository}, } ``` + +--- + +## 💡 About NeMo Data Designer + +NeMo Data Designer is a general framework for generating high-quality synthetic data that goes beyond simple LLM prompting. 
It provides: + +- **Diverse data generation** using statistical samplers, LLMs, or existing seed datasets +- **Relationship control** between fields with dependency-aware generation +- **Quality validation** with built-in Python, SQL, and custom local and remote validators +- **LLM-as-a-judge** scoring for quality assessment +- **Fast iteration** with preview mode before full-scale generation + +For more information, visit: [https://github.com/NVIDIA-NeMo/DataDesigner](https://github.com/NVIDIA-NeMo/DataDesigner) (`pip install data-designer`) diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index 3f1072d7..cde3b95d 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -101,11 +101,11 @@ def get_path_to_processor_artifacts(self, processor_name: str) -> Path: def push_to_hub( self, repo_id: str, + description: str, *, token: str | None = None, private: bool = False, create_pr: bool = False, - description: str | None = None, ) -> str: """Push dataset to HuggingFace Hub. @@ -118,25 +118,22 @@ def push_to_hub( Args: repo_id: HuggingFace repo ID (e.g., "username/my-dataset") + description: Custom description text for the dataset card. + Appears after the title. token: HuggingFace API token. If None, the token is automatically resolved from HF_TOKEN environment variable or cached credentials from `huggingface-cli login`. private: Create private repo create_pr: Create PR instead of direct push - description: Optional custom description text for the dataset card. - Appears after the NeMo Data Designer intro. Returns: URL to the uploaded dataset Example: >>> results = data_designer.create(config, num_records=1000) - >>> results.push_to_hub("username/my-synthetic-dataset") - 'https://huggingface.co/datasets/username/my-synthetic-dataset' - - >>> # With custom description >>> description = "This dataset contains synthetic conversations for training chatbots." 
- >>> results.push_to_hub("username/my-dataset", description=description) + >>> results.push_to_hub("username/my-synthetic-dataset", description) + 'https://huggingface.co/datasets/username/my-synthetic-dataset' """ client = HuggingFaceHubClient(token=token) return client.upload_dataset( diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index efc64ffb..25f75f23 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -126,6 +126,7 @@ def test_upload_dataset_creates_repo(mock_hf_api: MagicMock, sample_dataset_path client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", ) mock_hf_api.create_repo.assert_called_once_with( @@ -144,6 +145,7 @@ def test_upload_dataset_uploads_parquet_files(mock_hf_api: MagicMock, sample_dat client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", ) # Check that upload_folder was called for parquet files @@ -159,6 +161,7 @@ def test_upload_dataset_uploads_processor_outputs(mock_hf_api: MagicMock, sample client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", ) # Check that upload_folder was called for processor outputs @@ -174,6 +177,7 @@ def test_upload_dataset_uploads_config_files(mock_hf_api: MagicMock, sample_data client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", ) # Check that upload_file was called for config files @@ -191,6 +195,7 @@ def test_upload_dataset_returns_url(mock_hf_api: MagicMock, sample_dataset_path: url = client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", ) assert url == "https://huggingface.co/datasets/test/dataset" @@ -204,6 +209,7 @@ def test_upload_dataset_with_private_repo(mock_hf_api: MagicMock, sample_dataset client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", private=True, ) @@ -223,6 +229,7 @@ def test_upload_dataset_with_create_pr(mock_hf_api: MagicMock, sample_dataset_pa client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", create_pr=True, ) @@ -240,7 +247,7 @@ def test_upload_dataset_card_missing_metadata(tmp_path: Path) -> None: base_path.mkdir() with pytest.raises(HuggingFaceUploadError, match="Failed to read metadata.json"): - client._upload_dataset_card("test/dataset", base_path) + client._upload_dataset_card("test/dataset", base_path, "Test description") def test_upload_dataset_card_calls_push_to_hub(sample_dataset_path: Path) -> None: @@ -251,7 +258,7 @@ def test_upload_dataset_card_calls_push_to_hub(sample_dataset_path: Path) -> Non mock_card = MagicMock() mock_card_class.from_metadata.return_value = mock_card - client._upload_dataset_card("test/dataset", sample_dataset_path) + client._upload_dataset_card("test/dataset", sample_dataset_path, "Test description") # Verify card was created from metadata mock_card_class.from_metadata.assert_called_once() @@ -287,6 +294,7 @@ def test_upload_dataset_without_processors(mock_hf_api: MagicMock, tmp_path: Pat client.upload_dataset( repo_id="test/dataset", base_dataset_path=base_path, + description="Test dataset", ) # Should only upload parquet files, not 
processors @@ -318,6 +326,7 @@ def test_upload_dataset_without_sdg_config(mock_hf_api: MagicMock, tmp_path: Pat client.upload_dataset( repo_id="test/dataset", base_dataset_path=base_path, + description="Test dataset", ) # Should only upload metadata.json, not sdg.json @@ -337,6 +346,7 @@ def test_upload_dataset_multiple_processors(mock_hf_api: MagicMock, sample_datas client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", ) # Check that both processors were uploaded @@ -475,6 +485,7 @@ def test_upload_dataset_invalid_repo_id(mock_hf_api: MagicMock, sample_dataset_p client.upload_dataset( repo_id="invalid-repo-id", # Missing slash base_dataset_path=sample_dataset_path, + description="Test dataset", ) @@ -493,6 +504,7 @@ def test_upload_dataset_authentication_error(mock_hf_api: MagicMock, sample_data client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", ) @@ -511,6 +523,7 @@ def test_upload_dataset_permission_error(mock_hf_api: MagicMock, sample_dataset_ client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, + description="Test dataset", ) @@ -522,7 +535,7 @@ def test_upload_dataset_card_invalid_json(tmp_path: Path) -> None: (base_path / "metadata.json").write_text("invalid json") with pytest.raises(HuggingFaceUploadError, match="Failed to parse metadata.json"): - client._upload_dataset_card("test/dataset", base_path) + client._upload_dataset_card("test/dataset", base_path, "Test description") def test_update_metadata_paths(tmp_path: Path) -> None: diff --git a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py index 40b8ecaa..956720e9 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py +++ b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py @@ -37,6 +37,7 @@ def test_from_metadata_minimal() -> None: metadata=metadata, sdg_config=None, repo_id="test/dataset", + description="Test dataset for unit testing.", ) # Verify card was created @@ -87,6 +88,7 @@ def test_from_metadata_with_sdg_config() -> None: metadata=metadata, sdg_config=sdg_config, repo_id="test/dataset-with-config", + description="Test dataset with SDG config.", ) # Verify card includes config info @@ -118,6 +120,7 @@ def test_from_metadata_with_llm_columns() -> None: metadata=metadata, sdg_config=None, repo_id="test/llm-dataset", + description="Test dataset with LLM columns.", ) # Verify LLM statistics are included @@ -153,6 +156,7 @@ def test_from_metadata_with_processors() -> None: metadata=metadata, sdg_config=None, repo_id="test/dataset-with-processors", + description="Test dataset with processor outputs.", ) card_str = str(card) @@ -195,8 +199,8 @@ def test_from_metadata_with_custom_description() -> None: assert "This dataset contains synthetic data for testing chatbot responses." 
in card_str -def test_from_metadata_without_custom_description() -> None: - """Test creating dataset card without custom description.""" +def test_from_metadata_description_placement() -> None: + """Test that description appears in the correct location.""" metadata = { "target_num_records": 50, "schema": {"col1": "string"}, @@ -215,9 +219,15 @@ def test_from_metadata_without_custom_description() -> None: card = DataDesignerDatasetCard.from_metadata( metadata=metadata, sdg_config=None, - repo_id="test/dataset-no-description", + repo_id="test/dataset-description-placement", + description="Test description placement.", ) card_str = str(card) assert card is not None + assert "Test description placement." in card_str assert "About NeMo Data Designer" in card_str + # Description should appear before Dataset Summary + desc_pos = card_str.find("Test description placement.") + summary_pos = card_str.find("Dataset Summary") + assert desc_pos < summary_pos From 5b83a1a24c5b41fc2b94173a9e73bc0ae31f979e Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 14:50:29 -0700 Subject: [PATCH 05/25] fix license headers --- .../src/data_designer/integrations/huggingface/__init__.py | 2 +- .../src/data_designer/integrations/huggingface/client.py | 2 +- .../src/data_designer/integrations/huggingface/dataset_card.py | 2 +- .../data-designer/tests/integrations/huggingface/__init__.py | 2 +- .../data-designer/tests/integrations/huggingface/test_client.py | 2 +- .../tests/integrations/huggingface/test_dataset_card.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py b/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py index 99b9d93e..bbdaddff 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceUploadError diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index b655c0dd..c73c6f89 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index 43ccb48d..606a54f9 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/packages/data-designer/tests/integrations/huggingface/__init__.py b/packages/data-designer/tests/integrations/huggingface/__init__.py index 1a8431c3..52a7a9da 100644 --- a/packages/data-designer/tests/integrations/huggingface/__init__.py +++ b/packages/data-designer/tests/integrations/huggingface/__init__.py @@ -1,2 +1,2 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index 25f75f23..b946c473 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations diff --git a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py index 956720e9..aa573cd6 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py +++ b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 from __future__ import annotations From 0ecba21c1f0b6d8d842bcc18f9f9b1719fbcd27b Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:02:42 -0700 Subject: [PATCH 06/25] remove modality deteciton --- .../integrations/huggingface/dataset_card.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index 606a54f9..766bde8c 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -71,20 +71,8 @@ def from_metadata( if "file_paths" in metadata and "processor-files" in metadata["file_paths"]: processor_names = list(metadata["file_paths"]["processor-files"].keys()) - # Determine modalities based on column types - modalities = set() - has_text = False - for stat in column_stats: - col_type = stat.get("column_type", "") - if col_type in ["llm-text", "llm-code", "llm-structured", "llm-judge"]: - has_text = True - - if has_text: - modalities.add("text") - modalities.add("tabular") - # Prepare tags - tags = ["synthetic", "datadesigner"] + list(modalities) + tags = ["synthetic", "datadesigner"] # Prepare CardData (metadata for YAML frontmatter) card_data = CardData( From 08b8aa6021674b25dd60c2493f794ebe8de8dded Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:06:29 -0700 Subject: [PATCH 07/25] break up upload_dataset --- .../integrations/huggingface/client.py | 107 +++++++++++++----- 1 file changed, 79 insertions(+), 28 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index c73c6f89..3120934f 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -72,6 +72,32 @@ def upload_dataset( self._validate_repo_id(repo_id) self._validate_dataset_path(base_dataset_path) + self._create_or_get_repo(repo_id, private=private) + + logger.info(f"|-- {RandomEmoji.data()} Uploading dataset card...") + try: + self._upload_dataset_card(repo_id, base_dataset_path, description, create_pr=create_pr) + except Exception as e: + raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e + + self._upload_main_dataset_files(repo_id, base_dataset_path, create_pr=create_pr) + self._upload_processor_files(repo_id, base_dataset_path, create_pr=create_pr) + self._upload_config_files(repo_id, base_dataset_path, create_pr=create_pr) + + url = f"https://huggingface.co/datasets/{repo_id}" + logger.info(f"|-- {RandomEmoji.success()} Dataset uploaded successfully! View at: {url}") + return url + + def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: + """Create or get existing repository on HuggingFace Hub. 
+ + Args: + repo_id: HuggingFace repo ID + private: Whether to create private repo + + Raises: + HuggingFaceUploadError: If repository creation fails + """ logger.info(f"|-- {RandomEmoji.working()} Checking if repository exists...") try: repo_exists = self._api.repo_exists(repo_id=repo_id, repo_type="dataset") @@ -103,12 +129,17 @@ def upload_dataset( except Exception as e: raise HuggingFaceUploadError(f"Unexpected error creating repository '{repo_id}': {e}") from e - logger.info(f"|-- {RandomEmoji.data()} Uploading dataset card...") - try: - self._upload_dataset_card(repo_id, base_dataset_path, description, create_pr=create_pr) - except Exception as e: - raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e + def _upload_main_dataset_files(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: + """Upload main parquet dataset files. + + Args: + repo_id: HuggingFace repo ID + base_dataset_path: Path to dataset directory + create_pr: Whether to create a PR instead of direct push + Raises: + HuggingFaceUploadError: If upload fails + """ logger.info(f"|-- {RandomEmoji.loading()} Uploading main dataset files...") parquet_folder = base_dataset_path / "parquet-files" try: @@ -123,28 +154,52 @@ def upload_dataset( except Exception as e: raise HuggingFaceUploadError(f"Failed to upload parquet files: {e}") from e + def _upload_processor_files(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: + """Upload processor output files. + + Args: + repo_id: HuggingFace repo ID + base_dataset_path: Path to dataset directory + create_pr: Whether to create a PR instead of direct push + + Raises: + HuggingFaceUploadError: If upload fails + """ processors_folder = base_dataset_path / "processors-files" - if processors_folder.exists(): - processor_dirs = [d for d in processors_folder.iterdir() if d.is_dir()] - if processor_dirs: - logger.info( - f"|-- {RandomEmoji.loading()} Uploading processor outputs ({len(processor_dirs)} processors)..." + if not processors_folder.exists(): + return + + processor_dirs = [d for d in processors_folder.iterdir() if d.is_dir()] + if not processor_dirs: + return + + logger.info(f"|-- {RandomEmoji.loading()} Uploading processor outputs ({len(processor_dirs)} processors)...") + for processor_dir in processor_dirs: + try: + self._api.upload_folder( + repo_id=repo_id, + folder_path=str(processor_dir), + path_in_repo=processor_dir.name, + repo_type="dataset", + commit_message=f"Upload {processor_dir.name} processor outputs", + create_pr=create_pr, ) - for processor_dir in processor_dirs: - try: - self._api.upload_folder( - repo_id=repo_id, - folder_path=str(processor_dir), - path_in_repo=processor_dir.name, - repo_type="dataset", - commit_message=f"Upload {processor_dir.name} processor outputs", - create_pr=create_pr, - ) - except Exception as e: - raise HuggingFaceUploadError( - f"Failed to upload processor outputs for '{processor_dir.name}': {e}" - ) from e + except Exception as e: + raise HuggingFaceUploadError( + f"Failed to upload processor outputs for '{processor_dir.name}': {e}" + ) from e + + def _upload_config_files(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: + """Upload configuration files (sdg.json and metadata.json). 
+ + Args: + repo_id: HuggingFace repo ID + base_dataset_path: Path to dataset directory + create_pr: Whether to create a PR instead of direct push + Raises: + HuggingFaceUploadError: If upload fails + """ logger.info(f"|-- {RandomEmoji.loading()} Uploading configuration files...") sdg_path = base_dataset_path / "sdg.json" @@ -183,10 +238,6 @@ def upload_dataset( except Exception as e: raise HuggingFaceUploadError(f"Failed to upload metadata.json: {e}") from e - url = f"https://huggingface.co/datasets/{repo_id}" - logger.info(f"|-- {RandomEmoji.success()} Dataset uploaded successfully! View at: {url}") - return url - def _upload_dataset_card( self, repo_id: str, base_dataset_path: Path, description: str, *, create_pr: bool = False ) -> None: From ddd562900facd45387b9e38b397a2b1a61739964 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:09:56 -0700 Subject: [PATCH 08/25] make token private --- .../data_designer/integrations/huggingface/client.py | 11 ++++++++++- .../tests/integrations/huggingface/test_client.py | 4 ++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 3120934f..8cfbb9dd 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -34,9 +34,18 @@ def __init__(self, token: str | None = None): resolved from HF_TOKEN environment variable or cached credentials from `huggingface-cli login`. """ - self.token = token + self._token = token self._api = HfApi(token=token) + @property + def has_token(self) -> bool: + """Check if a token was explicitly provided. + + Returns: + True if a token was provided during initialization, False otherwise. 
+ """ + return self._token is not None + def upload_dataset( self, repo_id: str, diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index b946c473..7c54722e 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -108,14 +108,14 @@ def test_client_initialization() -> None: """Test HuggingFaceHubClient initialization.""" with patch("data_designer.integrations.huggingface.client.HfApi"): client = HuggingFaceHubClient(token="test-token") - assert client.token == "test-token" + assert client.has_token is True def test_client_initialization_no_token() -> None: """Test HuggingFaceHubClient initialization without token.""" with patch("data_designer.integrations.huggingface.client.HfApi"): client = HuggingFaceHubClient() - assert client.token is None + assert client.has_token is False def test_upload_dataset_creates_repo(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: From 4590c7d4127b157bcfd44976207355408e978de9 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:15:46 -0700 Subject: [PATCH 09/25] HuggingFace -> Hugging Face --- .../integrations/huggingface/client.py | 32 +++++++++---------- .../integrations/huggingface/dataset_card.py | 6 ++-- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 8cfbb9dd..ffd732bc 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -20,17 +20,17 @@ class HuggingFaceUploadError(DataDesignerError): - """Error during HuggingFace dataset upload.""" + """Error during Hugging Face dataset upload.""" class HuggingFaceHubClient: - """Client for interacting with HuggingFace Hub to upload datasets.""" + """Client for interacting with Hugging Face Hub to upload datasets.""" def __init__(self, token: str | None = None): - """Initialize HuggingFace Hub client. + """Initialize Hugging Face Hub client. Args: - token: HuggingFace API token. If None, the token is automatically + token: Hugging Face API token. If None, the token is automatically resolved from HF_TOKEN environment variable or cached credentials from `huggingface-cli login`. """ @@ -55,7 +55,7 @@ def upload_dataset( private: bool = False, create_pr: bool = False, ) -> str: - """Upload dataset to HuggingFace Hub. + """Upload dataset to Hugging Face Hub. Uploads the complete dataset including: - Main parquet batch files from parquet-files/ → data/ @@ -64,7 +64,7 @@ def upload_dataset( - Auto-generated README.md (dataset card) Args: - repo_id: HuggingFace repo ID (e.g., "username/dataset-name") + repo_id: Hugging Face dataset repo ID (e.g., "username/dataset-name") base_dataset_path: Path to base_dataset_path (contains parquet-files/, sdg.json, etc.) 
description: Custom description text for dataset card private: Whether to create private repo @@ -76,7 +76,7 @@ def upload_dataset( Raises: HuggingFaceUploadError: If validation fails or upload encounters errors """ - logger.info(f"🤗 Uploading dataset to HuggingFace Hub: {repo_id}") + logger.info(f"🤗 Uploading dataset to Hugging Face Hub: {repo_id}") self._validate_repo_id(repo_id) self._validate_dataset_path(base_dataset_path) @@ -98,10 +98,10 @@ def upload_dataset( return url def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: - """Create or get existing repository on HuggingFace Hub. + """Create or get existing repository on Hugging Face Hub. Args: - repo_id: HuggingFace repo ID + repo_id: Hugging Face dataset repo ID private: Whether to create private repo Raises: @@ -124,7 +124,7 @@ def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: except HfHubHTTPError as e: if e.response.status_code == 401: raise HuggingFaceUploadError( - "Authentication failed. Please provide a valid HuggingFace token. " + "Authentication failed. Please provide a valid Hugging Face token. " "You can set it via the token parameter or HF_TOKEN environment variable, " "or run 'huggingface-cli login'." ) from e @@ -142,7 +142,7 @@ def _upload_main_dataset_files(self, repo_id: str, base_dataset_path: Path, *, c """Upload main parquet dataset files. Args: - repo_id: HuggingFace repo ID + repo_id: Hugging Face dataset repo ID base_dataset_path: Path to dataset directory create_pr: Whether to create a PR instead of direct push @@ -167,7 +167,7 @@ def _upload_processor_files(self, repo_id: str, base_dataset_path: Path, *, crea """Upload processor output files. Args: - repo_id: HuggingFace repo ID + repo_id: Hugging Face dataset repo ID base_dataset_path: Path to dataset directory create_pr: Whether to create a PR instead of direct push @@ -202,7 +202,7 @@ def _upload_config_files(self, repo_id: str, base_dataset_path: Path, *, create_ """Upload configuration files (sdg.json and metadata.json). Args: - repo_id: HuggingFace repo ID + repo_id: Hugging Face dataset repo ID base_dataset_path: Path to dataset directory create_pr: Whether to create a PR instead of direct push @@ -253,7 +253,7 @@ def _upload_dataset_card( """Generate and upload dataset card from metadata.json. Args: - repo_id: HuggingFace repo ID + repo_id: Hugging Face dataset repo ID base_dataset_path: Path to dataset artifacts description: Custom description text for dataset card create_pr: Whether to create a PR instead of direct push @@ -298,7 +298,7 @@ def _upload_dataset_card( @staticmethod def _validate_repo_id(repo_id: str) -> None: - """Validate HuggingFace repository ID format. + """Validate Hugging Face dataset repository ID format. Args: repo_id: Repository ID to validate @@ -320,7 +320,7 @@ def _validate_repo_id(repo_id: str) -> None: @staticmethod def _update_metadata_paths(metadata_path: Path) -> dict: - """Update file paths in metadata.json to match HuggingFace Hub structure. + """Update file paths in metadata.json to match Hugging Face dataset repository structure. 
Local paths: - parquet-files/batch_00000.parquet → data/batch_00000.parquet diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index 766bde8c..166750fc 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -34,7 +34,7 @@ def from_metadata( Args: metadata: Contents of metadata.json sdg_config: Contents of sdg.json (optional) - repo_id: HuggingFace repo ID + repo_id: Hugging Face dataset repo ID description: Custom description text Returns: @@ -105,13 +105,13 @@ def from_metadata( @staticmethod def _compute_size_category(num_records: int) -> str: - """Compute HuggingFace size category from record count. + """Compute Hugging Face dataset size category from record count. Args: num_records: Number of records in the dataset Returns: - Size category string for HuggingFace Hub tags + Size category string for Hugging Face dataset repository tags """ if num_records < 1000: return "n<1K" From 5113069d767d998fdd2071bc6ca7322ecc272d88 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:19:39 -0700 Subject: [PATCH 10/25] remove inline imports --- .../tests/integrations/huggingface/test_client.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index 7c54722e..5fedf608 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -8,6 +8,7 @@ from unittest.mock import MagicMock, patch import pytest +from huggingface_hub.utils import HfHubHTTPError from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceUploadError @@ -491,8 +492,6 @@ def test_upload_dataset_invalid_repo_id(mock_hf_api: MagicMock, sample_dataset_p def test_upload_dataset_authentication_error(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: """Test upload_dataset handles authentication errors.""" - from huggingface_hub.utils import HfHubHTTPError - client = HuggingFaceHubClient(token="invalid-token") # Mock 401 authentication error @@ -510,8 +509,6 @@ def test_upload_dataset_authentication_error(mock_hf_api: MagicMock, sample_data def test_upload_dataset_permission_error(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: """Test upload_dataset handles permission errors.""" - from huggingface_hub.utils import HfHubHTTPError - client = HuggingFaceHubClient(token="test-token") # Mock 403 permission error From 02182f991a9c748ae6ef8069c321fb917609cfb9 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:32:33 -0700 Subject: [PATCH 11/25] simplify tests + remvoe create pr option for simplicity --- .../integrations/huggingface/client.py | 30 +-- .../src/data_designer/interface/results.py | 3 - .../integrations/huggingface/test_client.py | 184 ++++++++---------- 3 files changed, 95 insertions(+), 122 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index ffd732bc..e0a2a266 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ 
b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -53,7 +53,6 @@ def upload_dataset( description: str, *, private: bool = False, - create_pr: bool = False, ) -> str: """Upload dataset to Hugging Face Hub. @@ -68,7 +67,6 @@ def upload_dataset( base_dataset_path: Path to base_dataset_path (contains parquet-files/, sdg.json, etc.) description: Custom description text for dataset card private: Whether to create private repo - create_pr: Whether to create a PR instead of direct push Returns: URL to the uploaded dataset @@ -85,13 +83,13 @@ def upload_dataset( logger.info(f"|-- {RandomEmoji.data()} Uploading dataset card...") try: - self._upload_dataset_card(repo_id, base_dataset_path, description, create_pr=create_pr) + self._upload_dataset_card(repo_id, base_dataset_path, description) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e - self._upload_main_dataset_files(repo_id, base_dataset_path, create_pr=create_pr) - self._upload_processor_files(repo_id, base_dataset_path, create_pr=create_pr) - self._upload_config_files(repo_id, base_dataset_path, create_pr=create_pr) + self._upload_main_dataset_files(repo_id, base_dataset_path) + self._upload_processor_files(repo_id, base_dataset_path) + self._upload_config_files(repo_id, base_dataset_path) url = f"https://huggingface.co/datasets/{repo_id}" logger.info(f"|-- {RandomEmoji.success()} Dataset uploaded successfully! View at: {url}") @@ -138,13 +136,12 @@ def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: except Exception as e: raise HuggingFaceUploadError(f"Unexpected error creating repository '{repo_id}': {e}") from e - def _upload_main_dataset_files(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: + def _upload_main_dataset_files(self, repo_id: str, base_dataset_path: Path) -> None: """Upload main parquet dataset files. Args: repo_id: Hugging Face dataset repo ID base_dataset_path: Path to dataset directory - create_pr: Whether to create a PR instead of direct push Raises: HuggingFaceUploadError: If upload fails @@ -158,18 +155,16 @@ def _upload_main_dataset_files(self, repo_id: str, base_dataset_path: Path, *, c path_in_repo="data", repo_type="dataset", commit_message="Upload main dataset files", - create_pr=create_pr, ) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload parquet files: {e}") from e - def _upload_processor_files(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: + def _upload_processor_files(self, repo_id: str, base_dataset_path: Path) -> None: """Upload processor output files. Args: repo_id: Hugging Face dataset repo ID base_dataset_path: Path to dataset directory - create_pr: Whether to create a PR instead of direct push Raises: HuggingFaceUploadError: If upload fails @@ -191,20 +186,18 @@ def _upload_processor_files(self, repo_id: str, base_dataset_path: Path, *, crea path_in_repo=processor_dir.name, repo_type="dataset", commit_message=f"Upload {processor_dir.name} processor outputs", - create_pr=create_pr, ) except Exception as e: raise HuggingFaceUploadError( f"Failed to upload processor outputs for '{processor_dir.name}': {e}" ) from e - def _upload_config_files(self, repo_id: str, base_dataset_path: Path, *, create_pr: bool = False) -> None: + def _upload_config_files(self, repo_id: str, base_dataset_path: Path) -> None: """Upload configuration files (sdg.json and metadata.json). 
Args: repo_id: Hugging Face dataset repo ID base_dataset_path: Path to dataset directory - create_pr: Whether to create a PR instead of direct push Raises: HuggingFaceUploadError: If upload fails @@ -220,7 +213,6 @@ def _upload_config_files(self, repo_id: str, base_dataset_path: Path, *, create_ path_in_repo="sdg.json", repo_type="dataset", commit_message="Upload sdg.json", - create_pr=create_pr, ) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload sdg.json: {e}") from e @@ -240,23 +232,19 @@ def _upload_config_files(self, repo_id: str, base_dataset_path: Path, *, create_ path_in_repo="metadata.json", repo_type="dataset", commit_message="Upload metadata.json", - create_pr=create_pr, ) finally: Path(tmp_path).unlink() except Exception as e: raise HuggingFaceUploadError(f"Failed to upload metadata.json: {e}") from e - def _upload_dataset_card( - self, repo_id: str, base_dataset_path: Path, description: str, *, create_pr: bool = False - ) -> None: + def _upload_dataset_card(self, repo_id: str, base_dataset_path: Path, description: str) -> None: """Generate and upload dataset card from metadata.json. Args: repo_id: Hugging Face dataset repo ID base_dataset_path: Path to dataset artifacts description: Custom description text for dataset card - create_pr: Whether to create a PR instead of direct push Raises: HuggingFaceUploadError: If dataset card generation or upload fails @@ -292,7 +280,7 @@ def _upload_dataset_card( raise HuggingFaceUploadError(f"Failed to generate dataset card: {e}") from e try: - card.push_to_hub(repo_id, repo_type="dataset", create_pr=create_pr) + card.push_to_hub(repo_id, repo_type="dataset") except Exception as e: raise HuggingFaceUploadError(f"Failed to push dataset card to hub: {e}") from e diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index cde3b95d..a37a5483 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -105,7 +105,6 @@ def push_to_hub( *, token: str | None = None, private: bool = False, - create_pr: bool = False, ) -> str: """Push dataset to HuggingFace Hub. @@ -124,7 +123,6 @@ def push_to_hub( resolved from HF_TOKEN environment variable or cached credentials from `huggingface-cli login`. private: Create private repo - create_pr: Create PR instead of direct push Returns: URL to the uploaded dataset @@ -140,6 +138,5 @@ def push_to_hub( repo_id=repo_id, base_dataset_path=self.artifact_storage.base_dataset_path, private=private, - create_pr=create_pr, description=description, ) diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index 5fedf608..6ba24647 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -22,6 +22,15 @@ def mock_hf_api() -> MagicMock: yield api_instance +@pytest.fixture +def mock_dataset_card() -> MagicMock: + """Mock DataDesignerDatasetCard for testing.""" + with patch("data_designer.integrations.huggingface.client.DataDesignerDatasetCard") as mock: + card_instance = MagicMock() + mock.from_metadata.return_value = card_instance + yield mock + + @pytest.fixture def sample_dataset_path(tmp_path: Path) -> Path: """Create a sample dataset directory structure. 
@@ -119,67 +128,68 @@ def test_client_initialization_no_token() -> None: assert client.has_token is False -def test_upload_dataset_creates_repo(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: +def test_upload_dataset_creates_repo( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: """Test that upload_dataset creates a repository.""" client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=sample_dataset_path, - description="Test dataset", - ) - - mock_hf_api.create_repo.assert_called_once_with( + client.upload_dataset( repo_id="test/dataset", - repo_type="dataset", - exist_ok=True, - private=False, + base_dataset_path=sample_dataset_path, + description="Test dataset", ) + # Verify repo creation was called + mock_hf_api.create_repo.assert_called_once() + assert mock_hf_api.create_repo.call_args.kwargs["repo_id"] == "test/dataset" + -def test_upload_dataset_uploads_parquet_files(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: +def test_upload_dataset_uploads_parquet_files( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: """Test that upload_dataset uploads parquet files.""" client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=sample_dataset_path, - description="Test dataset", - ) + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) # Check that upload_folder was called for parquet files calls = [call for call in mock_hf_api.upload_folder.call_args_list if call.kwargs["path_in_repo"] == "data"] assert len(calls) >= 1 -def test_upload_dataset_uploads_processor_outputs(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: +def test_upload_dataset_uploads_processor_outputs( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: """Test that upload_dataset uploads processor outputs.""" client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=sample_dataset_path, - description="Test dataset", - ) + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) # Check that upload_folder was called for processor outputs calls = [call for call in mock_hf_api.upload_folder.call_args_list if "processor1" in call.kwargs["path_in_repo"]] assert len(calls) >= 1 -def test_upload_dataset_uploads_config_files(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: +def test_upload_dataset_uploads_config_files( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: """Test that upload_dataset uploads sdg.json and metadata.json.""" client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=sample_dataset_path, - description="Test dataset", - ) + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) # Check that upload_file was called for config files upload_file_calls = mock_hf_api.upload_file.call_args_list @@ -188,31 +198,33 @@ def 
test_upload_dataset_uploads_config_files(mock_hf_api: MagicMock, sample_data assert "metadata.json" in uploaded_files -def test_upload_dataset_returns_url(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: +def test_upload_dataset_returns_url( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: """Test that upload_dataset returns the correct URL.""" client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - url = client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=sample_dataset_path, - description="Test dataset", - ) + url = client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) assert url == "https://huggingface.co/datasets/test/dataset" -def test_upload_dataset_with_private_repo(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: +def test_upload_dataset_with_private_repo( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: """Test upload_dataset with private repository.""" client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=sample_dataset_path, - description="Test dataset", - private=True, - ) + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + private=True, + ) mock_hf_api.create_repo.assert_called_once_with( repo_id="test/dataset", @@ -222,23 +234,6 @@ def test_upload_dataset_with_private_repo(mock_hf_api: MagicMock, sample_dataset ) -def test_upload_dataset_with_create_pr(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: - """Test upload_dataset with create_pr option.""" - client = HuggingFaceHubClient(token="test-token") - - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=sample_dataset_path, - description="Test dataset", - create_pr=True, - ) - - # Verify create_pr is passed to upload operations - for call in mock_hf_api.upload_folder.call_args_list: - assert call.kwargs["create_pr"] is True - - def test_upload_dataset_card_missing_metadata(tmp_path: Path) -> None: """Test _upload_dataset_card raises error when metadata.json is missing.""" client = HuggingFaceHubClient(token="test-token") @@ -261,22 +256,14 @@ def test_upload_dataset_card_calls_push_to_hub(sample_dataset_path: Path) -> Non client._upload_dataset_card("test/dataset", sample_dataset_path, "Test description") - # Verify card was created from metadata + # Verify card was created and pushed mock_card_class.from_metadata.assert_called_once() - call_kwargs = mock_card_class.from_metadata.call_args.kwargs - assert call_kwargs["repo_id"] == "test/dataset" - assert "metadata" in call_kwargs - assert "sdg_config" in call_kwargs - - # Verify card was pushed to hub - mock_card.push_to_hub.assert_called_once_with( - "test/dataset", - repo_type="dataset", - create_pr=False, - ) + mock_card.push_to_hub.assert_called_once() -def test_upload_dataset_without_processors(mock_hf_api: MagicMock, tmp_path: Path) -> None: +def test_upload_dataset_without_processors( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, tmp_path: Path +) -> None: """Test upload_dataset when no processor outputs exist.""" # Create dataset path without processors directory base_path = tmp_path / "dataset" @@ -291,12 +278,11 @@ def 
test_upload_dataset_without_processors(mock_hf_api: MagicMock, tmp_path: Pat client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=base_path, - description="Test dataset", - ) + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=base_path, + description="Test dataset", + ) # Should only upload parquet files, not processors folder_calls = mock_hf_api.upload_folder.call_args_list @@ -307,7 +293,9 @@ def test_upload_dataset_without_processors(mock_hf_api: MagicMock, tmp_path: Pat assert len(processor_calls) == 0 # No processor files -def test_upload_dataset_without_sdg_config(mock_hf_api: MagicMock, tmp_path: Path) -> None: +def test_upload_dataset_without_sdg_config( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, tmp_path: Path +) -> None: """Test upload_dataset when sdg.json doesn't exist.""" base_path = tmp_path / "dataset" base_path.mkdir() @@ -323,12 +311,11 @@ def test_upload_dataset_without_sdg_config(mock_hf_api: MagicMock, tmp_path: Pat client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=base_path, - description="Test dataset", - ) + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=base_path, + description="Test dataset", + ) # Should only upload metadata.json, not sdg.json file_calls = mock_hf_api.upload_file.call_args_list @@ -339,16 +326,17 @@ def test_upload_dataset_without_sdg_config(mock_hf_api: MagicMock, tmp_path: Pat assert "sdg.json" not in uploaded_files -def test_upload_dataset_multiple_processors(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: +def test_upload_dataset_multiple_processors( + mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path +) -> None: """Test that multiple processor outputs are uploaded correctly.""" client = HuggingFaceHubClient(token="test-token") - with patch.object(client, "_upload_dataset_card"): - client.upload_dataset( - repo_id="test/dataset", - base_dataset_path=sample_dataset_path, - description="Test dataset", - ) + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test dataset", + ) # Check that both processors were uploaded folder_calls = mock_hf_api.upload_folder.call_args_list From 9b99aed18becc5022dc5e98d28a7bec5935e4c1d Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:46:49 -0700 Subject: [PATCH 12/25] Update packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- .../src/data_designer/integrations/huggingface/dataset_card.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index 166750fc..04bc6324 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -91,7 +91,7 @@ def from_metadata( "column_statistics": column_stats, "num_columns_configured": num_columns_configured, "config_types": config_types, - "percent_complete": 100 * actual_num_records / (target_num_records + 1e-10), + "percent_complete": 100 * 
actual_num_records / target_num_records if target_num_records > 0 else 0, "current_year": datetime.now().year, "has_processors": len(processor_names) > 0, "processor_names": processor_names, From ce05fa18b40cf4086179b9cd6d2ee8d15a39fc56 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:51:54 -0700 Subject: [PATCH 13/25] use consistent indentaion --- .../data_designer/engine/dataset_builders/artifact_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py index 905b0350..0d22bb89 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py @@ -259,7 +259,7 @@ def write_metadata(self, metadata: dict) -> Path: """ self.mkdir_if_needed(self.base_dataset_path) with open(self.metadata_file_path, "w") as file: - json.dump(metadata, file, indent=4, sort_keys=True) + json.dump(metadata, file, indent=2, sort_keys=True) return self.metadata_file_path def update_metadata(self, updates: dict) -> Path: From 243c08739844f871d7cfa9b84a5aa4777367df1d Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Fri, 30 Jan 2026 15:54:33 -0700 Subject: [PATCH 14/25] fix temp file clean up --- .../integrations/huggingface/client.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index e0a2a266..b1670598 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -219,24 +219,25 @@ def _upload_config_files(self, repo_id: str, base_dataset_path: Path) -> None: metadata_path = base_dataset_path / "metadata.json" if metadata_path.exists(): + tmp_path = None try: updated_metadata = self._update_metadata_paths(metadata_path) with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp_file: json.dump(updated_metadata, tmp_file, indent=2) tmp_path = tmp_file.name - try: - self._api.upload_file( - repo_id=repo_id, - path_or_fileobj=tmp_path, - path_in_repo="metadata.json", - repo_type="dataset", - commit_message="Upload metadata.json", - ) - finally: - Path(tmp_path).unlink() + self._api.upload_file( + repo_id=repo_id, + path_or_fileobj=tmp_path, + path_in_repo="metadata.json", + repo_type="dataset", + commit_message="Upload metadata.json", + ) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload metadata.json: {e}") from e + finally: + if tmp_path and Path(tmp_path).exists(): + Path(tmp_path).unlink() def _upload_dataset_card(self, repo_id: str, base_dataset_path: Path, description: str) -> None: """Generate and upload dataset card from metadata.json. 
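The cleanup fix in PATCH 14 above boils down to one pattern: create the temporary file with delete=False, record its path, and remove it in a finally block that tolerates both early failures (tmp_path still None) and an upload that has already consumed the file. A minimal standalone sketch of that pattern follows; the helper name and the `upload` callable are illustrative stand-ins (the real code passes the path to HfApi.upload_file), not part of the package's API:

    import json
    import tempfile
    from pathlib import Path


    def upload_json_via_tempfile(payload: dict, upload) -> None:
        # Write payload to a temp file, hand the path to `upload`, and always clean up.
        tmp_path = None
        try:
            # delete=False keeps the file on disk after the `with` block so the
            # uploader can reopen it by path.
            with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp_file:
                json.dump(payload, tmp_file, indent=2)
                tmp_path = tmp_file.name
            upload(tmp_path)  # stand-in for HfApi.upload_file(path_or_fileobj=tmp_path, ...)
        finally:
            # tmp_path is still None if NamedTemporaryFile itself failed; guard the unlink.
            if tmp_path and Path(tmp_path).exists():
                Path(tmp_path).unlink()

Compared to the previous nested try/finally, this keeps a single cleanup site that runs whether the JSON dump, the upload, or the error-wrapping raise fails, which is what the patch above is after.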
From de61805b4f9c2a6b2f216c6395562e083a290e55 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 2 Feb 2026 10:24:12 -0700 Subject: [PATCH 15/25] huggingface hub already a dep in engine --- packages/data-designer/pyproject.toml | 1 - uv.lock | 2 -- 2 files changed, 3 deletions(-) diff --git a/packages/data-designer/pyproject.toml b/packages/data-designer/pyproject.toml index 883e18ea..31076704 100644 --- a/packages/data-designer/pyproject.toml +++ b/packages/data-designer/pyproject.toml @@ -22,7 +22,6 @@ classifiers = [ dependencies = [ "data-designer-config", "data-designer-engine", - "huggingface-hub>=1.0.1,<2", "prompt-toolkit>=3.0.0,<4", "typer>=0.12.0,<1", ] diff --git a/uv.lock b/uv.lock index 46716729..279f21de 100644 --- a/uv.lock +++ b/uv.lock @@ -690,7 +690,6 @@ source = { editable = "packages/data-designer" } dependencies = [ { name = "data-designer-config" }, { name = "data-designer-engine" }, - { name = "huggingface-hub" }, { name = "prompt-toolkit" }, { name = "typer" }, ] @@ -699,7 +698,6 @@ dependencies = [ requires-dist = [ { name = "data-designer-config", editable = "packages/data-designer-config" }, { name = "data-designer-engine", editable = "packages/data-designer-engine" }, - { name = "huggingface-hub", specifier = ">=1.0.1,<2" }, { name = "prompt-toolkit", specifier = ">=3.0.0,<4" }, { name = "typer", specifier = ">=0.12.0,<1" }, ] From f0e3fcb5f6cbe4bad73d6da465c25b08d317240d Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 2 Feb 2026 10:28:30 -0700 Subject: [PATCH 16/25] add missing spaces --- .../integrations/huggingface/client.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index b1670598..949a390b 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -81,7 +81,7 @@ def upload_dataset( self._create_or_get_repo(repo_id, private=private) - logger.info(f"|-- {RandomEmoji.data()} Uploading dataset card...") + logger.info(f" |-- {RandomEmoji.data()} Uploading dataset card...") try: self._upload_dataset_card(repo_id, base_dataset_path, description) except Exception as e: @@ -92,7 +92,7 @@ def upload_dataset( self._upload_config_files(repo_id, base_dataset_path) url = f"https://huggingface.co/datasets/{repo_id}" - logger.info(f"|-- {RandomEmoji.success()} Dataset uploaded successfully! View at: {url}") + logger.info(f" |-- {RandomEmoji.success()} Dataset uploaded successfully! 
View at: {url}") return url def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: @@ -105,13 +105,13 @@ def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: Raises: HuggingFaceUploadError: If repository creation fails """ - logger.info(f"|-- {RandomEmoji.working()} Checking if repository exists...") + logger.info(f" |-- {RandomEmoji.working()} Checking if repository exists...") try: repo_exists = self._api.repo_exists(repo_id=repo_id, repo_type="dataset") if repo_exists: - logger.info(f"|-- {RandomEmoji.success()} Repository already exists, updating content...") + logger.info(f" |-- {RandomEmoji.success()} Repository already exists, updating content...") else: - logger.info(f"|-- {RandomEmoji.working()} Creating new repository...") + logger.info(f" |-- {RandomEmoji.working()} Creating new repository...") self._api.create_repo( repo_id=repo_id, @@ -146,7 +146,7 @@ def _upload_main_dataset_files(self, repo_id: str, base_dataset_path: Path) -> N Raises: HuggingFaceUploadError: If upload fails """ - logger.info(f"|-- {RandomEmoji.loading()} Uploading main dataset files...") + logger.info(f" |-- {RandomEmoji.loading()} Uploading main dataset files...") parquet_folder = base_dataset_path / "parquet-files" try: self._api.upload_folder( @@ -177,7 +177,7 @@ def _upload_processor_files(self, repo_id: str, base_dataset_path: Path) -> None if not processor_dirs: return - logger.info(f"|-- {RandomEmoji.loading()} Uploading processor outputs ({len(processor_dirs)} processors)...") + logger.info(f" |-- {RandomEmoji.loading()} Uploading processor outputs ({len(processor_dirs)} processors)...") for processor_dir in processor_dirs: try: self._api.upload_folder( @@ -202,7 +202,7 @@ def _upload_config_files(self, repo_id: str, base_dataset_path: Path) -> None: Raises: HuggingFaceUploadError: If upload fails """ - logger.info(f"|-- {RandomEmoji.loading()} Uploading configuration files...") + logger.info(f" |-- {RandomEmoji.loading()} Uploading configuration files...") sdg_path = base_dataset_path / "sdg.json" if sdg_path.exists(): From 99c61fedafb4289d89aa57288d90b25aa44d627c Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 2 Feb 2026 11:36:17 -0700 Subject: [PATCH 17/25] reuse vars from artifact_storage.py --- .../dataset_builders/artifact_storage.py | 9 +- .../integrations/huggingface/client.py | 92 ++++++++++-------- .../integrations/huggingface/test_client.py | 97 ++++++++++--------- 3 files changed, 111 insertions(+), 87 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py index 0d22bb89..e2daaca6 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py @@ -25,6 +25,9 @@ BATCH_FILE_NAME_FORMAT = "batch_{batch_number:05d}.parquet" SDG_CONFIG_FILENAME = "sdg.json" +METADATA_FILENAME = "metadata.json" +FINAL_DATASET_FOLDER_NAME = "parquet-files" +PROCESSORS_OUTPUTS_FOLDER_NAME = "processors-files" class BatchStage(StrEnum): @@ -37,10 +40,10 @@ class BatchStage(StrEnum): class ArtifactStorage(BaseModel): artifact_path: Path | str dataset_name: str = "dataset" - final_dataset_folder_name: str = "parquet-files" + final_dataset_folder_name: str = FINAL_DATASET_FOLDER_NAME partial_results_folder_name: str = "tmp-partial-parquet-files" 
dropped_columns_folder_name: str = "dropped-columns-parquet-files" - processors_outputs_folder_name: str = "processors-files" + processors_outputs_folder_name: str = PROCESSORS_OUTPUTS_FOLDER_NAME @property def artifact_path_exists(self) -> bool: @@ -72,7 +75,7 @@ def final_dataset_path(self) -> Path: @property def metadata_file_path(self) -> Path: - return self.base_dataset_path / "metadata.json" + return self.base_dataset_path / METADATA_FILENAME @property def partial_results_path(self) -> Path: diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 949a390b..df8e26e7 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -12,6 +12,12 @@ from huggingface_hub import HfApi from huggingface_hub.utils import HfHubHTTPError +from data_designer.engine.dataset_builders.artifact_storage import ( + FINAL_DATASET_FOLDER_NAME, + METADATA_FILENAME, + PROCESSORS_OUTPUTS_FOLDER_NAME, + SDG_CONFIG_FILENAME, +) from data_designer.errors import DataDesignerError from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard from data_designer.logging import RandomEmoji @@ -76,20 +82,30 @@ def upload_dataset( """ logger.info(f"🤗 Uploading dataset to Hugging Face Hub: {repo_id}") - self._validate_repo_id(repo_id) - self._validate_dataset_path(base_dataset_path) - - self._create_or_get_repo(repo_id, private=private) + self._validate_repo_id(repo_id=repo_id) + self._validate_dataset_path(base_dataset_path=base_dataset_path) + self._create_or_get_repo(repo_id=repo_id, private=private) logger.info(f" |-- {RandomEmoji.data()} Uploading dataset card...") try: - self._upload_dataset_card(repo_id, base_dataset_path, description) + self._upload_dataset_card( + repo_id=repo_id, + metadata_path=base_dataset_path / METADATA_FILENAME, + sdg_path=base_dataset_path / SDG_CONFIG_FILENAME, + description=description, + ) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e - self._upload_main_dataset_files(repo_id, base_dataset_path) - self._upload_processor_files(repo_id, base_dataset_path) - self._upload_config_files(repo_id, base_dataset_path) + self._upload_main_dataset_files(repo_id=repo_id, parquet_folder=base_dataset_path / FINAL_DATASET_FOLDER_NAME) + self._upload_processor_files( + repo_id=repo_id, processors_folder=base_dataset_path / PROCESSORS_OUTPUTS_FOLDER_NAME + ) + self._upload_config_files( + repo_id=repo_id, + metadata_path=base_dataset_path / METADATA_FILENAME, + sdg_path=base_dataset_path / SDG_CONFIG_FILENAME, + ) url = f"https://huggingface.co/datasets/{repo_id}" logger.info(f" |-- {RandomEmoji.success()} Dataset uploaded successfully! View at: {url}") @@ -136,18 +152,17 @@ def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: except Exception as e: raise HuggingFaceUploadError(f"Unexpected error creating repository '{repo_id}': {e}") from e - def _upload_main_dataset_files(self, repo_id: str, base_dataset_path: Path) -> None: + def _upload_main_dataset_files(self, repo_id: str, parquet_folder: Path) -> None: """Upload main parquet dataset files. 
Args: repo_id: Hugging Face dataset repo ID - base_dataset_path: Path to dataset directory + parquet_folder: Path to folder containing parquet files Raises: HuggingFaceUploadError: If upload fails """ logger.info(f" |-- {RandomEmoji.loading()} Uploading main dataset files...") - parquet_folder = base_dataset_path / "parquet-files" try: self._api.upload_folder( repo_id=repo_id, @@ -159,17 +174,16 @@ def _upload_main_dataset_files(self, repo_id: str, base_dataset_path: Path) -> N except Exception as e: raise HuggingFaceUploadError(f"Failed to upload parquet files: {e}") from e - def _upload_processor_files(self, repo_id: str, base_dataset_path: Path) -> None: + def _upload_processor_files(self, repo_id: str, processors_folder: Path) -> None: """Upload processor output files. Args: repo_id: Hugging Face dataset repo ID - base_dataset_path: Path to dataset directory + processors_folder: Path to folder containing processor output directories Raises: HuggingFaceUploadError: If upload fails """ - processors_folder = base_dataset_path / "processors-files" if not processors_folder.exists(): return @@ -192,32 +206,31 @@ def _upload_processor_files(self, repo_id: str, base_dataset_path: Path) -> None f"Failed to upload processor outputs for '{processor_dir.name}': {e}" ) from e - def _upload_config_files(self, repo_id: str, base_dataset_path: Path) -> None: + def _upload_config_files(self, repo_id: str, metadata_path: Path, sdg_path: Path) -> None: """Upload configuration files (sdg.json and metadata.json). Args: repo_id: Hugging Face dataset repo ID - base_dataset_path: Path to dataset directory + metadata_path: Path to metadata.json file + sdg_path: Path to sdg.json file Raises: HuggingFaceUploadError: If upload fails """ logger.info(f" |-- {RandomEmoji.loading()} Uploading configuration files...") - sdg_path = base_dataset_path / "sdg.json" if sdg_path.exists(): try: self._api.upload_file( repo_id=repo_id, path_or_fileobj=str(sdg_path), - path_in_repo="sdg.json", + path_in_repo=SDG_CONFIG_FILENAME, repo_type="dataset", commit_message="Upload sdg.json", ) except Exception as e: raise HuggingFaceUploadError(f"Failed to upload sdg.json: {e}") from e - metadata_path = base_dataset_path / "metadata.json" if metadata_path.exists(): tmp_path = None try: @@ -229,37 +242,36 @@ def _upload_config_files(self, repo_id: str, base_dataset_path: Path) -> None: self._api.upload_file( repo_id=repo_id, path_or_fileobj=tmp_path, - path_in_repo="metadata.json", + path_in_repo=METADATA_FILENAME, repo_type="dataset", - commit_message="Upload metadata.json", + commit_message=f"Upload {METADATA_FILENAME}", ) except Exception as e: - raise HuggingFaceUploadError(f"Failed to upload metadata.json: {e}") from e + raise HuggingFaceUploadError(f"Failed to upload {METADATA_FILENAME}: {e}") from e finally: if tmp_path and Path(tmp_path).exists(): Path(tmp_path).unlink() - def _upload_dataset_card(self, repo_id: str, base_dataset_path: Path, description: str) -> None: + def _upload_dataset_card(self, repo_id: str, metadata_path: Path, sdg_path: Path, description: str) -> None: """Generate and upload dataset card from metadata.json. 
Args: repo_id: Hugging Face dataset repo ID - base_dataset_path: Path to dataset artifacts + metadata_path: Path to metadata.json file + sdg_path: Path to sdg.json file description: Custom description text for dataset card Raises: HuggingFaceUploadError: If dataset card generation or upload fails """ - metadata_path = base_dataset_path / "metadata.json" try: with open(metadata_path) as f: metadata = json.load(f) except json.JSONDecodeError as e: - raise HuggingFaceUploadError(f"Failed to parse metadata.json: {e}") from e + raise HuggingFaceUploadError(f"Failed to parse {METADATA_FILENAME}: {e}") from e except Exception as e: - raise HuggingFaceUploadError(f"Failed to read metadata.json: {e}") from e + raise HuggingFaceUploadError(f"Failed to read {METADATA_FILENAME}: {e}") from e - sdg_path = base_dataset_path / "sdg.json" sdg_config = None if sdg_path.exists(): try: @@ -327,16 +339,20 @@ def _update_metadata_paths(metadata_path: Path) -> dict: if "file_paths" in metadata: updated_file_paths = {} - if "parquet-files" in metadata["file_paths"]: + # Update parquet files path: parquet-files/ → data/ + if FINAL_DATASET_FOLDER_NAME in metadata["file_paths"]: updated_file_paths["data"] = [ - path.replace("parquet-files/", "data/") for path in metadata["file_paths"]["parquet-files"] + path.replace(f"{FINAL_DATASET_FOLDER_NAME}/", "data/") + for path in metadata["file_paths"][FINAL_DATASET_FOLDER_NAME] ] + # Update processor files paths: processors-files/{name}/ → {name}/ if "processor-files" in metadata["file_paths"]: updated_file_paths["processor-files"] = {} for processor_name, paths in metadata["file_paths"]["processor-files"].items(): updated_file_paths["processor-files"][processor_name] = [ - path.replace(f"processors-files/{processor_name}/", f"{processor_name}/") for path in paths + path.replace(f"{PROCESSORS_OUTPUTS_FOLDER_NAME}/{processor_name}/", f"{processor_name}/") + for path in paths ] metadata["file_paths"] = updated_file_paths @@ -359,14 +375,14 @@ def _validate_dataset_path(base_dataset_path: Path) -> None: if not base_dataset_path.is_dir(): raise HuggingFaceUploadError(f"Dataset path is not a directory: {base_dataset_path}") - metadata_path = base_dataset_path / "metadata.json" + metadata_path = base_dataset_path / METADATA_FILENAME if not metadata_path.exists(): raise HuggingFaceUploadError(f"Required file not found: {metadata_path}") if not metadata_path.is_file(): - raise HuggingFaceUploadError(f"metadata.json is not a file: {metadata_path}") + raise HuggingFaceUploadError(f"{METADATA_FILENAME} is not a file: {metadata_path}") - parquet_dir = base_dataset_path / "parquet-files" + parquet_dir = base_dataset_path / FINAL_DATASET_FOLDER_NAME if not parquet_dir.exists(): raise HuggingFaceUploadError( f"Required directory not found: {parquet_dir}. 
" @@ -385,14 +401,14 @@ def _validate_dataset_path(base_dataset_path: Path) -> None: with open(metadata_path) as f: json.load(f) except json.JSONDecodeError as e: - raise HuggingFaceUploadError(f"Invalid JSON in metadata.json: {e}") + raise HuggingFaceUploadError(f"Invalid JSON in {METADATA_FILENAME}: {e}") - sdg_path = base_dataset_path / "sdg.json" + sdg_path = base_dataset_path / SDG_CONFIG_FILENAME if sdg_path.exists(): if not sdg_path.is_file(): - raise HuggingFaceUploadError(f"sdg.json is not a file: {sdg_path}") + raise HuggingFaceUploadError(f"{SDG_CONFIG_FILENAME} is not a file: {sdg_path}") try: with open(sdg_path) as f: json.load(f) except json.JSONDecodeError as e: - raise HuggingFaceUploadError(f"Invalid JSON in sdg.json: {e}") + raise HuggingFaceUploadError(f"Invalid JSON in {SDG_CONFIG_FILENAME}: {e}") diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index 6ba24647..0c87ffa6 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -235,26 +235,34 @@ def test_upload_dataset_with_private_repo( def test_upload_dataset_card_missing_metadata(tmp_path: Path) -> None: - """Test _upload_dataset_card raises error when metadata.json is missing.""" + """Test upload fails when metadata.json is missing.""" client = HuggingFaceHubClient(token="test-token") # Create directory without metadata.json base_path = tmp_path / "dataset" base_path.mkdir() - with pytest.raises(HuggingFaceUploadError, match="Failed to read metadata.json"): - client._upload_dataset_card("test/dataset", base_path, "Test description") + with pytest.raises(HuggingFaceUploadError, match="Required file not found"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=base_path, + description="Test description", + ) -def test_upload_dataset_card_calls_push_to_hub(sample_dataset_path: Path) -> None: - """Test _upload_dataset_card generates card and pushes to hub.""" +def test_upload_dataset_card_calls_push_to_hub(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: + """Test upload_dataset generates and pushes dataset card.""" client = HuggingFaceHubClient(token="test-token") with patch("data_designer.integrations.huggingface.client.DataDesignerDatasetCard") as mock_card_class: mock_card = MagicMock() mock_card_class.from_metadata.return_value = mock_card - client._upload_dataset_card("test/dataset", sample_dataset_path, "Test description") + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=sample_dataset_path, + description="Test description", + ) # Verify card was created and pushed mock_card_class.from_metadata.assert_called_once() @@ -351,81 +359,69 @@ def test_upload_dataset_multiple_processors( # Error handling and validation tests -def test_validate_repo_id_invalid_format() -> None: - """Test repo_id validation with invalid formats.""" +def test_validate_repo_id_invalid_format(sample_dataset_path: Path) -> None: + """Test upload fails with invalid repo_id formats.""" client = HuggingFaceHubClient(token="test-token") # Missing slash with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): - client._validate_repo_id("my-dataset") + client.upload_dataset("my-dataset", sample_dataset_path, "Test") # Too many slashes (caught by regex) with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): - client._validate_repo_id("user/org/dataset") 
+ client.upload_dataset("user/org/dataset", sample_dataset_path, "Test") # Invalid characters (space) with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): - client._validate_repo_id("user/my dataset") + client.upload_dataset("user/my dataset", sample_dataset_path, "Test") # Empty string with pytest.raises(HuggingFaceUploadError, match="must be a non-empty string"): - client._validate_repo_id("") - - -def test_validate_repo_id_valid_formats() -> None: - """Test repo_id validation with valid formats.""" - client = HuggingFaceHubClient(token="test-token") - - # Valid formats should not raise - client._validate_repo_id("username/dataset") - client._validate_repo_id("org/my-dataset") - client._validate_repo_id("user/dataset_name") - client._validate_repo_id("user123/dataset-123") - client._validate_repo_id("user/dataset.v2") + client.upload_dataset("", sample_dataset_path, "Test") def test_validate_dataset_path_not_exists(tmp_path: Path) -> None: - """Test validation fails when dataset path doesn't exist.""" + """Test upload fails when dataset path doesn't exist.""" client = HuggingFaceHubClient(token="test-token") non_existent = tmp_path / "does-not-exist" with pytest.raises(HuggingFaceUploadError, match="does not exist"): - client._validate_dataset_path(non_existent) + client.upload_dataset("test/dataset", non_existent, "Test") def test_validate_dataset_path_is_file(tmp_path: Path) -> None: - """Test validation fails when dataset path is a file.""" + """Test upload fails when dataset path is a file.""" client = HuggingFaceHubClient(token="test-token") file_path = tmp_path / "file.txt" file_path.write_text("not a directory") with pytest.raises(HuggingFaceUploadError, match="not a directory"): - client._validate_dataset_path(file_path) + client.upload_dataset("test/dataset", file_path, "Test") def test_validate_dataset_path_missing_metadata(tmp_path: Path) -> None: - """Test validation fails when metadata.json is missing.""" + """Test upload fails when metadata.json is missing.""" client = HuggingFaceHubClient(token="test-token") base_path = tmp_path / "dataset" base_path.mkdir() - with pytest.raises(HuggingFaceUploadError, match="Required file not found.*metadata.json"): - client._validate_dataset_path(base_path) + with pytest.raises(HuggingFaceUploadError, match="Required file not found"): + client.upload_dataset("test/dataset", base_path, "Test") def test_validate_dataset_path_missing_parquet_folder(tmp_path: Path) -> None: - """Test validation fails when parquet-files directory is missing.""" + """Test upload fails when parquet-files directory is missing.""" client = HuggingFaceHubClient(token="test-token") base_path = tmp_path / "dataset" base_path.mkdir() (base_path / "metadata.json").write_text('{"target_num_records": 10}') - with pytest.raises(HuggingFaceUploadError, match="Required directory not found.*parquet-files"): - client._validate_dataset_path(base_path) + with pytest.raises(HuggingFaceUploadError, match="Required directory not found"): + client.upload_dataset("test/dataset", base_path, "Test") def test_validate_dataset_path_empty_parquet_folder(tmp_path: Path) -> None: - """Test validation fails when parquet-files directory is empty.""" + """Test upload fails when parquet-files directory is empty.""" client = HuggingFaceHubClient(token="test-token") base_path = tmp_path / "dataset" base_path.mkdir() @@ -433,12 +429,12 @@ def test_validate_dataset_path_empty_parquet_folder(tmp_path: Path) -> None: parquet_dir = base_path / "parquet-files" 
parquet_dir.mkdir() - with pytest.raises(HuggingFaceUploadError, match="parquet-files directory is empty"): - client._validate_dataset_path(base_path) + with pytest.raises(HuggingFaceUploadError, match="directory is empty"): + client.upload_dataset("test/dataset", base_path, "Test") def test_validate_dataset_path_invalid_metadata_json(tmp_path: Path) -> None: - """Test validation fails when metadata.json contains invalid JSON.""" + """Test upload fails when metadata.json contains invalid JSON.""" client = HuggingFaceHubClient(token="test-token") base_path = tmp_path / "dataset" base_path.mkdir() @@ -447,12 +443,12 @@ def test_validate_dataset_path_invalid_metadata_json(tmp_path: Path) -> None: parquet_dir.mkdir() (parquet_dir / "batch_00000.parquet").write_text("data") - with pytest.raises(HuggingFaceUploadError, match="Invalid JSON in metadata.json"): - client._validate_dataset_path(base_path) + with pytest.raises(HuggingFaceUploadError, match="Invalid JSON"): + client.upload_dataset("test/dataset", base_path, "Test") def test_validate_dataset_path_invalid_sdg_json(tmp_path: Path) -> None: - """Test validation fails when sdg.json contains invalid JSON.""" + """Test upload fails when sdg.json contains invalid JSON.""" client = HuggingFaceHubClient(token="test-token") base_path = tmp_path / "dataset" base_path.mkdir() @@ -462,8 +458,8 @@ def test_validate_dataset_path_invalid_sdg_json(tmp_path: Path) -> None: parquet_dir.mkdir() (parquet_dir / "batch_00000.parquet").write_text("data") - with pytest.raises(HuggingFaceUploadError, match="Invalid JSON in sdg.json"): - client._validate_dataset_path(base_path) + with pytest.raises(HuggingFaceUploadError, match="Invalid JSON"): + client.upload_dataset("test/dataset", base_path, "Test") def test_upload_dataset_invalid_repo_id(mock_hf_api: MagicMock, sample_dataset_path: Path) -> None: @@ -513,14 +509,23 @@ def test_upload_dataset_permission_error(mock_hf_api: MagicMock, sample_dataset_ def test_upload_dataset_card_invalid_json(tmp_path: Path) -> None: - """Test _upload_dataset_card handles corrupted metadata.json.""" + """Test upload fails when metadata.json contains invalid JSON.""" client = HuggingFaceHubClient(token="test-token") base_path = tmp_path / "dataset" base_path.mkdir() (base_path / "metadata.json").write_text("invalid json") - with pytest.raises(HuggingFaceUploadError, match="Failed to parse metadata.json"): - client._upload_dataset_card("test/dataset", base_path, "Test description") + # Create parquet directory so validation reaches the metadata JSON check + parquet_dir = base_path / "parquet-files" + parquet_dir.mkdir() + (parquet_dir / "batch_00000.parquet").write_text("data") + + with pytest.raises(HuggingFaceUploadError, match="Invalid JSON"): + client.upload_dataset( + repo_id="test/dataset", + base_dataset_path=base_path, + description="Test description", + ) def test_update_metadata_paths(tmp_path: Path) -> None: From 3270332e9554358a2ea92ce833ed956f44d4971b Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Mon, 2 Feb 2026 11:40:52 -0700 Subject: [PATCH 18/25] pull put hf hub datasets url to constants --- .../src/data_designer/config/utils/constants.py | 2 ++ .../src/data_designer/integrations/huggingface/client.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/data-designer-config/src/data_designer/config/utils/constants.py b/packages/data-designer-config/src/data_designer/config/utils/constants.py index 1a838f47..5c4cd38d 100644 --- 
a/packages/data-designer-config/src/data_designer/config/utils/constants.py +++ b/packages/data-designer-config/src/data_designer/config/utils/constants.py @@ -363,3 +363,5 @@ class NordColor(Enum): LOCALES_WITH_MANAGED_DATASETS = list[str](NEMOTRON_PERSONAS_DATASET_SIZES.keys()) NEMOTRON_PERSONAS_DATASET_PREFIX = "nemotron-personas-dataset-" + +HUGGINGFACE_HUB_DATASET_URL_PREFIX = "https://huggingface.co/datasets/" diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index df8e26e7..2afc27b7 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -12,6 +12,7 @@ from huggingface_hub import HfApi from huggingface_hub.utils import HfHubHTTPError +from data_designer.config.utils.constants import HUGGINGFACE_HUB_DATASET_URL_PREFIX from data_designer.engine.dataset_builders.artifact_storage import ( FINAL_DATASET_FOLDER_NAME, METADATA_FILENAME, @@ -107,7 +108,7 @@ def upload_dataset( sdg_path=base_dataset_path / SDG_CONFIG_FILENAME, ) - url = f"https://huggingface.co/datasets/{repo_id}" + url = f"{HUGGINGFACE_HUB_DATASET_URL_PREFIX}{repo_id}" logger.info(f" |-- {RandomEmoji.success()} Dataset uploaded successfully! View at: {url}") return url From ead52f5d871dbf0b1354bb9dd8b5e3355b1c9c9e Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 3 Feb 2026 09:08:28 -0700 Subject: [PATCH 19/25] HuggingfaceUploadError -> HuggingFaceHubClientUploadError --- .../integrations/huggingface/__init__.py | 4 +- .../integrations/huggingface/client.py | 56 +++++++++---------- .../integrations/huggingface/test_client.py | 34 +++++------ 3 files changed, 47 insertions(+), 47 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py b/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py index bbdaddff..9db42156 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/__init__.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceUploadError +from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceHubClientUploadError from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard -__all__ = ["HuggingFaceHubClient", "HuggingFaceUploadError", "DataDesignerDatasetCard"] +__all__ = ["HuggingFaceHubClient", "HuggingFaceHubClientUploadError", "DataDesignerDatasetCard"] diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 2afc27b7..8442597f 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -26,7 +26,7 @@ logger = logging.getLogger(__name__) -class HuggingFaceUploadError(DataDesignerError): +class HuggingFaceHubClientUploadError(DataDesignerError): """Error during Hugging Face dataset upload.""" @@ -96,7 +96,7 @@ def upload_dataset( description=description, ) except Exception as e: - raise HuggingFaceUploadError(f"Failed to upload dataset card: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to upload dataset card: {e}") from e self._upload_main_dataset_files(repo_id=repo_id, parquet_folder=base_dataset_path / FINAL_DATASET_FOLDER_NAME) self._upload_processor_files( @@ -138,20 +138,20 @@ def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: ) except HfHubHTTPError as e: if e.response.status_code == 401: - raise HuggingFaceUploadError( + raise HuggingFaceHubClientUploadError( "Authentication failed. Please provide a valid Hugging Face token. " "You can set it via the token parameter or HF_TOKEN environment variable, " "or run 'huggingface-cli login'." ) from e elif e.response.status_code == 403: - raise HuggingFaceUploadError( + raise HuggingFaceHubClientUploadError( f"Permission denied. You don't have access to create repository '{repo_id}'. " "Check your token permissions or repository ownership." ) from e else: - raise HuggingFaceUploadError(f"Failed to create repository '{repo_id}': {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to create repository '{repo_id}': {e}") from e except Exception as e: - raise HuggingFaceUploadError(f"Unexpected error creating repository '{repo_id}': {e}") from e + raise HuggingFaceHubClientUploadError(f"Unexpected error creating repository '{repo_id}': {e}") from e def _upload_main_dataset_files(self, repo_id: str, parquet_folder: Path) -> None: """Upload main parquet dataset files. @@ -173,7 +173,7 @@ def _upload_main_dataset_files(self, repo_id: str, parquet_folder: Path) -> None commit_message="Upload main dataset files", ) except Exception as e: - raise HuggingFaceUploadError(f"Failed to upload parquet files: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to upload parquet files: {e}") from e def _upload_processor_files(self, repo_id: str, processors_folder: Path) -> None: """Upload processor output files. 
@@ -203,7 +203,7 @@ def _upload_processor_files(self, repo_id: str, processors_folder: Path) -> None commit_message=f"Upload {processor_dir.name} processor outputs", ) except Exception as e: - raise HuggingFaceUploadError( + raise HuggingFaceHubClientUploadError( f"Failed to upload processor outputs for '{processor_dir.name}': {e}" ) from e @@ -230,7 +230,7 @@ def _upload_config_files(self, repo_id: str, metadata_path: Path, sdg_path: Path commit_message="Upload sdg.json", ) except Exception as e: - raise HuggingFaceUploadError(f"Failed to upload sdg.json: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to upload sdg.json: {e}") from e if metadata_path.exists(): tmp_path = None @@ -248,7 +248,7 @@ def _upload_config_files(self, repo_id: str, metadata_path: Path, sdg_path: Path commit_message=f"Upload {METADATA_FILENAME}", ) except Exception as e: - raise HuggingFaceUploadError(f"Failed to upload {METADATA_FILENAME}: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to upload {METADATA_FILENAME}: {e}") from e finally: if tmp_path and Path(tmp_path).exists(): Path(tmp_path).unlink() @@ -269,9 +269,9 @@ def _upload_dataset_card(self, repo_id: str, metadata_path: Path, sdg_path: Path with open(metadata_path) as f: metadata = json.load(f) except json.JSONDecodeError as e: - raise HuggingFaceUploadError(f"Failed to parse {METADATA_FILENAME}: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to parse {METADATA_FILENAME}: {e}") from e except Exception as e: - raise HuggingFaceUploadError(f"Failed to read {METADATA_FILENAME}: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to read {METADATA_FILENAME}: {e}") from e sdg_config = None if sdg_path.exists(): @@ -279,9 +279,9 @@ def _upload_dataset_card(self, repo_id: str, metadata_path: Path, sdg_path: Path with open(sdg_path) as f: sdg_config = json.load(f) except json.JSONDecodeError as e: - raise HuggingFaceUploadError(f"Failed to parse sdg.json: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to parse sdg.json: {e}") from e except Exception as e: - raise HuggingFaceUploadError(f"Failed to read sdg.json: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to read sdg.json: {e}") from e try: card = DataDesignerDatasetCard.from_metadata( @@ -291,12 +291,12 @@ def _upload_dataset_card(self, repo_id: str, metadata_path: Path, sdg_path: Path description=description, ) except Exception as e: - raise HuggingFaceUploadError(f"Failed to generate dataset card: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to generate dataset card: {e}") from e try: card.push_to_hub(repo_id, repo_type="dataset") except Exception as e: - raise HuggingFaceUploadError(f"Failed to push dataset card to hub: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to push dataset card to hub: {e}") from e @staticmethod def _validate_repo_id(repo_id: str) -> None: @@ -309,12 +309,12 @@ def _validate_repo_id(repo_id: str) -> None: HuggingFaceUploadError: If repo_id format is invalid """ if not repo_id or not isinstance(repo_id, str): - raise HuggingFaceUploadError("repo_id must be a non-empty string") + raise HuggingFaceHubClientUploadError("repo_id must be a non-empty string") pattern = r"^[a-zA-Z0-9][-a-zA-Z0-9._]*/[a-zA-Z0-9][-a-zA-Z0-9._]*$" if not re.match(pattern, repo_id): - raise HuggingFaceUploadError( + raise HuggingFaceHubClientUploadError( f"Invalid repo_id format: '{repo_id}'. " "Expected format: 'username/dataset-name' or 'organization/dataset-name'. 
" "Names can contain alphanumeric characters, dashes, underscores, and dots." @@ -371,30 +371,30 @@ def _validate_dataset_path(base_dataset_path: Path) -> None: HuggingFaceUploadError: If directory structure is invalid """ if not base_dataset_path.exists(): - raise HuggingFaceUploadError(f"Dataset path does not exist: {base_dataset_path}") + raise HuggingFaceHubClientUploadError(f"Dataset path does not exist: {base_dataset_path}") if not base_dataset_path.is_dir(): - raise HuggingFaceUploadError(f"Dataset path is not a directory: {base_dataset_path}") + raise HuggingFaceHubClientUploadError(f"Dataset path is not a directory: {base_dataset_path}") metadata_path = base_dataset_path / METADATA_FILENAME if not metadata_path.exists(): - raise HuggingFaceUploadError(f"Required file not found: {metadata_path}") + raise HuggingFaceHubClientUploadError(f"Required file not found: {metadata_path}") if not metadata_path.is_file(): - raise HuggingFaceUploadError(f"{METADATA_FILENAME} is not a file: {metadata_path}") + raise HuggingFaceHubClientUploadError(f"{METADATA_FILENAME} is not a file: {metadata_path}") parquet_dir = base_dataset_path / FINAL_DATASET_FOLDER_NAME if not parquet_dir.exists(): - raise HuggingFaceUploadError( + raise HuggingFaceHubClientUploadError( f"Required directory not found: {parquet_dir}. " "Dataset must contain parquet-files directory with batch files." ) if not parquet_dir.is_dir(): - raise HuggingFaceUploadError(f"parquet-files is not a directory: {parquet_dir}") + raise HuggingFaceHubClientUploadError(f"parquet-files is not a directory: {parquet_dir}") if not any(parquet_dir.glob("*.parquet")): - raise HuggingFaceUploadError( + raise HuggingFaceHubClientUploadError( f"parquet-files directory is empty: {parquet_dir}. At least one .parquet file is required." 
) @@ -402,14 +402,14 @@ def _validate_dataset_path(base_dataset_path: Path) -> None: with open(metadata_path) as f: json.load(f) except json.JSONDecodeError as e: - raise HuggingFaceUploadError(f"Invalid JSON in {METADATA_FILENAME}: {e}") + raise HuggingFaceHubClientUploadError(f"Invalid JSON in {METADATA_FILENAME}: {e}") sdg_path = base_dataset_path / SDG_CONFIG_FILENAME if sdg_path.exists(): if not sdg_path.is_file(): - raise HuggingFaceUploadError(f"{SDG_CONFIG_FILENAME} is not a file: {sdg_path}") + raise HuggingFaceHubClientUploadError(f"{SDG_CONFIG_FILENAME} is not a file: {sdg_path}") try: with open(sdg_path) as f: json.load(f) except json.JSONDecodeError as e: - raise HuggingFaceUploadError(f"Invalid JSON in {SDG_CONFIG_FILENAME}: {e}") + raise HuggingFaceHubClientUploadError(f"Invalid JSON in {SDG_CONFIG_FILENAME}: {e}") diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index 0c87ffa6..75d25f6c 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -10,7 +10,7 @@ import pytest from huggingface_hub.utils import HfHubHTTPError -from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceUploadError +from data_designer.integrations.huggingface.client import HuggingFaceHubClient, HuggingFaceHubClientUploadError @pytest.fixture @@ -242,7 +242,7 @@ def test_upload_dataset_card_missing_metadata(tmp_path: Path) -> None: base_path = tmp_path / "dataset" base_path.mkdir() - with pytest.raises(HuggingFaceUploadError, match="Required file not found"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Required file not found"): client.upload_dataset( repo_id="test/dataset", base_dataset_path=base_path, @@ -364,19 +364,19 @@ def test_validate_repo_id_invalid_format(sample_dataset_path: Path) -> None: client = HuggingFaceHubClient(token="test-token") # Missing slash - with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid repo_id format"): client.upload_dataset("my-dataset", sample_dataset_path, "Test") # Too many slashes (caught by regex) - with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid repo_id format"): client.upload_dataset("user/org/dataset", sample_dataset_path, "Test") # Invalid characters (space) - with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid repo_id format"): client.upload_dataset("user/my dataset", sample_dataset_path, "Test") # Empty string - with pytest.raises(HuggingFaceUploadError, match="must be a non-empty string"): + with pytest.raises(HuggingFaceHubClientUploadError, match="must be a non-empty string"): client.upload_dataset("", sample_dataset_path, "Test") @@ -385,7 +385,7 @@ def test_validate_dataset_path_not_exists(tmp_path: Path) -> None: client = HuggingFaceHubClient(token="test-token") non_existent = tmp_path / "does-not-exist" - with pytest.raises(HuggingFaceUploadError, match="does not exist"): + with pytest.raises(HuggingFaceHubClientUploadError, match="does not exist"): client.upload_dataset("test/dataset", non_existent, "Test") @@ -395,7 +395,7 @@ def test_validate_dataset_path_is_file(tmp_path: Path) -> None: file_path = tmp_path / 
"file.txt" file_path.write_text("not a directory") - with pytest.raises(HuggingFaceUploadError, match="not a directory"): + with pytest.raises(HuggingFaceHubClientUploadError, match="not a directory"): client.upload_dataset("test/dataset", file_path, "Test") @@ -405,7 +405,7 @@ def test_validate_dataset_path_missing_metadata(tmp_path: Path) -> None: base_path = tmp_path / "dataset" base_path.mkdir() - with pytest.raises(HuggingFaceUploadError, match="Required file not found"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Required file not found"): client.upload_dataset("test/dataset", base_path, "Test") @@ -416,7 +416,7 @@ def test_validate_dataset_path_missing_parquet_folder(tmp_path: Path) -> None: base_path.mkdir() (base_path / "metadata.json").write_text('{"target_num_records": 10}') - with pytest.raises(HuggingFaceUploadError, match="Required directory not found"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Required directory not found"): client.upload_dataset("test/dataset", base_path, "Test") @@ -429,7 +429,7 @@ def test_validate_dataset_path_empty_parquet_folder(tmp_path: Path) -> None: parquet_dir = base_path / "parquet-files" parquet_dir.mkdir() - with pytest.raises(HuggingFaceUploadError, match="directory is empty"): + with pytest.raises(HuggingFaceHubClientUploadError, match="directory is empty"): client.upload_dataset("test/dataset", base_path, "Test") @@ -443,7 +443,7 @@ def test_validate_dataset_path_invalid_metadata_json(tmp_path: Path) -> None: parquet_dir.mkdir() (parquet_dir / "batch_00000.parquet").write_text("data") - with pytest.raises(HuggingFaceUploadError, match="Invalid JSON"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid JSON"): client.upload_dataset("test/dataset", base_path, "Test") @@ -458,7 +458,7 @@ def test_validate_dataset_path_invalid_sdg_json(tmp_path: Path) -> None: parquet_dir.mkdir() (parquet_dir / "batch_00000.parquet").write_text("data") - with pytest.raises(HuggingFaceUploadError, match="Invalid JSON"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid JSON"): client.upload_dataset("test/dataset", base_path, "Test") @@ -466,7 +466,7 @@ def test_upload_dataset_invalid_repo_id(mock_hf_api: MagicMock, sample_dataset_p """Test upload_dataset fails with invalid repo_id.""" client = HuggingFaceHubClient(token="test-token") - with pytest.raises(HuggingFaceUploadError, match="Invalid repo_id format"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid repo_id format"): client.upload_dataset( repo_id="invalid-repo-id", # Missing slash base_dataset_path=sample_dataset_path, @@ -483,7 +483,7 @@ def test_upload_dataset_authentication_error(mock_hf_api: MagicMock, sample_data error_response.status_code = 401 mock_hf_api.create_repo.side_effect = HfHubHTTPError("Unauthorized", response=error_response) - with pytest.raises(HuggingFaceUploadError, match="Authentication failed"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Authentication failed"): client.upload_dataset( repo_id="test/dataset", base_dataset_path=sample_dataset_path, @@ -500,7 +500,7 @@ def test_upload_dataset_permission_error(mock_hf_api: MagicMock, sample_dataset_ error_response.status_code = 403 mock_hf_api.create_repo.side_effect = HfHubHTTPError("Forbidden", response=error_response) - with pytest.raises(HuggingFaceUploadError, match="Permission denied"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Permission denied"): client.upload_dataset( repo_id="test/dataset", 
base_dataset_path=sample_dataset_path, @@ -520,7 +520,7 @@ def test_upload_dataset_card_invalid_json(tmp_path: Path) -> None: parquet_dir.mkdir() (parquet_dir / "batch_00000.parquet").write_text("data") - with pytest.raises(HuggingFaceUploadError, match="Invalid JSON"): + with pytest.raises(HuggingFaceHubClientUploadError, match="Invalid JSON"): client.upload_dataset( repo_id="test/dataset", base_dataset_path=base_path, From bc90dcb6895b925968b646b99063637f7efbff12 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Tue, 3 Feb 2026 11:01:06 -0700 Subject: [PATCH 20/25] defer to hfhub repo validation --- .../integrations/huggingface/client.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 8442597f..fe789785 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -5,12 +5,12 @@ import json import logging -import re import tempfile from pathlib import Path from huggingface_hub import HfApi -from huggingface_hub.utils import HfHubHTTPError +from huggingface_hub.errors import HFValidationError +from huggingface_hub.utils import HfHubHTTPError, validate_repo_id from data_designer.config.utils.constants import HUGGINGFACE_HUB_DATASET_URL_PREFIX from data_designer.engine.dataset_builders.artifact_storage import ( @@ -306,20 +306,24 @@ def _validate_repo_id(repo_id: str) -> None: repo_id: Repository ID to validate Raises: - HuggingFaceUploadError: If repo_id format is invalid + HuggingFaceHubClientUploadError: If repo_id format is invalid """ - if not repo_id or not isinstance(repo_id, str): + # Check if repo_id is empty + if not repo_id or not repo_id.strip(): raise HuggingFaceHubClientUploadError("repo_id must be a non-empty string") - pattern = r"^[a-zA-Z0-9][-a-zA-Z0-9._]*/[a-zA-Z0-9][-a-zA-Z0-9._]*$" - - if not re.match(pattern, repo_id): + # Check for exactly one slash (username/dataset-name format). This is not enforced by huggingface_hub's validator. + if repo_id.count("/") != 1: raise HuggingFaceHubClientUploadError( - f"Invalid repo_id format: '{repo_id}'. " - "Expected format: 'username/dataset-name' or 'organization/dataset-name'. " - "Names can contain alphanumeric characters, dashes, underscores, and dots." + f"Invalid repo_id format: '{repo_id}'. Expected format: 'username/dataset-name'" ) + # Use huggingface_hub's validator for additional checks (characters, length, etc.) + try: + validate_repo_id(repo_id) + except HFValidationError as e: + raise HuggingFaceHubClientUploadError(f"Invalid repo_id format: '{repo_id}': {e}") from e + @staticmethod def _update_metadata_paths(metadata_path: Path) -> dict: """Update file paths in metadata.json to match Hugging Face dataset repository structure. 
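(Illustrative aside, not part of the patch series: a minimal sketch of how the two-step repo_id validation introduced in the patch above behaves, assuming huggingface_hub is installed. The helper name check_repo_id is hypothetical; in the actual patch the logic lives in HuggingFaceHubClient._validate_repo_id and raises HuggingFaceHubClientUploadError rather than returning strings.)

    # Sketch of the validation order used by the patch: an explicit emptiness
    # and single-slash check first, then deferral to huggingface_hub's validator.
    from huggingface_hub.errors import HFValidationError
    from huggingface_hub.utils import validate_repo_id

    def check_repo_id(repo_id: str) -> str:
        if not repo_id or not repo_id.strip():
            return "rejected: repo_id must be a non-empty string"
        # The patch enforces exactly one slash because validate_repo_id also
        # accepts bare names such as "my-dataset".
        if repo_id.count("/") != 1:
            return "rejected: expected 'username/dataset-name'"
        # Character and length rules are deferred to huggingface_hub.
        try:
            validate_repo_id(repo_id)
        except HFValidationError as e:
            return f"rejected by huggingface_hub: {e}"
        return "accepted"

    print(check_repo_id("user/my-dataset"))   # accepted
    print(check_repo_id("my-dataset"))        # rejected: expected 'username/dataset-name'
    print(check_repo_id("user/my dataset"))   # rejected by huggingface_hub (space not allowed)

This mirrors the test expectations in test_validate_repo_id_invalid_format: missing or extra slashes are caught locally, while invalid characters are caught by huggingface_hub's validator and re-raised as HuggingFaceHubClientUploadError.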
From 4f8c4a0ddaf0ba467203c17a83f0d3569c09e897 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 4 Feb 2026 09:55:19 -0700 Subject: [PATCH 21/25] Update packages/data-designer/src/data_designer/integrations/huggingface/client.py Co-authored-by: Daniel van Strien --- .../src/data_designer/integrations/huggingface/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index fe789785..5e69ce81 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -141,7 +141,7 @@ def _create_or_get_repo(self, repo_id: str, *, private: bool = False) -> None: raise HuggingFaceHubClientUploadError( "Authentication failed. Please provide a valid Hugging Face token. " "You can set it via the token parameter or HF_TOKEN environment variable, " - "or run 'huggingface-cli login'." + "or run 'hf auth login'." ) from e elif e.response.status_code == 403: raise HuggingFaceHubClientUploadError( From d2fd6413ed72f810712c58b8d5f8c882d56576bd Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 4 Feb 2026 09:55:52 -0700 Subject: [PATCH 22/25] Update packages/data-designer/src/data_designer/interface/results.py Co-authored-by: Daniel van Strien --- packages/data-designer/src/data_designer/interface/results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index a37a5483..de66849c 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -121,7 +121,7 @@ def push_to_hub( Appears after the title. token: HuggingFace API token. If None, the token is automatically resolved from HF_TOKEN environment variable or cached credentials - from `huggingface-cli login`. + from `hf auth login`. private: Create private repo Returns: From afbdac740e8d08165d35ad93c664e92208facc7e Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 4 Feb 2026 09:56:25 -0700 Subject: [PATCH 23/25] Update packages/data-designer/src/data_designer/integrations/huggingface/client.py Co-authored-by: Daniel van Strien --- .../src/data_designer/integrations/huggingface/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 5e69ce81..35723b04 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -39,7 +39,7 @@ def __init__(self, token: str | None = None): Args: token: Hugging Face API token. If None, the token is automatically resolved from HF_TOKEN environment variable or cached credentials - from `huggingface-cli login`. + from `hf auth login`. 
""" self._token = token self._api = HfApi(token=token) From e56c846c3af25a463a9d2ec04d8b88178b626b67 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 4 Feb 2026 11:18:34 -0700 Subject: [PATCH 24/25] allow custom tags --- .../integrations/huggingface/client.py | 9 +- .../integrations/huggingface/dataset_card.py | 11 +- .../src/data_designer/interface/results.py | 5 +- .../huggingface/test_dataset_card.py | 258 ++++++++++-------- 4 files changed, 165 insertions(+), 118 deletions(-) diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 35723b04..7d8c54e4 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -60,6 +60,7 @@ def upload_dataset( description: str, *, private: bool = False, + tags: list[str] | None = None, ) -> str: """Upload dataset to Hugging Face Hub. @@ -74,6 +75,7 @@ def upload_dataset( base_dataset_path: Path to base_dataset_path (contains parquet-files/, sdg.json, etc.) description: Custom description text for dataset card private: Whether to create private repo + tags: Additional custom tags for the dataset Returns: URL to the uploaded dataset @@ -94,6 +96,7 @@ def upload_dataset( metadata_path=base_dataset_path / METADATA_FILENAME, sdg_path=base_dataset_path / SDG_CONFIG_FILENAME, description=description, + tags=tags, ) except Exception as e: raise HuggingFaceHubClientUploadError(f"Failed to upload dataset card: {e}") from e @@ -253,7 +256,9 @@ def _upload_config_files(self, repo_id: str, metadata_path: Path, sdg_path: Path if tmp_path and Path(tmp_path).exists(): Path(tmp_path).unlink() - def _upload_dataset_card(self, repo_id: str, metadata_path: Path, sdg_path: Path, description: str) -> None: + def _upload_dataset_card( + self, repo_id: str, metadata_path: Path, sdg_path: Path, description: str, tags: list[str] | None = None + ) -> None: """Generate and upload dataset card from metadata.json. 
Args: @@ -261,6 +266,7 @@ def _upload_dataset_card(self, repo_id: str, metadata_path: Path, sdg_path: Path metadata_path: Path to metadata.json file sdg_path: Path to sdg.json file description: Custom description text for dataset card + tags: Additional custom tags for the dataset Raises: HuggingFaceUploadError: If dataset card generation or upload fails @@ -289,6 +295,7 @@ def _upload_dataset_card(self, repo_id: str, metadata_path: Path, sdg_path: Path sdg_config=sdg_config, repo_id=repo_id, description=description, + tags=tags, ) except Exception as e: raise HuggingFaceHubClientUploadError(f"Failed to generate dataset card: {e}") from e diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index 04bc6324..ba5d2f0e 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -9,6 +9,7 @@ from huggingface_hub import CardData, DatasetCard TEMPLATE_DATA_DESIGNER_DATASET_CARD_PATH = Path(__file__).parent / "dataset_card_template.md" +DEFAULT_DATASET_CARD_TAGS = ["synthetic", "datadesigner"] class DataDesignerDatasetCard(DatasetCard): @@ -28,6 +29,7 @@ def from_metadata( sdg_config: dict | None, repo_id: str, description: str, + tags: list[str] | None = None, ) -> DataDesignerDatasetCard: """Create dataset card from metadata.json and sdg.json. @@ -36,6 +38,7 @@ def from_metadata( sdg_config: Contents of sdg.json (optional) repo_id: Hugging Face dataset repo ID description: Custom description text + tags: Additional custom tags for the dataset. Returns: DataDesignerDatasetCard instance ready to upload @@ -71,13 +74,13 @@ def from_metadata( if "file_paths" in metadata and "processor-files" in metadata["file_paths"]: processor_names = list(metadata["file_paths"]["processor-files"].keys()) - # Prepare tags - tags = ["synthetic", "datadesigner"] + # Prepare tags: default tags + custom tags + all_tags = DEFAULT_DATASET_CARD_TAGS + (tags or []) # Prepare CardData (metadata for YAML frontmatter) card_data = CardData( size_categories=size_categories, - tags=tags, + tags=all_tags, ) # Prepare template variables @@ -95,7 +98,7 @@ def from_metadata( "current_year": datetime.now().year, "has_processors": len(processor_names) > 0, "processor_names": processor_names, - "tags": tags, + "tags": all_tags, "custom_description": description, } diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index de66849c..5a071469 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -105,6 +105,7 @@ def push_to_hub( *, token: str | None = None, private: bool = False, + tags: list[str] | None = None, ) -> str: """Push dataset to HuggingFace Hub. @@ -123,6 +124,7 @@ def push_to_hub( resolved from HF_TOKEN environment variable or cached credentials from `hf auth login`. private: Create private repo + tags: Additional custom tags for the dataset. Returns: URL to the uploaded dataset @@ -130,7 +132,7 @@ def push_to_hub( Example: >>> results = data_designer.create(config, num_records=1000) >>> description = "This dataset contains synthetic conversations for training chatbots." 
- >>> results.push_to_hub("username/my-synthetic-dataset", description) + >>> results.push_to_hub("username/my-synthetic-dataset", description, tags=["chatbot", "conversation"]) 'https://huggingface.co/datasets/username/my-synthetic-dataset' """ client = HuggingFaceHubClient(token=token) @@ -139,4 +141,5 @@ def push_to_hub( base_dataset_path=self.artifact_storage.base_dataset_path, private=private, description=description, + tags=tags, ) diff --git a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py index aa573cd6..a6342b0f 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py +++ b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py @@ -3,24 +3,17 @@ from __future__ import annotations -from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard - +import pytest -def test_compute_size_category() -> None: - """Test size category computation for various dataset sizes.""" - assert DataDesignerDatasetCard._compute_size_category(500) == "n<1K" - assert DataDesignerDatasetCard._compute_size_category(5000) == "1K10M" +from data_designer.integrations.huggingface.dataset_card import DataDesignerDatasetCard -def test_from_metadata_minimal() -> None: - """Test creating dataset card from minimal metadata.""" - metadata = { +@pytest.fixture +def stub_metadata() -> dict: + """Stub metadata fixture with single column that can be used/modified by most tests.""" + return { "target_num_records": 100, - "schema": {"col1": "string", "col2": "int64"}, + "schema": {"col1": "string"}, "column_statistics": [ { "column_name": "col1", @@ -33,8 +26,24 @@ def test_from_metadata_minimal() -> None: ], } + +def test_compute_size_category() -> None: + """Test size category computation for various dataset sizes.""" + assert DataDesignerDatasetCard._compute_size_category(500) == "n<1K" + assert DataDesignerDatasetCard._compute_size_category(5000) == "1K10M" + + +def test_from_metadata_minimal(stub_metadata: dict) -> None: + """Test creating dataset card from minimal metadata.""" + # Add second column for this test + stub_metadata["schema"]["col2"] = "int64" + card = DataDesignerDatasetCard.from_metadata( - metadata=metadata, + metadata=stub_metadata, sdg_config=None, repo_id="test/dataset", description="Test dataset for unit testing.", @@ -48,32 +57,31 @@ def test_from_metadata_minimal() -> None: assert "2" in str(card) # Number of columns -def test_from_metadata_with_sdg_config() -> None: +def test_from_metadata_with_sdg_config(stub_metadata: dict) -> None: """Test creating dataset card with sdg config.""" - metadata = { - "target_num_records": 50, - "schema": {"name": "string", "age": "int64"}, - "column_statistics": [ - { - "column_name": "name", - "num_records": 50, - "num_unique": 50, - "num_null": 0, - "simple_dtype": "string", - "column_type": "sampler", - "sampler_type": "person", - }, - { - "column_name": "age", - "num_records": 50, - "num_unique": 30, - "num_null": 0, - "simple_dtype": "int64", - "column_type": "sampler", - "sampler_type": "uniform", - }, - ], - } + # Customize for this test + stub_metadata["target_num_records"] = 50 + stub_metadata["schema"] = {"name": "string", "age": "int64"} + stub_metadata["column_statistics"] = [ + { + "column_name": "name", + "num_records": 50, + "num_unique": 50, + "num_null": 0, + "simple_dtype": "string", + "column_type": "sampler", + "sampler_type": "person", + }, + { + "column_name": 
"age", + "num_records": 50, + "num_unique": 30, + "num_null": 0, + "simple_dtype": "int64", + "column_type": "sampler", + "sampler_type": "uniform", + }, + ] sdg_config = { "data_designer": { @@ -85,7 +93,7 @@ def test_from_metadata_with_sdg_config() -> None: } card = DataDesignerDatasetCard.from_metadata( - metadata=metadata, + metadata=stub_metadata, sdg_config=sdg_config, repo_id="test/dataset-with-config", description="Test dataset with SDG config.", @@ -97,27 +105,26 @@ def test_from_metadata_with_sdg_config() -> None: assert "2 column" in str(card) -def test_from_metadata_with_llm_columns() -> None: +def test_from_metadata_with_llm_columns(stub_metadata: dict) -> None: """Test creating dataset card with LLM column statistics.""" - metadata = { - "target_num_records": 10, - "schema": {"prompt": "string", "response": "string"}, - "column_statistics": [ - { - "column_name": "response", - "num_records": 10, - "num_unique": 10, - "num_null": 0, - "simple_dtype": "string", - "column_type": "llm-text", - "output_tokens_mean": 50.5, - "input_tokens_mean": 20.3, - } - ], - } + # Customize for LLM test + stub_metadata["target_num_records"] = 10 + stub_metadata["schema"] = {"prompt": "string", "response": "string"} + stub_metadata["column_statistics"] = [ + { + "column_name": "response", + "num_records": 10, + "num_unique": 10, + "num_null": 0, + "simple_dtype": "string", + "column_type": "llm-text", + "output_tokens_mean": 50.5, + "input_tokens_mean": 20.3, + } + ] card = DataDesignerDatasetCard.from_metadata( - metadata=metadata, + metadata=stub_metadata, sdg_config=None, repo_id="test/llm-dataset", description="Test dataset with LLM columns.", @@ -128,32 +135,19 @@ def test_from_metadata_with_llm_columns() -> None: assert "Tokens:" in str(card) and "out" in str(card) and "in" in str(card) -def test_from_metadata_with_processors() -> None: +def test_from_metadata_with_processors(stub_metadata: dict) -> None: """Test creating dataset card with processor outputs includes loading examples.""" - metadata = { - "target_num_records": 100, - "schema": {"col1": "string"}, - "file_paths": { - "parquet-files": ["parquet-files/batch_00000.parquet"], - "processor-files": { - "processor1": ["processors-files/processor1/batch_00000.parquet"], - "processor2": ["processors-files/processor2/batch_00000.parquet"], - }, + # Add processor files for this test + stub_metadata["file_paths"] = { + "parquet-files": ["parquet-files/batch_00000.parquet"], + "processor-files": { + "processor1": ["processors-files/processor1/batch_00000.parquet"], + "processor2": ["processors-files/processor2/batch_00000.parquet"], }, - "column_statistics": [ - { - "column_name": "col1", - "num_records": 100, - "num_unique": 100, - "num_null": 0, - "simple_dtype": "string", - "column_type": "sampler", - } - ], } card = DataDesignerDatasetCard.from_metadata( - metadata=metadata, + metadata=stub_metadata, sdg_config=None, repo_id="test/dataset-with-processors", description="Test dataset with processor outputs.", @@ -168,27 +162,15 @@ def test_from_metadata_with_processors() -> None: assert "Load processor outputs" in card_str -def test_from_metadata_with_custom_description() -> None: +def test_from_metadata_with_custom_description(stub_metadata: dict) -> None: """Test creating dataset card with custom description.""" - metadata = { - "target_num_records": 100, - "schema": {"col1": "string", "col2": "int64"}, - "column_statistics": [ - { - "column_name": "col1", - "num_records": 100, - "num_unique": 100, - "num_null": 0, - "simple_dtype": 
"string", - "column_type": "sampler", - } - ], - } + # Add second column for this test + stub_metadata["schema"]["col2"] = "int64" description = "This dataset contains synthetic data for testing chatbot responses." card = DataDesignerDatasetCard.from_metadata( - metadata=metadata, + metadata=stub_metadata, sdg_config=None, repo_id="test/dataset-with-description", description=description, @@ -199,25 +181,14 @@ def test_from_metadata_with_custom_description() -> None: assert "This dataset contains synthetic data for testing chatbot responses." in card_str -def test_from_metadata_description_placement() -> None: +def test_from_metadata_description_placement(stub_metadata: dict) -> None: """Test that description appears in the correct location.""" - metadata = { - "target_num_records": 50, - "schema": {"col1": "string"}, - "column_statistics": [ - { - "column_name": "col1", - "num_records": 50, - "num_unique": 50, - "num_null": 0, - "simple_dtype": "string", - "column_type": "sampler", - } - ], - } + # Use 50 records for this test + stub_metadata["target_num_records"] = 50 + stub_metadata["column_statistics"][0]["num_records"] = 50 card = DataDesignerDatasetCard.from_metadata( - metadata=metadata, + metadata=stub_metadata, sdg_config=None, repo_id="test/dataset-description-placement", description="Test description placement.", @@ -231,3 +202,66 @@ def test_from_metadata_description_placement() -> None: desc_pos = card_str.find("Test description placement.") summary_pos = card_str.find("Dataset Summary") assert desc_pos < summary_pos + + +def test_from_metadata_default_tags(stub_metadata: dict) -> None: + """Test that default tags are included when no custom tags are provided.""" + card = DataDesignerDatasetCard.from_metadata( + metadata=stub_metadata, + sdg_config=None, + repo_id="test/dataset-default-tags", + description="Test dataset with default tags.", + ) + + card_str = str(card) + assert card is not None + # Check that default tags appear in the YAML frontmatter + assert "- synthetic" in card_str + assert "- datadesigner" in card_str + + +def test_from_metadata_with_custom_tags(stub_metadata: dict) -> None: + """Test that custom tags are added to default tags.""" + custom_tags = ["chatbot", "conversation", "qa"] + + card = DataDesignerDatasetCard.from_metadata( + metadata=stub_metadata, + sdg_config=None, + repo_id="test/dataset-custom-tags", + description="Test dataset with custom tags.", + tags=custom_tags, + ) + + card_str = str(card) + assert card is not None + # Check that both default and custom tags appear in the YAML frontmatter + assert "- synthetic" in card_str + assert "- datadesigner" in card_str + assert "- chatbot" in card_str + assert "- conversation" in card_str + assert "- qa" in card_str + + +def test_from_metadata_tags_in_yaml_frontmatter(stub_metadata: dict) -> None: + """Test that tags appear in the YAML frontmatter section.""" + # Use 50 records for this test + stub_metadata["target_num_records"] = 50 + stub_metadata["column_statistics"][0]["num_records"] = 50 + + card = DataDesignerDatasetCard.from_metadata( + metadata=stub_metadata, + sdg_config=None, + repo_id="test/dataset-tags-frontmatter", + description="Test dataset.", + tags=["custom-tag"], + ) + + card_str = str(card) + assert card is not None + # Tags should appear before the main content (in YAML frontmatter) + tags_section = card_str.find("tags:") + quick_start_section = card_str.find("## 🚀 Quick Start") + assert tags_section < quick_start_section + assert tags_section != -1 # Make sure tags section 
exists + # Verify tags appear before the closing of YAML frontmatter + assert tags_section < card_str.find("---", tags_section) From 081ab2a6a413adc7304a2e766cc047261c034143 Mon Sep 17 00:00:00 2001 From: Nabin Mulepati Date: Wed, 4 Feb 2026 11:28:35 -0700 Subject: [PATCH 25/25] change sdg.json -> builder_config.json --- .../dataset_builders/artifact_storage.py | 2 +- .../integrations/huggingface/client.py | 55 ++++++++++--------- .../integrations/huggingface/dataset_card.py | 12 ++-- .../huggingface/dataset_card_template.md | 2 +- .../src/data_designer/interface/results.py | 2 +- .../integrations/huggingface/test_client.py | 28 +++++----- .../huggingface/test_dataset_card.py | 26 ++++----- 7 files changed, 66 insertions(+), 61 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py index e2daaca6..35e7d4f8 100644 --- a/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py +++ b/packages/data-designer-engine/src/data_designer/engine/dataset_builders/artifact_storage.py @@ -24,7 +24,7 @@ logger = logging.getLogger(__name__) BATCH_FILE_NAME_FORMAT = "batch_{batch_number:05d}.parquet" -SDG_CONFIG_FILENAME = "sdg.json" +SDG_CONFIG_FILENAME = "builder_config.json" METADATA_FILENAME = "metadata.json" FINAL_DATASET_FOLDER_NAME = "parquet-files" PROCESSORS_OUTPUTS_FOLDER_NAME = "processors-files" diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/client.py b/packages/data-designer/src/data_designer/integrations/huggingface/client.py index 7d8c54e4..0812b8de 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/client.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/client.py @@ -67,12 +67,12 @@ def upload_dataset( Uploads the complete dataset including: - Main parquet batch files from parquet-files/ → data/ - Processor output batch files from processors-files/{name}/ → {name}/ - - Existing sdg.json and metadata.json files + - Existing builder_config.json and metadata.json files - Auto-generated README.md (dataset card) Args: repo_id: Hugging Face dataset repo ID (e.g., "username/dataset-name") - base_dataset_path: Path to base_dataset_path (contains parquet-files/, sdg.json, etc.) + base_dataset_path: Path to base_dataset_path (contains parquet-files/, builder_config.json, etc.) 
description: Custom description text for dataset card private: Whether to create private repo tags: Additional custom tags for the dataset @@ -94,7 +94,7 @@ def upload_dataset( self._upload_dataset_card( repo_id=repo_id, metadata_path=base_dataset_path / METADATA_FILENAME, - sdg_path=base_dataset_path / SDG_CONFIG_FILENAME, + builder_config_path=base_dataset_path / SDG_CONFIG_FILENAME, description=description, tags=tags, ) @@ -108,7 +108,7 @@ def upload_dataset( self._upload_config_files( repo_id=repo_id, metadata_path=base_dataset_path / METADATA_FILENAME, - sdg_path=base_dataset_path / SDG_CONFIG_FILENAME, + builder_config_path=base_dataset_path / SDG_CONFIG_FILENAME, ) url = f"{HUGGINGFACE_HUB_DATASET_URL_PREFIX}{repo_id}" @@ -210,30 +210,30 @@ def _upload_processor_files(self, repo_id: str, processors_folder: Path) -> None f"Failed to upload processor outputs for '{processor_dir.name}': {e}" ) from e - def _upload_config_files(self, repo_id: str, metadata_path: Path, sdg_path: Path) -> None: - """Upload configuration files (sdg.json and metadata.json). + def _upload_config_files(self, repo_id: str, metadata_path: Path, builder_config_path: Path) -> None: + """Upload configuration files (builder_config.json and metadata.json). Args: repo_id: Hugging Face dataset repo ID metadata_path: Path to metadata.json file - sdg_path: Path to sdg.json file + builder_config_path: Path to builder_config.json file Raises: HuggingFaceUploadError: If upload fails """ logger.info(f" |-- {RandomEmoji.loading()} Uploading configuration files...") - if sdg_path.exists(): + if builder_config_path.exists(): try: self._api.upload_file( repo_id=repo_id, - path_or_fileobj=str(sdg_path), + path_or_fileobj=str(builder_config_path), path_in_repo=SDG_CONFIG_FILENAME, repo_type="dataset", - commit_message="Upload sdg.json", + commit_message="Upload builder_config.json", ) except Exception as e: - raise HuggingFaceHubClientUploadError(f"Failed to upload sdg.json: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to upload builder_config.json: {e}") from e if metadata_path.exists(): tmp_path = None @@ -257,14 +257,19 @@ def _upload_config_files(self, repo_id: str, metadata_path: Path, sdg_path: Path Path(tmp_path).unlink() def _upload_dataset_card( - self, repo_id: str, metadata_path: Path, sdg_path: Path, description: str, tags: list[str] | None = None + self, + repo_id: str, + metadata_path: Path, + builder_config_path: Path, + description: str, + tags: list[str] | None = None, ) -> None: """Generate and upload dataset card from metadata.json. 
Args: repo_id: Hugging Face dataset repo ID metadata_path: Path to metadata.json file - sdg_path: Path to sdg.json file + builder_config_path: Path to builder_config.json file description: Custom description text for dataset card tags: Additional custom tags for the dataset @@ -279,20 +284,20 @@ def _upload_dataset_card( except Exception as e: raise HuggingFaceHubClientUploadError(f"Failed to read {METADATA_FILENAME}: {e}") from e - sdg_config = None - if sdg_path.exists(): + builder_config = None + if builder_config_path.exists(): try: - with open(sdg_path) as f: - sdg_config = json.load(f) + with open(builder_config_path) as f: + builder_config = json.load(f) except json.JSONDecodeError as e: - raise HuggingFaceHubClientUploadError(f"Failed to parse sdg.json: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to parse builder_config.json: {e}") from e except Exception as e: - raise HuggingFaceHubClientUploadError(f"Failed to read sdg.json: {e}") from e + raise HuggingFaceHubClientUploadError(f"Failed to read builder_config.json: {e}") from e try: card = DataDesignerDatasetCard.from_metadata( metadata=metadata, - sdg_config=sdg_config, + builder_config=builder_config, repo_id=repo_id, description=description, tags=tags, @@ -415,12 +420,12 @@ def _validate_dataset_path(base_dataset_path: Path) -> None: except json.JSONDecodeError as e: raise HuggingFaceHubClientUploadError(f"Invalid JSON in {METADATA_FILENAME}: {e}") - sdg_path = base_dataset_path / SDG_CONFIG_FILENAME - if sdg_path.exists(): - if not sdg_path.is_file(): - raise HuggingFaceHubClientUploadError(f"{SDG_CONFIG_FILENAME} is not a file: {sdg_path}") + builder_config_path = base_dataset_path / SDG_CONFIG_FILENAME + if builder_config_path.exists(): + if not builder_config_path.is_file(): + raise HuggingFaceHubClientUploadError(f"{SDG_CONFIG_FILENAME} is not a file: {builder_config_path}") try: - with open(sdg_path) as f: + with open(builder_config_path) as f: json.load(f) except json.JSONDecodeError as e: raise HuggingFaceHubClientUploadError(f"Invalid JSON in {SDG_CONFIG_FILENAME}: {e}") diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py index ba5d2f0e..3c57f743 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card.py @@ -26,16 +26,16 @@ class DataDesignerDatasetCard(DatasetCard): def from_metadata( cls, metadata: dict, - sdg_config: dict | None, + builder_config: dict | None, repo_id: str, description: str, tags: list[str] | None = None, ) -> DataDesignerDatasetCard: - """Create dataset card from metadata.json and sdg.json. + """Create dataset card from metadata.json and builder_config.json. Args: metadata: Contents of metadata.json - sdg_config: Contents of sdg.json (optional) + builder_config: Contents of builder_config.json (optional) repo_id: Hugging Face dataset repo ID description: Custom description text tags: Additional custom tags for the dataset. 
@@ -57,11 +57,11 @@ def from_metadata( # Compute size category size_categories = cls._compute_size_category(actual_num_records) - # Extract column types from sdg.json if available + # Extract column types from builder_config.json if available config_types: dict[str, int] = {} num_columns_configured = 0 - if sdg_config: - columns = sdg_config.get("data_designer", {}).get("columns", []) + if builder_config: + columns = builder_config.get("data_designer", {}).get("columns", []) num_columns_configured = len(columns) for col in columns: col_type = col.get("column_type", "unknown") diff --git a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md index 89c87e67..f01ce33d 100644 --- a/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md +++ b/packages/data-designer/src/data_designer/integrations/huggingface/dataset_card_template.md @@ -80,7 +80,7 @@ Generated with {{ num_columns_configured }} column configuration(s): {% endfor %} {% endif %} -📄 Full configuration available in [`sdg.json`](sdg.json) and detailed metadata in [`metadata.json`](metadata.json). +📄 Full configuration available in [`builder_config.json`](builder_config.json) and detailed metadata in [`metadata.json`](metadata.json). --- diff --git a/packages/data-designer/src/data_designer/interface/results.py b/packages/data-designer/src/data_designer/interface/results.py index 5a071469..f86acced 100644 --- a/packages/data-designer/src/data_designer/interface/results.py +++ b/packages/data-designer/src/data_designer/interface/results.py @@ -112,7 +112,7 @@ def push_to_hub( Uploads all artifacts including: - Main parquet batch files (data subset) - Processor output batch files ({processor_name} subsets) - - Configuration (sdg.json) + - Configuration (builder_config.json) - Metadata (metadata.json) - Auto-generated dataset card (README.md) diff --git a/packages/data-designer/tests/integrations/huggingface/test_client.py b/packages/data-designer/tests/integrations/huggingface/test_client.py index 75d25f6c..735ea3bc 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_client.py +++ b/packages/data-designer/tests/integrations/huggingface/test_client.py @@ -39,7 +39,7 @@ def sample_dataset_path(tmp_path: Path) -> Path: - parquet-files/: Main dataset batch files - processors-files/{processor_name}/: Processor output batch files (same structure) - metadata.json: Dataset metadata - - sdg.json: Configuration + - builder_config.json: Configuration """ base_path = tmp_path / "dataset" base_path.mkdir() @@ -92,8 +92,8 @@ def sample_dataset_path(tmp_path: Path) -> Path: } (base_path / "metadata.json").write_text(json.dumps(metadata)) - # Create sdg.json with realistic BuilderConfig structure - sdg_config = { + # Create builder_config.json with realistic BuilderConfig structure + builder_config = { "data_designer": { "columns": [ { @@ -109,7 +109,7 @@ def sample_dataset_path(tmp_path: Path) -> Path: "profilers": None, } } - (base_path / "sdg.json").write_text(json.dumps(sdg_config)) + (base_path / "builder_config.json").write_text(json.dumps(builder_config)) return base_path @@ -182,7 +182,7 @@ def test_upload_dataset_uploads_processor_outputs( def test_upload_dataset_uploads_config_files( mock_hf_api: MagicMock, mock_dataset_card: MagicMock, sample_dataset_path: Path ) -> None: - """Test that upload_dataset uploads sdg.json and metadata.json.""" + """Test that 
upload_dataset uploads builder_config.json and metadata.json.""" client = HuggingFaceHubClient(token="test-token") client.upload_dataset( @@ -194,7 +194,7 @@ def test_upload_dataset_uploads_config_files( # Check that upload_file was called for config files upload_file_calls = mock_hf_api.upload_file.call_args_list uploaded_files = [call.kwargs["path_in_repo"] for call in upload_file_calls] - assert "sdg.json" in uploaded_files + assert "builder_config.json" in uploaded_files assert "metadata.json" in uploaded_files @@ -301,10 +301,10 @@ def test_upload_dataset_without_processors( assert len(processor_calls) == 0 # No processor files -def test_upload_dataset_without_sdg_config( +def test_upload_dataset_without_builder_config( mock_hf_api: MagicMock, mock_dataset_card: MagicMock, tmp_path: Path ) -> None: - """Test upload_dataset when sdg.json doesn't exist.""" + """Test upload_dataset when builder_config.json doesn't exist.""" base_path = tmp_path / "dataset" base_path.mkdir() @@ -315,7 +315,7 @@ def test_upload_dataset_without_sdg_config( metadata = {"target_num_records": 10, "schema": {"col1": "string"}, "column_statistics": []} (base_path / "metadata.json").write_text(json.dumps(metadata)) - # No sdg.json file + # No builder_config.json file client = HuggingFaceHubClient(token="test-token") @@ -325,13 +325,13 @@ def test_upload_dataset_without_sdg_config( description="Test dataset", ) - # Should only upload metadata.json, not sdg.json + # Should only upload metadata.json, not builder_config.json file_calls = mock_hf_api.upload_file.call_args_list uploaded_files = [call.kwargs["path_in_repo"] for call in file_calls] assert len(uploaded_files) == 1 # Only metadata.json assert "metadata.json" in uploaded_files - assert "sdg.json" not in uploaded_files + assert "builder_config.json" not in uploaded_files def test_upload_dataset_multiple_processors( @@ -447,13 +447,13 @@ def test_validate_dataset_path_invalid_metadata_json(tmp_path: Path) -> None: client.upload_dataset("test/dataset", base_path, "Test") -def test_validate_dataset_path_invalid_sdg_json(tmp_path: Path) -> None: - """Test upload fails when sdg.json contains invalid JSON.""" +def test_validate_dataset_path_invalid_builder_config_json(tmp_path: Path) -> None: + """Test upload fails when builder_config.json contains invalid JSON.""" client = HuggingFaceHubClient(token="test-token") base_path = tmp_path / "dataset" base_path.mkdir() (base_path / "metadata.json").write_text('{"target_num_records": 10}') - (base_path / "sdg.json").write_text("invalid json {{{") + (base_path / "builder_config.json").write_text("invalid json {{{") parquet_dir = base_path / "parquet-files" parquet_dir.mkdir() (parquet_dir / "batch_00000.parquet").write_text("data") diff --git a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py index a6342b0f..ce7b2832 100644 --- a/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py +++ b/packages/data-designer/tests/integrations/huggingface/test_dataset_card.py @@ -44,7 +44,7 @@ def test_from_metadata_minimal(stub_metadata: dict) -> None: card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=None, + builder_config=None, repo_id="test/dataset", description="Test dataset for unit testing.", ) @@ -57,8 +57,8 @@ def test_from_metadata_minimal(stub_metadata: dict) -> None: assert "2" in str(card) # Number of columns -def test_from_metadata_with_sdg_config(stub_metadata: 
dict) -> None: - """Test creating dataset card with sdg config.""" +def test_from_metadata_with_builder_config(stub_metadata: dict) -> None: + """Test creating dataset card with builder config.""" # Customize for this test stub_metadata["target_num_records"] = 50 stub_metadata["schema"] = {"name": "string", "age": "int64"} @@ -83,7 +83,7 @@ def test_from_metadata_with_sdg_config(stub_metadata: dict) -> None: }, ] - sdg_config = { + builder_config = { "data_designer": { "columns": [ {"name": "name", "column_type": "sampler"}, @@ -94,9 +94,9 @@ def test_from_metadata_with_sdg_config(stub_metadata: dict) -> None: card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=sdg_config, + builder_config=builder_config, repo_id="test/dataset-with-config", - description="Test dataset with SDG config.", + description="Test dataset with builder config.", ) # Verify card includes config info @@ -125,7 +125,7 @@ def test_from_metadata_with_llm_columns(stub_metadata: dict) -> None: card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=None, + builder_config=None, repo_id="test/llm-dataset", description="Test dataset with LLM columns.", ) @@ -148,7 +148,7 @@ def test_from_metadata_with_processors(stub_metadata: dict) -> None: card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=None, + builder_config=None, repo_id="test/dataset-with-processors", description="Test dataset with processor outputs.", ) @@ -171,7 +171,7 @@ def test_from_metadata_with_custom_description(stub_metadata: dict) -> None: card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=None, + builder_config=None, repo_id="test/dataset-with-description", description=description, ) @@ -189,7 +189,7 @@ def test_from_metadata_description_placement(stub_metadata: dict) -> None: card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=None, + builder_config=None, repo_id="test/dataset-description-placement", description="Test description placement.", ) @@ -208,7 +208,7 @@ def test_from_metadata_default_tags(stub_metadata: dict) -> None: """Test that default tags are included when no custom tags are provided.""" card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=None, + builder_config=None, repo_id="test/dataset-default-tags", description="Test dataset with default tags.", ) @@ -226,7 +226,7 @@ def test_from_metadata_with_custom_tags(stub_metadata: dict) -> None: card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=None, + builder_config=None, repo_id="test/dataset-custom-tags", description="Test dataset with custom tags.", tags=custom_tags, @@ -250,7 +250,7 @@ def test_from_metadata_tags_in_yaml_frontmatter(stub_metadata: dict) -> None: card = DataDesignerDatasetCard.from_metadata( metadata=stub_metadata, - sdg_config=None, + builder_config=None, repo_id="test/dataset-tags-frontmatter", description="Test dataset.", tags=["custom-tag"],